diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 25.6, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00128, + "grad_norm": 0.6006259322166443, + "learning_rate": 5e-06, + "loss": 1.1793, + "step": 1 + }, + { + "epoch": 0.00256, + "grad_norm": 0.6071895956993103, + "learning_rate": 1e-05, + "loss": 1.2464, + "step": 2 + }, + { + "epoch": 0.00384, + "grad_norm": 0.584945797920227, + "learning_rate": 1.5e-05, + "loss": 1.2501, + "step": 3 + }, + { + "epoch": 0.00512, + "grad_norm": 0.6185263395309448, + "learning_rate": 2e-05, + "loss": 1.285, + "step": 4 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6382588744163513, + "learning_rate": 2.5e-05, + "loss": 1.2345, + "step": 5 + }, + { + "epoch": 0.00768, + "grad_norm": 0.6335554122924805, + "learning_rate": 3e-05, + "loss": 1.2112, + "step": 6 + }, + { + "epoch": 0.00896, + "grad_norm": 0.619415283203125, + "learning_rate": 3.5e-05, + "loss": 1.2691, + "step": 7 + }, + { + "epoch": 0.01024, + "grad_norm": 0.6417765617370605, + "learning_rate": 4e-05, + "loss": 1.2458, + "step": 8 + }, + { + "epoch": 0.01152, + "grad_norm": 0.5834643244743347, + "learning_rate": 4.5e-05, + "loss": 1.119, + "step": 9 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5374929308891296, + "learning_rate": 5e-05, + "loss": 1.0867, + "step": 10 + }, + { + "epoch": 0.01408, + "grad_norm": 0.5630415678024292, + "learning_rate": 4.9997999199679874e-05, + "loss": 1.1491, + "step": 11 + }, + { + "epoch": 0.01536, + "grad_norm": 0.5184459090232849, + "learning_rate": 4.9995998399359746e-05, + "loss": 1.1131, + "step": 12 + }, + { + "epoch": 0.01664, + "grad_norm": 0.5558237433433533, + "learning_rate": 4.999399759903962e-05, + "loss": 1.1601, + "step": 13 + }, + { 
+ "epoch": 0.01792, + "grad_norm": 0.5303581953048706, + "learning_rate": 4.999199679871949e-05, + "loss": 1.0869, + "step": 14 + }, + { + "epoch": 0.0192, + "grad_norm": 0.570815920829773, + "learning_rate": 4.998999599839936e-05, + "loss": 1.074, + "step": 15 + }, + { + "epoch": 0.02048, + "grad_norm": 0.5760554075241089, + "learning_rate": 4.9987995198079234e-05, + "loss": 1.1381, + "step": 16 + }, + { + "epoch": 0.02176, + "grad_norm": 0.565275251865387, + "learning_rate": 4.9985994397759105e-05, + "loss": 1.1634, + "step": 17 + }, + { + "epoch": 0.02304, + "grad_norm": 0.5491355061531067, + "learning_rate": 4.998399359743898e-05, + "loss": 1.0564, + "step": 18 + }, + { + "epoch": 0.02432, + "grad_norm": 0.5034635663032532, + "learning_rate": 4.998199279711885e-05, + "loss": 0.9781, + "step": 19 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5160910487174988, + "learning_rate": 4.997999199679872e-05, + "loss": 1.0313, + "step": 20 + }, + { + "epoch": 0.02688, + "grad_norm": 0.5402336716651917, + "learning_rate": 4.99779911964786e-05, + "loss": 1.1054, + "step": 21 + }, + { + "epoch": 0.02816, + "grad_norm": 0.4777592420578003, + "learning_rate": 4.9975990396158465e-05, + "loss": 0.9927, + "step": 22 + }, + { + "epoch": 0.02944, + "grad_norm": 0.4950423240661621, + "learning_rate": 4.9973989595838337e-05, + "loss": 1.0234, + "step": 23 + }, + { + "epoch": 0.03072, + "grad_norm": 0.4756661057472229, + "learning_rate": 4.997198879551821e-05, + "loss": 0.9862, + "step": 24 + }, + { + "epoch": 0.032, + "grad_norm": 0.47625282406806946, + "learning_rate": 4.996998799519808e-05, + "loss": 0.9818, + "step": 25 + }, + { + "epoch": 0.03328, + "grad_norm": 0.4371047914028168, + "learning_rate": 4.996798719487795e-05, + "loss": 0.9961, + "step": 26 + }, + { + "epoch": 0.03456, + "grad_norm": 0.4368005692958832, + "learning_rate": 4.9965986394557824e-05, + "loss": 0.9859, + "step": 27 + }, + { + "epoch": 0.03584, + "grad_norm": 0.4649926424026489, + "learning_rate": 
4.99639855942377e-05, + "loss": 1.0016, + "step": 28 + }, + { + "epoch": 0.03712, + "grad_norm": 0.45246022939682007, + "learning_rate": 4.9961984793917574e-05, + "loss": 1.016, + "step": 29 + }, + { + "epoch": 0.0384, + "grad_norm": 0.41020557284355164, + "learning_rate": 4.995998399359744e-05, + "loss": 0.9839, + "step": 30 + }, + { + "epoch": 0.03968, + "grad_norm": 0.39636656641960144, + "learning_rate": 4.995798319327731e-05, + "loss": 0.9041, + "step": 31 + }, + { + "epoch": 0.04096, + "grad_norm": 0.4260682165622711, + "learning_rate": 4.995598239295718e-05, + "loss": 1.0016, + "step": 32 + }, + { + "epoch": 0.04224, + "grad_norm": 0.4026011526584625, + "learning_rate": 4.9953981592637055e-05, + "loss": 0.8832, + "step": 33 + }, + { + "epoch": 0.04352, + "grad_norm": 0.3997635841369629, + "learning_rate": 4.995198079231693e-05, + "loss": 0.968, + "step": 34 + }, + { + "epoch": 0.0448, + "grad_norm": 0.37634411454200745, + "learning_rate": 4.9949979991996806e-05, + "loss": 0.9207, + "step": 35 + }, + { + "epoch": 0.04608, + "grad_norm": 0.3642372786998749, + "learning_rate": 4.994797919167668e-05, + "loss": 0.8979, + "step": 36 + }, + { + "epoch": 0.04736, + "grad_norm": 0.3647741675376892, + "learning_rate": 4.994597839135655e-05, + "loss": 0.8897, + "step": 37 + }, + { + "epoch": 0.04864, + "grad_norm": 0.3974230885505676, + "learning_rate": 4.9943977591036414e-05, + "loss": 0.9336, + "step": 38 + }, + { + "epoch": 0.04992, + "grad_norm": 0.3915899693965912, + "learning_rate": 4.9941976790716286e-05, + "loss": 0.9408, + "step": 39 + }, + { + "epoch": 0.0512, + "grad_norm": 0.40685927867889404, + "learning_rate": 4.993997599039616e-05, + "loss": 0.9166, + "step": 40 + }, + { + "epoch": 0.05248, + "grad_norm": 0.35324081778526306, + "learning_rate": 4.993797519007603e-05, + "loss": 0.8683, + "step": 41 + }, + { + "epoch": 0.05376, + "grad_norm": 0.36212658882141113, + "learning_rate": 4.993597438975591e-05, + "loss": 0.8828, + "step": 42 + }, + { + "epoch": 
0.05504, + "grad_norm": 0.38610896468162537, + "learning_rate": 4.993397358943578e-05, + "loss": 0.9092, + "step": 43 + }, + { + "epoch": 0.05632, + "grad_norm": 0.36929619312286377, + "learning_rate": 4.993197278911565e-05, + "loss": 0.9132, + "step": 44 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3837580680847168, + "learning_rate": 4.9929971988795524e-05, + "loss": 0.9606, + "step": 45 + }, + { + "epoch": 0.05888, + "grad_norm": 0.36300909519195557, + "learning_rate": 4.992797118847539e-05, + "loss": 0.8827, + "step": 46 + }, + { + "epoch": 0.06016, + "grad_norm": 0.38039928674697876, + "learning_rate": 4.992597038815526e-05, + "loss": 0.8996, + "step": 47 + }, + { + "epoch": 0.06144, + "grad_norm": 0.3811199367046356, + "learning_rate": 4.992396958783513e-05, + "loss": 0.9114, + "step": 48 + }, + { + "epoch": 0.06272, + "grad_norm": 0.36827829480171204, + "learning_rate": 4.992196878751501e-05, + "loss": 0.8199, + "step": 49 + }, + { + "epoch": 0.064, + "grad_norm": 0.35475796461105347, + "learning_rate": 4.9919967987194883e-05, + "loss": 0.8741, + "step": 50 + }, + { + "epoch": 0.06528, + "grad_norm": 0.3906758725643158, + "learning_rate": 4.9917967186874755e-05, + "loss": 0.9037, + "step": 51 + }, + { + "epoch": 0.06656, + "grad_norm": 0.3880329430103302, + "learning_rate": 4.991596638655463e-05, + "loss": 0.9474, + "step": 52 + }, + { + "epoch": 0.06784, + "grad_norm": 0.38061219453811646, + "learning_rate": 4.99139655862345e-05, + "loss": 0.894, + "step": 53 + }, + { + "epoch": 0.06912, + "grad_norm": 0.39929622411727905, + "learning_rate": 4.9911964785914364e-05, + "loss": 0.9322, + "step": 54 + }, + { + "epoch": 0.0704, + "grad_norm": 0.38368162512779236, + "learning_rate": 4.9909963985594236e-05, + "loss": 0.9286, + "step": 55 + }, + { + "epoch": 0.07168, + "grad_norm": 0.3783194124698639, + "learning_rate": 4.9907963185274115e-05, + "loss": 0.9337, + "step": 56 + }, + { + "epoch": 0.07296, + "grad_norm": 0.3646315932273865, + "learning_rate": 
4.9905962384953986e-05, + "loss": 0.8836, + "step": 57 + }, + { + "epoch": 0.07424, + "grad_norm": 0.3632262349128723, + "learning_rate": 4.990396158463386e-05, + "loss": 0.8382, + "step": 58 + }, + { + "epoch": 0.07552, + "grad_norm": 0.41220006346702576, + "learning_rate": 4.990196078431373e-05, + "loss": 0.9255, + "step": 59 + }, + { + "epoch": 0.0768, + "grad_norm": 0.39080944657325745, + "learning_rate": 4.98999599839936e-05, + "loss": 0.8664, + "step": 60 + }, + { + "epoch": 0.07808, + "grad_norm": 0.37954285740852356, + "learning_rate": 4.9897959183673474e-05, + "loss": 0.8716, + "step": 61 + }, + { + "epoch": 0.07936, + "grad_norm": 0.40623852610588074, + "learning_rate": 4.989595838335334e-05, + "loss": 0.9211, + "step": 62 + }, + { + "epoch": 0.08064, + "grad_norm": 0.3674965500831604, + "learning_rate": 4.989395758303322e-05, + "loss": 0.8427, + "step": 63 + }, + { + "epoch": 0.08192, + "grad_norm": 0.3909313380718231, + "learning_rate": 4.989195678271309e-05, + "loss": 0.8804, + "step": 64 + }, + { + "epoch": 0.0832, + "grad_norm": 0.397886723279953, + "learning_rate": 4.988995598239296e-05, + "loss": 0.8772, + "step": 65 + }, + { + "epoch": 0.08448, + "grad_norm": 0.38570407032966614, + "learning_rate": 4.988795518207283e-05, + "loss": 0.8593, + "step": 66 + }, + { + "epoch": 0.08576, + "grad_norm": 0.4147164225578308, + "learning_rate": 4.9885954381752705e-05, + "loss": 0.9182, + "step": 67 + }, + { + "epoch": 0.08704, + "grad_norm": 0.3853755593299866, + "learning_rate": 4.988395358143258e-05, + "loss": 0.8583, + "step": 68 + }, + { + "epoch": 0.08832, + "grad_norm": 0.3911557197570801, + "learning_rate": 4.988195278111245e-05, + "loss": 0.8066, + "step": 69 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3869986832141876, + "learning_rate": 4.9879951980792314e-05, + "loss": 0.8863, + "step": 70 + }, + { + "epoch": 0.09088, + "grad_norm": 0.425576388835907, + "learning_rate": 4.987795118047219e-05, + "loss": 0.9285, + "step": 71 + }, + { + "epoch": 
0.09216, + "grad_norm": 0.4300587773323059, + "learning_rate": 4.9875950380152064e-05, + "loss": 0.8604, + "step": 72 + }, + { + "epoch": 0.09344, + "grad_norm": 0.4201204478740692, + "learning_rate": 4.9873949579831936e-05, + "loss": 0.8818, + "step": 73 + }, + { + "epoch": 0.09472, + "grad_norm": 0.41632699966430664, + "learning_rate": 4.987194877951181e-05, + "loss": 0.8951, + "step": 74 + }, + { + "epoch": 0.096, + "grad_norm": 0.398713082075119, + "learning_rate": 4.986994797919168e-05, + "loss": 0.8571, + "step": 75 + }, + { + "epoch": 0.09728, + "grad_norm": 0.40266332030296326, + "learning_rate": 4.986794717887155e-05, + "loss": 0.8546, + "step": 76 + }, + { + "epoch": 0.09856, + "grad_norm": 0.41864725947380066, + "learning_rate": 4.9865946378551424e-05, + "loss": 0.8228, + "step": 77 + }, + { + "epoch": 0.09984, + "grad_norm": 0.3853597044944763, + "learning_rate": 4.9863945578231295e-05, + "loss": 0.8282, + "step": 78 + }, + { + "epoch": 0.10112, + "grad_norm": 0.44597557187080383, + "learning_rate": 4.986194477791117e-05, + "loss": 0.9163, + "step": 79 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4048195481300354, + "learning_rate": 4.985994397759104e-05, + "loss": 0.8456, + "step": 80 + }, + { + "epoch": 0.10368, + "grad_norm": 0.4152766466140747, + "learning_rate": 4.985794317727091e-05, + "loss": 0.8562, + "step": 81 + }, + { + "epoch": 0.10496, + "grad_norm": 0.4202408492565155, + "learning_rate": 4.985594237695078e-05, + "loss": 0.8878, + "step": 82 + }, + { + "epoch": 0.10624, + "grad_norm": 0.4306046962738037, + "learning_rate": 4.9853941576630655e-05, + "loss": 0.8816, + "step": 83 + }, + { + "epoch": 0.10752, + "grad_norm": 0.4268186688423157, + "learning_rate": 4.9851940776310527e-05, + "loss": 0.7838, + "step": 84 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4262542128562927, + "learning_rate": 4.98499399759904e-05, + "loss": 0.812, + "step": 85 + }, + { + "epoch": 0.11008, + "grad_norm": 0.44477730989456177, + "learning_rate": 
4.984793917567027e-05, + "loss": 0.8684, + "step": 86 + }, + { + "epoch": 0.11136, + "grad_norm": 0.4398692548274994, + "learning_rate": 4.984593837535014e-05, + "loss": 0.8871, + "step": 87 + }, + { + "epoch": 0.11264, + "grad_norm": 0.4244236350059509, + "learning_rate": 4.9843937575030014e-05, + "loss": 0.8801, + "step": 88 + }, + { + "epoch": 0.11392, + "grad_norm": 0.47705894708633423, + "learning_rate": 4.9841936774709886e-05, + "loss": 0.8609, + "step": 89 + }, + { + "epoch": 0.1152, + "grad_norm": 0.44810983538627625, + "learning_rate": 4.983993597438976e-05, + "loss": 0.8376, + "step": 90 + }, + { + "epoch": 0.11648, + "grad_norm": 0.41785943508148193, + "learning_rate": 4.983793517406963e-05, + "loss": 0.8113, + "step": 91 + }, + { + "epoch": 0.11776, + "grad_norm": 0.44845548272132874, + "learning_rate": 4.98359343737495e-05, + "loss": 0.8438, + "step": 92 + }, + { + "epoch": 0.11904, + "grad_norm": 0.4400695562362671, + "learning_rate": 4.983393357342937e-05, + "loss": 0.8242, + "step": 93 + }, + { + "epoch": 0.12032, + "grad_norm": 0.4483565092086792, + "learning_rate": 4.9831932773109245e-05, + "loss": 0.8877, + "step": 94 + }, + { + "epoch": 0.1216, + "grad_norm": 0.46313560009002686, + "learning_rate": 4.982993197278912e-05, + "loss": 0.8162, + "step": 95 + }, + { + "epoch": 0.12288, + "grad_norm": 0.4211414158344269, + "learning_rate": 4.982793117246899e-05, + "loss": 0.807, + "step": 96 + }, + { + "epoch": 0.12416, + "grad_norm": 0.44126561284065247, + "learning_rate": 4.982593037214886e-05, + "loss": 0.8403, + "step": 97 + }, + { + "epoch": 0.12544, + "grad_norm": 0.4854923486709595, + "learning_rate": 4.982392957182873e-05, + "loss": 0.8869, + "step": 98 + }, + { + "epoch": 0.12672, + "grad_norm": 0.5096859335899353, + "learning_rate": 4.982192877150861e-05, + "loss": 0.9123, + "step": 99 + }, + { + "epoch": 0.128, + "grad_norm": 0.4502103328704834, + "learning_rate": 4.9819927971188476e-05, + "loss": 0.834, + "step": 100 + }, + { + "epoch": 
0.12928, + "grad_norm": 0.4698023796081543, + "learning_rate": 4.981792717086835e-05, + "loss": 0.8685, + "step": 101 + }, + { + "epoch": 0.13056, + "grad_norm": 0.464958131313324, + "learning_rate": 4.981592637054822e-05, + "loss": 0.8724, + "step": 102 + }, + { + "epoch": 0.13184, + "grad_norm": 0.44018036127090454, + "learning_rate": 4.981392557022809e-05, + "loss": 0.8166, + "step": 103 + }, + { + "epoch": 0.13312, + "grad_norm": 0.4710625112056732, + "learning_rate": 4.9811924769907964e-05, + "loss": 0.9041, + "step": 104 + }, + { + "epoch": 0.1344, + "grad_norm": 0.45761746168136597, + "learning_rate": 4.9809923969587836e-05, + "loss": 0.8414, + "step": 105 + }, + { + "epoch": 0.13568, + "grad_norm": 0.43286818265914917, + "learning_rate": 4.9807923169267714e-05, + "loss": 0.827, + "step": 106 + }, + { + "epoch": 0.13696, + "grad_norm": 0.4683651030063629, + "learning_rate": 4.9805922368947586e-05, + "loss": 0.8831, + "step": 107 + }, + { + "epoch": 0.13824, + "grad_norm": 0.4688320457935333, + "learning_rate": 4.980392156862745e-05, + "loss": 0.8693, + "step": 108 + }, + { + "epoch": 0.13952, + "grad_norm": 0.4783925712108612, + "learning_rate": 4.980192076830732e-05, + "loss": 0.8086, + "step": 109 + }, + { + "epoch": 0.1408, + "grad_norm": 0.463559091091156, + "learning_rate": 4.9799919967987195e-05, + "loss": 0.8274, + "step": 110 + }, + { + "epoch": 0.14208, + "grad_norm": 0.5109454989433289, + "learning_rate": 4.979791916766707e-05, + "loss": 0.9021, + "step": 111 + }, + { + "epoch": 0.14336, + "grad_norm": 0.44130197167396545, + "learning_rate": 4.979591836734694e-05, + "loss": 0.8358, + "step": 112 + }, + { + "epoch": 0.14464, + "grad_norm": 0.45340853929519653, + "learning_rate": 4.979391756702682e-05, + "loss": 0.8054, + "step": 113 + }, + { + "epoch": 0.14592, + "grad_norm": 0.496158629655838, + "learning_rate": 4.979191676670669e-05, + "loss": 0.8812, + "step": 114 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4708779454231262, + "learning_rate": 
4.978991596638656e-05, + "loss": 0.7753, + "step": 115 + }, + { + "epoch": 0.14848, + "grad_norm": 0.5166846513748169, + "learning_rate": 4.9787915166066426e-05, + "loss": 0.9508, + "step": 116 + }, + { + "epoch": 0.14976, + "grad_norm": 0.4957675635814667, + "learning_rate": 4.97859143657463e-05, + "loss": 0.8593, + "step": 117 + }, + { + "epoch": 0.15104, + "grad_norm": 0.4897170066833496, + "learning_rate": 4.978391356542617e-05, + "loss": 0.8419, + "step": 118 + }, + { + "epoch": 0.15232, + "grad_norm": 0.4425666630268097, + "learning_rate": 4.978191276510604e-05, + "loss": 0.8184, + "step": 119 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4843791723251343, + "learning_rate": 4.977991196478592e-05, + "loss": 0.8335, + "step": 120 + }, + { + "epoch": 0.15488, + "grad_norm": 0.4781690239906311, + "learning_rate": 4.977791116446579e-05, + "loss": 0.8835, + "step": 121 + }, + { + "epoch": 0.15616, + "grad_norm": 0.5279680490493774, + "learning_rate": 4.9775910364145664e-05, + "loss": 0.8627, + "step": 122 + }, + { + "epoch": 0.15744, + "grad_norm": 0.525348961353302, + "learning_rate": 4.9773909563825536e-05, + "loss": 0.9173, + "step": 123 + }, + { + "epoch": 0.15872, + "grad_norm": 0.4671289324760437, + "learning_rate": 4.97719087635054e-05, + "loss": 0.8189, + "step": 124 + }, + { + "epoch": 0.16, + "grad_norm": 0.48094815015792847, + "learning_rate": 4.976990796318527e-05, + "loss": 0.8514, + "step": 125 + }, + { + "epoch": 0.16128, + "grad_norm": 0.49200230836868286, + "learning_rate": 4.9767907162865145e-05, + "loss": 0.8669, + "step": 126 + }, + { + "epoch": 0.16256, + "grad_norm": 0.45223578810691833, + "learning_rate": 4.976590636254502e-05, + "loss": 0.811, + "step": 127 + }, + { + "epoch": 0.16384, + "grad_norm": 0.48548707365989685, + "learning_rate": 4.9763905562224895e-05, + "loss": 0.8733, + "step": 128 + }, + { + "epoch": 0.16512, + "grad_norm": 0.4744742214679718, + "learning_rate": 4.976190476190477e-05, + "loss": 0.82, + "step": 129 + }, + { + 
"epoch": 0.1664, + "grad_norm": 0.47322678565979004, + "learning_rate": 4.975990396158464e-05, + "loss": 0.8076, + "step": 130 + }, + { + "epoch": 0.16768, + "grad_norm": 0.5006943345069885, + "learning_rate": 4.975790316126451e-05, + "loss": 0.9173, + "step": 131 + }, + { + "epoch": 0.16896, + "grad_norm": 0.5181525945663452, + "learning_rate": 4.9755902360944376e-05, + "loss": 0.8946, + "step": 132 + }, + { + "epoch": 0.17024, + "grad_norm": 0.4706552028656006, + "learning_rate": 4.975390156062425e-05, + "loss": 0.8297, + "step": 133 + }, + { + "epoch": 0.17152, + "grad_norm": 0.5205573439598083, + "learning_rate": 4.9751900760304126e-05, + "loss": 0.9313, + "step": 134 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5010756850242615, + "learning_rate": 4.9749899959984e-05, + "loss": 0.7662, + "step": 135 + }, + { + "epoch": 0.17408, + "grad_norm": 0.5203558206558228, + "learning_rate": 4.974789915966387e-05, + "loss": 0.8708, + "step": 136 + }, + { + "epoch": 0.17536, + "grad_norm": 0.5031847357749939, + "learning_rate": 4.974589835934374e-05, + "loss": 0.8258, + "step": 137 + }, + { + "epoch": 0.17664, + "grad_norm": 0.5000547170639038, + "learning_rate": 4.9743897559023614e-05, + "loss": 0.804, + "step": 138 + }, + { + "epoch": 0.17792, + "grad_norm": 0.5364178419113159, + "learning_rate": 4.9741896758703485e-05, + "loss": 0.8398, + "step": 139 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4716164171695709, + "learning_rate": 4.973989595838335e-05, + "loss": 0.8043, + "step": 140 + }, + { + "epoch": 0.18048, + "grad_norm": 0.48213207721710205, + "learning_rate": 4.973789515806323e-05, + "loss": 0.803, + "step": 141 + }, + { + "epoch": 0.18176, + "grad_norm": 0.49524998664855957, + "learning_rate": 4.97358943577431e-05, + "loss": 0.8401, + "step": 142 + }, + { + "epoch": 0.18304, + "grad_norm": 0.5219356417655945, + "learning_rate": 4.973389355742297e-05, + "loss": 0.8474, + "step": 143 + }, + { + "epoch": 0.18432, + "grad_norm": 0.5039879679679871, + 
"learning_rate": 4.9731892757102845e-05, + "loss": 0.8219, + "step": 144 + }, + { + "epoch": 0.1856, + "grad_norm": 0.48044779896736145, + "learning_rate": 4.9729891956782717e-05, + "loss": 0.8212, + "step": 145 + }, + { + "epoch": 0.18688, + "grad_norm": 0.5111274123191833, + "learning_rate": 4.972789115646259e-05, + "loss": 0.8266, + "step": 146 + }, + { + "epoch": 0.18816, + "grad_norm": 0.5483076572418213, + "learning_rate": 4.972589035614246e-05, + "loss": 0.8127, + "step": 147 + }, + { + "epoch": 0.18944, + "grad_norm": 0.4989505708217621, + "learning_rate": 4.972388955582233e-05, + "loss": 0.8285, + "step": 148 + }, + { + "epoch": 0.19072, + "grad_norm": 0.5818595886230469, + "learning_rate": 4.9721888755502204e-05, + "loss": 0.8504, + "step": 149 + }, + { + "epoch": 0.192, + "grad_norm": 0.530348539352417, + "learning_rate": 4.9719887955182076e-05, + "loss": 0.8316, + "step": 150 + }, + { + "epoch": 0.19328, + "grad_norm": 0.50677090883255, + "learning_rate": 4.971788715486195e-05, + "loss": 0.8048, + "step": 151 + }, + { + "epoch": 0.19456, + "grad_norm": 0.5426594614982605, + "learning_rate": 4.971588635454182e-05, + "loss": 0.8929, + "step": 152 + }, + { + "epoch": 0.19584, + "grad_norm": 0.5290346145629883, + "learning_rate": 4.971388555422169e-05, + "loss": 0.8454, + "step": 153 + }, + { + "epoch": 0.19712, + "grad_norm": 0.49939247965812683, + "learning_rate": 4.971188475390156e-05, + "loss": 0.8309, + "step": 154 + }, + { + "epoch": 0.1984, + "grad_norm": 0.527169406414032, + "learning_rate": 4.9709883953581435e-05, + "loss": 0.8509, + "step": 155 + }, + { + "epoch": 0.19968, + "grad_norm": 0.525046169757843, + "learning_rate": 4.970788315326131e-05, + "loss": 0.8518, + "step": 156 + }, + { + "epoch": 0.20096, + "grad_norm": 0.5176852345466614, + "learning_rate": 4.970588235294118e-05, + "loss": 0.8062, + "step": 157 + }, + { + "epoch": 0.20224, + "grad_norm": 0.5134345889091492, + "learning_rate": 4.970388155262105e-05, + "loss": 0.8723, + "step": 
158 + }, + { + "epoch": 0.20352, + "grad_norm": 0.5024152994155884, + "learning_rate": 4.970188075230092e-05, + "loss": 0.8053, + "step": 159 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5135634541511536, + "learning_rate": 4.9699879951980794e-05, + "loss": 0.7935, + "step": 160 + }, + { + "epoch": 0.20608, + "grad_norm": 0.5161725878715515, + "learning_rate": 4.9697879151660666e-05, + "loss": 0.808, + "step": 161 + }, + { + "epoch": 0.20736, + "grad_norm": 0.5259711146354675, + "learning_rate": 4.969587835134054e-05, + "loss": 0.8209, + "step": 162 + }, + { + "epoch": 0.20864, + "grad_norm": 0.4903233051300049, + "learning_rate": 4.969387755102041e-05, + "loss": 0.793, + "step": 163 + }, + { + "epoch": 0.20992, + "grad_norm": 0.5051981806755066, + "learning_rate": 4.969187675070028e-05, + "loss": 0.7971, + "step": 164 + }, + { + "epoch": 0.2112, + "grad_norm": 0.49914005398750305, + "learning_rate": 4.9689875950380154e-05, + "loss": 0.8292, + "step": 165 + }, + { + "epoch": 0.21248, + "grad_norm": 0.5912098288536072, + "learning_rate": 4.9687875150060026e-05, + "loss": 0.8573, + "step": 166 + }, + { + "epoch": 0.21376, + "grad_norm": 0.5313730239868164, + "learning_rate": 4.96858743497399e-05, + "loss": 0.8106, + "step": 167 + }, + { + "epoch": 0.21504, + "grad_norm": 0.5088040232658386, + "learning_rate": 4.968387354941977e-05, + "loss": 0.8159, + "step": 168 + }, + { + "epoch": 0.21632, + "grad_norm": 0.5306310057640076, + "learning_rate": 4.968187274909965e-05, + "loss": 0.7539, + "step": 169 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4969288110733032, + "learning_rate": 4.967987194877951e-05, + "loss": 0.7364, + "step": 170 + }, + { + "epoch": 0.21888, + "grad_norm": 0.5466715097427368, + "learning_rate": 4.9677871148459385e-05, + "loss": 0.7734, + "step": 171 + }, + { + "epoch": 0.22016, + "grad_norm": 0.5333448052406311, + "learning_rate": 4.967587034813926e-05, + "loss": 0.7622, + "step": 172 + }, + { + "epoch": 0.22144, + "grad_norm": 
0.49693411588668823, + "learning_rate": 4.967386954781913e-05, + "loss": 0.7967, + "step": 173 + }, + { + "epoch": 0.22272, + "grad_norm": 0.5356098413467407, + "learning_rate": 4.9671868747499e-05, + "loss": 0.8457, + "step": 174 + }, + { + "epoch": 0.224, + "grad_norm": 0.5146982073783875, + "learning_rate": 4.966986794717887e-05, + "loss": 0.8258, + "step": 175 + }, + { + "epoch": 0.22528, + "grad_norm": 0.5248112082481384, + "learning_rate": 4.966786714685875e-05, + "loss": 0.7929, + "step": 176 + }, + { + "epoch": 0.22656, + "grad_norm": 0.5330566763877869, + "learning_rate": 4.966586634653862e-05, + "loss": 0.7916, + "step": 177 + }, + { + "epoch": 0.22784, + "grad_norm": 0.5528807044029236, + "learning_rate": 4.966386554621849e-05, + "loss": 0.8199, + "step": 178 + }, + { + "epoch": 0.22912, + "grad_norm": 0.5558779835700989, + "learning_rate": 4.966186474589836e-05, + "loss": 0.7826, + "step": 179 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5845450758934021, + "learning_rate": 4.965986394557823e-05, + "loss": 0.8208, + "step": 180 + }, + { + "epoch": 0.23168, + "grad_norm": 0.565697431564331, + "learning_rate": 4.9657863145258103e-05, + "loss": 0.8423, + "step": 181 + }, + { + "epoch": 0.23296, + "grad_norm": 0.49734389781951904, + "learning_rate": 4.9655862344937975e-05, + "loss": 0.7908, + "step": 182 + }, + { + "epoch": 0.23424, + "grad_norm": 0.5148658156394958, + "learning_rate": 4.965386154461785e-05, + "loss": 0.7928, + "step": 183 + }, + { + "epoch": 0.23552, + "grad_norm": 0.5373660922050476, + "learning_rate": 4.9651860744297726e-05, + "loss": 0.794, + "step": 184 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5560426712036133, + "learning_rate": 4.96498599439776e-05, + "loss": 0.8215, + "step": 185 + }, + { + "epoch": 0.23808, + "grad_norm": 0.5648142695426941, + "learning_rate": 4.964785914365746e-05, + "loss": 0.8003, + "step": 186 + }, + { + "epoch": 0.23936, + "grad_norm": 0.6051068902015686, + "learning_rate": 4.9645858343337335e-05, + 
"loss": 0.8386, + "step": 187 + }, + { + "epoch": 0.24064, + "grad_norm": 0.5387214422225952, + "learning_rate": 4.9643857543017206e-05, + "loss": 0.8217, + "step": 188 + }, + { + "epoch": 0.24192, + "grad_norm": 0.5685967803001404, + "learning_rate": 4.964185674269708e-05, + "loss": 0.8346, + "step": 189 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5596044659614563, + "learning_rate": 4.963985594237695e-05, + "loss": 0.8023, + "step": 190 + }, + { + "epoch": 0.24448, + "grad_norm": 0.556212842464447, + "learning_rate": 4.963785514205683e-05, + "loss": 0.8171, + "step": 191 + }, + { + "epoch": 0.24576, + "grad_norm": 0.5799182653427124, + "learning_rate": 4.96358543417367e-05, + "loss": 0.7663, + "step": 192 + }, + { + "epoch": 0.24704, + "grad_norm": 0.5294367074966431, + "learning_rate": 4.963385354141657e-05, + "loss": 0.7845, + "step": 193 + }, + { + "epoch": 0.24832, + "grad_norm": 0.5038023591041565, + "learning_rate": 4.963185274109644e-05, + "loss": 0.7471, + "step": 194 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5687416195869446, + "learning_rate": 4.962985194077631e-05, + "loss": 0.8603, + "step": 195 + }, + { + "epoch": 0.25088, + "grad_norm": 0.5455114245414734, + "learning_rate": 4.962785114045618e-05, + "loss": 0.8024, + "step": 196 + }, + { + "epoch": 0.25216, + "grad_norm": 0.5598348379135132, + "learning_rate": 4.962585034013605e-05, + "loss": 0.7709, + "step": 197 + }, + { + "epoch": 0.25344, + "grad_norm": 0.5542442798614502, + "learning_rate": 4.962384953981593e-05, + "loss": 0.7776, + "step": 198 + }, + { + "epoch": 0.25472, + "grad_norm": 0.5675541162490845, + "learning_rate": 4.9621848739495804e-05, + "loss": 0.8026, + "step": 199 + }, + { + "epoch": 0.256, + "grad_norm": 0.5367075204849243, + "learning_rate": 4.9619847939175676e-05, + "loss": 0.8048, + "step": 200 + }, + { + "epoch": 0.25728, + "grad_norm": 0.5945519804954529, + "learning_rate": 4.961784713885555e-05, + "loss": 0.8014, + "step": 201 + }, + { + "epoch": 0.25856, + 
"grad_norm": 0.5422332882881165, + "learning_rate": 4.961584633853541e-05, + "loss": 0.8761, + "step": 202 + }, + { + "epoch": 0.25984, + "grad_norm": 0.5434099435806274, + "learning_rate": 4.9613845538215284e-05, + "loss": 0.793, + "step": 203 + }, + { + "epoch": 0.26112, + "grad_norm": 0.49900439381599426, + "learning_rate": 4.9611844737895156e-05, + "loss": 0.7663, + "step": 204 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5360503792762756, + "learning_rate": 4.9609843937575035e-05, + "loss": 0.8549, + "step": 205 + }, + { + "epoch": 0.26368, + "grad_norm": 0.5519281625747681, + "learning_rate": 4.960784313725491e-05, + "loss": 0.8166, + "step": 206 + }, + { + "epoch": 0.26496, + "grad_norm": 0.5933863520622253, + "learning_rate": 4.960584233693478e-05, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.26624, + "grad_norm": 0.5276301503181458, + "learning_rate": 4.960384153661465e-05, + "loss": 0.798, + "step": 208 + }, + { + "epoch": 0.26752, + "grad_norm": 0.5891780853271484, + "learning_rate": 4.960184073629452e-05, + "loss": 0.7636, + "step": 209 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6080418825149536, + "learning_rate": 4.959983993597439e-05, + "loss": 0.8056, + "step": 210 + }, + { + "epoch": 0.27008, + "grad_norm": 0.5800920724868774, + "learning_rate": 4.959783913565426e-05, + "loss": 0.7786, + "step": 211 + }, + { + "epoch": 0.27136, + "grad_norm": 0.544090747833252, + "learning_rate": 4.959583833533414e-05, + "loss": 0.807, + "step": 212 + }, + { + "epoch": 0.27264, + "grad_norm": 0.5422959923744202, + "learning_rate": 4.959383753501401e-05, + "loss": 0.8109, + "step": 213 + }, + { + "epoch": 0.27392, + "grad_norm": 0.5759320855140686, + "learning_rate": 4.959183673469388e-05, + "loss": 0.8202, + "step": 214 + }, + { + "epoch": 0.2752, + "grad_norm": 0.6158757209777832, + "learning_rate": 4.958983593437375e-05, + "loss": 0.8622, + "step": 215 + }, + { + "epoch": 0.27648, + "grad_norm": 0.5751588344573975, + "learning_rate": 
4.9587835134053625e-05, + "loss": 0.8078, + "step": 216 + }, + { + "epoch": 0.27776, + "grad_norm": 0.5633991956710815, + "learning_rate": 4.95858343337335e-05, + "loss": 0.8264, + "step": 217 + }, + { + "epoch": 0.27904, + "grad_norm": 0.5534703135490417, + "learning_rate": 4.958383353341336e-05, + "loss": 0.8046, + "step": 218 + }, + { + "epoch": 0.28032, + "grad_norm": 0.5665048956871033, + "learning_rate": 4.958183273309324e-05, + "loss": 0.8078, + "step": 219 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5370583534240723, + "learning_rate": 4.957983193277311e-05, + "loss": 0.7598, + "step": 220 + }, + { + "epoch": 0.28288, + "grad_norm": 0.5629434585571289, + "learning_rate": 4.9577831132452985e-05, + "loss": 0.8066, + "step": 221 + }, + { + "epoch": 0.28416, + "grad_norm": 0.5847271680831909, + "learning_rate": 4.9575830332132856e-05, + "loss": 0.8446, + "step": 222 + }, + { + "epoch": 0.28544, + "grad_norm": 0.5916556119918823, + "learning_rate": 4.957382953181273e-05, + "loss": 0.802, + "step": 223 + }, + { + "epoch": 0.28672, + "grad_norm": 0.6037752628326416, + "learning_rate": 4.95718287314926e-05, + "loss": 0.8271, + "step": 224 + }, + { + "epoch": 0.288, + "grad_norm": 0.5692646503448486, + "learning_rate": 4.956982793117247e-05, + "loss": 0.8059, + "step": 225 + }, + { + "epoch": 0.28928, + "grad_norm": 0.516044020652771, + "learning_rate": 4.9567827130852344e-05, + "loss": 0.7265, + "step": 226 + }, + { + "epoch": 0.29056, + "grad_norm": 0.5782244801521301, + "learning_rate": 4.9565826330532216e-05, + "loss": 0.7515, + "step": 227 + }, + { + "epoch": 0.29184, + "grad_norm": 0.5291789174079895, + "learning_rate": 4.956382553021209e-05, + "loss": 0.7896, + "step": 228 + }, + { + "epoch": 0.29312, + "grad_norm": 0.5594845414161682, + "learning_rate": 4.956182472989196e-05, + "loss": 0.8484, + "step": 229 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5874782800674438, + "learning_rate": 4.955982392957183e-05, + "loss": 0.8865, + "step": 230 + }, + { + 
"epoch": 0.29568, + "grad_norm": 0.5848100781440735, + "learning_rate": 4.95578231292517e-05, + "loss": 0.8274, + "step": 231 + }, + { + "epoch": 0.29696, + "grad_norm": 0.5242554545402527, + "learning_rate": 4.9555822328931575e-05, + "loss": 0.7192, + "step": 232 + }, + { + "epoch": 0.29824, + "grad_norm": 0.5743463039398193, + "learning_rate": 4.955382152861145e-05, + "loss": 0.82, + "step": 233 + }, + { + "epoch": 0.29952, + "grad_norm": 0.5418503880500793, + "learning_rate": 4.955182072829132e-05, + "loss": 0.7837, + "step": 234 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5967192053794861, + "learning_rate": 4.954981992797119e-05, + "loss": 0.7851, + "step": 235 + }, + { + "epoch": 0.30208, + "grad_norm": 0.6640393137931824, + "learning_rate": 4.954781912765106e-05, + "loss": 0.868, + "step": 236 + }, + { + "epoch": 0.30336, + "grad_norm": 0.5320351123809814, + "learning_rate": 4.9545818327330934e-05, + "loss": 0.7881, + "step": 237 + }, + { + "epoch": 0.30464, + "grad_norm": 0.5583452582359314, + "learning_rate": 4.9543817527010806e-05, + "loss": 0.7926, + "step": 238 + }, + { + "epoch": 0.30592, + "grad_norm": 0.5930384993553162, + "learning_rate": 4.954181672669068e-05, + "loss": 0.8088, + "step": 239 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5465940833091736, + "learning_rate": 4.953981592637055e-05, + "loss": 0.8136, + "step": 240 + }, + { + "epoch": 0.30848, + "grad_norm": 0.5826738476753235, + "learning_rate": 4.953781512605042e-05, + "loss": 0.8333, + "step": 241 + }, + { + "epoch": 0.30976, + "grad_norm": 0.5693926811218262, + "learning_rate": 4.9535814325730293e-05, + "loss": 0.8225, + "step": 242 + }, + { + "epoch": 0.31104, + "grad_norm": 0.5517392158508301, + "learning_rate": 4.9533813525410165e-05, + "loss": 0.797, + "step": 243 + }, + { + "epoch": 0.31232, + "grad_norm": 0.5277729034423828, + "learning_rate": 4.953181272509004e-05, + "loss": 0.74, + "step": 244 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5529365539550781, + "learning_rate": 
4.952981192476991e-05, + "loss": 0.785, + "step": 245 + }, + { + "epoch": 0.31488, + "grad_norm": 0.5595861673355103, + "learning_rate": 4.952781112444978e-05, + "loss": 0.8319, + "step": 246 + }, + { + "epoch": 0.31616, + "grad_norm": 0.5934503674507141, + "learning_rate": 4.952581032412966e-05, + "loss": 0.7452, + "step": 247 + }, + { + "epoch": 0.31744, + "grad_norm": 0.5756224989891052, + "learning_rate": 4.9523809523809525e-05, + "loss": 0.7893, + "step": 248 + }, + { + "epoch": 0.31872, + "grad_norm": 0.6283906698226929, + "learning_rate": 4.9521808723489396e-05, + "loss": 0.9012, + "step": 249 + }, + { + "epoch": 0.32, + "grad_norm": 0.5654857158660889, + "learning_rate": 4.951980792316927e-05, + "loss": 0.7181, + "step": 250 + }, + { + "epoch": 0.32128, + "grad_norm": 0.5287903547286987, + "learning_rate": 4.951780712284914e-05, + "loss": 0.7838, + "step": 251 + }, + { + "epoch": 0.32256, + "grad_norm": 0.5523903369903564, + "learning_rate": 4.951580632252901e-05, + "loss": 0.8112, + "step": 252 + }, + { + "epoch": 0.32384, + "grad_norm": 0.5311181545257568, + "learning_rate": 4.9513805522208884e-05, + "loss": 0.7635, + "step": 253 + }, + { + "epoch": 0.32512, + "grad_norm": 0.5373832583427429, + "learning_rate": 4.951180472188876e-05, + "loss": 0.8084, + "step": 254 + }, + { + "epoch": 0.3264, + "grad_norm": 0.506324052810669, + "learning_rate": 4.9509803921568634e-05, + "loss": 0.7529, + "step": 255 + }, + { + "epoch": 0.32768, + "grad_norm": 0.5614925622940063, + "learning_rate": 4.95078031212485e-05, + "loss": 0.7682, + "step": 256 + }, + { + "epoch": 0.32896, + "grad_norm": 0.5953945517539978, + "learning_rate": 4.950580232092837e-05, + "loss": 0.7948, + "step": 257 + }, + { + "epoch": 0.33024, + "grad_norm": 0.5029836297035217, + "learning_rate": 4.950380152060824e-05, + "loss": 0.7703, + "step": 258 + }, + { + "epoch": 0.33152, + "grad_norm": 0.591628909111023, + "learning_rate": 4.9501800720288115e-05, + "loss": 0.8775, + "step": 259 + }, + { + 
"epoch": 0.3328, + "grad_norm": 0.5612605214118958, + "learning_rate": 4.949979991996799e-05, + "loss": 0.8241, + "step": 260 + }, + { + "epoch": 0.33408, + "grad_norm": 0.5402380228042603, + "learning_rate": 4.9497799119647866e-05, + "loss": 0.7018, + "step": 261 + }, + { + "epoch": 0.33536, + "grad_norm": 0.5295984148979187, + "learning_rate": 4.949579831932774e-05, + "loss": 0.8048, + "step": 262 + }, + { + "epoch": 0.33664, + "grad_norm": 0.5453588962554932, + "learning_rate": 4.949379751900761e-05, + "loss": 0.8198, + "step": 263 + }, + { + "epoch": 0.33792, + "grad_norm": 0.5718904137611389, + "learning_rate": 4.9491796718687474e-05, + "loss": 0.744, + "step": 264 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5623178482055664, + "learning_rate": 4.9489795918367346e-05, + "loss": 0.7693, + "step": 265 + }, + { + "epoch": 0.34048, + "grad_norm": 0.5621166229248047, + "learning_rate": 4.948779511804722e-05, + "loss": 0.7858, + "step": 266 + }, + { + "epoch": 0.34176, + "grad_norm": 0.5423117876052856, + "learning_rate": 4.948579431772709e-05, + "loss": 0.7858, + "step": 267 + }, + { + "epoch": 0.34304, + "grad_norm": 0.5965234637260437, + "learning_rate": 4.948379351740697e-05, + "loss": 0.7983, + "step": 268 + }, + { + "epoch": 0.34432, + "grad_norm": 0.5860886573791504, + "learning_rate": 4.948179271708684e-05, + "loss": 0.7928, + "step": 269 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5337466597557068, + "learning_rate": 4.947979191676671e-05, + "loss": 0.792, + "step": 270 + }, + { + "epoch": 0.34688, + "grad_norm": 0.6088325381278992, + "learning_rate": 4.9477791116446584e-05, + "loss": 0.8432, + "step": 271 + }, + { + "epoch": 0.34816, + "grad_norm": 0.5539262890815735, + "learning_rate": 4.947579031612645e-05, + "loss": 0.8019, + "step": 272 + }, + { + "epoch": 0.34944, + "grad_norm": 0.5926856398582458, + "learning_rate": 4.947378951580632e-05, + "loss": 0.7869, + "step": 273 + }, + { + "epoch": 0.35072, + "grad_norm": 0.5703017711639404, + 
"learning_rate": 4.947178871548619e-05, + "loss": 0.8028, + "step": 274 + }, + { + "epoch": 0.352, + "grad_norm": 0.5751794576644897, + "learning_rate": 4.946978791516607e-05, + "loss": 0.8149, + "step": 275 + }, + { + "epoch": 0.35328, + "grad_norm": 0.5824299454689026, + "learning_rate": 4.9467787114845943e-05, + "loss": 0.7349, + "step": 276 + }, + { + "epoch": 0.35456, + "grad_norm": 0.5836261510848999, + "learning_rate": 4.9465786314525815e-05, + "loss": 0.8029, + "step": 277 + }, + { + "epoch": 0.35584, + "grad_norm": 0.6032565236091614, + "learning_rate": 4.946378551420569e-05, + "loss": 0.806, + "step": 278 + }, + { + "epoch": 0.35712, + "grad_norm": 0.5672261118888855, + "learning_rate": 4.946178471388556e-05, + "loss": 0.8044, + "step": 279 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5680912733078003, + "learning_rate": 4.9459783913565424e-05, + "loss": 0.7874, + "step": 280 + }, + { + "epoch": 0.35968, + "grad_norm": 0.5770596861839294, + "learning_rate": 4.9457783113245296e-05, + "loss": 0.7395, + "step": 281 + }, + { + "epoch": 0.36096, + "grad_norm": 0.5634730458259583, + "learning_rate": 4.9455782312925175e-05, + "loss": 0.7117, + "step": 282 + }, + { + "epoch": 0.36224, + "grad_norm": 0.6248441934585571, + "learning_rate": 4.9453781512605046e-05, + "loss": 0.7941, + "step": 283 + }, + { + "epoch": 0.36352, + "grad_norm": 0.5985219478607178, + "learning_rate": 4.945178071228492e-05, + "loss": 0.7826, + "step": 284 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5826301574707031, + "learning_rate": 4.944977991196479e-05, + "loss": 0.7999, + "step": 285 + }, + { + "epoch": 0.36608, + "grad_norm": 0.5907866954803467, + "learning_rate": 4.944777911164466e-05, + "loss": 0.7725, + "step": 286 + }, + { + "epoch": 0.36736, + "grad_norm": 0.6153370141983032, + "learning_rate": 4.9445778311324534e-05, + "loss": 0.8651, + "step": 287 + }, + { + "epoch": 0.36864, + "grad_norm": 0.5640325546264648, + "learning_rate": 4.94437775110044e-05, + "loss": 0.7517, + 
"step": 288 + }, + { + "epoch": 0.36992, + "grad_norm": 0.5785256028175354, + "learning_rate": 4.944177671068427e-05, + "loss": 0.8121, + "step": 289 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5664944052696228, + "learning_rate": 4.943977591036415e-05, + "loss": 0.7345, + "step": 290 + }, + { + "epoch": 0.37248, + "grad_norm": 0.5458950996398926, + "learning_rate": 4.943777511004402e-05, + "loss": 0.73, + "step": 291 + }, + { + "epoch": 0.37376, + "grad_norm": 0.5893869996070862, + "learning_rate": 4.943577430972389e-05, + "loss": 0.7927, + "step": 292 + }, + { + "epoch": 0.37504, + "grad_norm": 0.6073876619338989, + "learning_rate": 4.9433773509403765e-05, + "loss": 0.7971, + "step": 293 + }, + { + "epoch": 0.37632, + "grad_norm": 0.5632811784744263, + "learning_rate": 4.943177270908364e-05, + "loss": 0.7442, + "step": 294 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5832781791687012, + "learning_rate": 4.942977190876351e-05, + "loss": 0.8098, + "step": 295 + }, + { + "epoch": 0.37888, + "grad_norm": 0.5741657018661499, + "learning_rate": 4.9427771108443374e-05, + "loss": 0.7586, + "step": 296 + }, + { + "epoch": 0.38016, + "grad_norm": 0.6392624974250793, + "learning_rate": 4.942577030812325e-05, + "loss": 0.7998, + "step": 297 + }, + { + "epoch": 0.38144, + "grad_norm": 0.5990936160087585, + "learning_rate": 4.9423769507803124e-05, + "loss": 0.8197, + "step": 298 + }, + { + "epoch": 0.38272, + "grad_norm": 0.5655567049980164, + "learning_rate": 4.9421768707482996e-05, + "loss": 0.7936, + "step": 299 + }, + { + "epoch": 0.384, + "grad_norm": 0.5669339895248413, + "learning_rate": 4.941976790716287e-05, + "loss": 0.7923, + "step": 300 + }, + { + "epoch": 0.38528, + "grad_norm": 0.5685401558876038, + "learning_rate": 4.941776710684274e-05, + "loss": 0.8257, + "step": 301 + }, + { + "epoch": 0.38656, + "grad_norm": 0.581586480140686, + "learning_rate": 4.941576630652261e-05, + "loss": 0.7951, + "step": 302 + }, + { + "epoch": 0.38784, + "grad_norm": 
0.5897729992866516, + "learning_rate": 4.9413765506202484e-05, + "loss": 0.7391, + "step": 303 + }, + { + "epoch": 0.38912, + "grad_norm": 0.5444718599319458, + "learning_rate": 4.9411764705882355e-05, + "loss": 0.7317, + "step": 304 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5707020163536072, + "learning_rate": 4.940976390556223e-05, + "loss": 0.7453, + "step": 305 + }, + { + "epoch": 0.39168, + "grad_norm": 0.5731601119041443, + "learning_rate": 4.94077631052421e-05, + "loss": 0.7614, + "step": 306 + }, + { + "epoch": 0.39296, + "grad_norm": 0.5668581128120422, + "learning_rate": 4.940576230492197e-05, + "loss": 0.7696, + "step": 307 + }, + { + "epoch": 0.39424, + "grad_norm": 0.6144934892654419, + "learning_rate": 4.940376150460184e-05, + "loss": 0.7765, + "step": 308 + }, + { + "epoch": 0.39552, + "grad_norm": 0.6028556823730469, + "learning_rate": 4.9401760704281715e-05, + "loss": 0.8292, + "step": 309 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5614394545555115, + "learning_rate": 4.9399759903961587e-05, + "loss": 0.7867, + "step": 310 + }, + { + "epoch": 0.39808, + "grad_norm": 0.6021682620048523, + "learning_rate": 4.939775910364146e-05, + "loss": 0.7805, + "step": 311 + }, + { + "epoch": 0.39936, + "grad_norm": 0.6178930401802063, + "learning_rate": 4.939575830332133e-05, + "loss": 0.846, + "step": 312 + }, + { + "epoch": 0.40064, + "grad_norm": 0.6026805639266968, + "learning_rate": 4.93937575030012e-05, + "loss": 0.7695, + "step": 313 + }, + { + "epoch": 0.40192, + "grad_norm": 0.6167937517166138, + "learning_rate": 4.9391756702681074e-05, + "loss": 0.7939, + "step": 314 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5787027478218079, + "learning_rate": 4.9389755902360946e-05, + "loss": 0.7763, + "step": 315 + }, + { + "epoch": 0.40448, + "grad_norm": 0.5984156131744385, + "learning_rate": 4.938775510204082e-05, + "loss": 0.8166, + "step": 316 + }, + { + "epoch": 0.40576, + "grad_norm": 0.6511149406433105, + "learning_rate": 4.938575430172069e-05, + 
"loss": 0.8453, + "step": 317 + }, + { + "epoch": 0.40704, + "grad_norm": 0.6076487898826599, + "learning_rate": 4.938375350140057e-05, + "loss": 0.7335, + "step": 318 + }, + { + "epoch": 0.40832, + "grad_norm": 0.5762828588485718, + "learning_rate": 4.938175270108043e-05, + "loss": 0.7503, + "step": 319 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6130267381668091, + "learning_rate": 4.9379751900760305e-05, + "loss": 0.7707, + "step": 320 + }, + { + "epoch": 0.41088, + "grad_norm": 0.6319014430046082, + "learning_rate": 4.937775110044018e-05, + "loss": 0.7817, + "step": 321 + }, + { + "epoch": 0.41216, + "grad_norm": 0.5863409638404846, + "learning_rate": 4.937575030012005e-05, + "loss": 0.8031, + "step": 322 + }, + { + "epoch": 0.41344, + "grad_norm": 0.5963563323020935, + "learning_rate": 4.937374949979992e-05, + "loss": 0.8168, + "step": 323 + }, + { + "epoch": 0.41472, + "grad_norm": 0.5877537131309509, + "learning_rate": 4.937174869947979e-05, + "loss": 0.8302, + "step": 324 + }, + { + "epoch": 0.416, + "grad_norm": 0.5984660387039185, + "learning_rate": 4.936974789915967e-05, + "loss": 0.8747, + "step": 325 + }, + { + "epoch": 0.41728, + "grad_norm": 0.5737277269363403, + "learning_rate": 4.936774709883954e-05, + "loss": 0.8451, + "step": 326 + }, + { + "epoch": 0.41856, + "grad_norm": 0.5724412798881531, + "learning_rate": 4.936574629851941e-05, + "loss": 0.8066, + "step": 327 + }, + { + "epoch": 0.41984, + "grad_norm": 0.5816782116889954, + "learning_rate": 4.936374549819928e-05, + "loss": 0.756, + "step": 328 + }, + { + "epoch": 0.42112, + "grad_norm": 0.592029333114624, + "learning_rate": 4.936174469787915e-05, + "loss": 0.7409, + "step": 329 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5857312083244324, + "learning_rate": 4.9359743897559024e-05, + "loss": 0.7277, + "step": 330 + }, + { + "epoch": 0.42368, + "grad_norm": 0.6745790839195251, + "learning_rate": 4.9357743097238896e-05, + "loss": 0.8739, + "step": 331 + }, + { + "epoch": 0.42496, + 
"grad_norm": 0.5804886221885681, + "learning_rate": 4.9355742296918774e-05, + "loss": 0.7704, + "step": 332 + }, + { + "epoch": 0.42624, + "grad_norm": 0.6009431481361389, + "learning_rate": 4.9353741496598646e-05, + "loss": 0.7817, + "step": 333 + }, + { + "epoch": 0.42752, + "grad_norm": 0.5622819066047668, + "learning_rate": 4.935174069627852e-05, + "loss": 0.7781, + "step": 334 + }, + { + "epoch": 0.4288, + "grad_norm": 0.6032870411872864, + "learning_rate": 4.934973989595838e-05, + "loss": 0.8291, + "step": 335 + }, + { + "epoch": 0.43008, + "grad_norm": 0.6172161102294922, + "learning_rate": 4.9347739095638255e-05, + "loss": 0.8107, + "step": 336 + }, + { + "epoch": 0.43136, + "grad_norm": 0.5511362552642822, + "learning_rate": 4.934573829531813e-05, + "loss": 0.7283, + "step": 337 + }, + { + "epoch": 0.43264, + "grad_norm": 0.5732323527336121, + "learning_rate": 4.9343737494998e-05, + "loss": 0.694, + "step": 338 + }, + { + "epoch": 0.43392, + "grad_norm": 0.5522581934928894, + "learning_rate": 4.934173669467788e-05, + "loss": 0.8047, + "step": 339 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5704146027565002, + "learning_rate": 4.933973589435775e-05, + "loss": 0.789, + "step": 340 + }, + { + "epoch": 0.43648, + "grad_norm": 0.564039409160614, + "learning_rate": 4.933773509403762e-05, + "loss": 0.764, + "step": 341 + }, + { + "epoch": 0.43776, + "grad_norm": 0.610349714756012, + "learning_rate": 4.933573429371749e-05, + "loss": 0.7716, + "step": 342 + }, + { + "epoch": 0.43904, + "grad_norm": 0.5944207310676575, + "learning_rate": 4.933373349339736e-05, + "loss": 0.7811, + "step": 343 + }, + { + "epoch": 0.44032, + "grad_norm": 0.5716763734817505, + "learning_rate": 4.933173269307723e-05, + "loss": 0.7904, + "step": 344 + }, + { + "epoch": 0.4416, + "grad_norm": 0.6018866896629333, + "learning_rate": 4.93297318927571e-05, + "loss": 0.783, + "step": 345 + }, + { + "epoch": 0.44288, + "grad_norm": 0.5952413082122803, + "learning_rate": 4.932773109243698e-05, + 
"loss": 0.8399, + "step": 346 + }, + { + "epoch": 0.44416, + "grad_norm": 0.5757237672805786, + "learning_rate": 4.932573029211685e-05, + "loss": 0.829, + "step": 347 + }, + { + "epoch": 0.44544, + "grad_norm": 0.6060802340507507, + "learning_rate": 4.9323729491796724e-05, + "loss": 0.8134, + "step": 348 + }, + { + "epoch": 0.44672, + "grad_norm": 0.6192966103553772, + "learning_rate": 4.9321728691476596e-05, + "loss": 0.8245, + "step": 349 + }, + { + "epoch": 0.448, + "grad_norm": 0.5844107270240784, + "learning_rate": 4.931972789115647e-05, + "loss": 0.7463, + "step": 350 + }, + { + "epoch": 0.44928, + "grad_norm": 0.6216725707054138, + "learning_rate": 4.931772709083633e-05, + "loss": 0.7797, + "step": 351 + }, + { + "epoch": 0.45056, + "grad_norm": 0.6098612546920776, + "learning_rate": 4.9315726290516205e-05, + "loss": 0.7743, + "step": 352 + }, + { + "epoch": 0.45184, + "grad_norm": 0.5867882966995239, + "learning_rate": 4.931372549019608e-05, + "loss": 0.7186, + "step": 353 + }, + { + "epoch": 0.45312, + "grad_norm": 0.5984062552452087, + "learning_rate": 4.9311724689875955e-05, + "loss": 0.7579, + "step": 354 + }, + { + "epoch": 0.4544, + "grad_norm": 0.6255276203155518, + "learning_rate": 4.930972388955583e-05, + "loss": 0.8468, + "step": 355 + }, + { + "epoch": 0.45568, + "grad_norm": 0.5381520986557007, + "learning_rate": 4.93077230892357e-05, + "loss": 0.7261, + "step": 356 + }, + { + "epoch": 0.45696, + "grad_norm": 0.5565890073776245, + "learning_rate": 4.930572228891557e-05, + "loss": 0.7827, + "step": 357 + }, + { + "epoch": 0.45824, + "grad_norm": 0.6231935620307922, + "learning_rate": 4.930372148859544e-05, + "loss": 0.8469, + "step": 358 + }, + { + "epoch": 0.45952, + "grad_norm": 0.6231086850166321, + "learning_rate": 4.930172068827531e-05, + "loss": 0.8246, + "step": 359 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5462002158164978, + "learning_rate": 4.9299719887955186e-05, + "loss": 0.7182, + "step": 360 + }, + { + "epoch": 0.46208, + 
"grad_norm": 0.5747554302215576, + "learning_rate": 4.929771908763506e-05, + "loss": 0.7594, + "step": 361 + }, + { + "epoch": 0.46336, + "grad_norm": 0.5723777413368225, + "learning_rate": 4.929571828731493e-05, + "loss": 0.7787, + "step": 362 + }, + { + "epoch": 0.46464, + "grad_norm": 0.6416032910346985, + "learning_rate": 4.92937174869948e-05, + "loss": 0.8315, + "step": 363 + }, + { + "epoch": 0.46592, + "grad_norm": 0.603192925453186, + "learning_rate": 4.9291716686674674e-05, + "loss": 0.7719, + "step": 364 + }, + { + "epoch": 0.4672, + "grad_norm": 0.6231751441955566, + "learning_rate": 4.9289715886354545e-05, + "loss": 0.8031, + "step": 365 + }, + { + "epoch": 0.46848, + "grad_norm": 0.6328988671302795, + "learning_rate": 4.928771508603442e-05, + "loss": 0.8228, + "step": 366 + }, + { + "epoch": 0.46976, + "grad_norm": 0.5893206000328064, + "learning_rate": 4.928571428571429e-05, + "loss": 0.7112, + "step": 367 + }, + { + "epoch": 0.47104, + "grad_norm": 0.5830476880073547, + "learning_rate": 4.928371348539416e-05, + "loss": 0.7569, + "step": 368 + }, + { + "epoch": 0.47232, + "grad_norm": 0.5955057740211487, + "learning_rate": 4.928171268507403e-05, + "loss": 0.7747, + "step": 369 + }, + { + "epoch": 0.4736, + "grad_norm": 0.5977997183799744, + "learning_rate": 4.9279711884753905e-05, + "loss": 0.7904, + "step": 370 + }, + { + "epoch": 0.47488, + "grad_norm": 0.5674768090248108, + "learning_rate": 4.9277711084433777e-05, + "loss": 0.752, + "step": 371 + }, + { + "epoch": 0.47616, + "grad_norm": 0.6169261932373047, + "learning_rate": 4.927571028411365e-05, + "loss": 0.7574, + "step": 372 + }, + { + "epoch": 0.47744, + "grad_norm": 0.618199348449707, + "learning_rate": 4.927370948379352e-05, + "loss": 0.7908, + "step": 373 + }, + { + "epoch": 0.47872, + "grad_norm": 0.5316541194915771, + "learning_rate": 4.927170868347339e-05, + "loss": 0.8064, + "step": 374 + }, + { + "epoch": 0.48, + "grad_norm": 0.5974490642547607, + "learning_rate": 
4.9269707883153264e-05, + "loss": 0.7936, + "step": 375 + }, + { + "epoch": 0.48128, + "grad_norm": 0.6208757758140564, + "learning_rate": 4.9267707082833136e-05, + "loss": 0.7862, + "step": 376 + }, + { + "epoch": 0.48256, + "grad_norm": 0.6077625155448914, + "learning_rate": 4.926570628251301e-05, + "loss": 0.7473, + "step": 377 + }, + { + "epoch": 0.48384, + "grad_norm": 0.591814398765564, + "learning_rate": 4.926370548219288e-05, + "loss": 0.7455, + "step": 378 + }, + { + "epoch": 0.48512, + "grad_norm": 0.6128320097923279, + "learning_rate": 4.926170468187275e-05, + "loss": 0.8214, + "step": 379 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5914111733436584, + "learning_rate": 4.925970388155262e-05, + "loss": 0.8374, + "step": 380 + }, + { + "epoch": 0.48768, + "grad_norm": 0.5888047218322754, + "learning_rate": 4.9257703081232495e-05, + "loss": 0.7639, + "step": 381 + }, + { + "epoch": 0.48896, + "grad_norm": 0.5357518196105957, + "learning_rate": 4.925570228091237e-05, + "loss": 0.7629, + "step": 382 + }, + { + "epoch": 0.49024, + "grad_norm": 0.5791406631469727, + "learning_rate": 4.925370148059224e-05, + "loss": 0.77, + "step": 383 + }, + { + "epoch": 0.49152, + "grad_norm": 0.5658066272735596, + "learning_rate": 4.925170068027211e-05, + "loss": 0.8229, + "step": 384 + }, + { + "epoch": 0.4928, + "grad_norm": 0.5416905283927917, + "learning_rate": 4.924969987995198e-05, + "loss": 0.7255, + "step": 385 + }, + { + "epoch": 0.49408, + "grad_norm": 0.5073053240776062, + "learning_rate": 4.9247699079631854e-05, + "loss": 0.7436, + "step": 386 + }, + { + "epoch": 0.49536, + "grad_norm": 0.5704997777938843, + "learning_rate": 4.9245698279311726e-05, + "loss": 0.8392, + "step": 387 + }, + { + "epoch": 0.49664, + "grad_norm": 0.6032180786132812, + "learning_rate": 4.9243697478991605e-05, + "loss": 0.7793, + "step": 388 + }, + { + "epoch": 0.49792, + "grad_norm": 0.6236636638641357, + "learning_rate": 4.924169667867147e-05, + "loss": 0.8586, + "step": 389 + }, + { + 
"epoch": 0.4992, + "grad_norm": 0.5844170451164246, + "learning_rate": 4.923969587835134e-05, + "loss": 0.7596, + "step": 390 + }, + { + "epoch": 0.50048, + "grad_norm": 0.5753904581069946, + "learning_rate": 4.9237695078031214e-05, + "loss": 0.7724, + "step": 391 + }, + { + "epoch": 0.50176, + "grad_norm": 0.5851483941078186, + "learning_rate": 4.9235694277711086e-05, + "loss": 0.809, + "step": 392 + }, + { + "epoch": 0.50304, + "grad_norm": 0.6062473058700562, + "learning_rate": 4.923369347739096e-05, + "loss": 0.7379, + "step": 393 + }, + { + "epoch": 0.50432, + "grad_norm": 0.5940774083137512, + "learning_rate": 4.923169267707083e-05, + "loss": 0.7636, + "step": 394 + }, + { + "epoch": 0.5056, + "grad_norm": 0.6108981370925903, + "learning_rate": 4.922969187675071e-05, + "loss": 0.7407, + "step": 395 + }, + { + "epoch": 0.50688, + "grad_norm": 0.5814557671546936, + "learning_rate": 4.922769107643058e-05, + "loss": 0.7893, + "step": 396 + }, + { + "epoch": 0.50816, + "grad_norm": 0.5524982810020447, + "learning_rate": 4.9225690276110445e-05, + "loss": 0.7972, + "step": 397 + }, + { + "epoch": 0.50944, + "grad_norm": 0.6131865382194519, + "learning_rate": 4.922368947579032e-05, + "loss": 0.8582, + "step": 398 + }, + { + "epoch": 0.51072, + "grad_norm": 0.5640206336975098, + "learning_rate": 4.922168867547019e-05, + "loss": 0.7221, + "step": 399 + }, + { + "epoch": 0.512, + "grad_norm": 0.5649920701980591, + "learning_rate": 4.921968787515006e-05, + "loss": 0.7759, + "step": 400 + }, + { + "epoch": 0.51328, + "grad_norm": 0.6427083611488342, + "learning_rate": 4.921768707482993e-05, + "loss": 0.8388, + "step": 401 + }, + { + "epoch": 0.51456, + "grad_norm": 0.6196634769439697, + "learning_rate": 4.9215686274509804e-05, + "loss": 0.8383, + "step": 402 + }, + { + "epoch": 0.51584, + "grad_norm": 0.5725811123847961, + "learning_rate": 4.921368547418968e-05, + "loss": 0.7602, + "step": 403 + }, + { + "epoch": 0.51712, + "grad_norm": 0.6151264309883118, + 
"learning_rate": 4.9211684673869555e-05, + "loss": 0.7759, + "step": 404 + }, + { + "epoch": 0.5184, + "grad_norm": 0.6025890707969666, + "learning_rate": 4.920968387354942e-05, + "loss": 0.7741, + "step": 405 + }, + { + "epoch": 0.51968, + "grad_norm": 0.5948590636253357, + "learning_rate": 4.920768307322929e-05, + "loss": 0.7644, + "step": 406 + }, + { + "epoch": 0.52096, + "grad_norm": 0.5557575821876526, + "learning_rate": 4.9205682272909163e-05, + "loss": 0.7398, + "step": 407 + }, + { + "epoch": 0.52224, + "grad_norm": 0.6094503402709961, + "learning_rate": 4.9203681472589035e-05, + "loss": 0.7671, + "step": 408 + }, + { + "epoch": 0.52352, + "grad_norm": 0.6672384738922119, + "learning_rate": 4.920168067226891e-05, + "loss": 0.7938, + "step": 409 + }, + { + "epoch": 0.5248, + "grad_norm": 0.6114287376403809, + "learning_rate": 4.9199679871948786e-05, + "loss": 0.8331, + "step": 410 + }, + { + "epoch": 0.52608, + "grad_norm": 0.5579826831817627, + "learning_rate": 4.919767907162866e-05, + "loss": 0.7256, + "step": 411 + }, + { + "epoch": 0.52736, + "grad_norm": 0.5519280433654785, + "learning_rate": 4.919567827130853e-05, + "loss": 0.755, + "step": 412 + }, + { + "epoch": 0.52864, + "grad_norm": 0.5924770832061768, + "learning_rate": 4.9193677470988395e-05, + "loss": 0.7783, + "step": 413 + }, + { + "epoch": 0.52992, + "grad_norm": 0.5979952812194824, + "learning_rate": 4.9191676670668266e-05, + "loss": 0.7305, + "step": 414 + }, + { + "epoch": 0.5312, + "grad_norm": 0.6058392524719238, + "learning_rate": 4.918967587034814e-05, + "loss": 0.7754, + "step": 415 + }, + { + "epoch": 0.53248, + "grad_norm": 0.5810478925704956, + "learning_rate": 4.918767507002801e-05, + "loss": 0.7785, + "step": 416 + }, + { + "epoch": 0.53376, + "grad_norm": 0.5745630860328674, + "learning_rate": 4.918567426970789e-05, + "loss": 0.7649, + "step": 417 + }, + { + "epoch": 0.53504, + "grad_norm": 0.6739091277122498, + "learning_rate": 4.918367346938776e-05, + "loss": 0.8086, + 
"step": 418 + }, + { + "epoch": 0.53632, + "grad_norm": 0.6409255266189575, + "learning_rate": 4.918167266906763e-05, + "loss": 0.7439, + "step": 419 + }, + { + "epoch": 0.5376, + "grad_norm": 0.6018539667129517, + "learning_rate": 4.9179671868747504e-05, + "loss": 0.8367, + "step": 420 + }, + { + "epoch": 0.53888, + "grad_norm": 0.6245967745780945, + "learning_rate": 4.917767106842737e-05, + "loss": 0.8294, + "step": 421 + }, + { + "epoch": 0.54016, + "grad_norm": 0.5872459411621094, + "learning_rate": 4.917567026810724e-05, + "loss": 0.7508, + "step": 422 + }, + { + "epoch": 0.54144, + "grad_norm": 0.6180544495582581, + "learning_rate": 4.917366946778711e-05, + "loss": 0.8241, + "step": 423 + }, + { + "epoch": 0.54272, + "grad_norm": 0.56467604637146, + "learning_rate": 4.917166866746699e-05, + "loss": 0.7355, + "step": 424 + }, + { + "epoch": 0.544, + "grad_norm": 0.5801830887794495, + "learning_rate": 4.9169667867146864e-05, + "loss": 0.7786, + "step": 425 + }, + { + "epoch": 0.54528, + "grad_norm": 0.5825265049934387, + "learning_rate": 4.9167667066826735e-05, + "loss": 0.7876, + "step": 426 + }, + { + "epoch": 0.54656, + "grad_norm": 0.5895298719406128, + "learning_rate": 4.916566626650661e-05, + "loss": 0.8246, + "step": 427 + }, + { + "epoch": 0.54784, + "grad_norm": 0.5841552019119263, + "learning_rate": 4.916366546618648e-05, + "loss": 0.7721, + "step": 428 + }, + { + "epoch": 0.54912, + "grad_norm": 0.6116175651550293, + "learning_rate": 4.9161664665866344e-05, + "loss": 0.7855, + "step": 429 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6221590042114258, + "learning_rate": 4.9159663865546216e-05, + "loss": 0.7947, + "step": 430 + }, + { + "epoch": 0.55168, + "grad_norm": 0.5872395634651184, + "learning_rate": 4.9157663065226095e-05, + "loss": 0.7433, + "step": 431 + }, + { + "epoch": 0.55296, + "grad_norm": 0.5529791712760925, + "learning_rate": 4.9155662264905967e-05, + "loss": 0.7531, + "step": 432 + }, + { + "epoch": 0.55424, + "grad_norm": 
0.5675722360610962, + "learning_rate": 4.915366146458584e-05, + "loss": 0.7426, + "step": 433 + }, + { + "epoch": 0.55552, + "grad_norm": 0.5776454210281372, + "learning_rate": 4.915166066426571e-05, + "loss": 0.8128, + "step": 434 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5539517402648926, + "learning_rate": 4.914965986394558e-05, + "loss": 0.806, + "step": 435 + }, + { + "epoch": 0.55808, + "grad_norm": 0.612989068031311, + "learning_rate": 4.9147659063625454e-05, + "loss": 0.7703, + "step": 436 + }, + { + "epoch": 0.55936, + "grad_norm": 0.6443326473236084, + "learning_rate": 4.914565826330532e-05, + "loss": 0.7351, + "step": 437 + }, + { + "epoch": 0.56064, + "grad_norm": 0.5801988244056702, + "learning_rate": 4.91436574629852e-05, + "loss": 0.792, + "step": 438 + }, + { + "epoch": 0.56192, + "grad_norm": 0.5797520875930786, + "learning_rate": 4.914165666266507e-05, + "loss": 0.7547, + "step": 439 + }, + { + "epoch": 0.5632, + "grad_norm": 0.6442052721977234, + "learning_rate": 4.913965586234494e-05, + "loss": 0.7916, + "step": 440 + }, + { + "epoch": 0.56448, + "grad_norm": 0.5808482766151428, + "learning_rate": 4.913765506202481e-05, + "loss": 0.7912, + "step": 441 + }, + { + "epoch": 0.56576, + "grad_norm": 0.6839514970779419, + "learning_rate": 4.9135654261704685e-05, + "loss": 0.8279, + "step": 442 + }, + { + "epoch": 0.56704, + "grad_norm": 0.6102442145347595, + "learning_rate": 4.913365346138456e-05, + "loss": 0.8261, + "step": 443 + }, + { + "epoch": 0.56832, + "grad_norm": 0.5921707153320312, + "learning_rate": 4.913165266106443e-05, + "loss": 0.7572, + "step": 444 + }, + { + "epoch": 0.5696, + "grad_norm": 0.6068132519721985, + "learning_rate": 4.91296518607443e-05, + "loss": 0.6949, + "step": 445 + }, + { + "epoch": 0.57088, + "grad_norm": 0.5876120924949646, + "learning_rate": 4.912765106042417e-05, + "loss": 0.7906, + "step": 446 + }, + { + "epoch": 0.57216, + "grad_norm": 0.587684154510498, + "learning_rate": 4.9125650260104044e-05, + "loss": 
0.7745, + "step": 447 + }, + { + "epoch": 0.57344, + "grad_norm": 0.6146616339683533, + "learning_rate": 4.9123649459783916e-05, + "loss": 0.7909, + "step": 448 + }, + { + "epoch": 0.57472, + "grad_norm": 0.5915645360946655, + "learning_rate": 4.912164865946379e-05, + "loss": 0.7361, + "step": 449 + }, + { + "epoch": 0.576, + "grad_norm": 0.5974079966545105, + "learning_rate": 4.911964785914366e-05, + "loss": 0.7355, + "step": 450 + }, + { + "epoch": 0.57728, + "grad_norm": 0.6230568885803223, + "learning_rate": 4.911764705882353e-05, + "loss": 0.7886, + "step": 451 + }, + { + "epoch": 0.57856, + "grad_norm": 0.604843020439148, + "learning_rate": 4.9115646258503404e-05, + "loss": 0.7817, + "step": 452 + }, + { + "epoch": 0.57984, + "grad_norm": 0.6246700882911682, + "learning_rate": 4.9113645458183276e-05, + "loss": 0.7952, + "step": 453 + }, + { + "epoch": 0.58112, + "grad_norm": 0.5888630747795105, + "learning_rate": 4.911164465786315e-05, + "loss": 0.7482, + "step": 454 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5762295722961426, + "learning_rate": 4.910964385754302e-05, + "loss": 0.7301, + "step": 455 + }, + { + "epoch": 0.58368, + "grad_norm": 0.6081997156143188, + "learning_rate": 4.910764305722289e-05, + "loss": 0.7295, + "step": 456 + }, + { + "epoch": 0.58496, + "grad_norm": 0.6235604286193848, + "learning_rate": 4.910564225690276e-05, + "loss": 0.8245, + "step": 457 + }, + { + "epoch": 0.58624, + "grad_norm": 0.6132957339286804, + "learning_rate": 4.9103641456582635e-05, + "loss": 0.7289, + "step": 458 + }, + { + "epoch": 0.58752, + "grad_norm": 0.6033244132995605, + "learning_rate": 4.910164065626251e-05, + "loss": 0.8088, + "step": 459 + }, + { + "epoch": 0.5888, + "grad_norm": 0.6521779894828796, + "learning_rate": 4.909963985594238e-05, + "loss": 0.8683, + "step": 460 + }, + { + "epoch": 0.59008, + "grad_norm": 0.6089750528335571, + "learning_rate": 4.909763905562225e-05, + "loss": 0.7418, + "step": 461 + }, + { + "epoch": 0.59136, + "grad_norm": 
0.6081743836402893, + "learning_rate": 4.909563825530212e-05, + "loss": 0.7847, + "step": 462 + }, + { + "epoch": 0.59264, + "grad_norm": 0.5884706377983093, + "learning_rate": 4.9093637454981994e-05, + "loss": 0.7429, + "step": 463 + }, + { + "epoch": 0.59392, + "grad_norm": 0.5694007873535156, + "learning_rate": 4.9091636654661866e-05, + "loss": 0.7418, + "step": 464 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5716164112091064, + "learning_rate": 4.908963585434174e-05, + "loss": 0.7683, + "step": 465 + }, + { + "epoch": 0.59648, + "grad_norm": 0.5955536365509033, + "learning_rate": 4.9087635054021617e-05, + "loss": 0.7523, + "step": 466 + }, + { + "epoch": 0.59776, + "grad_norm": 0.5757789611816406, + "learning_rate": 4.908563425370148e-05, + "loss": 0.7261, + "step": 467 + }, + { + "epoch": 0.59904, + "grad_norm": 0.5787334442138672, + "learning_rate": 4.9083633453381353e-05, + "loss": 0.7195, + "step": 468 + }, + { + "epoch": 0.60032, + "grad_norm": 0.568333625793457, + "learning_rate": 4.9081632653061225e-05, + "loss": 0.7462, + "step": 469 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5744010806083679, + "learning_rate": 4.90796318527411e-05, + "loss": 0.8024, + "step": 470 + }, + { + "epoch": 0.60288, + "grad_norm": 0.6531276702880859, + "learning_rate": 4.907763105242097e-05, + "loss": 0.8175, + "step": 471 + }, + { + "epoch": 0.60416, + "grad_norm": 0.6218701004981995, + "learning_rate": 4.907563025210084e-05, + "loss": 0.7496, + "step": 472 + }, + { + "epoch": 0.60544, + "grad_norm": 0.6153817772865295, + "learning_rate": 4.907362945178072e-05, + "loss": 0.8135, + "step": 473 + }, + { + "epoch": 0.60672, + "grad_norm": 0.6718766689300537, + "learning_rate": 4.907162865146059e-05, + "loss": 0.8049, + "step": 474 + }, + { + "epoch": 0.608, + "grad_norm": 0.6055019497871399, + "learning_rate": 4.9069627851140456e-05, + "loss": 0.7722, + "step": 475 + }, + { + "epoch": 0.60928, + "grad_norm": 0.5902823805809021, + "learning_rate": 4.906762705082033e-05, + 
"loss": 0.7665, + "step": 476 + }, + { + "epoch": 0.61056, + "grad_norm": 0.5945876836776733, + "learning_rate": 4.90656262505002e-05, + "loss": 0.7589, + "step": 477 + }, + { + "epoch": 0.61184, + "grad_norm": 0.6314772963523865, + "learning_rate": 4.906362545018007e-05, + "loss": 0.8462, + "step": 478 + }, + { + "epoch": 0.61312, + "grad_norm": 0.6117784380912781, + "learning_rate": 4.9061624649859944e-05, + "loss": 0.7623, + "step": 479 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5828944444656372, + "learning_rate": 4.905962384953982e-05, + "loss": 0.752, + "step": 480 + }, + { + "epoch": 0.61568, + "grad_norm": 0.6389551162719727, + "learning_rate": 4.9057623049219694e-05, + "loss": 0.7396, + "step": 481 + }, + { + "epoch": 0.61696, + "grad_norm": 0.5840926170349121, + "learning_rate": 4.9055622248899566e-05, + "loss": 0.7754, + "step": 482 + }, + { + "epoch": 0.61824, + "grad_norm": 0.6001207232475281, + "learning_rate": 4.905362144857943e-05, + "loss": 0.7678, + "step": 483 + }, + { + "epoch": 0.61952, + "grad_norm": 0.5531839728355408, + "learning_rate": 4.90516206482593e-05, + "loss": 0.7174, + "step": 484 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6437347531318665, + "learning_rate": 4.9049619847939175e-05, + "loss": 0.7857, + "step": 485 + }, + { + "epoch": 0.62208, + "grad_norm": 0.5961335301399231, + "learning_rate": 4.904761904761905e-05, + "loss": 0.754, + "step": 486 + }, + { + "epoch": 0.62336, + "grad_norm": 0.5557673573493958, + "learning_rate": 4.9045618247298926e-05, + "loss": 0.7418, + "step": 487 + }, + { + "epoch": 0.62464, + "grad_norm": 0.6520072221755981, + "learning_rate": 4.90436174469788e-05, + "loss": 0.8426, + "step": 488 + }, + { + "epoch": 0.62592, + "grad_norm": 0.5974238514900208, + "learning_rate": 4.904161664665867e-05, + "loss": 0.7502, + "step": 489 + }, + { + "epoch": 0.6272, + "grad_norm": 0.6083403825759888, + "learning_rate": 4.903961584633854e-05, + "loss": 0.7499, + "step": 490 + }, + { + "epoch": 0.62848, + 
"grad_norm": 0.5932098031044006, + "learning_rate": 4.9037615046018406e-05, + "loss": 0.7344, + "step": 491 + }, + { + "epoch": 0.62976, + "grad_norm": 0.6413688659667969, + "learning_rate": 4.903561424569828e-05, + "loss": 0.7552, + "step": 492 + }, + { + "epoch": 0.63104, + "grad_norm": 0.6206446290016174, + "learning_rate": 4.903361344537815e-05, + "loss": 0.7687, + "step": 493 + }, + { + "epoch": 0.63232, + "grad_norm": 0.5776639580726624, + "learning_rate": 4.903161264505803e-05, + "loss": 0.8371, + "step": 494 + }, + { + "epoch": 0.6336, + "grad_norm": 0.6182857155799866, + "learning_rate": 4.90296118447379e-05, + "loss": 0.7361, + "step": 495 + }, + { + "epoch": 0.63488, + "grad_norm": 0.6016165018081665, + "learning_rate": 4.902761104441777e-05, + "loss": 0.8338, + "step": 496 + }, + { + "epoch": 0.63616, + "grad_norm": 0.6279016733169556, + "learning_rate": 4.9025610244097644e-05, + "loss": 0.756, + "step": 497 + }, + { + "epoch": 0.63744, + "grad_norm": 0.6500204205513, + "learning_rate": 4.9023609443777516e-05, + "loss": 0.8069, + "step": 498 + }, + { + "epoch": 0.63872, + "grad_norm": 0.6177827715873718, + "learning_rate": 4.902160864345738e-05, + "loss": 0.7365, + "step": 499 + }, + { + "epoch": 0.64, + "grad_norm": 0.5693172216415405, + "learning_rate": 4.901960784313725e-05, + "loss": 0.6754, + "step": 500 + }, + { + "epoch": 0.64128, + "grad_norm": 0.5654707551002502, + "learning_rate": 4.901760704281713e-05, + "loss": 0.7525, + "step": 501 + }, + { + "epoch": 0.64256, + "grad_norm": 0.648030161857605, + "learning_rate": 4.9015606242497e-05, + "loss": 0.764, + "step": 502 + }, + { + "epoch": 0.64384, + "grad_norm": 0.6066122651100159, + "learning_rate": 4.9013605442176875e-05, + "loss": 0.7487, + "step": 503 + }, + { + "epoch": 0.64512, + "grad_norm": 0.6264938712120056, + "learning_rate": 4.901160464185675e-05, + "loss": 0.8013, + "step": 504 + }, + { + "epoch": 0.6464, + "grad_norm": 0.60635906457901, + "learning_rate": 4.900960384153662e-05, + 
"loss": 0.7421, + "step": 505 + }, + { + "epoch": 0.64768, + "grad_norm": 0.5976872444152832, + "learning_rate": 4.900760304121649e-05, + "loss": 0.733, + "step": 506 + }, + { + "epoch": 0.64896, + "grad_norm": 0.5609980225563049, + "learning_rate": 4.9005602240896356e-05, + "loss": 0.7141, + "step": 507 + }, + { + "epoch": 0.65024, + "grad_norm": 0.6109974980354309, + "learning_rate": 4.9003601440576234e-05, + "loss": 0.7301, + "step": 508 + }, + { + "epoch": 0.65152, + "grad_norm": 0.6050900220870972, + "learning_rate": 4.9001600640256106e-05, + "loss": 0.7644, + "step": 509 + }, + { + "epoch": 0.6528, + "grad_norm": 0.571101188659668, + "learning_rate": 4.899959983993598e-05, + "loss": 0.7736, + "step": 510 + }, + { + "epoch": 0.65408, + "grad_norm": 0.5456352829933167, + "learning_rate": 4.899759903961585e-05, + "loss": 0.7738, + "step": 511 + }, + { + "epoch": 0.65536, + "grad_norm": 0.5890512466430664, + "learning_rate": 4.899559823929572e-05, + "loss": 0.6723, + "step": 512 + }, + { + "epoch": 0.65664, + "grad_norm": 0.5800885558128357, + "learning_rate": 4.8993597438975594e-05, + "loss": 0.7153, + "step": 513 + }, + { + "epoch": 0.65792, + "grad_norm": 0.6010210514068604, + "learning_rate": 4.8991596638655466e-05, + "loss": 0.7784, + "step": 514 + }, + { + "epoch": 0.6592, + "grad_norm": 0.6430292129516602, + "learning_rate": 4.898959583833533e-05, + "loss": 0.7992, + "step": 515 + }, + { + "epoch": 0.66048, + "grad_norm": 0.5662366151809692, + "learning_rate": 4.898759503801521e-05, + "loss": 0.7449, + "step": 516 + }, + { + "epoch": 0.66176, + "grad_norm": 0.6085848212242126, + "learning_rate": 4.898559423769508e-05, + "loss": 0.7972, + "step": 517 + }, + { + "epoch": 0.66304, + "grad_norm": 0.5684623122215271, + "learning_rate": 4.898359343737495e-05, + "loss": 0.7667, + "step": 518 + }, + { + "epoch": 0.66432, + "grad_norm": 0.5847243666648865, + "learning_rate": 4.8981592637054825e-05, + "loss": 0.7233, + "step": 519 + }, + { + "epoch": 0.6656, + 
"grad_norm": 0.6326200366020203, + "learning_rate": 4.89795918367347e-05, + "loss": 0.7656, + "step": 520 + }, + { + "epoch": 0.66688, + "grad_norm": 0.6132087707519531, + "learning_rate": 4.897759103641457e-05, + "loss": 0.7842, + "step": 521 + }, + { + "epoch": 0.66816, + "grad_norm": 0.5856083631515503, + "learning_rate": 4.897559023609444e-05, + "loss": 0.7745, + "step": 522 + }, + { + "epoch": 0.66944, + "grad_norm": 0.6147796511650085, + "learning_rate": 4.897358943577431e-05, + "loss": 0.7964, + "step": 523 + }, + { + "epoch": 0.67072, + "grad_norm": 0.6051590442657471, + "learning_rate": 4.8971588635454184e-05, + "loss": 0.7573, + "step": 524 + }, + { + "epoch": 0.672, + "grad_norm": 0.6189625263214111, + "learning_rate": 4.8969587835134056e-05, + "loss": 0.8481, + "step": 525 + }, + { + "epoch": 0.67328, + "grad_norm": 0.5812535881996155, + "learning_rate": 4.896758703481393e-05, + "loss": 0.8213, + "step": 526 + }, + { + "epoch": 0.67456, + "grad_norm": 0.5988802909851074, + "learning_rate": 4.89655862344938e-05, + "loss": 0.6807, + "step": 527 + }, + { + "epoch": 0.67584, + "grad_norm": 0.5860095620155334, + "learning_rate": 4.896358543417367e-05, + "loss": 0.7816, + "step": 528 + }, + { + "epoch": 0.67712, + "grad_norm": 0.5878039598464966, + "learning_rate": 4.8961584633853543e-05, + "loss": 0.7771, + "step": 529 + }, + { + "epoch": 0.6784, + "grad_norm": 0.6504335403442383, + "learning_rate": 4.8959583833533415e-05, + "loss": 0.82, + "step": 530 + }, + { + "epoch": 0.67968, + "grad_norm": 0.5590863227844238, + "learning_rate": 4.895758303321329e-05, + "loss": 0.7686, + "step": 531 + }, + { + "epoch": 0.68096, + "grad_norm": 0.6558895707130432, + "learning_rate": 4.895558223289316e-05, + "loss": 0.8116, + "step": 532 + }, + { + "epoch": 0.68224, + "grad_norm": 0.6584180593490601, + "learning_rate": 4.895358143257303e-05, + "loss": 0.7509, + "step": 533 + }, + { + "epoch": 0.68352, + "grad_norm": 0.5821325778961182, + "learning_rate": 
4.89515806322529e-05, + "loss": 0.7289, + "step": 534 + }, + { + "epoch": 0.6848, + "grad_norm": 0.6342353820800781, + "learning_rate": 4.8949579831932775e-05, + "loss": 0.7905, + "step": 535 + }, + { + "epoch": 0.68608, + "grad_norm": 0.595000684261322, + "learning_rate": 4.8947579031612646e-05, + "loss": 0.7344, + "step": 536 + }, + { + "epoch": 0.68736, + "grad_norm": 0.6255506277084351, + "learning_rate": 4.894557823129252e-05, + "loss": 0.7994, + "step": 537 + }, + { + "epoch": 0.68864, + "grad_norm": 0.6021307110786438, + "learning_rate": 4.894357743097239e-05, + "loss": 0.8138, + "step": 538 + }, + { + "epoch": 0.68992, + "grad_norm": 0.5913227796554565, + "learning_rate": 4.894157663065226e-05, + "loss": 0.7822, + "step": 539 + }, + { + "epoch": 0.6912, + "grad_norm": 0.6197808980941772, + "learning_rate": 4.8939575830332134e-05, + "loss": 0.7853, + "step": 540 + }, + { + "epoch": 0.69248, + "grad_norm": 0.5861679911613464, + "learning_rate": 4.8937575030012006e-05, + "loss": 0.7293, + "step": 541 + }, + { + "epoch": 0.69376, + "grad_norm": 0.5833053588867188, + "learning_rate": 4.893557422969188e-05, + "loss": 0.7416, + "step": 542 + }, + { + "epoch": 0.69504, + "grad_norm": 0.5544353723526001, + "learning_rate": 4.893357342937175e-05, + "loss": 0.7278, + "step": 543 + }, + { + "epoch": 0.69632, + "grad_norm": 0.5617831945419312, + "learning_rate": 4.893157262905163e-05, + "loss": 0.7219, + "step": 544 + }, + { + "epoch": 0.6976, + "grad_norm": 0.6455240845680237, + "learning_rate": 4.892957182873149e-05, + "loss": 0.7656, + "step": 545 + }, + { + "epoch": 0.69888, + "grad_norm": 0.63936448097229, + "learning_rate": 4.8927571028411365e-05, + "loss": 0.8029, + "step": 546 + }, + { + "epoch": 0.70016, + "grad_norm": 0.5780683755874634, + "learning_rate": 4.892557022809124e-05, + "loss": 0.7684, + "step": 547 + }, + { + "epoch": 0.70144, + "grad_norm": 0.571263313293457, + "learning_rate": 4.892356942777111e-05, + "loss": 0.6714, + "step": 548 + }, + { + 
"epoch": 0.70272, + "grad_norm": 0.6045280694961548, + "learning_rate": 4.892156862745098e-05, + "loss": 0.7374, + "step": 549 + }, + { + "epoch": 0.704, + "grad_norm": 0.5887953639030457, + "learning_rate": 4.891956782713085e-05, + "loss": 0.7341, + "step": 550 + }, + { + "epoch": 0.70528, + "grad_norm": 0.6200657486915588, + "learning_rate": 4.891756702681073e-05, + "loss": 0.7835, + "step": 551 + }, + { + "epoch": 0.70656, + "grad_norm": 0.6312881112098694, + "learning_rate": 4.89155662264906e-05, + "loss": 0.8279, + "step": 552 + }, + { + "epoch": 0.70784, + "grad_norm": 0.6151871681213379, + "learning_rate": 4.891356542617047e-05, + "loss": 0.7467, + "step": 553 + }, + { + "epoch": 0.70912, + "grad_norm": 0.5729169249534607, + "learning_rate": 4.891156462585034e-05, + "loss": 0.7405, + "step": 554 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5994395017623901, + "learning_rate": 4.890956382553021e-05, + "loss": 0.8057, + "step": 555 + }, + { + "epoch": 0.71168, + "grad_norm": 0.564448893070221, + "learning_rate": 4.8907563025210084e-05, + "loss": 0.7428, + "step": 556 + }, + { + "epoch": 0.71296, + "grad_norm": 0.6355032920837402, + "learning_rate": 4.8905562224889955e-05, + "loss": 0.79, + "step": 557 + }, + { + "epoch": 0.71424, + "grad_norm": 0.5675954222679138, + "learning_rate": 4.8903561424569834e-05, + "loss": 0.7338, + "step": 558 + }, + { + "epoch": 0.71552, + "grad_norm": 0.5565802454948425, + "learning_rate": 4.8901560624249706e-05, + "loss": 0.7997, + "step": 559 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5979486107826233, + "learning_rate": 4.889955982392958e-05, + "loss": 0.7952, + "step": 560 + }, + { + "epoch": 0.71808, + "grad_norm": 0.5932238101959229, + "learning_rate": 4.889755902360944e-05, + "loss": 0.7106, + "step": 561 + }, + { + "epoch": 0.71936, + "grad_norm": 0.659325361251831, + "learning_rate": 4.8895558223289315e-05, + "loss": 0.703, + "step": 562 + }, + { + "epoch": 0.72064, + "grad_norm": 0.6149618029594421, + "learning_rate": 
4.889355742296919e-05, + "loss": 0.762, + "step": 563 + }, + { + "epoch": 0.72192, + "grad_norm": 0.5758309364318848, + "learning_rate": 4.889155662264906e-05, + "loss": 0.7698, + "step": 564 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5630546808242798, + "learning_rate": 4.888955582232894e-05, + "loss": 0.7165, + "step": 565 + }, + { + "epoch": 0.72448, + "grad_norm": 0.6149067282676697, + "learning_rate": 4.888755502200881e-05, + "loss": 0.8136, + "step": 566 + }, + { + "epoch": 0.72576, + "grad_norm": 0.6086531281471252, + "learning_rate": 4.888555422168868e-05, + "loss": 0.7992, + "step": 567 + }, + { + "epoch": 0.72704, + "grad_norm": 0.6159600615501404, + "learning_rate": 4.888355342136855e-05, + "loss": 0.8229, + "step": 568 + }, + { + "epoch": 0.72832, + "grad_norm": 0.5952989459037781, + "learning_rate": 4.888155262104842e-05, + "loss": 0.7035, + "step": 569 + }, + { + "epoch": 0.7296, + "grad_norm": 0.6345956921577454, + "learning_rate": 4.887955182072829e-05, + "loss": 0.8317, + "step": 570 + }, + { + "epoch": 0.73088, + "grad_norm": 0.6076996326446533, + "learning_rate": 4.887755102040816e-05, + "loss": 0.7937, + "step": 571 + }, + { + "epoch": 0.73216, + "grad_norm": 0.5981006622314453, + "learning_rate": 4.887555022008804e-05, + "loss": 0.6843, + "step": 572 + }, + { + "epoch": 0.73344, + "grad_norm": 0.5810612440109253, + "learning_rate": 4.887354941976791e-05, + "loss": 0.6974, + "step": 573 + }, + { + "epoch": 0.73472, + "grad_norm": 0.5900784730911255, + "learning_rate": 4.8871548619447784e-05, + "loss": 0.7718, + "step": 574 + }, + { + "epoch": 0.736, + "grad_norm": 0.5939428210258484, + "learning_rate": 4.8869547819127656e-05, + "loss": 0.7176, + "step": 575 + }, + { + "epoch": 0.73728, + "grad_norm": 0.6226340532302856, + "learning_rate": 4.886754701880753e-05, + "loss": 0.728, + "step": 576 + }, + { + "epoch": 0.73856, + "grad_norm": 0.6123561859130859, + "learning_rate": 4.886554621848739e-05, + "loss": 0.7569, + "step": 577 + }, + { + 
"epoch": 0.73984, + "grad_norm": 0.6113354563713074, + "learning_rate": 4.8863545418167264e-05, + "loss": 0.7683, + "step": 578 + }, + { + "epoch": 0.74112, + "grad_norm": 0.6293046474456787, + "learning_rate": 4.886154461784714e-05, + "loss": 0.8001, + "step": 579 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5882707238197327, + "learning_rate": 4.8859543817527015e-05, + "loss": 0.6901, + "step": 580 + }, + { + "epoch": 0.74368, + "grad_norm": 0.5842380523681641, + "learning_rate": 4.885754301720689e-05, + "loss": 0.7069, + "step": 581 + }, + { + "epoch": 0.74496, + "grad_norm": 0.6349176168441772, + "learning_rate": 4.885554221688676e-05, + "loss": 0.7323, + "step": 582 + }, + { + "epoch": 0.74624, + "grad_norm": 0.5746724605560303, + "learning_rate": 4.885354141656663e-05, + "loss": 0.7153, + "step": 583 + }, + { + "epoch": 0.74752, + "grad_norm": 0.5570270419120789, + "learning_rate": 4.88515406162465e-05, + "loss": 0.7684, + "step": 584 + }, + { + "epoch": 0.7488, + "grad_norm": 0.6300098896026611, + "learning_rate": 4.884953981592637e-05, + "loss": 0.7983, + "step": 585 + }, + { + "epoch": 0.75008, + "grad_norm": 0.6173220872879028, + "learning_rate": 4.8847539015606246e-05, + "loss": 0.8115, + "step": 586 + }, + { + "epoch": 0.75136, + "grad_norm": 0.6043545007705688, + "learning_rate": 4.884553821528612e-05, + "loss": 0.7274, + "step": 587 + }, + { + "epoch": 0.75264, + "grad_norm": 0.6310338377952576, + "learning_rate": 4.884353741496599e-05, + "loss": 0.7706, + "step": 588 + }, + { + "epoch": 0.75392, + "grad_norm": 0.6150200366973877, + "learning_rate": 4.884153661464586e-05, + "loss": 0.7907, + "step": 589 + }, + { + "epoch": 0.7552, + "grad_norm": 0.5880206823348999, + "learning_rate": 4.8839535814325734e-05, + "loss": 0.7382, + "step": 590 + }, + { + "epoch": 0.75648, + "grad_norm": 0.6823338866233826, + "learning_rate": 4.8837535014005605e-05, + "loss": 0.7575, + "step": 591 + }, + { + "epoch": 0.75776, + "grad_norm": 0.6266196370124817, + 
"learning_rate": 4.883553421368548e-05, + "loss": 0.7685, + "step": 592 + }, + { + "epoch": 0.75904, + "grad_norm": 0.637712836265564, + "learning_rate": 4.883353341336535e-05, + "loss": 0.7326, + "step": 593 + }, + { + "epoch": 0.76032, + "grad_norm": 0.5898723006248474, + "learning_rate": 4.883153261304522e-05, + "loss": 0.749, + "step": 594 + }, + { + "epoch": 0.7616, + "grad_norm": 0.6104643940925598, + "learning_rate": 4.882953181272509e-05, + "loss": 0.7539, + "step": 595 + }, + { + "epoch": 0.76288, + "grad_norm": 0.615857720375061, + "learning_rate": 4.8827531012404965e-05, + "loss": 0.7938, + "step": 596 + }, + { + "epoch": 0.76416, + "grad_norm": 0.633080780506134, + "learning_rate": 4.8825530212084837e-05, + "loss": 0.8089, + "step": 597 + }, + { + "epoch": 0.76544, + "grad_norm": 0.615404486656189, + "learning_rate": 4.882352941176471e-05, + "loss": 0.8339, + "step": 598 + }, + { + "epoch": 0.76672, + "grad_norm": 0.5635229349136353, + "learning_rate": 4.882152861144458e-05, + "loss": 0.6853, + "step": 599 + }, + { + "epoch": 0.768, + "grad_norm": 0.579932689666748, + "learning_rate": 4.881952781112445e-05, + "loss": 0.7536, + "step": 600 + }, + { + "epoch": 0.76928, + "grad_norm": 0.5975157618522644, + "learning_rate": 4.8817527010804324e-05, + "loss": 0.7634, + "step": 601 + }, + { + "epoch": 0.77056, + "grad_norm": 0.5809814929962158, + "learning_rate": 4.8815526210484196e-05, + "loss": 0.7197, + "step": 602 + }, + { + "epoch": 0.77184, + "grad_norm": 0.5941184163093567, + "learning_rate": 4.881352541016407e-05, + "loss": 0.7532, + "step": 603 + }, + { + "epoch": 0.77312, + "grad_norm": 0.629640519618988, + "learning_rate": 4.881152460984394e-05, + "loss": 0.7685, + "step": 604 + }, + { + "epoch": 0.7744, + "grad_norm": 0.6509769558906555, + "learning_rate": 4.880952380952381e-05, + "loss": 0.738, + "step": 605 + }, + { + "epoch": 0.77568, + "grad_norm": 0.5938847661018372, + "learning_rate": 4.880752300920368e-05, + "loss": 0.7364, + "step": 606 + 
}, + { + "epoch": 0.77696, + "grad_norm": 0.5702788829803467, + "learning_rate": 4.8805522208883555e-05, + "loss": 0.7057, + "step": 607 + }, + { + "epoch": 0.77824, + "grad_norm": 0.5880799889564514, + "learning_rate": 4.880352140856343e-05, + "loss": 0.6999, + "step": 608 + }, + { + "epoch": 0.77952, + "grad_norm": 0.6025612950325012, + "learning_rate": 4.88015206082433e-05, + "loss": 0.7189, + "step": 609 + }, + { + "epoch": 0.7808, + "grad_norm": 0.6370987296104431, + "learning_rate": 4.879951980792317e-05, + "loss": 0.7947, + "step": 610 + }, + { + "epoch": 0.78208, + "grad_norm": 0.5612797737121582, + "learning_rate": 4.879751900760304e-05, + "loss": 0.6949, + "step": 611 + }, + { + "epoch": 0.78336, + "grad_norm": 0.6249443888664246, + "learning_rate": 4.8795518207282914e-05, + "loss": 0.7593, + "step": 612 + }, + { + "epoch": 0.78464, + "grad_norm": 0.6149105429649353, + "learning_rate": 4.8793517406962786e-05, + "loss": 0.8293, + "step": 613 + }, + { + "epoch": 0.78592, + "grad_norm": 0.6441102623939514, + "learning_rate": 4.8791516606642665e-05, + "loss": 0.7343, + "step": 614 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5925320386886597, + "learning_rate": 4.878951580632253e-05, + "loss": 0.7466, + "step": 615 + }, + { + "epoch": 0.78848, + "grad_norm": 0.6006816029548645, + "learning_rate": 4.87875150060024e-05, + "loss": 0.8136, + "step": 616 + }, + { + "epoch": 0.78976, + "grad_norm": 0.6164388060569763, + "learning_rate": 4.8785514205682274e-05, + "loss": 0.7459, + "step": 617 + }, + { + "epoch": 0.79104, + "grad_norm": 0.6219509840011597, + "learning_rate": 4.8783513405362146e-05, + "loss": 0.7392, + "step": 618 + }, + { + "epoch": 0.79232, + "grad_norm": 0.6259922981262207, + "learning_rate": 4.878151260504202e-05, + "loss": 0.7768, + "step": 619 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5972528457641602, + "learning_rate": 4.877951180472189e-05, + "loss": 0.7749, + "step": 620 + }, + { + "epoch": 0.79488, + "grad_norm": 0.5769963264465332, + 
"learning_rate": 4.877751100440177e-05, + "loss": 0.7324, + "step": 621 + }, + { + "epoch": 0.79616, + "grad_norm": 0.5725414752960205, + "learning_rate": 4.877551020408164e-05, + "loss": 0.7089, + "step": 622 + }, + { + "epoch": 0.79744, + "grad_norm": 0.6348921656608582, + "learning_rate": 4.8773509403761505e-05, + "loss": 0.8087, + "step": 623 + }, + { + "epoch": 0.79872, + "grad_norm": 0.6241890788078308, + "learning_rate": 4.877150860344138e-05, + "loss": 0.7478, + "step": 624 + }, + { + "epoch": 0.8, + "grad_norm": 0.6045325398445129, + "learning_rate": 4.876950780312125e-05, + "loss": 0.7607, + "step": 625 + }, + { + "epoch": 0.80128, + "grad_norm": 0.6093403100967407, + "learning_rate": 4.876750700280112e-05, + "loss": 0.7503, + "step": 626 + }, + { + "epoch": 0.80256, + "grad_norm": 0.6271349787712097, + "learning_rate": 4.876550620248099e-05, + "loss": 0.7631, + "step": 627 + }, + { + "epoch": 0.80384, + "grad_norm": 0.5978842377662659, + "learning_rate": 4.8763505402160864e-05, + "loss": 0.75, + "step": 628 + }, + { + "epoch": 0.80512, + "grad_norm": 0.5923817157745361, + "learning_rate": 4.876150460184074e-05, + "loss": 0.6844, + "step": 629 + }, + { + "epoch": 0.8064, + "grad_norm": 0.604054868221283, + "learning_rate": 4.8759503801520615e-05, + "loss": 0.7224, + "step": 630 + }, + { + "epoch": 0.80768, + "grad_norm": 0.645494282245636, + "learning_rate": 4.875750300120048e-05, + "loss": 0.7912, + "step": 631 + }, + { + "epoch": 0.80896, + "grad_norm": 0.6240355968475342, + "learning_rate": 4.875550220088035e-05, + "loss": 0.7429, + "step": 632 + }, + { + "epoch": 0.81024, + "grad_norm": 0.6154593229293823, + "learning_rate": 4.875350140056022e-05, + "loss": 0.7693, + "step": 633 + }, + { + "epoch": 0.81152, + "grad_norm": 0.6306690573692322, + "learning_rate": 4.8751500600240095e-05, + "loss": 0.733, + "step": 634 + }, + { + "epoch": 0.8128, + "grad_norm": 0.5954400897026062, + "learning_rate": 4.874949979991997e-05, + "loss": 0.8013, + "step": 635 + 
}, + { + "epoch": 0.81408, + "grad_norm": 0.6127607226371765, + "learning_rate": 4.8747498999599846e-05, + "loss": 0.7676, + "step": 636 + }, + { + "epoch": 0.81536, + "grad_norm": 0.6720303893089294, + "learning_rate": 4.874549819927972e-05, + "loss": 0.7987, + "step": 637 + }, + { + "epoch": 0.81664, + "grad_norm": 0.5854519605636597, + "learning_rate": 4.874349739895959e-05, + "loss": 0.769, + "step": 638 + }, + { + "epoch": 0.81792, + "grad_norm": 0.6683208346366882, + "learning_rate": 4.8741496598639455e-05, + "loss": 0.7729, + "step": 639 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5927050709724426, + "learning_rate": 4.8739495798319326e-05, + "loss": 0.7671, + "step": 640 + }, + { + "epoch": 0.82048, + "grad_norm": 0.5779833793640137, + "learning_rate": 4.87374949979992e-05, + "loss": 0.6955, + "step": 641 + }, + { + "epoch": 0.82176, + "grad_norm": 0.6800764799118042, + "learning_rate": 4.873549419767907e-05, + "loss": 0.7995, + "step": 642 + }, + { + "epoch": 0.82304, + "grad_norm": 0.5779314041137695, + "learning_rate": 4.873349339735895e-05, + "loss": 0.7296, + "step": 643 + }, + { + "epoch": 0.82432, + "grad_norm": 0.5840075016021729, + "learning_rate": 4.873149259703882e-05, + "loss": 0.7412, + "step": 644 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5941234230995178, + "learning_rate": 4.872949179671869e-05, + "loss": 0.8138, + "step": 645 + }, + { + "epoch": 0.82688, + "grad_norm": 0.5719572305679321, + "learning_rate": 4.8727490996398564e-05, + "loss": 0.7713, + "step": 646 + }, + { + "epoch": 0.82816, + "grad_norm": 0.6207135915756226, + "learning_rate": 4.872549019607843e-05, + "loss": 0.7417, + "step": 647 + }, + { + "epoch": 0.82944, + "grad_norm": 0.6460603475570679, + "learning_rate": 4.87234893957583e-05, + "loss": 0.8659, + "step": 648 + }, + { + "epoch": 0.83072, + "grad_norm": 0.5850974321365356, + "learning_rate": 4.872148859543817e-05, + "loss": 0.7083, + "step": 649 + }, + { + "epoch": 0.832, + "grad_norm": 0.5269494652748108, + 
"learning_rate": 4.871948779511805e-05, + "loss": 0.6849, + "step": 650 + }, + { + "epoch": 0.83328, + "grad_norm": 0.6789783239364624, + "learning_rate": 4.8717486994797924e-05, + "loss": 0.8186, + "step": 651 + }, + { + "epoch": 0.83456, + "grad_norm": 0.5852586030960083, + "learning_rate": 4.8715486194477795e-05, + "loss": 0.759, + "step": 652 + }, + { + "epoch": 0.83584, + "grad_norm": 0.661586582660675, + "learning_rate": 4.871348539415767e-05, + "loss": 0.8003, + "step": 653 + }, + { + "epoch": 0.83712, + "grad_norm": 0.6112899780273438, + "learning_rate": 4.871148459383754e-05, + "loss": 0.7195, + "step": 654 + }, + { + "epoch": 0.8384, + "grad_norm": 0.6239529252052307, + "learning_rate": 4.8709483793517404e-05, + "loss": 0.6815, + "step": 655 + }, + { + "epoch": 0.83968, + "grad_norm": 0.6071779131889343, + "learning_rate": 4.8707482993197276e-05, + "loss": 0.6849, + "step": 656 + }, + { + "epoch": 0.84096, + "grad_norm": 0.6142399311065674, + "learning_rate": 4.8705482192877155e-05, + "loss": 0.7898, + "step": 657 + }, + { + "epoch": 0.84224, + "grad_norm": 0.5916489958763123, + "learning_rate": 4.8703481392557027e-05, + "loss": 0.7373, + "step": 658 + }, + { + "epoch": 0.84352, + "grad_norm": 0.6097525954246521, + "learning_rate": 4.87014805922369e-05, + "loss": 0.751, + "step": 659 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5901023149490356, + "learning_rate": 4.869947979191677e-05, + "loss": 0.6901, + "step": 660 + }, + { + "epoch": 0.84608, + "grad_norm": 0.5702456831932068, + "learning_rate": 4.869747899159664e-05, + "loss": 0.7518, + "step": 661 + }, + { + "epoch": 0.84736, + "grad_norm": 0.606547474861145, + "learning_rate": 4.8695478191276514e-05, + "loss": 0.7508, + "step": 662 + }, + { + "epoch": 0.84864, + "grad_norm": 0.6051300764083862, + "learning_rate": 4.869347739095638e-05, + "loss": 0.7659, + "step": 663 + }, + { + "epoch": 0.84992, + "grad_norm": 0.6050798296928406, + "learning_rate": 4.869147659063626e-05, + "loss": 0.7429, + "step": 
664 + }, + { + "epoch": 0.8512, + "grad_norm": 0.6163650751113892, + "learning_rate": 4.868947579031613e-05, + "loss": 0.7387, + "step": 665 + }, + { + "epoch": 0.85248, + "grad_norm": 0.6229680180549622, + "learning_rate": 4.8687474989996e-05, + "loss": 0.7673, + "step": 666 + }, + { + "epoch": 0.85376, + "grad_norm": 0.6289007663726807, + "learning_rate": 4.868547418967587e-05, + "loss": 0.738, + "step": 667 + }, + { + "epoch": 0.85504, + "grad_norm": 0.609407901763916, + "learning_rate": 4.8683473389355745e-05, + "loss": 0.7457, + "step": 668 + }, + { + "epoch": 0.85632, + "grad_norm": 0.6026242971420288, + "learning_rate": 4.868147258903562e-05, + "loss": 0.7617, + "step": 669 + }, + { + "epoch": 0.8576, + "grad_norm": 0.6227514147758484, + "learning_rate": 4.867947178871549e-05, + "loss": 0.7938, + "step": 670 + }, + { + "epoch": 0.85888, + "grad_norm": 0.6264092922210693, + "learning_rate": 4.867747098839536e-05, + "loss": 0.7826, + "step": 671 + }, + { + "epoch": 0.86016, + "grad_norm": 0.5935805439949036, + "learning_rate": 4.867547018807523e-05, + "loss": 0.7143, + "step": 672 + }, + { + "epoch": 0.86144, + "grad_norm": 0.6176136136054993, + "learning_rate": 4.8673469387755104e-05, + "loss": 0.7985, + "step": 673 + }, + { + "epoch": 0.86272, + "grad_norm": 0.6620816588401794, + "learning_rate": 4.8671468587434976e-05, + "loss": 0.7727, + "step": 674 + }, + { + "epoch": 0.864, + "grad_norm": 0.6603094935417175, + "learning_rate": 4.866946778711485e-05, + "loss": 0.7125, + "step": 675 + }, + { + "epoch": 0.86528, + "grad_norm": 0.6638785600662231, + "learning_rate": 4.866746698679472e-05, + "loss": 0.7305, + "step": 676 + }, + { + "epoch": 0.86656, + "grad_norm": 0.6319957375526428, + "learning_rate": 4.866546618647459e-05, + "loss": 0.7595, + "step": 677 + }, + { + "epoch": 0.86784, + "grad_norm": 0.5931079387664795, + "learning_rate": 4.8663465386154464e-05, + "loss": 0.7415, + "step": 678 + }, + { + "epoch": 0.86912, + "grad_norm": 0.6137109398841858, + 
"learning_rate": 4.8661464585834336e-05, + "loss": 0.7438, + "step": 679 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5985450744628906, + "learning_rate": 4.865946378551421e-05, + "loss": 0.7409, + "step": 680 + }, + { + "epoch": 0.87168, + "grad_norm": 0.62948077917099, + "learning_rate": 4.865746298519408e-05, + "loss": 0.8124, + "step": 681 + }, + { + "epoch": 0.87296, + "grad_norm": 0.5815073251724243, + "learning_rate": 4.865546218487395e-05, + "loss": 0.73, + "step": 682 + }, + { + "epoch": 0.87424, + "grad_norm": 0.5903521180152893, + "learning_rate": 4.865346138455382e-05, + "loss": 0.7071, + "step": 683 + }, + { + "epoch": 0.87552, + "grad_norm": 0.6045056581497192, + "learning_rate": 4.8651460584233695e-05, + "loss": 0.7571, + "step": 684 + }, + { + "epoch": 0.8768, + "grad_norm": 0.6277945041656494, + "learning_rate": 4.8649459783913573e-05, + "loss": 0.7475, + "step": 685 + }, + { + "epoch": 0.87808, + "grad_norm": 0.6049608588218689, + "learning_rate": 4.864745898359344e-05, + "loss": 0.7857, + "step": 686 + }, + { + "epoch": 0.87936, + "grad_norm": 0.6526109576225281, + "learning_rate": 4.864545818327331e-05, + "loss": 0.7536, + "step": 687 + }, + { + "epoch": 0.88064, + "grad_norm": 0.6064791679382324, + "learning_rate": 4.864345738295318e-05, + "loss": 0.7731, + "step": 688 + }, + { + "epoch": 0.88192, + "grad_norm": 0.5711452960968018, + "learning_rate": 4.8641456582633054e-05, + "loss": 0.7581, + "step": 689 + }, + { + "epoch": 0.8832, + "grad_norm": 0.603302538394928, + "learning_rate": 4.8639455782312926e-05, + "loss": 0.7323, + "step": 690 + }, + { + "epoch": 0.88448, + "grad_norm": 0.6006559729576111, + "learning_rate": 4.86374549819928e-05, + "loss": 0.7074, + "step": 691 + }, + { + "epoch": 0.88576, + "grad_norm": 0.5934480428695679, + "learning_rate": 4.8635454181672676e-05, + "loss": 0.73, + "step": 692 + }, + { + "epoch": 0.88704, + "grad_norm": 0.5922892093658447, + "learning_rate": 4.863345338135255e-05, + "loss": 0.7075, + "step": 693 
+ }, + { + "epoch": 0.88832, + "grad_norm": 0.5788107514381409, + "learning_rate": 4.8631452581032413e-05, + "loss": 0.7307, + "step": 694 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5999823808670044, + "learning_rate": 4.8629451780712285e-05, + "loss": 0.811, + "step": 695 + }, + { + "epoch": 0.89088, + "grad_norm": 0.6252598762512207, + "learning_rate": 4.862745098039216e-05, + "loss": 0.7221, + "step": 696 + }, + { + "epoch": 0.89216, + "grad_norm": 0.6279902458190918, + "learning_rate": 4.862545018007203e-05, + "loss": 0.7428, + "step": 697 + }, + { + "epoch": 0.89344, + "grad_norm": 0.6252185702323914, + "learning_rate": 4.86234493797519e-05, + "loss": 0.7786, + "step": 698 + }, + { + "epoch": 0.89472, + "grad_norm": 0.6247672438621521, + "learning_rate": 4.862144857943178e-05, + "loss": 0.7813, + "step": 699 + }, + { + "epoch": 0.896, + "grad_norm": 0.6344950199127197, + "learning_rate": 4.861944777911165e-05, + "loss": 0.7709, + "step": 700 + }, + { + "epoch": 0.89728, + "grad_norm": 0.6928293704986572, + "learning_rate": 4.861744697879152e-05, + "loss": 0.7902, + "step": 701 + }, + { + "epoch": 0.89856, + "grad_norm": 0.5995805859565735, + "learning_rate": 4.861544617847139e-05, + "loss": 0.7567, + "step": 702 + }, + { + "epoch": 0.89984, + "grad_norm": 0.6318543553352356, + "learning_rate": 4.861344537815126e-05, + "loss": 0.7079, + "step": 703 + }, + { + "epoch": 0.90112, + "grad_norm": 0.5997210144996643, + "learning_rate": 4.861144457783113e-05, + "loss": 0.7424, + "step": 704 + }, + { + "epoch": 0.9024, + "grad_norm": 0.6286918520927429, + "learning_rate": 4.8609443777511004e-05, + "loss": 0.7609, + "step": 705 + }, + { + "epoch": 0.90368, + "grad_norm": 0.5855825543403625, + "learning_rate": 4.860744297719088e-05, + "loss": 0.7097, + "step": 706 + }, + { + "epoch": 0.90496, + "grad_norm": 0.6897373795509338, + "learning_rate": 4.8605442176870754e-05, + "loss": 0.8089, + "step": 707 + }, + { + "epoch": 0.90624, + "grad_norm": 0.6260521411895752, + 
"learning_rate": 4.8603441376550626e-05, + "loss": 0.7375, + "step": 708 + }, + { + "epoch": 0.90752, + "grad_norm": 0.5700168013572693, + "learning_rate": 4.86014405762305e-05, + "loss": 0.7927, + "step": 709 + }, + { + "epoch": 0.9088, + "grad_norm": 0.6189404726028442, + "learning_rate": 4.859943977591036e-05, + "loss": 0.7137, + "step": 710 + }, + { + "epoch": 0.91008, + "grad_norm": 0.6204996109008789, + "learning_rate": 4.8597438975590235e-05, + "loss": 0.7494, + "step": 711 + }, + { + "epoch": 0.91136, + "grad_norm": 0.5846236944198608, + "learning_rate": 4.859543817527011e-05, + "loss": 0.7146, + "step": 712 + }, + { + "epoch": 0.91264, + "grad_norm": 0.6079381704330444, + "learning_rate": 4.8593437374949985e-05, + "loss": 0.7166, + "step": 713 + }, + { + "epoch": 0.91392, + "grad_norm": 0.6260196566581726, + "learning_rate": 4.859143657462986e-05, + "loss": 0.7639, + "step": 714 + }, + { + "epoch": 0.9152, + "grad_norm": 0.6194947361946106, + "learning_rate": 4.858943577430973e-05, + "loss": 0.7466, + "step": 715 + }, + { + "epoch": 0.91648, + "grad_norm": 0.5631256699562073, + "learning_rate": 4.85874349739896e-05, + "loss": 0.7732, + "step": 716 + }, + { + "epoch": 0.91776, + "grad_norm": 0.6009548306465149, + "learning_rate": 4.858543417366947e-05, + "loss": 0.7793, + "step": 717 + }, + { + "epoch": 0.91904, + "grad_norm": 0.6175373792648315, + "learning_rate": 4.858343337334934e-05, + "loss": 0.7672, + "step": 718 + }, + { + "epoch": 0.92032, + "grad_norm": 0.6090966463088989, + "learning_rate": 4.858143257302921e-05, + "loss": 0.7359, + "step": 719 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5752703547477722, + "learning_rate": 4.857943177270909e-05, + "loss": 0.7208, + "step": 720 + }, + { + "epoch": 0.92288, + "grad_norm": 0.5956680774688721, + "learning_rate": 4.857743097238896e-05, + "loss": 0.7028, + "step": 721 + }, + { + "epoch": 0.92416, + "grad_norm": 0.6232988238334656, + "learning_rate": 4.857543017206883e-05, + "loss": 0.759, + "step": 
722 + }, + { + "epoch": 0.92544, + "grad_norm": 0.6512808799743652, + "learning_rate": 4.8573429371748704e-05, + "loss": 0.7903, + "step": 723 + }, + { + "epoch": 0.92672, + "grad_norm": 0.6665694117546082, + "learning_rate": 4.8571428571428576e-05, + "loss": 0.7636, + "step": 724 + }, + { + "epoch": 0.928, + "grad_norm": 0.5766971111297607, + "learning_rate": 4.856942777110845e-05, + "loss": 0.7045, + "step": 725 + }, + { + "epoch": 0.92928, + "grad_norm": 0.610283613204956, + "learning_rate": 4.856742697078831e-05, + "loss": 0.7486, + "step": 726 + }, + { + "epoch": 0.93056, + "grad_norm": 0.6264128088951111, + "learning_rate": 4.856542617046819e-05, + "loss": 0.7867, + "step": 727 + }, + { + "epoch": 0.93184, + "grad_norm": 0.6030863523483276, + "learning_rate": 4.856342537014806e-05, + "loss": 0.75, + "step": 728 + }, + { + "epoch": 0.93312, + "grad_norm": 0.6077870726585388, + "learning_rate": 4.8561424569827935e-05, + "loss": 0.795, + "step": 729 + }, + { + "epoch": 0.9344, + "grad_norm": 0.554818332195282, + "learning_rate": 4.855942376950781e-05, + "loss": 0.7304, + "step": 730 + }, + { + "epoch": 0.93568, + "grad_norm": 0.5891968607902527, + "learning_rate": 4.855742296918768e-05, + "loss": 0.7437, + "step": 731 + }, + { + "epoch": 0.93696, + "grad_norm": 0.5956717133522034, + "learning_rate": 4.855542216886755e-05, + "loss": 0.7851, + "step": 732 + }, + { + "epoch": 0.93824, + "grad_norm": 0.6364977359771729, + "learning_rate": 4.855342136854742e-05, + "loss": 0.7926, + "step": 733 + }, + { + "epoch": 0.93952, + "grad_norm": 0.6908712387084961, + "learning_rate": 4.8551420568227294e-05, + "loss": 0.7565, + "step": 734 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5801886320114136, + "learning_rate": 4.8549419767907166e-05, + "loss": 0.7251, + "step": 735 + }, + { + "epoch": 0.94208, + "grad_norm": 0.6222661137580872, + "learning_rate": 4.854741896758704e-05, + "loss": 0.6952, + "step": 736 + }, + { + "epoch": 0.94336, + "grad_norm": 0.6288057565689087, + 
"learning_rate": 4.854541816726691e-05, + "loss": 0.7691, + "step": 737 + }, + { + "epoch": 0.94464, + "grad_norm": 0.6167927384376526, + "learning_rate": 4.854341736694678e-05, + "loss": 0.7479, + "step": 738 + }, + { + "epoch": 0.94592, + "grad_norm": 0.5813360214233398, + "learning_rate": 4.8541416566626654e-05, + "loss": 0.7309, + "step": 739 + }, + { + "epoch": 0.9472, + "grad_norm": 0.6290002465248108, + "learning_rate": 4.8539415766306526e-05, + "loss": 0.7725, + "step": 740 + }, + { + "epoch": 0.94848, + "grad_norm": 0.6278480291366577, + "learning_rate": 4.85374149659864e-05, + "loss": 0.762, + "step": 741 + }, + { + "epoch": 0.94976, + "grad_norm": 0.5977178812026978, + "learning_rate": 4.853541416566627e-05, + "loss": 0.7463, + "step": 742 + }, + { + "epoch": 0.95104, + "grad_norm": 0.6514163613319397, + "learning_rate": 4.853341336534614e-05, + "loss": 0.7873, + "step": 743 + }, + { + "epoch": 0.95232, + "grad_norm": 0.5944337844848633, + "learning_rate": 4.853141256502601e-05, + "loss": 0.7534, + "step": 744 + }, + { + "epoch": 0.9536, + "grad_norm": 0.605015218257904, + "learning_rate": 4.8529411764705885e-05, + "loss": 0.7595, + "step": 745 + }, + { + "epoch": 0.95488, + "grad_norm": 0.6433084607124329, + "learning_rate": 4.852741096438576e-05, + "loss": 0.8321, + "step": 746 + }, + { + "epoch": 0.95616, + "grad_norm": 0.5718684792518616, + "learning_rate": 4.852541016406563e-05, + "loss": 0.7344, + "step": 747 + }, + { + "epoch": 0.95744, + "grad_norm": 0.6268803477287292, + "learning_rate": 4.85234093637455e-05, + "loss": 0.772, + "step": 748 + }, + { + "epoch": 0.95872, + "grad_norm": 0.6136980056762695, + "learning_rate": 4.852140856342537e-05, + "loss": 0.7513, + "step": 749 + }, + { + "epoch": 0.96, + "grad_norm": 0.6517449617385864, + "learning_rate": 4.8519407763105244e-05, + "loss": 0.7786, + "step": 750 + }, + { + "epoch": 0.96128, + "grad_norm": 0.5884830355644226, + "learning_rate": 4.8517406962785116e-05, + "loss": 0.7912, + "step": 751 
+ }, + { + "epoch": 0.96256, + "grad_norm": 0.6458854675292969, + "learning_rate": 4.851540616246499e-05, + "loss": 0.7881, + "step": 752 + }, + { + "epoch": 0.96384, + "grad_norm": 0.6329476237297058, + "learning_rate": 4.851340536214486e-05, + "loss": 0.7819, + "step": 753 + }, + { + "epoch": 0.96512, + "grad_norm": 0.5884618163108826, + "learning_rate": 4.851140456182473e-05, + "loss": 0.7786, + "step": 754 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5475010871887207, + "learning_rate": 4.8509403761504603e-05, + "loss": 0.7172, + "step": 755 + }, + { + "epoch": 0.96768, + "grad_norm": 0.6388839483261108, + "learning_rate": 4.8507402961184475e-05, + "loss": 0.7659, + "step": 756 + }, + { + "epoch": 0.96896, + "grad_norm": 0.6201974153518677, + "learning_rate": 4.850540216086435e-05, + "loss": 0.7723, + "step": 757 + }, + { + "epoch": 0.97024, + "grad_norm": 0.6046715974807739, + "learning_rate": 4.850340136054422e-05, + "loss": 0.7486, + "step": 758 + }, + { + "epoch": 0.97152, + "grad_norm": 0.6070242524147034, + "learning_rate": 4.850140056022409e-05, + "loss": 0.763, + "step": 759 + }, + { + "epoch": 0.9728, + "grad_norm": 0.6031877994537354, + "learning_rate": 4.849939975990396e-05, + "loss": 0.7415, + "step": 760 + }, + { + "epoch": 0.97408, + "grad_norm": 0.5899202823638916, + "learning_rate": 4.8497398959583835e-05, + "loss": 0.7867, + "step": 761 + }, + { + "epoch": 0.97536, + "grad_norm": 0.6172879338264465, + "learning_rate": 4.8495398159263706e-05, + "loss": 0.6983, + "step": 762 + }, + { + "epoch": 0.97664, + "grad_norm": 0.639527440071106, + "learning_rate": 4.8493397358943585e-05, + "loss": 0.7358, + "step": 763 + }, + { + "epoch": 0.97792, + "grad_norm": 0.6101019382476807, + "learning_rate": 4.849139655862345e-05, + "loss": 0.7267, + "step": 764 + }, + { + "epoch": 0.9792, + "grad_norm": 0.6153104305267334, + "learning_rate": 4.848939575830332e-05, + "loss": 0.7648, + "step": 765 + }, + { + "epoch": 0.98048, + "grad_norm": 0.5717357993125916, + 
"learning_rate": 4.8487394957983194e-05, + "loss": 0.7197, + "step": 766 + }, + { + "epoch": 0.98176, + "grad_norm": 0.6005909442901611, + "learning_rate": 4.8485394157663066e-05, + "loss": 0.6786, + "step": 767 + }, + { + "epoch": 0.98304, + "grad_norm": 0.6198951005935669, + "learning_rate": 4.848339335734294e-05, + "loss": 0.759, + "step": 768 + }, + { + "epoch": 0.98432, + "grad_norm": 0.6164072155952454, + "learning_rate": 4.848139255702281e-05, + "loss": 0.757, + "step": 769 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5673977136611938, + "learning_rate": 4.847939175670269e-05, + "loss": 0.7746, + "step": 770 + }, + { + "epoch": 0.98688, + "grad_norm": 0.6161131262779236, + "learning_rate": 4.847739095638256e-05, + "loss": 0.77, + "step": 771 + }, + { + "epoch": 0.98816, + "grad_norm": 0.623842716217041, + "learning_rate": 4.8475390156062425e-05, + "loss": 0.7528, + "step": 772 + }, + { + "epoch": 0.98944, + "grad_norm": 0.6155294179916382, + "learning_rate": 4.84733893557423e-05, + "loss": 0.7517, + "step": 773 + }, + { + "epoch": 0.99072, + "grad_norm": 0.6041366457939148, + "learning_rate": 4.847138855542217e-05, + "loss": 0.7704, + "step": 774 + }, + { + "epoch": 0.992, + "grad_norm": 0.6505810618400574, + "learning_rate": 4.846938775510204e-05, + "loss": 0.8253, + "step": 775 + }, + { + "epoch": 0.99328, + "grad_norm": 0.6835885643959045, + "learning_rate": 4.846738695478191e-05, + "loss": 0.8475, + "step": 776 + }, + { + "epoch": 0.99456, + "grad_norm": 0.6628032922744751, + "learning_rate": 4.846538615446179e-05, + "loss": 0.7722, + "step": 777 + }, + { + "epoch": 0.99584, + "grad_norm": 0.5734715461730957, + "learning_rate": 4.846338535414166e-05, + "loss": 0.7254, + "step": 778 + }, + { + "epoch": 0.99712, + "grad_norm": 0.6373343467712402, + "learning_rate": 4.8461384553821535e-05, + "loss": 0.7545, + "step": 779 + }, + { + "epoch": 0.9984, + "grad_norm": 0.6454253196716309, + "learning_rate": 4.84593837535014e-05, + "loss": 0.7757, + "step": 780 + 
}, + { + "epoch": 0.99968, + "grad_norm": 0.6249713897705078, + "learning_rate": 4.845738295318127e-05, + "loss": 0.6912, + "step": 781 + }, + { + "epoch": 1.00096, + "grad_norm": 1.1785544157028198, + "learning_rate": 4.8455382152861144e-05, + "loss": 1.291, + "step": 782 + }, + { + "epoch": 1.00224, + "grad_norm": 0.6030226349830627, + "learning_rate": 4.8453381352541015e-05, + "loss": 0.794, + "step": 783 + }, + { + "epoch": 1.00352, + "grad_norm": 0.5952867865562439, + "learning_rate": 4.8451380552220894e-05, + "loss": 0.7109, + "step": 784 + }, + { + "epoch": 1.0048, + "grad_norm": 0.595658004283905, + "learning_rate": 4.8449379751900766e-05, + "loss": 0.7261, + "step": 785 + }, + { + "epoch": 1.00608, + "grad_norm": 0.6058535575866699, + "learning_rate": 4.844737895158064e-05, + "loss": 0.7573, + "step": 786 + }, + { + "epoch": 1.00736, + "grad_norm": 0.5743350982666016, + "learning_rate": 4.844537815126051e-05, + "loss": 0.6788, + "step": 787 + }, + { + "epoch": 1.00864, + "grad_norm": 0.6272056102752686, + "learning_rate": 4.8443377350940375e-05, + "loss": 0.7366, + "step": 788 + }, + { + "epoch": 1.00992, + "grad_norm": 0.6139172315597534, + "learning_rate": 4.8441376550620247e-05, + "loss": 0.7665, + "step": 789 + }, + { + "epoch": 1.0112, + "grad_norm": 0.6504907011985779, + "learning_rate": 4.843937575030012e-05, + "loss": 0.7647, + "step": 790 + }, + { + "epoch": 1.01248, + "grad_norm": 0.6235556602478027, + "learning_rate": 4.843737494998e-05, + "loss": 0.7775, + "step": 791 + }, + { + "epoch": 1.01376, + "grad_norm": 0.5893948078155518, + "learning_rate": 4.843537414965987e-05, + "loss": 0.7583, + "step": 792 + }, + { + "epoch": 1.01504, + "grad_norm": 0.6206098198890686, + "learning_rate": 4.843337334933974e-05, + "loss": 0.7281, + "step": 793 + }, + { + "epoch": 1.01632, + "grad_norm": 0.639717161655426, + "learning_rate": 4.843137254901961e-05, + "loss": 0.7478, + "step": 794 + }, + { + "epoch": 1.0176, + "grad_norm": 0.6055462956428528, + 
"learning_rate": 4.8429371748699484e-05, + "loss": 0.698, + "step": 795 + }, + { + "epoch": 1.01888, + "grad_norm": 0.6277878880500793, + "learning_rate": 4.842737094837935e-05, + "loss": 0.7602, + "step": 796 + }, + { + "epoch": 1.02016, + "grad_norm": 0.6001823544502258, + "learning_rate": 4.842537014805922e-05, + "loss": 0.6931, + "step": 797 + }, + { + "epoch": 1.02144, + "grad_norm": 0.6369767785072327, + "learning_rate": 4.84233693477391e-05, + "loss": 0.7378, + "step": 798 + }, + { + "epoch": 1.02272, + "grad_norm": 0.6533408761024475, + "learning_rate": 4.842136854741897e-05, + "loss": 0.7738, + "step": 799 + }, + { + "epoch": 1.024, + "grad_norm": 0.6154987812042236, + "learning_rate": 4.8419367747098844e-05, + "loss": 0.7081, + "step": 800 + }, + { + "epoch": 1.02528, + "grad_norm": 0.5977925062179565, + "learning_rate": 4.8417366946778716e-05, + "loss": 0.6907, + "step": 801 + }, + { + "epoch": 1.02656, + "grad_norm": 0.5921528935432434, + "learning_rate": 4.841536614645859e-05, + "loss": 0.7432, + "step": 802 + }, + { + "epoch": 1.02784, + "grad_norm": 0.5911890268325806, + "learning_rate": 4.841336534613846e-05, + "loss": 0.6892, + "step": 803 + }, + { + "epoch": 1.02912, + "grad_norm": 0.6107017993927002, + "learning_rate": 4.8411364545818324e-05, + "loss": 0.7641, + "step": 804 + }, + { + "epoch": 1.0304, + "grad_norm": 0.6192318797111511, + "learning_rate": 4.84093637454982e-05, + "loss": 0.7411, + "step": 805 + }, + { + "epoch": 1.03168, + "grad_norm": 0.5901237726211548, + "learning_rate": 4.8407362945178075e-05, + "loss": 0.7253, + "step": 806 + }, + { + "epoch": 1.03296, + "grad_norm": 0.6684927344322205, + "learning_rate": 4.840536214485795e-05, + "loss": 0.7597, + "step": 807 + }, + { + "epoch": 1.03424, + "grad_norm": 0.6069619059562683, + "learning_rate": 4.840336134453782e-05, + "loss": 0.709, + "step": 808 + }, + { + "epoch": 1.03552, + "grad_norm": 0.6064357757568359, + "learning_rate": 4.840136054421769e-05, + "loss": 0.7321, + "step": 
809 + }, + { + "epoch": 1.0368, + "grad_norm": 0.5926334261894226, + "learning_rate": 4.839935974389756e-05, + "loss": 0.7346, + "step": 810 + }, + { + "epoch": 1.03808, + "grad_norm": 0.6023288369178772, + "learning_rate": 4.8397358943577434e-05, + "loss": 0.7672, + "step": 811 + }, + { + "epoch": 1.03936, + "grad_norm": 0.6245213747024536, + "learning_rate": 4.8395358143257306e-05, + "loss": 0.7458, + "step": 812 + }, + { + "epoch": 1.04064, + "grad_norm": 0.601436197757721, + "learning_rate": 4.839335734293718e-05, + "loss": 0.7848, + "step": 813 + }, + { + "epoch": 1.04192, + "grad_norm": 0.6355320811271667, + "learning_rate": 4.839135654261705e-05, + "loss": 0.8105, + "step": 814 + }, + { + "epoch": 1.0432, + "grad_norm": 0.5975676774978638, + "learning_rate": 4.838935574229692e-05, + "loss": 0.692, + "step": 815 + }, + { + "epoch": 1.04448, + "grad_norm": 0.6143369674682617, + "learning_rate": 4.8387354941976793e-05, + "loss": 0.7551, + "step": 816 + }, + { + "epoch": 1.04576, + "grad_norm": 0.6289949417114258, + "learning_rate": 4.8385354141656665e-05, + "loss": 0.6912, + "step": 817 + }, + { + "epoch": 1.04704, + "grad_norm": 0.6380932331085205, + "learning_rate": 4.838335334133654e-05, + "loss": 0.6864, + "step": 818 + }, + { + "epoch": 1.04832, + "grad_norm": 0.6500244736671448, + "learning_rate": 4.838135254101641e-05, + "loss": 0.7205, + "step": 819 + }, + { + "epoch": 1.0496, + "grad_norm": 0.6350643038749695, + "learning_rate": 4.837935174069628e-05, + "loss": 0.7197, + "step": 820 + }, + { + "epoch": 1.05088, + "grad_norm": 0.6777990460395813, + "learning_rate": 4.837735094037615e-05, + "loss": 0.8366, + "step": 821 + }, + { + "epoch": 1.05216, + "grad_norm": 0.6251175999641418, + "learning_rate": 4.8375350140056025e-05, + "loss": 0.7125, + "step": 822 + }, + { + "epoch": 1.05344, + "grad_norm": 0.6193746328353882, + "learning_rate": 4.8373349339735896e-05, + "loss": 0.798, + "step": 823 + }, + { + "epoch": 1.05472, + "grad_norm": 0.6032739877700806, 
+ "learning_rate": 4.837134853941577e-05, + "loss": 0.7334, + "step": 824 + }, + { + "epoch": 1.056, + "grad_norm": 0.6197022199630737, + "learning_rate": 4.836934773909564e-05, + "loss": 0.737, + "step": 825 + }, + { + "epoch": 1.05728, + "grad_norm": 0.646630585193634, + "learning_rate": 4.836734693877551e-05, + "loss": 0.7796, + "step": 826 + }, + { + "epoch": 1.05856, + "grad_norm": 0.5807000398635864, + "learning_rate": 4.8365346138455384e-05, + "loss": 0.7328, + "step": 827 + }, + { + "epoch": 1.05984, + "grad_norm": 0.6248646974563599, + "learning_rate": 4.8363345338135256e-05, + "loss": 0.7364, + "step": 828 + }, + { + "epoch": 1.06112, + "grad_norm": 0.6000564098358154, + "learning_rate": 4.836134453781513e-05, + "loss": 0.6898, + "step": 829 + }, + { + "epoch": 1.0624, + "grad_norm": 0.603438138961792, + "learning_rate": 4.8359343737495e-05, + "loss": 0.7358, + "step": 830 + }, + { + "epoch": 1.06368, + "grad_norm": 0.605436384677887, + "learning_rate": 4.835734293717487e-05, + "loss": 0.7493, + "step": 831 + }, + { + "epoch": 1.06496, + "grad_norm": 0.6158525347709656, + "learning_rate": 4.835534213685474e-05, + "loss": 0.743, + "step": 832 + }, + { + "epoch": 1.06624, + "grad_norm": 0.6049724817276001, + "learning_rate": 4.835334133653462e-05, + "loss": 0.7252, + "step": 833 + }, + { + "epoch": 1.06752, + "grad_norm": 0.5752277970314026, + "learning_rate": 4.835134053621449e-05, + "loss": 0.6848, + "step": 834 + }, + { + "epoch": 1.0688, + "grad_norm": 0.6556239128112793, + "learning_rate": 4.834933973589436e-05, + "loss": 0.7814, + "step": 835 + }, + { + "epoch": 1.07008, + "grad_norm": 0.6204028129577637, + "learning_rate": 4.834733893557423e-05, + "loss": 0.7009, + "step": 836 + }, + { + "epoch": 1.07136, + "grad_norm": 0.5991467833518982, + "learning_rate": 4.83453381352541e-05, + "loss": 0.7251, + "step": 837 + }, + { + "epoch": 1.07264, + "grad_norm": 0.5856754779815674, + "learning_rate": 4.8343337334933974e-05, + "loss": 0.7455, + "step": 838 + 
}, + { + "epoch": 1.07392, + "grad_norm": 0.6454278826713562, + "learning_rate": 4.8341336534613846e-05, + "loss": 0.7808, + "step": 839 + }, + { + "epoch": 1.0752, + "grad_norm": 0.6240683197975159, + "learning_rate": 4.8339335734293725e-05, + "loss": 0.7066, + "step": 840 + }, + { + "epoch": 1.07648, + "grad_norm": 0.6424701809883118, + "learning_rate": 4.83373349339736e-05, + "loss": 0.7447, + "step": 841 + }, + { + "epoch": 1.07776, + "grad_norm": 0.5974248051643372, + "learning_rate": 4.833533413365346e-05, + "loss": 0.735, + "step": 842 + }, + { + "epoch": 1.07904, + "grad_norm": 0.6363813281059265, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.7302, + "step": 843 + }, + { + "epoch": 1.08032, + "grad_norm": 0.6119391918182373, + "learning_rate": 4.8331332533013205e-05, + "loss": 0.7222, + "step": 844 + }, + { + "epoch": 1.0816, + "grad_norm": 0.5887010097503662, + "learning_rate": 4.832933173269308e-05, + "loss": 0.7207, + "step": 845 + }, + { + "epoch": 1.08288, + "grad_norm": 0.6205199360847473, + "learning_rate": 4.832733093237295e-05, + "loss": 0.72, + "step": 846 + }, + { + "epoch": 1.08416, + "grad_norm": 0.570545494556427, + "learning_rate": 4.832533013205282e-05, + "loss": 0.7201, + "step": 847 + }, + { + "epoch": 1.08544, + "grad_norm": 0.6642616987228394, + "learning_rate": 4.83233293317327e-05, + "loss": 0.8064, + "step": 848 + }, + { + "epoch": 1.08672, + "grad_norm": 0.650346577167511, + "learning_rate": 4.832132853141257e-05, + "loss": 0.7717, + "step": 849 + }, + { + "epoch": 1.088, + "grad_norm": 0.5787684917449951, + "learning_rate": 4.831932773109244e-05, + "loss": 0.6377, + "step": 850 + }, + { + "epoch": 1.08928, + "grad_norm": 0.6291863322257996, + "learning_rate": 4.831732693077231e-05, + "loss": 0.7332, + "step": 851 + }, + { + "epoch": 1.09056, + "grad_norm": 0.6485406160354614, + "learning_rate": 4.831532613045218e-05, + "loss": 0.7445, + "step": 852 + }, + { + "epoch": 1.09184, + "grad_norm": 0.6354153156280518, + 
"learning_rate": 4.831332533013205e-05, + "loss": 0.7272, + "step": 853 + }, + { + "epoch": 1.09312, + "grad_norm": 0.5846593379974365, + "learning_rate": 4.8311324529811924e-05, + "loss": 0.7165, + "step": 854 + }, + { + "epoch": 1.0944, + "grad_norm": 0.6126893758773804, + "learning_rate": 4.83093237294918e-05, + "loss": 0.6913, + "step": 855 + }, + { + "epoch": 1.09568, + "grad_norm": 0.631846010684967, + "learning_rate": 4.8307322929171675e-05, + "loss": 0.7566, + "step": 856 + }, + { + "epoch": 1.09696, + "grad_norm": 0.6731998920440674, + "learning_rate": 4.8305322128851546e-05, + "loss": 0.7488, + "step": 857 + }, + { + "epoch": 1.09824, + "grad_norm": 0.6166826486587524, + "learning_rate": 4.830332132853141e-05, + "loss": 0.7271, + "step": 858 + }, + { + "epoch": 1.09952, + "grad_norm": 0.5922271013259888, + "learning_rate": 4.830132052821128e-05, + "loss": 0.6596, + "step": 859 + }, + { + "epoch": 1.1008, + "grad_norm": 0.6259810328483582, + "learning_rate": 4.8299319727891155e-05, + "loss": 0.7917, + "step": 860 + }, + { + "epoch": 1.10208, + "grad_norm": 0.6506845355033875, + "learning_rate": 4.829731892757103e-05, + "loss": 0.7029, + "step": 861 + }, + { + "epoch": 1.10336, + "grad_norm": 0.6502883434295654, + "learning_rate": 4.8295318127250906e-05, + "loss": 0.6832, + "step": 862 + }, + { + "epoch": 1.10464, + "grad_norm": 0.6381411552429199, + "learning_rate": 4.829331732693078e-05, + "loss": 0.7275, + "step": 863 + }, + { + "epoch": 1.10592, + "grad_norm": 0.629329264163971, + "learning_rate": 4.829131652661065e-05, + "loss": 0.7542, + "step": 864 + }, + { + "epoch": 1.1072, + "grad_norm": 0.6186816692352295, + "learning_rate": 4.828931572629052e-05, + "loss": 0.7195, + "step": 865 + }, + { + "epoch": 1.10848, + "grad_norm": 0.5985262393951416, + "learning_rate": 4.8287314925970386e-05, + "loss": 0.7651, + "step": 866 + }, + { + "epoch": 1.10976, + "grad_norm": 0.6066650152206421, + "learning_rate": 4.828531412565026e-05, + "loss": 0.7572, + "step": 
867 + }, + { + "epoch": 1.11104, + "grad_norm": 0.6421188712120056, + "learning_rate": 4.828331332533013e-05, + "loss": 0.7796, + "step": 868 + }, + { + "epoch": 1.11232, + "grad_norm": 0.6231709122657776, + "learning_rate": 4.828131252501001e-05, + "loss": 0.7564, + "step": 869 + }, + { + "epoch": 1.1136, + "grad_norm": 0.6402696967124939, + "learning_rate": 4.827931172468988e-05, + "loss": 0.7095, + "step": 870 + }, + { + "epoch": 1.11488, + "grad_norm": 0.5962916612625122, + "learning_rate": 4.827731092436975e-05, + "loss": 0.7085, + "step": 871 + }, + { + "epoch": 1.11616, + "grad_norm": 0.6202763319015503, + "learning_rate": 4.8275310124049624e-05, + "loss": 0.7063, + "step": 872 + }, + { + "epoch": 1.11744, + "grad_norm": 0.626986026763916, + "learning_rate": 4.8273309323729496e-05, + "loss": 0.7705, + "step": 873 + }, + { + "epoch": 1.11872, + "grad_norm": 0.6388139724731445, + "learning_rate": 4.827130852340936e-05, + "loss": 0.754, + "step": 874 + }, + { + "epoch": 1.12, + "grad_norm": 0.6084983348846436, + "learning_rate": 4.826930772308923e-05, + "loss": 0.7012, + "step": 875 + }, + { + "epoch": 1.12128, + "grad_norm": 0.6018005609512329, + "learning_rate": 4.826730692276911e-05, + "loss": 0.7186, + "step": 876 + }, + { + "epoch": 1.12256, + "grad_norm": 0.6124555468559265, + "learning_rate": 4.8265306122448984e-05, + "loss": 0.7023, + "step": 877 + }, + { + "epoch": 1.12384, + "grad_norm": 0.610120415687561, + "learning_rate": 4.8263305322128855e-05, + "loss": 0.6897, + "step": 878 + }, + { + "epoch": 1.12512, + "grad_norm": 0.6392123103141785, + "learning_rate": 4.826130452180873e-05, + "loss": 0.7632, + "step": 879 + }, + { + "epoch": 1.1264, + "grad_norm": 0.6437667608261108, + "learning_rate": 4.82593037214886e-05, + "loss": 0.8141, + "step": 880 + }, + { + "epoch": 1.12768, + "grad_norm": 0.6260906457901001, + "learning_rate": 4.825730292116847e-05, + "loss": 0.7017, + "step": 881 + }, + { + "epoch": 1.12896, + "grad_norm": 0.6186864972114563, + 
"learning_rate": 4.8255302120848336e-05, + "loss": 0.6796, + "step": 882 + }, + { + "epoch": 1.13024, + "grad_norm": 0.6303755044937134, + "learning_rate": 4.8253301320528215e-05, + "loss": 0.7379, + "step": 883 + }, + { + "epoch": 1.13152, + "grad_norm": 0.5899752378463745, + "learning_rate": 4.8251300520208087e-05, + "loss": 0.7175, + "step": 884 + }, + { + "epoch": 1.1328, + "grad_norm": 0.5789473652839661, + "learning_rate": 4.824929971988796e-05, + "loss": 0.6513, + "step": 885 + }, + { + "epoch": 1.13408, + "grad_norm": 0.6217113733291626, + "learning_rate": 4.824729891956783e-05, + "loss": 0.7529, + "step": 886 + }, + { + "epoch": 1.13536, + "grad_norm": 0.6412451267242432, + "learning_rate": 4.82452981192477e-05, + "loss": 0.7977, + "step": 887 + }, + { + "epoch": 1.13664, + "grad_norm": 0.6278442144393921, + "learning_rate": 4.8243297318927574e-05, + "loss": 0.7078, + "step": 888 + }, + { + "epoch": 1.13792, + "grad_norm": 0.6618484854698181, + "learning_rate": 4.8241296518607446e-05, + "loss": 0.7483, + "step": 889 + }, + { + "epoch": 1.1392, + "grad_norm": 0.603378176689148, + "learning_rate": 4.823929571828732e-05, + "loss": 0.7063, + "step": 890 + }, + { + "epoch": 1.14048, + "grad_norm": 0.6106945276260376, + "learning_rate": 4.823729491796719e-05, + "loss": 0.7299, + "step": 891 + }, + { + "epoch": 1.14176, + "grad_norm": 0.6208406090736389, + "learning_rate": 4.823529411764706e-05, + "loss": 0.7362, + "step": 892 + }, + { + "epoch": 1.14304, + "grad_norm": 0.63667231798172, + "learning_rate": 4.823329331732693e-05, + "loss": 0.744, + "step": 893 + }, + { + "epoch": 1.14432, + "grad_norm": 0.5953905582427979, + "learning_rate": 4.8231292517006805e-05, + "loss": 0.7047, + "step": 894 + }, + { + "epoch": 1.1456, + "grad_norm": 0.6081019043922424, + "learning_rate": 4.822929171668668e-05, + "loss": 0.7546, + "step": 895 + }, + { + "epoch": 1.14688, + "grad_norm": 0.6807315349578857, + "learning_rate": 4.822729091636655e-05, + "loss": 0.7761, + "step": 
896 + }, + { + "epoch": 1.14816, + "grad_norm": 0.6396183967590332, + "learning_rate": 4.822529011604642e-05, + "loss": 0.7651, + "step": 897 + }, + { + "epoch": 1.14944, + "grad_norm": 0.6671329736709595, + "learning_rate": 4.822328931572629e-05, + "loss": 0.8052, + "step": 898 + }, + { + "epoch": 1.15072, + "grad_norm": 0.6230823397636414, + "learning_rate": 4.8221288515406164e-05, + "loss": 0.7748, + "step": 899 + }, + { + "epoch": 1.152, + "grad_norm": 0.6346234083175659, + "learning_rate": 4.8219287715086036e-05, + "loss": 0.7335, + "step": 900 + }, + { + "epoch": 1.15328, + "grad_norm": 0.6477737426757812, + "learning_rate": 4.821728691476591e-05, + "loss": 0.7561, + "step": 901 + }, + { + "epoch": 1.15456, + "grad_norm": 0.6157343983650208, + "learning_rate": 4.821528611444578e-05, + "loss": 0.7422, + "step": 902 + }, + { + "epoch": 1.15584, + "grad_norm": 0.6227743625640869, + "learning_rate": 4.821328531412565e-05, + "loss": 0.7081, + "step": 903 + }, + { + "epoch": 1.15712, + "grad_norm": 0.6442050337791443, + "learning_rate": 4.8211284513805524e-05, + "loss": 0.7545, + "step": 904 + }, + { + "epoch": 1.1584, + "grad_norm": 0.5827674865722656, + "learning_rate": 4.8209283713485396e-05, + "loss": 0.6298, + "step": 905 + }, + { + "epoch": 1.15968, + "grad_norm": 0.6147791147232056, + "learning_rate": 4.820728291316527e-05, + "loss": 0.7058, + "step": 906 + }, + { + "epoch": 1.16096, + "grad_norm": 0.6040810942649841, + "learning_rate": 4.820528211284514e-05, + "loss": 0.7474, + "step": 907 + }, + { + "epoch": 1.16224, + "grad_norm": 0.6344786286354065, + "learning_rate": 4.820328131252501e-05, + "loss": 0.732, + "step": 908 + }, + { + "epoch": 1.16352, + "grad_norm": 0.6694657802581787, + "learning_rate": 4.820128051220488e-05, + "loss": 0.7453, + "step": 909 + }, + { + "epoch": 1.1648, + "grad_norm": 0.6136607527732849, + "learning_rate": 4.8199279711884755e-05, + "loss": 0.7458, + "step": 910 + }, + { + "epoch": 1.16608, + "grad_norm": 0.6418918371200562, 
+ "learning_rate": 4.8197278911564633e-05, + "loss": 0.723, + "step": 911 + }, + { + "epoch": 1.16736, + "grad_norm": 0.6443724036216736, + "learning_rate": 4.81952781112445e-05, + "loss": 0.6964, + "step": 912 + }, + { + "epoch": 1.16864, + "grad_norm": 0.6705388426780701, + "learning_rate": 4.819327731092437e-05, + "loss": 0.7893, + "step": 913 + }, + { + "epoch": 1.16992, + "grad_norm": 0.6297101974487305, + "learning_rate": 4.819127651060424e-05, + "loss": 0.7399, + "step": 914 + }, + { + "epoch": 1.1712, + "grad_norm": 0.6698152422904968, + "learning_rate": 4.8189275710284114e-05, + "loss": 0.7513, + "step": 915 + }, + { + "epoch": 1.17248, + "grad_norm": 0.6113006472587585, + "learning_rate": 4.8187274909963986e-05, + "loss": 0.7495, + "step": 916 + }, + { + "epoch": 1.17376, + "grad_norm": 0.6909430027008057, + "learning_rate": 4.818527410964386e-05, + "loss": 0.795, + "step": 917 + }, + { + "epoch": 1.17504, + "grad_norm": 0.6630053520202637, + "learning_rate": 4.8183273309323736e-05, + "loss": 0.7305, + "step": 918 + }, + { + "epoch": 1.17632, + "grad_norm": 0.6622920036315918, + "learning_rate": 4.818127250900361e-05, + "loss": 0.7672, + "step": 919 + }, + { + "epoch": 1.1776, + "grad_norm": 0.6172346472740173, + "learning_rate": 4.817927170868347e-05, + "loss": 0.6956, + "step": 920 + }, + { + "epoch": 1.17888, + "grad_norm": 0.6722678542137146, + "learning_rate": 4.8177270908363345e-05, + "loss": 0.7633, + "step": 921 + }, + { + "epoch": 1.1801599999999999, + "grad_norm": 0.6573194861412048, + "learning_rate": 4.817527010804322e-05, + "loss": 0.7313, + "step": 922 + }, + { + "epoch": 1.18144, + "grad_norm": 0.598237931728363, + "learning_rate": 4.817326930772309e-05, + "loss": 0.6963, + "step": 923 + }, + { + "epoch": 1.18272, + "grad_norm": 0.6254197359085083, + "learning_rate": 4.817126850740296e-05, + "loss": 0.727, + "step": 924 + }, + { + "epoch": 1.184, + "grad_norm": 0.6584347486495972, + "learning_rate": 4.816926770708284e-05, + "loss": 0.7281, 
+ "step": 925 + }, + { + "epoch": 1.1852800000000001, + "grad_norm": 0.6371662020683289, + "learning_rate": 4.816726690676271e-05, + "loss": 0.7286, + "step": 926 + }, + { + "epoch": 1.18656, + "grad_norm": 0.6230917572975159, + "learning_rate": 4.816526610644258e-05, + "loss": 0.725, + "step": 927 + }, + { + "epoch": 1.18784, + "grad_norm": 0.613785445690155, + "learning_rate": 4.816326530612245e-05, + "loss": 0.7596, + "step": 928 + }, + { + "epoch": 1.18912, + "grad_norm": 0.6123350262641907, + "learning_rate": 4.816126450580232e-05, + "loss": 0.7271, + "step": 929 + }, + { + "epoch": 1.1904, + "grad_norm": 0.604227602481842, + "learning_rate": 4.815926370548219e-05, + "loss": 0.72, + "step": 930 + }, + { + "epoch": 1.19168, + "grad_norm": 0.6275907754898071, + "learning_rate": 4.8157262905162064e-05, + "loss": 0.6766, + "step": 931 + }, + { + "epoch": 1.19296, + "grad_norm": 0.6855752468109131, + "learning_rate": 4.815526210484194e-05, + "loss": 0.7628, + "step": 932 + }, + { + "epoch": 1.19424, + "grad_norm": 0.6172601580619812, + "learning_rate": 4.8153261304521814e-05, + "loss": 0.7254, + "step": 933 + }, + { + "epoch": 1.19552, + "grad_norm": 0.6443458795547485, + "learning_rate": 4.8151260504201686e-05, + "loss": 0.7489, + "step": 934 + }, + { + "epoch": 1.1968, + "grad_norm": 0.6204928755760193, + "learning_rate": 4.814925970388156e-05, + "loss": 0.7606, + "step": 935 + }, + { + "epoch": 1.19808, + "grad_norm": 0.6195954084396362, + "learning_rate": 4.814725890356142e-05, + "loss": 0.7121, + "step": 936 + }, + { + "epoch": 1.19936, + "grad_norm": 0.6397810578346252, + "learning_rate": 4.8145258103241295e-05, + "loss": 0.7392, + "step": 937 + }, + { + "epoch": 1.20064, + "grad_norm": 0.6325953602790833, + "learning_rate": 4.814325730292117e-05, + "loss": 0.7099, + "step": 938 + }, + { + "epoch": 1.2019199999999999, + "grad_norm": 0.5596916079521179, + "learning_rate": 4.8141256502601045e-05, + "loss": 0.6416, + "step": 939 + }, + { + "epoch": 1.2032, + 
"grad_norm": 0.6255161762237549, + "learning_rate": 4.813925570228092e-05, + "loss": 0.6897, + "step": 940 + }, + { + "epoch": 1.20448, + "grad_norm": 0.6093683242797852, + "learning_rate": 4.813725490196079e-05, + "loss": 0.7421, + "step": 941 + }, + { + "epoch": 1.20576, + "grad_norm": 0.6216311454772949, + "learning_rate": 4.813525410164066e-05, + "loss": 0.6835, + "step": 942 + }, + { + "epoch": 1.2070400000000001, + "grad_norm": 0.671823263168335, + "learning_rate": 4.813325330132053e-05, + "loss": 0.7734, + "step": 943 + }, + { + "epoch": 1.20832, + "grad_norm": 0.7053853869438171, + "learning_rate": 4.81312525010004e-05, + "loss": 0.7447, + "step": 944 + }, + { + "epoch": 1.2096, + "grad_norm": 0.5679812431335449, + "learning_rate": 4.812925170068027e-05, + "loss": 0.6654, + "step": 945 + }, + { + "epoch": 1.21088, + "grad_norm": 0.5954858660697937, + "learning_rate": 4.812725090036015e-05, + "loss": 0.6828, + "step": 946 + }, + { + "epoch": 1.21216, + "grad_norm": 0.6449232697486877, + "learning_rate": 4.812525010004002e-05, + "loss": 0.7228, + "step": 947 + }, + { + "epoch": 1.21344, + "grad_norm": 0.6288172006607056, + "learning_rate": 4.812324929971989e-05, + "loss": 0.7043, + "step": 948 + }, + { + "epoch": 1.21472, + "grad_norm": 0.6665524244308472, + "learning_rate": 4.8121248499399764e-05, + "loss": 0.7963, + "step": 949 + }, + { + "epoch": 1.216, + "grad_norm": 0.6399210095405579, + "learning_rate": 4.8119247699079636e-05, + "loss": 0.7559, + "step": 950 + }, + { + "epoch": 1.21728, + "grad_norm": 0.611251711845398, + "learning_rate": 4.811724689875951e-05, + "loss": 0.7279, + "step": 951 + }, + { + "epoch": 1.21856, + "grad_norm": 0.6060947775840759, + "learning_rate": 4.811524609843937e-05, + "loss": 0.7066, + "step": 952 + }, + { + "epoch": 1.21984, + "grad_norm": 0.660533607006073, + "learning_rate": 4.811324529811925e-05, + "loss": 0.7361, + "step": 953 + }, + { + "epoch": 1.22112, + "grad_norm": 0.6510452032089233, + "learning_rate": 
4.811124449779912e-05, + "loss": 0.7185, + "step": 954 + }, + { + "epoch": 1.2224, + "grad_norm": 0.649373471736908, + "learning_rate": 4.8109243697478995e-05, + "loss": 0.7748, + "step": 955 + }, + { + "epoch": 1.2236799999999999, + "grad_norm": 0.6497557759284973, + "learning_rate": 4.810724289715887e-05, + "loss": 0.7367, + "step": 956 + }, + { + "epoch": 1.22496, + "grad_norm": 0.6782010197639465, + "learning_rate": 4.810524209683874e-05, + "loss": 0.6939, + "step": 957 + }, + { + "epoch": 1.22624, + "grad_norm": 0.647323727607727, + "learning_rate": 4.810324129651861e-05, + "loss": 0.6888, + "step": 958 + }, + { + "epoch": 1.22752, + "grad_norm": 0.6662117838859558, + "learning_rate": 4.810124049619848e-05, + "loss": 0.7344, + "step": 959 + }, + { + "epoch": 1.2288000000000001, + "grad_norm": 0.6221624612808228, + "learning_rate": 4.809923969587835e-05, + "loss": 0.7277, + "step": 960 + }, + { + "epoch": 1.23008, + "grad_norm": 0.6320923566818237, + "learning_rate": 4.8097238895558226e-05, + "loss": 0.7305, + "step": 961 + }, + { + "epoch": 1.23136, + "grad_norm": 0.6177527904510498, + "learning_rate": 4.80952380952381e-05, + "loss": 0.7247, + "step": 962 + }, + { + "epoch": 1.23264, + "grad_norm": 0.6447619199752808, + "learning_rate": 4.809323729491797e-05, + "loss": 0.739, + "step": 963 + }, + { + "epoch": 1.23392, + "grad_norm": 0.6220279335975647, + "learning_rate": 4.809123649459784e-05, + "loss": 0.7493, + "step": 964 + }, + { + "epoch": 1.2352, + "grad_norm": 0.6294159889221191, + "learning_rate": 4.8089235694277714e-05, + "loss": 0.7178, + "step": 965 + }, + { + "epoch": 1.23648, + "grad_norm": 0.6304751634597778, + "learning_rate": 4.8087234893957586e-05, + "loss": 0.7529, + "step": 966 + }, + { + "epoch": 1.23776, + "grad_norm": 0.654147744178772, + "learning_rate": 4.808523409363746e-05, + "loss": 0.7369, + "step": 967 + }, + { + "epoch": 1.23904, + "grad_norm": 0.6700395345687866, + "learning_rate": 4.808323329331733e-05, + "loss": 0.7395, + 
"step": 968 + }, + { + "epoch": 1.24032, + "grad_norm": 0.6803591251373291, + "learning_rate": 4.80812324929972e-05, + "loss": 0.815, + "step": 969 + }, + { + "epoch": 1.2416, + "grad_norm": 0.6240243911743164, + "learning_rate": 4.807923169267707e-05, + "loss": 0.7645, + "step": 970 + }, + { + "epoch": 1.24288, + "grad_norm": 0.6563379764556885, + "learning_rate": 4.8077230892356945e-05, + "loss": 0.7685, + "step": 971 + }, + { + "epoch": 1.24416, + "grad_norm": 0.6459086537361145, + "learning_rate": 4.807523009203682e-05, + "loss": 0.7109, + "step": 972 + }, + { + "epoch": 1.2454399999999999, + "grad_norm": 0.6518861651420593, + "learning_rate": 4.807322929171669e-05, + "loss": 0.7225, + "step": 973 + }, + { + "epoch": 1.24672, + "grad_norm": 0.6392287015914917, + "learning_rate": 4.807122849139656e-05, + "loss": 0.7454, + "step": 974 + }, + { + "epoch": 1.248, + "grad_norm": 0.639618456363678, + "learning_rate": 4.806922769107643e-05, + "loss": 0.7345, + "step": 975 + }, + { + "epoch": 1.24928, + "grad_norm": 0.6422220468521118, + "learning_rate": 4.8067226890756304e-05, + "loss": 0.7571, + "step": 976 + }, + { + "epoch": 1.2505600000000001, + "grad_norm": 0.6119744777679443, + "learning_rate": 4.8065226090436176e-05, + "loss": 0.7367, + "step": 977 + }, + { + "epoch": 1.25184, + "grad_norm": 0.6147500872612, + "learning_rate": 4.806322529011605e-05, + "loss": 0.6999, + "step": 978 + }, + { + "epoch": 1.25312, + "grad_norm": 0.6690681576728821, + "learning_rate": 4.806122448979592e-05, + "loss": 0.7657, + "step": 979 + }, + { + "epoch": 1.2544, + "grad_norm": 0.6589951515197754, + "learning_rate": 4.805922368947579e-05, + "loss": 0.6975, + "step": 980 + }, + { + "epoch": 1.25568, + "grad_norm": 0.6869469285011292, + "learning_rate": 4.8057222889155663e-05, + "loss": 0.7403, + "step": 981 + }, + { + "epoch": 1.25696, + "grad_norm": 0.6274691224098206, + "learning_rate": 4.805522208883554e-05, + "loss": 0.7046, + "step": 982 + }, + { + "epoch": 1.25824, + 
"grad_norm": 0.6332339644432068, + "learning_rate": 4.805322128851541e-05, + "loss": 0.7729, + "step": 983 + }, + { + "epoch": 1.25952, + "grad_norm": 0.6267856955528259, + "learning_rate": 4.805122048819528e-05, + "loss": 0.75, + "step": 984 + }, + { + "epoch": 1.2608, + "grad_norm": 0.6330248117446899, + "learning_rate": 4.804921968787515e-05, + "loss": 0.7085, + "step": 985 + }, + { + "epoch": 1.26208, + "grad_norm": 0.6453738212585449, + "learning_rate": 4.804721888755502e-05, + "loss": 0.7433, + "step": 986 + }, + { + "epoch": 1.26336, + "grad_norm": 0.6724556684494019, + "learning_rate": 4.8045218087234895e-05, + "loss": 0.7734, + "step": 987 + }, + { + "epoch": 1.26464, + "grad_norm": 0.6251585483551025, + "learning_rate": 4.8043217286914766e-05, + "loss": 0.6533, + "step": 988 + }, + { + "epoch": 1.26592, + "grad_norm": 0.6305882930755615, + "learning_rate": 4.8041216486594645e-05, + "loss": 0.7489, + "step": 989 + }, + { + "epoch": 1.2671999999999999, + "grad_norm": 0.6422156095504761, + "learning_rate": 4.803921568627452e-05, + "loss": 0.7583, + "step": 990 + }, + { + "epoch": 1.26848, + "grad_norm": 0.6040436029434204, + "learning_rate": 4.803721488595438e-05, + "loss": 0.6738, + "step": 991 + }, + { + "epoch": 1.26976, + "grad_norm": 0.6677682399749756, + "learning_rate": 4.8035214085634254e-05, + "loss": 0.7181, + "step": 992 + }, + { + "epoch": 1.27104, + "grad_norm": 0.6229025721549988, + "learning_rate": 4.8033213285314126e-05, + "loss": 0.6651, + "step": 993 + }, + { + "epoch": 1.2723200000000001, + "grad_norm": 0.6611243486404419, + "learning_rate": 4.8031212484994e-05, + "loss": 0.787, + "step": 994 + }, + { + "epoch": 1.2736, + "grad_norm": 0.6305986642837524, + "learning_rate": 4.802921168467387e-05, + "loss": 0.763, + "step": 995 + }, + { + "epoch": 1.27488, + "grad_norm": 0.6216512322425842, + "learning_rate": 4.802721088435375e-05, + "loss": 0.7323, + "step": 996 + }, + { + "epoch": 1.27616, + "grad_norm": 0.6225135922431946, + 
"learning_rate": 4.802521008403362e-05, + "loss": 0.7429, + "step": 997 + }, + { + "epoch": 1.27744, + "grad_norm": 0.6232655048370361, + "learning_rate": 4.802320928371349e-05, + "loss": 0.7163, + "step": 998 + }, + { + "epoch": 1.27872, + "grad_norm": 0.585407555103302, + "learning_rate": 4.802120848339336e-05, + "loss": 0.6991, + "step": 999 + }, + { + "epoch": 1.28, + "grad_norm": 0.6161494255065918, + "learning_rate": 4.801920768307323e-05, + "loss": 0.7389, + "step": 1000 + }, + { + "epoch": 1.28128, + "grad_norm": 0.5975562930107117, + "learning_rate": 4.80172068827531e-05, + "loss": 0.7076, + "step": 1001 + }, + { + "epoch": 1.28256, + "grad_norm": 0.6090825200080872, + "learning_rate": 4.801520608243297e-05, + "loss": 0.724, + "step": 1002 + }, + { + "epoch": 1.28384, + "grad_norm": 0.6444687843322754, + "learning_rate": 4.801320528211285e-05, + "loss": 0.7562, + "step": 1003 + }, + { + "epoch": 1.28512, + "grad_norm": 0.6110180616378784, + "learning_rate": 4.801120448179272e-05, + "loss": 0.6253, + "step": 1004 + }, + { + "epoch": 1.2864, + "grad_norm": 0.683491051197052, + "learning_rate": 4.8009203681472595e-05, + "loss": 0.794, + "step": 1005 + }, + { + "epoch": 1.28768, + "grad_norm": 0.6164990663528442, + "learning_rate": 4.8007202881152467e-05, + "loss": 0.6802, + "step": 1006 + }, + { + "epoch": 1.2889599999999999, + "grad_norm": 0.6573185920715332, + "learning_rate": 4.800520208083233e-05, + "loss": 0.7123, + "step": 1007 + }, + { + "epoch": 1.29024, + "grad_norm": 0.6175433993339539, + "learning_rate": 4.8003201280512204e-05, + "loss": 0.6906, + "step": 1008 + }, + { + "epoch": 1.29152, + "grad_norm": 0.6465235948562622, + "learning_rate": 4.8001200480192075e-05, + "loss": 0.6843, + "step": 1009 + }, + { + "epoch": 1.2928, + "grad_norm": 0.617728590965271, + "learning_rate": 4.7999199679871954e-05, + "loss": 0.7187, + "step": 1010 + }, + { + "epoch": 1.2940800000000001, + "grad_norm": 0.660306990146637, + "learning_rate": 4.7997198879551826e-05, 
+ "loss": 0.7547, + "step": 1011 + }, + { + "epoch": 1.29536, + "grad_norm": 0.6021302938461304, + "learning_rate": 4.79951980792317e-05, + "loss": 0.6895, + "step": 1012 + }, + { + "epoch": 1.29664, + "grad_norm": 0.6052932143211365, + "learning_rate": 4.799319727891157e-05, + "loss": 0.7183, + "step": 1013 + }, + { + "epoch": 1.29792, + "grad_norm": 0.6130071878433228, + "learning_rate": 4.799119647859144e-05, + "loss": 0.7347, + "step": 1014 + }, + { + "epoch": 1.2992, + "grad_norm": 0.659451961517334, + "learning_rate": 4.7989195678271307e-05, + "loss": 0.7356, + "step": 1015 + }, + { + "epoch": 1.30048, + "grad_norm": 0.6528733372688293, + "learning_rate": 4.798719487795118e-05, + "loss": 0.6957, + "step": 1016 + }, + { + "epoch": 1.30176, + "grad_norm": 0.6614980697631836, + "learning_rate": 4.798519407763106e-05, + "loss": 0.7466, + "step": 1017 + }, + { + "epoch": 1.30304, + "grad_norm": 0.6211322546005249, + "learning_rate": 4.798319327731093e-05, + "loss": 0.7367, + "step": 1018 + }, + { + "epoch": 1.30432, + "grad_norm": 0.6198946833610535, + "learning_rate": 4.79811924769908e-05, + "loss": 0.7108, + "step": 1019 + }, + { + "epoch": 1.3056, + "grad_norm": 0.6257739067077637, + "learning_rate": 4.797919167667067e-05, + "loss": 0.7444, + "step": 1020 + }, + { + "epoch": 1.30688, + "grad_norm": 0.6333909630775452, + "learning_rate": 4.7977190876350544e-05, + "loss": 0.6794, + "step": 1021 + }, + { + "epoch": 1.30816, + "grad_norm": 0.6291680335998535, + "learning_rate": 4.7975190076030416e-05, + "loss": 0.7711, + "step": 1022 + }, + { + "epoch": 1.30944, + "grad_norm": 0.6223838925361633, + "learning_rate": 4.797318927571028e-05, + "loss": 0.7198, + "step": 1023 + }, + { + "epoch": 1.3107199999999999, + "grad_norm": 0.6127943992614746, + "learning_rate": 4.797118847539016e-05, + "loss": 0.6988, + "step": 1024 + }, + { + "epoch": 1.312, + "grad_norm": 0.6955495476722717, + "learning_rate": 4.796918767507003e-05, + "loss": 0.7987, + "step": 1025 + }, + { + 
"epoch": 1.31328, + "grad_norm": 0.6549299359321594, + "learning_rate": 4.7967186874749904e-05, + "loss": 0.7838, + "step": 1026 + }, + { + "epoch": 1.31456, + "grad_norm": 0.5967473387718201, + "learning_rate": 4.7965186074429776e-05, + "loss": 0.7516, + "step": 1027 + }, + { + "epoch": 1.3158400000000001, + "grad_norm": 0.6326315999031067, + "learning_rate": 4.796318527410965e-05, + "loss": 0.7809, + "step": 1028 + }, + { + "epoch": 1.31712, + "grad_norm": 0.6114999651908875, + "learning_rate": 4.796118447378952e-05, + "loss": 0.7084, + "step": 1029 + }, + { + "epoch": 1.3184, + "grad_norm": 0.6324224472045898, + "learning_rate": 4.795918367346939e-05, + "loss": 0.7194, + "step": 1030 + }, + { + "epoch": 1.31968, + "grad_norm": 0.6108016967773438, + "learning_rate": 4.795718287314926e-05, + "loss": 0.7008, + "step": 1031 + }, + { + "epoch": 1.32096, + "grad_norm": 0.6326055526733398, + "learning_rate": 4.7955182072829135e-05, + "loss": 0.747, + "step": 1032 + }, + { + "epoch": 1.32224, + "grad_norm": 0.5859755277633667, + "learning_rate": 4.795318127250901e-05, + "loss": 0.7171, + "step": 1033 + }, + { + "epoch": 1.32352, + "grad_norm": 0.6281324028968811, + "learning_rate": 4.795118047218888e-05, + "loss": 0.7921, + "step": 1034 + }, + { + "epoch": 1.3248, + "grad_norm": 0.5920976996421814, + "learning_rate": 4.794917967186875e-05, + "loss": 0.6885, + "step": 1035 + }, + { + "epoch": 1.32608, + "grad_norm": 0.6835001111030579, + "learning_rate": 4.794717887154862e-05, + "loss": 0.7674, + "step": 1036 + }, + { + "epoch": 1.32736, + "grad_norm": 0.6190808415412903, + "learning_rate": 4.7945178071228494e-05, + "loss": 0.7408, + "step": 1037 + }, + { + "epoch": 1.32864, + "grad_norm": 0.6121822595596313, + "learning_rate": 4.7943177270908366e-05, + "loss": 0.6875, + "step": 1038 + }, + { + "epoch": 1.32992, + "grad_norm": 0.6977471709251404, + "learning_rate": 4.794117647058824e-05, + "loss": 0.8305, + "step": 1039 + }, + { + "epoch": 1.3312, + "grad_norm": 
0.6396989822387695, + "learning_rate": 4.793917567026811e-05, + "loss": 0.705, + "step": 1040 + }, + { + "epoch": 1.3324799999999999, + "grad_norm": 0.6352539658546448, + "learning_rate": 4.793717486994798e-05, + "loss": 0.8305, + "step": 1041 + }, + { + "epoch": 1.33376, + "grad_norm": 0.6385099291801453, + "learning_rate": 4.7935174069627853e-05, + "loss": 0.723, + "step": 1042 + }, + { + "epoch": 1.33504, + "grad_norm": 0.6024138927459717, + "learning_rate": 4.7933173269307725e-05, + "loss": 0.684, + "step": 1043 + }, + { + "epoch": 1.33632, + "grad_norm": 0.6106967329978943, + "learning_rate": 4.79311724689876e-05, + "loss": 0.762, + "step": 1044 + }, + { + "epoch": 1.3376000000000001, + "grad_norm": 0.6504346132278442, + "learning_rate": 4.792917166866747e-05, + "loss": 0.6937, + "step": 1045 + }, + { + "epoch": 1.33888, + "grad_norm": 0.6468571424484253, + "learning_rate": 4.792717086834734e-05, + "loss": 0.7919, + "step": 1046 + }, + { + "epoch": 1.34016, + "grad_norm": 0.6253741979598999, + "learning_rate": 4.792517006802721e-05, + "loss": 0.7041, + "step": 1047 + }, + { + "epoch": 1.34144, + "grad_norm": 0.6577421426773071, + "learning_rate": 4.7923169267707085e-05, + "loss": 0.8185, + "step": 1048 + }, + { + "epoch": 1.34272, + "grad_norm": 0.6654506325721741, + "learning_rate": 4.7921168467386956e-05, + "loss": 0.7253, + "step": 1049 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.6202994585037231, + "learning_rate": 4.791916766706683e-05, + "loss": 0.6825, + "step": 1050 + }, + { + "epoch": 1.34528, + "grad_norm": 0.6256506443023682, + "learning_rate": 4.79171668667467e-05, + "loss": 0.6949, + "step": 1051 + }, + { + "epoch": 1.34656, + "grad_norm": 0.6409286856651306, + "learning_rate": 4.791516606642658e-05, + "loss": 0.72, + "step": 1052 + }, + { + "epoch": 1.34784, + "grad_norm": 0.6390777230262756, + "learning_rate": 4.7913165266106444e-05, + "loss": 0.7029, + "step": 1053 + }, + { + "epoch": 1.34912, + "grad_norm": 0.620672345161438, + 
"learning_rate": 4.7911164465786316e-05, + "loss": 0.7373, + "step": 1054 + }, + { + "epoch": 1.3504, + "grad_norm": 0.6143883466720581, + "learning_rate": 4.790916366546619e-05, + "loss": 0.7371, + "step": 1055 + }, + { + "epoch": 1.35168, + "grad_norm": 0.6494544744491577, + "learning_rate": 4.790716286514606e-05, + "loss": 0.7677, + "step": 1056 + }, + { + "epoch": 1.35296, + "grad_norm": 0.6207596659660339, + "learning_rate": 4.790516206482593e-05, + "loss": 0.7106, + "step": 1057 + }, + { + "epoch": 1.3542399999999999, + "grad_norm": 0.6054299473762512, + "learning_rate": 4.79031612645058e-05, + "loss": 0.7055, + "step": 1058 + }, + { + "epoch": 1.35552, + "grad_norm": 0.6713016629219055, + "learning_rate": 4.790116046418568e-05, + "loss": 0.7229, + "step": 1059 + }, + { + "epoch": 1.3568, + "grad_norm": 0.6039015054702759, + "learning_rate": 4.7899159663865554e-05, + "loss": 0.6853, + "step": 1060 + }, + { + "epoch": 1.35808, + "grad_norm": 0.6022716164588928, + "learning_rate": 4.789715886354542e-05, + "loss": 0.7085, + "step": 1061 + }, + { + "epoch": 1.3593600000000001, + "grad_norm": 0.6193332076072693, + "learning_rate": 4.789515806322529e-05, + "loss": 0.7405, + "step": 1062 + }, + { + "epoch": 1.36064, + "grad_norm": 0.6295928359031677, + "learning_rate": 4.789315726290516e-05, + "loss": 0.7142, + "step": 1063 + }, + { + "epoch": 1.36192, + "grad_norm": 0.6390275359153748, + "learning_rate": 4.7891156462585034e-05, + "loss": 0.7402, + "step": 1064 + }, + { + "epoch": 1.3632, + "grad_norm": 0.6579723954200745, + "learning_rate": 4.7889155662264906e-05, + "loss": 0.7671, + "step": 1065 + }, + { + "epoch": 1.36448, + "grad_norm": 0.6290743350982666, + "learning_rate": 4.7887154861944785e-05, + "loss": 0.7414, + "step": 1066 + }, + { + "epoch": 1.3657599999999999, + "grad_norm": 0.6132893562316895, + "learning_rate": 4.788515406162466e-05, + "loss": 0.6352, + "step": 1067 + }, + { + "epoch": 1.36704, + "grad_norm": 0.6277240514755249, + "learning_rate": 
4.788315326130453e-05, + "loss": 0.6889, + "step": 1068 + }, + { + "epoch": 1.36832, + "grad_norm": 0.6457761526107788, + "learning_rate": 4.7881152460984394e-05, + "loss": 0.7332, + "step": 1069 + }, + { + "epoch": 1.3696, + "grad_norm": 0.6207125186920166, + "learning_rate": 4.7879151660664265e-05, + "loss": 0.711, + "step": 1070 + }, + { + "epoch": 1.37088, + "grad_norm": 0.662109375, + "learning_rate": 4.787715086034414e-05, + "loss": 0.8474, + "step": 1071 + }, + { + "epoch": 1.37216, + "grad_norm": 0.6627180576324463, + "learning_rate": 4.787515006002401e-05, + "loss": 0.7317, + "step": 1072 + }, + { + "epoch": 1.37344, + "grad_norm": 0.6555449962615967, + "learning_rate": 4.787314925970388e-05, + "loss": 0.767, + "step": 1073 + }, + { + "epoch": 1.37472, + "grad_norm": 0.619805634021759, + "learning_rate": 4.787114845938376e-05, + "loss": 0.6963, + "step": 1074 + }, + { + "epoch": 1.376, + "grad_norm": 0.6373822093009949, + "learning_rate": 4.786914765906363e-05, + "loss": 0.7016, + "step": 1075 + }, + { + "epoch": 1.37728, + "grad_norm": 0.6532891392707825, + "learning_rate": 4.78671468587435e-05, + "loss": 0.694, + "step": 1076 + }, + { + "epoch": 1.37856, + "grad_norm": 0.6646358370780945, + "learning_rate": 4.786514605842337e-05, + "loss": 0.7097, + "step": 1077 + }, + { + "epoch": 1.37984, + "grad_norm": 0.6382097601890564, + "learning_rate": 4.786314525810324e-05, + "loss": 0.7613, + "step": 1078 + }, + { + "epoch": 1.3811200000000001, + "grad_norm": 0.5848649144172668, + "learning_rate": 4.786114445778311e-05, + "loss": 0.7163, + "step": 1079 + }, + { + "epoch": 1.3824, + "grad_norm": 0.6216439008712769, + "learning_rate": 4.7859143657462984e-05, + "loss": 0.708, + "step": 1080 + }, + { + "epoch": 1.38368, + "grad_norm": 0.6398138999938965, + "learning_rate": 4.785714285714286e-05, + "loss": 0.7093, + "step": 1081 + }, + { + "epoch": 1.38496, + "grad_norm": 0.601285994052887, + "learning_rate": 4.7855142056822734e-05, + "loss": 0.7506, + "step": 1082 
+ }, + { + "epoch": 1.38624, + "grad_norm": 0.5848278999328613, + "learning_rate": 4.7853141256502606e-05, + "loss": 0.7303, + "step": 1083 + }, + { + "epoch": 1.3875199999999999, + "grad_norm": 0.6471388339996338, + "learning_rate": 4.785114045618248e-05, + "loss": 0.7204, + "step": 1084 + }, + { + "epoch": 1.3888, + "grad_norm": 0.6106283068656921, + "learning_rate": 4.784913965586234e-05, + "loss": 0.6882, + "step": 1085 + }, + { + "epoch": 1.39008, + "grad_norm": 0.6192972660064697, + "learning_rate": 4.7847138855542215e-05, + "loss": 0.7204, + "step": 1086 + }, + { + "epoch": 1.39136, + "grad_norm": 0.6262179613113403, + "learning_rate": 4.784513805522209e-05, + "loss": 0.7779, + "step": 1087 + }, + { + "epoch": 1.39264, + "grad_norm": 0.5966930985450745, + "learning_rate": 4.7843137254901966e-05, + "loss": 0.7018, + "step": 1088 + }, + { + "epoch": 1.39392, + "grad_norm": 0.5986944437026978, + "learning_rate": 4.784113645458184e-05, + "loss": 0.7918, + "step": 1089 + }, + { + "epoch": 1.3952, + "grad_norm": 0.649640679359436, + "learning_rate": 4.783913565426171e-05, + "loss": 0.8004, + "step": 1090 + }, + { + "epoch": 1.39648, + "grad_norm": 0.6140245199203491, + "learning_rate": 4.783713485394158e-05, + "loss": 0.7116, + "step": 1091 + }, + { + "epoch": 1.39776, + "grad_norm": 0.6077390909194946, + "learning_rate": 4.783513405362145e-05, + "loss": 0.7018, + "step": 1092 + }, + { + "epoch": 1.39904, + "grad_norm": 0.6407623887062073, + "learning_rate": 4.783313325330132e-05, + "loss": 0.7147, + "step": 1093 + }, + { + "epoch": 1.40032, + "grad_norm": 0.6380225419998169, + "learning_rate": 4.783113245298119e-05, + "loss": 0.7089, + "step": 1094 + }, + { + "epoch": 1.4016, + "grad_norm": 0.5923783183097839, + "learning_rate": 4.782913165266107e-05, + "loss": 0.6903, + "step": 1095 + }, + { + "epoch": 1.4028800000000001, + "grad_norm": 0.6157001852989197, + "learning_rate": 4.782713085234094e-05, + "loss": 0.7125, + "step": 1096 + }, + { + "epoch": 1.40416, + 
"grad_norm": 0.6462724208831787, + "learning_rate": 4.782513005202081e-05, + "loss": 0.7091, + "step": 1097 + }, + { + "epoch": 1.40544, + "grad_norm": 0.6279814839363098, + "learning_rate": 4.7823129251700684e-05, + "loss": 0.7286, + "step": 1098 + }, + { + "epoch": 1.40672, + "grad_norm": 0.6624937057495117, + "learning_rate": 4.7821128451380556e-05, + "loss": 0.7458, + "step": 1099 + }, + { + "epoch": 1.408, + "grad_norm": 0.5872619152069092, + "learning_rate": 4.781912765106043e-05, + "loss": 0.6632, + "step": 1100 + }, + { + "epoch": 1.4092799999999999, + "grad_norm": 0.6288279294967651, + "learning_rate": 4.781712685074029e-05, + "loss": 0.7713, + "step": 1101 + }, + { + "epoch": 1.41056, + "grad_norm": 0.6384450793266296, + "learning_rate": 4.781512605042017e-05, + "loss": 0.7663, + "step": 1102 + }, + { + "epoch": 1.41184, + "grad_norm": 0.6555289626121521, + "learning_rate": 4.7813125250100043e-05, + "loss": 0.7339, + "step": 1103 + }, + { + "epoch": 1.41312, + "grad_norm": 0.6459196209907532, + "learning_rate": 4.7811124449779915e-05, + "loss": 0.7491, + "step": 1104 + }, + { + "epoch": 1.4144, + "grad_norm": 0.6260380148887634, + "learning_rate": 4.780912364945979e-05, + "loss": 0.7219, + "step": 1105 + }, + { + "epoch": 1.41568, + "grad_norm": 0.602838933467865, + "learning_rate": 4.780712284913966e-05, + "loss": 0.6817, + "step": 1106 + }, + { + "epoch": 1.41696, + "grad_norm": 0.6588259935379028, + "learning_rate": 4.780512204881953e-05, + "loss": 0.7484, + "step": 1107 + }, + { + "epoch": 1.41824, + "grad_norm": 0.6435507535934448, + "learning_rate": 4.78031212484994e-05, + "loss": 0.7629, + "step": 1108 + }, + { + "epoch": 1.41952, + "grad_norm": 0.6327363848686218, + "learning_rate": 4.7801120448179275e-05, + "loss": 0.7366, + "step": 1109 + }, + { + "epoch": 1.4208, + "grad_norm": 0.7007765769958496, + "learning_rate": 4.7799119647859146e-05, + "loss": 0.7356, + "step": 1110 + }, + { + "epoch": 1.42208, + "grad_norm": 0.6083685159683228, + 
"learning_rate": 4.779711884753902e-05, + "loss": 0.7023, + "step": 1111 + }, + { + "epoch": 1.42336, + "grad_norm": 0.6256844997406006, + "learning_rate": 4.779511804721889e-05, + "loss": 0.709, + "step": 1112 + }, + { + "epoch": 1.4246400000000001, + "grad_norm": 0.6442143321037292, + "learning_rate": 4.779311724689876e-05, + "loss": 0.6688, + "step": 1113 + }, + { + "epoch": 1.42592, + "grad_norm": 0.6138997673988342, + "learning_rate": 4.7791116446578634e-05, + "loss": 0.7211, + "step": 1114 + }, + { + "epoch": 1.4272, + "grad_norm": 0.6425867676734924, + "learning_rate": 4.7789115646258506e-05, + "loss": 0.7207, + "step": 1115 + }, + { + "epoch": 1.42848, + "grad_norm": 0.6066508889198303, + "learning_rate": 4.778711484593838e-05, + "loss": 0.7242, + "step": 1116 + }, + { + "epoch": 1.42976, + "grad_norm": 0.5861117243766785, + "learning_rate": 4.778511404561825e-05, + "loss": 0.6949, + "step": 1117 + }, + { + "epoch": 1.4310399999999999, + "grad_norm": 0.6210717558860779, + "learning_rate": 4.778311324529812e-05, + "loss": 0.7018, + "step": 1118 + }, + { + "epoch": 1.43232, + "grad_norm": 0.6068648099899292, + "learning_rate": 4.778111244497799e-05, + "loss": 0.7291, + "step": 1119 + }, + { + "epoch": 1.4336, + "grad_norm": 0.5990647673606873, + "learning_rate": 4.7779111644657865e-05, + "loss": 0.7016, + "step": 1120 + }, + { + "epoch": 1.43488, + "grad_norm": 0.5916838049888611, + "learning_rate": 4.777711084433774e-05, + "loss": 0.6895, + "step": 1121 + }, + { + "epoch": 1.43616, + "grad_norm": 0.6179905533790588, + "learning_rate": 4.777511004401761e-05, + "loss": 0.6555, + "step": 1122 + }, + { + "epoch": 1.43744, + "grad_norm": 0.626921534538269, + "learning_rate": 4.777310924369748e-05, + "loss": 0.7184, + "step": 1123 + }, + { + "epoch": 1.43872, + "grad_norm": 0.6084997057914734, + "learning_rate": 4.777110844337735e-05, + "loss": 0.6929, + "step": 1124 + }, + { + "epoch": 1.44, + "grad_norm": 0.6844547390937805, + "learning_rate": 
4.7769107643057224e-05, + "loss": 0.7536, + "step": 1125 + }, + { + "epoch": 1.44128, + "grad_norm": 0.6585316061973572, + "learning_rate": 4.7767106842737096e-05, + "loss": 0.7163, + "step": 1126 + }, + { + "epoch": 1.44256, + "grad_norm": 0.6486889123916626, + "learning_rate": 4.776510604241697e-05, + "loss": 0.7305, + "step": 1127 + }, + { + "epoch": 1.44384, + "grad_norm": 0.5947285890579224, + "learning_rate": 4.776310524209684e-05, + "loss": 0.7082, + "step": 1128 + }, + { + "epoch": 1.44512, + "grad_norm": 0.6200329065322876, + "learning_rate": 4.776110444177671e-05, + "loss": 0.7104, + "step": 1129 + }, + { + "epoch": 1.4464000000000001, + "grad_norm": 0.6373615860939026, + "learning_rate": 4.775910364145659e-05, + "loss": 0.7293, + "step": 1130 + }, + { + "epoch": 1.44768, + "grad_norm": 0.6362440586090088, + "learning_rate": 4.7757102841136455e-05, + "loss": 0.7632, + "step": 1131 + }, + { + "epoch": 1.44896, + "grad_norm": 0.6665335893630981, + "learning_rate": 4.775510204081633e-05, + "loss": 0.7378, + "step": 1132 + }, + { + "epoch": 1.45024, + "grad_norm": 0.6526025533676147, + "learning_rate": 4.77531012404962e-05, + "loss": 0.7462, + "step": 1133 + }, + { + "epoch": 1.45152, + "grad_norm": 0.5860944986343384, + "learning_rate": 4.775110044017607e-05, + "loss": 0.6819, + "step": 1134 + }, + { + "epoch": 1.4527999999999999, + "grad_norm": 0.6410444378852844, + "learning_rate": 4.774909963985594e-05, + "loss": 0.7381, + "step": 1135 + }, + { + "epoch": 1.45408, + "grad_norm": 0.6227912306785583, + "learning_rate": 4.7747098839535815e-05, + "loss": 0.6726, + "step": 1136 + }, + { + "epoch": 1.45536, + "grad_norm": 0.633658230304718, + "learning_rate": 4.774509803921569e-05, + "loss": 0.705, + "step": 1137 + }, + { + "epoch": 1.45664, + "grad_norm": 0.6156677007675171, + "learning_rate": 4.7743097238895565e-05, + "loss": 0.6913, + "step": 1138 + }, + { + "epoch": 1.45792, + "grad_norm": 0.6303706169128418, + "learning_rate": 4.774109643857543e-05, + 
"loss": 0.721, + "step": 1139 + }, + { + "epoch": 1.4592, + "grad_norm": 0.5811339020729065, + "learning_rate": 4.77390956382553e-05, + "loss": 0.727, + "step": 1140 + }, + { + "epoch": 1.46048, + "grad_norm": 0.6627085208892822, + "learning_rate": 4.7737094837935174e-05, + "loss": 0.7594, + "step": 1141 + }, + { + "epoch": 1.46176, + "grad_norm": 0.640595555305481, + "learning_rate": 4.7735094037615046e-05, + "loss": 0.7016, + "step": 1142 + }, + { + "epoch": 1.46304, + "grad_norm": 0.623562216758728, + "learning_rate": 4.773309323729492e-05, + "loss": 0.7622, + "step": 1143 + }, + { + "epoch": 1.46432, + "grad_norm": 0.6135162711143494, + "learning_rate": 4.7731092436974796e-05, + "loss": 0.6906, + "step": 1144 + }, + { + "epoch": 1.4656, + "grad_norm": 0.6718834042549133, + "learning_rate": 4.772909163665467e-05, + "loss": 0.7215, + "step": 1145 + }, + { + "epoch": 1.46688, + "grad_norm": 0.6534852981567383, + "learning_rate": 4.772709083633454e-05, + "loss": 0.7794, + "step": 1146 + }, + { + "epoch": 1.4681600000000001, + "grad_norm": 0.6073404550552368, + "learning_rate": 4.7725090036014405e-05, + "loss": 0.6651, + "step": 1147 + }, + { + "epoch": 1.46944, + "grad_norm": 0.6143197417259216, + "learning_rate": 4.772308923569428e-05, + "loss": 0.7012, + "step": 1148 + }, + { + "epoch": 1.47072, + "grad_norm": 0.6453569531440735, + "learning_rate": 4.772108843537415e-05, + "loss": 0.7459, + "step": 1149 + }, + { + "epoch": 1.472, + "grad_norm": 0.6633815169334412, + "learning_rate": 4.771908763505402e-05, + "loss": 0.7477, + "step": 1150 + }, + { + "epoch": 1.47328, + "grad_norm": 0.6284077167510986, + "learning_rate": 4.77170868347339e-05, + "loss": 0.7471, + "step": 1151 + }, + { + "epoch": 1.4745599999999999, + "grad_norm": 0.6360135674476624, + "learning_rate": 4.771508603441377e-05, + "loss": 0.7069, + "step": 1152 + }, + { + "epoch": 1.47584, + "grad_norm": 0.6397899389266968, + "learning_rate": 4.771308523409364e-05, + "loss": 0.6937, + "step": 1153 + }, + 
{ + "epoch": 1.47712, + "grad_norm": 0.6112055778503418, + "learning_rate": 4.7711084433773515e-05, + "loss": 0.7514, + "step": 1154 + }, + { + "epoch": 1.4784, + "grad_norm": 0.608342170715332, + "learning_rate": 4.770908363345338e-05, + "loss": 0.7085, + "step": 1155 + }, + { + "epoch": 1.47968, + "grad_norm": 0.6625191569328308, + "learning_rate": 4.770708283313325e-05, + "loss": 0.7201, + "step": 1156 + }, + { + "epoch": 1.48096, + "grad_norm": 0.6654885411262512, + "learning_rate": 4.7705082032813124e-05, + "loss": 0.7487, + "step": 1157 + }, + { + "epoch": 1.48224, + "grad_norm": 0.6362460851669312, + "learning_rate": 4.7703081232493e-05, + "loss": 0.7151, + "step": 1158 + }, + { + "epoch": 1.48352, + "grad_norm": 0.6087484359741211, + "learning_rate": 4.7701080432172874e-05, + "loss": 0.7062, + "step": 1159 + }, + { + "epoch": 1.4848, + "grad_norm": 0.6134206652641296, + "learning_rate": 4.7699079631852746e-05, + "loss": 0.7385, + "step": 1160 + }, + { + "epoch": 1.48608, + "grad_norm": 0.5883345603942871, + "learning_rate": 4.769707883153262e-05, + "loss": 0.7447, + "step": 1161 + }, + { + "epoch": 1.48736, + "grad_norm": 0.6443062424659729, + "learning_rate": 4.769507803121249e-05, + "loss": 0.6859, + "step": 1162 + }, + { + "epoch": 1.48864, + "grad_norm": 0.6045071482658386, + "learning_rate": 4.7693077230892355e-05, + "loss": 0.7062, + "step": 1163 + }, + { + "epoch": 1.4899200000000001, + "grad_norm": 0.6627300381660461, + "learning_rate": 4.769107643057223e-05, + "loss": 0.7499, + "step": 1164 + }, + { + "epoch": 1.4912, + "grad_norm": 0.6466721892356873, + "learning_rate": 4.7689075630252105e-05, + "loss": 0.7218, + "step": 1165 + }, + { + "epoch": 1.49248, + "grad_norm": 0.6242368817329407, + "learning_rate": 4.768707482993198e-05, + "loss": 0.7273, + "step": 1166 + }, + { + "epoch": 1.49376, + "grad_norm": 0.614842414855957, + "learning_rate": 4.768507402961185e-05, + "loss": 0.7216, + "step": 1167 + }, + { + "epoch": 1.49504, + "grad_norm": 
0.6312155723571777, + "learning_rate": 4.768307322929172e-05, + "loss": 0.7566, + "step": 1168 + }, + { + "epoch": 1.4963199999999999, + "grad_norm": 0.6303775906562805, + "learning_rate": 4.768107242897159e-05, + "loss": 0.7076, + "step": 1169 + }, + { + "epoch": 1.4976, + "grad_norm": 0.6277086734771729, + "learning_rate": 4.7679071628651465e-05, + "loss": 0.6755, + "step": 1170 + }, + { + "epoch": 1.49888, + "grad_norm": 0.6203471422195435, + "learning_rate": 4.767707082833133e-05, + "loss": 0.716, + "step": 1171 + }, + { + "epoch": 1.5001600000000002, + "grad_norm": 0.6107301115989685, + "learning_rate": 4.767507002801121e-05, + "loss": 0.7121, + "step": 1172 + }, + { + "epoch": 1.50144, + "grad_norm": 0.6433287858963013, + "learning_rate": 4.767306922769108e-05, + "loss": 0.7116, + "step": 1173 + }, + { + "epoch": 1.50272, + "grad_norm": 0.6727626919746399, + "learning_rate": 4.767106842737095e-05, + "loss": 0.7444, + "step": 1174 + }, + { + "epoch": 1.504, + "grad_norm": 0.6506801247596741, + "learning_rate": 4.7669067627050824e-05, + "loss": 0.6786, + "step": 1175 + }, + { + "epoch": 1.50528, + "grad_norm": 0.642342746257782, + "learning_rate": 4.7667066826730696e-05, + "loss": 0.7178, + "step": 1176 + }, + { + "epoch": 1.50656, + "grad_norm": 0.6692764759063721, + "learning_rate": 4.766506602641057e-05, + "loss": 0.7793, + "step": 1177 + }, + { + "epoch": 1.5078399999999998, + "grad_norm": 0.6183244585990906, + "learning_rate": 4.766306522609044e-05, + "loss": 0.7034, + "step": 1178 + }, + { + "epoch": 1.50912, + "grad_norm": 0.6591295003890991, + "learning_rate": 4.766106442577031e-05, + "loss": 0.7507, + "step": 1179 + }, + { + "epoch": 1.5104, + "grad_norm": 0.6517340540885925, + "learning_rate": 4.765906362545018e-05, + "loss": 0.7276, + "step": 1180 + }, + { + "epoch": 1.5116800000000001, + "grad_norm": 0.6169537901878357, + "learning_rate": 4.7657062825130055e-05, + "loss": 0.7647, + "step": 1181 + }, + { + "epoch": 1.51296, + "grad_norm": 
0.6543555855751038, + "learning_rate": 4.765506202480993e-05, + "loss": 0.7227, + "step": 1182 + }, + { + "epoch": 1.51424, + "grad_norm": 0.5778194665908813, + "learning_rate": 4.76530612244898e-05, + "loss": 0.6666, + "step": 1183 + }, + { + "epoch": 1.51552, + "grad_norm": 0.6411781907081604, + "learning_rate": 4.765106042416967e-05, + "loss": 0.7447, + "step": 1184 + }, + { + "epoch": 1.5168, + "grad_norm": 0.6526811718940735, + "learning_rate": 4.764905962384954e-05, + "loss": 0.6795, + "step": 1185 + }, + { + "epoch": 1.5180799999999999, + "grad_norm": 0.6295871734619141, + "learning_rate": 4.7647058823529414e-05, + "loss": 0.7025, + "step": 1186 + }, + { + "epoch": 1.51936, + "grad_norm": 0.6581454277038574, + "learning_rate": 4.7645058023209286e-05, + "loss": 0.7639, + "step": 1187 + }, + { + "epoch": 1.52064, + "grad_norm": 0.6672399640083313, + "learning_rate": 4.764305722288916e-05, + "loss": 0.7384, + "step": 1188 + }, + { + "epoch": 1.5219200000000002, + "grad_norm": 0.6672317385673523, + "learning_rate": 4.764105642256903e-05, + "loss": 0.7389, + "step": 1189 + }, + { + "epoch": 1.5232, + "grad_norm": 0.6444031000137329, + "learning_rate": 4.76390556222489e-05, + "loss": 0.7833, + "step": 1190 + }, + { + "epoch": 1.52448, + "grad_norm": 0.6317417621612549, + "learning_rate": 4.7637054821928774e-05, + "loss": 0.7112, + "step": 1191 + }, + { + "epoch": 1.52576, + "grad_norm": 0.627162516117096, + "learning_rate": 4.7635054021608646e-05, + "loss": 0.6628, + "step": 1192 + }, + { + "epoch": 1.52704, + "grad_norm": 0.5882164239883423, + "learning_rate": 4.763305322128852e-05, + "loss": 0.7103, + "step": 1193 + }, + { + "epoch": 1.52832, + "grad_norm": 0.6305367350578308, + "learning_rate": 4.763105242096839e-05, + "loss": 0.7103, + "step": 1194 + }, + { + "epoch": 1.5295999999999998, + "grad_norm": 0.6403663158416748, + "learning_rate": 4.762905162064826e-05, + "loss": 0.6768, + "step": 1195 + }, + { + "epoch": 1.53088, + "grad_norm": 0.6659502983093262, + 
"learning_rate": 4.762705082032813e-05, + "loss": 0.7166, + "step": 1196 + }, + { + "epoch": 1.53216, + "grad_norm": 0.6176123023033142, + "learning_rate": 4.7625050020008005e-05, + "loss": 0.6794, + "step": 1197 + }, + { + "epoch": 1.5334400000000001, + "grad_norm": 0.6616097688674927, + "learning_rate": 4.762304921968788e-05, + "loss": 0.7685, + "step": 1198 + }, + { + "epoch": 1.53472, + "grad_norm": 0.6266618967056274, + "learning_rate": 4.762104841936775e-05, + "loss": 0.6784, + "step": 1199 + }, + { + "epoch": 1.536, + "grad_norm": 0.6575819849967957, + "learning_rate": 4.761904761904762e-05, + "loss": 0.7432, + "step": 1200 + }, + { + "epoch": 1.53728, + "grad_norm": 0.6002248525619507, + "learning_rate": 4.761704681872749e-05, + "loss": 0.6946, + "step": 1201 + }, + { + "epoch": 1.53856, + "grad_norm": 0.6573600172996521, + "learning_rate": 4.7615046018407364e-05, + "loss": 0.7185, + "step": 1202 + }, + { + "epoch": 1.5398399999999999, + "grad_norm": 0.6227654218673706, + "learning_rate": 4.7613045218087236e-05, + "loss": 0.7381, + "step": 1203 + }, + { + "epoch": 1.54112, + "grad_norm": 0.6044524908065796, + "learning_rate": 4.761104441776711e-05, + "loss": 0.704, + "step": 1204 + }, + { + "epoch": 1.5424, + "grad_norm": 0.6178364753723145, + "learning_rate": 4.760904361744698e-05, + "loss": 0.6916, + "step": 1205 + }, + { + "epoch": 1.5436800000000002, + "grad_norm": 0.6074690222740173, + "learning_rate": 4.760704281712685e-05, + "loss": 0.7256, + "step": 1206 + }, + { + "epoch": 1.54496, + "grad_norm": 0.6533793807029724, + "learning_rate": 4.760504201680672e-05, + "loss": 0.6969, + "step": 1207 + }, + { + "epoch": 1.54624, + "grad_norm": 0.6194373965263367, + "learning_rate": 4.76030412164866e-05, + "loss": 0.7138, + "step": 1208 + }, + { + "epoch": 1.54752, + "grad_norm": 0.6618860363960266, + "learning_rate": 4.760104041616647e-05, + "loss": 0.7754, + "step": 1209 + }, + { + "epoch": 1.5488, + "grad_norm": 0.6461208462715149, + "learning_rate": 
4.759903961584634e-05, + "loss": 0.7114, + "step": 1210 + }, + { + "epoch": 1.55008, + "grad_norm": 0.6476582884788513, + "learning_rate": 4.759703881552621e-05, + "loss": 0.6973, + "step": 1211 + }, + { + "epoch": 1.5513599999999999, + "grad_norm": 0.6671127080917358, + "learning_rate": 4.759503801520608e-05, + "loss": 0.7009, + "step": 1212 + }, + { + "epoch": 1.55264, + "grad_norm": 0.6456823348999023, + "learning_rate": 4.7593037214885954e-05, + "loss": 0.7214, + "step": 1213 + }, + { + "epoch": 1.55392, + "grad_norm": 0.6629720330238342, + "learning_rate": 4.7591036414565826e-05, + "loss": 0.7583, + "step": 1214 + }, + { + "epoch": 1.5552000000000001, + "grad_norm": 0.649437427520752, + "learning_rate": 4.7589035614245705e-05, + "loss": 0.7532, + "step": 1215 + }, + { + "epoch": 1.55648, + "grad_norm": 0.6041565537452698, + "learning_rate": 4.758703481392558e-05, + "loss": 0.6884, + "step": 1216 + }, + { + "epoch": 1.55776, + "grad_norm": 0.6424383521080017, + "learning_rate": 4.758503401360544e-05, + "loss": 0.7084, + "step": 1217 + }, + { + "epoch": 1.55904, + "grad_norm": 0.6945663690567017, + "learning_rate": 4.7583033213285314e-05, + "loss": 0.75, + "step": 1218 + }, + { + "epoch": 1.56032, + "grad_norm": 0.6336300373077393, + "learning_rate": 4.7581032412965186e-05, + "loss": 0.7537, + "step": 1219 + }, + { + "epoch": 1.5615999999999999, + "grad_norm": 0.6776460409164429, + "learning_rate": 4.757903161264506e-05, + "loss": 0.7617, + "step": 1220 + }, + { + "epoch": 1.56288, + "grad_norm": 0.6466001272201538, + "learning_rate": 4.757703081232493e-05, + "loss": 0.7482, + "step": 1221 + }, + { + "epoch": 1.56416, + "grad_norm": 0.630434513092041, + "learning_rate": 4.757503001200481e-05, + "loss": 0.724, + "step": 1222 + }, + { + "epoch": 1.5654400000000002, + "grad_norm": 0.6976414322853088, + "learning_rate": 4.757302921168468e-05, + "loss": 0.7818, + "step": 1223 + }, + { + "epoch": 1.5667200000000001, + "grad_norm": 0.6347379684448242, + 
"learning_rate": 4.757102841136455e-05, + "loss": 0.7551, + "step": 1224 + }, + { + "epoch": 1.568, + "grad_norm": 0.6358786225318909, + "learning_rate": 4.756902761104442e-05, + "loss": 0.6909, + "step": 1225 + }, + { + "epoch": 1.56928, + "grad_norm": 0.6460133194923401, + "learning_rate": 4.756702681072429e-05, + "loss": 0.7101, + "step": 1226 + }, + { + "epoch": 1.57056, + "grad_norm": 0.6669771671295166, + "learning_rate": 4.756502601040416e-05, + "loss": 0.7721, + "step": 1227 + }, + { + "epoch": 1.57184, + "grad_norm": 0.6039112210273743, + "learning_rate": 4.756302521008403e-05, + "loss": 0.712, + "step": 1228 + }, + { + "epoch": 1.5731199999999999, + "grad_norm": 0.6793951392173767, + "learning_rate": 4.756102440976391e-05, + "loss": 0.747, + "step": 1229 + }, + { + "epoch": 1.5744, + "grad_norm": 0.6434732675552368, + "learning_rate": 4.755902360944378e-05, + "loss": 0.6894, + "step": 1230 + }, + { + "epoch": 1.57568, + "grad_norm": 0.6476708650588989, + "learning_rate": 4.7557022809123655e-05, + "loss": 0.6805, + "step": 1231 + }, + { + "epoch": 1.5769600000000001, + "grad_norm": 0.6917094588279724, + "learning_rate": 4.7555022008803527e-05, + "loss": 0.6978, + "step": 1232 + }, + { + "epoch": 1.57824, + "grad_norm": 0.631247878074646, + "learning_rate": 4.755302120848339e-05, + "loss": 0.6858, + "step": 1233 + }, + { + "epoch": 1.57952, + "grad_norm": 0.6304442286491394, + "learning_rate": 4.7551020408163263e-05, + "loss": 0.6866, + "step": 1234 + }, + { + "epoch": 1.5808, + "grad_norm": 0.6345778703689575, + "learning_rate": 4.7549019607843135e-05, + "loss": 0.7146, + "step": 1235 + }, + { + "epoch": 1.58208, + "grad_norm": 0.639773964881897, + "learning_rate": 4.7547018807523014e-05, + "loss": 0.7268, + "step": 1236 + }, + { + "epoch": 1.5833599999999999, + "grad_norm": 0.6575612425804138, + "learning_rate": 4.7545018007202886e-05, + "loss": 0.7142, + "step": 1237 + }, + { + "epoch": 1.58464, + "grad_norm": 0.6063089966773987, + "learning_rate": 
4.754301720688276e-05, + "loss": 0.7651, + "step": 1238 + }, + { + "epoch": 1.58592, + "grad_norm": 0.6471563577651978, + "learning_rate": 4.754101640656263e-05, + "loss": 0.7748, + "step": 1239 + }, + { + "epoch": 1.5872000000000002, + "grad_norm": 0.6223157048225403, + "learning_rate": 4.75390156062425e-05, + "loss": 0.7069, + "step": 1240 + }, + { + "epoch": 1.5884800000000001, + "grad_norm": 0.64573073387146, + "learning_rate": 4.7537014805922366e-05, + "loss": 0.766, + "step": 1241 + }, + { + "epoch": 1.58976, + "grad_norm": 0.6501642465591431, + "learning_rate": 4.753501400560224e-05, + "loss": 0.7455, + "step": 1242 + }, + { + "epoch": 1.59104, + "grad_norm": 0.6377203464508057, + "learning_rate": 4.753301320528212e-05, + "loss": 0.6988, + "step": 1243 + }, + { + "epoch": 1.59232, + "grad_norm": 0.6198776364326477, + "learning_rate": 4.753101240496199e-05, + "loss": 0.7555, + "step": 1244 + }, + { + "epoch": 1.5936, + "grad_norm": 0.6620365381240845, + "learning_rate": 4.752901160464186e-05, + "loss": 0.7119, + "step": 1245 + }, + { + "epoch": 1.5948799999999999, + "grad_norm": 0.6182314157485962, + "learning_rate": 4.752701080432173e-05, + "loss": 0.6488, + "step": 1246 + }, + { + "epoch": 1.59616, + "grad_norm": 0.6267403960227966, + "learning_rate": 4.7525010004001604e-05, + "loss": 0.7483, + "step": 1247 + }, + { + "epoch": 1.59744, + "grad_norm": 0.6369834542274475, + "learning_rate": 4.7523009203681476e-05, + "loss": 0.6785, + "step": 1248 + }, + { + "epoch": 1.5987200000000001, + "grad_norm": 0.6468684077262878, + "learning_rate": 4.752100840336134e-05, + "loss": 0.7245, + "step": 1249 + }, + { + "epoch": 1.6, + "grad_norm": 0.6526497006416321, + "learning_rate": 4.751900760304122e-05, + "loss": 0.7311, + "step": 1250 + }, + { + "epoch": 1.60128, + "grad_norm": 0.6682162284851074, + "learning_rate": 4.751700680272109e-05, + "loss": 0.7086, + "step": 1251 + }, + { + "epoch": 1.60256, + "grad_norm": 0.6937553286552429, + "learning_rate": 
4.7515006002400964e-05, + "loss": 0.6991, + "step": 1252 + }, + { + "epoch": 1.60384, + "grad_norm": 0.6066246032714844, + "learning_rate": 4.7513005202080836e-05, + "loss": 0.6762, + "step": 1253 + }, + { + "epoch": 1.6051199999999999, + "grad_norm": 0.7019811272621155, + "learning_rate": 4.751100440176071e-05, + "loss": 0.7746, + "step": 1254 + }, + { + "epoch": 1.6064, + "grad_norm": 0.6307134032249451, + "learning_rate": 4.750900360144058e-05, + "loss": 0.7058, + "step": 1255 + }, + { + "epoch": 1.60768, + "grad_norm": 0.6239479780197144, + "learning_rate": 4.750700280112045e-05, + "loss": 0.6974, + "step": 1256 + }, + { + "epoch": 1.60896, + "grad_norm": 0.6378730535507202, + "learning_rate": 4.750500200080032e-05, + "loss": 0.7755, + "step": 1257 + }, + { + "epoch": 1.6102400000000001, + "grad_norm": 0.6123172640800476, + "learning_rate": 4.7503001200480195e-05, + "loss": 0.6952, + "step": 1258 + }, + { + "epoch": 1.61152, + "grad_norm": 0.6375351548194885, + "learning_rate": 4.750100040016007e-05, + "loss": 0.7457, + "step": 1259 + }, + { + "epoch": 1.6128, + "grad_norm": 0.6497233510017395, + "learning_rate": 4.749899959983994e-05, + "loss": 0.7198, + "step": 1260 + }, + { + "epoch": 1.61408, + "grad_norm": 0.6439145803451538, + "learning_rate": 4.749699879951981e-05, + "loss": 0.7273, + "step": 1261 + }, + { + "epoch": 1.61536, + "grad_norm": 0.590140700340271, + "learning_rate": 4.749499799919968e-05, + "loss": 0.6665, + "step": 1262 + }, + { + "epoch": 1.6166399999999999, + "grad_norm": 0.6305953860282898, + "learning_rate": 4.7492997198879554e-05, + "loss": 0.7422, + "step": 1263 + }, + { + "epoch": 1.61792, + "grad_norm": 0.6392373442649841, + "learning_rate": 4.7490996398559426e-05, + "loss": 0.7263, + "step": 1264 + }, + { + "epoch": 1.6192, + "grad_norm": 0.6296020746231079, + "learning_rate": 4.74889955982393e-05, + "loss": 0.7455, + "step": 1265 + }, + { + "epoch": 1.6204800000000001, + "grad_norm": 0.6070896983146667, + "learning_rate": 
4.748699479791917e-05, + "loss": 0.7014, + "step": 1266 + }, + { + "epoch": 1.62176, + "grad_norm": 0.6309097409248352, + "learning_rate": 4.748499399759904e-05, + "loss": 0.7737, + "step": 1267 + }, + { + "epoch": 1.62304, + "grad_norm": 0.591622531414032, + "learning_rate": 4.7482993197278913e-05, + "loss": 0.7032, + "step": 1268 + }, + { + "epoch": 1.62432, + "grad_norm": 0.5985342860221863, + "learning_rate": 4.7480992396958785e-05, + "loss": 0.7006, + "step": 1269 + }, + { + "epoch": 1.6256, + "grad_norm": 0.5956323146820068, + "learning_rate": 4.747899159663866e-05, + "loss": 0.6925, + "step": 1270 + }, + { + "epoch": 1.6268799999999999, + "grad_norm": 0.6757445931434631, + "learning_rate": 4.747699079631853e-05, + "loss": 0.7091, + "step": 1271 + }, + { + "epoch": 1.62816, + "grad_norm": 0.6045747399330139, + "learning_rate": 4.74749899959984e-05, + "loss": 0.6801, + "step": 1272 + }, + { + "epoch": 1.62944, + "grad_norm": 0.6032465696334839, + "learning_rate": 4.747298919567827e-05, + "loss": 0.6913, + "step": 1273 + }, + { + "epoch": 1.63072, + "grad_norm": 0.6105408072471619, + "learning_rate": 4.7470988395358145e-05, + "loss": 0.7102, + "step": 1274 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.6021878123283386, + "learning_rate": 4.7468987595038016e-05, + "loss": 0.66, + "step": 1275 + }, + { + "epoch": 1.63328, + "grad_norm": 0.6317958235740662, + "learning_rate": 4.746698679471789e-05, + "loss": 0.7622, + "step": 1276 + }, + { + "epoch": 1.63456, + "grad_norm": 0.6472170352935791, + "learning_rate": 4.746498599439776e-05, + "loss": 0.8176, + "step": 1277 + }, + { + "epoch": 1.63584, + "grad_norm": 0.6169067621231079, + "learning_rate": 4.746298519407764e-05, + "loss": 0.7191, + "step": 1278 + }, + { + "epoch": 1.63712, + "grad_norm": 0.6021884083747864, + "learning_rate": 4.7460984393757504e-05, + "loss": 0.7359, + "step": 1279 + }, + { + "epoch": 1.6383999999999999, + "grad_norm": 0.6332974433898926, + "learning_rate": 
4.7458983593437376e-05, + "loss": 0.7494, + "step": 1280 + }, + { + "epoch": 1.63968, + "grad_norm": 0.6417760252952576, + "learning_rate": 4.745698279311725e-05, + "loss": 0.7384, + "step": 1281 + }, + { + "epoch": 1.64096, + "grad_norm": 0.6329305768013, + "learning_rate": 4.745498199279712e-05, + "loss": 0.7398, + "step": 1282 + }, + { + "epoch": 1.6422400000000001, + "grad_norm": 0.6128019690513611, + "learning_rate": 4.745298119247699e-05, + "loss": 0.6854, + "step": 1283 + }, + { + "epoch": 1.64352, + "grad_norm": 0.6532317399978638, + "learning_rate": 4.745098039215686e-05, + "loss": 0.6853, + "step": 1284 + }, + { + "epoch": 1.6448, + "grad_norm": 0.6134045720100403, + "learning_rate": 4.744897959183674e-05, + "loss": 0.6766, + "step": 1285 + }, + { + "epoch": 1.64608, + "grad_norm": 0.6327788829803467, + "learning_rate": 4.7446978791516614e-05, + "loss": 0.6836, + "step": 1286 + }, + { + "epoch": 1.64736, + "grad_norm": 0.669122040271759, + "learning_rate": 4.744497799119648e-05, + "loss": 0.7196, + "step": 1287 + }, + { + "epoch": 1.6486399999999999, + "grad_norm": 0.614807665348053, + "learning_rate": 4.744297719087635e-05, + "loss": 0.7626, + "step": 1288 + }, + { + "epoch": 1.64992, + "grad_norm": 0.6070460081100464, + "learning_rate": 4.744097639055622e-05, + "loss": 0.6698, + "step": 1289 + }, + { + "epoch": 1.6512, + "grad_norm": 0.6391851902008057, + "learning_rate": 4.7438975590236094e-05, + "loss": 0.7167, + "step": 1290 + }, + { + "epoch": 1.65248, + "grad_norm": 0.6460067629814148, + "learning_rate": 4.7436974789915966e-05, + "loss": 0.7165, + "step": 1291 + }, + { + "epoch": 1.6537600000000001, + "grad_norm": 0.6238282918930054, + "learning_rate": 4.7434973989595845e-05, + "loss": 0.7058, + "step": 1292 + }, + { + "epoch": 1.65504, + "grad_norm": 0.6249229311943054, + "learning_rate": 4.7432973189275717e-05, + "loss": 0.7196, + "step": 1293 + }, + { + "epoch": 1.65632, + "grad_norm": 0.6393983960151672, + "learning_rate": 
4.743097238895559e-05, + "loss": 0.6811, + "step": 1294 + }, + { + "epoch": 1.6576, + "grad_norm": 0.6782253384590149, + "learning_rate": 4.7428971588635454e-05, + "loss": 0.7559, + "step": 1295 + }, + { + "epoch": 1.65888, + "grad_norm": 0.5967546701431274, + "learning_rate": 4.7426970788315325e-05, + "loss": 0.6603, + "step": 1296 + }, + { + "epoch": 1.6601599999999999, + "grad_norm": 0.6373478174209595, + "learning_rate": 4.74249699879952e-05, + "loss": 0.7279, + "step": 1297 + }, + { + "epoch": 1.66144, + "grad_norm": 0.6224157214164734, + "learning_rate": 4.742296918767507e-05, + "loss": 0.6636, + "step": 1298 + }, + { + "epoch": 1.66272, + "grad_norm": 0.6072138547897339, + "learning_rate": 4.742096838735494e-05, + "loss": 0.6899, + "step": 1299 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.6565014719963074, + "learning_rate": 4.741896758703482e-05, + "loss": 0.7157, + "step": 1300 + }, + { + "epoch": 1.66528, + "grad_norm": 0.653264045715332, + "learning_rate": 4.741696678671469e-05, + "loss": 0.7142, + "step": 1301 + }, + { + "epoch": 1.66656, + "grad_norm": 0.6316604018211365, + "learning_rate": 4.741496598639456e-05, + "loss": 0.6723, + "step": 1302 + }, + { + "epoch": 1.66784, + "grad_norm": 0.6184001564979553, + "learning_rate": 4.741296518607443e-05, + "loss": 0.7288, + "step": 1303 + }, + { + "epoch": 1.66912, + "grad_norm": 0.6457557082176208, + "learning_rate": 4.74109643857543e-05, + "loss": 0.7429, + "step": 1304 + }, + { + "epoch": 1.6703999999999999, + "grad_norm": 0.6284366250038147, + "learning_rate": 4.740896358543417e-05, + "loss": 0.6764, + "step": 1305 + }, + { + "epoch": 1.67168, + "grad_norm": 0.6020685434341431, + "learning_rate": 4.7406962785114044e-05, + "loss": 0.6978, + "step": 1306 + }, + { + "epoch": 1.67296, + "grad_norm": 0.6325390338897705, + "learning_rate": 4.740496198479392e-05, + "loss": 0.7271, + "step": 1307 + }, + { + "epoch": 1.67424, + "grad_norm": 0.6092963814735413, + "learning_rate": 
4.7402961184473794e-05, + "loss": 0.7342, + "step": 1308 + }, + { + "epoch": 1.6755200000000001, + "grad_norm": 0.6341113448143005, + "learning_rate": 4.7400960384153666e-05, + "loss": 0.7043, + "step": 1309 + }, + { + "epoch": 1.6768, + "grad_norm": 0.6368173360824585, + "learning_rate": 4.739895958383354e-05, + "loss": 0.7099, + "step": 1310 + }, + { + "epoch": 1.67808, + "grad_norm": 0.6337018609046936, + "learning_rate": 4.73969587835134e-05, + "loss": 0.6765, + "step": 1311 + }, + { + "epoch": 1.67936, + "grad_norm": 0.7150176167488098, + "learning_rate": 4.7394957983193275e-05, + "loss": 0.7172, + "step": 1312 + }, + { + "epoch": 1.68064, + "grad_norm": 0.6691837906837463, + "learning_rate": 4.739295718287315e-05, + "loss": 0.7095, + "step": 1313 + }, + { + "epoch": 1.6819199999999999, + "grad_norm": 0.6361916661262512, + "learning_rate": 4.7390956382553026e-05, + "loss": 0.7367, + "step": 1314 + }, + { + "epoch": 1.6832, + "grad_norm": 0.6631631255149841, + "learning_rate": 4.73889555822329e-05, + "loss": 0.7083, + "step": 1315 + }, + { + "epoch": 1.68448, + "grad_norm": 0.6436387896537781, + "learning_rate": 4.738695478191277e-05, + "loss": 0.7754, + "step": 1316 + }, + { + "epoch": 1.6857600000000001, + "grad_norm": 0.644277811050415, + "learning_rate": 4.738495398159264e-05, + "loss": 0.7025, + "step": 1317 + }, + { + "epoch": 1.68704, + "grad_norm": 0.6064467430114746, + "learning_rate": 4.738295318127251e-05, + "loss": 0.7252, + "step": 1318 + }, + { + "epoch": 1.68832, + "grad_norm": 0.6227229237556458, + "learning_rate": 4.738095238095238e-05, + "loss": 0.6998, + "step": 1319 + }, + { + "epoch": 1.6896, + "grad_norm": 0.644025981426239, + "learning_rate": 4.737895158063225e-05, + "loss": 0.718, + "step": 1320 + }, + { + "epoch": 1.69088, + "grad_norm": 0.6950893998146057, + "learning_rate": 4.737695078031213e-05, + "loss": 0.7618, + "step": 1321 + }, + { + "epoch": 1.6921599999999999, + "grad_norm": 0.6096273064613342, + "learning_rate": 
4.7374949979992e-05, + "loss": 0.7492, + "step": 1322 + }, + { + "epoch": 1.6934399999999998, + "grad_norm": 0.624535322189331, + "learning_rate": 4.737294917967187e-05, + "loss": 0.6749, + "step": 1323 + }, + { + "epoch": 1.69472, + "grad_norm": 0.7543303966522217, + "learning_rate": 4.7370948379351744e-05, + "loss": 0.7168, + "step": 1324 + }, + { + "epoch": 1.696, + "grad_norm": 0.6681160926818848, + "learning_rate": 4.7368947579031616e-05, + "loss": 0.7414, + "step": 1325 + }, + { + "epoch": 1.6972800000000001, + "grad_norm": 0.6174480319023132, + "learning_rate": 4.736694677871149e-05, + "loss": 0.702, + "step": 1326 + }, + { + "epoch": 1.69856, + "grad_norm": 0.6031041145324707, + "learning_rate": 4.736494597839135e-05, + "loss": 0.6811, + "step": 1327 + }, + { + "epoch": 1.69984, + "grad_norm": 0.6663240194320679, + "learning_rate": 4.736294517807123e-05, + "loss": 0.7624, + "step": 1328 + }, + { + "epoch": 1.70112, + "grad_norm": 0.6095741391181946, + "learning_rate": 4.7360944377751103e-05, + "loss": 0.6708, + "step": 1329 + }, + { + "epoch": 1.7024, + "grad_norm": 0.6273066997528076, + "learning_rate": 4.7358943577430975e-05, + "loss": 0.7313, + "step": 1330 + }, + { + "epoch": 1.7036799999999999, + "grad_norm": 0.6379857063293457, + "learning_rate": 4.735694277711085e-05, + "loss": 0.7075, + "step": 1331 + }, + { + "epoch": 1.70496, + "grad_norm": 0.606400728225708, + "learning_rate": 4.735494197679072e-05, + "loss": 0.7267, + "step": 1332 + }, + { + "epoch": 1.70624, + "grad_norm": 0.6663306355476379, + "learning_rate": 4.735294117647059e-05, + "loss": 0.7154, + "step": 1333 + }, + { + "epoch": 1.7075200000000001, + "grad_norm": 0.6471573114395142, + "learning_rate": 4.735094037615046e-05, + "loss": 0.7464, + "step": 1334 + }, + { + "epoch": 1.7088, + "grad_norm": 0.6170628070831299, + "learning_rate": 4.7348939575830335e-05, + "loss": 0.7251, + "step": 1335 + }, + { + "epoch": 1.71008, + "grad_norm": 0.6125094890594482, + "learning_rate": 
4.7346938775510206e-05, + "loss": 0.7548, + "step": 1336 + }, + { + "epoch": 1.71136, + "grad_norm": 0.6116237044334412, + "learning_rate": 4.734493797519008e-05, + "loss": 0.7347, + "step": 1337 + }, + { + "epoch": 1.71264, + "grad_norm": 0.6676164269447327, + "learning_rate": 4.734293717486995e-05, + "loss": 0.7439, + "step": 1338 + }, + { + "epoch": 1.7139199999999999, + "grad_norm": 0.6193406581878662, + "learning_rate": 4.734093637454982e-05, + "loss": 0.7186, + "step": 1339 + }, + { + "epoch": 1.7151999999999998, + "grad_norm": 0.5905833840370178, + "learning_rate": 4.7338935574229694e-05, + "loss": 0.6775, + "step": 1340 + }, + { + "epoch": 1.71648, + "grad_norm": 0.6464902758598328, + "learning_rate": 4.7336934773909566e-05, + "loss": 0.7586, + "step": 1341 + }, + { + "epoch": 1.71776, + "grad_norm": 0.6692549586296082, + "learning_rate": 4.733493397358944e-05, + "loss": 0.7933, + "step": 1342 + }, + { + "epoch": 1.7190400000000001, + "grad_norm": 0.6424185037612915, + "learning_rate": 4.733293317326931e-05, + "loss": 0.7422, + "step": 1343 + }, + { + "epoch": 1.72032, + "grad_norm": 0.6273303031921387, + "learning_rate": 4.733093237294918e-05, + "loss": 0.6837, + "step": 1344 + }, + { + "epoch": 1.7216, + "grad_norm": 0.6037375330924988, + "learning_rate": 4.732893157262905e-05, + "loss": 0.7306, + "step": 1345 + }, + { + "epoch": 1.72288, + "grad_norm": 0.6088179349899292, + "learning_rate": 4.7326930772308925e-05, + "loss": 0.7008, + "step": 1346 + }, + { + "epoch": 1.72416, + "grad_norm": 0.6654016375541687, + "learning_rate": 4.73249299719888e-05, + "loss": 0.7958, + "step": 1347 + }, + { + "epoch": 1.7254399999999999, + "grad_norm": 0.6818312406539917, + "learning_rate": 4.732292917166867e-05, + "loss": 0.7136, + "step": 1348 + }, + { + "epoch": 1.72672, + "grad_norm": 0.5944163799285889, + "learning_rate": 4.732092837134855e-05, + "loss": 0.7097, + "step": 1349 + }, + { + "epoch": 1.728, + "grad_norm": 0.6351943016052246, + "learning_rate": 
4.731892757102841e-05, + "loss": 0.7255, + "step": 1350 + }, + { + "epoch": 1.7292800000000002, + "grad_norm": 0.6025944948196411, + "learning_rate": 4.7316926770708284e-05, + "loss": 0.6275, + "step": 1351 + }, + { + "epoch": 1.73056, + "grad_norm": 0.6137747764587402, + "learning_rate": 4.7314925970388156e-05, + "loss": 0.722, + "step": 1352 + }, + { + "epoch": 1.73184, + "grad_norm": 0.6036489605903625, + "learning_rate": 4.731292517006803e-05, + "loss": 0.7132, + "step": 1353 + }, + { + "epoch": 1.73312, + "grad_norm": 0.6218430399894714, + "learning_rate": 4.73109243697479e-05, + "loss": 0.6794, + "step": 1354 + }, + { + "epoch": 1.7344, + "grad_norm": 0.582331120967865, + "learning_rate": 4.730892356942777e-05, + "loss": 0.7159, + "step": 1355 + }, + { + "epoch": 1.73568, + "grad_norm": 0.626875638961792, + "learning_rate": 4.730692276910765e-05, + "loss": 0.7335, + "step": 1356 + }, + { + "epoch": 1.7369599999999998, + "grad_norm": 0.6015035510063171, + "learning_rate": 4.730492196878752e-05, + "loss": 0.6889, + "step": 1357 + }, + { + "epoch": 1.73824, + "grad_norm": 0.6084362268447876, + "learning_rate": 4.730292116846739e-05, + "loss": 0.7034, + "step": 1358 + }, + { + "epoch": 1.73952, + "grad_norm": 0.6344590187072754, + "learning_rate": 4.730092036814726e-05, + "loss": 0.6738, + "step": 1359 + }, + { + "epoch": 1.7408000000000001, + "grad_norm": 0.6038119792938232, + "learning_rate": 4.729891956782713e-05, + "loss": 0.7495, + "step": 1360 + }, + { + "epoch": 1.74208, + "grad_norm": 0.5990369319915771, + "learning_rate": 4.7296918767507e-05, + "loss": 0.7376, + "step": 1361 + }, + { + "epoch": 1.74336, + "grad_norm": 0.6152486205101013, + "learning_rate": 4.7294917967186875e-05, + "loss": 0.6552, + "step": 1362 + }, + { + "epoch": 1.74464, + "grad_norm": 0.6658695936203003, + "learning_rate": 4.729291716686675e-05, + "loss": 0.7151, + "step": 1363 + }, + { + "epoch": 1.74592, + "grad_norm": 0.6392641067504883, + "learning_rate": 4.7290916366546625e-05, 
+ "loss": 0.7298, + "step": 1364 + }, + { + "epoch": 1.7471999999999999, + "grad_norm": 0.6059170961380005, + "learning_rate": 4.72889155662265e-05, + "loss": 0.7054, + "step": 1365 + }, + { + "epoch": 1.74848, + "grad_norm": 0.5970711708068848, + "learning_rate": 4.728691476590636e-05, + "loss": 0.7038, + "step": 1366 + }, + { + "epoch": 1.74976, + "grad_norm": 0.6231157183647156, + "learning_rate": 4.7284913965586234e-05, + "loss": 0.6903, + "step": 1367 + }, + { + "epoch": 1.7510400000000002, + "grad_norm": 0.6200019121170044, + "learning_rate": 4.7282913165266106e-05, + "loss": 0.6916, + "step": 1368 + }, + { + "epoch": 1.75232, + "grad_norm": 0.62948077917099, + "learning_rate": 4.728091236494598e-05, + "loss": 0.7023, + "step": 1369 + }, + { + "epoch": 1.7536, + "grad_norm": 0.6121958494186401, + "learning_rate": 4.7278911564625856e-05, + "loss": 0.6912, + "step": 1370 + }, + { + "epoch": 1.75488, + "grad_norm": 0.6384387612342834, + "learning_rate": 4.727691076430573e-05, + "loss": 0.7216, + "step": 1371 + }, + { + "epoch": 1.75616, + "grad_norm": 0.6484329700469971, + "learning_rate": 4.72749099639856e-05, + "loss": 0.6696, + "step": 1372 + }, + { + "epoch": 1.75744, + "grad_norm": 0.6550239324569702, + "learning_rate": 4.727290916366547e-05, + "loss": 0.7213, + "step": 1373 + }, + { + "epoch": 1.7587199999999998, + "grad_norm": 0.6201356649398804, + "learning_rate": 4.727090836334534e-05, + "loss": 0.7322, + "step": 1374 + }, + { + "epoch": 1.76, + "grad_norm": 0.6567116379737854, + "learning_rate": 4.726890756302521e-05, + "loss": 0.7732, + "step": 1375 + }, + { + "epoch": 1.76128, + "grad_norm": 0.6301355957984924, + "learning_rate": 4.726690676270508e-05, + "loss": 0.7397, + "step": 1376 + }, + { + "epoch": 1.7625600000000001, + "grad_norm": 0.6590511202812195, + "learning_rate": 4.726490596238496e-05, + "loss": 0.7581, + "step": 1377 + }, + { + "epoch": 1.76384, + "grad_norm": 0.6398580074310303, + "learning_rate": 4.726290516206483e-05, + "loss": 
0.7062, + "step": 1378 + }, + { + "epoch": 1.76512, + "grad_norm": 0.6533066630363464, + "learning_rate": 4.72609043617447e-05, + "loss": 0.7078, + "step": 1379 + }, + { + "epoch": 1.7664, + "grad_norm": 0.6254117488861084, + "learning_rate": 4.7258903561424575e-05, + "loss": 0.6917, + "step": 1380 + }, + { + "epoch": 1.76768, + "grad_norm": 0.6385900378227234, + "learning_rate": 4.725690276110445e-05, + "loss": 0.7951, + "step": 1381 + }, + { + "epoch": 1.7689599999999999, + "grad_norm": 0.6786746978759766, + "learning_rate": 4.725490196078431e-05, + "loss": 0.7645, + "step": 1382 + }, + { + "epoch": 1.77024, + "grad_norm": 0.6253941059112549, + "learning_rate": 4.7252901160464184e-05, + "loss": 0.7064, + "step": 1383 + }, + { + "epoch": 1.77152, + "grad_norm": 0.6229246258735657, + "learning_rate": 4.725090036014406e-05, + "loss": 0.7386, + "step": 1384 + }, + { + "epoch": 1.7728000000000002, + "grad_norm": 0.6111705303192139, + "learning_rate": 4.7248899559823934e-05, + "loss": 0.6646, + "step": 1385 + }, + { + "epoch": 1.77408, + "grad_norm": 0.6427714228630066, + "learning_rate": 4.7246898759503806e-05, + "loss": 0.7058, + "step": 1386 + }, + { + "epoch": 1.77536, + "grad_norm": 0.635543704032898, + "learning_rate": 4.724489795918368e-05, + "loss": 0.7016, + "step": 1387 + }, + { + "epoch": 1.77664, + "grad_norm": 0.6031673550605774, + "learning_rate": 4.724289715886355e-05, + "loss": 0.6427, + "step": 1388 + }, + { + "epoch": 1.77792, + "grad_norm": 0.6220544576644897, + "learning_rate": 4.724089635854342e-05, + "loss": 0.731, + "step": 1389 + }, + { + "epoch": 1.7792, + "grad_norm": 0.6029757261276245, + "learning_rate": 4.723889555822329e-05, + "loss": 0.7165, + "step": 1390 + }, + { + "epoch": 1.7804799999999998, + "grad_norm": 0.6523854732513428, + "learning_rate": 4.7236894757903165e-05, + "loss": 0.7179, + "step": 1391 + }, + { + "epoch": 1.78176, + "grad_norm": 0.6103372573852539, + "learning_rate": 4.723489395758304e-05, + "loss": 0.6943, + "step": 
1392 + }, + { + "epoch": 1.78304, + "grad_norm": 0.631454348564148, + "learning_rate": 4.723289315726291e-05, + "loss": 0.7071, + "step": 1393 + }, + { + "epoch": 1.7843200000000001, + "grad_norm": 0.6054040193557739, + "learning_rate": 4.723089235694278e-05, + "loss": 0.6621, + "step": 1394 + }, + { + "epoch": 1.7856, + "grad_norm": 0.6552069783210754, + "learning_rate": 4.722889155662265e-05, + "loss": 0.7767, + "step": 1395 + }, + { + "epoch": 1.78688, + "grad_norm": 0.6483544111251831, + "learning_rate": 4.7226890756302525e-05, + "loss": 0.759, + "step": 1396 + }, + { + "epoch": 1.78816, + "grad_norm": 0.6473653316497803, + "learning_rate": 4.7224889955982396e-05, + "loss": 0.6879, + "step": 1397 + }, + { + "epoch": 1.78944, + "grad_norm": 0.6504471898078918, + "learning_rate": 4.722288915566227e-05, + "loss": 0.7329, + "step": 1398 + }, + { + "epoch": 1.7907199999999999, + "grad_norm": 0.6064606308937073, + "learning_rate": 4.722088835534214e-05, + "loss": 0.7472, + "step": 1399 + }, + { + "epoch": 1.792, + "grad_norm": 0.6191816926002502, + "learning_rate": 4.721888755502201e-05, + "loss": 0.7031, + "step": 1400 + }, + { + "epoch": 1.79328, + "grad_norm": 0.6464985013008118, + "learning_rate": 4.7216886754701884e-05, + "loss": 0.7053, + "step": 1401 + }, + { + "epoch": 1.7945600000000002, + "grad_norm": 0.6391028165817261, + "learning_rate": 4.7214885954381756e-05, + "loss": 0.6984, + "step": 1402 + }, + { + "epoch": 1.79584, + "grad_norm": 0.6238522529602051, + "learning_rate": 4.721288515406163e-05, + "loss": 0.7134, + "step": 1403 + }, + { + "epoch": 1.79712, + "grad_norm": 0.6372814774513245, + "learning_rate": 4.72108843537415e-05, + "loss": 0.741, + "step": 1404 + }, + { + "epoch": 1.7984, + "grad_norm": 0.6125516295433044, + "learning_rate": 4.720888355342137e-05, + "loss": 0.7364, + "step": 1405 + }, + { + "epoch": 1.79968, + "grad_norm": 0.6260775923728943, + "learning_rate": 4.720688275310124e-05, + "loss": 0.7158, + "step": 1406 + }, + { + "epoch": 
1.80096, + "grad_norm": 0.6567710041999817, + "learning_rate": 4.7204881952781115e-05, + "loss": 0.7745, + "step": 1407 + }, + { + "epoch": 1.8022399999999998, + "grad_norm": 0.6455616354942322, + "learning_rate": 4.720288115246099e-05, + "loss": 0.7038, + "step": 1408 + }, + { + "epoch": 1.80352, + "grad_norm": 0.6271963715553284, + "learning_rate": 4.720088035214086e-05, + "loss": 0.7339, + "step": 1409 + }, + { + "epoch": 1.8048, + "grad_norm": 0.6165766716003418, + "learning_rate": 4.719887955182073e-05, + "loss": 0.8227, + "step": 1410 + }, + { + "epoch": 1.8060800000000001, + "grad_norm": 0.6498475074768066, + "learning_rate": 4.71968787515006e-05, + "loss": 0.7552, + "step": 1411 + }, + { + "epoch": 1.80736, + "grad_norm": 0.6534706950187683, + "learning_rate": 4.7194877951180474e-05, + "loss": 0.7461, + "step": 1412 + }, + { + "epoch": 1.80864, + "grad_norm": 0.6396170854568481, + "learning_rate": 4.7192877150860346e-05, + "loss": 0.7051, + "step": 1413 + }, + { + "epoch": 1.80992, + "grad_norm": 0.6736233234405518, + "learning_rate": 4.719087635054022e-05, + "loss": 0.7181, + "step": 1414 + }, + { + "epoch": 1.8112, + "grad_norm": 0.7057044506072998, + "learning_rate": 4.718887555022009e-05, + "loss": 0.7447, + "step": 1415 + }, + { + "epoch": 1.8124799999999999, + "grad_norm": 0.639115571975708, + "learning_rate": 4.718687474989996e-05, + "loss": 0.702, + "step": 1416 + }, + { + "epoch": 1.81376, + "grad_norm": 0.6411137580871582, + "learning_rate": 4.7184873949579834e-05, + "loss": 0.7378, + "step": 1417 + }, + { + "epoch": 1.81504, + "grad_norm": 0.618817925453186, + "learning_rate": 4.7182873149259705e-05, + "loss": 0.6965, + "step": 1418 + }, + { + "epoch": 1.8163200000000002, + "grad_norm": 0.6112127304077148, + "learning_rate": 4.718087234893958e-05, + "loss": 0.7387, + "step": 1419 + }, + { + "epoch": 1.8176, + "grad_norm": 0.6073436737060547, + "learning_rate": 4.717887154861945e-05, + "loss": 0.6605, + "step": 1420 + }, + { + "epoch": 1.81888, + 
"grad_norm": 0.5831905007362366, + "learning_rate": 4.717687074829932e-05, + "loss": 0.6903, + "step": 1421 + }, + { + "epoch": 1.82016, + "grad_norm": 0.6208034753799438, + "learning_rate": 4.717486994797919e-05, + "loss": 0.7061, + "step": 1422 + }, + { + "epoch": 1.82144, + "grad_norm": 0.6343234181404114, + "learning_rate": 4.7172869147659065e-05, + "loss": 0.7128, + "step": 1423 + }, + { + "epoch": 1.82272, + "grad_norm": 0.6611493229866028, + "learning_rate": 4.7170868347338937e-05, + "loss": 0.7567, + "step": 1424 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.6101629734039307, + "learning_rate": 4.716886754701881e-05, + "loss": 0.7273, + "step": 1425 + }, + { + "epoch": 1.82528, + "grad_norm": 0.5913015604019165, + "learning_rate": 4.716686674669868e-05, + "loss": 0.6477, + "step": 1426 + }, + { + "epoch": 1.82656, + "grad_norm": 0.5725111365318298, + "learning_rate": 4.716486594637856e-05, + "loss": 0.7042, + "step": 1427 + }, + { + "epoch": 1.8278400000000001, + "grad_norm": 0.5874539017677307, + "learning_rate": 4.7162865146058424e-05, + "loss": 0.6511, + "step": 1428 + }, + { + "epoch": 1.82912, + "grad_norm": 0.6199377179145813, + "learning_rate": 4.7160864345738296e-05, + "loss": 0.6984, + "step": 1429 + }, + { + "epoch": 1.8304, + "grad_norm": 0.6231164932250977, + "learning_rate": 4.715886354541817e-05, + "loss": 0.6653, + "step": 1430 + }, + { + "epoch": 1.83168, + "grad_norm": 0.6409709453582764, + "learning_rate": 4.715686274509804e-05, + "loss": 0.7573, + "step": 1431 + }, + { + "epoch": 1.83296, + "grad_norm": 0.5936411023139954, + "learning_rate": 4.715486194477791e-05, + "loss": 0.7043, + "step": 1432 + }, + { + "epoch": 1.8342399999999999, + "grad_norm": 0.6537723541259766, + "learning_rate": 4.715286114445778e-05, + "loss": 0.7084, + "step": 1433 + }, + { + "epoch": 1.83552, + "grad_norm": 0.5906792283058167, + "learning_rate": 4.715086034413766e-05, + "loss": 0.6799, + "step": 1434 + }, + { + "epoch": 1.8368, + "grad_norm": 
0.5938474535942078, + "learning_rate": 4.7148859543817534e-05, + "loss": 0.6764, + "step": 1435 + }, + { + "epoch": 1.8380800000000002, + "grad_norm": 0.6261917948722839, + "learning_rate": 4.71468587434974e-05, + "loss": 0.7256, + "step": 1436 + }, + { + "epoch": 1.83936, + "grad_norm": 0.6289463043212891, + "learning_rate": 4.714485794317727e-05, + "loss": 0.7277, + "step": 1437 + }, + { + "epoch": 1.84064, + "grad_norm": 0.606020987033844, + "learning_rate": 4.714285714285714e-05, + "loss": 0.6845, + "step": 1438 + }, + { + "epoch": 1.84192, + "grad_norm": 0.6375628709793091, + "learning_rate": 4.7140856342537014e-05, + "loss": 0.7181, + "step": 1439 + }, + { + "epoch": 1.8432, + "grad_norm": 0.6486073136329651, + "learning_rate": 4.7138855542216886e-05, + "loss": 0.7067, + "step": 1440 + }, + { + "epoch": 1.84448, + "grad_norm": 0.6745935082435608, + "learning_rate": 4.7136854741896765e-05, + "loss": 0.7309, + "step": 1441 + }, + { + "epoch": 1.8457599999999998, + "grad_norm": 0.604806125164032, + "learning_rate": 4.713485394157664e-05, + "loss": 0.7268, + "step": 1442 + }, + { + "epoch": 1.84704, + "grad_norm": 0.6913792490959167, + "learning_rate": 4.713285314125651e-05, + "loss": 0.7569, + "step": 1443 + }, + { + "epoch": 1.84832, + "grad_norm": 0.5856841802597046, + "learning_rate": 4.7130852340936374e-05, + "loss": 0.6421, + "step": 1444 + }, + { + "epoch": 1.8496000000000001, + "grad_norm": 0.6320177316665649, + "learning_rate": 4.7128851540616246e-05, + "loss": 0.7385, + "step": 1445 + }, + { + "epoch": 1.85088, + "grad_norm": 0.6420151591300964, + "learning_rate": 4.712685074029612e-05, + "loss": 0.6881, + "step": 1446 + }, + { + "epoch": 1.85216, + "grad_norm": 0.6870813965797424, + "learning_rate": 4.712484993997599e-05, + "loss": 0.7299, + "step": 1447 + }, + { + "epoch": 1.85344, + "grad_norm": 0.6391651034355164, + "learning_rate": 4.712284913965587e-05, + "loss": 0.68, + "step": 1448 + }, + { + "epoch": 1.85472, + "grad_norm": 0.681445300579071, + 
"learning_rate": 4.712084833933574e-05, + "loss": 0.7795, + "step": 1449 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.6424078941345215, + "learning_rate": 4.711884753901561e-05, + "loss": 0.7234, + "step": 1450 + }, + { + "epoch": 1.85728, + "grad_norm": 0.625423014163971, + "learning_rate": 4.7116846738695484e-05, + "loss": 0.6631, + "step": 1451 + }, + { + "epoch": 1.85856, + "grad_norm": 0.611112117767334, + "learning_rate": 4.711484593837535e-05, + "loss": 0.7227, + "step": 1452 + }, + { + "epoch": 1.8598400000000002, + "grad_norm": 0.6501821279525757, + "learning_rate": 4.711284513805522e-05, + "loss": 0.7494, + "step": 1453 + }, + { + "epoch": 1.86112, + "grad_norm": 0.6695547699928284, + "learning_rate": 4.711084433773509e-05, + "loss": 0.7335, + "step": 1454 + }, + { + "epoch": 1.8624, + "grad_norm": 0.6903998851776123, + "learning_rate": 4.710884353741497e-05, + "loss": 0.7594, + "step": 1455 + }, + { + "epoch": 1.86368, + "grad_norm": 0.6672253608703613, + "learning_rate": 4.710684273709484e-05, + "loss": 0.7182, + "step": 1456 + }, + { + "epoch": 1.86496, + "grad_norm": 0.6630045175552368, + "learning_rate": 4.7104841936774715e-05, + "loss": 0.7251, + "step": 1457 + }, + { + "epoch": 1.86624, + "grad_norm": 0.6247894167900085, + "learning_rate": 4.7102841136454587e-05, + "loss": 0.7358, + "step": 1458 + }, + { + "epoch": 1.8675199999999998, + "grad_norm": 0.595182478427887, + "learning_rate": 4.710084033613446e-05, + "loss": 0.6497, + "step": 1459 + }, + { + "epoch": 1.8688, + "grad_norm": 0.6355342864990234, + "learning_rate": 4.7098839535814323e-05, + "loss": 0.6925, + "step": 1460 + }, + { + "epoch": 1.87008, + "grad_norm": 0.6582742929458618, + "learning_rate": 4.7096838735494195e-05, + "loss": 0.7062, + "step": 1461 + }, + { + "epoch": 1.8713600000000001, + "grad_norm": 0.5886150598526001, + "learning_rate": 4.7094837935174074e-05, + "loss": 0.6701, + "step": 1462 + }, + { + "epoch": 1.87264, + "grad_norm": 0.6048754453659058, + 
"learning_rate": 4.7092837134853946e-05, + "loss": 0.6841, + "step": 1463 + }, + { + "epoch": 1.87392, + "grad_norm": 0.6366539597511292, + "learning_rate": 4.709083633453382e-05, + "loss": 0.6991, + "step": 1464 + }, + { + "epoch": 1.8752, + "grad_norm": 0.5844667553901672, + "learning_rate": 4.708883553421369e-05, + "loss": 0.6269, + "step": 1465 + }, + { + "epoch": 1.87648, + "grad_norm": 0.5977579951286316, + "learning_rate": 4.708683473389356e-05, + "loss": 0.6561, + "step": 1466 + }, + { + "epoch": 1.8777599999999999, + "grad_norm": 0.6138444542884827, + "learning_rate": 4.708483393357343e-05, + "loss": 0.6814, + "step": 1467 + }, + { + "epoch": 1.87904, + "grad_norm": 0.6392626762390137, + "learning_rate": 4.70828331332533e-05, + "loss": 0.6931, + "step": 1468 + }, + { + "epoch": 1.88032, + "grad_norm": 0.666708767414093, + "learning_rate": 4.708083233293318e-05, + "loss": 0.7474, + "step": 1469 + }, + { + "epoch": 1.8816000000000002, + "grad_norm": 0.6352314949035645, + "learning_rate": 4.707883153261305e-05, + "loss": 0.7012, + "step": 1470 + }, + { + "epoch": 1.88288, + "grad_norm": 0.6222551465034485, + "learning_rate": 4.707683073229292e-05, + "loss": 0.7133, + "step": 1471 + }, + { + "epoch": 1.88416, + "grad_norm": 0.643947958946228, + "learning_rate": 4.707482993197279e-05, + "loss": 0.7556, + "step": 1472 + }, + { + "epoch": 1.88544, + "grad_norm": 0.5932632684707642, + "learning_rate": 4.7072829131652664e-05, + "loss": 0.6847, + "step": 1473 + }, + { + "epoch": 1.88672, + "grad_norm": 0.6231780648231506, + "learning_rate": 4.7070828331332536e-05, + "loss": 0.6971, + "step": 1474 + }, + { + "epoch": 1.888, + "grad_norm": 0.6272242665290833, + "learning_rate": 4.706882753101241e-05, + "loss": 0.7149, + "step": 1475 + }, + { + "epoch": 1.8892799999999998, + "grad_norm": 0.6251195073127747, + "learning_rate": 4.706682673069228e-05, + "loss": 0.685, + "step": 1476 + }, + { + "epoch": 1.89056, + "grad_norm": 0.6322864890098572, + "learning_rate": 
4.706482593037215e-05, + "loss": 0.7115, + "step": 1477 + }, + { + "epoch": 1.89184, + "grad_norm": 0.6262922286987305, + "learning_rate": 4.7062825130052024e-05, + "loss": 0.7272, + "step": 1478 + }, + { + "epoch": 1.8931200000000001, + "grad_norm": 0.6692927479743958, + "learning_rate": 4.7060824329731896e-05, + "loss": 0.7201, + "step": 1479 + }, + { + "epoch": 1.8944, + "grad_norm": 0.58826744556427, + "learning_rate": 4.705882352941177e-05, + "loss": 0.6663, + "step": 1480 + }, + { + "epoch": 1.89568, + "grad_norm": 0.6134018301963806, + "learning_rate": 4.705682272909164e-05, + "loss": 0.7319, + "step": 1481 + }, + { + "epoch": 1.89696, + "grad_norm": 0.6586698889732361, + "learning_rate": 4.705482192877151e-05, + "loss": 0.7324, + "step": 1482 + }, + { + "epoch": 1.89824, + "grad_norm": 0.6096124649047852, + "learning_rate": 4.705282112845138e-05, + "loss": 0.7266, + "step": 1483 + }, + { + "epoch": 1.8995199999999999, + "grad_norm": 0.6256430149078369, + "learning_rate": 4.7050820328131255e-05, + "loss": 0.6926, + "step": 1484 + }, + { + "epoch": 1.9008, + "grad_norm": 0.5955308675765991, + "learning_rate": 4.704881952781113e-05, + "loss": 0.7232, + "step": 1485 + }, + { + "epoch": 1.90208, + "grad_norm": 0.6191076040267944, + "learning_rate": 4.7046818727491e-05, + "loss": 0.6879, + "step": 1486 + }, + { + "epoch": 1.9033600000000002, + "grad_norm": 0.6182354688644409, + "learning_rate": 4.704481792717087e-05, + "loss": 0.7449, + "step": 1487 + }, + { + "epoch": 1.90464, + "grad_norm": 0.6782128214836121, + "learning_rate": 4.704281712685074e-05, + "loss": 0.7688, + "step": 1488 + }, + { + "epoch": 1.90592, + "grad_norm": 0.6402320265769958, + "learning_rate": 4.7040816326530614e-05, + "loss": 0.7376, + "step": 1489 + }, + { + "epoch": 1.9072, + "grad_norm": 0.6744869351387024, + "learning_rate": 4.7038815526210486e-05, + "loss": 0.8047, + "step": 1490 + }, + { + "epoch": 1.90848, + "grad_norm": 0.6262235045433044, + "learning_rate": 4.703681472589036e-05, 
+ "loss": 0.6589, + "step": 1491 + }, + { + "epoch": 1.90976, + "grad_norm": 0.6338122487068176, + "learning_rate": 4.703481392557023e-05, + "loss": 0.7067, + "step": 1492 + }, + { + "epoch": 1.9110399999999998, + "grad_norm": 0.6535120010375977, + "learning_rate": 4.70328131252501e-05, + "loss": 0.6962, + "step": 1493 + }, + { + "epoch": 1.91232, + "grad_norm": 0.6537796854972839, + "learning_rate": 4.703081232492997e-05, + "loss": 0.7182, + "step": 1494 + }, + { + "epoch": 1.9136, + "grad_norm": 0.6153785586357117, + "learning_rate": 4.7028811524609845e-05, + "loss": 0.6872, + "step": 1495 + }, + { + "epoch": 1.9148800000000001, + "grad_norm": 0.6132374405860901, + "learning_rate": 4.702681072428972e-05, + "loss": 0.6838, + "step": 1496 + }, + { + "epoch": 1.91616, + "grad_norm": 0.6539681553840637, + "learning_rate": 4.7024809923969596e-05, + "loss": 0.7467, + "step": 1497 + }, + { + "epoch": 1.91744, + "grad_norm": 0.6296167373657227, + "learning_rate": 4.702280912364946e-05, + "loss": 0.6825, + "step": 1498 + }, + { + "epoch": 1.91872, + "grad_norm": 0.6461489796638489, + "learning_rate": 4.702080832332933e-05, + "loss": 0.7336, + "step": 1499 + }, + { + "epoch": 1.92, + "grad_norm": 0.6219279170036316, + "learning_rate": 4.7018807523009204e-05, + "loss": 0.7172, + "step": 1500 + }, + { + "epoch": 1.9212799999999999, + "grad_norm": 0.6014849543571472, + "learning_rate": 4.7016806722689076e-05, + "loss": 0.7528, + "step": 1501 + }, + { + "epoch": 1.92256, + "grad_norm": 0.6287034749984741, + "learning_rate": 4.701480592236895e-05, + "loss": 0.7724, + "step": 1502 + }, + { + "epoch": 1.92384, + "grad_norm": 0.6444774270057678, + "learning_rate": 4.701280512204882e-05, + "loss": 0.7873, + "step": 1503 + }, + { + "epoch": 1.9251200000000002, + "grad_norm": 0.631040096282959, + "learning_rate": 4.70108043217287e-05, + "loss": 0.7204, + "step": 1504 + }, + { + "epoch": 1.9264000000000001, + "grad_norm": 0.6389703750610352, + "learning_rate": 4.700880352140857e-05, + 
"loss": 0.7623, + "step": 1505 + }, + { + "epoch": 1.92768, + "grad_norm": 0.6128464341163635, + "learning_rate": 4.7006802721088436e-05, + "loss": 0.7311, + "step": 1506 + }, + { + "epoch": 1.92896, + "grad_norm": 0.6340726613998413, + "learning_rate": 4.700480192076831e-05, + "loss": 0.7277, + "step": 1507 + }, + { + "epoch": 1.93024, + "grad_norm": 0.6153057813644409, + "learning_rate": 4.700280112044818e-05, + "loss": 0.7087, + "step": 1508 + }, + { + "epoch": 1.93152, + "grad_norm": 0.6262139081954956, + "learning_rate": 4.700080032012805e-05, + "loss": 0.6813, + "step": 1509 + }, + { + "epoch": 1.9327999999999999, + "grad_norm": 0.6372281908988953, + "learning_rate": 4.699879951980792e-05, + "loss": 0.6867, + "step": 1510 + }, + { + "epoch": 1.93408, + "grad_norm": 0.6328516602516174, + "learning_rate": 4.69967987194878e-05, + "loss": 0.7376, + "step": 1511 + }, + { + "epoch": 1.93536, + "grad_norm": 0.6271541118621826, + "learning_rate": 4.6994797919167674e-05, + "loss": 0.7694, + "step": 1512 + }, + { + "epoch": 1.9366400000000001, + "grad_norm": 0.6466159224510193, + "learning_rate": 4.6992797118847545e-05, + "loss": 0.7072, + "step": 1513 + }, + { + "epoch": 1.93792, + "grad_norm": 0.6909953355789185, + "learning_rate": 4.699079631852741e-05, + "loss": 0.7634, + "step": 1514 + }, + { + "epoch": 1.9392, + "grad_norm": 0.6303381323814392, + "learning_rate": 4.698879551820728e-05, + "loss": 0.802, + "step": 1515 + }, + { + "epoch": 1.94048, + "grad_norm": 0.6296682953834534, + "learning_rate": 4.6986794717887154e-05, + "loss": 0.7861, + "step": 1516 + }, + { + "epoch": 1.94176, + "grad_norm": 0.6130115985870361, + "learning_rate": 4.6984793917567026e-05, + "loss": 0.697, + "step": 1517 + }, + { + "epoch": 1.9430399999999999, + "grad_norm": 0.6313830614089966, + "learning_rate": 4.69827931172469e-05, + "loss": 0.7331, + "step": 1518 + }, + { + "epoch": 1.94432, + "grad_norm": 0.5878955125808716, + "learning_rate": 4.6980792316926777e-05, + "loss": 0.6528, + 
"step": 1519 + }, + { + "epoch": 1.9456, + "grad_norm": 0.6016858220100403, + "learning_rate": 4.697879151660665e-05, + "loss": 0.687, + "step": 1520 + }, + { + "epoch": 1.9468800000000002, + "grad_norm": 0.6477680802345276, + "learning_rate": 4.697679071628652e-05, + "loss": 0.7101, + "step": 1521 + }, + { + "epoch": 1.9481600000000001, + "grad_norm": 0.6151396632194519, + "learning_rate": 4.6974789915966385e-05, + "loss": 0.7067, + "step": 1522 + }, + { + "epoch": 1.94944, + "grad_norm": 0.6041195392608643, + "learning_rate": 4.697278911564626e-05, + "loss": 0.7499, + "step": 1523 + }, + { + "epoch": 1.95072, + "grad_norm": 0.6384760737419128, + "learning_rate": 4.697078831532613e-05, + "loss": 0.7452, + "step": 1524 + }, + { + "epoch": 1.952, + "grad_norm": 0.6926515698432922, + "learning_rate": 4.6968787515006e-05, + "loss": 0.7656, + "step": 1525 + }, + { + "epoch": 1.95328, + "grad_norm": 0.652237594127655, + "learning_rate": 4.696678671468588e-05, + "loss": 0.7503, + "step": 1526 + }, + { + "epoch": 1.9545599999999999, + "grad_norm": 0.6000808477401733, + "learning_rate": 4.696478591436575e-05, + "loss": 0.6663, + "step": 1527 + }, + { + "epoch": 1.95584, + "grad_norm": 0.6068828105926514, + "learning_rate": 4.696278511404562e-05, + "loss": 0.8046, + "step": 1528 + }, + { + "epoch": 1.95712, + "grad_norm": 0.6311643123626709, + "learning_rate": 4.6960784313725495e-05, + "loss": 0.7473, + "step": 1529 + }, + { + "epoch": 1.9584000000000001, + "grad_norm": 0.6560335755348206, + "learning_rate": 4.695878351340536e-05, + "loss": 0.7188, + "step": 1530 + }, + { + "epoch": 1.95968, + "grad_norm": 0.6460652351379395, + "learning_rate": 4.695678271308523e-05, + "loss": 0.6784, + "step": 1531 + }, + { + "epoch": 1.96096, + "grad_norm": 0.6522865295410156, + "learning_rate": 4.6954781912765104e-05, + "loss": 0.7078, + "step": 1532 + }, + { + "epoch": 1.96224, + "grad_norm": 0.6220058798789978, + "learning_rate": 4.695278111244498e-05, + "loss": 0.7121, + "step": 1533 
+ }, + { + "epoch": 1.96352, + "grad_norm": 0.6365997791290283, + "learning_rate": 4.6950780312124854e-05, + "loss": 0.6876, + "step": 1534 + }, + { + "epoch": 1.9647999999999999, + "grad_norm": 0.6620326042175293, + "learning_rate": 4.6948779511804726e-05, + "loss": 0.7311, + "step": 1535 + }, + { + "epoch": 1.96608, + "grad_norm": 0.6124232411384583, + "learning_rate": 4.69467787114846e-05, + "loss": 0.7134, + "step": 1536 + }, + { + "epoch": 1.96736, + "grad_norm": 0.6828178763389587, + "learning_rate": 4.694477791116447e-05, + "loss": 0.7439, + "step": 1537 + }, + { + "epoch": 1.96864, + "grad_norm": 0.6395293474197388, + "learning_rate": 4.6942777110844335e-05, + "loss": 0.7836, + "step": 1538 + }, + { + "epoch": 1.9699200000000001, + "grad_norm": 0.6207759976387024, + "learning_rate": 4.694077631052421e-05, + "loss": 0.6739, + "step": 1539 + }, + { + "epoch": 1.9712, + "grad_norm": 0.6403487324714661, + "learning_rate": 4.6938775510204086e-05, + "loss": 0.7665, + "step": 1540 + }, + { + "epoch": 1.97248, + "grad_norm": 0.6810711026191711, + "learning_rate": 4.693677470988396e-05, + "loss": 0.7825, + "step": 1541 + }, + { + "epoch": 1.97376, + "grad_norm": 0.6638922095298767, + "learning_rate": 4.693477390956383e-05, + "loss": 0.7517, + "step": 1542 + }, + { + "epoch": 1.97504, + "grad_norm": 0.5913483500480652, + "learning_rate": 4.69327731092437e-05, + "loss": 0.6869, + "step": 1543 + }, + { + "epoch": 1.9763199999999999, + "grad_norm": 0.6187199354171753, + "learning_rate": 4.693077230892357e-05, + "loss": 0.7321, + "step": 1544 + }, + { + "epoch": 1.9776, + "grad_norm": 0.6598728895187378, + "learning_rate": 4.6928771508603445e-05, + "loss": 0.7358, + "step": 1545 + }, + { + "epoch": 1.97888, + "grad_norm": 0.6453779935836792, + "learning_rate": 4.692677070828331e-05, + "loss": 0.7885, + "step": 1546 + }, + { + "epoch": 1.9801600000000001, + "grad_norm": 0.62159663438797, + "learning_rate": 4.692476990796319e-05, + "loss": 0.6755, + "step": 1547 + }, + { + 
"epoch": 1.98144, + "grad_norm": 0.5968536734580994, + "learning_rate": 4.692276910764306e-05, + "loss": 0.6611, + "step": 1548 + }, + { + "epoch": 1.98272, + "grad_norm": 0.6375271081924438, + "learning_rate": 4.692076830732293e-05, + "loss": 0.6993, + "step": 1549 + }, + { + "epoch": 1.984, + "grad_norm": 0.6619069576263428, + "learning_rate": 4.6918767507002804e-05, + "loss": 0.6928, + "step": 1550 + }, + { + "epoch": 1.98528, + "grad_norm": 0.6253162026405334, + "learning_rate": 4.6916766706682676e-05, + "loss": 0.66, + "step": 1551 + }, + { + "epoch": 1.9865599999999999, + "grad_norm": 0.6479495167732239, + "learning_rate": 4.691476590636255e-05, + "loss": 0.6365, + "step": 1552 + }, + { + "epoch": 1.98784, + "grad_norm": 0.6379725337028503, + "learning_rate": 4.691276510604242e-05, + "loss": 0.7077, + "step": 1553 + }, + { + "epoch": 1.98912, + "grad_norm": 0.6354682445526123, + "learning_rate": 4.691076430572229e-05, + "loss": 0.7052, + "step": 1554 + }, + { + "epoch": 1.9904, + "grad_norm": 0.6414213180541992, + "learning_rate": 4.6908763505402163e-05, + "loss": 0.6586, + "step": 1555 + }, + { + "epoch": 1.9916800000000001, + "grad_norm": 0.6093000173568726, + "learning_rate": 4.6906762705082035e-05, + "loss": 0.7352, + "step": 1556 + }, + { + "epoch": 1.99296, + "grad_norm": 0.6520658135414124, + "learning_rate": 4.690476190476191e-05, + "loss": 0.7425, + "step": 1557 + }, + { + "epoch": 1.99424, + "grad_norm": 0.635176420211792, + "learning_rate": 4.690276110444178e-05, + "loss": 0.7179, + "step": 1558 + }, + { + "epoch": 1.99552, + "grad_norm": 0.6376418471336365, + "learning_rate": 4.690076030412165e-05, + "loss": 0.7567, + "step": 1559 + }, + { + "epoch": 1.9968, + "grad_norm": 0.6329599022865295, + "learning_rate": 4.689875950380152e-05, + "loss": 0.7107, + "step": 1560 + }, + { + "epoch": 1.9980799999999999, + "grad_norm": 0.6018700003623962, + "learning_rate": 4.6896758703481395e-05, + "loss": 0.6841, + "step": 1561 + }, + { + "epoch": 1.99936, + 
"grad_norm": 0.6212313771247864, + "learning_rate": 4.6894757903161266e-05, + "loss": 0.7403, + "step": 1562 + }, + { + "epoch": 2.00064, + "grad_norm": 1.2921080589294434, + "learning_rate": 4.689275710284114e-05, + "loss": 1.147, + "step": 1563 + }, + { + "epoch": 2.00192, + "grad_norm": 0.6272991299629211, + "learning_rate": 4.689075630252101e-05, + "loss": 0.6922, + "step": 1564 + }, + { + "epoch": 2.0032, + "grad_norm": 0.6323875188827515, + "learning_rate": 4.688875550220088e-05, + "loss": 0.721, + "step": 1565 + }, + { + "epoch": 2.00448, + "grad_norm": 0.6615299582481384, + "learning_rate": 4.6886754701880754e-05, + "loss": 0.7782, + "step": 1566 + }, + { + "epoch": 2.00576, + "grad_norm": 0.6855239868164062, + "learning_rate": 4.6884753901560626e-05, + "loss": 0.7006, + "step": 1567 + }, + { + "epoch": 2.00704, + "grad_norm": 0.5969696640968323, + "learning_rate": 4.68827531012405e-05, + "loss": 0.6765, + "step": 1568 + }, + { + "epoch": 2.00832, + "grad_norm": 0.6360341310501099, + "learning_rate": 4.688075230092037e-05, + "loss": 0.7551, + "step": 1569 + }, + { + "epoch": 2.0096, + "grad_norm": 0.6344780325889587, + "learning_rate": 4.687875150060024e-05, + "loss": 0.7285, + "step": 1570 + }, + { + "epoch": 2.0108800000000002, + "grad_norm": 0.6265085935592651, + "learning_rate": 4.687675070028011e-05, + "loss": 0.693, + "step": 1571 + }, + { + "epoch": 2.01216, + "grad_norm": 0.6241620182991028, + "learning_rate": 4.6874749899959985e-05, + "loss": 0.6636, + "step": 1572 + }, + { + "epoch": 2.01344, + "grad_norm": 0.6464859843254089, + "learning_rate": 4.687274909963986e-05, + "loss": 0.7299, + "step": 1573 + }, + { + "epoch": 2.01472, + "grad_norm": 0.6327393651008606, + "learning_rate": 4.687074829931973e-05, + "loss": 0.6387, + "step": 1574 + }, + { + "epoch": 2.016, + "grad_norm": 0.6345120072364807, + "learning_rate": 4.686874749899961e-05, + "loss": 0.6943, + "step": 1575 + }, + { + "epoch": 2.01728, + "grad_norm": 0.6209046244621277, + 
"learning_rate": 4.686674669867947e-05, + "loss": 0.6712, + "step": 1576 + }, + { + "epoch": 2.01856, + "grad_norm": 0.6531869173049927, + "learning_rate": 4.6864745898359344e-05, + "loss": 0.7195, + "step": 1577 + }, + { + "epoch": 2.01984, + "grad_norm": 0.6831356287002563, + "learning_rate": 4.6862745098039216e-05, + "loss": 0.7457, + "step": 1578 + }, + { + "epoch": 2.02112, + "grad_norm": 0.6502741575241089, + "learning_rate": 4.686074429771909e-05, + "loss": 0.7308, + "step": 1579 + }, + { + "epoch": 2.0224, + "grad_norm": 0.661711573600769, + "learning_rate": 4.685874349739896e-05, + "loss": 0.7113, + "step": 1580 + }, + { + "epoch": 2.02368, + "grad_norm": 0.6132927536964417, + "learning_rate": 4.685674269707883e-05, + "loss": 0.6587, + "step": 1581 + }, + { + "epoch": 2.02496, + "grad_norm": 0.6607900261878967, + "learning_rate": 4.685474189675871e-05, + "loss": 0.6695, + "step": 1582 + }, + { + "epoch": 2.02624, + "grad_norm": 0.6305644512176514, + "learning_rate": 4.685274109643858e-05, + "loss": 0.6834, + "step": 1583 + }, + { + "epoch": 2.02752, + "grad_norm": 0.6461015939712524, + "learning_rate": 4.685074029611845e-05, + "loss": 0.7033, + "step": 1584 + }, + { + "epoch": 2.0288, + "grad_norm": 0.6151067614555359, + "learning_rate": 4.684873949579832e-05, + "loss": 0.7138, + "step": 1585 + }, + { + "epoch": 2.03008, + "grad_norm": 0.6099941730499268, + "learning_rate": 4.684673869547819e-05, + "loss": 0.6364, + "step": 1586 + }, + { + "epoch": 2.03136, + "grad_norm": 0.6969642639160156, + "learning_rate": 4.684473789515806e-05, + "loss": 0.755, + "step": 1587 + }, + { + "epoch": 2.03264, + "grad_norm": 0.6614975929260254, + "learning_rate": 4.6842737094837935e-05, + "loss": 0.7028, + "step": 1588 + }, + { + "epoch": 2.03392, + "grad_norm": 0.6838937401771545, + "learning_rate": 4.684073629451781e-05, + "loss": 0.8004, + "step": 1589 + }, + { + "epoch": 2.0352, + "grad_norm": 0.6104751229286194, + "learning_rate": 4.6838735494197685e-05, + "loss": 
0.7308, + "step": 1590 + }, + { + "epoch": 2.03648, + "grad_norm": 0.6918980479240417, + "learning_rate": 4.683673469387756e-05, + "loss": 0.7753, + "step": 1591 + }, + { + "epoch": 2.03776, + "grad_norm": 0.6084235906600952, + "learning_rate": 4.683473389355742e-05, + "loss": 0.6957, + "step": 1592 + }, + { + "epoch": 2.03904, + "grad_norm": 0.5943331718444824, + "learning_rate": 4.6832733093237294e-05, + "loss": 0.6609, + "step": 1593 + }, + { + "epoch": 2.04032, + "grad_norm": 0.6553683876991272, + "learning_rate": 4.6830732292917166e-05, + "loss": 0.7259, + "step": 1594 + }, + { + "epoch": 2.0416, + "grad_norm": 0.6380487680435181, + "learning_rate": 4.682873149259704e-05, + "loss": 0.7648, + "step": 1595 + }, + { + "epoch": 2.04288, + "grad_norm": 0.6057306528091431, + "learning_rate": 4.6826730692276916e-05, + "loss": 0.6415, + "step": 1596 + }, + { + "epoch": 2.04416, + "grad_norm": 0.6594085693359375, + "learning_rate": 4.682472989195679e-05, + "loss": 0.7322, + "step": 1597 + }, + { + "epoch": 2.04544, + "grad_norm": 0.6331745982170105, + "learning_rate": 4.682272909163666e-05, + "loss": 0.7065, + "step": 1598 + }, + { + "epoch": 2.04672, + "grad_norm": 0.7005310654640198, + "learning_rate": 4.682072829131653e-05, + "loss": 0.7582, + "step": 1599 + }, + { + "epoch": 2.048, + "grad_norm": 0.6412195563316345, + "learning_rate": 4.68187274909964e-05, + "loss": 0.7186, + "step": 1600 + }, + { + "epoch": 2.04928, + "grad_norm": 0.6990569233894348, + "learning_rate": 4.681672669067627e-05, + "loss": 0.7195, + "step": 1601 + }, + { + "epoch": 2.05056, + "grad_norm": 0.6446850895881653, + "learning_rate": 4.681472589035614e-05, + "loss": 0.6461, + "step": 1602 + }, + { + "epoch": 2.05184, + "grad_norm": 0.6356726288795471, + "learning_rate": 4.681272509003602e-05, + "loss": 0.6688, + "step": 1603 + }, + { + "epoch": 2.05312, + "grad_norm": 0.6811710000038147, + "learning_rate": 4.681072428971589e-05, + "loss": 0.7049, + "step": 1604 + }, + { + "epoch": 2.0544, + 
"grad_norm": 0.6494273543357849, + "learning_rate": 4.680872348939576e-05, + "loss": 0.6843, + "step": 1605 + }, + { + "epoch": 2.05568, + "grad_norm": 0.6489901542663574, + "learning_rate": 4.6806722689075635e-05, + "loss": 0.6958, + "step": 1606 + }, + { + "epoch": 2.05696, + "grad_norm": 0.6661378145217896, + "learning_rate": 4.680472188875551e-05, + "loss": 0.7385, + "step": 1607 + }, + { + "epoch": 2.05824, + "grad_norm": 0.6342524290084839, + "learning_rate": 4.680272108843537e-05, + "loss": 0.7179, + "step": 1608 + }, + { + "epoch": 2.05952, + "grad_norm": 0.6314605474472046, + "learning_rate": 4.6800720288115244e-05, + "loss": 0.7109, + "step": 1609 + }, + { + "epoch": 2.0608, + "grad_norm": 0.665981650352478, + "learning_rate": 4.679871948779512e-05, + "loss": 0.7259, + "step": 1610 + }, + { + "epoch": 2.06208, + "grad_norm": 0.6300519704818726, + "learning_rate": 4.6796718687474994e-05, + "loss": 0.7014, + "step": 1611 + }, + { + "epoch": 2.06336, + "grad_norm": 0.6580458879470825, + "learning_rate": 4.6794717887154866e-05, + "loss": 0.6672, + "step": 1612 + }, + { + "epoch": 2.06464, + "grad_norm": 0.669180154800415, + "learning_rate": 4.679271708683474e-05, + "loss": 0.6437, + "step": 1613 + }, + { + "epoch": 2.06592, + "grad_norm": 0.690592348575592, + "learning_rate": 4.679071628651461e-05, + "loss": 0.7331, + "step": 1614 + }, + { + "epoch": 2.0672, + "grad_norm": 0.6528162360191345, + "learning_rate": 4.678871548619448e-05, + "loss": 0.6798, + "step": 1615 + }, + { + "epoch": 2.06848, + "grad_norm": 0.619225800037384, + "learning_rate": 4.678671468587435e-05, + "loss": 0.7003, + "step": 1616 + }, + { + "epoch": 2.06976, + "grad_norm": 0.6643122434616089, + "learning_rate": 4.6784713885554225e-05, + "loss": 0.764, + "step": 1617 + }, + { + "epoch": 2.07104, + "grad_norm": 0.6456024050712585, + "learning_rate": 4.67827130852341e-05, + "loss": 0.6792, + "step": 1618 + }, + { + "epoch": 2.07232, + "grad_norm": 0.6370171308517456, + "learning_rate": 
4.678071228491397e-05, + "loss": 0.6821, + "step": 1619 + }, + { + "epoch": 2.0736, + "grad_norm": 0.646872341632843, + "learning_rate": 4.677871148459384e-05, + "loss": 0.6942, + "step": 1620 + }, + { + "epoch": 2.07488, + "grad_norm": 0.6307012438774109, + "learning_rate": 4.677671068427371e-05, + "loss": 0.6781, + "step": 1621 + }, + { + "epoch": 2.07616, + "grad_norm": 0.6719081401824951, + "learning_rate": 4.6774709883953585e-05, + "loss": 0.7221, + "step": 1622 + }, + { + "epoch": 2.07744, + "grad_norm": 0.6334235072135925, + "learning_rate": 4.6772709083633456e-05, + "loss": 0.6652, + "step": 1623 + }, + { + "epoch": 2.07872, + "grad_norm": 0.6129449009895325, + "learning_rate": 4.677070828331333e-05, + "loss": 0.6324, + "step": 1624 + }, + { + "epoch": 2.08, + "grad_norm": 0.617857813835144, + "learning_rate": 4.67687074829932e-05, + "loss": 0.6846, + "step": 1625 + }, + { + "epoch": 2.08128, + "grad_norm": 0.6237433552742004, + "learning_rate": 4.676670668267307e-05, + "loss": 0.6622, + "step": 1626 + }, + { + "epoch": 2.08256, + "grad_norm": 0.606203019618988, + "learning_rate": 4.6764705882352944e-05, + "loss": 0.6895, + "step": 1627 + }, + { + "epoch": 2.08384, + "grad_norm": 0.6327537894248962, + "learning_rate": 4.6762705082032816e-05, + "loss": 0.6087, + "step": 1628 + }, + { + "epoch": 2.08512, + "grad_norm": 0.6112367510795593, + "learning_rate": 4.676070428171269e-05, + "loss": 0.6854, + "step": 1629 + }, + { + "epoch": 2.0864, + "grad_norm": 0.5821278691291809, + "learning_rate": 4.675870348139256e-05, + "loss": 0.6405, + "step": 1630 + }, + { + "epoch": 2.08768, + "grad_norm": 0.6095936894416809, + "learning_rate": 4.675670268107243e-05, + "loss": 0.6359, + "step": 1631 + }, + { + "epoch": 2.08896, + "grad_norm": 0.6587210893630981, + "learning_rate": 4.67547018807523e-05, + "loss": 0.7537, + "step": 1632 + }, + { + "epoch": 2.09024, + "grad_norm": 0.6856895089149475, + "learning_rate": 4.6752701080432175e-05, + "loss": 0.7386, + "step": 1633 + 
}, + { + "epoch": 2.09152, + "grad_norm": 0.6704199910163879, + "learning_rate": 4.675070028011205e-05, + "loss": 0.7063, + "step": 1634 + }, + { + "epoch": 2.0928, + "grad_norm": 0.6308407187461853, + "learning_rate": 4.674869947979192e-05, + "loss": 0.6713, + "step": 1635 + }, + { + "epoch": 2.09408, + "grad_norm": 0.6690980195999146, + "learning_rate": 4.674669867947179e-05, + "loss": 0.6961, + "step": 1636 + }, + { + "epoch": 2.09536, + "grad_norm": 0.6405916213989258, + "learning_rate": 4.674469787915166e-05, + "loss": 0.6758, + "step": 1637 + }, + { + "epoch": 2.09664, + "grad_norm": 0.638323962688446, + "learning_rate": 4.6742697078831534e-05, + "loss": 0.7383, + "step": 1638 + }, + { + "epoch": 2.09792, + "grad_norm": 0.6798389554023743, + "learning_rate": 4.6740696278511406e-05, + "loss": 0.7503, + "step": 1639 + }, + { + "epoch": 2.0992, + "grad_norm": 0.6478346586227417, + "learning_rate": 4.673869547819128e-05, + "loss": 0.6709, + "step": 1640 + }, + { + "epoch": 2.10048, + "grad_norm": 0.6550549864768982, + "learning_rate": 4.673669467787115e-05, + "loss": 0.7377, + "step": 1641 + }, + { + "epoch": 2.10176, + "grad_norm": 0.611666738986969, + "learning_rate": 4.673469387755102e-05, + "loss": 0.6414, + "step": 1642 + }, + { + "epoch": 2.10304, + "grad_norm": 0.6572237610816956, + "learning_rate": 4.6732693077230894e-05, + "loss": 0.7387, + "step": 1643 + }, + { + "epoch": 2.10432, + "grad_norm": 0.6161869764328003, + "learning_rate": 4.6730692276910765e-05, + "loss": 0.6336, + "step": 1644 + }, + { + "epoch": 2.1056, + "grad_norm": 0.6543852090835571, + "learning_rate": 4.672869147659064e-05, + "loss": 0.7487, + "step": 1645 + }, + { + "epoch": 2.10688, + "grad_norm": 0.657584011554718, + "learning_rate": 4.6726690676270516e-05, + "loss": 0.7025, + "step": 1646 + }, + { + "epoch": 2.10816, + "grad_norm": 0.6363831162452698, + "learning_rate": 4.672468987595038e-05, + "loss": 0.7098, + "step": 1647 + }, + { + "epoch": 2.10944, + "grad_norm": 
0.6271912455558777, + "learning_rate": 4.672268907563025e-05, + "loss": 0.6601, + "step": 1648 + }, + { + "epoch": 2.11072, + "grad_norm": 0.6789674162864685, + "learning_rate": 4.6720688275310125e-05, + "loss": 0.739, + "step": 1649 + }, + { + "epoch": 2.112, + "grad_norm": 0.6508240699768066, + "learning_rate": 4.6718687474989997e-05, + "loss": 0.7454, + "step": 1650 + }, + { + "epoch": 2.11328, + "grad_norm": 0.6586835980415344, + "learning_rate": 4.671668667466987e-05, + "loss": 0.6957, + "step": 1651 + }, + { + "epoch": 2.11456, + "grad_norm": 0.6778126955032349, + "learning_rate": 4.671468587434974e-05, + "loss": 0.7017, + "step": 1652 + }, + { + "epoch": 2.11584, + "grad_norm": 0.6158040165901184, + "learning_rate": 4.671268507402962e-05, + "loss": 0.6696, + "step": 1653 + }, + { + "epoch": 2.11712, + "grad_norm": 0.6658079624176025, + "learning_rate": 4.671068427370949e-05, + "loss": 0.735, + "step": 1654 + }, + { + "epoch": 2.1184, + "grad_norm": 0.6727830171585083, + "learning_rate": 4.6708683473389356e-05, + "loss": 0.685, + "step": 1655 + }, + { + "epoch": 2.11968, + "grad_norm": 0.614812970161438, + "learning_rate": 4.670668267306923e-05, + "loss": 0.6622, + "step": 1656 + }, + { + "epoch": 2.12096, + "grad_norm": 0.6530376076698303, + "learning_rate": 4.67046818727491e-05, + "loss": 0.7208, + "step": 1657 + }, + { + "epoch": 2.12224, + "grad_norm": 0.6509586572647095, + "learning_rate": 4.670268107242897e-05, + "loss": 0.7607, + "step": 1658 + }, + { + "epoch": 2.12352, + "grad_norm": 0.6249855756759644, + "learning_rate": 4.670068027210884e-05, + "loss": 0.6638, + "step": 1659 + }, + { + "epoch": 2.1248, + "grad_norm": 0.6153408885002136, + "learning_rate": 4.669867947178872e-05, + "loss": 0.699, + "step": 1660 + }, + { + "epoch": 2.12608, + "grad_norm": 0.6448095440864563, + "learning_rate": 4.6696678671468594e-05, + "loss": 0.7032, + "step": 1661 + }, + { + "epoch": 2.12736, + "grad_norm": 0.616129457950592, + "learning_rate": 
4.6694677871148466e-05, + "loss": 0.68, + "step": 1662 + }, + { + "epoch": 2.12864, + "grad_norm": 0.6616100072860718, + "learning_rate": 4.669267707082833e-05, + "loss": 0.6944, + "step": 1663 + }, + { + "epoch": 2.12992, + "grad_norm": 0.6487681865692139, + "learning_rate": 4.66906762705082e-05, + "loss": 0.651, + "step": 1664 + }, + { + "epoch": 2.1312, + "grad_norm": 0.6391069889068604, + "learning_rate": 4.6688675470188074e-05, + "loss": 0.6815, + "step": 1665 + }, + { + "epoch": 2.13248, + "grad_norm": 0.6539847254753113, + "learning_rate": 4.6686674669867946e-05, + "loss": 0.7119, + "step": 1666 + }, + { + "epoch": 2.13376, + "grad_norm": 0.6501457691192627, + "learning_rate": 4.6684673869547825e-05, + "loss": 0.6649, + "step": 1667 + }, + { + "epoch": 2.13504, + "grad_norm": 0.6687941551208496, + "learning_rate": 4.66826730692277e-05, + "loss": 0.6558, + "step": 1668 + }, + { + "epoch": 2.13632, + "grad_norm": 0.690743625164032, + "learning_rate": 4.668067226890757e-05, + "loss": 0.6878, + "step": 1669 + }, + { + "epoch": 2.1376, + "grad_norm": 0.6361063122749329, + "learning_rate": 4.667867146858744e-05, + "loss": 0.7181, + "step": 1670 + }, + { + "epoch": 2.13888, + "grad_norm": 0.6546103954315186, + "learning_rate": 4.6676670668267306e-05, + "loss": 0.7105, + "step": 1671 + }, + { + "epoch": 2.14016, + "grad_norm": 0.6859259605407715, + "learning_rate": 4.667466986794718e-05, + "loss": 0.7206, + "step": 1672 + }, + { + "epoch": 2.1414400000000002, + "grad_norm": 0.6436448097229004, + "learning_rate": 4.667266906762705e-05, + "loss": 0.6466, + "step": 1673 + }, + { + "epoch": 2.14272, + "grad_norm": 0.6501901745796204, + "learning_rate": 4.667066826730693e-05, + "loss": 0.6934, + "step": 1674 + }, + { + "epoch": 2.144, + "grad_norm": 0.6573314070701599, + "learning_rate": 4.66686674669868e-05, + "loss": 0.6902, + "step": 1675 + }, + { + "epoch": 2.14528, + "grad_norm": 0.6532529592514038, + "learning_rate": 4.666666666666667e-05, + "loss": 0.6551, + 
"step": 1676 + }, + { + "epoch": 2.14656, + "grad_norm": 0.6756119132041931, + "learning_rate": 4.6664665866346543e-05, + "loss": 0.7094, + "step": 1677 + }, + { + "epoch": 2.14784, + "grad_norm": 0.6881611943244934, + "learning_rate": 4.6662665066026415e-05, + "loss": 0.7356, + "step": 1678 + }, + { + "epoch": 2.14912, + "grad_norm": 0.6921321749687195, + "learning_rate": 4.666066426570628e-05, + "loss": 0.704, + "step": 1679 + }, + { + "epoch": 2.1504, + "grad_norm": 0.6233019828796387, + "learning_rate": 4.665866346538615e-05, + "loss": 0.6581, + "step": 1680 + }, + { + "epoch": 2.15168, + "grad_norm": 0.6320359110832214, + "learning_rate": 4.665666266506603e-05, + "loss": 0.7222, + "step": 1681 + }, + { + "epoch": 2.15296, + "grad_norm": 0.6233236789703369, + "learning_rate": 4.66546618647459e-05, + "loss": 0.6272, + "step": 1682 + }, + { + "epoch": 2.15424, + "grad_norm": 0.6322048306465149, + "learning_rate": 4.6652661064425775e-05, + "loss": 0.6732, + "step": 1683 + }, + { + "epoch": 2.15552, + "grad_norm": 0.6570281386375427, + "learning_rate": 4.6650660264105646e-05, + "loss": 0.6651, + "step": 1684 + }, + { + "epoch": 2.1568, + "grad_norm": 0.6593979597091675, + "learning_rate": 4.664865946378552e-05, + "loss": 0.7143, + "step": 1685 + }, + { + "epoch": 2.15808, + "grad_norm": 0.6577056050300598, + "learning_rate": 4.664665866346539e-05, + "loss": 0.6509, + "step": 1686 + }, + { + "epoch": 2.15936, + "grad_norm": 0.6036557555198669, + "learning_rate": 4.6644657863145255e-05, + "loss": 0.6528, + "step": 1687 + }, + { + "epoch": 2.16064, + "grad_norm": 0.6405543684959412, + "learning_rate": 4.6642657062825134e-05, + "loss": 0.6667, + "step": 1688 + }, + { + "epoch": 2.16192, + "grad_norm": 0.7007610201835632, + "learning_rate": 4.6640656262505006e-05, + "loss": 0.7203, + "step": 1689 + }, + { + "epoch": 2.1632, + "grad_norm": 0.6296299695968628, + "learning_rate": 4.663865546218488e-05, + "loss": 0.6863, + "step": 1690 + }, + { + "epoch": 2.16448, + 
"grad_norm": 0.6606785655021667, + "learning_rate": 4.663665466186475e-05, + "loss": 0.7131, + "step": 1691 + }, + { + "epoch": 2.16576, + "grad_norm": 0.6546313166618347, + "learning_rate": 4.663465386154462e-05, + "loss": 0.6784, + "step": 1692 + }, + { + "epoch": 2.16704, + "grad_norm": 0.6348084807395935, + "learning_rate": 4.663265306122449e-05, + "loss": 0.7113, + "step": 1693 + }, + { + "epoch": 2.16832, + "grad_norm": 0.6655004024505615, + "learning_rate": 4.6630652260904365e-05, + "loss": 0.6833, + "step": 1694 + }, + { + "epoch": 2.1696, + "grad_norm": 0.6627703905105591, + "learning_rate": 4.662865146058424e-05, + "loss": 0.6661, + "step": 1695 + }, + { + "epoch": 2.17088, + "grad_norm": 0.6439046263694763, + "learning_rate": 4.662665066026411e-05, + "loss": 0.7075, + "step": 1696 + }, + { + "epoch": 2.17216, + "grad_norm": 0.6709191203117371, + "learning_rate": 4.662464985994398e-05, + "loss": 0.7076, + "step": 1697 + }, + { + "epoch": 2.17344, + "grad_norm": 0.6734464168548584, + "learning_rate": 4.662264905962385e-05, + "loss": 0.7296, + "step": 1698 + }, + { + "epoch": 2.1747199999999998, + "grad_norm": 0.663478434085846, + "learning_rate": 4.6620648259303724e-05, + "loss": 0.6552, + "step": 1699 + }, + { + "epoch": 2.176, + "grad_norm": 0.654140055179596, + "learning_rate": 4.6618647458983596e-05, + "loss": 0.7028, + "step": 1700 + }, + { + "epoch": 2.17728, + "grad_norm": 0.6052113175392151, + "learning_rate": 4.661664665866347e-05, + "loss": 0.6056, + "step": 1701 + }, + { + "epoch": 2.17856, + "grad_norm": 0.6727505922317505, + "learning_rate": 4.661464585834334e-05, + "loss": 0.6829, + "step": 1702 + }, + { + "epoch": 2.17984, + "grad_norm": 0.6504889130592346, + "learning_rate": 4.661264505802321e-05, + "loss": 0.6782, + "step": 1703 + }, + { + "epoch": 2.18112, + "grad_norm": 0.6764382123947144, + "learning_rate": 4.6610644257703084e-05, + "loss": 0.7543, + "step": 1704 + }, + { + "epoch": 2.1824, + "grad_norm": 0.6401752829551697, + 
"learning_rate": 4.6608643457382955e-05, + "loss": 0.6898, + "step": 1705 + }, + { + "epoch": 2.18368, + "grad_norm": 0.6742962002754211, + "learning_rate": 4.660664265706283e-05, + "loss": 0.6884, + "step": 1706 + }, + { + "epoch": 2.1849600000000002, + "grad_norm": 0.6535323262214661, + "learning_rate": 4.66046418567427e-05, + "loss": 0.6647, + "step": 1707 + }, + { + "epoch": 2.18624, + "grad_norm": 0.6376174092292786, + "learning_rate": 4.660264105642257e-05, + "loss": 0.6964, + "step": 1708 + }, + { + "epoch": 2.18752, + "grad_norm": 0.6661503314971924, + "learning_rate": 4.660064025610244e-05, + "loss": 0.6898, + "step": 1709 + }, + { + "epoch": 2.1888, + "grad_norm": 0.6385532021522522, + "learning_rate": 4.6598639455782315e-05, + "loss": 0.6951, + "step": 1710 + }, + { + "epoch": 2.19008, + "grad_norm": 0.6412584185600281, + "learning_rate": 4.6596638655462187e-05, + "loss": 0.7281, + "step": 1711 + }, + { + "epoch": 2.19136, + "grad_norm": 0.7024900317192078, + "learning_rate": 4.659463785514206e-05, + "loss": 0.7168, + "step": 1712 + }, + { + "epoch": 2.19264, + "grad_norm": 0.6655505895614624, + "learning_rate": 4.659263705482193e-05, + "loss": 0.6731, + "step": 1713 + }, + { + "epoch": 2.19392, + "grad_norm": 0.6673122048377991, + "learning_rate": 4.65906362545018e-05, + "loss": 0.757, + "step": 1714 + }, + { + "epoch": 2.1952, + "grad_norm": 0.6695178151130676, + "learning_rate": 4.6588635454181674e-05, + "loss": 0.7752, + "step": 1715 + }, + { + "epoch": 2.19648, + "grad_norm": 0.6591713428497314, + "learning_rate": 4.658663465386155e-05, + "loss": 0.6715, + "step": 1716 + }, + { + "epoch": 2.19776, + "grad_norm": 0.6705875396728516, + "learning_rate": 4.658463385354142e-05, + "loss": 0.6875, + "step": 1717 + }, + { + "epoch": 2.19904, + "grad_norm": 0.6780663728713989, + "learning_rate": 4.658263305322129e-05, + "loss": 0.697, + "step": 1718 + }, + { + "epoch": 2.20032, + "grad_norm": 0.6735832691192627, + "learning_rate": 4.658063225290116e-05, + 
"loss": 0.6991, + "step": 1719 + }, + { + "epoch": 2.2016, + "grad_norm": 0.640477180480957, + "learning_rate": 4.657863145258103e-05, + "loss": 0.694, + "step": 1720 + }, + { + "epoch": 2.20288, + "grad_norm": 0.6748470067977905, + "learning_rate": 4.6576630652260905e-05, + "loss": 0.6939, + "step": 1721 + }, + { + "epoch": 2.20416, + "grad_norm": 0.607291579246521, + "learning_rate": 4.657462985194078e-05, + "loss": 0.651, + "step": 1722 + }, + { + "epoch": 2.20544, + "grad_norm": 0.6633931398391724, + "learning_rate": 4.6572629051620656e-05, + "loss": 0.7131, + "step": 1723 + }, + { + "epoch": 2.20672, + "grad_norm": 0.7354316711425781, + "learning_rate": 4.657062825130053e-05, + "loss": 0.6963, + "step": 1724 + }, + { + "epoch": 2.208, + "grad_norm": 0.708829402923584, + "learning_rate": 4.656862745098039e-05, + "loss": 0.736, + "step": 1725 + }, + { + "epoch": 2.20928, + "grad_norm": 0.6382274031639099, + "learning_rate": 4.6566626650660264e-05, + "loss": 0.663, + "step": 1726 + }, + { + "epoch": 2.21056, + "grad_norm": 0.6313372850418091, + "learning_rate": 4.6564625850340136e-05, + "loss": 0.6861, + "step": 1727 + }, + { + "epoch": 2.21184, + "grad_norm": 0.6234421730041504, + "learning_rate": 4.656262505002001e-05, + "loss": 0.6846, + "step": 1728 + }, + { + "epoch": 2.21312, + "grad_norm": 0.6155267953872681, + "learning_rate": 4.656062424969988e-05, + "loss": 0.6289, + "step": 1729 + }, + { + "epoch": 2.2144, + "grad_norm": 0.6332612633705139, + "learning_rate": 4.655862344937976e-05, + "loss": 0.6752, + "step": 1730 + }, + { + "epoch": 2.21568, + "grad_norm": 0.6155014634132385, + "learning_rate": 4.655662264905963e-05, + "loss": 0.6088, + "step": 1731 + }, + { + "epoch": 2.21696, + "grad_norm": 0.6483979821205139, + "learning_rate": 4.65546218487395e-05, + "loss": 0.688, + "step": 1732 + }, + { + "epoch": 2.2182399999999998, + "grad_norm": 0.6692848801612854, + "learning_rate": 4.655262104841937e-05, + "loss": 0.6918, + "step": 1733 + }, + { + "epoch": 
2.21952, + "grad_norm": 0.6489692330360413, + "learning_rate": 4.655062024809924e-05, + "loss": 0.7016, + "step": 1734 + }, + { + "epoch": 2.2208, + "grad_norm": 0.6827375888824463, + "learning_rate": 4.654861944777911e-05, + "loss": 0.7165, + "step": 1735 + }, + { + "epoch": 2.22208, + "grad_norm": 0.652456521987915, + "learning_rate": 4.654661864745898e-05, + "loss": 0.7366, + "step": 1736 + }, + { + "epoch": 2.22336, + "grad_norm": 0.6942132711410522, + "learning_rate": 4.654461784713886e-05, + "loss": 0.674, + "step": 1737 + }, + { + "epoch": 2.22464, + "grad_norm": 0.6576859354972839, + "learning_rate": 4.6542617046818734e-05, + "loss": 0.7573, + "step": 1738 + }, + { + "epoch": 2.22592, + "grad_norm": 0.6581286191940308, + "learning_rate": 4.6540616246498605e-05, + "loss": 0.6484, + "step": 1739 + }, + { + "epoch": 2.2272, + "grad_norm": 0.6722337007522583, + "learning_rate": 4.653861544617848e-05, + "loss": 0.7328, + "step": 1740 + }, + { + "epoch": 2.22848, + "grad_norm": 0.6128942966461182, + "learning_rate": 4.653661464585834e-05, + "loss": 0.6935, + "step": 1741 + }, + { + "epoch": 2.22976, + "grad_norm": 0.632098913192749, + "learning_rate": 4.6534613845538214e-05, + "loss": 0.6824, + "step": 1742 + }, + { + "epoch": 2.23104, + "grad_norm": 0.6399368047714233, + "learning_rate": 4.6532613045218086e-05, + "loss": 0.6962, + "step": 1743 + }, + { + "epoch": 2.23232, + "grad_norm": 0.6291220188140869, + "learning_rate": 4.653061224489796e-05, + "loss": 0.6935, + "step": 1744 + }, + { + "epoch": 2.2336, + "grad_norm": 0.6679767966270447, + "learning_rate": 4.6528611444577837e-05, + "loss": 0.685, + "step": 1745 + }, + { + "epoch": 2.23488, + "grad_norm": 0.652439534664154, + "learning_rate": 4.652661064425771e-05, + "loss": 0.731, + "step": 1746 + }, + { + "epoch": 2.23616, + "grad_norm": 0.6402837038040161, + "learning_rate": 4.652460984393758e-05, + "loss": 0.6515, + "step": 1747 + }, + { + "epoch": 2.23744, + "grad_norm": 0.6584830284118652, + 
"learning_rate": 4.652260904361745e-05, + "loss": 0.7117, + "step": 1748 + }, + { + "epoch": 2.23872, + "grad_norm": 0.6643098592758179, + "learning_rate": 4.652060824329732e-05, + "loss": 0.6786, + "step": 1749 + }, + { + "epoch": 2.24, + "grad_norm": 0.6529515385627747, + "learning_rate": 4.651860744297719e-05, + "loss": 0.6985, + "step": 1750 + }, + { + "epoch": 2.24128, + "grad_norm": 0.6570754051208496, + "learning_rate": 4.651660664265706e-05, + "loss": 0.6892, + "step": 1751 + }, + { + "epoch": 2.24256, + "grad_norm": 0.6258246898651123, + "learning_rate": 4.651460584233694e-05, + "loss": 0.6781, + "step": 1752 + }, + { + "epoch": 2.24384, + "grad_norm": 0.6045702695846558, + "learning_rate": 4.651260504201681e-05, + "loss": 0.6712, + "step": 1753 + }, + { + "epoch": 2.24512, + "grad_norm": 0.6391083002090454, + "learning_rate": 4.651060424169668e-05, + "loss": 0.6869, + "step": 1754 + }, + { + "epoch": 2.2464, + "grad_norm": 0.6669121980667114, + "learning_rate": 4.6508603441376555e-05, + "loss": 0.7874, + "step": 1755 + }, + { + "epoch": 2.24768, + "grad_norm": 0.6193825006484985, + "learning_rate": 4.650660264105643e-05, + "loss": 0.6625, + "step": 1756 + }, + { + "epoch": 2.24896, + "grad_norm": 0.6318017840385437, + "learning_rate": 4.650460184073629e-05, + "loss": 0.6807, + "step": 1757 + }, + { + "epoch": 2.25024, + "grad_norm": 0.6676175594329834, + "learning_rate": 4.6502601040416164e-05, + "loss": 0.6965, + "step": 1758 + }, + { + "epoch": 2.25152, + "grad_norm": 0.652651846408844, + "learning_rate": 4.650060024009604e-05, + "loss": 0.6623, + "step": 1759 + }, + { + "epoch": 2.2528, + "grad_norm": 0.6952486038208008, + "learning_rate": 4.6498599439775914e-05, + "loss": 0.7059, + "step": 1760 + }, + { + "epoch": 2.25408, + "grad_norm": 0.713154673576355, + "learning_rate": 4.6496598639455786e-05, + "loss": 0.7397, + "step": 1761 + }, + { + "epoch": 2.25536, + "grad_norm": 0.636633574962616, + "learning_rate": 4.649459783913566e-05, + "loss": 0.6193, 
+ "step": 1762 + }, + { + "epoch": 2.25664, + "grad_norm": 0.6551604866981506, + "learning_rate": 4.649259703881553e-05, + "loss": 0.7048, + "step": 1763 + }, + { + "epoch": 2.25792, + "grad_norm": 0.6308698654174805, + "learning_rate": 4.64905962384954e-05, + "loss": 0.6646, + "step": 1764 + }, + { + "epoch": 2.2592, + "grad_norm": 0.6496629118919373, + "learning_rate": 4.648859543817527e-05, + "loss": 0.6879, + "step": 1765 + }, + { + "epoch": 2.26048, + "grad_norm": 0.6222250461578369, + "learning_rate": 4.6486594637855145e-05, + "loss": 0.6635, + "step": 1766 + }, + { + "epoch": 2.2617599999999998, + "grad_norm": 0.6518458724021912, + "learning_rate": 4.648459383753502e-05, + "loss": 0.6757, + "step": 1767 + }, + { + "epoch": 2.26304, + "grad_norm": 0.656060516834259, + "learning_rate": 4.648259303721489e-05, + "loss": 0.6818, + "step": 1768 + }, + { + "epoch": 2.26432, + "grad_norm": 0.6683648228645325, + "learning_rate": 4.648059223689476e-05, + "loss": 0.7028, + "step": 1769 + }, + { + "epoch": 2.2656, + "grad_norm": 0.6672934889793396, + "learning_rate": 4.647859143657463e-05, + "loss": 0.6928, + "step": 1770 + }, + { + "epoch": 2.26688, + "grad_norm": 0.6802359223365784, + "learning_rate": 4.6476590636254505e-05, + "loss": 0.6616, + "step": 1771 + }, + { + "epoch": 2.26816, + "grad_norm": 0.6378535628318787, + "learning_rate": 4.647458983593438e-05, + "loss": 0.689, + "step": 1772 + }, + { + "epoch": 2.26944, + "grad_norm": 0.677901566028595, + "learning_rate": 4.647258903561425e-05, + "loss": 0.7436, + "step": 1773 + }, + { + "epoch": 2.27072, + "grad_norm": 0.6340510249137878, + "learning_rate": 4.647058823529412e-05, + "loss": 0.7241, + "step": 1774 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.6160155534744263, + "learning_rate": 4.646858743497399e-05, + "loss": 0.635, + "step": 1775 + }, + { + "epoch": 2.27328, + "grad_norm": 0.6366074085235596, + "learning_rate": 4.6466586634653864e-05, + "loss": 0.6915, + "step": 1776 + }, + { + "epoch": 
2.27456, + "grad_norm": 0.6163462996482849, + "learning_rate": 4.6464585834333736e-05, + "loss": 0.6227, + "step": 1777 + }, + { + "epoch": 2.27584, + "grad_norm": 0.6638872027397156, + "learning_rate": 4.646258503401361e-05, + "loss": 0.753, + "step": 1778 + }, + { + "epoch": 2.27712, + "grad_norm": 0.6583823561668396, + "learning_rate": 4.646058423369348e-05, + "loss": 0.7791, + "step": 1779 + }, + { + "epoch": 2.2784, + "grad_norm": 0.6398965120315552, + "learning_rate": 4.645858343337335e-05, + "loss": 0.6842, + "step": 1780 + }, + { + "epoch": 2.27968, + "grad_norm": 0.6243705153465271, + "learning_rate": 4.645658263305322e-05, + "loss": 0.6953, + "step": 1781 + }, + { + "epoch": 2.28096, + "grad_norm": 0.6466498374938965, + "learning_rate": 4.6454581832733095e-05, + "loss": 0.7053, + "step": 1782 + }, + { + "epoch": 2.28224, + "grad_norm": 0.6308572292327881, + "learning_rate": 4.645258103241297e-05, + "loss": 0.6739, + "step": 1783 + }, + { + "epoch": 2.28352, + "grad_norm": 0.6494999527931213, + "learning_rate": 4.645058023209284e-05, + "loss": 0.7472, + "step": 1784 + }, + { + "epoch": 2.2848, + "grad_norm": 0.633786678314209, + "learning_rate": 4.644857943177271e-05, + "loss": 0.6696, + "step": 1785 + }, + { + "epoch": 2.28608, + "grad_norm": 0.63746577501297, + "learning_rate": 4.644657863145258e-05, + "loss": 0.6897, + "step": 1786 + }, + { + "epoch": 2.28736, + "grad_norm": 0.6449893116950989, + "learning_rate": 4.6444577831132454e-05, + "loss": 0.657, + "step": 1787 + }, + { + "epoch": 2.28864, + "grad_norm": 0.6666371822357178, + "learning_rate": 4.6442577030812326e-05, + "loss": 0.7013, + "step": 1788 + }, + { + "epoch": 2.28992, + "grad_norm": 0.6510623693466187, + "learning_rate": 4.64405762304922e-05, + "loss": 0.7175, + "step": 1789 + }, + { + "epoch": 2.2912, + "grad_norm": 0.6616090536117554, + "learning_rate": 4.643857543017207e-05, + "loss": 0.6673, + "step": 1790 + }, + { + "epoch": 2.29248, + "grad_norm": 0.7210375070571899, + 
"learning_rate": 4.643657462985194e-05, + "loss": 0.7318, + "step": 1791 + }, + { + "epoch": 2.29376, + "grad_norm": 0.6955354809761047, + "learning_rate": 4.6434573829531814e-05, + "loss": 0.7548, + "step": 1792 + }, + { + "epoch": 2.29504, + "grad_norm": 0.6595535278320312, + "learning_rate": 4.6432573029211686e-05, + "loss": 0.6672, + "step": 1793 + }, + { + "epoch": 2.29632, + "grad_norm": 0.6795318126678467, + "learning_rate": 4.6430572228891564e-05, + "loss": 0.6465, + "step": 1794 + }, + { + "epoch": 2.2976, + "grad_norm": 0.700689971446991, + "learning_rate": 4.642857142857143e-05, + "loss": 0.7189, + "step": 1795 + }, + { + "epoch": 2.29888, + "grad_norm": 0.7070762515068054, + "learning_rate": 4.64265706282513e-05, + "loss": 0.7249, + "step": 1796 + }, + { + "epoch": 2.30016, + "grad_norm": 0.6750733256340027, + "learning_rate": 4.642456982793117e-05, + "loss": 0.7291, + "step": 1797 + }, + { + "epoch": 2.30144, + "grad_norm": 0.6518023610115051, + "learning_rate": 4.6422569027611045e-05, + "loss": 0.7376, + "step": 1798 + }, + { + "epoch": 2.30272, + "grad_norm": 0.658316433429718, + "learning_rate": 4.642056822729092e-05, + "loss": 0.7014, + "step": 1799 + }, + { + "epoch": 2.304, + "grad_norm": 0.6573535799980164, + "learning_rate": 4.641856742697079e-05, + "loss": 0.673, + "step": 1800 + }, + { + "epoch": 2.3052799999999998, + "grad_norm": 0.6256746649742126, + "learning_rate": 4.641656662665067e-05, + "loss": 0.6777, + "step": 1801 + }, + { + "epoch": 2.30656, + "grad_norm": 0.6481615900993347, + "learning_rate": 4.641456582633054e-05, + "loss": 0.6787, + "step": 1802 + }, + { + "epoch": 2.30784, + "grad_norm": 0.6594406366348267, + "learning_rate": 4.6412565026010404e-05, + "loss": 0.6894, + "step": 1803 + }, + { + "epoch": 2.30912, + "grad_norm": 0.6397424936294556, + "learning_rate": 4.6410564225690276e-05, + "loss": 0.7325, + "step": 1804 + }, + { + "epoch": 2.3104, + "grad_norm": 0.6274937987327576, + "learning_rate": 4.640856342537015e-05, + 
"loss": 0.6842, + "step": 1805 + }, + { + "epoch": 2.31168, + "grad_norm": 0.7071935534477234, + "learning_rate": 4.640656262505002e-05, + "loss": 0.7394, + "step": 1806 + }, + { + "epoch": 2.31296, + "grad_norm": 0.6791349649429321, + "learning_rate": 4.640456182472989e-05, + "loss": 0.6907, + "step": 1807 + }, + { + "epoch": 2.31424, + "grad_norm": 0.6307082772254944, + "learning_rate": 4.640256102440977e-05, + "loss": 0.6735, + "step": 1808 + }, + { + "epoch": 2.3155200000000002, + "grad_norm": 0.6786848306655884, + "learning_rate": 4.640056022408964e-05, + "loss": 0.6944, + "step": 1809 + }, + { + "epoch": 2.3168, + "grad_norm": 0.6838906407356262, + "learning_rate": 4.6398559423769514e-05, + "loss": 0.7096, + "step": 1810 + }, + { + "epoch": 2.31808, + "grad_norm": 0.6599327921867371, + "learning_rate": 4.639655862344938e-05, + "loss": 0.6573, + "step": 1811 + }, + { + "epoch": 2.31936, + "grad_norm": 0.6728073358535767, + "learning_rate": 4.639455782312925e-05, + "loss": 0.7074, + "step": 1812 + }, + { + "epoch": 2.32064, + "grad_norm": 0.6599488258361816, + "learning_rate": 4.639255702280912e-05, + "loss": 0.6743, + "step": 1813 + }, + { + "epoch": 2.32192, + "grad_norm": 0.7124109864234924, + "learning_rate": 4.6390556222488995e-05, + "loss": 0.8051, + "step": 1814 + }, + { + "epoch": 2.3232, + "grad_norm": 0.6490312814712524, + "learning_rate": 4.638855542216887e-05, + "loss": 0.6868, + "step": 1815 + }, + { + "epoch": 2.32448, + "grad_norm": 0.6607807278633118, + "learning_rate": 4.6386554621848745e-05, + "loss": 0.7233, + "step": 1816 + }, + { + "epoch": 2.32576, + "grad_norm": 0.6332591772079468, + "learning_rate": 4.638455382152862e-05, + "loss": 0.6772, + "step": 1817 + }, + { + "epoch": 2.32704, + "grad_norm": 0.6462684273719788, + "learning_rate": 4.638255302120849e-05, + "loss": 0.6809, + "step": 1818 + }, + { + "epoch": 2.32832, + "grad_norm": 0.685043454170227, + "learning_rate": 4.6380552220888354e-05, + "loss": 0.672, + "step": 1819 + }, + { + 
"epoch": 2.3296, + "grad_norm": 0.6536259651184082, + "learning_rate": 4.6378551420568226e-05, + "loss": 0.6845, + "step": 1820 + }, + { + "epoch": 2.33088, + "grad_norm": 0.6470760703086853, + "learning_rate": 4.63765506202481e-05, + "loss": 0.6574, + "step": 1821 + }, + { + "epoch": 2.33216, + "grad_norm": 0.6793741583824158, + "learning_rate": 4.6374549819927976e-05, + "loss": 0.7286, + "step": 1822 + }, + { + "epoch": 2.33344, + "grad_norm": 0.633575975894928, + "learning_rate": 4.637254901960785e-05, + "loss": 0.689, + "step": 1823 + }, + { + "epoch": 2.33472, + "grad_norm": 0.6770448088645935, + "learning_rate": 4.637054821928772e-05, + "loss": 0.6672, + "step": 1824 + }, + { + "epoch": 2.336, + "grad_norm": 0.6367180943489075, + "learning_rate": 4.636854741896759e-05, + "loss": 0.712, + "step": 1825 + }, + { + "epoch": 2.33728, + "grad_norm": 0.6444728970527649, + "learning_rate": 4.6366546618647464e-05, + "loss": 0.78, + "step": 1826 + }, + { + "epoch": 2.33856, + "grad_norm": 0.6529805064201355, + "learning_rate": 4.636454581832733e-05, + "loss": 0.6594, + "step": 1827 + }, + { + "epoch": 2.33984, + "grad_norm": 0.6504684090614319, + "learning_rate": 4.63625450180072e-05, + "loss": 0.6725, + "step": 1828 + }, + { + "epoch": 2.34112, + "grad_norm": 0.645531177520752, + "learning_rate": 4.636054421768708e-05, + "loss": 0.6721, + "step": 1829 + }, + { + "epoch": 2.3424, + "grad_norm": 0.6722829341888428, + "learning_rate": 4.635854341736695e-05, + "loss": 0.772, + "step": 1830 + }, + { + "epoch": 2.34368, + "grad_norm": 0.6697701215744019, + "learning_rate": 4.635654261704682e-05, + "loss": 0.7057, + "step": 1831 + }, + { + "epoch": 2.34496, + "grad_norm": 0.6601827144622803, + "learning_rate": 4.6354541816726695e-05, + "loss": 0.644, + "step": 1832 + }, + { + "epoch": 2.34624, + "grad_norm": 0.6771488189697266, + "learning_rate": 4.635254101640657e-05, + "loss": 0.7449, + "step": 1833 + }, + { + "epoch": 2.34752, + "grad_norm": 0.6421104073524475, + 
"learning_rate": 4.635054021608644e-05, + "loss": 0.7342, + "step": 1834 + }, + { + "epoch": 2.3487999999999998, + "grad_norm": 0.6297141909599304, + "learning_rate": 4.6348539415766304e-05, + "loss": 0.706, + "step": 1835 + }, + { + "epoch": 2.35008, + "grad_norm": 0.6440832018852234, + "learning_rate": 4.634653861544618e-05, + "loss": 0.7282, + "step": 1836 + }, + { + "epoch": 2.35136, + "grad_norm": 0.6416788101196289, + "learning_rate": 4.6344537815126054e-05, + "loss": 0.6494, + "step": 1837 + }, + { + "epoch": 2.35264, + "grad_norm": 0.6922294497489929, + "learning_rate": 4.6342537014805926e-05, + "loss": 0.7179, + "step": 1838 + }, + { + "epoch": 2.35392, + "grad_norm": 0.674335241317749, + "learning_rate": 4.63405362144858e-05, + "loss": 0.6737, + "step": 1839 + }, + { + "epoch": 2.3552, + "grad_norm": 0.6704332232475281, + "learning_rate": 4.633853541416567e-05, + "loss": 0.7289, + "step": 1840 + }, + { + "epoch": 2.35648, + "grad_norm": 0.6815084218978882, + "learning_rate": 4.633653461384554e-05, + "loss": 0.6504, + "step": 1841 + }, + { + "epoch": 2.35776, + "grad_norm": 0.6917728781700134, + "learning_rate": 4.6334533813525413e-05, + "loss": 0.7133, + "step": 1842 + }, + { + "epoch": 2.3590400000000002, + "grad_norm": 0.6595756411552429, + "learning_rate": 4.6332533013205285e-05, + "loss": 0.6622, + "step": 1843 + }, + { + "epoch": 2.3603199999999998, + "grad_norm": 0.6047163009643555, + "learning_rate": 4.633053221288516e-05, + "loss": 0.682, + "step": 1844 + }, + { + "epoch": 2.3616, + "grad_norm": 0.6585202217102051, + "learning_rate": 4.632853141256503e-05, + "loss": 0.7206, + "step": 1845 + }, + { + "epoch": 2.36288, + "grad_norm": 0.6378026604652405, + "learning_rate": 4.63265306122449e-05, + "loss": 0.6693, + "step": 1846 + }, + { + "epoch": 2.36416, + "grad_norm": 0.6626023054122925, + "learning_rate": 4.632452981192477e-05, + "loss": 0.7427, + "step": 1847 + }, + { + "epoch": 2.36544, + "grad_norm": 0.652944803237915, + "learning_rate": 
4.6322529011604645e-05, + "loss": 0.6706, + "step": 1848 + }, + { + "epoch": 2.36672, + "grad_norm": 0.6104753613471985, + "learning_rate": 4.6320528211284516e-05, + "loss": 0.6231, + "step": 1849 + }, + { + "epoch": 2.368, + "grad_norm": 0.5891187191009521, + "learning_rate": 4.631852741096439e-05, + "loss": 0.6612, + "step": 1850 + }, + { + "epoch": 2.36928, + "grad_norm": 0.6433348059654236, + "learning_rate": 4.631652661064426e-05, + "loss": 0.6903, + "step": 1851 + }, + { + "epoch": 2.3705600000000002, + "grad_norm": 0.6553032994270325, + "learning_rate": 4.631452581032413e-05, + "loss": 0.6837, + "step": 1852 + }, + { + "epoch": 2.37184, + "grad_norm": 0.6701604723930359, + "learning_rate": 4.6312525010004004e-05, + "loss": 0.7339, + "step": 1853 + }, + { + "epoch": 2.37312, + "grad_norm": 0.6654971837997437, + "learning_rate": 4.6310524209683876e-05, + "loss": 0.7244, + "step": 1854 + }, + { + "epoch": 2.3744, + "grad_norm": 0.6845753788948059, + "learning_rate": 4.630852340936375e-05, + "loss": 0.742, + "step": 1855 + }, + { + "epoch": 2.37568, + "grad_norm": 0.651664137840271, + "learning_rate": 4.630652260904362e-05, + "loss": 0.6837, + "step": 1856 + }, + { + "epoch": 2.37696, + "grad_norm": 0.7059352993965149, + "learning_rate": 4.630452180872349e-05, + "loss": 0.7274, + "step": 1857 + }, + { + "epoch": 2.37824, + "grad_norm": 0.6303350329399109, + "learning_rate": 4.630252100840336e-05, + "loss": 0.7218, + "step": 1858 + }, + { + "epoch": 2.37952, + "grad_norm": 0.6301616430282593, + "learning_rate": 4.6300520208083235e-05, + "loss": 0.6528, + "step": 1859 + }, + { + "epoch": 2.3808, + "grad_norm": 0.6489315032958984, + "learning_rate": 4.629851940776311e-05, + "loss": 0.6807, + "step": 1860 + }, + { + "epoch": 2.38208, + "grad_norm": 0.6146230697631836, + "learning_rate": 4.629651860744298e-05, + "loss": 0.671, + "step": 1861 + }, + { + "epoch": 2.38336, + "grad_norm": 0.6300916075706482, + "learning_rate": 4.629451780712285e-05, + "loss": 0.6797, + 
"step": 1862 + }, + { + "epoch": 2.38464, + "grad_norm": 0.6397101283073425, + "learning_rate": 4.629251700680272e-05, + "loss": 0.6652, + "step": 1863 + }, + { + "epoch": 2.38592, + "grad_norm": 0.6682645678520203, + "learning_rate": 4.6290516206482594e-05, + "loss": 0.7132, + "step": 1864 + }, + { + "epoch": 2.3872, + "grad_norm": 0.6869545578956604, + "learning_rate": 4.6288515406162466e-05, + "loss": 0.6986, + "step": 1865 + }, + { + "epoch": 2.38848, + "grad_norm": 0.6472262144088745, + "learning_rate": 4.628651460584234e-05, + "loss": 0.6647, + "step": 1866 + }, + { + "epoch": 2.38976, + "grad_norm": 0.6385058164596558, + "learning_rate": 4.628451380552221e-05, + "loss": 0.6489, + "step": 1867 + }, + { + "epoch": 2.39104, + "grad_norm": 0.6617813110351562, + "learning_rate": 4.628251300520208e-05, + "loss": 0.715, + "step": 1868 + }, + { + "epoch": 2.39232, + "grad_norm": 0.6766805648803711, + "learning_rate": 4.6280512204881954e-05, + "loss": 0.6902, + "step": 1869 + }, + { + "epoch": 2.3936, + "grad_norm": 0.6299700736999512, + "learning_rate": 4.6278511404561825e-05, + "loss": 0.6678, + "step": 1870 + }, + { + "epoch": 2.39488, + "grad_norm": 0.7133849263191223, + "learning_rate": 4.62765106042417e-05, + "loss": 0.7164, + "step": 1871 + }, + { + "epoch": 2.39616, + "grad_norm": 0.6625071167945862, + "learning_rate": 4.6274509803921576e-05, + "loss": 0.7179, + "step": 1872 + }, + { + "epoch": 2.39744, + "grad_norm": 0.6794329881668091, + "learning_rate": 4.627250900360144e-05, + "loss": 0.7199, + "step": 1873 + }, + { + "epoch": 2.39872, + "grad_norm": 0.630755603313446, + "learning_rate": 4.627050820328131e-05, + "loss": 0.7723, + "step": 1874 + }, + { + "epoch": 2.4, + "grad_norm": 0.6161158680915833, + "learning_rate": 4.6268507402961185e-05, + "loss": 0.5934, + "step": 1875 + }, + { + "epoch": 2.40128, + "grad_norm": 0.6303229928016663, + "learning_rate": 4.6266506602641057e-05, + "loss": 0.689, + "step": 1876 + }, + { + "epoch": 2.40256, + "grad_norm": 
0.6833181977272034, + "learning_rate": 4.626450580232093e-05, + "loss": 0.7554, + "step": 1877 + }, + { + "epoch": 2.4038399999999998, + "grad_norm": 0.6531204581260681, + "learning_rate": 4.62625050020008e-05, + "loss": 0.6816, + "step": 1878 + }, + { + "epoch": 2.40512, + "grad_norm": 0.7215322256088257, + "learning_rate": 4.626050420168068e-05, + "loss": 0.7196, + "step": 1879 + }, + { + "epoch": 2.4064, + "grad_norm": 0.6577185988426208, + "learning_rate": 4.625850340136055e-05, + "loss": 0.6797, + "step": 1880 + }, + { + "epoch": 2.40768, + "grad_norm": 0.6590116620063782, + "learning_rate": 4.6256502601040416e-05, + "loss": 0.6712, + "step": 1881 + }, + { + "epoch": 2.40896, + "grad_norm": 0.715766191482544, + "learning_rate": 4.625450180072029e-05, + "loss": 0.7324, + "step": 1882 + }, + { + "epoch": 2.41024, + "grad_norm": 0.6765993237495422, + "learning_rate": 4.625250100040016e-05, + "loss": 0.726, + "step": 1883 + }, + { + "epoch": 2.41152, + "grad_norm": 0.6403140425682068, + "learning_rate": 4.625050020008003e-05, + "loss": 0.6877, + "step": 1884 + }, + { + "epoch": 2.4128, + "grad_norm": 0.6255505681037903, + "learning_rate": 4.62484993997599e-05, + "loss": 0.6142, + "step": 1885 + }, + { + "epoch": 2.4140800000000002, + "grad_norm": 0.6832484006881714, + "learning_rate": 4.624649859943978e-05, + "loss": 0.6964, + "step": 1886 + }, + { + "epoch": 2.41536, + "grad_norm": 0.6387737989425659, + "learning_rate": 4.6244497799119654e-05, + "loss": 0.7023, + "step": 1887 + }, + { + "epoch": 2.41664, + "grad_norm": 0.6823368072509766, + "learning_rate": 4.6242496998799526e-05, + "loss": 0.7162, + "step": 1888 + }, + { + "epoch": 2.41792, + "grad_norm": 0.6266238689422607, + "learning_rate": 4.624049619847939e-05, + "loss": 0.6378, + "step": 1889 + }, + { + "epoch": 2.4192, + "grad_norm": 0.705518901348114, + "learning_rate": 4.623849539815926e-05, + "loss": 0.7405, + "step": 1890 + }, + { + "epoch": 2.42048, + "grad_norm": 0.6303145885467529, + 
"learning_rate": 4.6236494597839134e-05, + "loss": 0.6536, + "step": 1891 + }, + { + "epoch": 2.42176, + "grad_norm": 0.6812713146209717, + "learning_rate": 4.6234493797519006e-05, + "loss": 0.6988, + "step": 1892 + }, + { + "epoch": 2.42304, + "grad_norm": 0.6497737169265747, + "learning_rate": 4.6232492997198885e-05, + "loss": 0.6926, + "step": 1893 + }, + { + "epoch": 2.42432, + "grad_norm": 0.669630765914917, + "learning_rate": 4.623049219687876e-05, + "loss": 0.7162, + "step": 1894 + }, + { + "epoch": 2.4256, + "grad_norm": 0.6586965918540955, + "learning_rate": 4.622849139655863e-05, + "loss": 0.6695, + "step": 1895 + }, + { + "epoch": 2.42688, + "grad_norm": 0.6347070336341858, + "learning_rate": 4.62264905962385e-05, + "loss": 0.6818, + "step": 1896 + }, + { + "epoch": 2.42816, + "grad_norm": 0.6584689617156982, + "learning_rate": 4.6224489795918366e-05, + "loss": 0.733, + "step": 1897 + }, + { + "epoch": 2.42944, + "grad_norm": 0.6446535587310791, + "learning_rate": 4.622248899559824e-05, + "loss": 0.6938, + "step": 1898 + }, + { + "epoch": 2.43072, + "grad_norm": 0.6775670051574707, + "learning_rate": 4.622048819527811e-05, + "loss": 0.7274, + "step": 1899 + }, + { + "epoch": 2.432, + "grad_norm": 0.636137068271637, + "learning_rate": 4.621848739495799e-05, + "loss": 0.6261, + "step": 1900 + }, + { + "epoch": 2.43328, + "grad_norm": 0.6404497623443604, + "learning_rate": 4.621648659463786e-05, + "loss": 0.6962, + "step": 1901 + }, + { + "epoch": 2.43456, + "grad_norm": 0.661217451095581, + "learning_rate": 4.621448579431773e-05, + "loss": 0.7293, + "step": 1902 + }, + { + "epoch": 2.43584, + "grad_norm": 0.6509891748428345, + "learning_rate": 4.6212484993997603e-05, + "loss": 0.7254, + "step": 1903 + }, + { + "epoch": 2.43712, + "grad_norm": 0.644158661365509, + "learning_rate": 4.6210484193677475e-05, + "loss": 0.6958, + "step": 1904 + }, + { + "epoch": 2.4384, + "grad_norm": 0.6828984022140503, + "learning_rate": 4.620848339335734e-05, + "loss": 0.6665, 
+ "step": 1905 + }, + { + "epoch": 2.43968, + "grad_norm": 0.6128406524658203, + "learning_rate": 4.620648259303721e-05, + "loss": 0.6624, + "step": 1906 + }, + { + "epoch": 2.44096, + "grad_norm": 0.7112566232681274, + "learning_rate": 4.620448179271709e-05, + "loss": 0.7079, + "step": 1907 + }, + { + "epoch": 2.44224, + "grad_norm": 0.6818531155586243, + "learning_rate": 4.620248099239696e-05, + "loss": 0.7034, + "step": 1908 + }, + { + "epoch": 2.44352, + "grad_norm": 0.6345267295837402, + "learning_rate": 4.6200480192076835e-05, + "loss": 0.6841, + "step": 1909 + }, + { + "epoch": 2.4448, + "grad_norm": 0.7214930057525635, + "learning_rate": 4.6198479391756706e-05, + "loss": 0.7026, + "step": 1910 + }, + { + "epoch": 2.44608, + "grad_norm": 0.6622048616409302, + "learning_rate": 4.619647859143658e-05, + "loss": 0.6964, + "step": 1911 + }, + { + "epoch": 2.4473599999999998, + "grad_norm": 0.6323049664497375, + "learning_rate": 4.619447779111645e-05, + "loss": 0.7194, + "step": 1912 + }, + { + "epoch": 2.44864, + "grad_norm": 0.6646015644073486, + "learning_rate": 4.6192476990796315e-05, + "loss": 0.6881, + "step": 1913 + }, + { + "epoch": 2.44992, + "grad_norm": 0.6655915975570679, + "learning_rate": 4.6190476190476194e-05, + "loss": 0.6974, + "step": 1914 + }, + { + "epoch": 2.4512, + "grad_norm": 0.6498141288757324, + "learning_rate": 4.6188475390156066e-05, + "loss": 0.6626, + "step": 1915 + }, + { + "epoch": 2.45248, + "grad_norm": 0.6257582306861877, + "learning_rate": 4.618647458983594e-05, + "loss": 0.6945, + "step": 1916 + }, + { + "epoch": 2.45376, + "grad_norm": 0.6620806455612183, + "learning_rate": 4.618447378951581e-05, + "loss": 0.6976, + "step": 1917 + }, + { + "epoch": 2.45504, + "grad_norm": 0.6166614890098572, + "learning_rate": 4.618247298919568e-05, + "loss": 0.7004, + "step": 1918 + }, + { + "epoch": 2.45632, + "grad_norm": 0.6367224454879761, + "learning_rate": 4.618047218887555e-05, + "loss": 0.7098, + "step": 1919 + }, + { + "epoch": 
2.4576000000000002, + "grad_norm": 0.6246047019958496, + "learning_rate": 4.6178471388555425e-05, + "loss": 0.6933, + "step": 1920 + }, + { + "epoch": 2.45888, + "grad_norm": 0.6195082664489746, + "learning_rate": 4.61764705882353e-05, + "loss": 0.6455, + "step": 1921 + }, + { + "epoch": 2.46016, + "grad_norm": 0.6568025350570679, + "learning_rate": 4.617446978791517e-05, + "loss": 0.7072, + "step": 1922 + }, + { + "epoch": 2.46144, + "grad_norm": 0.6700795292854309, + "learning_rate": 4.617246898759504e-05, + "loss": 0.7353, + "step": 1923 + }, + { + "epoch": 2.46272, + "grad_norm": 0.6941995620727539, + "learning_rate": 4.617046818727491e-05, + "loss": 0.7014, + "step": 1924 + }, + { + "epoch": 2.464, + "grad_norm": 0.6903062462806702, + "learning_rate": 4.6168467386954784e-05, + "loss": 0.7458, + "step": 1925 + }, + { + "epoch": 2.46528, + "grad_norm": 0.6764419078826904, + "learning_rate": 4.6166466586634656e-05, + "loss": 0.6934, + "step": 1926 + }, + { + "epoch": 2.46656, + "grad_norm": 0.6628318428993225, + "learning_rate": 4.616446578631453e-05, + "loss": 0.6735, + "step": 1927 + }, + { + "epoch": 2.46784, + "grad_norm": 0.6923201084136963, + "learning_rate": 4.61624649859944e-05, + "loss": 0.7516, + "step": 1928 + }, + { + "epoch": 2.46912, + "grad_norm": 0.6832184791564941, + "learning_rate": 4.616046418567427e-05, + "loss": 0.7106, + "step": 1929 + }, + { + "epoch": 2.4704, + "grad_norm": 0.6620609164237976, + "learning_rate": 4.6158463385354144e-05, + "loss": 0.6961, + "step": 1930 + }, + { + "epoch": 2.47168, + "grad_norm": 0.7005398273468018, + "learning_rate": 4.6156462585034015e-05, + "loss": 0.7223, + "step": 1931 + }, + { + "epoch": 2.47296, + "grad_norm": 0.6928220987319946, + "learning_rate": 4.615446178471389e-05, + "loss": 0.7758, + "step": 1932 + }, + { + "epoch": 2.47424, + "grad_norm": 0.6193888783454895, + "learning_rate": 4.615246098439376e-05, + "loss": 0.6345, + "step": 1933 + }, + { + "epoch": 2.47552, + "grad_norm": 
0.6694657802581787, + "learning_rate": 4.615046018407363e-05, + "loss": 0.6957, + "step": 1934 + }, + { + "epoch": 2.4768, + "grad_norm": 0.6846767663955688, + "learning_rate": 4.61484593837535e-05, + "loss": 0.6667, + "step": 1935 + }, + { + "epoch": 2.47808, + "grad_norm": 0.6446174383163452, + "learning_rate": 4.6146458583433375e-05, + "loss": 0.7216, + "step": 1936 + }, + { + "epoch": 2.47936, + "grad_norm": 0.6601637601852417, + "learning_rate": 4.6144457783113247e-05, + "loss": 0.7262, + "step": 1937 + }, + { + "epoch": 2.48064, + "grad_norm": 0.6476382613182068, + "learning_rate": 4.614245698279312e-05, + "loss": 0.7142, + "step": 1938 + }, + { + "epoch": 2.48192, + "grad_norm": 0.6557601094245911, + "learning_rate": 4.614045618247299e-05, + "loss": 0.7386, + "step": 1939 + }, + { + "epoch": 2.4832, + "grad_norm": 0.6669868230819702, + "learning_rate": 4.613845538215286e-05, + "loss": 0.7643, + "step": 1940 + }, + { + "epoch": 2.48448, + "grad_norm": 0.6425213813781738, + "learning_rate": 4.6136454581832734e-05, + "loss": 0.643, + "step": 1941 + }, + { + "epoch": 2.48576, + "grad_norm": 0.6689165234565735, + "learning_rate": 4.613445378151261e-05, + "loss": 0.7025, + "step": 1942 + }, + { + "epoch": 2.48704, + "grad_norm": 0.6275350451469421, + "learning_rate": 4.613245298119248e-05, + "loss": 0.6482, + "step": 1943 + }, + { + "epoch": 2.48832, + "grad_norm": 0.6056965589523315, + "learning_rate": 4.613045218087235e-05, + "loss": 0.6807, + "step": 1944 + }, + { + "epoch": 2.4896, + "grad_norm": 0.6282046437263489, + "learning_rate": 4.612845138055222e-05, + "loss": 0.6564, + "step": 1945 + }, + { + "epoch": 2.4908799999999998, + "grad_norm": 0.640059769153595, + "learning_rate": 4.612645058023209e-05, + "loss": 0.6873, + "step": 1946 + }, + { + "epoch": 2.49216, + "grad_norm": 0.6487705707550049, + "learning_rate": 4.6124449779911965e-05, + "loss": 0.6516, + "step": 1947 + }, + { + "epoch": 2.49344, + "grad_norm": 0.6214262843132019, + "learning_rate": 
4.612244897959184e-05, + "loss": 0.699, + "step": 1948 + }, + { + "epoch": 2.49472, + "grad_norm": 0.6714819073677063, + "learning_rate": 4.6120448179271716e-05, + "loss": 0.6946, + "step": 1949 + }, + { + "epoch": 2.496, + "grad_norm": 0.6522096991539001, + "learning_rate": 4.611844737895159e-05, + "loss": 0.7193, + "step": 1950 + }, + { + "epoch": 2.49728, + "grad_norm": 0.6565654277801514, + "learning_rate": 4.611644657863145e-05, + "loss": 0.7604, + "step": 1951 + }, + { + "epoch": 2.49856, + "grad_norm": 0.602493166923523, + "learning_rate": 4.6114445778311324e-05, + "loss": 0.6623, + "step": 1952 + }, + { + "epoch": 2.49984, + "grad_norm": 0.6453181505203247, + "learning_rate": 4.6112444977991196e-05, + "loss": 0.6349, + "step": 1953 + }, + { + "epoch": 2.5011200000000002, + "grad_norm": 0.6643373966217041, + "learning_rate": 4.611044417767107e-05, + "loss": 0.7352, + "step": 1954 + }, + { + "epoch": 2.5023999999999997, + "grad_norm": 0.6704357862472534, + "learning_rate": 4.610844337735094e-05, + "loss": 0.744, + "step": 1955 + }, + { + "epoch": 2.50368, + "grad_norm": 0.6634871363639832, + "learning_rate": 4.610644257703082e-05, + "loss": 0.6748, + "step": 1956 + }, + { + "epoch": 2.50496, + "grad_norm": 0.6585037708282471, + "learning_rate": 4.610444177671069e-05, + "loss": 0.6893, + "step": 1957 + }, + { + "epoch": 2.50624, + "grad_norm": 0.6182425022125244, + "learning_rate": 4.610244097639056e-05, + "loss": 0.7226, + "step": 1958 + }, + { + "epoch": 2.50752, + "grad_norm": 0.6366656422615051, + "learning_rate": 4.610044017607043e-05, + "loss": 0.6535, + "step": 1959 + }, + { + "epoch": 2.5088, + "grad_norm": 0.6848028302192688, + "learning_rate": 4.60984393757503e-05, + "loss": 0.7085, + "step": 1960 + }, + { + "epoch": 2.51008, + "grad_norm": 0.6243534684181213, + "learning_rate": 4.609643857543017e-05, + "loss": 0.6508, + "step": 1961 + }, + { + "epoch": 2.51136, + "grad_norm": 0.6199184656143188, + "learning_rate": 4.609443777511004e-05, + "loss": 
0.6997, + "step": 1962 + }, + { + "epoch": 2.51264, + "grad_norm": 0.6194076538085938, + "learning_rate": 4.6092436974789915e-05, + "loss": 0.6587, + "step": 1963 + }, + { + "epoch": 2.51392, + "grad_norm": 0.6427433490753174, + "learning_rate": 4.6090436174469793e-05, + "loss": 0.6716, + "step": 1964 + }, + { + "epoch": 2.5152, + "grad_norm": 0.6448578238487244, + "learning_rate": 4.6088435374149665e-05, + "loss": 0.7232, + "step": 1965 + }, + { + "epoch": 2.51648, + "grad_norm": 0.6704204678535461, + "learning_rate": 4.608643457382954e-05, + "loss": 0.7486, + "step": 1966 + }, + { + "epoch": 2.51776, + "grad_norm": 0.6715102195739746, + "learning_rate": 4.60844337735094e-05, + "loss": 0.6453, + "step": 1967 + }, + { + "epoch": 2.51904, + "grad_norm": 0.7025008201599121, + "learning_rate": 4.6082432973189274e-05, + "loss": 0.7043, + "step": 1968 + }, + { + "epoch": 2.52032, + "grad_norm": 0.689388632774353, + "learning_rate": 4.6080432172869146e-05, + "loss": 0.686, + "step": 1969 + }, + { + "epoch": 2.5216, + "grad_norm": 0.6979616284370422, + "learning_rate": 4.607843137254902e-05, + "loss": 0.7104, + "step": 1970 + }, + { + "epoch": 2.52288, + "grad_norm": 0.6773265600204468, + "learning_rate": 4.6076430572228896e-05, + "loss": 0.7321, + "step": 1971 + }, + { + "epoch": 2.52416, + "grad_norm": 0.6521515250205994, + "learning_rate": 4.607442977190877e-05, + "loss": 0.6446, + "step": 1972 + }, + { + "epoch": 2.52544, + "grad_norm": 0.6768288612365723, + "learning_rate": 4.607242897158864e-05, + "loss": 0.672, + "step": 1973 + }, + { + "epoch": 2.52672, + "grad_norm": 0.6806971430778503, + "learning_rate": 4.607042817126851e-05, + "loss": 0.7247, + "step": 1974 + }, + { + "epoch": 2.528, + "grad_norm": 0.6461851596832275, + "learning_rate": 4.606842737094838e-05, + "loss": 0.6563, + "step": 1975 + }, + { + "epoch": 2.52928, + "grad_norm": 0.6352160573005676, + "learning_rate": 4.606642657062825e-05, + "loss": 0.6648, + "step": 1976 + }, + { + "epoch": 2.53056, + 
"grad_norm": 0.6496264338493347, + "learning_rate": 4.606442577030812e-05, + "loss": 0.7253, + "step": 1977 + }, + { + "epoch": 2.53184, + "grad_norm": 0.6366921067237854, + "learning_rate": 4.6062424969988e-05, + "loss": 0.6862, + "step": 1978 + }, + { + "epoch": 2.5331200000000003, + "grad_norm": 0.6239694952964783, + "learning_rate": 4.606042416966787e-05, + "loss": 0.6554, + "step": 1979 + }, + { + "epoch": 2.5343999999999998, + "grad_norm": 0.6517849564552307, + "learning_rate": 4.605842336934774e-05, + "loss": 0.7166, + "step": 1980 + }, + { + "epoch": 2.53568, + "grad_norm": 0.6458687782287598, + "learning_rate": 4.6056422569027615e-05, + "loss": 0.7159, + "step": 1981 + }, + { + "epoch": 2.53696, + "grad_norm": 0.6582731008529663, + "learning_rate": 4.605442176870749e-05, + "loss": 0.6434, + "step": 1982 + }, + { + "epoch": 2.53824, + "grad_norm": 0.6515088081359863, + "learning_rate": 4.605242096838735e-05, + "loss": 0.7313, + "step": 1983 + }, + { + "epoch": 2.53952, + "grad_norm": 0.65621417760849, + "learning_rate": 4.6050420168067224e-05, + "loss": 0.657, + "step": 1984 + }, + { + "epoch": 2.5408, + "grad_norm": 0.6879109740257263, + "learning_rate": 4.60484193677471e-05, + "loss": 0.6906, + "step": 1985 + }, + { + "epoch": 2.54208, + "grad_norm": 0.6400864124298096, + "learning_rate": 4.6046418567426974e-05, + "loss": 0.703, + "step": 1986 + }, + { + "epoch": 2.54336, + "grad_norm": 0.6810969114303589, + "learning_rate": 4.6044417767106846e-05, + "loss": 0.712, + "step": 1987 + }, + { + "epoch": 2.5446400000000002, + "grad_norm": 0.6492802500724792, + "learning_rate": 4.604241696678672e-05, + "loss": 0.6362, + "step": 1988 + }, + { + "epoch": 2.5459199999999997, + "grad_norm": 0.6736019253730774, + "learning_rate": 4.604041616646659e-05, + "loss": 0.7507, + "step": 1989 + }, + { + "epoch": 2.5472, + "grad_norm": 0.6537976861000061, + "learning_rate": 4.603841536614646e-05, + "loss": 0.6868, + "step": 1990 + }, + { + "epoch": 2.54848, + "grad_norm": 
0.6979777812957764, + "learning_rate": 4.603641456582633e-05, + "loss": 0.7275, + "step": 1991 + }, + { + "epoch": 2.54976, + "grad_norm": 0.6660913825035095, + "learning_rate": 4.6034413765506205e-05, + "loss": 0.6566, + "step": 1992 + }, + { + "epoch": 2.55104, + "grad_norm": 0.6853949427604675, + "learning_rate": 4.603241296518608e-05, + "loss": 0.7594, + "step": 1993 + }, + { + "epoch": 2.55232, + "grad_norm": 0.6438176035881042, + "learning_rate": 4.603041216486595e-05, + "loss": 0.6594, + "step": 1994 + }, + { + "epoch": 2.5536, + "grad_norm": 0.6791808009147644, + "learning_rate": 4.602841136454582e-05, + "loss": 0.6871, + "step": 1995 + }, + { + "epoch": 2.55488, + "grad_norm": 0.6433961391448975, + "learning_rate": 4.602641056422569e-05, + "loss": 0.6616, + "step": 1996 + }, + { + "epoch": 2.55616, + "grad_norm": 0.6358078122138977, + "learning_rate": 4.6024409763905565e-05, + "loss": 0.6676, + "step": 1997 + }, + { + "epoch": 2.55744, + "grad_norm": 0.6184052228927612, + "learning_rate": 4.6022408963585437e-05, + "loss": 0.7061, + "step": 1998 + }, + { + "epoch": 2.55872, + "grad_norm": 0.6975054740905762, + "learning_rate": 4.602040816326531e-05, + "loss": 0.7147, + "step": 1999 + }, + { + "epoch": 2.56, + "grad_norm": 0.7169183492660522, + "learning_rate": 4.601840736294518e-05, + "loss": 0.7385, + "step": 2000 + }, + { + "epoch": 2.56128, + "grad_norm": 0.6839573383331299, + "learning_rate": 4.601640656262505e-05, + "loss": 0.7901, + "step": 2001 + }, + { + "epoch": 2.56256, + "grad_norm": 0.6511650681495667, + "learning_rate": 4.6014405762304924e-05, + "loss": 0.7281, + "step": 2002 + }, + { + "epoch": 2.56384, + "grad_norm": 0.6317888498306274, + "learning_rate": 4.6012404961984796e-05, + "loss": 0.6668, + "step": 2003 + }, + { + "epoch": 2.56512, + "grad_norm": 0.6695280075073242, + "learning_rate": 4.601040416166467e-05, + "loss": 0.6941, + "step": 2004 + }, + { + "epoch": 2.5664, + "grad_norm": 0.6781874895095825, + "learning_rate": 
4.600840336134454e-05, + "loss": 0.7154, + "step": 2005 + }, + { + "epoch": 2.56768, + "grad_norm": 0.6626363396644592, + "learning_rate": 4.600640256102441e-05, + "loss": 0.6992, + "step": 2006 + }, + { + "epoch": 2.56896, + "grad_norm": 0.702985405921936, + "learning_rate": 4.600440176070428e-05, + "loss": 0.6923, + "step": 2007 + }, + { + "epoch": 2.57024, + "grad_norm": 0.6602315306663513, + "learning_rate": 4.6002400960384155e-05, + "loss": 0.6645, + "step": 2008 + }, + { + "epoch": 2.57152, + "grad_norm": 0.6675458550453186, + "learning_rate": 4.600040016006403e-05, + "loss": 0.6893, + "step": 2009 + }, + { + "epoch": 2.5728, + "grad_norm": 0.7174723744392395, + "learning_rate": 4.59983993597439e-05, + "loss": 0.7271, + "step": 2010 + }, + { + "epoch": 2.57408, + "grad_norm": 0.7206019163131714, + "learning_rate": 4.599639855942377e-05, + "loss": 0.711, + "step": 2011 + }, + { + "epoch": 2.57536, + "grad_norm": 0.6554808616638184, + "learning_rate": 4.599439775910364e-05, + "loss": 0.6322, + "step": 2012 + }, + { + "epoch": 2.5766400000000003, + "grad_norm": 0.6562361717224121, + "learning_rate": 4.599239695878352e-05, + "loss": 0.7233, + "step": 2013 + }, + { + "epoch": 2.5779199999999998, + "grad_norm": 0.6553013920783997, + "learning_rate": 4.5990396158463386e-05, + "loss": 0.6693, + "step": 2014 + }, + { + "epoch": 2.5792, + "grad_norm": 0.651155948638916, + "learning_rate": 4.598839535814326e-05, + "loss": 0.7005, + "step": 2015 + }, + { + "epoch": 2.58048, + "grad_norm": 0.6626810431480408, + "learning_rate": 4.598639455782313e-05, + "loss": 0.6962, + "step": 2016 + }, + { + "epoch": 2.58176, + "grad_norm": 0.6690220832824707, + "learning_rate": 4.5984393757503e-05, + "loss": 0.6897, + "step": 2017 + }, + { + "epoch": 2.58304, + "grad_norm": 0.6718343496322632, + "learning_rate": 4.5982392957182874e-05, + "loss": 0.7071, + "step": 2018 + }, + { + "epoch": 2.58432, + "grad_norm": 0.6581987142562866, + "learning_rate": 4.5980392156862746e-05, + "loss": 
0.682, + "step": 2019 + }, + { + "epoch": 2.5856, + "grad_norm": 0.6765472888946533, + "learning_rate": 4.5978391356542624e-05, + "loss": 0.7696, + "step": 2020 + }, + { + "epoch": 2.58688, + "grad_norm": 0.630276083946228, + "learning_rate": 4.5976390556222496e-05, + "loss": 0.6681, + "step": 2021 + }, + { + "epoch": 2.5881600000000002, + "grad_norm": 0.6415339112281799, + "learning_rate": 4.597438975590236e-05, + "loss": 0.6739, + "step": 2022 + }, + { + "epoch": 2.5894399999999997, + "grad_norm": 0.6534903645515442, + "learning_rate": 4.597238895558223e-05, + "loss": 0.6743, + "step": 2023 + }, + { + "epoch": 2.59072, + "grad_norm": 0.704707145690918, + "learning_rate": 4.5970388155262105e-05, + "loss": 0.749, + "step": 2024 + }, + { + "epoch": 2.592, + "grad_norm": 0.6433389782905579, + "learning_rate": 4.596838735494198e-05, + "loss": 0.6899, + "step": 2025 + }, + { + "epoch": 2.59328, + "grad_norm": 0.6151252388954163, + "learning_rate": 4.596638655462185e-05, + "loss": 0.6721, + "step": 2026 + }, + { + "epoch": 2.59456, + "grad_norm": 0.6191518902778625, + "learning_rate": 4.596438575430173e-05, + "loss": 0.7129, + "step": 2027 + }, + { + "epoch": 2.59584, + "grad_norm": 0.6529395580291748, + "learning_rate": 4.59623849539816e-05, + "loss": 0.6866, + "step": 2028 + }, + { + "epoch": 2.59712, + "grad_norm": 0.6465093493461609, + "learning_rate": 4.596038415366147e-05, + "loss": 0.6547, + "step": 2029 + }, + { + "epoch": 2.5984, + "grad_norm": 0.6575112342834473, + "learning_rate": 4.5958383353341336e-05, + "loss": 0.6982, + "step": 2030 + }, + { + "epoch": 2.59968, + "grad_norm": 0.6782650947570801, + "learning_rate": 4.595638255302121e-05, + "loss": 0.7034, + "step": 2031 + }, + { + "epoch": 2.60096, + "grad_norm": 0.7053045034408569, + "learning_rate": 4.595438175270108e-05, + "loss": 0.7387, + "step": 2032 + }, + { + "epoch": 2.60224, + "grad_norm": 0.6682645678520203, + "learning_rate": 4.595238095238095e-05, + "loss": 0.6812, + "step": 2033 + }, + { + 
"epoch": 2.60352, + "grad_norm": 0.6382338404655457, + "learning_rate": 4.595038015206083e-05, + "loss": 0.6417, + "step": 2034 + }, + { + "epoch": 2.6048, + "grad_norm": 0.6588988900184631, + "learning_rate": 4.59483793517407e-05, + "loss": 0.7028, + "step": 2035 + }, + { + "epoch": 2.60608, + "grad_norm": 0.6959824562072754, + "learning_rate": 4.5946378551420574e-05, + "loss": 0.7239, + "step": 2036 + }, + { + "epoch": 2.60736, + "grad_norm": 0.6393638253211975, + "learning_rate": 4.5944377751100446e-05, + "loss": 0.7437, + "step": 2037 + }, + { + "epoch": 2.60864, + "grad_norm": 0.6316011548042297, + "learning_rate": 4.594237695078031e-05, + "loss": 0.7059, + "step": 2038 + }, + { + "epoch": 2.60992, + "grad_norm": 0.6398905515670776, + "learning_rate": 4.594037615046018e-05, + "loss": 0.692, + "step": 2039 + }, + { + "epoch": 2.6112, + "grad_norm": 0.6583664417266846, + "learning_rate": 4.5938375350140055e-05, + "loss": 0.6969, + "step": 2040 + }, + { + "epoch": 2.61248, + "grad_norm": 0.6351532340049744, + "learning_rate": 4.593637454981993e-05, + "loss": 0.6436, + "step": 2041 + }, + { + "epoch": 2.61376, + "grad_norm": 0.6414986252784729, + "learning_rate": 4.5934373749499805e-05, + "loss": 0.6872, + "step": 2042 + }, + { + "epoch": 2.61504, + "grad_norm": 0.632988452911377, + "learning_rate": 4.593237294917968e-05, + "loss": 0.7222, + "step": 2043 + }, + { + "epoch": 2.61632, + "grad_norm": 0.6204145550727844, + "learning_rate": 4.593037214885955e-05, + "loss": 0.653, + "step": 2044 + }, + { + "epoch": 2.6176, + "grad_norm": 0.6726123094558716, + "learning_rate": 4.592837134853942e-05, + "loss": 0.7185, + "step": 2045 + }, + { + "epoch": 2.61888, + "grad_norm": 0.6624434590339661, + "learning_rate": 4.5926370548219286e-05, + "loss": 0.6756, + "step": 2046 + }, + { + "epoch": 2.6201600000000003, + "grad_norm": 0.66437166929245, + "learning_rate": 4.592436974789916e-05, + "loss": 0.6542, + "step": 2047 + }, + { + "epoch": 2.6214399999999998, + "grad_norm": 
0.6689863801002502, + "learning_rate": 4.5922368947579036e-05, + "loss": 0.6601, + "step": 2048 + }, + { + "epoch": 2.62272, + "grad_norm": 0.6922819018363953, + "learning_rate": 4.592036814725891e-05, + "loss": 0.7033, + "step": 2049 + }, + { + "epoch": 2.624, + "grad_norm": 0.6346952319145203, + "learning_rate": 4.591836734693878e-05, + "loss": 0.6276, + "step": 2050 + }, + { + "epoch": 2.62528, + "grad_norm": 0.6946719884872437, + "learning_rate": 4.591636654661865e-05, + "loss": 0.6687, + "step": 2051 + }, + { + "epoch": 2.62656, + "grad_norm": 0.6633927226066589, + "learning_rate": 4.5914365746298524e-05, + "loss": 0.6556, + "step": 2052 + }, + { + "epoch": 2.62784, + "grad_norm": 0.6315405964851379, + "learning_rate": 4.5912364945978395e-05, + "loss": 0.6286, + "step": 2053 + }, + { + "epoch": 2.62912, + "grad_norm": 0.6600960493087769, + "learning_rate": 4.591036414565826e-05, + "loss": 0.7148, + "step": 2054 + }, + { + "epoch": 2.6304, + "grad_norm": 0.6884539723396301, + "learning_rate": 4.590836334533814e-05, + "loss": 0.7345, + "step": 2055 + }, + { + "epoch": 2.6316800000000002, + "grad_norm": 0.6405014395713806, + "learning_rate": 4.590636254501801e-05, + "loss": 0.6786, + "step": 2056 + }, + { + "epoch": 2.6329599999999997, + "grad_norm": 0.6440674662590027, + "learning_rate": 4.590436174469788e-05, + "loss": 0.7232, + "step": 2057 + }, + { + "epoch": 2.63424, + "grad_norm": 0.6079061627388, + "learning_rate": 4.5902360944377755e-05, + "loss": 0.6926, + "step": 2058 + }, + { + "epoch": 2.63552, + "grad_norm": 0.6500112414360046, + "learning_rate": 4.590036014405763e-05, + "loss": 0.6942, + "step": 2059 + }, + { + "epoch": 2.6368, + "grad_norm": 0.6738446950912476, + "learning_rate": 4.58983593437375e-05, + "loss": 0.7175, + "step": 2060 + }, + { + "epoch": 2.63808, + "grad_norm": 0.6589712500572205, + "learning_rate": 4.589635854341737e-05, + "loss": 0.6648, + "step": 2061 + }, + { + "epoch": 2.63936, + "grad_norm": 0.6929957866668701, + 
"learning_rate": 4.589435774309724e-05, + "loss": 0.7793, + "step": 2062 + }, + { + "epoch": 2.64064, + "grad_norm": 0.6504958271980286, + "learning_rate": 4.5892356942777114e-05, + "loss": 0.6443, + "step": 2063 + }, + { + "epoch": 2.64192, + "grad_norm": 0.6380278468132019, + "learning_rate": 4.5890356142456986e-05, + "loss": 0.7045, + "step": 2064 + }, + { + "epoch": 2.6432, + "grad_norm": 0.6529109477996826, + "learning_rate": 4.588835534213686e-05, + "loss": 0.6324, + "step": 2065 + }, + { + "epoch": 2.64448, + "grad_norm": 0.647123396396637, + "learning_rate": 4.588635454181673e-05, + "loss": 0.6828, + "step": 2066 + }, + { + "epoch": 2.64576, + "grad_norm": 0.6754167079925537, + "learning_rate": 4.58843537414966e-05, + "loss": 0.6967, + "step": 2067 + }, + { + "epoch": 2.64704, + "grad_norm": 0.7286065220832825, + "learning_rate": 4.588235294117647e-05, + "loss": 0.7515, + "step": 2068 + }, + { + "epoch": 2.64832, + "grad_norm": 0.6146422624588013, + "learning_rate": 4.5880352140856345e-05, + "loss": 0.6442, + "step": 2069 + }, + { + "epoch": 2.6496, + "grad_norm": 0.6109498143196106, + "learning_rate": 4.587835134053622e-05, + "loss": 0.6513, + "step": 2070 + }, + { + "epoch": 2.65088, + "grad_norm": 0.6009126305580139, + "learning_rate": 4.587635054021609e-05, + "loss": 0.6343, + "step": 2071 + }, + { + "epoch": 2.65216, + "grad_norm": 0.6297327280044556, + "learning_rate": 4.587434973989596e-05, + "loss": 0.6925, + "step": 2072 + }, + { + "epoch": 2.65344, + "grad_norm": 0.6399157047271729, + "learning_rate": 4.587234893957583e-05, + "loss": 0.6359, + "step": 2073 + }, + { + "epoch": 2.65472, + "grad_norm": 0.626350998878479, + "learning_rate": 4.5870348139255704e-05, + "loss": 0.6937, + "step": 2074 + }, + { + "epoch": 2.656, + "grad_norm": 0.6313692927360535, + "learning_rate": 4.5868347338935576e-05, + "loss": 0.644, + "step": 2075 + }, + { + "epoch": 2.65728, + "grad_norm": 0.671847403049469, + "learning_rate": 4.586634653861545e-05, + "loss": 0.7126, 
+ "step": 2076 + }, + { + "epoch": 2.65856, + "grad_norm": 0.688568651676178, + "learning_rate": 4.586434573829532e-05, + "loss": 0.6857, + "step": 2077 + }, + { + "epoch": 2.65984, + "grad_norm": 0.6881417036056519, + "learning_rate": 4.586234493797519e-05, + "loss": 0.7783, + "step": 2078 + }, + { + "epoch": 2.66112, + "grad_norm": 0.648615300655365, + "learning_rate": 4.5860344137655064e-05, + "loss": 0.6637, + "step": 2079 + }, + { + "epoch": 2.6624, + "grad_norm": 0.6464314460754395, + "learning_rate": 4.5858343337334936e-05, + "loss": 0.6959, + "step": 2080 + }, + { + "epoch": 2.6636800000000003, + "grad_norm": 0.6369978189468384, + "learning_rate": 4.585634253701481e-05, + "loss": 0.6529, + "step": 2081 + }, + { + "epoch": 2.6649599999999998, + "grad_norm": 0.6303538084030151, + "learning_rate": 4.585434173669468e-05, + "loss": 0.6863, + "step": 2082 + }, + { + "epoch": 2.66624, + "grad_norm": 0.6283568143844604, + "learning_rate": 4.585234093637455e-05, + "loss": 0.6879, + "step": 2083 + }, + { + "epoch": 2.66752, + "grad_norm": 0.6396946310997009, + "learning_rate": 4.585034013605442e-05, + "loss": 0.6658, + "step": 2084 + }, + { + "epoch": 2.6688, + "grad_norm": 0.6821319460868835, + "learning_rate": 4.5848339335734295e-05, + "loss": 0.7968, + "step": 2085 + }, + { + "epoch": 2.67008, + "grad_norm": 0.6359503865242004, + "learning_rate": 4.584633853541417e-05, + "loss": 0.6889, + "step": 2086 + }, + { + "epoch": 2.67136, + "grad_norm": 0.6165139079093933, + "learning_rate": 4.584433773509404e-05, + "loss": 0.641, + "step": 2087 + }, + { + "epoch": 2.67264, + "grad_norm": 0.6269519329071045, + "learning_rate": 4.584233693477391e-05, + "loss": 0.6457, + "step": 2088 + }, + { + "epoch": 2.67392, + "grad_norm": 0.6549934148788452, + "learning_rate": 4.584033613445378e-05, + "loss": 0.7314, + "step": 2089 + }, + { + "epoch": 2.6752000000000002, + "grad_norm": 0.6697399616241455, + "learning_rate": 4.5838335334133654e-05, + "loss": 0.701, + "step": 2090 + }, + 
{ + "epoch": 2.6764799999999997, + "grad_norm": 0.6521748304367065, + "learning_rate": 4.583633453381353e-05, + "loss": 0.6884, + "step": 2091 + }, + { + "epoch": 2.67776, + "grad_norm": 0.6441870331764221, + "learning_rate": 4.58343337334934e-05, + "loss": 0.6971, + "step": 2092 + }, + { + "epoch": 2.67904, + "grad_norm": 0.679710328578949, + "learning_rate": 4.583233293317327e-05, + "loss": 0.7295, + "step": 2093 + }, + { + "epoch": 2.68032, + "grad_norm": 0.6788901686668396, + "learning_rate": 4.583033213285314e-05, + "loss": 0.712, + "step": 2094 + }, + { + "epoch": 2.6816, + "grad_norm": 0.6429007053375244, + "learning_rate": 4.5828331332533013e-05, + "loss": 0.7209, + "step": 2095 + }, + { + "epoch": 2.68288, + "grad_norm": 0.6366958022117615, + "learning_rate": 4.5826330532212885e-05, + "loss": 0.6718, + "step": 2096 + }, + { + "epoch": 2.68416, + "grad_norm": 0.6946751475334167, + "learning_rate": 4.582432973189276e-05, + "loss": 0.7622, + "step": 2097 + }, + { + "epoch": 2.68544, + "grad_norm": 0.6607045531272888, + "learning_rate": 4.5822328931572636e-05, + "loss": 0.697, + "step": 2098 + }, + { + "epoch": 2.68672, + "grad_norm": 0.6583904027938843, + "learning_rate": 4.582032813125251e-05, + "loss": 0.7422, + "step": 2099 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.6572896838188171, + "learning_rate": 4.581832733093237e-05, + "loss": 0.7195, + "step": 2100 + }, + { + "epoch": 2.68928, + "grad_norm": 0.6300676465034485, + "learning_rate": 4.5816326530612245e-05, + "loss": 0.6429, + "step": 2101 + }, + { + "epoch": 2.69056, + "grad_norm": 0.6641280651092529, + "learning_rate": 4.5814325730292116e-05, + "loss": 0.7224, + "step": 2102 + }, + { + "epoch": 2.69184, + "grad_norm": 0.6447017788887024, + "learning_rate": 4.581232492997199e-05, + "loss": 0.7085, + "step": 2103 + }, + { + "epoch": 2.69312, + "grad_norm": 0.683560848236084, + "learning_rate": 4.581032412965186e-05, + "loss": 0.7218, + "step": 2104 + }, + { + "epoch": 2.6944, + 
"grad_norm": 0.6947901248931885, + "learning_rate": 4.580832332933174e-05, + "loss": 0.6343, + "step": 2105 + }, + { + "epoch": 2.69568, + "grad_norm": 0.6173972487449646, + "learning_rate": 4.580632252901161e-05, + "loss": 0.7048, + "step": 2106 + }, + { + "epoch": 2.69696, + "grad_norm": 0.6312255263328552, + "learning_rate": 4.580432172869148e-05, + "loss": 0.6644, + "step": 2107 + }, + { + "epoch": 2.69824, + "grad_norm": 0.6200210452079773, + "learning_rate": 4.580232092837135e-05, + "loss": 0.688, + "step": 2108 + }, + { + "epoch": 2.69952, + "grad_norm": 0.6290738582611084, + "learning_rate": 4.580032012805122e-05, + "loss": 0.6392, + "step": 2109 + }, + { + "epoch": 2.7008, + "grad_norm": 0.6984908580780029, + "learning_rate": 4.579831932773109e-05, + "loss": 0.7356, + "step": 2110 + }, + { + "epoch": 2.70208, + "grad_norm": 0.6521112322807312, + "learning_rate": 4.579631852741096e-05, + "loss": 0.6927, + "step": 2111 + }, + { + "epoch": 2.70336, + "grad_norm": 0.622561514377594, + "learning_rate": 4.579431772709084e-05, + "loss": 0.6233, + "step": 2112 + }, + { + "epoch": 2.70464, + "grad_norm": 0.6508548259735107, + "learning_rate": 4.5792316926770714e-05, + "loss": 0.678, + "step": 2113 + }, + { + "epoch": 2.70592, + "grad_norm": 0.6437860131263733, + "learning_rate": 4.5790316126450586e-05, + "loss": 0.6495, + "step": 2114 + }, + { + "epoch": 2.7072000000000003, + "grad_norm": 0.6867465376853943, + "learning_rate": 4.578831532613046e-05, + "loss": 0.6763, + "step": 2115 + }, + { + "epoch": 2.7084799999999998, + "grad_norm": 0.6844937801361084, + "learning_rate": 4.578631452581032e-05, + "loss": 0.7097, + "step": 2116 + }, + { + "epoch": 2.70976, + "grad_norm": 0.647807240486145, + "learning_rate": 4.5784313725490194e-05, + "loss": 0.7127, + "step": 2117 + }, + { + "epoch": 2.71104, + "grad_norm": 0.6669386625289917, + "learning_rate": 4.5782312925170066e-05, + "loss": 0.6953, + "step": 2118 + }, + { + "epoch": 2.71232, + "grad_norm": 0.6261999607086182, 
+ "learning_rate": 4.5780312124849945e-05, + "loss": 0.623, + "step": 2119 + }, + { + "epoch": 2.7136, + "grad_norm": 0.655799925327301, + "learning_rate": 4.577831132452982e-05, + "loss": 0.6649, + "step": 2120 + }, + { + "epoch": 2.71488, + "grad_norm": 0.5952524542808533, + "learning_rate": 4.577631052420969e-05, + "loss": 0.6339, + "step": 2121 + }, + { + "epoch": 2.71616, + "grad_norm": 0.6749306321144104, + "learning_rate": 4.577430972388956e-05, + "loss": 0.7158, + "step": 2122 + }, + { + "epoch": 2.71744, + "grad_norm": 0.6410848498344421, + "learning_rate": 4.577230892356943e-05, + "loss": 0.6914, + "step": 2123 + }, + { + "epoch": 2.7187200000000002, + "grad_norm": 0.63245689868927, + "learning_rate": 4.57703081232493e-05, + "loss": 0.7241, + "step": 2124 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.6515913009643555, + "learning_rate": 4.576830732292917e-05, + "loss": 0.7299, + "step": 2125 + }, + { + "epoch": 2.72128, + "grad_norm": 0.6468390226364136, + "learning_rate": 4.576630652260905e-05, + "loss": 0.6929, + "step": 2126 + }, + { + "epoch": 2.72256, + "grad_norm": 0.6881574988365173, + "learning_rate": 4.576430572228892e-05, + "loss": 0.7325, + "step": 2127 + }, + { + "epoch": 2.72384, + "grad_norm": 0.6494055986404419, + "learning_rate": 4.576230492196879e-05, + "loss": 0.6923, + "step": 2128 + }, + { + "epoch": 2.72512, + "grad_norm": 0.6755759119987488, + "learning_rate": 4.576030412164866e-05, + "loss": 0.6909, + "step": 2129 + }, + { + "epoch": 2.7264, + "grad_norm": 0.6174530982971191, + "learning_rate": 4.5758303321328535e-05, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.72768, + "grad_norm": 0.7030787467956543, + "learning_rate": 4.575630252100841e-05, + "loss": 0.6916, + "step": 2131 + }, + { + "epoch": 2.72896, + "grad_norm": 0.6742822527885437, + "learning_rate": 4.575430172068827e-05, + "loss": 0.6501, + "step": 2132 + }, + { + "epoch": 2.7302400000000002, + "grad_norm": 0.6416477560997009, + "learning_rate": 
4.575230092036815e-05, + "loss": 0.7267, + "step": 2133 + }, + { + "epoch": 2.7315199999999997, + "grad_norm": 0.664212703704834, + "learning_rate": 4.575030012004802e-05, + "loss": 0.7052, + "step": 2134 + }, + { + "epoch": 2.7328, + "grad_norm": 0.6820107102394104, + "learning_rate": 4.5748299319727895e-05, + "loss": 0.7232, + "step": 2135 + }, + { + "epoch": 2.73408, + "grad_norm": 0.6734192967414856, + "learning_rate": 4.5746298519407766e-05, + "loss": 0.7513, + "step": 2136 + }, + { + "epoch": 2.73536, + "grad_norm": 0.6329904794692993, + "learning_rate": 4.574429771908764e-05, + "loss": 0.5988, + "step": 2137 + }, + { + "epoch": 2.73664, + "grad_norm": 0.6895211338996887, + "learning_rate": 4.574229691876751e-05, + "loss": 0.7194, + "step": 2138 + }, + { + "epoch": 2.73792, + "grad_norm": 0.6408979892730713, + "learning_rate": 4.574029611844738e-05, + "loss": 0.6905, + "step": 2139 + }, + { + "epoch": 2.7392, + "grad_norm": 0.6711249947547913, + "learning_rate": 4.5738295318127254e-05, + "loss": 0.7265, + "step": 2140 + }, + { + "epoch": 2.74048, + "grad_norm": 0.6658822894096375, + "learning_rate": 4.5736294517807126e-05, + "loss": 0.6549, + "step": 2141 + }, + { + "epoch": 2.74176, + "grad_norm": 0.6828049421310425, + "learning_rate": 4.5734293717487e-05, + "loss": 0.6709, + "step": 2142 + }, + { + "epoch": 2.74304, + "grad_norm": 0.6909437775611877, + "learning_rate": 4.573229291716687e-05, + "loss": 0.7724, + "step": 2143 + }, + { + "epoch": 2.74432, + "grad_norm": 0.639430582523346, + "learning_rate": 4.573029211684674e-05, + "loss": 0.6728, + "step": 2144 + }, + { + "epoch": 2.7456, + "grad_norm": 0.6498085856437683, + "learning_rate": 4.572829131652661e-05, + "loss": 0.7088, + "step": 2145 + }, + { + "epoch": 2.74688, + "grad_norm": 0.6685836911201477, + "learning_rate": 4.5726290516206485e-05, + "loss": 0.6646, + "step": 2146 + }, + { + "epoch": 2.74816, + "grad_norm": 0.6966959834098816, + "learning_rate": 4.572428971588636e-05, + "loss": 0.7059, + 
"step": 2147 + }, + { + "epoch": 2.74944, + "grad_norm": 0.6679326891899109, + "learning_rate": 4.572228891556623e-05, + "loss": 0.6814, + "step": 2148 + }, + { + "epoch": 2.7507200000000003, + "grad_norm": 0.675516664981842, + "learning_rate": 4.57202881152461e-05, + "loss": 0.8011, + "step": 2149 + }, + { + "epoch": 2.752, + "grad_norm": 0.661491334438324, + "learning_rate": 4.571828731492597e-05, + "loss": 0.6971, + "step": 2150 + }, + { + "epoch": 2.75328, + "grad_norm": 0.6797171831130981, + "learning_rate": 4.5716286514605844e-05, + "loss": 0.6434, + "step": 2151 + }, + { + "epoch": 2.75456, + "grad_norm": 0.6701538562774658, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.6716, + "step": 2152 + }, + { + "epoch": 2.75584, + "grad_norm": 0.651821494102478, + "learning_rate": 4.571228491396559e-05, + "loss": 0.6881, + "step": 2153 + }, + { + "epoch": 2.75712, + "grad_norm": 0.6361088156700134, + "learning_rate": 4.571028411364546e-05, + "loss": 0.6827, + "step": 2154 + }, + { + "epoch": 2.7584, + "grad_norm": 0.6350401043891907, + "learning_rate": 4.570828331332533e-05, + "loss": 0.7337, + "step": 2155 + }, + { + "epoch": 2.75968, + "grad_norm": 0.7136991024017334, + "learning_rate": 4.5706282513005204e-05, + "loss": 0.6458, + "step": 2156 + }, + { + "epoch": 2.76096, + "grad_norm": 0.6868817210197449, + "learning_rate": 4.5704281712685075e-05, + "loss": 0.7136, + "step": 2157 + }, + { + "epoch": 2.7622400000000003, + "grad_norm": 0.676253616809845, + "learning_rate": 4.570228091236495e-05, + "loss": 0.7317, + "step": 2158 + }, + { + "epoch": 2.7635199999999998, + "grad_norm": 0.6780885457992554, + "learning_rate": 4.570028011204482e-05, + "loss": 0.7259, + "step": 2159 + }, + { + "epoch": 2.7648, + "grad_norm": 0.6793497204780579, + "learning_rate": 4.569827931172469e-05, + "loss": 0.7271, + "step": 2160 + }, + { + "epoch": 2.76608, + "grad_norm": 0.6723445653915405, + "learning_rate": 4.569627851140457e-05, + "loss": 0.6884, + "step": 2161 + }, + { + 
"epoch": 2.76736, + "grad_norm": 0.634547233581543, + "learning_rate": 4.5694277711084435e-05, + "loss": 0.6513, + "step": 2162 + }, + { + "epoch": 2.76864, + "grad_norm": 0.6849969625473022, + "learning_rate": 4.5692276910764307e-05, + "loss": 0.742, + "step": 2163 + }, + { + "epoch": 2.76992, + "grad_norm": 0.678898274898529, + "learning_rate": 4.569027611044418e-05, + "loss": 0.6751, + "step": 2164 + }, + { + "epoch": 2.7712, + "grad_norm": 0.6287670135498047, + "learning_rate": 4.568827531012405e-05, + "loss": 0.6337, + "step": 2165 + }, + { + "epoch": 2.77248, + "grad_norm": 0.6324494481086731, + "learning_rate": 4.568627450980392e-05, + "loss": 0.6311, + "step": 2166 + }, + { + "epoch": 2.7737600000000002, + "grad_norm": 0.6523123383522034, + "learning_rate": 4.5684273709483794e-05, + "loss": 0.6919, + "step": 2167 + }, + { + "epoch": 2.7750399999999997, + "grad_norm": 0.6603489518165588, + "learning_rate": 4.568227290916367e-05, + "loss": 0.6948, + "step": 2168 + }, + { + "epoch": 2.77632, + "grad_norm": 0.6603338122367859, + "learning_rate": 4.5680272108843544e-05, + "loss": 0.6322, + "step": 2169 + }, + { + "epoch": 2.7776, + "grad_norm": 0.6867691874504089, + "learning_rate": 4.567827130852341e-05, + "loss": 0.6886, + "step": 2170 + }, + { + "epoch": 2.77888, + "grad_norm": 0.6577850580215454, + "learning_rate": 4.567627050820328e-05, + "loss": 0.6591, + "step": 2171 + }, + { + "epoch": 2.78016, + "grad_norm": 0.6284435987472534, + "learning_rate": 4.567426970788315e-05, + "loss": 0.6754, + "step": 2172 + }, + { + "epoch": 2.78144, + "grad_norm": 0.6540303826332092, + "learning_rate": 4.5672268907563025e-05, + "loss": 0.6868, + "step": 2173 + }, + { + "epoch": 2.78272, + "grad_norm": 0.6498759984970093, + "learning_rate": 4.56702681072429e-05, + "loss": 0.7164, + "step": 2174 + }, + { + "epoch": 2.784, + "grad_norm": 0.686326265335083, + "learning_rate": 4.5668267306922776e-05, + "loss": 0.7155, + "step": 2175 + }, + { + "epoch": 2.78528, + "grad_norm": 
0.6437170505523682, + "learning_rate": 4.566626650660265e-05, + "loss": 0.6718, + "step": 2176 + }, + { + "epoch": 2.78656, + "grad_norm": 0.6241775155067444, + "learning_rate": 4.566426570628252e-05, + "loss": 0.6818, + "step": 2177 + }, + { + "epoch": 2.78784, + "grad_norm": 0.6604918837547302, + "learning_rate": 4.5662264905962384e-05, + "loss": 0.6968, + "step": 2178 + }, + { + "epoch": 2.78912, + "grad_norm": 0.6711975932121277, + "learning_rate": 4.5660264105642256e-05, + "loss": 0.6325, + "step": 2179 + }, + { + "epoch": 2.7904, + "grad_norm": 0.6874549984931946, + "learning_rate": 4.565826330532213e-05, + "loss": 0.7189, + "step": 2180 + }, + { + "epoch": 2.79168, + "grad_norm": 0.6761820316314697, + "learning_rate": 4.5656262505002e-05, + "loss": 0.6852, + "step": 2181 + }, + { + "epoch": 2.79296, + "grad_norm": 0.6783404350280762, + "learning_rate": 4.565426170468188e-05, + "loss": 0.6999, + "step": 2182 + }, + { + "epoch": 2.79424, + "grad_norm": 0.692419171333313, + "learning_rate": 4.565226090436175e-05, + "loss": 0.6883, + "step": 2183 + }, + { + "epoch": 2.79552, + "grad_norm": 0.6205313205718994, + "learning_rate": 4.565026010404162e-05, + "loss": 0.6789, + "step": 2184 + }, + { + "epoch": 2.7968, + "grad_norm": 0.6563310027122498, + "learning_rate": 4.5648259303721494e-05, + "loss": 0.6559, + "step": 2185 + }, + { + "epoch": 2.79808, + "grad_norm": 0.6110013127326965, + "learning_rate": 4.564625850340136e-05, + "loss": 0.6487, + "step": 2186 + }, + { + "epoch": 2.79936, + "grad_norm": 0.6310566067695618, + "learning_rate": 4.564425770308123e-05, + "loss": 0.6614, + "step": 2187 + }, + { + "epoch": 2.80064, + "grad_norm": 0.6590198278427124, + "learning_rate": 4.56422569027611e-05, + "loss": 0.6885, + "step": 2188 + }, + { + "epoch": 2.80192, + "grad_norm": 0.6192741394042969, + "learning_rate": 4.5640256102440975e-05, + "loss": 0.668, + "step": 2189 + }, + { + "epoch": 2.8032, + "grad_norm": 0.6905452013015747, + "learning_rate": 
4.5638255302120853e-05, + "loss": 0.7039, + "step": 2190 + }, + { + "epoch": 2.80448, + "grad_norm": 0.7045333385467529, + "learning_rate": 4.5636254501800725e-05, + "loss": 0.7192, + "step": 2191 + }, + { + "epoch": 2.8057600000000003, + "grad_norm": 0.6958116888999939, + "learning_rate": 4.56342537014806e-05, + "loss": 0.6825, + "step": 2192 + }, + { + "epoch": 2.8070399999999998, + "grad_norm": 0.6681202054023743, + "learning_rate": 4.563225290116047e-05, + "loss": 0.6615, + "step": 2193 + }, + { + "epoch": 2.80832, + "grad_norm": 0.6813097596168518, + "learning_rate": 4.5630252100840334e-05, + "loss": 0.7027, + "step": 2194 + }, + { + "epoch": 2.8096, + "grad_norm": 0.6501699090003967, + "learning_rate": 4.5628251300520206e-05, + "loss": 0.6703, + "step": 2195 + }, + { + "epoch": 2.81088, + "grad_norm": 0.6794613599777222, + "learning_rate": 4.562625050020008e-05, + "loss": 0.681, + "step": 2196 + }, + { + "epoch": 2.81216, + "grad_norm": 0.6721237301826477, + "learning_rate": 4.5624249699879956e-05, + "loss": 0.6923, + "step": 2197 + }, + { + "epoch": 2.81344, + "grad_norm": 0.6417657732963562, + "learning_rate": 4.562224889955983e-05, + "loss": 0.6285, + "step": 2198 + }, + { + "epoch": 2.81472, + "grad_norm": 0.6990517973899841, + "learning_rate": 4.56202480992397e-05, + "loss": 0.6986, + "step": 2199 + }, + { + "epoch": 2.816, + "grad_norm": 0.6576136350631714, + "learning_rate": 4.561824729891957e-05, + "loss": 0.7217, + "step": 2200 + }, + { + "epoch": 2.8172800000000002, + "grad_norm": 0.6665319204330444, + "learning_rate": 4.5616246498599444e-05, + "loss": 0.6833, + "step": 2201 + }, + { + "epoch": 2.8185599999999997, + "grad_norm": 0.6458298563957214, + "learning_rate": 4.561424569827931e-05, + "loss": 0.7176, + "step": 2202 + }, + { + "epoch": 2.81984, + "grad_norm": 0.624575674533844, + "learning_rate": 4.561224489795918e-05, + "loss": 0.6285, + "step": 2203 + }, + { + "epoch": 2.82112, + "grad_norm": 0.644964337348938, + "learning_rate": 
4.561024409763906e-05, + "loss": 0.7044, + "step": 2204 + }, + { + "epoch": 2.8224, + "grad_norm": 0.6390849947929382, + "learning_rate": 4.560824329731893e-05, + "loss": 0.7089, + "step": 2205 + }, + { + "epoch": 2.82368, + "grad_norm": 0.6157050132751465, + "learning_rate": 4.56062424969988e-05, + "loss": 0.6307, + "step": 2206 + }, + { + "epoch": 2.82496, + "grad_norm": 0.6521828770637512, + "learning_rate": 4.5604241696678675e-05, + "loss": 0.6278, + "step": 2207 + }, + { + "epoch": 2.82624, + "grad_norm": 0.6566979289054871, + "learning_rate": 4.560224089635855e-05, + "loss": 0.698, + "step": 2208 + }, + { + "epoch": 2.82752, + "grad_norm": 0.6705225110054016, + "learning_rate": 4.560024009603842e-05, + "loss": 0.7031, + "step": 2209 + }, + { + "epoch": 2.8288, + "grad_norm": 0.6485233306884766, + "learning_rate": 4.5598239295718284e-05, + "loss": 0.6328, + "step": 2210 + }, + { + "epoch": 2.83008, + "grad_norm": 0.6989656686782837, + "learning_rate": 4.559623849539816e-05, + "loss": 0.7367, + "step": 2211 + }, + { + "epoch": 2.83136, + "grad_norm": 0.705863356590271, + "learning_rate": 4.5594237695078034e-05, + "loss": 0.6894, + "step": 2212 + }, + { + "epoch": 2.83264, + "grad_norm": 0.65732741355896, + "learning_rate": 4.5592236894757906e-05, + "loss": 0.7177, + "step": 2213 + }, + { + "epoch": 2.83392, + "grad_norm": 0.6545193195343018, + "learning_rate": 4.559023609443778e-05, + "loss": 0.7393, + "step": 2214 + }, + { + "epoch": 2.8352, + "grad_norm": 0.6775088906288147, + "learning_rate": 4.558823529411765e-05, + "loss": 0.7286, + "step": 2215 + }, + { + "epoch": 2.83648, + "grad_norm": 0.7127577662467957, + "learning_rate": 4.558623449379752e-05, + "loss": 0.6682, + "step": 2216 + }, + { + "epoch": 2.83776, + "grad_norm": 0.6672844886779785, + "learning_rate": 4.5584233693477394e-05, + "loss": 0.6333, + "step": 2217 + }, + { + "epoch": 2.83904, + "grad_norm": 0.6709384322166443, + "learning_rate": 4.5582232893157265e-05, + "loss": 0.6946, + "step": 2218 
+ }, + { + "epoch": 2.84032, + "grad_norm": 0.6919859647750854, + "learning_rate": 4.558023209283714e-05, + "loss": 0.7289, + "step": 2219 + }, + { + "epoch": 2.8416, + "grad_norm": 0.663409411907196, + "learning_rate": 4.557823129251701e-05, + "loss": 0.6832, + "step": 2220 + }, + { + "epoch": 2.84288, + "grad_norm": 0.7317196130752563, + "learning_rate": 4.557623049219688e-05, + "loss": 0.7423, + "step": 2221 + }, + { + "epoch": 2.84416, + "grad_norm": 0.6240825057029724, + "learning_rate": 4.557422969187675e-05, + "loss": 0.6365, + "step": 2222 + }, + { + "epoch": 2.84544, + "grad_norm": 0.6449801921844482, + "learning_rate": 4.5572228891556625e-05, + "loss": 0.7191, + "step": 2223 + }, + { + "epoch": 2.84672, + "grad_norm": 0.6569290161132812, + "learning_rate": 4.5570228091236497e-05, + "loss": 0.6668, + "step": 2224 + }, + { + "epoch": 2.848, + "grad_norm": 0.6520342826843262, + "learning_rate": 4.556822729091637e-05, + "loss": 0.6671, + "step": 2225 + }, + { + "epoch": 2.8492800000000003, + "grad_norm": 0.708814263343811, + "learning_rate": 4.556622649059624e-05, + "loss": 0.7664, + "step": 2226 + }, + { + "epoch": 2.8505599999999998, + "grad_norm": 0.6431106925010681, + "learning_rate": 4.556422569027611e-05, + "loss": 0.6481, + "step": 2227 + }, + { + "epoch": 2.85184, + "grad_norm": 0.6694732904434204, + "learning_rate": 4.5562224889955984e-05, + "loss": 0.6742, + "step": 2228 + }, + { + "epoch": 2.85312, + "grad_norm": 0.6356359124183655, + "learning_rate": 4.5560224089635856e-05, + "loss": 0.6854, + "step": 2229 + }, + { + "epoch": 2.8544, + "grad_norm": 0.6448719501495361, + "learning_rate": 4.555822328931573e-05, + "loss": 0.7208, + "step": 2230 + }, + { + "epoch": 2.85568, + "grad_norm": 0.6238778233528137, + "learning_rate": 4.55562224889956e-05, + "loss": 0.6774, + "step": 2231 + }, + { + "epoch": 2.85696, + "grad_norm": 0.6807205677032471, + "learning_rate": 4.555422168867547e-05, + "loss": 0.6773, + "step": 2232 + }, + { + "epoch": 2.85824, + 
"grad_norm": 0.6966913938522339, + "learning_rate": 4.555222088835534e-05, + "loss": 0.7078, + "step": 2233 + }, + { + "epoch": 2.85952, + "grad_norm": 0.666556715965271, + "learning_rate": 4.5550220088035215e-05, + "loss": 0.714, + "step": 2234 + }, + { + "epoch": 2.8608000000000002, + "grad_norm": 0.6896193027496338, + "learning_rate": 4.554821928771509e-05, + "loss": 0.7264, + "step": 2235 + }, + { + "epoch": 2.8620799999999997, + "grad_norm": 0.6884152293205261, + "learning_rate": 4.554621848739496e-05, + "loss": 0.7435, + "step": 2236 + }, + { + "epoch": 2.86336, + "grad_norm": 0.7019590735435486, + "learning_rate": 4.554421768707483e-05, + "loss": 0.7466, + "step": 2237 + }, + { + "epoch": 2.86464, + "grad_norm": 0.6901242733001709, + "learning_rate": 4.55422168867547e-05, + "loss": 0.6879, + "step": 2238 + }, + { + "epoch": 2.86592, + "grad_norm": 0.6424590349197388, + "learning_rate": 4.554021608643458e-05, + "loss": 0.6982, + "step": 2239 + }, + { + "epoch": 2.8672, + "grad_norm": 0.6135256290435791, + "learning_rate": 4.5538215286114446e-05, + "loss": 0.6547, + "step": 2240 + }, + { + "epoch": 2.86848, + "grad_norm": 0.6518721580505371, + "learning_rate": 4.553621448579432e-05, + "loss": 0.6951, + "step": 2241 + }, + { + "epoch": 2.86976, + "grad_norm": 0.6382459402084351, + "learning_rate": 4.553421368547419e-05, + "loss": 0.6453, + "step": 2242 + }, + { + "epoch": 2.87104, + "grad_norm": 0.6710212230682373, + "learning_rate": 4.553221288515406e-05, + "loss": 0.718, + "step": 2243 + }, + { + "epoch": 2.87232, + "grad_norm": 0.6573660373687744, + "learning_rate": 4.5530212084833934e-05, + "loss": 0.6752, + "step": 2244 + }, + { + "epoch": 2.8736, + "grad_norm": 0.6997604966163635, + "learning_rate": 4.5528211284513806e-05, + "loss": 0.7185, + "step": 2245 + }, + { + "epoch": 2.87488, + "grad_norm": 0.6322104930877686, + "learning_rate": 4.5526210484193684e-05, + "loss": 0.6109, + "step": 2246 + }, + { + "epoch": 2.87616, + "grad_norm": 0.6722203493118286, 
+ "learning_rate": 4.5524209683873556e-05, + "loss": 0.702, + "step": 2247 + }, + { + "epoch": 2.87744, + "grad_norm": 0.6825948357582092, + "learning_rate": 4.552220888355342e-05, + "loss": 0.6802, + "step": 2248 + }, + { + "epoch": 2.87872, + "grad_norm": 0.6763216853141785, + "learning_rate": 4.552020808323329e-05, + "loss": 0.7054, + "step": 2249 + }, + { + "epoch": 2.88, + "grad_norm": 0.6940529346466064, + "learning_rate": 4.5518207282913165e-05, + "loss": 0.7395, + "step": 2250 + }, + { + "epoch": 2.88128, + "grad_norm": 0.6334916949272156, + "learning_rate": 4.551620648259304e-05, + "loss": 0.6658, + "step": 2251 + }, + { + "epoch": 2.88256, + "grad_norm": 0.6614584922790527, + "learning_rate": 4.551420568227291e-05, + "loss": 0.7377, + "step": 2252 + }, + { + "epoch": 2.88384, + "grad_norm": 0.6642113924026489, + "learning_rate": 4.551220488195279e-05, + "loss": 0.6981, + "step": 2253 + }, + { + "epoch": 2.88512, + "grad_norm": 0.624686598777771, + "learning_rate": 4.551020408163266e-05, + "loss": 0.7133, + "step": 2254 + }, + { + "epoch": 2.8864, + "grad_norm": 0.67439204454422, + "learning_rate": 4.550820328131253e-05, + "loss": 0.7079, + "step": 2255 + }, + { + "epoch": 2.88768, + "grad_norm": 0.6298664212226868, + "learning_rate": 4.5506202480992396e-05, + "loss": 0.6848, + "step": 2256 + }, + { + "epoch": 2.88896, + "grad_norm": 0.604558527469635, + "learning_rate": 4.550420168067227e-05, + "loss": 0.6442, + "step": 2257 + }, + { + "epoch": 2.89024, + "grad_norm": 0.6485775113105774, + "learning_rate": 4.550220088035214e-05, + "loss": 0.6867, + "step": 2258 + }, + { + "epoch": 2.89152, + "grad_norm": 0.6755024194717407, + "learning_rate": 4.550020008003201e-05, + "loss": 0.705, + "step": 2259 + }, + { + "epoch": 2.8928000000000003, + "grad_norm": 0.664548397064209, + "learning_rate": 4.549819927971189e-05, + "loss": 0.657, + "step": 2260 + }, + { + "epoch": 2.8940799999999998, + "grad_norm": 0.6621407866477966, + "learning_rate": 
4.549619847939176e-05, + "loss": 0.7003, + "step": 2261 + }, + { + "epoch": 2.89536, + "grad_norm": 0.6824017763137817, + "learning_rate": 4.5494197679071634e-05, + "loss": 0.7026, + "step": 2262 + }, + { + "epoch": 2.89664, + "grad_norm": 0.6626001000404358, + "learning_rate": 4.5492196878751506e-05, + "loss": 0.6712, + "step": 2263 + }, + { + "epoch": 2.89792, + "grad_norm": 0.6786776781082153, + "learning_rate": 4.549019607843137e-05, + "loss": 0.6515, + "step": 2264 + }, + { + "epoch": 2.8992, + "grad_norm": 0.677880048751831, + "learning_rate": 4.548819527811124e-05, + "loss": 0.6956, + "step": 2265 + }, + { + "epoch": 2.90048, + "grad_norm": 0.6466307640075684, + "learning_rate": 4.5486194477791115e-05, + "loss": 0.7, + "step": 2266 + }, + { + "epoch": 2.90176, + "grad_norm": 0.6700591444969177, + "learning_rate": 4.548419367747099e-05, + "loss": 0.708, + "step": 2267 + }, + { + "epoch": 2.90304, + "grad_norm": 0.6479355096817017, + "learning_rate": 4.5482192877150865e-05, + "loss": 0.6921, + "step": 2268 + }, + { + "epoch": 2.9043200000000002, + "grad_norm": 0.6609050631523132, + "learning_rate": 4.548019207683074e-05, + "loss": 0.7045, + "step": 2269 + }, + { + "epoch": 2.9055999999999997, + "grad_norm": 0.6681724190711975, + "learning_rate": 4.547819127651061e-05, + "loss": 0.6764, + "step": 2270 + }, + { + "epoch": 2.90688, + "grad_norm": 0.6616440415382385, + "learning_rate": 4.547619047619048e-05, + "loss": 0.6934, + "step": 2271 + }, + { + "epoch": 2.90816, + "grad_norm": 0.6577678322792053, + "learning_rate": 4.5474189675870346e-05, + "loss": 0.7208, + "step": 2272 + }, + { + "epoch": 2.90944, + "grad_norm": 0.6378509402275085, + "learning_rate": 4.547218887555022e-05, + "loss": 0.6296, + "step": 2273 + }, + { + "epoch": 2.91072, + "grad_norm": 0.6398113965988159, + "learning_rate": 4.5470188075230096e-05, + "loss": 0.631, + "step": 2274 + }, + { + "epoch": 2.912, + "grad_norm": 0.6845940947532654, + "learning_rate": 4.546818727490997e-05, + "loss": 
0.7193, + "step": 2275 + }, + { + "epoch": 2.91328, + "grad_norm": 0.6516909003257751, + "learning_rate": 4.546618647458984e-05, + "loss": 0.7033, + "step": 2276 + }, + { + "epoch": 2.91456, + "grad_norm": 0.6223090887069702, + "learning_rate": 4.546418567426971e-05, + "loss": 0.6473, + "step": 2277 + }, + { + "epoch": 2.91584, + "grad_norm": 0.6670078635215759, + "learning_rate": 4.5462184873949584e-05, + "loss": 0.7206, + "step": 2278 + }, + { + "epoch": 2.91712, + "grad_norm": 0.6714773774147034, + "learning_rate": 4.5460184073629455e-05, + "loss": 0.6349, + "step": 2279 + }, + { + "epoch": 2.9184, + "grad_norm": 0.6824267506599426, + "learning_rate": 4.545818327330932e-05, + "loss": 0.6335, + "step": 2280 + }, + { + "epoch": 2.91968, + "grad_norm": 0.7514560222625732, + "learning_rate": 4.54561824729892e-05, + "loss": 0.7474, + "step": 2281 + }, + { + "epoch": 2.92096, + "grad_norm": 0.6569649577140808, + "learning_rate": 4.545418167266907e-05, + "loss": 0.7029, + "step": 2282 + }, + { + "epoch": 2.92224, + "grad_norm": 0.700855553150177, + "learning_rate": 4.545218087234894e-05, + "loss": 0.7706, + "step": 2283 + }, + { + "epoch": 2.92352, + "grad_norm": 0.6456267833709717, + "learning_rate": 4.5450180072028815e-05, + "loss": 0.7021, + "step": 2284 + }, + { + "epoch": 2.9248, + "grad_norm": 0.6533935070037842, + "learning_rate": 4.5448179271708687e-05, + "loss": 0.6852, + "step": 2285 + }, + { + "epoch": 2.92608, + "grad_norm": 0.6358088850975037, + "learning_rate": 4.544617847138856e-05, + "loss": 0.7173, + "step": 2286 + }, + { + "epoch": 2.92736, + "grad_norm": 0.6510772109031677, + "learning_rate": 4.544417767106843e-05, + "loss": 0.6866, + "step": 2287 + }, + { + "epoch": 2.92864, + "grad_norm": 0.642501950263977, + "learning_rate": 4.54421768707483e-05, + "loss": 0.7078, + "step": 2288 + }, + { + "epoch": 2.92992, + "grad_norm": 0.6543188095092773, + "learning_rate": 4.5440176070428174e-05, + "loss": 0.6925, + "step": 2289 + }, + { + "epoch": 2.9312, + 
"grad_norm": 0.7026666402816772, + "learning_rate": 4.5438175270108046e-05, + "loss": 0.7661, + "step": 2290 + }, + { + "epoch": 2.93248, + "grad_norm": 0.6860962510108948, + "learning_rate": 4.543617446978792e-05, + "loss": 0.7165, + "step": 2291 + }, + { + "epoch": 2.93376, + "grad_norm": 0.6254853010177612, + "learning_rate": 4.543417366946779e-05, + "loss": 0.6598, + "step": 2292 + }, + { + "epoch": 2.93504, + "grad_norm": 0.6507871150970459, + "learning_rate": 4.543217286914766e-05, + "loss": 0.6557, + "step": 2293 + }, + { + "epoch": 2.9363200000000003, + "grad_norm": 0.6633324027061462, + "learning_rate": 4.543017206882753e-05, + "loss": 0.6782, + "step": 2294 + }, + { + "epoch": 2.9375999999999998, + "grad_norm": 0.6420159935951233, + "learning_rate": 4.5428171268507405e-05, + "loss": 0.7025, + "step": 2295 + }, + { + "epoch": 2.93888, + "grad_norm": 0.6702166795730591, + "learning_rate": 4.542617046818728e-05, + "loss": 0.7024, + "step": 2296 + }, + { + "epoch": 2.94016, + "grad_norm": 0.6596609950065613, + "learning_rate": 4.542416966786715e-05, + "loss": 0.6926, + "step": 2297 + }, + { + "epoch": 2.94144, + "grad_norm": 0.6574816703796387, + "learning_rate": 4.542216886754702e-05, + "loss": 0.6624, + "step": 2298 + }, + { + "epoch": 2.94272, + "grad_norm": 0.6869375109672546, + "learning_rate": 4.542016806722689e-05, + "loss": 0.6966, + "step": 2299 + }, + { + "epoch": 2.944, + "grad_norm": 0.659354031085968, + "learning_rate": 4.5418167266906764e-05, + "loss": 0.6866, + "step": 2300 + }, + { + "epoch": 2.94528, + "grad_norm": 0.6519255042076111, + "learning_rate": 4.5416166466586636e-05, + "loss": 0.6968, + "step": 2301 + }, + { + "epoch": 2.94656, + "grad_norm": 0.6179057955741882, + "learning_rate": 4.541416566626651e-05, + "loss": 0.6456, + "step": 2302 + }, + { + "epoch": 2.9478400000000002, + "grad_norm": 0.6320080161094666, + "learning_rate": 4.541216486594638e-05, + "loss": 0.6268, + "step": 2303 + }, + { + "epoch": 2.9491199999999997, + 
"grad_norm": 0.6388428807258606, + "learning_rate": 4.541016406562625e-05, + "loss": 0.7033, + "step": 2304 + }, + { + "epoch": 2.9504, + "grad_norm": 0.6056840419769287, + "learning_rate": 4.5408163265306124e-05, + "loss": 0.6709, + "step": 2305 + }, + { + "epoch": 2.95168, + "grad_norm": 0.6787373423576355, + "learning_rate": 4.5406162464985996e-05, + "loss": 0.6955, + "step": 2306 + }, + { + "epoch": 2.95296, + "grad_norm": 0.6520240306854248, + "learning_rate": 4.540416166466587e-05, + "loss": 0.645, + "step": 2307 + }, + { + "epoch": 2.95424, + "grad_norm": 0.6489017605781555, + "learning_rate": 4.540216086434574e-05, + "loss": 0.6428, + "step": 2308 + }, + { + "epoch": 2.95552, + "grad_norm": 0.6715541481971741, + "learning_rate": 4.540016006402561e-05, + "loss": 0.6948, + "step": 2309 + }, + { + "epoch": 2.9568, + "grad_norm": 0.7156697511672974, + "learning_rate": 4.539815926370549e-05, + "loss": 0.696, + "step": 2310 + }, + { + "epoch": 2.95808, + "grad_norm": 0.6907376050949097, + "learning_rate": 4.5396158463385355e-05, + "loss": 0.7406, + "step": 2311 + }, + { + "epoch": 2.95936, + "grad_norm": 0.6593761444091797, + "learning_rate": 4.539415766306523e-05, + "loss": 0.6731, + "step": 2312 + }, + { + "epoch": 2.96064, + "grad_norm": 0.6654943227767944, + "learning_rate": 4.53921568627451e-05, + "loss": 0.6773, + "step": 2313 + }, + { + "epoch": 2.96192, + "grad_norm": 0.6800094246864319, + "learning_rate": 4.539015606242497e-05, + "loss": 0.7691, + "step": 2314 + }, + { + "epoch": 2.9632, + "grad_norm": 0.6977217793464661, + "learning_rate": 4.538815526210484e-05, + "loss": 0.6491, + "step": 2315 + }, + { + "epoch": 2.96448, + "grad_norm": 0.6390252113342285, + "learning_rate": 4.5386154461784714e-05, + "loss": 0.6644, + "step": 2316 + }, + { + "epoch": 2.96576, + "grad_norm": 0.6670240163803101, + "learning_rate": 4.538415366146459e-05, + "loss": 0.6976, + "step": 2317 + }, + { + "epoch": 2.96704, + "grad_norm": 0.6732922196388245, + "learning_rate": 
4.5382152861144465e-05, + "loss": 0.6924, + "step": 2318 + }, + { + "epoch": 2.96832, + "grad_norm": 0.6866724491119385, + "learning_rate": 4.538015206082433e-05, + "loss": 0.6729, + "step": 2319 + }, + { + "epoch": 2.9696, + "grad_norm": 0.6516912579536438, + "learning_rate": 4.53781512605042e-05, + "loss": 0.7145, + "step": 2320 + }, + { + "epoch": 2.97088, + "grad_norm": 0.6390223503112793, + "learning_rate": 4.5376150460184073e-05, + "loss": 0.6462, + "step": 2321 + }, + { + "epoch": 2.97216, + "grad_norm": 0.6120672821998596, + "learning_rate": 4.5374149659863945e-05, + "loss": 0.6401, + "step": 2322 + }, + { + "epoch": 2.97344, + "grad_norm": 0.6073777675628662, + "learning_rate": 4.537214885954382e-05, + "loss": 0.6114, + "step": 2323 + }, + { + "epoch": 2.97472, + "grad_norm": 0.698907732963562, + "learning_rate": 4.5370148059223696e-05, + "loss": 0.7557, + "step": 2324 + }, + { + "epoch": 2.976, + "grad_norm": 0.7005800008773804, + "learning_rate": 4.536814725890357e-05, + "loss": 0.7338, + "step": 2325 + }, + { + "epoch": 2.97728, + "grad_norm": 0.673547089099884, + "learning_rate": 4.536614645858344e-05, + "loss": 0.6493, + "step": 2326 + }, + { + "epoch": 2.97856, + "grad_norm": 0.6851591467857361, + "learning_rate": 4.5364145658263305e-05, + "loss": 0.7215, + "step": 2327 + }, + { + "epoch": 2.9798400000000003, + "grad_norm": 0.5817590355873108, + "learning_rate": 4.5362144857943176e-05, + "loss": 0.5828, + "step": 2328 + }, + { + "epoch": 2.9811199999999998, + "grad_norm": 0.6559504866600037, + "learning_rate": 4.536014405762305e-05, + "loss": 0.6832, + "step": 2329 + }, + { + "epoch": 2.9824, + "grad_norm": 0.6387779712677002, + "learning_rate": 4.535814325730292e-05, + "loss": 0.6812, + "step": 2330 + }, + { + "epoch": 2.98368, + "grad_norm": 0.6166095733642578, + "learning_rate": 4.53561424569828e-05, + "loss": 0.6638, + "step": 2331 + }, + { + "epoch": 2.98496, + "grad_norm": 0.6481328010559082, + "learning_rate": 4.535414165666267e-05, + "loss": 
0.6361, + "step": 2332 + }, + { + "epoch": 2.98624, + "grad_norm": 0.6827143430709839, + "learning_rate": 4.535214085634254e-05, + "loss": 0.687, + "step": 2333 + }, + { + "epoch": 2.98752, + "grad_norm": 0.6266081929206848, + "learning_rate": 4.5350140056022414e-05, + "loss": 0.6573, + "step": 2334 + }, + { + "epoch": 2.9888, + "grad_norm": 0.6480961441993713, + "learning_rate": 4.534813925570228e-05, + "loss": 0.6742, + "step": 2335 + }, + { + "epoch": 2.99008, + "grad_norm": 0.6894843578338623, + "learning_rate": 4.534613845538215e-05, + "loss": 0.672, + "step": 2336 + }, + { + "epoch": 2.9913600000000002, + "grad_norm": 0.6723330020904541, + "learning_rate": 4.534413765506202e-05, + "loss": 0.6999, + "step": 2337 + }, + { + "epoch": 2.9926399999999997, + "grad_norm": 0.7179523706436157, + "learning_rate": 4.53421368547419e-05, + "loss": 0.7847, + "step": 2338 + }, + { + "epoch": 2.99392, + "grad_norm": 0.648439884185791, + "learning_rate": 4.5340136054421774e-05, + "loss": 0.6769, + "step": 2339 + }, + { + "epoch": 2.9952, + "grad_norm": 0.6338523030281067, + "learning_rate": 4.5338135254101645e-05, + "loss": 0.6469, + "step": 2340 + }, + { + "epoch": 2.99648, + "grad_norm": 0.6572247743606567, + "learning_rate": 4.533613445378152e-05, + "loss": 0.7077, + "step": 2341 + }, + { + "epoch": 2.99776, + "grad_norm": 0.6434239149093628, + "learning_rate": 4.533413365346139e-05, + "loss": 0.6655, + "step": 2342 + }, + { + "epoch": 2.99904, + "grad_norm": 0.67351895570755, + "learning_rate": 4.5332132853141254e-05, + "loss": 0.7103, + "step": 2343 + }, + { + "epoch": 3.00032, + "grad_norm": 1.486992359161377, + "learning_rate": 4.5330132052821126e-05, + "loss": 1.2061, + "step": 2344 + }, + { + "epoch": 3.0016, + "grad_norm": 0.6494001150131226, + "learning_rate": 4.5328131252501005e-05, + "loss": 0.7025, + "step": 2345 + }, + { + "epoch": 3.00288, + "grad_norm": 0.6204409003257751, + "learning_rate": 4.532613045218088e-05, + "loss": 0.665, + "step": 2346 + }, + { + 
"epoch": 3.00416, + "grad_norm": 0.6547411680221558, + "learning_rate": 4.532412965186075e-05, + "loss": 0.6653, + "step": 2347 + }, + { + "epoch": 3.00544, + "grad_norm": 0.6656926274299622, + "learning_rate": 4.532212885154062e-05, + "loss": 0.6842, + "step": 2348 + }, + { + "epoch": 3.00672, + "grad_norm": 0.7050084471702576, + "learning_rate": 4.532012805122049e-05, + "loss": 0.6961, + "step": 2349 + }, + { + "epoch": 3.008, + "grad_norm": 0.6964038610458374, + "learning_rate": 4.5318127250900364e-05, + "loss": 0.6978, + "step": 2350 + }, + { + "epoch": 3.00928, + "grad_norm": 0.6723408699035645, + "learning_rate": 4.531612645058023e-05, + "loss": 0.7163, + "step": 2351 + }, + { + "epoch": 3.01056, + "grad_norm": 0.6292080879211426, + "learning_rate": 4.531412565026011e-05, + "loss": 0.6484, + "step": 2352 + }, + { + "epoch": 3.01184, + "grad_norm": 0.6262539625167847, + "learning_rate": 4.531212484993998e-05, + "loss": 0.6386, + "step": 2353 + }, + { + "epoch": 3.01312, + "grad_norm": 0.650016725063324, + "learning_rate": 4.531012404961985e-05, + "loss": 0.6554, + "step": 2354 + }, + { + "epoch": 3.0144, + "grad_norm": 0.6762639880180359, + "learning_rate": 4.530812324929972e-05, + "loss": 0.6389, + "step": 2355 + }, + { + "epoch": 3.01568, + "grad_norm": 0.7196240425109863, + "learning_rate": 4.5306122448979595e-05, + "loss": 0.6837, + "step": 2356 + }, + { + "epoch": 3.01696, + "grad_norm": 0.6458487510681152, + "learning_rate": 4.530412164865947e-05, + "loss": 0.6213, + "step": 2357 + }, + { + "epoch": 3.01824, + "grad_norm": 0.6251707077026367, + "learning_rate": 4.530212084833934e-05, + "loss": 0.612, + "step": 2358 + }, + { + "epoch": 3.01952, + "grad_norm": 0.6338829398155212, + "learning_rate": 4.530012004801921e-05, + "loss": 0.6327, + "step": 2359 + }, + { + "epoch": 3.0208, + "grad_norm": 0.6653558611869812, + "learning_rate": 4.529811924769908e-05, + "loss": 0.6595, + "step": 2360 + }, + { + "epoch": 3.02208, + "grad_norm": 0.677976131439209, + 
"learning_rate": 4.5296118447378954e-05, + "loss": 0.6325, + "step": 2361 + }, + { + "epoch": 3.02336, + "grad_norm": 0.6878952980041504, + "learning_rate": 4.5294117647058826e-05, + "loss": 0.7399, + "step": 2362 + }, + { + "epoch": 3.02464, + "grad_norm": 0.6632867455482483, + "learning_rate": 4.52921168467387e-05, + "loss": 0.7067, + "step": 2363 + }, + { + "epoch": 3.02592, + "grad_norm": 0.6564420461654663, + "learning_rate": 4.529011604641857e-05, + "loss": 0.6509, + "step": 2364 + }, + { + "epoch": 3.0272, + "grad_norm": 0.6492841243743896, + "learning_rate": 4.528811524609844e-05, + "loss": 0.6339, + "step": 2365 + }, + { + "epoch": 3.02848, + "grad_norm": 0.682312548160553, + "learning_rate": 4.5286114445778314e-05, + "loss": 0.6906, + "step": 2366 + }, + { + "epoch": 3.02976, + "grad_norm": 0.6361678242683411, + "learning_rate": 4.5284113645458186e-05, + "loss": 0.6743, + "step": 2367 + }, + { + "epoch": 3.03104, + "grad_norm": 0.6740372776985168, + "learning_rate": 4.528211284513806e-05, + "loss": 0.643, + "step": 2368 + }, + { + "epoch": 3.03232, + "grad_norm": 0.6516414880752563, + "learning_rate": 4.528011204481793e-05, + "loss": 0.6258, + "step": 2369 + }, + { + "epoch": 3.0336, + "grad_norm": 0.677844762802124, + "learning_rate": 4.52781112444978e-05, + "loss": 0.7175, + "step": 2370 + }, + { + "epoch": 3.03488, + "grad_norm": 0.6672567129135132, + "learning_rate": 4.527611044417767e-05, + "loss": 0.6363, + "step": 2371 + }, + { + "epoch": 3.03616, + "grad_norm": 0.6582069993019104, + "learning_rate": 4.5274109643857545e-05, + "loss": 0.6774, + "step": 2372 + }, + { + "epoch": 3.03744, + "grad_norm": 0.670803427696228, + "learning_rate": 4.527210884353742e-05, + "loss": 0.6809, + "step": 2373 + }, + { + "epoch": 3.03872, + "grad_norm": 0.6593008041381836, + "learning_rate": 4.527010804321729e-05, + "loss": 0.648, + "step": 2374 + }, + { + "epoch": 3.04, + "grad_norm": 0.6505773067474365, + "learning_rate": 4.526810724289716e-05, + "loss": 0.6273, + 
"step": 2375 + }, + { + "epoch": 3.04128, + "grad_norm": 0.7069177031517029, + "learning_rate": 4.526610644257703e-05, + "loss": 0.7197, + "step": 2376 + }, + { + "epoch": 3.04256, + "grad_norm": 0.6976633667945862, + "learning_rate": 4.5264105642256904e-05, + "loss": 0.6741, + "step": 2377 + }, + { + "epoch": 3.04384, + "grad_norm": 0.7404224276542664, + "learning_rate": 4.5262104841936776e-05, + "loss": 0.7145, + "step": 2378 + }, + { + "epoch": 3.04512, + "grad_norm": 0.6723030209541321, + "learning_rate": 4.526010404161665e-05, + "loss": 0.6488, + "step": 2379 + }, + { + "epoch": 3.0464, + "grad_norm": 0.689201831817627, + "learning_rate": 4.5258103241296527e-05, + "loss": 0.7129, + "step": 2380 + }, + { + "epoch": 3.04768, + "grad_norm": 0.7270947098731995, + "learning_rate": 4.525610244097639e-05, + "loss": 0.6891, + "step": 2381 + }, + { + "epoch": 3.04896, + "grad_norm": 0.7034469246864319, + "learning_rate": 4.5254101640656263e-05, + "loss": 0.7154, + "step": 2382 + }, + { + "epoch": 3.05024, + "grad_norm": 0.7317506074905396, + "learning_rate": 4.5252100840336135e-05, + "loss": 0.6951, + "step": 2383 + }, + { + "epoch": 3.05152, + "grad_norm": 0.6207937598228455, + "learning_rate": 4.525010004001601e-05, + "loss": 0.6367, + "step": 2384 + }, + { + "epoch": 3.0528, + "grad_norm": 0.6545078158378601, + "learning_rate": 4.524809923969588e-05, + "loss": 0.6703, + "step": 2385 + }, + { + "epoch": 3.05408, + "grad_norm": 0.6788381934165955, + "learning_rate": 4.524609843937575e-05, + "loss": 0.6538, + "step": 2386 + }, + { + "epoch": 3.05536, + "grad_norm": 0.64985191822052, + "learning_rate": 4.524409763905563e-05, + "loss": 0.681, + "step": 2387 + }, + { + "epoch": 3.05664, + "grad_norm": 0.6559796333312988, + "learning_rate": 4.52420968387355e-05, + "loss": 0.7017, + "step": 2388 + }, + { + "epoch": 3.05792, + "grad_norm": 0.677920937538147, + "learning_rate": 4.5240096038415366e-05, + "loss": 0.6879, + "step": 2389 + }, + { + "epoch": 3.0592, + "grad_norm": 
0.6159390211105347, + "learning_rate": 4.523809523809524e-05, + "loss": 0.6216, + "step": 2390 + }, + { + "epoch": 3.06048, + "grad_norm": 0.7163643836975098, + "learning_rate": 4.523609443777511e-05, + "loss": 0.7317, + "step": 2391 + }, + { + "epoch": 3.06176, + "grad_norm": 0.6356967091560364, + "learning_rate": 4.523409363745498e-05, + "loss": 0.6964, + "step": 2392 + }, + { + "epoch": 3.06304, + "grad_norm": 0.7133451700210571, + "learning_rate": 4.5232092837134854e-05, + "loss": 0.683, + "step": 2393 + }, + { + "epoch": 3.06432, + "grad_norm": 0.6646026372909546, + "learning_rate": 4.523009203681473e-05, + "loss": 0.6141, + "step": 2394 + }, + { + "epoch": 3.0656, + "grad_norm": 0.6448561549186707, + "learning_rate": 4.5228091236494604e-05, + "loss": 0.6628, + "step": 2395 + }, + { + "epoch": 3.06688, + "grad_norm": 0.6711647510528564, + "learning_rate": 4.5226090436174476e-05, + "loss": 0.6526, + "step": 2396 + }, + { + "epoch": 3.0681599999999998, + "grad_norm": 0.6663058996200562, + "learning_rate": 4.522408963585434e-05, + "loss": 0.6812, + "step": 2397 + }, + { + "epoch": 3.06944, + "grad_norm": 0.6636464595794678, + "learning_rate": 4.522208883553421e-05, + "loss": 0.6382, + "step": 2398 + }, + { + "epoch": 3.07072, + "grad_norm": 0.6658351421356201, + "learning_rate": 4.5220088035214085e-05, + "loss": 0.7137, + "step": 2399 + }, + { + "epoch": 3.072, + "grad_norm": 0.6860455870628357, + "learning_rate": 4.521808723489396e-05, + "loss": 0.679, + "step": 2400 + }, + { + "epoch": 3.07328, + "grad_norm": 0.6328697800636292, + "learning_rate": 4.5216086434573836e-05, + "loss": 0.6579, + "step": 2401 + }, + { + "epoch": 3.07456, + "grad_norm": 0.665577232837677, + "learning_rate": 4.521408563425371e-05, + "loss": 0.6864, + "step": 2402 + }, + { + "epoch": 3.07584, + "grad_norm": 0.6703231334686279, + "learning_rate": 4.521208483393358e-05, + "loss": 0.6518, + "step": 2403 + }, + { + "epoch": 3.07712, + "grad_norm": 0.7464866042137146, + "learning_rate": 
4.521008403361345e-05, + "loss": 0.7464, + "step": 2404 + }, + { + "epoch": 3.0784, + "grad_norm": 0.6916598677635193, + "learning_rate": 4.5208083233293316e-05, + "loss": 0.6668, + "step": 2405 + }, + { + "epoch": 3.07968, + "grad_norm": 0.7007398009300232, + "learning_rate": 4.520608243297319e-05, + "loss": 0.6506, + "step": 2406 + }, + { + "epoch": 3.08096, + "grad_norm": 0.6712905168533325, + "learning_rate": 4.520408163265306e-05, + "loss": 0.6568, + "step": 2407 + }, + { + "epoch": 3.08224, + "grad_norm": 0.687311053276062, + "learning_rate": 4.520208083233294e-05, + "loss": 0.6959, + "step": 2408 + }, + { + "epoch": 3.08352, + "grad_norm": 0.6788797378540039, + "learning_rate": 4.520008003201281e-05, + "loss": 0.6703, + "step": 2409 + }, + { + "epoch": 3.0848, + "grad_norm": 0.6620363593101501, + "learning_rate": 4.519807923169268e-05, + "loss": 0.6754, + "step": 2410 + }, + { + "epoch": 3.08608, + "grad_norm": 0.6966630220413208, + "learning_rate": 4.5196078431372554e-05, + "loss": 0.7254, + "step": 2411 + }, + { + "epoch": 3.08736, + "grad_norm": 0.6657626032829285, + "learning_rate": 4.5194077631052426e-05, + "loss": 0.6667, + "step": 2412 + }, + { + "epoch": 3.08864, + "grad_norm": 0.6981019377708435, + "learning_rate": 4.519207683073229e-05, + "loss": 0.6876, + "step": 2413 + }, + { + "epoch": 3.08992, + "grad_norm": 0.7262740731239319, + "learning_rate": 4.519007603041216e-05, + "loss": 0.7387, + "step": 2414 + }, + { + "epoch": 3.0912, + "grad_norm": 0.7206056714057922, + "learning_rate": 4.5188075230092035e-05, + "loss": 0.7254, + "step": 2415 + }, + { + "epoch": 3.09248, + "grad_norm": 0.700833261013031, + "learning_rate": 4.518607442977191e-05, + "loss": 0.6647, + "step": 2416 + }, + { + "epoch": 3.09376, + "grad_norm": 0.6799070239067078, + "learning_rate": 4.5184073629451785e-05, + "loss": 0.6117, + "step": 2417 + }, + { + "epoch": 3.09504, + "grad_norm": 0.7079071998596191, + "learning_rate": 4.518207282913166e-05, + "loss": 0.714, + "step": 
2418 + }, + { + "epoch": 3.09632, + "grad_norm": 0.7541791200637817, + "learning_rate": 4.518007202881153e-05, + "loss": 0.7597, + "step": 2419 + }, + { + "epoch": 3.0976, + "grad_norm": 0.7132668495178223, + "learning_rate": 4.51780712284914e-05, + "loss": 0.6674, + "step": 2420 + }, + { + "epoch": 3.09888, + "grad_norm": 0.6781483292579651, + "learning_rate": 4.5176070428171266e-05, + "loss": 0.7066, + "step": 2421 + }, + { + "epoch": 3.10016, + "grad_norm": 0.6593953371047974, + "learning_rate": 4.517406962785114e-05, + "loss": 0.6443, + "step": 2422 + }, + { + "epoch": 3.10144, + "grad_norm": 0.6513901948928833, + "learning_rate": 4.5172068827531016e-05, + "loss": 0.6421, + "step": 2423 + }, + { + "epoch": 3.10272, + "grad_norm": 0.6750826239585876, + "learning_rate": 4.517006802721089e-05, + "loss": 0.6934, + "step": 2424 + }, + { + "epoch": 3.104, + "grad_norm": 0.6588460803031921, + "learning_rate": 4.516806722689076e-05, + "loss": 0.6222, + "step": 2425 + }, + { + "epoch": 3.10528, + "grad_norm": 0.6957754492759705, + "learning_rate": 4.516606642657063e-05, + "loss": 0.7149, + "step": 2426 + }, + { + "epoch": 3.10656, + "grad_norm": 0.644356906414032, + "learning_rate": 4.5164065626250504e-05, + "loss": 0.6543, + "step": 2427 + }, + { + "epoch": 3.10784, + "grad_norm": 0.65400230884552, + "learning_rate": 4.5162064825930376e-05, + "loss": 0.6291, + "step": 2428 + }, + { + "epoch": 3.10912, + "grad_norm": 0.6292628049850464, + "learning_rate": 4.516006402561024e-05, + "loss": 0.6608, + "step": 2429 + }, + { + "epoch": 3.1104, + "grad_norm": 0.6592010855674744, + "learning_rate": 4.515806322529012e-05, + "loss": 0.6669, + "step": 2430 + }, + { + "epoch": 3.11168, + "grad_norm": 0.6952319145202637, + "learning_rate": 4.515606242496999e-05, + "loss": 0.6414, + "step": 2431 + }, + { + "epoch": 3.11296, + "grad_norm": 0.679885745048523, + "learning_rate": 4.515406162464986e-05, + "loss": 0.6805, + "step": 2432 + }, + { + "epoch": 3.11424, + "grad_norm": 
0.6636596918106079, + "learning_rate": 4.5152060824329735e-05, + "loss": 0.673, + "step": 2433 + }, + { + "epoch": 3.11552, + "grad_norm": 0.6836968660354614, + "learning_rate": 4.515006002400961e-05, + "loss": 0.6919, + "step": 2434 + }, + { + "epoch": 3.1168, + "grad_norm": 0.6769798994064331, + "learning_rate": 4.514805922368948e-05, + "loss": 0.6437, + "step": 2435 + }, + { + "epoch": 3.11808, + "grad_norm": 0.6906677484512329, + "learning_rate": 4.514605842336935e-05, + "loss": 0.6757, + "step": 2436 + }, + { + "epoch": 3.11936, + "grad_norm": 0.6719872951507568, + "learning_rate": 4.514405762304922e-05, + "loss": 0.7031, + "step": 2437 + }, + { + "epoch": 3.12064, + "grad_norm": 0.667978048324585, + "learning_rate": 4.5142056822729094e-05, + "loss": 0.6664, + "step": 2438 + }, + { + "epoch": 3.12192, + "grad_norm": 0.6768871545791626, + "learning_rate": 4.5140056022408966e-05, + "loss": 0.6375, + "step": 2439 + }, + { + "epoch": 3.1232, + "grad_norm": 0.6770617961883545, + "learning_rate": 4.513805522208884e-05, + "loss": 0.7146, + "step": 2440 + }, + { + "epoch": 3.12448, + "grad_norm": 0.7039771676063538, + "learning_rate": 4.513605442176871e-05, + "loss": 0.6729, + "step": 2441 + }, + { + "epoch": 3.12576, + "grad_norm": 0.7217344641685486, + "learning_rate": 4.513405362144858e-05, + "loss": 0.7069, + "step": 2442 + }, + { + "epoch": 3.12704, + "grad_norm": 0.6702495217323303, + "learning_rate": 4.5132052821128454e-05, + "loss": 0.707, + "step": 2443 + }, + { + "epoch": 3.12832, + "grad_norm": 0.6823554635047913, + "learning_rate": 4.5130052020808325e-05, + "loss": 0.6586, + "step": 2444 + }, + { + "epoch": 3.1296, + "grad_norm": 0.6816961765289307, + "learning_rate": 4.51280512204882e-05, + "loss": 0.716, + "step": 2445 + }, + { + "epoch": 3.13088, + "grad_norm": 0.6727588176727295, + "learning_rate": 4.512605042016807e-05, + "loss": 0.6628, + "step": 2446 + }, + { + "epoch": 3.13216, + "grad_norm": 0.6545873284339905, + "learning_rate": 
4.512404961984794e-05, + "loss": 0.6654, + "step": 2447 + }, + { + "epoch": 3.1334400000000002, + "grad_norm": 0.6535388827323914, + "learning_rate": 4.512204881952781e-05, + "loss": 0.6513, + "step": 2448 + }, + { + "epoch": 3.13472, + "grad_norm": 0.6998023986816406, + "learning_rate": 4.5120048019207685e-05, + "loss": 0.6721, + "step": 2449 + }, + { + "epoch": 3.136, + "grad_norm": 0.7023734450340271, + "learning_rate": 4.5118047218887557e-05, + "loss": 0.705, + "step": 2450 + }, + { + "epoch": 3.13728, + "grad_norm": 0.6585432291030884, + "learning_rate": 4.511604641856743e-05, + "loss": 0.6781, + "step": 2451 + }, + { + "epoch": 3.13856, + "grad_norm": 0.6865501403808594, + "learning_rate": 4.51140456182473e-05, + "loss": 0.6667, + "step": 2452 + }, + { + "epoch": 3.13984, + "grad_norm": 0.6533448100090027, + "learning_rate": 4.511204481792717e-05, + "loss": 0.6903, + "step": 2453 + }, + { + "epoch": 3.14112, + "grad_norm": 0.6679571270942688, + "learning_rate": 4.5110044017607044e-05, + "loss": 0.6596, + "step": 2454 + }, + { + "epoch": 3.1424, + "grad_norm": 0.6311309337615967, + "learning_rate": 4.5108043217286916e-05, + "loss": 0.6224, + "step": 2455 + }, + { + "epoch": 3.14368, + "grad_norm": 0.6664997935295105, + "learning_rate": 4.510604241696679e-05, + "loss": 0.6438, + "step": 2456 + }, + { + "epoch": 3.14496, + "grad_norm": 0.6539314389228821, + "learning_rate": 4.510404161664666e-05, + "loss": 0.6767, + "step": 2457 + }, + { + "epoch": 3.14624, + "grad_norm": 0.6571037769317627, + "learning_rate": 4.510204081632654e-05, + "loss": 0.6322, + "step": 2458 + }, + { + "epoch": 3.14752, + "grad_norm": 0.6855980753898621, + "learning_rate": 4.51000400160064e-05, + "loss": 0.7187, + "step": 2459 + }, + { + "epoch": 3.1488, + "grad_norm": 0.6939951181411743, + "learning_rate": 4.5098039215686275e-05, + "loss": 0.7095, + "step": 2460 + }, + { + "epoch": 3.15008, + "grad_norm": 0.6432140469551086, + "learning_rate": 4.509603841536615e-05, + "loss": 0.6465, + 
"step": 2461 + }, + { + "epoch": 3.15136, + "grad_norm": 0.670173704624176, + "learning_rate": 4.509403761504602e-05, + "loss": 0.6325, + "step": 2462 + }, + { + "epoch": 3.15264, + "grad_norm": 0.6773591041564941, + "learning_rate": 4.509203681472589e-05, + "loss": 0.6375, + "step": 2463 + }, + { + "epoch": 3.15392, + "grad_norm": 0.6705629229545593, + "learning_rate": 4.509003601440576e-05, + "loss": 0.6594, + "step": 2464 + }, + { + "epoch": 3.1552, + "grad_norm": 0.7062819004058838, + "learning_rate": 4.508803521408564e-05, + "loss": 0.6698, + "step": 2465 + }, + { + "epoch": 3.15648, + "grad_norm": 0.7161005139350891, + "learning_rate": 4.508603441376551e-05, + "loss": 0.643, + "step": 2466 + }, + { + "epoch": 3.15776, + "grad_norm": 0.6911036968231201, + "learning_rate": 4.508403361344538e-05, + "loss": 0.6432, + "step": 2467 + }, + { + "epoch": 3.15904, + "grad_norm": 0.6515189409255981, + "learning_rate": 4.508203281312525e-05, + "loss": 0.6367, + "step": 2468 + }, + { + "epoch": 3.16032, + "grad_norm": 0.6516139507293701, + "learning_rate": 4.508003201280512e-05, + "loss": 0.6113, + "step": 2469 + }, + { + "epoch": 3.1616, + "grad_norm": 0.687893807888031, + "learning_rate": 4.5078031212484994e-05, + "loss": 0.6723, + "step": 2470 + }, + { + "epoch": 3.16288, + "grad_norm": 0.6721757650375366, + "learning_rate": 4.5076030412164866e-05, + "loss": 0.6546, + "step": 2471 + }, + { + "epoch": 3.16416, + "grad_norm": 0.67435622215271, + "learning_rate": 4.5074029611844744e-05, + "loss": 0.6481, + "step": 2472 + }, + { + "epoch": 3.16544, + "grad_norm": 0.6942387223243713, + "learning_rate": 4.5072028811524616e-05, + "loss": 0.7223, + "step": 2473 + }, + { + "epoch": 3.16672, + "grad_norm": 0.717036247253418, + "learning_rate": 4.507002801120449e-05, + "loss": 0.6983, + "step": 2474 + }, + { + "epoch": 3.168, + "grad_norm": 0.6933386325836182, + "learning_rate": 4.506802721088435e-05, + "loss": 0.6512, + "step": 2475 + }, + { + "epoch": 3.16928, + "grad_norm": 
0.6787186861038208, + "learning_rate": 4.5066026410564225e-05, + "loss": 0.7028, + "step": 2476 + }, + { + "epoch": 3.17056, + "grad_norm": 0.6807264089584351, + "learning_rate": 4.50640256102441e-05, + "loss": 0.6713, + "step": 2477 + }, + { + "epoch": 3.17184, + "grad_norm": 0.6492279767990112, + "learning_rate": 4.506202480992397e-05, + "loss": 0.6451, + "step": 2478 + }, + { + "epoch": 3.17312, + "grad_norm": 0.6500716209411621, + "learning_rate": 4.506002400960385e-05, + "loss": 0.6752, + "step": 2479 + }, + { + "epoch": 3.1744, + "grad_norm": 0.7048171162605286, + "learning_rate": 4.505802320928372e-05, + "loss": 0.73, + "step": 2480 + }, + { + "epoch": 3.17568, + "grad_norm": 0.709684431552887, + "learning_rate": 4.505602240896359e-05, + "loss": 0.6861, + "step": 2481 + }, + { + "epoch": 3.1769600000000002, + "grad_norm": 0.6857057213783264, + "learning_rate": 4.505402160864346e-05, + "loss": 0.6706, + "step": 2482 + }, + { + "epoch": 3.17824, + "grad_norm": 0.7002062797546387, + "learning_rate": 4.505202080832333e-05, + "loss": 0.735, + "step": 2483 + }, + { + "epoch": 3.17952, + "grad_norm": 0.6401265859603882, + "learning_rate": 4.50500200080032e-05, + "loss": 0.6662, + "step": 2484 + }, + { + "epoch": 3.1808, + "grad_norm": 0.6491225361824036, + "learning_rate": 4.504801920768307e-05, + "loss": 0.6084, + "step": 2485 + }, + { + "epoch": 3.18208, + "grad_norm": 0.7253592014312744, + "learning_rate": 4.504601840736295e-05, + "loss": 0.6837, + "step": 2486 + }, + { + "epoch": 3.18336, + "grad_norm": 0.6980118751525879, + "learning_rate": 4.504401760704282e-05, + "loss": 0.663, + "step": 2487 + }, + { + "epoch": 3.18464, + "grad_norm": 0.6534168124198914, + "learning_rate": 4.5042016806722694e-05, + "loss": 0.633, + "step": 2488 + }, + { + "epoch": 3.18592, + "grad_norm": 0.7110708355903625, + "learning_rate": 4.5040016006402566e-05, + "loss": 0.6896, + "step": 2489 + }, + { + "epoch": 3.1872, + "grad_norm": 0.7201905846595764, + "learning_rate": 
4.503801520608244e-05, + "loss": 0.6583, + "step": 2490 + }, + { + "epoch": 3.18848, + "grad_norm": 0.7668898701667786, + "learning_rate": 4.50360144057623e-05, + "loss": 0.7062, + "step": 2491 + }, + { + "epoch": 3.18976, + "grad_norm": 0.6760601997375488, + "learning_rate": 4.5034013605442174e-05, + "loss": 0.6544, + "step": 2492 + }, + { + "epoch": 3.19104, + "grad_norm": 0.7329314351081848, + "learning_rate": 4.503201280512205e-05, + "loss": 0.6991, + "step": 2493 + }, + { + "epoch": 3.19232, + "grad_norm": 0.7096596956253052, + "learning_rate": 4.5030012004801925e-05, + "loss": 0.7134, + "step": 2494 + }, + { + "epoch": 3.1936, + "grad_norm": 0.688031017780304, + "learning_rate": 4.50280112044818e-05, + "loss": 0.6861, + "step": 2495 + }, + { + "epoch": 3.19488, + "grad_norm": 0.6957057118415833, + "learning_rate": 4.502601040416167e-05, + "loss": 0.6978, + "step": 2496 + }, + { + "epoch": 3.19616, + "grad_norm": 0.6752075552940369, + "learning_rate": 4.502400960384154e-05, + "loss": 0.6878, + "step": 2497 + }, + { + "epoch": 3.19744, + "grad_norm": 0.6884586811065674, + "learning_rate": 4.502200880352141e-05, + "loss": 0.6982, + "step": 2498 + }, + { + "epoch": 3.19872, + "grad_norm": 0.6521093249320984, + "learning_rate": 4.502000800320128e-05, + "loss": 0.6606, + "step": 2499 + }, + { + "epoch": 3.2, + "grad_norm": 0.6644425392150879, + "learning_rate": 4.5018007202881156e-05, + "loss": 0.6753, + "step": 2500 + }, + { + "epoch": 3.20128, + "grad_norm": 0.6681901216506958, + "learning_rate": 4.501600640256103e-05, + "loss": 0.6476, + "step": 2501 + }, + { + "epoch": 3.20256, + "grad_norm": 0.6761760711669922, + "learning_rate": 4.50140056022409e-05, + "loss": 0.7179, + "step": 2502 + }, + { + "epoch": 3.20384, + "grad_norm": 0.6894738078117371, + "learning_rate": 4.501200480192077e-05, + "loss": 0.6737, + "step": 2503 + }, + { + "epoch": 3.20512, + "grad_norm": 0.714769184589386, + "learning_rate": 4.5010004001600644e-05, + "loss": 0.6867, + "step": 2504 + 
}, + { + "epoch": 3.2064, + "grad_norm": 0.6950749754905701, + "learning_rate": 4.5008003201280515e-05, + "loss": 0.6284, + "step": 2505 + }, + { + "epoch": 3.20768, + "grad_norm": 0.7155801057815552, + "learning_rate": 4.500600240096039e-05, + "loss": 0.7747, + "step": 2506 + }, + { + "epoch": 3.20896, + "grad_norm": 0.6814357042312622, + "learning_rate": 4.500400160064026e-05, + "loss": 0.6696, + "step": 2507 + }, + { + "epoch": 3.21024, + "grad_norm": 0.6706905364990234, + "learning_rate": 4.500200080032013e-05, + "loss": 0.6327, + "step": 2508 + }, + { + "epoch": 3.21152, + "grad_norm": 0.6843993067741394, + "learning_rate": 4.5e-05, + "loss": 0.6887, + "step": 2509 + }, + { + "epoch": 3.2128, + "grad_norm": 0.690980076789856, + "learning_rate": 4.4997999199679875e-05, + "loss": 0.6454, + "step": 2510 + }, + { + "epoch": 3.21408, + "grad_norm": 0.6931962370872498, + "learning_rate": 4.4995998399359747e-05, + "loss": 0.7268, + "step": 2511 + }, + { + "epoch": 3.21536, + "grad_norm": 0.6837199926376343, + "learning_rate": 4.499399759903962e-05, + "loss": 0.6747, + "step": 2512 + }, + { + "epoch": 3.21664, + "grad_norm": 0.6582213640213013, + "learning_rate": 4.499199679871949e-05, + "loss": 0.6761, + "step": 2513 + }, + { + "epoch": 3.21792, + "grad_norm": 0.6401382088661194, + "learning_rate": 4.498999599839936e-05, + "loss": 0.6742, + "step": 2514 + }, + { + "epoch": 3.2192, + "grad_norm": 0.6773951053619385, + "learning_rate": 4.4987995198079234e-05, + "loss": 0.6328, + "step": 2515 + }, + { + "epoch": 3.2204800000000002, + "grad_norm": 0.7156820893287659, + "learning_rate": 4.4985994397759106e-05, + "loss": 0.7116, + "step": 2516 + }, + { + "epoch": 3.22176, + "grad_norm": 0.6766327619552612, + "learning_rate": 4.498399359743898e-05, + "loss": 0.66, + "step": 2517 + }, + { + "epoch": 3.22304, + "grad_norm": 0.6747710704803467, + "learning_rate": 4.498199279711885e-05, + "loss": 0.6555, + "step": 2518 + }, + { + "epoch": 3.22432, + "grad_norm": 
0.6878660917282104, + "learning_rate": 4.497999199679872e-05, + "loss": 0.7085, + "step": 2519 + }, + { + "epoch": 3.2256, + "grad_norm": 0.7145971655845642, + "learning_rate": 4.497799119647859e-05, + "loss": 0.65, + "step": 2520 + }, + { + "epoch": 3.22688, + "grad_norm": 0.664560854434967, + "learning_rate": 4.4975990396158465e-05, + "loss": 0.692, + "step": 2521 + }, + { + "epoch": 3.22816, + "grad_norm": 0.6799471378326416, + "learning_rate": 4.497398959583834e-05, + "loss": 0.7305, + "step": 2522 + }, + { + "epoch": 3.22944, + "grad_norm": 0.7038140892982483, + "learning_rate": 4.497198879551821e-05, + "loss": 0.6561, + "step": 2523 + }, + { + "epoch": 3.23072, + "grad_norm": 0.6589545607566833, + "learning_rate": 4.496998799519808e-05, + "loss": 0.6906, + "step": 2524 + }, + { + "epoch": 3.232, + "grad_norm": 0.6808072924613953, + "learning_rate": 4.496798719487795e-05, + "loss": 0.652, + "step": 2525 + }, + { + "epoch": 3.23328, + "grad_norm": 0.6572796702384949, + "learning_rate": 4.4965986394557824e-05, + "loss": 0.6557, + "step": 2526 + }, + { + "epoch": 3.23456, + "grad_norm": 0.6141901016235352, + "learning_rate": 4.4963985594237696e-05, + "loss": 0.6165, + "step": 2527 + }, + { + "epoch": 3.23584, + "grad_norm": 0.6819775104522705, + "learning_rate": 4.496198479391757e-05, + "loss": 0.6397, + "step": 2528 + }, + { + "epoch": 3.23712, + "grad_norm": 0.6745738387107849, + "learning_rate": 4.495998399359744e-05, + "loss": 0.6836, + "step": 2529 + }, + { + "epoch": 3.2384, + "grad_norm": 0.7165648937225342, + "learning_rate": 4.495798319327731e-05, + "loss": 0.7437, + "step": 2530 + }, + { + "epoch": 3.23968, + "grad_norm": 0.6725075840950012, + "learning_rate": 4.4955982392957184e-05, + "loss": 0.6651, + "step": 2531 + }, + { + "epoch": 3.24096, + "grad_norm": 0.6513887643814087, + "learning_rate": 4.4953981592637056e-05, + "loss": 0.6218, + "step": 2532 + }, + { + "epoch": 3.24224, + "grad_norm": 0.7130393385887146, + "learning_rate": 
4.495198079231693e-05, + "loss": 0.7008, + "step": 2533 + }, + { + "epoch": 3.24352, + "grad_norm": 0.7220726013183594, + "learning_rate": 4.49499799919968e-05, + "loss": 0.7437, + "step": 2534 + }, + { + "epoch": 3.2448, + "grad_norm": 0.6959324479103088, + "learning_rate": 4.494797919167667e-05, + "loss": 0.6412, + "step": 2535 + }, + { + "epoch": 3.24608, + "grad_norm": 0.6925476789474487, + "learning_rate": 4.494597839135655e-05, + "loss": 0.671, + "step": 2536 + }, + { + "epoch": 3.24736, + "grad_norm": 0.6907139420509338, + "learning_rate": 4.4943977591036415e-05, + "loss": 0.7017, + "step": 2537 + }, + { + "epoch": 3.24864, + "grad_norm": 0.7118213176727295, + "learning_rate": 4.494197679071629e-05, + "loss": 0.6902, + "step": 2538 + }, + { + "epoch": 3.24992, + "grad_norm": 0.6986705660820007, + "learning_rate": 4.493997599039616e-05, + "loss": 0.6936, + "step": 2539 + }, + { + "epoch": 3.2512, + "grad_norm": 0.6764949560165405, + "learning_rate": 4.493797519007603e-05, + "loss": 0.6506, + "step": 2540 + }, + { + "epoch": 3.25248, + "grad_norm": 0.6302717328071594, + "learning_rate": 4.49359743897559e-05, + "loss": 0.6249, + "step": 2541 + }, + { + "epoch": 3.2537599999999998, + "grad_norm": 0.6727375388145447, + "learning_rate": 4.4933973589435774e-05, + "loss": 0.6644, + "step": 2542 + }, + { + "epoch": 3.25504, + "grad_norm": 0.6621202826499939, + "learning_rate": 4.493197278911565e-05, + "loss": 0.6559, + "step": 2543 + }, + { + "epoch": 3.25632, + "grad_norm": 0.67811119556427, + "learning_rate": 4.4929971988795525e-05, + "loss": 0.6871, + "step": 2544 + }, + { + "epoch": 3.2576, + "grad_norm": 0.7005154490470886, + "learning_rate": 4.492797118847539e-05, + "loss": 0.6732, + "step": 2545 + }, + { + "epoch": 3.25888, + "grad_norm": 0.6961964964866638, + "learning_rate": 4.492597038815526e-05, + "loss": 0.6727, + "step": 2546 + }, + { + "epoch": 3.26016, + "grad_norm": 0.6468445062637329, + "learning_rate": 4.4923969587835133e-05, + "loss": 0.6521, + 
"step": 2547 + }, + { + "epoch": 3.26144, + "grad_norm": 0.6833783984184265, + "learning_rate": 4.4921968787515005e-05, + "loss": 0.6713, + "step": 2548 + }, + { + "epoch": 3.26272, + "grad_norm": 0.6983469724655151, + "learning_rate": 4.491996798719488e-05, + "loss": 0.7341, + "step": 2549 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 0.6976143717765808, + "learning_rate": 4.4917967186874756e-05, + "loss": 0.6902, + "step": 2550 + }, + { + "epoch": 3.26528, + "grad_norm": 0.6914013028144836, + "learning_rate": 4.491596638655463e-05, + "loss": 0.6953, + "step": 2551 + }, + { + "epoch": 3.26656, + "grad_norm": 0.6625963449478149, + "learning_rate": 4.49139655862345e-05, + "loss": 0.6624, + "step": 2552 + }, + { + "epoch": 3.26784, + "grad_norm": 0.6785646677017212, + "learning_rate": 4.4911964785914365e-05, + "loss": 0.6387, + "step": 2553 + }, + { + "epoch": 3.26912, + "grad_norm": 0.6610428094863892, + "learning_rate": 4.4909963985594236e-05, + "loss": 0.6896, + "step": 2554 + }, + { + "epoch": 3.2704, + "grad_norm": 0.7095894813537598, + "learning_rate": 4.490796318527411e-05, + "loss": 0.6717, + "step": 2555 + }, + { + "epoch": 3.27168, + "grad_norm": 0.6596770286560059, + "learning_rate": 4.490596238495398e-05, + "loss": 0.627, + "step": 2556 + }, + { + "epoch": 3.27296, + "grad_norm": 0.6432579755783081, + "learning_rate": 4.490396158463386e-05, + "loss": 0.6196, + "step": 2557 + }, + { + "epoch": 3.27424, + "grad_norm": 0.6961793303489685, + "learning_rate": 4.490196078431373e-05, + "loss": 0.6624, + "step": 2558 + }, + { + "epoch": 3.27552, + "grad_norm": 0.6829546093940735, + "learning_rate": 4.48999599839936e-05, + "loss": 0.6567, + "step": 2559 + }, + { + "epoch": 3.2768, + "grad_norm": 0.7708462476730347, + "learning_rate": 4.4897959183673474e-05, + "loss": 0.7334, + "step": 2560 + }, + { + "epoch": 3.27808, + "grad_norm": 0.6607860326766968, + "learning_rate": 4.489595838335334e-05, + "loss": 0.6581, + "step": 2561 + }, + { + "epoch": 3.27936, 
+ "grad_norm": 0.6741865873336792, + "learning_rate": 4.489395758303321e-05, + "loss": 0.6928, + "step": 2562 + }, + { + "epoch": 3.28064, + "grad_norm": 0.6427492499351501, + "learning_rate": 4.489195678271308e-05, + "loss": 0.6294, + "step": 2563 + }, + { + "epoch": 3.28192, + "grad_norm": 0.6945633292198181, + "learning_rate": 4.488995598239296e-05, + "loss": 0.7027, + "step": 2564 + }, + { + "epoch": 3.2832, + "grad_norm": 0.6338651776313782, + "learning_rate": 4.4887955182072834e-05, + "loss": 0.6274, + "step": 2565 + }, + { + "epoch": 3.28448, + "grad_norm": 0.6843262314796448, + "learning_rate": 4.4885954381752705e-05, + "loss": 0.6814, + "step": 2566 + }, + { + "epoch": 3.28576, + "grad_norm": 0.6916020512580872, + "learning_rate": 4.488395358143258e-05, + "loss": 0.7082, + "step": 2567 + }, + { + "epoch": 3.28704, + "grad_norm": 0.6470593214035034, + "learning_rate": 4.488195278111245e-05, + "loss": 0.6058, + "step": 2568 + }, + { + "epoch": 3.28832, + "grad_norm": 0.7212356328964233, + "learning_rate": 4.4879951980792314e-05, + "loss": 0.6994, + "step": 2569 + }, + { + "epoch": 3.2896, + "grad_norm": 0.6888067722320557, + "learning_rate": 4.4877951180472186e-05, + "loss": 0.6621, + "step": 2570 + }, + { + "epoch": 3.29088, + "grad_norm": 0.7158266305923462, + "learning_rate": 4.4875950380152065e-05, + "loss": 0.6782, + "step": 2571 + }, + { + "epoch": 3.29216, + "grad_norm": 0.6973638534545898, + "learning_rate": 4.4873949579831937e-05, + "loss": 0.7213, + "step": 2572 + }, + { + "epoch": 3.29344, + "grad_norm": 0.7036058306694031, + "learning_rate": 4.487194877951181e-05, + "loss": 0.6762, + "step": 2573 + }, + { + "epoch": 3.29472, + "grad_norm": 0.6940634846687317, + "learning_rate": 4.486994797919168e-05, + "loss": 0.6541, + "step": 2574 + }, + { + "epoch": 3.296, + "grad_norm": 0.7082141041755676, + "learning_rate": 4.486794717887155e-05, + "loss": 0.6707, + "step": 2575 + }, + { + "epoch": 3.2972799999999998, + "grad_norm": 0.6922471523284912, + 
"learning_rate": 4.4865946378551424e-05, + "loss": 0.6843, + "step": 2576 + }, + { + "epoch": 3.29856, + "grad_norm": 0.6800956130027771, + "learning_rate": 4.486394557823129e-05, + "loss": 0.6028, + "step": 2577 + }, + { + "epoch": 3.29984, + "grad_norm": 0.6691498160362244, + "learning_rate": 4.486194477791117e-05, + "loss": 0.6706, + "step": 2578 + }, + { + "epoch": 3.30112, + "grad_norm": 0.6471302509307861, + "learning_rate": 4.485994397759104e-05, + "loss": 0.6753, + "step": 2579 + }, + { + "epoch": 3.3024, + "grad_norm": 0.6529038548469543, + "learning_rate": 4.485794317727091e-05, + "loss": 0.6006, + "step": 2580 + }, + { + "epoch": 3.30368, + "grad_norm": 0.6317946314811707, + "learning_rate": 4.485594237695078e-05, + "loss": 0.6842, + "step": 2581 + }, + { + "epoch": 3.30496, + "grad_norm": 0.6559657454490662, + "learning_rate": 4.4853941576630655e-05, + "loss": 0.6628, + "step": 2582 + }, + { + "epoch": 3.30624, + "grad_norm": 0.6486949324607849, + "learning_rate": 4.485194077631053e-05, + "loss": 0.6775, + "step": 2583 + }, + { + "epoch": 3.3075200000000002, + "grad_norm": 0.6692925691604614, + "learning_rate": 4.48499399759904e-05, + "loss": 0.6721, + "step": 2584 + }, + { + "epoch": 3.3088, + "grad_norm": 0.6672170162200928, + "learning_rate": 4.484793917567027e-05, + "loss": 0.6424, + "step": 2585 + }, + { + "epoch": 3.31008, + "grad_norm": 0.668408215045929, + "learning_rate": 4.484593837535014e-05, + "loss": 0.6445, + "step": 2586 + }, + { + "epoch": 3.31136, + "grad_norm": 0.6862883567810059, + "learning_rate": 4.4843937575030014e-05, + "loss": 0.6994, + "step": 2587 + }, + { + "epoch": 3.31264, + "grad_norm": 0.6536902189254761, + "learning_rate": 4.4841936774709886e-05, + "loss": 0.6149, + "step": 2588 + }, + { + "epoch": 3.31392, + "grad_norm": 0.6626643538475037, + "learning_rate": 4.483993597438976e-05, + "loss": 0.6631, + "step": 2589 + }, + { + "epoch": 3.3152, + "grad_norm": 0.6667909622192383, + "learning_rate": 4.483793517406963e-05, + 
"loss": 0.6781, + "step": 2590 + }, + { + "epoch": 3.31648, + "grad_norm": 0.6631030440330505, + "learning_rate": 4.48359343737495e-05, + "loss": 0.6586, + "step": 2591 + }, + { + "epoch": 3.31776, + "grad_norm": 0.6788957118988037, + "learning_rate": 4.4833933573429374e-05, + "loss": 0.6355, + "step": 2592 + }, + { + "epoch": 3.31904, + "grad_norm": 0.7192303538322449, + "learning_rate": 4.4831932773109246e-05, + "loss": 0.6597, + "step": 2593 + }, + { + "epoch": 3.32032, + "grad_norm": 0.7045040726661682, + "learning_rate": 4.482993197278912e-05, + "loss": 0.7322, + "step": 2594 + }, + { + "epoch": 3.3216, + "grad_norm": 0.6571533679962158, + "learning_rate": 4.482793117246899e-05, + "loss": 0.6443, + "step": 2595 + }, + { + "epoch": 3.32288, + "grad_norm": 0.6708471775054932, + "learning_rate": 4.482593037214886e-05, + "loss": 0.6391, + "step": 2596 + }, + { + "epoch": 3.32416, + "grad_norm": 0.6779941916465759, + "learning_rate": 4.482392957182873e-05, + "loss": 0.6226, + "step": 2597 + }, + { + "epoch": 3.32544, + "grad_norm": 0.7229152321815491, + "learning_rate": 4.4821928771508605e-05, + "loss": 0.6434, + "step": 2598 + }, + { + "epoch": 3.32672, + "grad_norm": 0.7014409303665161, + "learning_rate": 4.481992797118848e-05, + "loss": 0.7082, + "step": 2599 + }, + { + "epoch": 3.328, + "grad_norm": 0.6607677340507507, + "learning_rate": 4.481792717086835e-05, + "loss": 0.6574, + "step": 2600 + }, + { + "epoch": 3.32928, + "grad_norm": 0.7238780856132507, + "learning_rate": 4.481592637054822e-05, + "loss": 0.6938, + "step": 2601 + }, + { + "epoch": 3.33056, + "grad_norm": 0.6686570048332214, + "learning_rate": 4.481392557022809e-05, + "loss": 0.6405, + "step": 2602 + }, + { + "epoch": 3.33184, + "grad_norm": 0.708084225654602, + "learning_rate": 4.4811924769907964e-05, + "loss": 0.6745, + "step": 2603 + }, + { + "epoch": 3.33312, + "grad_norm": 0.684820830821991, + "learning_rate": 4.4809923969587836e-05, + "loss": 0.6385, + "step": 2604 + }, + { + "epoch": 
3.3344, + "grad_norm": 0.6804541945457458, + "learning_rate": 4.480792316926771e-05, + "loss": 0.677, + "step": 2605 + }, + { + "epoch": 3.33568, + "grad_norm": 0.6551414728164673, + "learning_rate": 4.4805922368947586e-05, + "loss": 0.6381, + "step": 2606 + }, + { + "epoch": 3.33696, + "grad_norm": 0.7107576131820679, + "learning_rate": 4.480392156862745e-05, + "loss": 0.7287, + "step": 2607 + }, + { + "epoch": 3.33824, + "grad_norm": 0.6644242405891418, + "learning_rate": 4.4801920768307323e-05, + "loss": 0.6999, + "step": 2608 + }, + { + "epoch": 3.33952, + "grad_norm": 0.6727672815322876, + "learning_rate": 4.4799919967987195e-05, + "loss": 0.6438, + "step": 2609 + }, + { + "epoch": 3.3407999999999998, + "grad_norm": 0.67037433385849, + "learning_rate": 4.479791916766707e-05, + "loss": 0.6282, + "step": 2610 + }, + { + "epoch": 3.34208, + "grad_norm": 0.650492250919342, + "learning_rate": 4.479591836734694e-05, + "loss": 0.6333, + "step": 2611 + }, + { + "epoch": 3.34336, + "grad_norm": 0.6534383893013, + "learning_rate": 4.479391756702681e-05, + "loss": 0.671, + "step": 2612 + }, + { + "epoch": 3.34464, + "grad_norm": 0.6915240287780762, + "learning_rate": 4.479191676670669e-05, + "loss": 0.6673, + "step": 2613 + }, + { + "epoch": 3.34592, + "grad_norm": 0.6916387677192688, + "learning_rate": 4.478991596638656e-05, + "loss": 0.6572, + "step": 2614 + }, + { + "epoch": 3.3472, + "grad_norm": 0.6631138324737549, + "learning_rate": 4.4787915166066426e-05, + "loss": 0.6512, + "step": 2615 + }, + { + "epoch": 3.34848, + "grad_norm": 0.6894577145576477, + "learning_rate": 4.47859143657463e-05, + "loss": 0.6438, + "step": 2616 + }, + { + "epoch": 3.34976, + "grad_norm": 0.6938015818595886, + "learning_rate": 4.478391356542617e-05, + "loss": 0.69, + "step": 2617 + }, + { + "epoch": 3.3510400000000002, + "grad_norm": 0.6625933647155762, + "learning_rate": 4.478191276510604e-05, + "loss": 0.6358, + "step": 2618 + }, + { + "epoch": 3.35232, + "grad_norm": 
0.7050860524177551, + "learning_rate": 4.4779911964785914e-05, + "loss": 0.7098, + "step": 2619 + }, + { + "epoch": 3.3536, + "grad_norm": 0.7034277319908142, + "learning_rate": 4.477791116446579e-05, + "loss": 0.7057, + "step": 2620 + }, + { + "epoch": 3.35488, + "grad_norm": 0.7165651917457581, + "learning_rate": 4.4775910364145664e-05, + "loss": 0.7058, + "step": 2621 + }, + { + "epoch": 3.35616, + "grad_norm": 0.6885167956352234, + "learning_rate": 4.4773909563825536e-05, + "loss": 0.6334, + "step": 2622 + }, + { + "epoch": 3.35744, + "grad_norm": 0.6467083692550659, + "learning_rate": 4.47719087635054e-05, + "loss": 0.6097, + "step": 2623 + }, + { + "epoch": 3.35872, + "grad_norm": 0.7264028787612915, + "learning_rate": 4.476990796318527e-05, + "loss": 0.7396, + "step": 2624 + }, + { + "epoch": 3.36, + "grad_norm": 0.7132958769798279, + "learning_rate": 4.4767907162865145e-05, + "loss": 0.6642, + "step": 2625 + }, + { + "epoch": 3.36128, + "grad_norm": 0.7386727333068848, + "learning_rate": 4.476590636254502e-05, + "loss": 0.726, + "step": 2626 + }, + { + "epoch": 3.36256, + "grad_norm": 0.6800748705863953, + "learning_rate": 4.4763905562224895e-05, + "loss": 0.6936, + "step": 2627 + }, + { + "epoch": 3.36384, + "grad_norm": 0.6800684332847595, + "learning_rate": 4.476190476190477e-05, + "loss": 0.6065, + "step": 2628 + }, + { + "epoch": 3.36512, + "grad_norm": 0.6520423293113708, + "learning_rate": 4.475990396158464e-05, + "loss": 0.6725, + "step": 2629 + }, + { + "epoch": 3.3664, + "grad_norm": 0.6812400221824646, + "learning_rate": 4.475790316126451e-05, + "loss": 0.6495, + "step": 2630 + }, + { + "epoch": 3.36768, + "grad_norm": 0.6640177965164185, + "learning_rate": 4.4755902360944376e-05, + "loss": 0.6885, + "step": 2631 + }, + { + "epoch": 3.36896, + "grad_norm": 0.6409942507743835, + "learning_rate": 4.475390156062425e-05, + "loss": 0.6343, + "step": 2632 + }, + { + "epoch": 3.37024, + "grad_norm": 0.6690536141395569, + "learning_rate": 
4.475190076030412e-05, + "loss": 0.6792, + "step": 2633 + }, + { + "epoch": 3.37152, + "grad_norm": 0.7020560503005981, + "learning_rate": 4.474989995998399e-05, + "loss": 0.7442, + "step": 2634 + }, + { + "epoch": 3.3728, + "grad_norm": 0.698049783706665, + "learning_rate": 4.474789915966387e-05, + "loss": 0.6836, + "step": 2635 + }, + { + "epoch": 3.37408, + "grad_norm": 0.6824793815612793, + "learning_rate": 4.474589835934374e-05, + "loss": 0.6818, + "step": 2636 + }, + { + "epoch": 3.37536, + "grad_norm": 0.6641604900360107, + "learning_rate": 4.4743897559023614e-05, + "loss": 0.6497, + "step": 2637 + }, + { + "epoch": 3.37664, + "grad_norm": 0.6908559203147888, + "learning_rate": 4.4741896758703486e-05, + "loss": 0.6655, + "step": 2638 + }, + { + "epoch": 3.37792, + "grad_norm": 0.6793168783187866, + "learning_rate": 4.473989595838335e-05, + "loss": 0.6648, + "step": 2639 + }, + { + "epoch": 3.3792, + "grad_norm": 0.7036957144737244, + "learning_rate": 4.473789515806322e-05, + "loss": 0.6691, + "step": 2640 + }, + { + "epoch": 3.38048, + "grad_norm": 0.684795081615448, + "learning_rate": 4.4735894357743095e-05, + "loss": 0.6324, + "step": 2641 + }, + { + "epoch": 3.38176, + "grad_norm": 0.6578769683837891, + "learning_rate": 4.473389355742297e-05, + "loss": 0.6746, + "step": 2642 + }, + { + "epoch": 3.38304, + "grad_norm": 0.6963245272636414, + "learning_rate": 4.4731892757102845e-05, + "loss": 0.7146, + "step": 2643 + }, + { + "epoch": 3.3843199999999998, + "grad_norm": 0.6984001398086548, + "learning_rate": 4.472989195678272e-05, + "loss": 0.6317, + "step": 2644 + }, + { + "epoch": 3.3856, + "grad_norm": 0.7258270978927612, + "learning_rate": 4.472789115646259e-05, + "loss": 0.6904, + "step": 2645 + }, + { + "epoch": 3.38688, + "grad_norm": 0.6447382569313049, + "learning_rate": 4.472589035614246e-05, + "loss": 0.6477, + "step": 2646 + }, + { + "epoch": 3.38816, + "grad_norm": 0.6710345149040222, + "learning_rate": 4.4723889555822326e-05, + "loss": 0.6563, + 
"step": 2647 + }, + { + "epoch": 3.38944, + "grad_norm": 0.7310519218444824, + "learning_rate": 4.47218887555022e-05, + "loss": 0.6717, + "step": 2648 + }, + { + "epoch": 3.39072, + "grad_norm": 0.7194939851760864, + "learning_rate": 4.4719887955182076e-05, + "loss": 0.6474, + "step": 2649 + }, + { + "epoch": 3.392, + "grad_norm": 0.6350496411323547, + "learning_rate": 4.471788715486195e-05, + "loss": 0.6131, + "step": 2650 + }, + { + "epoch": 3.39328, + "grad_norm": 0.6812024712562561, + "learning_rate": 4.471588635454182e-05, + "loss": 0.6778, + "step": 2651 + }, + { + "epoch": 3.3945600000000002, + "grad_norm": 0.6636951565742493, + "learning_rate": 4.471388555422169e-05, + "loss": 0.6575, + "step": 2652 + }, + { + "epoch": 3.39584, + "grad_norm": 0.6647194027900696, + "learning_rate": 4.4711884753901564e-05, + "loss": 0.6415, + "step": 2653 + }, + { + "epoch": 3.39712, + "grad_norm": 0.6898042559623718, + "learning_rate": 4.4709883953581436e-05, + "loss": 0.7263, + "step": 2654 + }, + { + "epoch": 3.3984, + "grad_norm": 0.7069958448410034, + "learning_rate": 4.47078831532613e-05, + "loss": 0.6351, + "step": 2655 + }, + { + "epoch": 3.39968, + "grad_norm": 0.6806748509407043, + "learning_rate": 4.470588235294118e-05, + "loss": 0.665, + "step": 2656 + }, + { + "epoch": 3.40096, + "grad_norm": 0.7422452569007874, + "learning_rate": 4.470388155262105e-05, + "loss": 0.73, + "step": 2657 + }, + { + "epoch": 3.40224, + "grad_norm": 0.7665037512779236, + "learning_rate": 4.470188075230092e-05, + "loss": 0.7148, + "step": 2658 + }, + { + "epoch": 3.40352, + "grad_norm": 0.7186295986175537, + "learning_rate": 4.4699879951980795e-05, + "loss": 0.6665, + "step": 2659 + }, + { + "epoch": 3.4048, + "grad_norm": 0.705100417137146, + "learning_rate": 4.469787915166067e-05, + "loss": 0.6724, + "step": 2660 + }, + { + "epoch": 3.40608, + "grad_norm": 0.6744673252105713, + "learning_rate": 4.469587835134054e-05, + "loss": 0.6658, + "step": 2661 + }, + { + "epoch": 3.40736, + 
"grad_norm": 0.6781793236732483, + "learning_rate": 4.469387755102041e-05, + "loss": 0.6664, + "step": 2662 + }, + { + "epoch": 3.40864, + "grad_norm": 0.6606245040893555, + "learning_rate": 4.469187675070028e-05, + "loss": 0.6189, + "step": 2663 + }, + { + "epoch": 3.40992, + "grad_norm": 0.6804131865501404, + "learning_rate": 4.4689875950380154e-05, + "loss": 0.7092, + "step": 2664 + }, + { + "epoch": 3.4112, + "grad_norm": 0.6703636050224304, + "learning_rate": 4.4687875150060026e-05, + "loss": 0.7, + "step": 2665 + }, + { + "epoch": 3.41248, + "grad_norm": 0.6748145818710327, + "learning_rate": 4.46858743497399e-05, + "loss": 0.6143, + "step": 2666 + }, + { + "epoch": 3.41376, + "grad_norm": 0.6967974305152893, + "learning_rate": 4.468387354941977e-05, + "loss": 0.6877, + "step": 2667 + }, + { + "epoch": 3.41504, + "grad_norm": 0.6868359446525574, + "learning_rate": 4.468187274909964e-05, + "loss": 0.6435, + "step": 2668 + }, + { + "epoch": 3.41632, + "grad_norm": 0.7249009013175964, + "learning_rate": 4.4679871948779513e-05, + "loss": 0.7003, + "step": 2669 + }, + { + "epoch": 3.4176, + "grad_norm": 0.6801961064338684, + "learning_rate": 4.4677871148459385e-05, + "loss": 0.6565, + "step": 2670 + }, + { + "epoch": 3.41888, + "grad_norm": 0.7236019372940063, + "learning_rate": 4.467587034813926e-05, + "loss": 0.6586, + "step": 2671 + }, + { + "epoch": 3.42016, + "grad_norm": 0.6557265520095825, + "learning_rate": 4.467386954781913e-05, + "loss": 0.6919, + "step": 2672 + }, + { + "epoch": 3.42144, + "grad_norm": 0.7524291276931763, + "learning_rate": 4.4671868747499e-05, + "loss": 0.7208, + "step": 2673 + }, + { + "epoch": 3.42272, + "grad_norm": 0.7023764848709106, + "learning_rate": 4.466986794717887e-05, + "loss": 0.7148, + "step": 2674 + }, + { + "epoch": 3.424, + "grad_norm": 0.7110322117805481, + "learning_rate": 4.4667867146858745e-05, + "loss": 0.7194, + "step": 2675 + }, + { + "epoch": 3.42528, + "grad_norm": 0.668428361415863, + "learning_rate": 
4.4665866346538616e-05, + "loss": 0.663, + "step": 2676 + }, + { + "epoch": 3.42656, + "grad_norm": 0.6546775698661804, + "learning_rate": 4.4663865546218495e-05, + "loss": 0.5899, + "step": 2677 + }, + { + "epoch": 3.4278399999999998, + "grad_norm": 0.6761878132820129, + "learning_rate": 4.466186474589836e-05, + "loss": 0.6309, + "step": 2678 + }, + { + "epoch": 3.42912, + "grad_norm": 0.7158021330833435, + "learning_rate": 4.465986394557823e-05, + "loss": 0.737, + "step": 2679 + }, + { + "epoch": 3.4304, + "grad_norm": 0.7490652799606323, + "learning_rate": 4.4657863145258104e-05, + "loss": 0.6946, + "step": 2680 + }, + { + "epoch": 3.43168, + "grad_norm": 0.7451316118240356, + "learning_rate": 4.4655862344937976e-05, + "loss": 0.6851, + "step": 2681 + }, + { + "epoch": 3.43296, + "grad_norm": 0.6753928065299988, + "learning_rate": 4.465386154461785e-05, + "loss": 0.6876, + "step": 2682 + }, + { + "epoch": 3.43424, + "grad_norm": 0.6749107837677002, + "learning_rate": 4.465186074429772e-05, + "loss": 0.6256, + "step": 2683 + }, + { + "epoch": 3.43552, + "grad_norm": 0.6775442361831665, + "learning_rate": 4.46498599439776e-05, + "loss": 0.7097, + "step": 2684 + }, + { + "epoch": 3.4368, + "grad_norm": 0.7066680192947388, + "learning_rate": 4.464785914365747e-05, + "loss": 0.6892, + "step": 2685 + }, + { + "epoch": 3.4380800000000002, + "grad_norm": 0.7244362831115723, + "learning_rate": 4.4645858343337335e-05, + "loss": 0.6675, + "step": 2686 + }, + { + "epoch": 3.4393599999999998, + "grad_norm": 0.7037179470062256, + "learning_rate": 4.464385754301721e-05, + "loss": 0.7126, + "step": 2687 + }, + { + "epoch": 3.44064, + "grad_norm": 0.6872085332870483, + "learning_rate": 4.464185674269708e-05, + "loss": 0.6409, + "step": 2688 + }, + { + "epoch": 3.44192, + "grad_norm": 0.6594861149787903, + "learning_rate": 4.463985594237695e-05, + "loss": 0.6789, + "step": 2689 + }, + { + "epoch": 3.4432, + "grad_norm": 0.6587255001068115, + "learning_rate": 
4.463785514205682e-05, + "loss": 0.6327, + "step": 2690 + }, + { + "epoch": 3.44448, + "grad_norm": 0.6550960540771484, + "learning_rate": 4.46358543417367e-05, + "loss": 0.5824, + "step": 2691 + }, + { + "epoch": 3.44576, + "grad_norm": 0.7198725342750549, + "learning_rate": 4.463385354141657e-05, + "loss": 0.6853, + "step": 2692 + }, + { + "epoch": 3.44704, + "grad_norm": 0.6870785355567932, + "learning_rate": 4.4631852741096445e-05, + "loss": 0.6366, + "step": 2693 + }, + { + "epoch": 3.44832, + "grad_norm": 0.6846055388450623, + "learning_rate": 4.462985194077631e-05, + "loss": 0.6755, + "step": 2694 + }, + { + "epoch": 3.4496, + "grad_norm": 0.6860753893852234, + "learning_rate": 4.462785114045618e-05, + "loss": 0.6605, + "step": 2695 + }, + { + "epoch": 3.45088, + "grad_norm": 0.7068583369255066, + "learning_rate": 4.4625850340136054e-05, + "loss": 0.6775, + "step": 2696 + }, + { + "epoch": 3.45216, + "grad_norm": 0.6669130921363831, + "learning_rate": 4.4623849539815925e-05, + "loss": 0.674, + "step": 2697 + }, + { + "epoch": 3.45344, + "grad_norm": 0.6609745025634766, + "learning_rate": 4.4621848739495804e-05, + "loss": 0.6717, + "step": 2698 + }, + { + "epoch": 3.45472, + "grad_norm": 0.6721084713935852, + "learning_rate": 4.4619847939175676e-05, + "loss": 0.6345, + "step": 2699 + }, + { + "epoch": 3.456, + "grad_norm": 0.6484143733978271, + "learning_rate": 4.461784713885555e-05, + "loss": 0.6415, + "step": 2700 + }, + { + "epoch": 3.45728, + "grad_norm": 0.6830167174339294, + "learning_rate": 4.461584633853542e-05, + "loss": 0.613, + "step": 2701 + }, + { + "epoch": 3.45856, + "grad_norm": 0.7231410145759583, + "learning_rate": 4.4613845538215285e-05, + "loss": 0.7048, + "step": 2702 + }, + { + "epoch": 3.45984, + "grad_norm": 0.6800318956375122, + "learning_rate": 4.4611844737895157e-05, + "loss": 0.6584, + "step": 2703 + }, + { + "epoch": 3.46112, + "grad_norm": 0.6745948791503906, + "learning_rate": 4.460984393757503e-05, + "loss": 0.6266, + "step": 
2704 + }, + { + "epoch": 3.4624, + "grad_norm": 0.6727662086486816, + "learning_rate": 4.460784313725491e-05, + "loss": 0.6853, + "step": 2705 + }, + { + "epoch": 3.46368, + "grad_norm": 0.7079063653945923, + "learning_rate": 4.460584233693478e-05, + "loss": 0.659, + "step": 2706 + }, + { + "epoch": 3.46496, + "grad_norm": 0.6946644186973572, + "learning_rate": 4.460384153661465e-05, + "loss": 0.6874, + "step": 2707 + }, + { + "epoch": 3.46624, + "grad_norm": 0.6658247113227844, + "learning_rate": 4.460184073629452e-05, + "loss": 0.6348, + "step": 2708 + }, + { + "epoch": 3.46752, + "grad_norm": 0.6603049635887146, + "learning_rate": 4.4599839935974395e-05, + "loss": 0.6797, + "step": 2709 + }, + { + "epoch": 3.4688, + "grad_norm": 0.6833632588386536, + "learning_rate": 4.459783913565426e-05, + "loss": 0.6698, + "step": 2710 + }, + { + "epoch": 3.47008, + "grad_norm": 0.6787217855453491, + "learning_rate": 4.459583833533413e-05, + "loss": 0.6639, + "step": 2711 + }, + { + "epoch": 3.47136, + "grad_norm": 0.6830318570137024, + "learning_rate": 4.459383753501401e-05, + "loss": 0.6569, + "step": 2712 + }, + { + "epoch": 3.47264, + "grad_norm": 0.7140050530433655, + "learning_rate": 4.459183673469388e-05, + "loss": 0.7267, + "step": 2713 + }, + { + "epoch": 3.47392, + "grad_norm": 0.7092685103416443, + "learning_rate": 4.4589835934373754e-05, + "loss": 0.6364, + "step": 2714 + }, + { + "epoch": 3.4752, + "grad_norm": 0.6709200143814087, + "learning_rate": 4.4587835134053626e-05, + "loss": 0.6491, + "step": 2715 + }, + { + "epoch": 3.47648, + "grad_norm": 0.6295605897903442, + "learning_rate": 4.45858343337335e-05, + "loss": 0.6316, + "step": 2716 + }, + { + "epoch": 3.47776, + "grad_norm": 0.6191185116767883, + "learning_rate": 4.458383353341337e-05, + "loss": 0.6765, + "step": 2717 + }, + { + "epoch": 3.47904, + "grad_norm": 0.6618558168411255, + "learning_rate": 4.4581832733093234e-05, + "loss": 0.6509, + "step": 2718 + }, + { + "epoch": 3.48032, + "grad_norm": 
0.6785896420478821, + "learning_rate": 4.457983193277311e-05, + "loss": 0.6234, + "step": 2719 + }, + { + "epoch": 3.4816, + "grad_norm": 0.7022203803062439, + "learning_rate": 4.4577831132452985e-05, + "loss": 0.7037, + "step": 2720 + }, + { + "epoch": 3.4828799999999998, + "grad_norm": 0.6869513988494873, + "learning_rate": 4.457583033213286e-05, + "loss": 0.6943, + "step": 2721 + }, + { + "epoch": 3.48416, + "grad_norm": 0.7011018395423889, + "learning_rate": 4.457382953181273e-05, + "loss": 0.693, + "step": 2722 + }, + { + "epoch": 3.48544, + "grad_norm": 0.6646562814712524, + "learning_rate": 4.45718287314926e-05, + "loss": 0.6767, + "step": 2723 + }, + { + "epoch": 3.48672, + "grad_norm": 0.6229949593544006, + "learning_rate": 4.456982793117247e-05, + "loss": 0.6177, + "step": 2724 + }, + { + "epoch": 3.488, + "grad_norm": 0.6661481261253357, + "learning_rate": 4.4567827130852344e-05, + "loss": 0.6401, + "step": 2725 + }, + { + "epoch": 3.48928, + "grad_norm": 0.6885098218917847, + "learning_rate": 4.4565826330532216e-05, + "loss": 0.6593, + "step": 2726 + }, + { + "epoch": 3.49056, + "grad_norm": 0.6794683933258057, + "learning_rate": 4.456382553021209e-05, + "loss": 0.641, + "step": 2727 + }, + { + "epoch": 3.49184, + "grad_norm": 0.6937509775161743, + "learning_rate": 4.456182472989196e-05, + "loss": 0.7112, + "step": 2728 + }, + { + "epoch": 3.4931200000000002, + "grad_norm": 0.7058037519454956, + "learning_rate": 4.455982392957183e-05, + "loss": 0.644, + "step": 2729 + }, + { + "epoch": 3.4944, + "grad_norm": 0.7502039670944214, + "learning_rate": 4.4557823129251704e-05, + "loss": 0.6871, + "step": 2730 + }, + { + "epoch": 3.49568, + "grad_norm": 0.6988638639450073, + "learning_rate": 4.4555822328931575e-05, + "loss": 0.6788, + "step": 2731 + }, + { + "epoch": 3.49696, + "grad_norm": 0.7283390760421753, + "learning_rate": 4.455382152861145e-05, + "loss": 0.6979, + "step": 2732 + }, + { + "epoch": 3.49824, + "grad_norm": 0.7279435396194458, + 
"learning_rate": 4.455182072829132e-05, + "loss": 0.7258, + "step": 2733 + }, + { + "epoch": 3.49952, + "grad_norm": 0.64103102684021, + "learning_rate": 4.454981992797119e-05, + "loss": 0.6253, + "step": 2734 + }, + { + "epoch": 3.5008, + "grad_norm": 0.6883367896080017, + "learning_rate": 4.454781912765106e-05, + "loss": 0.6615, + "step": 2735 + }, + { + "epoch": 3.50208, + "grad_norm": 0.6586665511131287, + "learning_rate": 4.4545818327330935e-05, + "loss": 0.6741, + "step": 2736 + }, + { + "epoch": 3.50336, + "grad_norm": 0.6852685213088989, + "learning_rate": 4.4543817527010807e-05, + "loss": 0.7288, + "step": 2737 + }, + { + "epoch": 3.50464, + "grad_norm": 0.6741801500320435, + "learning_rate": 4.454181672669068e-05, + "loss": 0.7345, + "step": 2738 + }, + { + "epoch": 3.50592, + "grad_norm": 0.6715037822723389, + "learning_rate": 4.453981592637055e-05, + "loss": 0.6525, + "step": 2739 + }, + { + "epoch": 3.5072, + "grad_norm": 0.6797910332679749, + "learning_rate": 4.453781512605042e-05, + "loss": 0.668, + "step": 2740 + }, + { + "epoch": 3.50848, + "grad_norm": 0.7185656428337097, + "learning_rate": 4.4535814325730294e-05, + "loss": 0.7052, + "step": 2741 + }, + { + "epoch": 3.50976, + "grad_norm": 0.6579523086547852, + "learning_rate": 4.4533813525410166e-05, + "loss": 0.6825, + "step": 2742 + }, + { + "epoch": 3.51104, + "grad_norm": 0.7128697037696838, + "learning_rate": 4.453181272509004e-05, + "loss": 0.7095, + "step": 2743 + }, + { + "epoch": 3.51232, + "grad_norm": 0.6932650208473206, + "learning_rate": 4.452981192476991e-05, + "loss": 0.6745, + "step": 2744 + }, + { + "epoch": 3.5136, + "grad_norm": 0.7073693871498108, + "learning_rate": 4.452781112444978e-05, + "loss": 0.6845, + "step": 2745 + }, + { + "epoch": 3.51488, + "grad_norm": 0.6620619893074036, + "learning_rate": 4.452581032412965e-05, + "loss": 0.6135, + "step": 2746 + }, + { + "epoch": 3.51616, + "grad_norm": 0.695382297039032, + "learning_rate": 4.4523809523809525e-05, + "loss": 
0.6533, + "step": 2747 + }, + { + "epoch": 3.51744, + "grad_norm": 0.6715102195739746, + "learning_rate": 4.45218087234894e-05, + "loss": 0.6409, + "step": 2748 + }, + { + "epoch": 3.51872, + "grad_norm": 0.6616403460502625, + "learning_rate": 4.451980792316927e-05, + "loss": 0.6637, + "step": 2749 + }, + { + "epoch": 3.52, + "grad_norm": 0.6827410459518433, + "learning_rate": 4.451780712284914e-05, + "loss": 0.6183, + "step": 2750 + }, + { + "epoch": 3.52128, + "grad_norm": 0.7313747406005859, + "learning_rate": 4.451580632252901e-05, + "loss": 0.6647, + "step": 2751 + }, + { + "epoch": 3.52256, + "grad_norm": 0.6950268149375916, + "learning_rate": 4.4513805522208884e-05, + "loss": 0.6705, + "step": 2752 + }, + { + "epoch": 3.52384, + "grad_norm": 0.6453522443771362, + "learning_rate": 4.4511804721888756e-05, + "loss": 0.666, + "step": 2753 + }, + { + "epoch": 3.5251200000000003, + "grad_norm": 0.6691879034042358, + "learning_rate": 4.450980392156863e-05, + "loss": 0.7045, + "step": 2754 + }, + { + "epoch": 3.5263999999999998, + "grad_norm": 0.7268353700637817, + "learning_rate": 4.450780312124851e-05, + "loss": 0.6803, + "step": 2755 + }, + { + "epoch": 3.52768, + "grad_norm": 0.6457922458648682, + "learning_rate": 4.450580232092837e-05, + "loss": 0.6423, + "step": 2756 + }, + { + "epoch": 3.52896, + "grad_norm": 0.6515114307403564, + "learning_rate": 4.4503801520608244e-05, + "loss": 0.7078, + "step": 2757 + }, + { + "epoch": 3.53024, + "grad_norm": 0.7063488960266113, + "learning_rate": 4.4501800720288115e-05, + "loss": 0.6547, + "step": 2758 + }, + { + "epoch": 3.53152, + "grad_norm": 0.7043952345848083, + "learning_rate": 4.449979991996799e-05, + "loss": 0.6184, + "step": 2759 + }, + { + "epoch": 3.5328, + "grad_norm": 0.7184996008872986, + "learning_rate": 4.449779911964786e-05, + "loss": 0.7037, + "step": 2760 + }, + { + "epoch": 3.53408, + "grad_norm": 0.6916483640670776, + "learning_rate": 4.449579831932773e-05, + "loss": 0.6334, + "step": 2761 + }, + { + 
"epoch": 3.53536, + "grad_norm": 0.7161397337913513, + "learning_rate": 4.449379751900761e-05, + "loss": 0.654, + "step": 2762 + }, + { + "epoch": 3.5366400000000002, + "grad_norm": 0.7488949298858643, + "learning_rate": 4.449179671868748e-05, + "loss": 0.7023, + "step": 2763 + }, + { + "epoch": 3.5379199999999997, + "grad_norm": 0.7058433890342712, + "learning_rate": 4.448979591836735e-05, + "loss": 0.6553, + "step": 2764 + }, + { + "epoch": 3.5392, + "grad_norm": 0.6619923114776611, + "learning_rate": 4.448779511804722e-05, + "loss": 0.6527, + "step": 2765 + }, + { + "epoch": 3.54048, + "grad_norm": 0.6837087869644165, + "learning_rate": 4.448579431772709e-05, + "loss": 0.6352, + "step": 2766 + }, + { + "epoch": 3.54176, + "grad_norm": 0.7077759504318237, + "learning_rate": 4.448379351740696e-05, + "loss": 0.6671, + "step": 2767 + }, + { + "epoch": 3.54304, + "grad_norm": 0.6346209049224854, + "learning_rate": 4.4481792717086834e-05, + "loss": 0.612, + "step": 2768 + }, + { + "epoch": 3.54432, + "grad_norm": 0.6402051448822021, + "learning_rate": 4.447979191676671e-05, + "loss": 0.6399, + "step": 2769 + }, + { + "epoch": 3.5456, + "grad_norm": 0.6839762926101685, + "learning_rate": 4.4477791116446585e-05, + "loss": 0.7407, + "step": 2770 + }, + { + "epoch": 3.54688, + "grad_norm": 0.688944399356842, + "learning_rate": 4.4475790316126456e-05, + "loss": 0.6762, + "step": 2771 + }, + { + "epoch": 3.54816, + "grad_norm": 0.642185628414154, + "learning_rate": 4.447378951580632e-05, + "loss": 0.6375, + "step": 2772 + }, + { + "epoch": 3.54944, + "grad_norm": 0.6515256762504578, + "learning_rate": 4.447178871548619e-05, + "loss": 0.5938, + "step": 2773 + }, + { + "epoch": 3.55072, + "grad_norm": 0.6609624624252319, + "learning_rate": 4.4469787915166065e-05, + "loss": 0.7017, + "step": 2774 + }, + { + "epoch": 3.552, + "grad_norm": 0.6748828291893005, + "learning_rate": 4.446778711484594e-05, + "loss": 0.6911, + "step": 2775 + }, + { + "epoch": 3.55328, + "grad_norm": 
0.7154529094696045, + "learning_rate": 4.4465786314525816e-05, + "loss": 0.6708, + "step": 2776 + }, + { + "epoch": 3.55456, + "grad_norm": 0.6632035374641418, + "learning_rate": 4.446378551420569e-05, + "loss": 0.6461, + "step": 2777 + }, + { + "epoch": 3.55584, + "grad_norm": 0.7373719811439514, + "learning_rate": 4.446178471388556e-05, + "loss": 0.7789, + "step": 2778 + }, + { + "epoch": 3.55712, + "grad_norm": 0.7006744146347046, + "learning_rate": 4.445978391356543e-05, + "loss": 0.6923, + "step": 2779 + }, + { + "epoch": 3.5584, + "grad_norm": 0.699441134929657, + "learning_rate": 4.4457783113245296e-05, + "loss": 0.7505, + "step": 2780 + }, + { + "epoch": 3.55968, + "grad_norm": 0.6537452936172485, + "learning_rate": 4.445578231292517e-05, + "loss": 0.671, + "step": 2781 + }, + { + "epoch": 3.56096, + "grad_norm": 0.6859011650085449, + "learning_rate": 4.445378151260504e-05, + "loss": 0.6701, + "step": 2782 + }, + { + "epoch": 3.56224, + "grad_norm": 0.6629458069801331, + "learning_rate": 4.445178071228492e-05, + "loss": 0.6463, + "step": 2783 + }, + { + "epoch": 3.56352, + "grad_norm": 0.6698895692825317, + "learning_rate": 4.444977991196479e-05, + "loss": 0.6575, + "step": 2784 + }, + { + "epoch": 3.5648, + "grad_norm": 0.7185133695602417, + "learning_rate": 4.444777911164466e-05, + "loss": 0.7278, + "step": 2785 + }, + { + "epoch": 3.56608, + "grad_norm": 0.699368417263031, + "learning_rate": 4.4445778311324534e-05, + "loss": 0.7271, + "step": 2786 + }, + { + "epoch": 3.56736, + "grad_norm": 0.6818522810935974, + "learning_rate": 4.4443777511004406e-05, + "loss": 0.6762, + "step": 2787 + }, + { + "epoch": 3.5686400000000003, + "grad_norm": 0.6821078062057495, + "learning_rate": 4.444177671068427e-05, + "loss": 0.6452, + "step": 2788 + }, + { + "epoch": 3.5699199999999998, + "grad_norm": 0.6925137042999268, + "learning_rate": 4.443977591036414e-05, + "loss": 0.6354, + "step": 2789 + }, + { + "epoch": 3.5712, + "grad_norm": 0.6931013464927673, + 
"learning_rate": 4.443777511004402e-05, + "loss": 0.6438, + "step": 2790 + }, + { + "epoch": 3.57248, + "grad_norm": 0.6680043935775757, + "learning_rate": 4.4435774309723894e-05, + "loss": 0.6732, + "step": 2791 + }, + { + "epoch": 3.57376, + "grad_norm": 0.7447218894958496, + "learning_rate": 4.4433773509403765e-05, + "loss": 0.7314, + "step": 2792 + }, + { + "epoch": 3.57504, + "grad_norm": 0.6944176554679871, + "learning_rate": 4.443177270908364e-05, + "loss": 0.6333, + "step": 2793 + }, + { + "epoch": 3.57632, + "grad_norm": 0.7107727527618408, + "learning_rate": 4.442977190876351e-05, + "loss": 0.673, + "step": 2794 + }, + { + "epoch": 3.5776, + "grad_norm": 0.6927473545074463, + "learning_rate": 4.442777110844338e-05, + "loss": 0.6667, + "step": 2795 + }, + { + "epoch": 3.57888, + "grad_norm": 0.7122114896774292, + "learning_rate": 4.4425770308123246e-05, + "loss": 0.6544, + "step": 2796 + }, + { + "epoch": 3.5801600000000002, + "grad_norm": 0.6794743537902832, + "learning_rate": 4.4423769507803125e-05, + "loss": 0.6743, + "step": 2797 + }, + { + "epoch": 3.5814399999999997, + "grad_norm": 0.6645591259002686, + "learning_rate": 4.4421768707482997e-05, + "loss": 0.7121, + "step": 2798 + }, + { + "epoch": 3.58272, + "grad_norm": 0.6730340123176575, + "learning_rate": 4.441976790716287e-05, + "loss": 0.6619, + "step": 2799 + }, + { + "epoch": 3.584, + "grad_norm": 0.6491687297821045, + "learning_rate": 4.441776710684274e-05, + "loss": 0.6227, + "step": 2800 + }, + { + "epoch": 3.58528, + "grad_norm": 0.698142945766449, + "learning_rate": 4.441576630652261e-05, + "loss": 0.6708, + "step": 2801 + }, + { + "epoch": 3.58656, + "grad_norm": 0.7014725804328918, + "learning_rate": 4.4413765506202484e-05, + "loss": 0.6487, + "step": 2802 + }, + { + "epoch": 3.58784, + "grad_norm": 0.6868982315063477, + "learning_rate": 4.4411764705882356e-05, + "loss": 0.719, + "step": 2803 + }, + { + "epoch": 3.58912, + "grad_norm": 0.6748588681221008, + "learning_rate": 
4.440976390556223e-05, + "loss": 0.6158, + "step": 2804 + }, + { + "epoch": 3.5904, + "grad_norm": 0.7001504898071289, + "learning_rate": 4.44077631052421e-05, + "loss": 0.6752, + "step": 2805 + }, + { + "epoch": 3.59168, + "grad_norm": 0.6581389904022217, + "learning_rate": 4.440576230492197e-05, + "loss": 0.6909, + "step": 2806 + }, + { + "epoch": 3.59296, + "grad_norm": 0.6843972206115723, + "learning_rate": 4.440376150460184e-05, + "loss": 0.7346, + "step": 2807 + }, + { + "epoch": 3.59424, + "grad_norm": 0.6761201620101929, + "learning_rate": 4.4401760704281715e-05, + "loss": 0.6827, + "step": 2808 + }, + { + "epoch": 3.59552, + "grad_norm": 0.7041255235671997, + "learning_rate": 4.439975990396159e-05, + "loss": 0.6768, + "step": 2809 + }, + { + "epoch": 3.5968, + "grad_norm": 0.650201141834259, + "learning_rate": 4.439775910364146e-05, + "loss": 0.6364, + "step": 2810 + }, + { + "epoch": 3.59808, + "grad_norm": 0.7019467353820801, + "learning_rate": 4.439575830332133e-05, + "loss": 0.6991, + "step": 2811 + }, + { + "epoch": 3.59936, + "grad_norm": 0.6987199783325195, + "learning_rate": 4.43937575030012e-05, + "loss": 0.6525, + "step": 2812 + }, + { + "epoch": 3.60064, + "grad_norm": 0.6764307618141174, + "learning_rate": 4.4391756702681074e-05, + "loss": 0.6271, + "step": 2813 + }, + { + "epoch": 3.60192, + "grad_norm": 0.6464497447013855, + "learning_rate": 4.4389755902360946e-05, + "loss": 0.6348, + "step": 2814 + }, + { + "epoch": 3.6032, + "grad_norm": 0.704531192779541, + "learning_rate": 4.438775510204082e-05, + "loss": 0.6944, + "step": 2815 + }, + { + "epoch": 3.60448, + "grad_norm": 0.7004350423812866, + "learning_rate": 4.438575430172069e-05, + "loss": 0.6299, + "step": 2816 + }, + { + "epoch": 3.60576, + "grad_norm": 0.6948818564414978, + "learning_rate": 4.438375350140056e-05, + "loss": 0.6662, + "step": 2817 + }, + { + "epoch": 3.60704, + "grad_norm": 0.668674647808075, + "learning_rate": 4.4381752701080434e-05, + "loss": 0.6203, + "step": 2818 + 
}, + { + "epoch": 3.60832, + "grad_norm": 0.6800994277000427, + "learning_rate": 4.4379751900760306e-05, + "loss": 0.6619, + "step": 2819 + }, + { + "epoch": 3.6096, + "grad_norm": 0.667832612991333, + "learning_rate": 4.437775110044018e-05, + "loss": 0.6441, + "step": 2820 + }, + { + "epoch": 3.61088, + "grad_norm": 0.695292592048645, + "learning_rate": 4.437575030012005e-05, + "loss": 0.7054, + "step": 2821 + }, + { + "epoch": 3.6121600000000003, + "grad_norm": 0.7079086303710938, + "learning_rate": 4.437374949979992e-05, + "loss": 0.6772, + "step": 2822 + }, + { + "epoch": 3.6134399999999998, + "grad_norm": 0.6839944124221802, + "learning_rate": 4.437174869947979e-05, + "loss": 0.6553, + "step": 2823 + }, + { + "epoch": 3.61472, + "grad_norm": 0.6628500819206238, + "learning_rate": 4.4369747899159665e-05, + "loss": 0.6787, + "step": 2824 + }, + { + "epoch": 3.616, + "grad_norm": 0.6501266360282898, + "learning_rate": 4.4367747098839543e-05, + "loss": 0.6575, + "step": 2825 + }, + { + "epoch": 3.61728, + "grad_norm": 0.6804488301277161, + "learning_rate": 4.436574629851941e-05, + "loss": 0.647, + "step": 2826 + }, + { + "epoch": 3.61856, + "grad_norm": 0.7033595442771912, + "learning_rate": 4.436374549819928e-05, + "loss": 0.6988, + "step": 2827 + }, + { + "epoch": 3.61984, + "grad_norm": 0.6809185147285461, + "learning_rate": 4.436174469787915e-05, + "loss": 0.6726, + "step": 2828 + }, + { + "epoch": 3.62112, + "grad_norm": 0.7250016927719116, + "learning_rate": 4.4359743897559024e-05, + "loss": 0.6378, + "step": 2829 + }, + { + "epoch": 3.6224, + "grad_norm": 0.6764475703239441, + "learning_rate": 4.4357743097238896e-05, + "loss": 0.6564, + "step": 2830 + }, + { + "epoch": 3.6236800000000002, + "grad_norm": 0.7175212502479553, + "learning_rate": 4.435574229691877e-05, + "loss": 0.7327, + "step": 2831 + }, + { + "epoch": 3.6249599999999997, + "grad_norm": 0.6657410264015198, + "learning_rate": 4.4353741496598646e-05, + "loss": 0.6475, + "step": 2832 + }, + { + 
"epoch": 3.62624, + "grad_norm": 0.6739627122879028, + "learning_rate": 4.435174069627852e-05, + "loss": 0.6444, + "step": 2833 + }, + { + "epoch": 3.62752, + "grad_norm": 0.7084604501724243, + "learning_rate": 4.434973989595838e-05, + "loss": 0.6923, + "step": 2834 + }, + { + "epoch": 3.6288, + "grad_norm": 0.7034947276115417, + "learning_rate": 4.4347739095638255e-05, + "loss": 0.696, + "step": 2835 + }, + { + "epoch": 3.63008, + "grad_norm": 0.6603802442550659, + "learning_rate": 4.434573829531813e-05, + "loss": 0.6625, + "step": 2836 + }, + { + "epoch": 3.63136, + "grad_norm": 0.6719616055488586, + "learning_rate": 4.4343737494998e-05, + "loss": 0.6623, + "step": 2837 + }, + { + "epoch": 3.63264, + "grad_norm": 0.6790746450424194, + "learning_rate": 4.434173669467787e-05, + "loss": 0.7016, + "step": 2838 + }, + { + "epoch": 3.63392, + "grad_norm": 0.6688959002494812, + "learning_rate": 4.433973589435775e-05, + "loss": 0.6411, + "step": 2839 + }, + { + "epoch": 3.6352, + "grad_norm": 0.7280130386352539, + "learning_rate": 4.433773509403762e-05, + "loss": 0.7061, + "step": 2840 + }, + { + "epoch": 3.63648, + "grad_norm": 0.6903562545776367, + "learning_rate": 4.433573429371749e-05, + "loss": 0.6433, + "step": 2841 + }, + { + "epoch": 3.63776, + "grad_norm": 0.713874101638794, + "learning_rate": 4.433373349339736e-05, + "loss": 0.6956, + "step": 2842 + }, + { + "epoch": 3.63904, + "grad_norm": 0.6836928129196167, + "learning_rate": 4.433173269307723e-05, + "loss": 0.6829, + "step": 2843 + }, + { + "epoch": 3.64032, + "grad_norm": 0.6967549920082092, + "learning_rate": 4.43297318927571e-05, + "loss": 0.6793, + "step": 2844 + }, + { + "epoch": 3.6416, + "grad_norm": 0.7121291160583496, + "learning_rate": 4.4327731092436974e-05, + "loss": 0.6855, + "step": 2845 + }, + { + "epoch": 3.64288, + "grad_norm": 0.6488426327705383, + "learning_rate": 4.432573029211685e-05, + "loss": 0.6063, + "step": 2846 + }, + { + "epoch": 3.64416, + "grad_norm": 0.669741153717041, + 
"learning_rate": 4.4323729491796724e-05, + "loss": 0.6383, + "step": 2847 + }, + { + "epoch": 3.64544, + "grad_norm": 0.6769450306892395, + "learning_rate": 4.4321728691476596e-05, + "loss": 0.6116, + "step": 2848 + }, + { + "epoch": 3.64672, + "grad_norm": 0.6796509623527527, + "learning_rate": 4.431972789115647e-05, + "loss": 0.6567, + "step": 2849 + }, + { + "epoch": 3.648, + "grad_norm": 0.6543831825256348, + "learning_rate": 4.431772709083633e-05, + "loss": 0.6122, + "step": 2850 + }, + { + "epoch": 3.64928, + "grad_norm": 0.6729878187179565, + "learning_rate": 4.4315726290516205e-05, + "loss": 0.6636, + "step": 2851 + }, + { + "epoch": 3.65056, + "grad_norm": 0.7151674628257751, + "learning_rate": 4.431372549019608e-05, + "loss": 0.6863, + "step": 2852 + }, + { + "epoch": 3.65184, + "grad_norm": 0.6241359710693359, + "learning_rate": 4.4311724689875955e-05, + "loss": 0.6208, + "step": 2853 + }, + { + "epoch": 3.65312, + "grad_norm": 0.6886858940124512, + "learning_rate": 4.430972388955583e-05, + "loss": 0.679, + "step": 2854 + }, + { + "epoch": 3.6544, + "grad_norm": 0.7163287997245789, + "learning_rate": 4.43077230892357e-05, + "loss": 0.6783, + "step": 2855 + }, + { + "epoch": 3.6556800000000003, + "grad_norm": 0.698589026927948, + "learning_rate": 4.430572228891557e-05, + "loss": 0.6695, + "step": 2856 + }, + { + "epoch": 3.6569599999999998, + "grad_norm": 0.7296429872512817, + "learning_rate": 4.430372148859544e-05, + "loss": 0.7372, + "step": 2857 + }, + { + "epoch": 3.65824, + "grad_norm": 0.6619814038276672, + "learning_rate": 4.430172068827531e-05, + "loss": 0.6973, + "step": 2858 + }, + { + "epoch": 3.65952, + "grad_norm": 0.6731798052787781, + "learning_rate": 4.429971988795518e-05, + "loss": 0.652, + "step": 2859 + }, + { + "epoch": 3.6608, + "grad_norm": 0.7068068981170654, + "learning_rate": 4.429771908763505e-05, + "loss": 0.715, + "step": 2860 + }, + { + "epoch": 3.66208, + "grad_norm": 0.6874963641166687, + "learning_rate": 
4.429571828731493e-05, + "loss": 0.7342, + "step": 2861 + }, + { + "epoch": 3.66336, + "grad_norm": 0.6647247672080994, + "learning_rate": 4.42937174869948e-05, + "loss": 0.6638, + "step": 2862 + }, + { + "epoch": 3.66464, + "grad_norm": 0.6708807349205017, + "learning_rate": 4.4291716686674674e-05, + "loss": 0.6494, + "step": 2863 + }, + { + "epoch": 3.66592, + "grad_norm": 0.6945130825042725, + "learning_rate": 4.4289715886354546e-05, + "loss": 0.6974, + "step": 2864 + }, + { + "epoch": 3.6672000000000002, + "grad_norm": 0.6519247889518738, + "learning_rate": 4.428771508603442e-05, + "loss": 0.5756, + "step": 2865 + }, + { + "epoch": 3.6684799999999997, + "grad_norm": 0.6893465518951416, + "learning_rate": 4.428571428571428e-05, + "loss": 0.6507, + "step": 2866 + }, + { + "epoch": 3.66976, + "grad_norm": 0.6804561018943787, + "learning_rate": 4.4283713485394155e-05, + "loss": 0.6864, + "step": 2867 + }, + { + "epoch": 3.67104, + "grad_norm": 0.7122829556465149, + "learning_rate": 4.428171268507403e-05, + "loss": 0.7419, + "step": 2868 + }, + { + "epoch": 3.67232, + "grad_norm": 0.64589524269104, + "learning_rate": 4.4279711884753905e-05, + "loss": 0.6927, + "step": 2869 + }, + { + "epoch": 3.6736, + "grad_norm": 0.6762197613716125, + "learning_rate": 4.427771108443378e-05, + "loss": 0.6725, + "step": 2870 + }, + { + "epoch": 3.67488, + "grad_norm": 0.6425807476043701, + "learning_rate": 4.427571028411365e-05, + "loss": 0.631, + "step": 2871 + }, + { + "epoch": 3.67616, + "grad_norm": 0.6912189722061157, + "learning_rate": 4.427370948379352e-05, + "loss": 0.6934, + "step": 2872 + }, + { + "epoch": 3.67744, + "grad_norm": 0.6705701947212219, + "learning_rate": 4.427170868347339e-05, + "loss": 0.6638, + "step": 2873 + }, + { + "epoch": 3.67872, + "grad_norm": 0.7085809111595154, + "learning_rate": 4.426970788315326e-05, + "loss": 0.6887, + "step": 2874 + }, + { + "epoch": 3.68, + "grad_norm": 0.683226466178894, + "learning_rate": 4.4267707082833136e-05, + "loss": 
0.658, + "step": 2875 + }, + { + "epoch": 3.68128, + "grad_norm": 0.6895299553871155, + "learning_rate": 4.426570628251301e-05, + "loss": 0.638, + "step": 2876 + }, + { + "epoch": 3.68256, + "grad_norm": 0.6925261616706848, + "learning_rate": 4.426370548219288e-05, + "loss": 0.6365, + "step": 2877 + }, + { + "epoch": 3.68384, + "grad_norm": 0.7004500031471252, + "learning_rate": 4.426170468187275e-05, + "loss": 0.6739, + "step": 2878 + }, + { + "epoch": 3.68512, + "grad_norm": 0.7290331125259399, + "learning_rate": 4.4259703881552624e-05, + "loss": 0.6495, + "step": 2879 + }, + { + "epoch": 3.6864, + "grad_norm": 0.6692587733268738, + "learning_rate": 4.4257703081232496e-05, + "loss": 0.6887, + "step": 2880 + }, + { + "epoch": 3.68768, + "grad_norm": 0.6933017373085022, + "learning_rate": 4.425570228091237e-05, + "loss": 0.6722, + "step": 2881 + }, + { + "epoch": 3.68896, + "grad_norm": 0.707172691822052, + "learning_rate": 4.425370148059224e-05, + "loss": 0.7151, + "step": 2882 + }, + { + "epoch": 3.69024, + "grad_norm": 0.6823884844779968, + "learning_rate": 4.425170068027211e-05, + "loss": 0.6213, + "step": 2883 + }, + { + "epoch": 3.69152, + "grad_norm": 0.7094587087631226, + "learning_rate": 4.424969987995198e-05, + "loss": 0.7033, + "step": 2884 + }, + { + "epoch": 3.6928, + "grad_norm": 0.6758087277412415, + "learning_rate": 4.4247699079631855e-05, + "loss": 0.687, + "step": 2885 + }, + { + "epoch": 3.69408, + "grad_norm": 0.6843044757843018, + "learning_rate": 4.424569827931173e-05, + "loss": 0.6536, + "step": 2886 + }, + { + "epoch": 3.69536, + "grad_norm": 0.7044915556907654, + "learning_rate": 4.42436974789916e-05, + "loss": 0.7335, + "step": 2887 + }, + { + "epoch": 3.69664, + "grad_norm": 0.6591890454292297, + "learning_rate": 4.424169667867147e-05, + "loss": 0.645, + "step": 2888 + }, + { + "epoch": 3.69792, + "grad_norm": 0.7025469541549683, + "learning_rate": 4.423969587835134e-05, + "loss": 0.6961, + "step": 2889 + }, + { + "epoch": 
3.6992000000000003, + "grad_norm": 0.663060188293457, + "learning_rate": 4.4237695078031214e-05, + "loss": 0.6113, + "step": 2890 + }, + { + "epoch": 3.7004799999999998, + "grad_norm": 0.6972171068191528, + "learning_rate": 4.4235694277711086e-05, + "loss": 0.6683, + "step": 2891 + }, + { + "epoch": 3.70176, + "grad_norm": 0.7062972187995911, + "learning_rate": 4.423369347739096e-05, + "loss": 0.7317, + "step": 2892 + }, + { + "epoch": 3.70304, + "grad_norm": 0.6868116855621338, + "learning_rate": 4.423169267707083e-05, + "loss": 0.6834, + "step": 2893 + }, + { + "epoch": 3.70432, + "grad_norm": 0.6620085835456848, + "learning_rate": 4.42296918767507e-05, + "loss": 0.6806, + "step": 2894 + }, + { + "epoch": 3.7056, + "grad_norm": 0.6830658912658691, + "learning_rate": 4.4227691076430573e-05, + "loss": 0.6771, + "step": 2895 + }, + { + "epoch": 3.70688, + "grad_norm": 0.6770226359367371, + "learning_rate": 4.4225690276110445e-05, + "loss": 0.6476, + "step": 2896 + }, + { + "epoch": 3.70816, + "grad_norm": 0.6751776933670044, + "learning_rate": 4.422368947579032e-05, + "loss": 0.6402, + "step": 2897 + }, + { + "epoch": 3.70944, + "grad_norm": 0.7226163148880005, + "learning_rate": 4.422168867547019e-05, + "loss": 0.7128, + "step": 2898 + }, + { + "epoch": 3.7107200000000002, + "grad_norm": 0.6574375033378601, + "learning_rate": 4.421968787515006e-05, + "loss": 0.7031, + "step": 2899 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 0.6752791404724121, + "learning_rate": 4.421768707482993e-05, + "loss": 0.7141, + "step": 2900 + }, + { + "epoch": 3.71328, + "grad_norm": 0.6757639646530151, + "learning_rate": 4.4215686274509805e-05, + "loss": 0.707, + "step": 2901 + }, + { + "epoch": 3.71456, + "grad_norm": 0.6414822936058044, + "learning_rate": 4.4213685474189676e-05, + "loss": 0.6637, + "step": 2902 + }, + { + "epoch": 3.71584, + "grad_norm": 0.7052628397941589, + "learning_rate": 4.4211684673869555e-05, + "loss": 0.6547, + "step": 2903 + }, + { + "epoch": 
3.71712, + "grad_norm": 0.6637598872184753, + "learning_rate": 4.420968387354942e-05, + "loss": 0.6694, + "step": 2904 + }, + { + "epoch": 3.7184, + "grad_norm": 0.684610903263092, + "learning_rate": 4.420768307322929e-05, + "loss": 0.6892, + "step": 2905 + }, + { + "epoch": 3.71968, + "grad_norm": 0.6572564244270325, + "learning_rate": 4.4205682272909164e-05, + "loss": 0.6579, + "step": 2906 + }, + { + "epoch": 3.72096, + "grad_norm": 0.6893813014030457, + "learning_rate": 4.4203681472589036e-05, + "loss": 0.6612, + "step": 2907 + }, + { + "epoch": 3.72224, + "grad_norm": 0.6515512466430664, + "learning_rate": 4.420168067226891e-05, + "loss": 0.6371, + "step": 2908 + }, + { + "epoch": 3.72352, + "grad_norm": 0.7329177260398865, + "learning_rate": 4.419967987194878e-05, + "loss": 0.7172, + "step": 2909 + }, + { + "epoch": 3.7248, + "grad_norm": 0.6419927477836609, + "learning_rate": 4.419767907162866e-05, + "loss": 0.6113, + "step": 2910 + }, + { + "epoch": 3.72608, + "grad_norm": 0.6344096064567566, + "learning_rate": 4.419567827130853e-05, + "loss": 0.6277, + "step": 2911 + }, + { + "epoch": 3.72736, + "grad_norm": 0.7253386974334717, + "learning_rate": 4.4193677470988395e-05, + "loss": 0.6749, + "step": 2912 + }, + { + "epoch": 3.72864, + "grad_norm": 0.6708924174308777, + "learning_rate": 4.419167667066827e-05, + "loss": 0.6019, + "step": 2913 + }, + { + "epoch": 3.72992, + "grad_norm": 0.6822198033332825, + "learning_rate": 4.418967587034814e-05, + "loss": 0.6393, + "step": 2914 + }, + { + "epoch": 3.7312, + "grad_norm": 0.6997161507606506, + "learning_rate": 4.418767507002801e-05, + "loss": 0.668, + "step": 2915 + }, + { + "epoch": 3.73248, + "grad_norm": 0.67588210105896, + "learning_rate": 4.418567426970788e-05, + "loss": 0.6117, + "step": 2916 + }, + { + "epoch": 3.73376, + "grad_norm": 0.7122563123703003, + "learning_rate": 4.418367346938776e-05, + "loss": 0.6749, + "step": 2917 + }, + { + "epoch": 3.73504, + "grad_norm": 0.681527316570282, + 
"learning_rate": 4.418167266906763e-05, + "loss": 0.682, + "step": 2918 + }, + { + "epoch": 3.73632, + "grad_norm": 0.6719216704368591, + "learning_rate": 4.4179671868747505e-05, + "loss": 0.684, + "step": 2919 + }, + { + "epoch": 3.7376, + "grad_norm": 0.6405019164085388, + "learning_rate": 4.417767106842737e-05, + "loss": 0.6527, + "step": 2920 + }, + { + "epoch": 3.73888, + "grad_norm": 0.6861347556114197, + "learning_rate": 4.417567026810724e-05, + "loss": 0.6983, + "step": 2921 + }, + { + "epoch": 3.74016, + "grad_norm": 0.6605308055877686, + "learning_rate": 4.4173669467787114e-05, + "loss": 0.6757, + "step": 2922 + }, + { + "epoch": 3.74144, + "grad_norm": 0.6555644869804382, + "learning_rate": 4.4171668667466985e-05, + "loss": 0.681, + "step": 2923 + }, + { + "epoch": 3.7427200000000003, + "grad_norm": 0.6855587363243103, + "learning_rate": 4.4169667867146864e-05, + "loss": 0.6807, + "step": 2924 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 0.7204174399375916, + "learning_rate": 4.4167667066826736e-05, + "loss": 0.7226, + "step": 2925 + }, + { + "epoch": 3.74528, + "grad_norm": 0.6882557272911072, + "learning_rate": 4.416566626650661e-05, + "loss": 0.7127, + "step": 2926 + }, + { + "epoch": 3.74656, + "grad_norm": 0.6996170878410339, + "learning_rate": 4.416366546618648e-05, + "loss": 0.6309, + "step": 2927 + }, + { + "epoch": 3.74784, + "grad_norm": 0.6991925239562988, + "learning_rate": 4.4161664665866345e-05, + "loss": 0.6571, + "step": 2928 + }, + { + "epoch": 3.74912, + "grad_norm": 0.7202059030532837, + "learning_rate": 4.4159663865546217e-05, + "loss": 0.6948, + "step": 2929 + }, + { + "epoch": 3.7504, + "grad_norm": 0.7513526678085327, + "learning_rate": 4.415766306522609e-05, + "loss": 0.7387, + "step": 2930 + }, + { + "epoch": 3.75168, + "grad_norm": 0.6940328478813171, + "learning_rate": 4.415566226490597e-05, + "loss": 0.7083, + "step": 2931 + }, + { + "epoch": 3.75296, + "grad_norm": 0.665745198726654, + "learning_rate": 
4.415366146458584e-05, + "loss": 0.6208, + "step": 2932 + }, + { + "epoch": 3.7542400000000002, + "grad_norm": 0.7266408801078796, + "learning_rate": 4.415166066426571e-05, + "loss": 0.6952, + "step": 2933 + }, + { + "epoch": 3.7555199999999997, + "grad_norm": 0.7092468738555908, + "learning_rate": 4.414965986394558e-05, + "loss": 0.6157, + "step": 2934 + }, + { + "epoch": 3.7568, + "grad_norm": 0.6691955327987671, + "learning_rate": 4.4147659063625454e-05, + "loss": 0.644, + "step": 2935 + }, + { + "epoch": 3.75808, + "grad_norm": 0.679655909538269, + "learning_rate": 4.414565826330532e-05, + "loss": 0.6441, + "step": 2936 + }, + { + "epoch": 3.75936, + "grad_norm": 0.7097312211990356, + "learning_rate": 4.414365746298519e-05, + "loss": 0.7353, + "step": 2937 + }, + { + "epoch": 3.76064, + "grad_norm": 0.730174720287323, + "learning_rate": 4.414165666266507e-05, + "loss": 0.6899, + "step": 2938 + }, + { + "epoch": 3.76192, + "grad_norm": 0.6965400576591492, + "learning_rate": 4.413965586234494e-05, + "loss": 0.654, + "step": 2939 + }, + { + "epoch": 3.7632, + "grad_norm": 0.7156361937522888, + "learning_rate": 4.4137655062024814e-05, + "loss": 0.6471, + "step": 2940 + }, + { + "epoch": 3.76448, + "grad_norm": 0.7233887910842896, + "learning_rate": 4.4135654261704686e-05, + "loss": 0.6766, + "step": 2941 + }, + { + "epoch": 3.76576, + "grad_norm": 0.708551287651062, + "learning_rate": 4.413365346138456e-05, + "loss": 0.6497, + "step": 2942 + }, + { + "epoch": 3.76704, + "grad_norm": 0.6958677768707275, + "learning_rate": 4.413165266106443e-05, + "loss": 0.6522, + "step": 2943 + }, + { + "epoch": 3.76832, + "grad_norm": 0.693276047706604, + "learning_rate": 4.4129651860744294e-05, + "loss": 0.6597, + "step": 2944 + }, + { + "epoch": 3.7696, + "grad_norm": 0.7062802910804749, + "learning_rate": 4.412765106042417e-05, + "loss": 0.6558, + "step": 2945 + }, + { + "epoch": 3.77088, + "grad_norm": 0.7109881639480591, + "learning_rate": 4.4125650260104045e-05, + "loss": 
0.6426, + "step": 2946 + }, + { + "epoch": 3.77216, + "grad_norm": 0.6618438959121704, + "learning_rate": 4.412364945978392e-05, + "loss": 0.6421, + "step": 2947 + }, + { + "epoch": 3.77344, + "grad_norm": 0.6410247683525085, + "learning_rate": 4.412164865946379e-05, + "loss": 0.644, + "step": 2948 + }, + { + "epoch": 3.77472, + "grad_norm": 0.705467939376831, + "learning_rate": 4.411964785914366e-05, + "loss": 0.6873, + "step": 2949 + }, + { + "epoch": 3.776, + "grad_norm": 0.6557707190513611, + "learning_rate": 4.411764705882353e-05, + "loss": 0.6677, + "step": 2950 + }, + { + "epoch": 3.77728, + "grad_norm": 0.6946511268615723, + "learning_rate": 4.4115646258503404e-05, + "loss": 0.6773, + "step": 2951 + }, + { + "epoch": 3.77856, + "grad_norm": 0.6707186102867126, + "learning_rate": 4.4113645458183276e-05, + "loss": 0.6772, + "step": 2952 + }, + { + "epoch": 3.77984, + "grad_norm": 0.6657763123512268, + "learning_rate": 4.411164465786315e-05, + "loss": 0.6664, + "step": 2953 + }, + { + "epoch": 3.78112, + "grad_norm": 0.6746175289154053, + "learning_rate": 4.410964385754302e-05, + "loss": 0.6286, + "step": 2954 + }, + { + "epoch": 3.7824, + "grad_norm": 0.6620097756385803, + "learning_rate": 4.410764305722289e-05, + "loss": 0.6133, + "step": 2955 + }, + { + "epoch": 3.78368, + "grad_norm": 0.6786773204803467, + "learning_rate": 4.4105642256902763e-05, + "loss": 0.6386, + "step": 2956 + }, + { + "epoch": 3.78496, + "grad_norm": 0.6965882182121277, + "learning_rate": 4.4103641456582635e-05, + "loss": 0.6586, + "step": 2957 + }, + { + "epoch": 3.7862400000000003, + "grad_norm": 0.7074950337409973, + "learning_rate": 4.410164065626251e-05, + "loss": 0.6971, + "step": 2958 + }, + { + "epoch": 3.7875199999999998, + "grad_norm": 0.6736770272254944, + "learning_rate": 4.409963985594238e-05, + "loss": 0.6395, + "step": 2959 + }, + { + "epoch": 3.7888, + "grad_norm": 0.736041784286499, + "learning_rate": 4.409763905562225e-05, + "loss": 0.6891, + "step": 2960 + }, + { + 
"epoch": 3.79008, + "grad_norm": 0.6859921813011169, + "learning_rate": 4.409563825530212e-05, + "loss": 0.6503, + "step": 2961 + }, + { + "epoch": 3.79136, + "grad_norm": 0.7309426069259644, + "learning_rate": 4.4093637454981995e-05, + "loss": 0.7584, + "step": 2962 + }, + { + "epoch": 3.79264, + "grad_norm": 0.6685876846313477, + "learning_rate": 4.4091636654661866e-05, + "loss": 0.6737, + "step": 2963 + }, + { + "epoch": 3.79392, + "grad_norm": 0.6798075437545776, + "learning_rate": 4.408963585434174e-05, + "loss": 0.6157, + "step": 2964 + }, + { + "epoch": 3.7952, + "grad_norm": 0.677910327911377, + "learning_rate": 4.408763505402161e-05, + "loss": 0.6221, + "step": 2965 + }, + { + "epoch": 3.79648, + "grad_norm": 0.7607006430625916, + "learning_rate": 4.408563425370149e-05, + "loss": 0.7309, + "step": 2966 + }, + { + "epoch": 3.7977600000000002, + "grad_norm": 0.6692076921463013, + "learning_rate": 4.4083633453381354e-05, + "loss": 0.6585, + "step": 2967 + }, + { + "epoch": 3.7990399999999998, + "grad_norm": 0.6770291328430176, + "learning_rate": 4.4081632653061226e-05, + "loss": 0.6702, + "step": 2968 + }, + { + "epoch": 3.80032, + "grad_norm": 0.7067806720733643, + "learning_rate": 4.40796318527411e-05, + "loss": 0.6986, + "step": 2969 + }, + { + "epoch": 3.8016, + "grad_norm": 0.6886568069458008, + "learning_rate": 4.407763105242097e-05, + "loss": 0.7016, + "step": 2970 + }, + { + "epoch": 3.80288, + "grad_norm": 0.6451705694198608, + "learning_rate": 4.407563025210084e-05, + "loss": 0.6291, + "step": 2971 + }, + { + "epoch": 3.80416, + "grad_norm": 0.663578450679779, + "learning_rate": 4.407362945178071e-05, + "loss": 0.6653, + "step": 2972 + }, + { + "epoch": 3.80544, + "grad_norm": 0.71304851770401, + "learning_rate": 4.4071628651460585e-05, + "loss": 0.6694, + "step": 2973 + }, + { + "epoch": 3.80672, + "grad_norm": 0.6625712513923645, + "learning_rate": 4.4069627851140464e-05, + "loss": 0.6296, + "step": 2974 + }, + { + "epoch": 3.808, + "grad_norm": 
0.6760493516921997, + "learning_rate": 4.406762705082033e-05, + "loss": 0.6438, + "step": 2975 + }, + { + "epoch": 3.80928, + "grad_norm": 0.6758971214294434, + "learning_rate": 4.40656262505002e-05, + "loss": 0.7146, + "step": 2976 + }, + { + "epoch": 3.8105599999999997, + "grad_norm": 0.6622211337089539, + "learning_rate": 4.406362545018007e-05, + "loss": 0.6595, + "step": 2977 + }, + { + "epoch": 3.81184, + "grad_norm": 0.6578704714775085, + "learning_rate": 4.4061624649859944e-05, + "loss": 0.6264, + "step": 2978 + }, + { + "epoch": 3.81312, + "grad_norm": 0.7323094606399536, + "learning_rate": 4.4059623849539816e-05, + "loss": 0.693, + "step": 2979 + }, + { + "epoch": 3.8144, + "grad_norm": 0.6730750799179077, + "learning_rate": 4.405762304921969e-05, + "loss": 0.6267, + "step": 2980 + }, + { + "epoch": 3.81568, + "grad_norm": 0.7026966214179993, + "learning_rate": 4.405562224889957e-05, + "loss": 0.6686, + "step": 2981 + }, + { + "epoch": 3.81696, + "grad_norm": 0.7159264087677002, + "learning_rate": 4.405362144857944e-05, + "loss": 0.6856, + "step": 2982 + }, + { + "epoch": 3.81824, + "grad_norm": 0.6785261034965515, + "learning_rate": 4.4051620648259304e-05, + "loss": 0.7115, + "step": 2983 + }, + { + "epoch": 3.81952, + "grad_norm": 0.6551268100738525, + "learning_rate": 4.4049619847939175e-05, + "loss": 0.6466, + "step": 2984 + }, + { + "epoch": 3.8208, + "grad_norm": 0.7004695534706116, + "learning_rate": 4.404761904761905e-05, + "loss": 0.6808, + "step": 2985 + }, + { + "epoch": 3.82208, + "grad_norm": 0.7341533303260803, + "learning_rate": 4.404561824729892e-05, + "loss": 0.7086, + "step": 2986 + }, + { + "epoch": 3.82336, + "grad_norm": 0.6831992268562317, + "learning_rate": 4.404361744697879e-05, + "loss": 0.6795, + "step": 2987 + }, + { + "epoch": 3.82464, + "grad_norm": 0.6428318023681641, + "learning_rate": 4.404161664665867e-05, + "loss": 0.6104, + "step": 2988 + }, + { + "epoch": 3.82592, + "grad_norm": 0.6942797899246216, + "learning_rate": 
4.403961584633854e-05, + "loss": 0.7077, + "step": 2989 + }, + { + "epoch": 3.8272, + "grad_norm": 0.6718285083770752, + "learning_rate": 4.403761504601841e-05, + "loss": 0.6673, + "step": 2990 + }, + { + "epoch": 3.82848, + "grad_norm": 0.669506311416626, + "learning_rate": 4.403561424569828e-05, + "loss": 0.6671, + "step": 2991 + }, + { + "epoch": 3.8297600000000003, + "grad_norm": 0.6954408884048462, + "learning_rate": 4.403361344537815e-05, + "loss": 0.66, + "step": 2992 + }, + { + "epoch": 3.83104, + "grad_norm": 0.6513415575027466, + "learning_rate": 4.403161264505802e-05, + "loss": 0.6519, + "step": 2993 + }, + { + "epoch": 3.83232, + "grad_norm": 0.6512577533721924, + "learning_rate": 4.4029611844737894e-05, + "loss": 0.6356, + "step": 2994 + }, + { + "epoch": 3.8336, + "grad_norm": 0.69619220495224, + "learning_rate": 4.402761104441777e-05, + "loss": 0.6911, + "step": 2995 + }, + { + "epoch": 3.83488, + "grad_norm": 0.6886332035064697, + "learning_rate": 4.4025610244097645e-05, + "loss": 0.7061, + "step": 2996 + }, + { + "epoch": 3.83616, + "grad_norm": 0.7018471360206604, + "learning_rate": 4.4023609443777516e-05, + "loss": 0.6589, + "step": 2997 + }, + { + "epoch": 3.83744, + "grad_norm": 0.6925556659698486, + "learning_rate": 4.402160864345739e-05, + "loss": 0.7039, + "step": 2998 + }, + { + "epoch": 3.83872, + "grad_norm": 0.7044040560722351, + "learning_rate": 4.401960784313725e-05, + "loss": 0.7379, + "step": 2999 + }, + { + "epoch": 3.84, + "grad_norm": 0.6619036197662354, + "learning_rate": 4.4017607042817125e-05, + "loss": 0.6497, + "step": 3000 + }, + { + "epoch": 3.8412800000000002, + "grad_norm": 0.677604615688324, + "learning_rate": 4.4015606242497e-05, + "loss": 0.6742, + "step": 3001 + }, + { + "epoch": 3.8425599999999998, + "grad_norm": 0.6716386079788208, + "learning_rate": 4.4013605442176876e-05, + "loss": 0.6129, + "step": 3002 + }, + { + "epoch": 3.84384, + "grad_norm": 0.6958229541778564, + "learning_rate": 4.401160464185675e-05, + 
"loss": 0.6419, + "step": 3003 + }, + { + "epoch": 3.84512, + "grad_norm": 0.6683501601219177, + "learning_rate": 4.400960384153662e-05, + "loss": 0.583, + "step": 3004 + }, + { + "epoch": 3.8464, + "grad_norm": 0.6913495063781738, + "learning_rate": 4.400760304121649e-05, + "loss": 0.6845, + "step": 3005 + }, + { + "epoch": 3.84768, + "grad_norm": 0.7633008360862732, + "learning_rate": 4.400560224089636e-05, + "loss": 0.7119, + "step": 3006 + }, + { + "epoch": 3.84896, + "grad_norm": 0.68817138671875, + "learning_rate": 4.400360144057623e-05, + "loss": 0.6531, + "step": 3007 + }, + { + "epoch": 3.85024, + "grad_norm": 0.6548225283622742, + "learning_rate": 4.40016006402561e-05, + "loss": 0.6511, + "step": 3008 + }, + { + "epoch": 3.85152, + "grad_norm": 0.7012094259262085, + "learning_rate": 4.399959983993598e-05, + "loss": 0.6826, + "step": 3009 + }, + { + "epoch": 3.8528000000000002, + "grad_norm": 0.7215191721916199, + "learning_rate": 4.399759903961585e-05, + "loss": 0.7206, + "step": 3010 + }, + { + "epoch": 3.8540799999999997, + "grad_norm": 0.7269889116287231, + "learning_rate": 4.399559823929572e-05, + "loss": 0.6886, + "step": 3011 + }, + { + "epoch": 3.85536, + "grad_norm": 0.6882228255271912, + "learning_rate": 4.3993597438975594e-05, + "loss": 0.6738, + "step": 3012 + }, + { + "epoch": 3.85664, + "grad_norm": 0.6564989686012268, + "learning_rate": 4.3991596638655466e-05, + "loss": 0.614, + "step": 3013 + }, + { + "epoch": 3.85792, + "grad_norm": 0.7501393556594849, + "learning_rate": 4.398959583833534e-05, + "loss": 0.6957, + "step": 3014 + }, + { + "epoch": 3.8592, + "grad_norm": 0.6881626844406128, + "learning_rate": 4.39875950380152e-05, + "loss": 0.6503, + "step": 3015 + }, + { + "epoch": 3.86048, + "grad_norm": 0.7230293154716492, + "learning_rate": 4.398559423769508e-05, + "loss": 0.6886, + "step": 3016 + }, + { + "epoch": 3.86176, + "grad_norm": 0.7025682330131531, + "learning_rate": 4.3983593437374954e-05, + "loss": 0.6752, + "step": 3017 + }, 
+ { + "epoch": 3.86304, + "grad_norm": 0.7032774686813354, + "learning_rate": 4.3981592637054825e-05, + "loss": 0.6771, + "step": 3018 + }, + { + "epoch": 3.86432, + "grad_norm": 0.6938534379005432, + "learning_rate": 4.39795918367347e-05, + "loss": 0.6688, + "step": 3019 + }, + { + "epoch": 3.8656, + "grad_norm": 0.6492636203765869, + "learning_rate": 4.397759103641457e-05, + "loss": 0.6702, + "step": 3020 + }, + { + "epoch": 3.86688, + "grad_norm": 0.698173463344574, + "learning_rate": 4.397559023609444e-05, + "loss": 0.616, + "step": 3021 + }, + { + "epoch": 3.86816, + "grad_norm": 0.7170678377151489, + "learning_rate": 4.397358943577431e-05, + "loss": 0.6048, + "step": 3022 + }, + { + "epoch": 3.86944, + "grad_norm": 0.6741195917129517, + "learning_rate": 4.3971588635454185e-05, + "loss": 0.6607, + "step": 3023 + }, + { + "epoch": 3.87072, + "grad_norm": 0.7016419172286987, + "learning_rate": 4.3969587835134056e-05, + "loss": 0.7229, + "step": 3024 + }, + { + "epoch": 3.872, + "grad_norm": 0.6970353126525879, + "learning_rate": 4.396758703481393e-05, + "loss": 0.6658, + "step": 3025 + }, + { + "epoch": 3.87328, + "grad_norm": 0.6936695575714111, + "learning_rate": 4.39655862344938e-05, + "loss": 0.7113, + "step": 3026 + }, + { + "epoch": 3.87456, + "grad_norm": 0.6938142776489258, + "learning_rate": 4.396358543417367e-05, + "loss": 0.6535, + "step": 3027 + }, + { + "epoch": 3.87584, + "grad_norm": 0.6950214505195618, + "learning_rate": 4.3961584633853544e-05, + "loss": 0.701, + "step": 3028 + }, + { + "epoch": 3.87712, + "grad_norm": 0.6649338603019714, + "learning_rate": 4.3959583833533416e-05, + "loss": 0.635, + "step": 3029 + }, + { + "epoch": 3.8784, + "grad_norm": 0.7024021744728088, + "learning_rate": 4.395758303321329e-05, + "loss": 0.7062, + "step": 3030 + }, + { + "epoch": 3.87968, + "grad_norm": 0.6412244439125061, + "learning_rate": 4.395558223289316e-05, + "loss": 0.6081, + "step": 3031 + }, + { + "epoch": 3.88096, + "grad_norm": 0.6740350127220154, 
+ "learning_rate": 4.395358143257303e-05, + "loss": 0.6404, + "step": 3032 + }, + { + "epoch": 3.88224, + "grad_norm": 0.647661566734314, + "learning_rate": 4.39515806322529e-05, + "loss": 0.6506, + "step": 3033 + }, + { + "epoch": 3.88352, + "grad_norm": 0.6819959878921509, + "learning_rate": 4.3949579831932775e-05, + "loss": 0.6258, + "step": 3034 + }, + { + "epoch": 3.8848000000000003, + "grad_norm": 0.7153730988502502, + "learning_rate": 4.394757903161265e-05, + "loss": 0.6579, + "step": 3035 + }, + { + "epoch": 3.8860799999999998, + "grad_norm": 0.6946179270744324, + "learning_rate": 4.394557823129252e-05, + "loss": 0.6526, + "step": 3036 + }, + { + "epoch": 3.88736, + "grad_norm": 0.6954408288002014, + "learning_rate": 4.394357743097239e-05, + "loss": 0.6596, + "step": 3037 + }, + { + "epoch": 3.88864, + "grad_norm": 0.6411071419715881, + "learning_rate": 4.394157663065226e-05, + "loss": 0.6556, + "step": 3038 + }, + { + "epoch": 3.88992, + "grad_norm": 0.6873469948768616, + "learning_rate": 4.3939575830332134e-05, + "loss": 0.6791, + "step": 3039 + }, + { + "epoch": 3.8912, + "grad_norm": 0.6711555123329163, + "learning_rate": 4.3937575030012006e-05, + "loss": 0.6624, + "step": 3040 + }, + { + "epoch": 3.89248, + "grad_norm": 0.6283385753631592, + "learning_rate": 4.393557422969188e-05, + "loss": 0.6152, + "step": 3041 + }, + { + "epoch": 3.89376, + "grad_norm": 0.6898487210273743, + "learning_rate": 4.393357342937175e-05, + "loss": 0.7011, + "step": 3042 + }, + { + "epoch": 3.89504, + "grad_norm": 0.6428130865097046, + "learning_rate": 4.393157262905162e-05, + "loss": 0.6157, + "step": 3043 + }, + { + "epoch": 3.8963200000000002, + "grad_norm": 0.6543678641319275, + "learning_rate": 4.39295718287315e-05, + "loss": 0.6196, + "step": 3044 + }, + { + "epoch": 3.8975999999999997, + "grad_norm": 0.6453118324279785, + "learning_rate": 4.3927571028411365e-05, + "loss": 0.6832, + "step": 3045 + }, + { + "epoch": 3.89888, + "grad_norm": 0.6822550892829895, + 
"learning_rate": 4.392557022809124e-05, + "loss": 0.6523, + "step": 3046 + }, + { + "epoch": 3.90016, + "grad_norm": 0.7187889814376831, + "learning_rate": 4.392356942777111e-05, + "loss": 0.7118, + "step": 3047 + }, + { + "epoch": 3.90144, + "grad_norm": 0.7157731056213379, + "learning_rate": 4.392156862745098e-05, + "loss": 0.7197, + "step": 3048 + }, + { + "epoch": 3.90272, + "grad_norm": 0.686944305896759, + "learning_rate": 4.391956782713085e-05, + "loss": 0.6051, + "step": 3049 + }, + { + "epoch": 3.904, + "grad_norm": 0.7068070769309998, + "learning_rate": 4.3917567026810725e-05, + "loss": 0.7256, + "step": 3050 + }, + { + "epoch": 3.90528, + "grad_norm": 0.7173423767089844, + "learning_rate": 4.3915566226490603e-05, + "loss": 0.6701, + "step": 3051 + }, + { + "epoch": 3.90656, + "grad_norm": 0.6758041977882385, + "learning_rate": 4.3913565426170475e-05, + "loss": 0.6383, + "step": 3052 + }, + { + "epoch": 3.90784, + "grad_norm": 0.6902790665626526, + "learning_rate": 4.391156462585034e-05, + "loss": 0.6533, + "step": 3053 + }, + { + "epoch": 3.90912, + "grad_norm": 0.70408034324646, + "learning_rate": 4.390956382553021e-05, + "loss": 0.6577, + "step": 3054 + }, + { + "epoch": 3.9104, + "grad_norm": 0.6818796396255493, + "learning_rate": 4.3907563025210084e-05, + "loss": 0.6786, + "step": 3055 + }, + { + "epoch": 3.91168, + "grad_norm": 0.6927520036697388, + "learning_rate": 4.3905562224889956e-05, + "loss": 0.6924, + "step": 3056 + }, + { + "epoch": 3.91296, + "grad_norm": 0.7096193432807922, + "learning_rate": 4.390356142456983e-05, + "loss": 0.6852, + "step": 3057 + }, + { + "epoch": 3.91424, + "grad_norm": 0.6638479828834534, + "learning_rate": 4.3901560624249706e-05, + "loss": 0.6388, + "step": 3058 + }, + { + "epoch": 3.91552, + "grad_norm": 0.7214940190315247, + "learning_rate": 4.389955982392958e-05, + "loss": 0.7158, + "step": 3059 + }, + { + "epoch": 3.9168, + "grad_norm": 0.681658148765564, + "learning_rate": 4.389755902360945e-05, + "loss": 
0.657, + "step": 3060 + }, + { + "epoch": 3.91808, + "grad_norm": 0.6954881548881531, + "learning_rate": 4.3895558223289315e-05, + "loss": 0.7083, + "step": 3061 + }, + { + "epoch": 3.91936, + "grad_norm": 0.681389331817627, + "learning_rate": 4.389355742296919e-05, + "loss": 0.6743, + "step": 3062 + }, + { + "epoch": 3.92064, + "grad_norm": 0.688248336315155, + "learning_rate": 4.389155662264906e-05, + "loss": 0.6632, + "step": 3063 + }, + { + "epoch": 3.92192, + "grad_norm": 0.7140293717384338, + "learning_rate": 4.388955582232893e-05, + "loss": 0.6964, + "step": 3064 + }, + { + "epoch": 3.9232, + "grad_norm": 0.692408561706543, + "learning_rate": 4.388755502200881e-05, + "loss": 0.6953, + "step": 3065 + }, + { + "epoch": 3.92448, + "grad_norm": 0.6348261833190918, + "learning_rate": 4.388555422168868e-05, + "loss": 0.6256, + "step": 3066 + }, + { + "epoch": 3.92576, + "grad_norm": 0.6396204233169556, + "learning_rate": 4.388355342136855e-05, + "loss": 0.6958, + "step": 3067 + }, + { + "epoch": 3.92704, + "grad_norm": 0.6573483347892761, + "learning_rate": 4.3881552621048425e-05, + "loss": 0.6314, + "step": 3068 + }, + { + "epoch": 3.9283200000000003, + "grad_norm": 0.6830142736434937, + "learning_rate": 4.387955182072829e-05, + "loss": 0.6347, + "step": 3069 + }, + { + "epoch": 3.9295999999999998, + "grad_norm": 0.6613914370536804, + "learning_rate": 4.387755102040816e-05, + "loss": 0.7057, + "step": 3070 + }, + { + "epoch": 3.93088, + "grad_norm": 0.7034798860549927, + "learning_rate": 4.3875550220088034e-05, + "loss": 0.6773, + "step": 3071 + }, + { + "epoch": 3.93216, + "grad_norm": 0.741765022277832, + "learning_rate": 4.387354941976791e-05, + "loss": 0.6941, + "step": 3072 + }, + { + "epoch": 3.93344, + "grad_norm": 0.7319201827049255, + "learning_rate": 4.3871548619447784e-05, + "loss": 0.7408, + "step": 3073 + }, + { + "epoch": 3.93472, + "grad_norm": 0.66302090883255, + "learning_rate": 4.3869547819127656e-05, + "loss": 0.624, + "step": 3074 + }, + { + 
"epoch": 3.936, + "grad_norm": 0.6886874437332153, + "learning_rate": 4.386754701880753e-05, + "loss": 0.6537, + "step": 3075 + }, + { + "epoch": 3.93728, + "grad_norm": 0.6793836951255798, + "learning_rate": 4.38655462184874e-05, + "loss": 0.6656, + "step": 3076 + }, + { + "epoch": 3.93856, + "grad_norm": 0.6942064762115479, + "learning_rate": 4.3863545418167265e-05, + "loss": 0.7116, + "step": 3077 + }, + { + "epoch": 3.9398400000000002, + "grad_norm": 0.6668751239776611, + "learning_rate": 4.386154461784714e-05, + "loss": 0.626, + "step": 3078 + }, + { + "epoch": 3.9411199999999997, + "grad_norm": 0.6711156964302063, + "learning_rate": 4.385954381752701e-05, + "loss": 0.6653, + "step": 3079 + }, + { + "epoch": 3.9424, + "grad_norm": 0.6742311716079712, + "learning_rate": 4.385754301720689e-05, + "loss": 0.7238, + "step": 3080 + }, + { + "epoch": 3.94368, + "grad_norm": 0.6560841202735901, + "learning_rate": 4.385554221688676e-05, + "loss": 0.6364, + "step": 3081 + }, + { + "epoch": 3.94496, + "grad_norm": 0.6376678347587585, + "learning_rate": 4.385354141656663e-05, + "loss": 0.6447, + "step": 3082 + }, + { + "epoch": 3.94624, + "grad_norm": 0.6691120862960815, + "learning_rate": 4.38515406162465e-05, + "loss": 0.6557, + "step": 3083 + }, + { + "epoch": 3.94752, + "grad_norm": 0.6742989420890808, + "learning_rate": 4.3849539815926375e-05, + "loss": 0.6029, + "step": 3084 + }, + { + "epoch": 3.9488, + "grad_norm": 0.7199971675872803, + "learning_rate": 4.384753901560624e-05, + "loss": 0.6881, + "step": 3085 + }, + { + "epoch": 3.95008, + "grad_norm": 0.7232701182365417, + "learning_rate": 4.384553821528611e-05, + "loss": 0.6548, + "step": 3086 + }, + { + "epoch": 3.95136, + "grad_norm": 0.7027313113212585, + "learning_rate": 4.384353741496599e-05, + "loss": 0.6444, + "step": 3087 + }, + { + "epoch": 3.95264, + "grad_norm": 0.6895410418510437, + "learning_rate": 4.384153661464586e-05, + "loss": 0.6935, + "step": 3088 + }, + { + "epoch": 3.95392, + "grad_norm": 
0.6641124486923218, + "learning_rate": 4.3839535814325734e-05, + "loss": 0.6709, + "step": 3089 + }, + { + "epoch": 3.9552, + "grad_norm": 0.6645619869232178, + "learning_rate": 4.3837535014005606e-05, + "loss": 0.6736, + "step": 3090 + }, + { + "epoch": 3.95648, + "grad_norm": 0.680464506149292, + "learning_rate": 4.383553421368548e-05, + "loss": 0.6601, + "step": 3091 + }, + { + "epoch": 3.95776, + "grad_norm": 0.6810153126716614, + "learning_rate": 4.383353341336535e-05, + "loss": 0.6463, + "step": 3092 + }, + { + "epoch": 3.95904, + "grad_norm": 0.6904779076576233, + "learning_rate": 4.3831532613045215e-05, + "loss": 0.6375, + "step": 3093 + }, + { + "epoch": 3.96032, + "grad_norm": 0.7193115949630737, + "learning_rate": 4.382953181272509e-05, + "loss": 0.6649, + "step": 3094 + }, + { + "epoch": 3.9616, + "grad_norm": 0.6670559644699097, + "learning_rate": 4.3827531012404965e-05, + "loss": 0.6505, + "step": 3095 + }, + { + "epoch": 3.96288, + "grad_norm": 0.707496166229248, + "learning_rate": 4.382553021208484e-05, + "loss": 0.6726, + "step": 3096 + }, + { + "epoch": 3.96416, + "grad_norm": 0.6769973635673523, + "learning_rate": 4.382352941176471e-05, + "loss": 0.6529, + "step": 3097 + }, + { + "epoch": 3.96544, + "grad_norm": 0.6613832712173462, + "learning_rate": 4.382152861144458e-05, + "loss": 0.6218, + "step": 3098 + }, + { + "epoch": 3.96672, + "grad_norm": 0.6404051184654236, + "learning_rate": 4.381952781112445e-05, + "loss": 0.6685, + "step": 3099 + }, + { + "epoch": 3.968, + "grad_norm": 0.7350611090660095, + "learning_rate": 4.3817527010804324e-05, + "loss": 0.7538, + "step": 3100 + }, + { + "epoch": 3.96928, + "grad_norm": 0.7339857220649719, + "learning_rate": 4.3815526210484196e-05, + "loss": 0.7104, + "step": 3101 + }, + { + "epoch": 3.97056, + "grad_norm": 0.6971475481987, + "learning_rate": 4.381352541016407e-05, + "loss": 0.6595, + "step": 3102 + }, + { + "epoch": 3.9718400000000003, + "grad_norm": 0.6795316934585571, + "learning_rate": 
4.381152460984394e-05, + "loss": 0.63, + "step": 3103 + }, + { + "epoch": 3.9731199999999998, + "grad_norm": 0.6590287685394287, + "learning_rate": 4.380952380952381e-05, + "loss": 0.6013, + "step": 3104 + }, + { + "epoch": 3.9744, + "grad_norm": 0.6492599248886108, + "learning_rate": 4.3807523009203684e-05, + "loss": 0.6626, + "step": 3105 + }, + { + "epoch": 3.97568, + "grad_norm": 0.7017719149589539, + "learning_rate": 4.3805522208883556e-05, + "loss": 0.6685, + "step": 3106 + }, + { + "epoch": 3.97696, + "grad_norm": 0.6492130160331726, + "learning_rate": 4.380352140856343e-05, + "loss": 0.6498, + "step": 3107 + }, + { + "epoch": 3.97824, + "grad_norm": 0.6498051881790161, + "learning_rate": 4.38015206082433e-05, + "loss": 0.6152, + "step": 3108 + }, + { + "epoch": 3.97952, + "grad_norm": 0.710146963596344, + "learning_rate": 4.379951980792317e-05, + "loss": 0.7109, + "step": 3109 + }, + { + "epoch": 3.9808, + "grad_norm": 0.6580629348754883, + "learning_rate": 4.379751900760304e-05, + "loss": 0.6464, + "step": 3110 + }, + { + "epoch": 3.98208, + "grad_norm": 0.671927809715271, + "learning_rate": 4.3795518207282915e-05, + "loss": 0.7003, + "step": 3111 + }, + { + "epoch": 3.9833600000000002, + "grad_norm": 0.64473956823349, + "learning_rate": 4.379351740696279e-05, + "loss": 0.6789, + "step": 3112 + }, + { + "epoch": 3.9846399999999997, + "grad_norm": 0.6603186130523682, + "learning_rate": 4.379151660664266e-05, + "loss": 0.6558, + "step": 3113 + }, + { + "epoch": 3.98592, + "grad_norm": 0.7090316414833069, + "learning_rate": 4.378951580632253e-05, + "loss": 0.6923, + "step": 3114 + }, + { + "epoch": 3.9872, + "grad_norm": 0.6954898238182068, + "learning_rate": 4.37875150060024e-05, + "loss": 0.6341, + "step": 3115 + }, + { + "epoch": 3.98848, + "grad_norm": 0.6501321196556091, + "learning_rate": 4.3785514205682274e-05, + "loss": 0.6481, + "step": 3116 + }, + { + "epoch": 3.98976, + "grad_norm": 0.6654294729232788, + "learning_rate": 4.3783513405362146e-05, + 
"loss": 0.6379, + "step": 3117 + }, + { + "epoch": 3.99104, + "grad_norm": 0.698321521282196, + "learning_rate": 4.378151260504202e-05, + "loss": 0.6881, + "step": 3118 + }, + { + "epoch": 3.99232, + "grad_norm": 0.7432014346122742, + "learning_rate": 4.377951180472189e-05, + "loss": 0.6836, + "step": 3119 + }, + { + "epoch": 3.9936, + "grad_norm": 0.6992458701133728, + "learning_rate": 4.377751100440176e-05, + "loss": 0.7221, + "step": 3120 + }, + { + "epoch": 3.99488, + "grad_norm": 0.6965485215187073, + "learning_rate": 4.377551020408163e-05, + "loss": 0.604, + "step": 3121 + }, + { + "epoch": 3.99616, + "grad_norm": 0.7211214900016785, + "learning_rate": 4.377350940376151e-05, + "loss": 0.7074, + "step": 3122 + }, + { + "epoch": 3.99744, + "grad_norm": 0.6553539633750916, + "learning_rate": 4.377150860344138e-05, + "loss": 0.6204, + "step": 3123 + }, + { + "epoch": 3.99872, + "grad_norm": 0.6686910390853882, + "learning_rate": 4.376950780312125e-05, + "loss": 0.7271, + "step": 3124 + }, + { + "epoch": 4.0, + "grad_norm": 1.5496490001678467, + "learning_rate": 4.376750700280112e-05, + "loss": 1.3281, + "step": 3125 + }, + { + "epoch": 4.00128, + "grad_norm": 0.6808484792709351, + "learning_rate": 4.376550620248099e-05, + "loss": 0.6388, + "step": 3126 + }, + { + "epoch": 4.00256, + "grad_norm": 0.6682015061378479, + "learning_rate": 4.3763505402160865e-05, + "loss": 0.6512, + "step": 3127 + }, + { + "epoch": 4.00384, + "grad_norm": 0.6814844608306885, + "learning_rate": 4.3761504601840736e-05, + "loss": 0.6496, + "step": 3128 + }, + { + "epoch": 4.00512, + "grad_norm": 0.698432981967926, + "learning_rate": 4.3759503801520615e-05, + "loss": 0.6272, + "step": 3129 + }, + { + "epoch": 4.0064, + "grad_norm": 0.6985964179039001, + "learning_rate": 4.375750300120049e-05, + "loss": 0.6447, + "step": 3130 + }, + { + "epoch": 4.00768, + "grad_norm": 0.648468554019928, + "learning_rate": 4.375550220088035e-05, + "loss": 0.6205, + "step": 3131 + }, + { + "epoch": 4.00896, 
+ "grad_norm": 0.694003164768219, + "learning_rate": 4.3753501400560224e-05, + "loss": 0.6613, + "step": 3132 + }, + { + "epoch": 4.01024, + "grad_norm": 0.7265833616256714, + "learning_rate": 4.3751500600240096e-05, + "loss": 0.623, + "step": 3133 + }, + { + "epoch": 4.01152, + "grad_norm": 0.7631153464317322, + "learning_rate": 4.374949979991997e-05, + "loss": 0.6963, + "step": 3134 + }, + { + "epoch": 4.0128, + "grad_norm": 0.7075992822647095, + "learning_rate": 4.374749899959984e-05, + "loss": 0.6754, + "step": 3135 + }, + { + "epoch": 4.01408, + "grad_norm": 0.7054523229598999, + "learning_rate": 4.374549819927972e-05, + "loss": 0.6145, + "step": 3136 + }, + { + "epoch": 4.01536, + "grad_norm": 0.6870067119598389, + "learning_rate": 4.374349739895959e-05, + "loss": 0.663, + "step": 3137 + }, + { + "epoch": 4.01664, + "grad_norm": 0.6565841436386108, + "learning_rate": 4.374149659863946e-05, + "loss": 0.6111, + "step": 3138 + }, + { + "epoch": 4.01792, + "grad_norm": 0.6983011960983276, + "learning_rate": 4.373949579831933e-05, + "loss": 0.6528, + "step": 3139 + }, + { + "epoch": 4.0192, + "grad_norm": 0.6741187572479248, + "learning_rate": 4.37374949979992e-05, + "loss": 0.6625, + "step": 3140 + }, + { + "epoch": 4.02048, + "grad_norm": 0.6667806506156921, + "learning_rate": 4.373549419767907e-05, + "loss": 0.629, + "step": 3141 + }, + { + "epoch": 4.0217600000000004, + "grad_norm": 0.6826735734939575, + "learning_rate": 4.373349339735894e-05, + "loss": 0.6538, + "step": 3142 + }, + { + "epoch": 4.02304, + "grad_norm": 0.7120842933654785, + "learning_rate": 4.373149259703882e-05, + "loss": 0.6368, + "step": 3143 + }, + { + "epoch": 4.02432, + "grad_norm": 0.6919832825660706, + "learning_rate": 4.372949179671869e-05, + "loss": 0.6451, + "step": 3144 + }, + { + "epoch": 4.0256, + "grad_norm": 0.7019147276878357, + "learning_rate": 4.3727490996398565e-05, + "loss": 0.6605, + "step": 3145 + }, + { + "epoch": 4.02688, + "grad_norm": 0.7638674378395081, + 
"learning_rate": 4.3725490196078437e-05, + "loss": 0.6817, + "step": 3146 + }, + { + "epoch": 4.02816, + "grad_norm": 0.7084218263626099, + "learning_rate": 4.37234893957583e-05, + "loss": 0.6572, + "step": 3147 + }, + { + "epoch": 4.02944, + "grad_norm": 0.6604458093643188, + "learning_rate": 4.3721488595438174e-05, + "loss": 0.6048, + "step": 3148 + }, + { + "epoch": 4.03072, + "grad_norm": 0.6742260456085205, + "learning_rate": 4.3719487795118045e-05, + "loss": 0.6481, + "step": 3149 + }, + { + "epoch": 4.032, + "grad_norm": 0.666289210319519, + "learning_rate": 4.3717486994797924e-05, + "loss": 0.6421, + "step": 3150 + }, + { + "epoch": 4.03328, + "grad_norm": 0.6697606444358826, + "learning_rate": 4.3715486194477796e-05, + "loss": 0.6479, + "step": 3151 + }, + { + "epoch": 4.03456, + "grad_norm": 0.7304966449737549, + "learning_rate": 4.371348539415767e-05, + "loss": 0.6792, + "step": 3152 + }, + { + "epoch": 4.03584, + "grad_norm": 0.6816575527191162, + "learning_rate": 4.371148459383754e-05, + "loss": 0.601, + "step": 3153 + }, + { + "epoch": 4.03712, + "grad_norm": 0.7284610271453857, + "learning_rate": 4.370948379351741e-05, + "loss": 0.64, + "step": 3154 + }, + { + "epoch": 4.0384, + "grad_norm": 0.7191203832626343, + "learning_rate": 4.3707482993197277e-05, + "loss": 0.64, + "step": 3155 + }, + { + "epoch": 4.03968, + "grad_norm": 0.7250388264656067, + "learning_rate": 4.370548219287715e-05, + "loss": 0.696, + "step": 3156 + }, + { + "epoch": 4.04096, + "grad_norm": 0.6809269785881042, + "learning_rate": 4.370348139255703e-05, + "loss": 0.6339, + "step": 3157 + }, + { + "epoch": 4.04224, + "grad_norm": 0.6679171323776245, + "learning_rate": 4.37014805922369e-05, + "loss": 0.6256, + "step": 3158 + }, + { + "epoch": 4.04352, + "grad_norm": 0.7094346880912781, + "learning_rate": 4.369947979191677e-05, + "loss": 0.6825, + "step": 3159 + }, + { + "epoch": 4.0448, + "grad_norm": 0.6463338732719421, + "learning_rate": 4.369747899159664e-05, + "loss": 0.6085, + 
"step": 3160 + }, + { + "epoch": 4.04608, + "grad_norm": 0.6958500146865845, + "learning_rate": 4.3695478191276514e-05, + "loss": 0.6654, + "step": 3161 + }, + { + "epoch": 4.04736, + "grad_norm": 0.7284070253372192, + "learning_rate": 4.3693477390956386e-05, + "loss": 0.6706, + "step": 3162 + }, + { + "epoch": 4.04864, + "grad_norm": 0.7025485038757324, + "learning_rate": 4.369147659063625e-05, + "loss": 0.677, + "step": 3163 + }, + { + "epoch": 4.04992, + "grad_norm": 0.7521453499794006, + "learning_rate": 4.368947579031613e-05, + "loss": 0.7294, + "step": 3164 + }, + { + "epoch": 4.0512, + "grad_norm": 0.6905098557472229, + "learning_rate": 4.3687474989996e-05, + "loss": 0.7215, + "step": 3165 + }, + { + "epoch": 4.05248, + "grad_norm": 0.7111669182777405, + "learning_rate": 4.3685474189675874e-05, + "loss": 0.6637, + "step": 3166 + }, + { + "epoch": 4.05376, + "grad_norm": 0.7028812766075134, + "learning_rate": 4.3683473389355746e-05, + "loss": 0.665, + "step": 3167 + }, + { + "epoch": 4.05504, + "grad_norm": 0.7170726656913757, + "learning_rate": 4.368147258903562e-05, + "loss": 0.6078, + "step": 3168 + }, + { + "epoch": 4.05632, + "grad_norm": 0.702850878238678, + "learning_rate": 4.367947178871549e-05, + "loss": 0.6381, + "step": 3169 + }, + { + "epoch": 4.0576, + "grad_norm": 0.7177186012268066, + "learning_rate": 4.367747098839536e-05, + "loss": 0.6339, + "step": 3170 + }, + { + "epoch": 4.05888, + "grad_norm": 0.7100415229797363, + "learning_rate": 4.367547018807523e-05, + "loss": 0.6966, + "step": 3171 + }, + { + "epoch": 4.06016, + "grad_norm": 0.6687876582145691, + "learning_rate": 4.3673469387755105e-05, + "loss": 0.6423, + "step": 3172 + }, + { + "epoch": 4.06144, + "grad_norm": 0.7261446714401245, + "learning_rate": 4.367146858743498e-05, + "loss": 0.6613, + "step": 3173 + }, + { + "epoch": 4.06272, + "grad_norm": 0.6871818900108337, + "learning_rate": 4.366946778711485e-05, + "loss": 0.6553, + "step": 3174 + }, + { + "epoch": 4.064, + "grad_norm": 
0.6970064640045166, + "learning_rate": 4.366746698679472e-05, + "loss": 0.6334, + "step": 3175 + }, + { + "epoch": 4.06528, + "grad_norm": 0.6694562435150146, + "learning_rate": 4.366546618647459e-05, + "loss": 0.5959, + "step": 3176 + }, + { + "epoch": 4.06656, + "grad_norm": 0.7168928384780884, + "learning_rate": 4.3663465386154464e-05, + "loss": 0.6292, + "step": 3177 + }, + { + "epoch": 4.06784, + "grad_norm": 0.6959748864173889, + "learning_rate": 4.3661464585834336e-05, + "loss": 0.6437, + "step": 3178 + }, + { + "epoch": 4.06912, + "grad_norm": 0.6844011545181274, + "learning_rate": 4.365946378551421e-05, + "loss": 0.6478, + "step": 3179 + }, + { + "epoch": 4.0704, + "grad_norm": 0.7261403799057007, + "learning_rate": 4.365746298519408e-05, + "loss": 0.7068, + "step": 3180 + }, + { + "epoch": 4.07168, + "grad_norm": 0.7560855746269226, + "learning_rate": 4.365546218487395e-05, + "loss": 0.6783, + "step": 3181 + }, + { + "epoch": 4.07296, + "grad_norm": 0.6811922788619995, + "learning_rate": 4.3653461384553823e-05, + "loss": 0.6544, + "step": 3182 + }, + { + "epoch": 4.07424, + "grad_norm": 0.7000817060470581, + "learning_rate": 4.3651460584233695e-05, + "loss": 0.7211, + "step": 3183 + }, + { + "epoch": 4.07552, + "grad_norm": 0.7141358852386475, + "learning_rate": 4.364945978391357e-05, + "loss": 0.6627, + "step": 3184 + }, + { + "epoch": 4.0768, + "grad_norm": 0.7260948419570923, + "learning_rate": 4.364745898359344e-05, + "loss": 0.6885, + "step": 3185 + }, + { + "epoch": 4.07808, + "grad_norm": 0.7163599133491516, + "learning_rate": 4.364545818327331e-05, + "loss": 0.6581, + "step": 3186 + }, + { + "epoch": 4.07936, + "grad_norm": 0.6844797730445862, + "learning_rate": 4.364345738295318e-05, + "loss": 0.6654, + "step": 3187 + }, + { + "epoch": 4.08064, + "grad_norm": 0.6640191078186035, + "learning_rate": 4.3641456582633055e-05, + "loss": 0.5865, + "step": 3188 + }, + { + "epoch": 4.08192, + "grad_norm": 0.6174594163894653, + "learning_rate": 
4.3639455782312926e-05, + "loss": 0.595, + "step": 3189 + }, + { + "epoch": 4.0832, + "grad_norm": 0.6691709160804749, + "learning_rate": 4.36374549819928e-05, + "loss": 0.6298, + "step": 3190 + }, + { + "epoch": 4.08448, + "grad_norm": 0.7091382741928101, + "learning_rate": 4.363545418167267e-05, + "loss": 0.6369, + "step": 3191 + }, + { + "epoch": 4.08576, + "grad_norm": 0.6721692681312561, + "learning_rate": 4.363345338135254e-05, + "loss": 0.6149, + "step": 3192 + }, + { + "epoch": 4.08704, + "grad_norm": 0.7344068288803101, + "learning_rate": 4.3631452581032414e-05, + "loss": 0.6966, + "step": 3193 + }, + { + "epoch": 4.08832, + "grad_norm": 0.7092353105545044, + "learning_rate": 4.3629451780712286e-05, + "loss": 0.6499, + "step": 3194 + }, + { + "epoch": 4.0896, + "grad_norm": 0.7213243246078491, + "learning_rate": 4.362745098039216e-05, + "loss": 0.6762, + "step": 3195 + }, + { + "epoch": 4.09088, + "grad_norm": 0.734781801700592, + "learning_rate": 4.362545018007203e-05, + "loss": 0.7091, + "step": 3196 + }, + { + "epoch": 4.09216, + "grad_norm": 0.7117027044296265, + "learning_rate": 4.36234493797519e-05, + "loss": 0.6111, + "step": 3197 + }, + { + "epoch": 4.09344, + "grad_norm": 0.7281467914581299, + "learning_rate": 4.362144857943177e-05, + "loss": 0.6682, + "step": 3198 + }, + { + "epoch": 4.09472, + "grad_norm": 0.7199225425720215, + "learning_rate": 4.3619447779111645e-05, + "loss": 0.6847, + "step": 3199 + }, + { + "epoch": 4.096, + "grad_norm": 0.6403080224990845, + "learning_rate": 4.3617446978791524e-05, + "loss": 0.6134, + "step": 3200 + }, + { + "epoch": 4.09728, + "grad_norm": 0.7282823920249939, + "learning_rate": 4.361544617847139e-05, + "loss": 0.6369, + "step": 3201 + }, + { + "epoch": 4.09856, + "grad_norm": 0.673549234867096, + "learning_rate": 4.361344537815126e-05, + "loss": 0.6577, + "step": 3202 + }, + { + "epoch": 4.09984, + "grad_norm": 0.717682421207428, + "learning_rate": 4.361144457783113e-05, + "loss": 0.6413, + "step": 3203 + 
}, + { + "epoch": 4.10112, + "grad_norm": 0.6735298037528992, + "learning_rate": 4.3609443777511004e-05, + "loss": 0.6404, + "step": 3204 + }, + { + "epoch": 4.1024, + "grad_norm": 0.715994656085968, + "learning_rate": 4.3607442977190876e-05, + "loss": 0.677, + "step": 3205 + }, + { + "epoch": 4.10368, + "grad_norm": 0.7662949562072754, + "learning_rate": 4.360544217687075e-05, + "loss": 0.7697, + "step": 3206 + }, + { + "epoch": 4.10496, + "grad_norm": 0.7299420833587646, + "learning_rate": 4.360344137655063e-05, + "loss": 0.688, + "step": 3207 + }, + { + "epoch": 4.10624, + "grad_norm": 0.7071677446365356, + "learning_rate": 4.36014405762305e-05, + "loss": 0.6872, + "step": 3208 + }, + { + "epoch": 4.10752, + "grad_norm": 0.7498626708984375, + "learning_rate": 4.3599439775910364e-05, + "loss": 0.647, + "step": 3209 + }, + { + "epoch": 4.1088, + "grad_norm": 0.6997010707855225, + "learning_rate": 4.3597438975590235e-05, + "loss": 0.5932, + "step": 3210 + }, + { + "epoch": 4.11008, + "grad_norm": 0.6870605945587158, + "learning_rate": 4.359543817527011e-05, + "loss": 0.6438, + "step": 3211 + }, + { + "epoch": 4.11136, + "grad_norm": 0.6903749704360962, + "learning_rate": 4.359343737494998e-05, + "loss": 0.6307, + "step": 3212 + }, + { + "epoch": 4.11264, + "grad_norm": 0.7005348801612854, + "learning_rate": 4.359143657462985e-05, + "loss": 0.6214, + "step": 3213 + }, + { + "epoch": 4.11392, + "grad_norm": 0.6793337464332581, + "learning_rate": 4.358943577430973e-05, + "loss": 0.6668, + "step": 3214 + }, + { + "epoch": 4.1152, + "grad_norm": 0.7198171019554138, + "learning_rate": 4.35874349739896e-05, + "loss": 0.6384, + "step": 3215 + }, + { + "epoch": 4.11648, + "grad_norm": 0.7152752876281738, + "learning_rate": 4.358543417366947e-05, + "loss": 0.6446, + "step": 3216 + }, + { + "epoch": 4.11776, + "grad_norm": 0.7162675857543945, + "learning_rate": 4.358343337334934e-05, + "loss": 0.6621, + "step": 3217 + }, + { + "epoch": 4.11904, + "grad_norm": 
0.72551029920578, + "learning_rate": 4.358143257302921e-05, + "loss": 0.6708, + "step": 3218 + }, + { + "epoch": 4.12032, + "grad_norm": 0.7436588406562805, + "learning_rate": 4.357943177270908e-05, + "loss": 0.6543, + "step": 3219 + }, + { + "epoch": 4.1216, + "grad_norm": 0.7259624600410461, + "learning_rate": 4.3577430972388954e-05, + "loss": 0.6642, + "step": 3220 + }, + { + "epoch": 4.12288, + "grad_norm": 0.6752135157585144, + "learning_rate": 4.357543017206883e-05, + "loss": 0.6495, + "step": 3221 + }, + { + "epoch": 4.12416, + "grad_norm": 0.698948323726654, + "learning_rate": 4.3573429371748704e-05, + "loss": 0.703, + "step": 3222 + }, + { + "epoch": 4.12544, + "grad_norm": 0.7195350527763367, + "learning_rate": 4.3571428571428576e-05, + "loss": 0.6924, + "step": 3223 + }, + { + "epoch": 4.12672, + "grad_norm": 0.6823775172233582, + "learning_rate": 4.356942777110845e-05, + "loss": 0.6378, + "step": 3224 + }, + { + "epoch": 4.128, + "grad_norm": 0.719937801361084, + "learning_rate": 4.356742697078831e-05, + "loss": 0.6959, + "step": 3225 + }, + { + "epoch": 4.12928, + "grad_norm": 0.6940419673919678, + "learning_rate": 4.3565426170468185e-05, + "loss": 0.6788, + "step": 3226 + }, + { + "epoch": 4.13056, + "grad_norm": 0.7241349220275879, + "learning_rate": 4.356342537014806e-05, + "loss": 0.6565, + "step": 3227 + }, + { + "epoch": 4.13184, + "grad_norm": 0.7489647269248962, + "learning_rate": 4.3561424569827936e-05, + "loss": 0.6415, + "step": 3228 + }, + { + "epoch": 4.13312, + "grad_norm": 0.7111489176750183, + "learning_rate": 4.355942376950781e-05, + "loss": 0.6258, + "step": 3229 + }, + { + "epoch": 4.1344, + "grad_norm": 0.6695640087127686, + "learning_rate": 4.355742296918768e-05, + "loss": 0.612, + "step": 3230 + }, + { + "epoch": 4.13568, + "grad_norm": 0.6810483336448669, + "learning_rate": 4.355542216886755e-05, + "loss": 0.6202, + "step": 3231 + }, + { + "epoch": 4.13696, + "grad_norm": 0.7654765844345093, + "learning_rate": 
4.355342136854742e-05, + "loss": 0.6865, + "step": 3232 + }, + { + "epoch": 4.13824, + "grad_norm": 0.7125704288482666, + "learning_rate": 4.355142056822729e-05, + "loss": 0.6864, + "step": 3233 + }, + { + "epoch": 4.13952, + "grad_norm": 0.7397105097770691, + "learning_rate": 4.354941976790716e-05, + "loss": 0.6494, + "step": 3234 + }, + { + "epoch": 4.1408, + "grad_norm": 0.6764322519302368, + "learning_rate": 4.354741896758704e-05, + "loss": 0.6203, + "step": 3235 + }, + { + "epoch": 4.14208, + "grad_norm": 0.6737890839576721, + "learning_rate": 4.354541816726691e-05, + "loss": 0.6531, + "step": 3236 + }, + { + "epoch": 4.14336, + "grad_norm": 0.6234689950942993, + "learning_rate": 4.354341736694678e-05, + "loss": 0.5897, + "step": 3237 + }, + { + "epoch": 4.14464, + "grad_norm": 0.7278350591659546, + "learning_rate": 4.3541416566626654e-05, + "loss": 0.7032, + "step": 3238 + }, + { + "epoch": 4.14592, + "grad_norm": 0.645071804523468, + "learning_rate": 4.3539415766306526e-05, + "loss": 0.6347, + "step": 3239 + }, + { + "epoch": 4.1472, + "grad_norm": 0.6949378848075867, + "learning_rate": 4.35374149659864e-05, + "loss": 0.624, + "step": 3240 + }, + { + "epoch": 4.14848, + "grad_norm": 0.6974506974220276, + "learning_rate": 4.353541416566626e-05, + "loss": 0.6416, + "step": 3241 + }, + { + "epoch": 4.14976, + "grad_norm": 0.7012295722961426, + "learning_rate": 4.353341336534614e-05, + "loss": 0.6061, + "step": 3242 + }, + { + "epoch": 4.15104, + "grad_norm": 0.6732747554779053, + "learning_rate": 4.3531412565026013e-05, + "loss": 0.6482, + "step": 3243 + }, + { + "epoch": 4.15232, + "grad_norm": 0.6656915545463562, + "learning_rate": 4.3529411764705885e-05, + "loss": 0.6304, + "step": 3244 + }, + { + "epoch": 4.1536, + "grad_norm": 0.6815019845962524, + "learning_rate": 4.352741096438576e-05, + "loss": 0.5652, + "step": 3245 + }, + { + "epoch": 4.15488, + "grad_norm": 0.739524245262146, + "learning_rate": 4.352541016406563e-05, + "loss": 0.6389, + "step": 3246 
+ }, + { + "epoch": 4.15616, + "grad_norm": 0.641751766204834, + "learning_rate": 4.35234093637455e-05, + "loss": 0.6259, + "step": 3247 + }, + { + "epoch": 4.15744, + "grad_norm": 0.6507452130317688, + "learning_rate": 4.352140856342537e-05, + "loss": 0.5613, + "step": 3248 + }, + { + "epoch": 4.15872, + "grad_norm": 0.7066096663475037, + "learning_rate": 4.3519407763105245e-05, + "loss": 0.5779, + "step": 3249 + }, + { + "epoch": 4.16, + "grad_norm": 0.6899321675300598, + "learning_rate": 4.3517406962785116e-05, + "loss": 0.6098, + "step": 3250 + }, + { + "epoch": 4.16128, + "grad_norm": 0.7460212707519531, + "learning_rate": 4.351540616246499e-05, + "loss": 0.6969, + "step": 3251 + }, + { + "epoch": 4.16256, + "grad_norm": 0.6735222339630127, + "learning_rate": 4.351340536214486e-05, + "loss": 0.5961, + "step": 3252 + }, + { + "epoch": 4.16384, + "grad_norm": 0.668106198310852, + "learning_rate": 4.351140456182473e-05, + "loss": 0.588, + "step": 3253 + }, + { + "epoch": 4.16512, + "grad_norm": 0.7419319748878479, + "learning_rate": 4.3509403761504604e-05, + "loss": 0.6707, + "step": 3254 + }, + { + "epoch": 4.1664, + "grad_norm": 0.7336167693138123, + "learning_rate": 4.3507402961184476e-05, + "loss": 0.6756, + "step": 3255 + }, + { + "epoch": 4.16768, + "grad_norm": 0.7352126240730286, + "learning_rate": 4.350540216086435e-05, + "loss": 0.6582, + "step": 3256 + }, + { + "epoch": 4.16896, + "grad_norm": 0.6964594721794128, + "learning_rate": 4.350340136054422e-05, + "loss": 0.6481, + "step": 3257 + }, + { + "epoch": 4.17024, + "grad_norm": 0.6809775233268738, + "learning_rate": 4.350140056022409e-05, + "loss": 0.6216, + "step": 3258 + }, + { + "epoch": 4.17152, + "grad_norm": 0.7008639574050903, + "learning_rate": 4.349939975990396e-05, + "loss": 0.6148, + "step": 3259 + }, + { + "epoch": 4.1728, + "grad_norm": 0.7400866150856018, + "learning_rate": 4.3497398959583835e-05, + "loss": 0.6512, + "step": 3260 + }, + { + "epoch": 4.17408, + "grad_norm": 
0.6811804175376892, + "learning_rate": 4.349539815926371e-05, + "loss": 0.6134, + "step": 3261 + }, + { + "epoch": 4.17536, + "grad_norm": 0.6632633805274963, + "learning_rate": 4.349339735894358e-05, + "loss": 0.6315, + "step": 3262 + }, + { + "epoch": 4.17664, + "grad_norm": 0.6872732043266296, + "learning_rate": 4.349139655862345e-05, + "loss": 0.6406, + "step": 3263 + }, + { + "epoch": 4.17792, + "grad_norm": 0.7408565282821655, + "learning_rate": 4.348939575830332e-05, + "loss": 0.6544, + "step": 3264 + }, + { + "epoch": 4.1792, + "grad_norm": 0.7121074199676514, + "learning_rate": 4.3487394957983194e-05, + "loss": 0.6336, + "step": 3265 + }, + { + "epoch": 4.18048, + "grad_norm": 0.7045162916183472, + "learning_rate": 4.3485394157663066e-05, + "loss": 0.7007, + "step": 3266 + }, + { + "epoch": 4.18176, + "grad_norm": 0.7155641317367554, + "learning_rate": 4.348339335734294e-05, + "loss": 0.7133, + "step": 3267 + }, + { + "epoch": 4.18304, + "grad_norm": 0.7041541934013367, + "learning_rate": 4.348139255702281e-05, + "loss": 0.6453, + "step": 3268 + }, + { + "epoch": 4.18432, + "grad_norm": 0.7294322848320007, + "learning_rate": 4.347939175670268e-05, + "loss": 0.6818, + "step": 3269 + }, + { + "epoch": 4.1856, + "grad_norm": 0.6842989325523376, + "learning_rate": 4.347739095638256e-05, + "loss": 0.654, + "step": 3270 + }, + { + "epoch": 4.18688, + "grad_norm": 0.7636930346488953, + "learning_rate": 4.3475390156062425e-05, + "loss": 0.6433, + "step": 3271 + }, + { + "epoch": 4.18816, + "grad_norm": 0.7250775694847107, + "learning_rate": 4.34733893557423e-05, + "loss": 0.6537, + "step": 3272 + }, + { + "epoch": 4.18944, + "grad_norm": 0.7232015132904053, + "learning_rate": 4.347138855542217e-05, + "loss": 0.6162, + "step": 3273 + }, + { + "epoch": 4.19072, + "grad_norm": 0.7661274671554565, + "learning_rate": 4.346938775510204e-05, + "loss": 0.6299, + "step": 3274 + }, + { + "epoch": 4.192, + "grad_norm": 0.6594750285148621, + "learning_rate": 
4.346738695478191e-05, + "loss": 0.6141, + "step": 3275 + }, + { + "epoch": 4.19328, + "grad_norm": 0.7808811664581299, + "learning_rate": 4.3465386154461785e-05, + "loss": 0.7105, + "step": 3276 + }, + { + "epoch": 4.19456, + "grad_norm": 0.6678184866905212, + "learning_rate": 4.346338535414166e-05, + "loss": 0.618, + "step": 3277 + }, + { + "epoch": 4.19584, + "grad_norm": 0.7320860624313354, + "learning_rate": 4.3461384553821535e-05, + "loss": 0.6352, + "step": 3278 + }, + { + "epoch": 4.19712, + "grad_norm": 0.7203635573387146, + "learning_rate": 4.34593837535014e-05, + "loss": 0.6398, + "step": 3279 + }, + { + "epoch": 4.1984, + "grad_norm": 0.714809238910675, + "learning_rate": 4.345738295318127e-05, + "loss": 0.6567, + "step": 3280 + }, + { + "epoch": 4.19968, + "grad_norm": 0.6897768378257751, + "learning_rate": 4.3455382152861144e-05, + "loss": 0.6413, + "step": 3281 + }, + { + "epoch": 4.20096, + "grad_norm": 0.7241793274879456, + "learning_rate": 4.3453381352541016e-05, + "loss": 0.6331, + "step": 3282 + }, + { + "epoch": 4.20224, + "grad_norm": 0.6980943083763123, + "learning_rate": 4.345138055222089e-05, + "loss": 0.6244, + "step": 3283 + }, + { + "epoch": 4.20352, + "grad_norm": 0.7093860507011414, + "learning_rate": 4.3449379751900766e-05, + "loss": 0.6415, + "step": 3284 + }, + { + "epoch": 4.2048, + "grad_norm": 0.7445345520973206, + "learning_rate": 4.344737895158064e-05, + "loss": 0.6459, + "step": 3285 + }, + { + "epoch": 4.20608, + "grad_norm": 0.6987217664718628, + "learning_rate": 4.344537815126051e-05, + "loss": 0.6554, + "step": 3286 + }, + { + "epoch": 4.2073599999999995, + "grad_norm": 0.7271133065223694, + "learning_rate": 4.3443377350940375e-05, + "loss": 0.6412, + "step": 3287 + }, + { + "epoch": 4.20864, + "grad_norm": 0.7157135605812073, + "learning_rate": 4.344137655062025e-05, + "loss": 0.6508, + "step": 3288 + }, + { + "epoch": 4.20992, + "grad_norm": 0.666031539440155, + "learning_rate": 4.343937575030012e-05, + "loss": 0.6324, + 
"step": 3289 + }, + { + "epoch": 4.2112, + "grad_norm": 0.7204000353813171, + "learning_rate": 4.343737494997999e-05, + "loss": 0.6359, + "step": 3290 + }, + { + "epoch": 4.21248, + "grad_norm": 0.6386025547981262, + "learning_rate": 4.343537414965987e-05, + "loss": 0.5998, + "step": 3291 + }, + { + "epoch": 4.21376, + "grad_norm": 0.7287147641181946, + "learning_rate": 4.343337334933974e-05, + "loss": 0.6929, + "step": 3292 + }, + { + "epoch": 4.21504, + "grad_norm": 0.7094889283180237, + "learning_rate": 4.343137254901961e-05, + "loss": 0.6417, + "step": 3293 + }, + { + "epoch": 4.21632, + "grad_norm": 0.6859930753707886, + "learning_rate": 4.3429371748699485e-05, + "loss": 0.6024, + "step": 3294 + }, + { + "epoch": 4.2176, + "grad_norm": 0.6752464771270752, + "learning_rate": 4.342737094837935e-05, + "loss": 0.6713, + "step": 3295 + }, + { + "epoch": 4.21888, + "grad_norm": 0.6904641389846802, + "learning_rate": 4.342537014805922e-05, + "loss": 0.636, + "step": 3296 + }, + { + "epoch": 4.22016, + "grad_norm": 0.6931844353675842, + "learning_rate": 4.3423369347739094e-05, + "loss": 0.6189, + "step": 3297 + }, + { + "epoch": 4.22144, + "grad_norm": 0.6663493514060974, + "learning_rate": 4.342136854741897e-05, + "loss": 0.5673, + "step": 3298 + }, + { + "epoch": 4.22272, + "grad_norm": 0.7185217142105103, + "learning_rate": 4.3419367747098844e-05, + "loss": 0.6528, + "step": 3299 + }, + { + "epoch": 4.224, + "grad_norm": 0.7160610556602478, + "learning_rate": 4.3417366946778716e-05, + "loss": 0.6577, + "step": 3300 + }, + { + "epoch": 4.22528, + "grad_norm": 0.6835800409317017, + "learning_rate": 4.341536614645859e-05, + "loss": 0.6558, + "step": 3301 + }, + { + "epoch": 4.22656, + "grad_norm": 0.7105692028999329, + "learning_rate": 4.341336534613846e-05, + "loss": 0.6875, + "step": 3302 + }, + { + "epoch": 4.22784, + "grad_norm": 0.7384511232376099, + "learning_rate": 4.3411364545818325e-05, + "loss": 0.6484, + "step": 3303 + }, + { + "epoch": 4.22912, + 
"grad_norm": 0.7124598622322083, + "learning_rate": 4.34093637454982e-05, + "loss": 0.6571, + "step": 3304 + }, + { + "epoch": 4.2304, + "grad_norm": 0.718474268913269, + "learning_rate": 4.340736294517807e-05, + "loss": 0.6457, + "step": 3305 + }, + { + "epoch": 4.23168, + "grad_norm": 0.6663171648979187, + "learning_rate": 4.340536214485795e-05, + "loss": 0.592, + "step": 3306 + }, + { + "epoch": 4.23296, + "grad_norm": 0.6899667382240295, + "learning_rate": 4.340336134453782e-05, + "loss": 0.6845, + "step": 3307 + }, + { + "epoch": 4.23424, + "grad_norm": 0.7246834635734558, + "learning_rate": 4.340136054421769e-05, + "loss": 0.6739, + "step": 3308 + }, + { + "epoch": 4.23552, + "grad_norm": 0.7182645797729492, + "learning_rate": 4.339935974389756e-05, + "loss": 0.6249, + "step": 3309 + }, + { + "epoch": 4.2368, + "grad_norm": 0.706695020198822, + "learning_rate": 4.3397358943577435e-05, + "loss": 0.6775, + "step": 3310 + }, + { + "epoch": 4.23808, + "grad_norm": 0.7705980539321899, + "learning_rate": 4.33953581432573e-05, + "loss": 0.7309, + "step": 3311 + }, + { + "epoch": 4.23936, + "grad_norm": 0.7092674970626831, + "learning_rate": 4.339335734293717e-05, + "loss": 0.6438, + "step": 3312 + }, + { + "epoch": 4.24064, + "grad_norm": 0.7281712889671326, + "learning_rate": 4.339135654261705e-05, + "loss": 0.6273, + "step": 3313 + }, + { + "epoch": 4.24192, + "grad_norm": 0.7132616639137268, + "learning_rate": 4.338935574229692e-05, + "loss": 0.6189, + "step": 3314 + }, + { + "epoch": 4.2432, + "grad_norm": 0.707158625125885, + "learning_rate": 4.3387354941976794e-05, + "loss": 0.6604, + "step": 3315 + }, + { + "epoch": 4.24448, + "grad_norm": 0.7403045892715454, + "learning_rate": 4.3385354141656666e-05, + "loss": 0.6983, + "step": 3316 + }, + { + "epoch": 4.24576, + "grad_norm": 0.7587504982948303, + "learning_rate": 4.338335334133654e-05, + "loss": 0.6572, + "step": 3317 + }, + { + "epoch": 4.24704, + "grad_norm": 0.682956337928772, + "learning_rate": 
4.338135254101641e-05, + "loss": 0.6519, + "step": 3318 + }, + { + "epoch": 4.24832, + "grad_norm": 0.6858289241790771, + "learning_rate": 4.3379351740696275e-05, + "loss": 0.6171, + "step": 3319 + }, + { + "epoch": 4.2496, + "grad_norm": 0.7262416481971741, + "learning_rate": 4.337735094037615e-05, + "loss": 0.6585, + "step": 3320 + }, + { + "epoch": 4.25088, + "grad_norm": 0.6875229477882385, + "learning_rate": 4.3375350140056025e-05, + "loss": 0.6579, + "step": 3321 + }, + { + "epoch": 4.25216, + "grad_norm": 0.7535980343818665, + "learning_rate": 4.33733493397359e-05, + "loss": 0.7405, + "step": 3322 + }, + { + "epoch": 4.25344, + "grad_norm": 0.6897569298744202, + "learning_rate": 4.337134853941577e-05, + "loss": 0.6639, + "step": 3323 + }, + { + "epoch": 4.25472, + "grad_norm": 0.7273359298706055, + "learning_rate": 4.336934773909564e-05, + "loss": 0.6645, + "step": 3324 + }, + { + "epoch": 4.256, + "grad_norm": 0.7021262645721436, + "learning_rate": 4.336734693877551e-05, + "loss": 0.5955, + "step": 3325 + }, + { + "epoch": 4.25728, + "grad_norm": 0.6609035134315491, + "learning_rate": 4.3365346138455384e-05, + "loss": 0.6416, + "step": 3326 + }, + { + "epoch": 4.25856, + "grad_norm": 0.8022797703742981, + "learning_rate": 4.3363345338135256e-05, + "loss": 0.6363, + "step": 3327 + }, + { + "epoch": 4.25984, + "grad_norm": 0.7287960648536682, + "learning_rate": 4.336134453781513e-05, + "loss": 0.6815, + "step": 3328 + }, + { + "epoch": 4.26112, + "grad_norm": 0.6661964654922485, + "learning_rate": 4.3359343737495e-05, + "loss": 0.6532, + "step": 3329 + }, + { + "epoch": 4.2624, + "grad_norm": 0.7338867783546448, + "learning_rate": 4.335734293717487e-05, + "loss": 0.7156, + "step": 3330 + }, + { + "epoch": 4.26368, + "grad_norm": 0.7184217572212219, + "learning_rate": 4.3355342136854744e-05, + "loss": 0.6892, + "step": 3331 + }, + { + "epoch": 4.26496, + "grad_norm": 0.715719997882843, + "learning_rate": 4.3353341336534615e-05, + "loss": 0.6508, + "step": 3332 
+ }, + { + "epoch": 4.26624, + "grad_norm": 0.6484602689743042, + "learning_rate": 4.335134053621449e-05, + "loss": 0.6111, + "step": 3333 + }, + { + "epoch": 4.26752, + "grad_norm": 0.6997168064117432, + "learning_rate": 4.334933973589436e-05, + "loss": 0.6312, + "step": 3334 + }, + { + "epoch": 4.2688, + "grad_norm": 0.7127143740653992, + "learning_rate": 4.334733893557423e-05, + "loss": 0.6445, + "step": 3335 + }, + { + "epoch": 4.27008, + "grad_norm": 0.736754298210144, + "learning_rate": 4.33453381352541e-05, + "loss": 0.6678, + "step": 3336 + }, + { + "epoch": 4.27136, + "grad_norm": 0.7001957893371582, + "learning_rate": 4.3343337334933975e-05, + "loss": 0.6383, + "step": 3337 + }, + { + "epoch": 4.27264, + "grad_norm": 0.6919710040092468, + "learning_rate": 4.334133653461385e-05, + "loss": 0.6978, + "step": 3338 + }, + { + "epoch": 4.27392, + "grad_norm": 0.7082419991493225, + "learning_rate": 4.333933573429372e-05, + "loss": 0.6756, + "step": 3339 + }, + { + "epoch": 4.2752, + "grad_norm": 0.7254000306129456, + "learning_rate": 4.333733493397359e-05, + "loss": 0.6301, + "step": 3340 + }, + { + "epoch": 4.27648, + "grad_norm": 0.7085341811180115, + "learning_rate": 4.333533413365347e-05, + "loss": 0.6495, + "step": 3341 + }, + { + "epoch": 4.27776, + "grad_norm": 0.699360728263855, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.6426, + "step": 3342 + }, + { + "epoch": 4.27904, + "grad_norm": 0.7209745049476624, + "learning_rate": 4.3331332533013206e-05, + "loss": 0.6308, + "step": 3343 + }, + { + "epoch": 4.28032, + "grad_norm": 0.7370860576629639, + "learning_rate": 4.332933173269308e-05, + "loss": 0.623, + "step": 3344 + }, + { + "epoch": 4.2816, + "grad_norm": 0.7051844596862793, + "learning_rate": 4.332733093237295e-05, + "loss": 0.6661, + "step": 3345 + }, + { + "epoch": 4.2828800000000005, + "grad_norm": 0.6612274050712585, + "learning_rate": 4.332533013205282e-05, + "loss": 0.5542, + "step": 3346 + }, + { + "epoch": 4.28416, + "grad_norm": 
0.7362515926361084, + "learning_rate": 4.332332933173269e-05, + "loss": 0.7012, + "step": 3347 + }, + { + "epoch": 4.28544, + "grad_norm": 0.7105290293693542, + "learning_rate": 4.332132853141257e-05, + "loss": 0.6536, + "step": 3348 + }, + { + "epoch": 4.28672, + "grad_norm": 0.7252454161643982, + "learning_rate": 4.3319327731092444e-05, + "loss": 0.6018, + "step": 3349 + }, + { + "epoch": 4.288, + "grad_norm": 0.706749677658081, + "learning_rate": 4.331732693077231e-05, + "loss": 0.6721, + "step": 3350 + }, + { + "epoch": 4.28928, + "grad_norm": 0.6935198307037354, + "learning_rate": 4.331532613045218e-05, + "loss": 0.6268, + "step": 3351 + }, + { + "epoch": 4.29056, + "grad_norm": 0.708602786064148, + "learning_rate": 4.331332533013205e-05, + "loss": 0.6309, + "step": 3352 + }, + { + "epoch": 4.29184, + "grad_norm": 0.6960015892982483, + "learning_rate": 4.3311324529811924e-05, + "loss": 0.6418, + "step": 3353 + }, + { + "epoch": 4.29312, + "grad_norm": 0.7291163206100464, + "learning_rate": 4.3309323729491796e-05, + "loss": 0.6651, + "step": 3354 + }, + { + "epoch": 4.2943999999999996, + "grad_norm": 0.7136040925979614, + "learning_rate": 4.3307322929171675e-05, + "loss": 0.6568, + "step": 3355 + }, + { + "epoch": 4.29568, + "grad_norm": 0.7023734450340271, + "learning_rate": 4.330532212885155e-05, + "loss": 0.6535, + "step": 3356 + }, + { + "epoch": 4.29696, + "grad_norm": 0.706221878528595, + "learning_rate": 4.330332132853142e-05, + "loss": 0.6287, + "step": 3357 + }, + { + "epoch": 4.29824, + "grad_norm": 0.6975945830345154, + "learning_rate": 4.3301320528211284e-05, + "loss": 0.635, + "step": 3358 + }, + { + "epoch": 4.29952, + "grad_norm": 0.7090380787849426, + "learning_rate": 4.3299319727891156e-05, + "loss": 0.6462, + "step": 3359 + }, + { + "epoch": 4.3008, + "grad_norm": 0.7626847624778748, + "learning_rate": 4.329731892757103e-05, + "loss": 0.6438, + "step": 3360 + }, + { + "epoch": 4.30208, + "grad_norm": 0.6872966289520264, + "learning_rate": 
4.32953181272509e-05, + "loss": 0.6682, + "step": 3361 + }, + { + "epoch": 4.30336, + "grad_norm": 0.6952762603759766, + "learning_rate": 4.329331732693078e-05, + "loss": 0.6368, + "step": 3362 + }, + { + "epoch": 4.30464, + "grad_norm": 0.7490139007568359, + "learning_rate": 4.329131652661065e-05, + "loss": 0.6352, + "step": 3363 + }, + { + "epoch": 4.30592, + "grad_norm": 0.6900007128715515, + "learning_rate": 4.328931572629052e-05, + "loss": 0.652, + "step": 3364 + }, + { + "epoch": 4.3072, + "grad_norm": 0.7102410197257996, + "learning_rate": 4.3287314925970394e-05, + "loss": 0.6628, + "step": 3365 + }, + { + "epoch": 4.30848, + "grad_norm": 0.7227296233177185, + "learning_rate": 4.328531412565026e-05, + "loss": 0.6377, + "step": 3366 + }, + { + "epoch": 4.30976, + "grad_norm": 0.7517803311347961, + "learning_rate": 4.328331332533013e-05, + "loss": 0.7112, + "step": 3367 + }, + { + "epoch": 4.31104, + "grad_norm": 0.6790129542350769, + "learning_rate": 4.328131252501e-05, + "loss": 0.5814, + "step": 3368 + }, + { + "epoch": 4.31232, + "grad_norm": 0.6734822988510132, + "learning_rate": 4.327931172468988e-05, + "loss": 0.6569, + "step": 3369 + }, + { + "epoch": 4.3136, + "grad_norm": 0.6671125888824463, + "learning_rate": 4.327731092436975e-05, + "loss": 0.6558, + "step": 3370 + }, + { + "epoch": 4.31488, + "grad_norm": 0.6953800916671753, + "learning_rate": 4.3275310124049625e-05, + "loss": 0.6398, + "step": 3371 + }, + { + "epoch": 4.31616, + "grad_norm": 0.669097900390625, + "learning_rate": 4.3273309323729497e-05, + "loss": 0.6175, + "step": 3372 + }, + { + "epoch": 4.31744, + "grad_norm": 0.6786286234855652, + "learning_rate": 4.327130852340937e-05, + "loss": 0.6225, + "step": 3373 + }, + { + "epoch": 4.31872, + "grad_norm": 0.7077885866165161, + "learning_rate": 4.3269307723089233e-05, + "loss": 0.6297, + "step": 3374 + }, + { + "epoch": 4.32, + "grad_norm": 0.7308838367462158, + "learning_rate": 4.3267306922769105e-05, + "loss": 0.6341, + "step": 3375 + 
}, + { + "epoch": 4.32128, + "grad_norm": 0.7163229584693909, + "learning_rate": 4.3265306122448984e-05, + "loss": 0.6429, + "step": 3376 + }, + { + "epoch": 4.32256, + "grad_norm": 0.7708077430725098, + "learning_rate": 4.3263305322128856e-05, + "loss": 0.7227, + "step": 3377 + }, + { + "epoch": 4.32384, + "grad_norm": 0.694496214389801, + "learning_rate": 4.326130452180873e-05, + "loss": 0.6162, + "step": 3378 + }, + { + "epoch": 4.32512, + "grad_norm": 0.666400671005249, + "learning_rate": 4.32593037214886e-05, + "loss": 0.5697, + "step": 3379 + }, + { + "epoch": 4.3264, + "grad_norm": 0.7009501457214355, + "learning_rate": 4.325730292116847e-05, + "loss": 0.664, + "step": 3380 + }, + { + "epoch": 4.32768, + "grad_norm": 0.7188718914985657, + "learning_rate": 4.325530212084834e-05, + "loss": 0.6408, + "step": 3381 + }, + { + "epoch": 4.32896, + "grad_norm": 0.6969630122184753, + "learning_rate": 4.325330132052821e-05, + "loss": 0.6023, + "step": 3382 + }, + { + "epoch": 4.33024, + "grad_norm": 0.6989743113517761, + "learning_rate": 4.325130052020809e-05, + "loss": 0.652, + "step": 3383 + }, + { + "epoch": 4.33152, + "grad_norm": 0.7306556701660156, + "learning_rate": 4.324929971988796e-05, + "loss": 0.7016, + "step": 3384 + }, + { + "epoch": 4.3328, + "grad_norm": 0.6765083074569702, + "learning_rate": 4.324729891956783e-05, + "loss": 0.6264, + "step": 3385 + }, + { + "epoch": 4.33408, + "grad_norm": 0.6959035396575928, + "learning_rate": 4.32452981192477e-05, + "loss": 0.5973, + "step": 3386 + }, + { + "epoch": 4.33536, + "grad_norm": 0.6781784892082214, + "learning_rate": 4.3243297318927574e-05, + "loss": 0.6569, + "step": 3387 + }, + { + "epoch": 4.33664, + "grad_norm": 0.7198589444160461, + "learning_rate": 4.3241296518607446e-05, + "loss": 0.6785, + "step": 3388 + }, + { + "epoch": 4.33792, + "grad_norm": 0.74173504114151, + "learning_rate": 4.323929571828732e-05, + "loss": 0.6489, + "step": 3389 + }, + { + "epoch": 4.3392, + "grad_norm": 
0.7088286876678467, + "learning_rate": 4.323729491796719e-05, + "loss": 0.6311, + "step": 3390 + }, + { + "epoch": 4.34048, + "grad_norm": 0.7243875861167908, + "learning_rate": 4.323529411764706e-05, + "loss": 0.64, + "step": 3391 + }, + { + "epoch": 4.34176, + "grad_norm": 0.7260725498199463, + "learning_rate": 4.3233293317326934e-05, + "loss": 0.6259, + "step": 3392 + }, + { + "epoch": 4.34304, + "grad_norm": 0.6951209902763367, + "learning_rate": 4.3231292517006806e-05, + "loss": 0.6162, + "step": 3393 + }, + { + "epoch": 4.34432, + "grad_norm": 0.6746131181716919, + "learning_rate": 4.322929171668668e-05, + "loss": 0.6122, + "step": 3394 + }, + { + "epoch": 4.3456, + "grad_norm": 0.7057126760482788, + "learning_rate": 4.322729091636655e-05, + "loss": 0.6491, + "step": 3395 + }, + { + "epoch": 4.34688, + "grad_norm": 0.7594478726387024, + "learning_rate": 4.322529011604642e-05, + "loss": 0.6743, + "step": 3396 + }, + { + "epoch": 4.34816, + "grad_norm": 0.6961939930915833, + "learning_rate": 4.322328931572629e-05, + "loss": 0.6814, + "step": 3397 + }, + { + "epoch": 4.3494399999999995, + "grad_norm": 0.7641561627388, + "learning_rate": 4.3221288515406165e-05, + "loss": 0.6787, + "step": 3398 + }, + { + "epoch": 4.35072, + "grad_norm": 0.7088523507118225, + "learning_rate": 4.321928771508604e-05, + "loss": 0.6936, + "step": 3399 + }, + { + "epoch": 4.352, + "grad_norm": 0.7125248908996582, + "learning_rate": 4.321728691476591e-05, + "loss": 0.6512, + "step": 3400 + }, + { + "epoch": 4.35328, + "grad_norm": 0.7159749269485474, + "learning_rate": 4.321528611444578e-05, + "loss": 0.6013, + "step": 3401 + }, + { + "epoch": 4.35456, + "grad_norm": 0.7057827115058899, + "learning_rate": 4.321328531412565e-05, + "loss": 0.6754, + "step": 3402 + }, + { + "epoch": 4.35584, + "grad_norm": 0.7021535038948059, + "learning_rate": 4.3211284513805524e-05, + "loss": 0.6203, + "step": 3403 + }, + { + "epoch": 4.35712, + "grad_norm": 0.6961078643798828, + "learning_rate": 
4.3209283713485396e-05, + "loss": 0.6115, + "step": 3404 + }, + { + "epoch": 4.3584, + "grad_norm": 0.6545549035072327, + "learning_rate": 4.320728291316527e-05, + "loss": 0.5919, + "step": 3405 + }, + { + "epoch": 4.35968, + "grad_norm": 0.7263978123664856, + "learning_rate": 4.320528211284514e-05, + "loss": 0.6691, + "step": 3406 + }, + { + "epoch": 4.36096, + "grad_norm": 0.7528616189956665, + "learning_rate": 4.320328131252501e-05, + "loss": 0.7051, + "step": 3407 + }, + { + "epoch": 4.36224, + "grad_norm": 0.6853641867637634, + "learning_rate": 4.320128051220488e-05, + "loss": 0.6242, + "step": 3408 + }, + { + "epoch": 4.36352, + "grad_norm": 0.7724465727806091, + "learning_rate": 4.3199279711884755e-05, + "loss": 0.5945, + "step": 3409 + }, + { + "epoch": 4.3648, + "grad_norm": 0.6866872310638428, + "learning_rate": 4.319727891156463e-05, + "loss": 0.6668, + "step": 3410 + }, + { + "epoch": 4.36608, + "grad_norm": 0.7362887263298035, + "learning_rate": 4.3195278111244506e-05, + "loss": 0.6773, + "step": 3411 + }, + { + "epoch": 4.36736, + "grad_norm": 0.7267001867294312, + "learning_rate": 4.319327731092437e-05, + "loss": 0.651, + "step": 3412 + }, + { + "epoch": 4.36864, + "grad_norm": 0.7169755697250366, + "learning_rate": 4.319127651060424e-05, + "loss": 0.7017, + "step": 3413 + }, + { + "epoch": 4.3699200000000005, + "grad_norm": 0.7027722597122192, + "learning_rate": 4.3189275710284115e-05, + "loss": 0.5944, + "step": 3414 + }, + { + "epoch": 4.3712, + "grad_norm": 0.7033854722976685, + "learning_rate": 4.3187274909963986e-05, + "loss": 0.6433, + "step": 3415 + }, + { + "epoch": 4.37248, + "grad_norm": 0.7184159755706787, + "learning_rate": 4.318527410964386e-05, + "loss": 0.6733, + "step": 3416 + }, + { + "epoch": 4.37376, + "grad_norm": 0.7299154996871948, + "learning_rate": 4.318327330932373e-05, + "loss": 0.7094, + "step": 3417 + }, + { + "epoch": 4.37504, + "grad_norm": 0.6535490155220032, + "learning_rate": 4.31812725090036e-05, + "loss": 0.6, + 
"step": 3418 + }, + { + "epoch": 4.37632, + "grad_norm": 0.6924900412559509, + "learning_rate": 4.317927170868348e-05, + "loss": 0.632, + "step": 3419 + }, + { + "epoch": 4.3776, + "grad_norm": 0.7481140494346619, + "learning_rate": 4.3177270908363346e-05, + "loss": 0.6669, + "step": 3420 + }, + { + "epoch": 4.37888, + "grad_norm": 0.6912330985069275, + "learning_rate": 4.317527010804322e-05, + "loss": 0.6514, + "step": 3421 + }, + { + "epoch": 4.38016, + "grad_norm": 0.7058601379394531, + "learning_rate": 4.317326930772309e-05, + "loss": 0.66, + "step": 3422 + }, + { + "epoch": 4.38144, + "grad_norm": 0.6704264879226685, + "learning_rate": 4.317126850740296e-05, + "loss": 0.6386, + "step": 3423 + }, + { + "epoch": 4.38272, + "grad_norm": 0.6729428172111511, + "learning_rate": 4.316926770708283e-05, + "loss": 0.6369, + "step": 3424 + }, + { + "epoch": 4.384, + "grad_norm": 0.6993991732597351, + "learning_rate": 4.3167266906762705e-05, + "loss": 0.6756, + "step": 3425 + }, + { + "epoch": 4.38528, + "grad_norm": 0.7253281474113464, + "learning_rate": 4.3165266106442584e-05, + "loss": 0.6467, + "step": 3426 + }, + { + "epoch": 4.38656, + "grad_norm": 0.7546789646148682, + "learning_rate": 4.3163265306122455e-05, + "loss": 0.6914, + "step": 3427 + }, + { + "epoch": 4.38784, + "grad_norm": 0.7461205720901489, + "learning_rate": 4.316126450580232e-05, + "loss": 0.6633, + "step": 3428 + }, + { + "epoch": 4.38912, + "grad_norm": 0.7659752368927002, + "learning_rate": 4.315926370548219e-05, + "loss": 0.6514, + "step": 3429 + }, + { + "epoch": 4.3904, + "grad_norm": 0.7351868152618408, + "learning_rate": 4.3157262905162064e-05, + "loss": 0.7025, + "step": 3430 + }, + { + "epoch": 4.39168, + "grad_norm": 0.7607386708259583, + "learning_rate": 4.3155262104841936e-05, + "loss": 0.6971, + "step": 3431 + }, + { + "epoch": 4.39296, + "grad_norm": 0.7106710076332092, + "learning_rate": 4.315326130452181e-05, + "loss": 0.6642, + "step": 3432 + }, + { + "epoch": 4.39424, + 
"grad_norm": 0.716480553150177, + "learning_rate": 4.3151260504201687e-05, + "loss": 0.6401, + "step": 3433 + }, + { + "epoch": 4.39552, + "grad_norm": 0.6813743710517883, + "learning_rate": 4.314925970388156e-05, + "loss": 0.6335, + "step": 3434 + }, + { + "epoch": 4.3968, + "grad_norm": 0.6941514611244202, + "learning_rate": 4.314725890356143e-05, + "loss": 0.6294, + "step": 3435 + }, + { + "epoch": 4.39808, + "grad_norm": 0.75897616147995, + "learning_rate": 4.3145258103241295e-05, + "loss": 0.674, + "step": 3436 + }, + { + "epoch": 4.39936, + "grad_norm": 0.6969784498214722, + "learning_rate": 4.314325730292117e-05, + "loss": 0.6424, + "step": 3437 + }, + { + "epoch": 4.40064, + "grad_norm": 0.6732988953590393, + "learning_rate": 4.314125650260104e-05, + "loss": 0.6206, + "step": 3438 + }, + { + "epoch": 4.40192, + "grad_norm": 0.6822874546051025, + "learning_rate": 4.313925570228091e-05, + "loss": 0.5994, + "step": 3439 + }, + { + "epoch": 4.4032, + "grad_norm": 0.6838374733924866, + "learning_rate": 4.313725490196079e-05, + "loss": 0.6224, + "step": 3440 + }, + { + "epoch": 4.40448, + "grad_norm": 0.7581614851951599, + "learning_rate": 4.313525410164066e-05, + "loss": 0.6899, + "step": 3441 + }, + { + "epoch": 4.40576, + "grad_norm": 0.7927963733673096, + "learning_rate": 4.313325330132053e-05, + "loss": 0.6671, + "step": 3442 + }, + { + "epoch": 4.40704, + "grad_norm": 0.7558853030204773, + "learning_rate": 4.3131252501000405e-05, + "loss": 0.6714, + "step": 3443 + }, + { + "epoch": 4.40832, + "grad_norm": 0.6957927346229553, + "learning_rate": 4.312925170068027e-05, + "loss": 0.6465, + "step": 3444 + }, + { + "epoch": 4.4096, + "grad_norm": 0.7612014412879944, + "learning_rate": 4.312725090036014e-05, + "loss": 0.6855, + "step": 3445 + }, + { + "epoch": 4.41088, + "grad_norm": 0.735772430896759, + "learning_rate": 4.3125250100040014e-05, + "loss": 0.6411, + "step": 3446 + }, + { + "epoch": 4.41216, + "grad_norm": 0.6750103235244751, + "learning_rate": 
4.312324929971989e-05, + "loss": 0.5845, + "step": 3447 + }, + { + "epoch": 4.41344, + "grad_norm": 0.7323096990585327, + "learning_rate": 4.3121248499399764e-05, + "loss": 0.6724, + "step": 3448 + }, + { + "epoch": 4.41472, + "grad_norm": 0.706743597984314, + "learning_rate": 4.3119247699079636e-05, + "loss": 0.6313, + "step": 3449 + }, + { + "epoch": 4.416, + "grad_norm": 0.6990499496459961, + "learning_rate": 4.311724689875951e-05, + "loss": 0.6688, + "step": 3450 + }, + { + "epoch": 4.41728, + "grad_norm": 0.7238355875015259, + "learning_rate": 4.311524609843938e-05, + "loss": 0.6343, + "step": 3451 + }, + { + "epoch": 4.41856, + "grad_norm": 0.6958813667297363, + "learning_rate": 4.3113245298119245e-05, + "loss": 0.6341, + "step": 3452 + }, + { + "epoch": 4.41984, + "grad_norm": 0.7154257893562317, + "learning_rate": 4.311124449779912e-05, + "loss": 0.6938, + "step": 3453 + }, + { + "epoch": 4.42112, + "grad_norm": 0.6367705464363098, + "learning_rate": 4.3109243697478996e-05, + "loss": 0.5721, + "step": 3454 + }, + { + "epoch": 4.4224, + "grad_norm": 0.6861926913261414, + "learning_rate": 4.310724289715887e-05, + "loss": 0.6292, + "step": 3455 + }, + { + "epoch": 4.42368, + "grad_norm": 0.7064390182495117, + "learning_rate": 4.310524209683874e-05, + "loss": 0.6467, + "step": 3456 + }, + { + "epoch": 4.4249600000000004, + "grad_norm": 0.6897706985473633, + "learning_rate": 4.310324129651861e-05, + "loss": 0.6367, + "step": 3457 + }, + { + "epoch": 4.42624, + "grad_norm": 0.7397942543029785, + "learning_rate": 4.310124049619848e-05, + "loss": 0.7199, + "step": 3458 + }, + { + "epoch": 4.42752, + "grad_norm": 0.7196327447891235, + "learning_rate": 4.3099239695878355e-05, + "loss": 0.6154, + "step": 3459 + }, + { + "epoch": 4.4288, + "grad_norm": 0.7079979777336121, + "learning_rate": 4.309723889555822e-05, + "loss": 0.6571, + "step": 3460 + }, + { + "epoch": 4.43008, + "grad_norm": 0.7943414449691772, + "learning_rate": 4.30952380952381e-05, + "loss": 0.7078, + 
"step": 3461 + }, + { + "epoch": 4.43136, + "grad_norm": 0.6748960614204407, + "learning_rate": 4.309323729491797e-05, + "loss": 0.6341, + "step": 3462 + }, + { + "epoch": 4.43264, + "grad_norm": 0.7459009885787964, + "learning_rate": 4.309123649459784e-05, + "loss": 0.72, + "step": 3463 + }, + { + "epoch": 4.43392, + "grad_norm": 0.7245545983314514, + "learning_rate": 4.3089235694277714e-05, + "loss": 0.6007, + "step": 3464 + }, + { + "epoch": 4.4352, + "grad_norm": 0.6897006630897522, + "learning_rate": 4.3087234893957586e-05, + "loss": 0.6911, + "step": 3465 + }, + { + "epoch": 4.4364799999999995, + "grad_norm": 0.684449315071106, + "learning_rate": 4.308523409363746e-05, + "loss": 0.627, + "step": 3466 + }, + { + "epoch": 4.43776, + "grad_norm": 0.7077491283416748, + "learning_rate": 4.308323329331733e-05, + "loss": 0.6295, + "step": 3467 + }, + { + "epoch": 4.43904, + "grad_norm": 0.716648280620575, + "learning_rate": 4.30812324929972e-05, + "loss": 0.601, + "step": 3468 + }, + { + "epoch": 4.44032, + "grad_norm": 0.7259345650672913, + "learning_rate": 4.3079231692677073e-05, + "loss": 0.6443, + "step": 3469 + }, + { + "epoch": 4.4416, + "grad_norm": 0.7068425416946411, + "learning_rate": 4.3077230892356945e-05, + "loss": 0.6623, + "step": 3470 + }, + { + "epoch": 4.44288, + "grad_norm": 0.713869571685791, + "learning_rate": 4.307523009203682e-05, + "loss": 0.6596, + "step": 3471 + }, + { + "epoch": 4.44416, + "grad_norm": 0.6864603757858276, + "learning_rate": 4.307322929171669e-05, + "loss": 0.6143, + "step": 3472 + }, + { + "epoch": 4.44544, + "grad_norm": 0.7068229913711548, + "learning_rate": 4.307122849139656e-05, + "loss": 0.6255, + "step": 3473 + }, + { + "epoch": 4.44672, + "grad_norm": 0.7183898091316223, + "learning_rate": 4.306922769107643e-05, + "loss": 0.705, + "step": 3474 + }, + { + "epoch": 4.448, + "grad_norm": 0.6923708915710449, + "learning_rate": 4.3067226890756305e-05, + "loss": 0.6048, + "step": 3475 + }, + { + "epoch": 4.44928, + 
"grad_norm": 0.7035982608795166, + "learning_rate": 4.3065226090436176e-05, + "loss": 0.6131, + "step": 3476 + }, + { + "epoch": 4.45056, + "grad_norm": 0.7768038511276245, + "learning_rate": 4.306322529011605e-05, + "loss": 0.7338, + "step": 3477 + }, + { + "epoch": 4.45184, + "grad_norm": 0.7020729780197144, + "learning_rate": 4.306122448979592e-05, + "loss": 0.6507, + "step": 3478 + }, + { + "epoch": 4.45312, + "grad_norm": 0.6677333116531372, + "learning_rate": 4.305922368947579e-05, + "loss": 0.6101, + "step": 3479 + }, + { + "epoch": 4.4544, + "grad_norm": 0.6806728839874268, + "learning_rate": 4.3057222889155664e-05, + "loss": 0.6016, + "step": 3480 + }, + { + "epoch": 4.45568, + "grad_norm": 0.7218868732452393, + "learning_rate": 4.3055222088835536e-05, + "loss": 0.6874, + "step": 3481 + }, + { + "epoch": 4.45696, + "grad_norm": 0.757403552532196, + "learning_rate": 4.305322128851541e-05, + "loss": 0.7041, + "step": 3482 + }, + { + "epoch": 4.45824, + "grad_norm": 0.725426435470581, + "learning_rate": 4.305122048819528e-05, + "loss": 0.6705, + "step": 3483 + }, + { + "epoch": 4.45952, + "grad_norm": 0.7142929434776306, + "learning_rate": 4.304921968787515e-05, + "loss": 0.6241, + "step": 3484 + }, + { + "epoch": 4.4608, + "grad_norm": 0.7178435325622559, + "learning_rate": 4.304721888755502e-05, + "loss": 0.6537, + "step": 3485 + }, + { + "epoch": 4.46208, + "grad_norm": 0.7174624800682068, + "learning_rate": 4.3045218087234895e-05, + "loss": 0.6261, + "step": 3486 + }, + { + "epoch": 4.46336, + "grad_norm": 0.7432010173797607, + "learning_rate": 4.304321728691477e-05, + "loss": 0.6189, + "step": 3487 + }, + { + "epoch": 4.46464, + "grad_norm": 0.7320737838745117, + "learning_rate": 4.304121648659464e-05, + "loss": 0.6203, + "step": 3488 + }, + { + "epoch": 4.46592, + "grad_norm": 0.7159371972084045, + "learning_rate": 4.303921568627452e-05, + "loss": 0.6565, + "step": 3489 + }, + { + "epoch": 4.4672, + "grad_norm": 0.6766809225082397, + "learning_rate": 
4.303721488595438e-05, + "loss": 0.6285, + "step": 3490 + }, + { + "epoch": 4.46848, + "grad_norm": 0.7048515677452087, + "learning_rate": 4.3035214085634254e-05, + "loss": 0.6259, + "step": 3491 + }, + { + "epoch": 4.46976, + "grad_norm": 0.7184696197509766, + "learning_rate": 4.3033213285314126e-05, + "loss": 0.6429, + "step": 3492 + }, + { + "epoch": 4.47104, + "grad_norm": 0.7331928014755249, + "learning_rate": 4.3031212484994e-05, + "loss": 0.6189, + "step": 3493 + }, + { + "epoch": 4.47232, + "grad_norm": 0.6746116876602173, + "learning_rate": 4.302921168467387e-05, + "loss": 0.6499, + "step": 3494 + }, + { + "epoch": 4.4736, + "grad_norm": 0.6947497725486755, + "learning_rate": 4.302721088435374e-05, + "loss": 0.6577, + "step": 3495 + }, + { + "epoch": 4.47488, + "grad_norm": 0.7183555960655212, + "learning_rate": 4.302521008403362e-05, + "loss": 0.6201, + "step": 3496 + }, + { + "epoch": 4.47616, + "grad_norm": 0.7647238373756409, + "learning_rate": 4.302320928371349e-05, + "loss": 0.6564, + "step": 3497 + }, + { + "epoch": 4.47744, + "grad_norm": 0.6781284809112549, + "learning_rate": 4.302120848339336e-05, + "loss": 0.6367, + "step": 3498 + }, + { + "epoch": 4.47872, + "grad_norm": 0.7044297456741333, + "learning_rate": 4.301920768307323e-05, + "loss": 0.6508, + "step": 3499 + }, + { + "epoch": 4.48, + "grad_norm": 0.7338101863861084, + "learning_rate": 4.30172068827531e-05, + "loss": 0.6807, + "step": 3500 + }, + { + "epoch": 4.48128, + "grad_norm": 0.7334948182106018, + "learning_rate": 4.301520608243297e-05, + "loss": 0.6962, + "step": 3501 + }, + { + "epoch": 4.48256, + "grad_norm": 0.7243069410324097, + "learning_rate": 4.3013205282112845e-05, + "loss": 0.6448, + "step": 3502 + }, + { + "epoch": 4.48384, + "grad_norm": 0.7103859782218933, + "learning_rate": 4.301120448179272e-05, + "loss": 0.6347, + "step": 3503 + }, + { + "epoch": 4.48512, + "grad_norm": 0.7307097911834717, + "learning_rate": 4.3009203681472595e-05, + "loss": 0.6947, + "step": 3504 
+ }, + { + "epoch": 4.4864, + "grad_norm": 0.6719437837600708, + "learning_rate": 4.300720288115247e-05, + "loss": 0.6101, + "step": 3505 + }, + { + "epoch": 4.48768, + "grad_norm": 0.6755344867706299, + "learning_rate": 4.300520208083233e-05, + "loss": 0.6134, + "step": 3506 + }, + { + "epoch": 4.48896, + "grad_norm": 0.7292615175247192, + "learning_rate": 4.3003201280512204e-05, + "loss": 0.7104, + "step": 3507 + }, + { + "epoch": 4.49024, + "grad_norm": 0.7445333003997803, + "learning_rate": 4.3001200480192076e-05, + "loss": 0.702, + "step": 3508 + }, + { + "epoch": 4.49152, + "grad_norm": 0.7050941586494446, + "learning_rate": 4.299919967987195e-05, + "loss": 0.6471, + "step": 3509 + }, + { + "epoch": 4.4928, + "grad_norm": 0.7550082206726074, + "learning_rate": 4.2997198879551826e-05, + "loss": 0.6283, + "step": 3510 + }, + { + "epoch": 4.49408, + "grad_norm": 0.6943626999855042, + "learning_rate": 4.29951980792317e-05, + "loss": 0.6155, + "step": 3511 + }, + { + "epoch": 4.49536, + "grad_norm": 0.7097110152244568, + "learning_rate": 4.299319727891157e-05, + "loss": 0.6762, + "step": 3512 + }, + { + "epoch": 4.49664, + "grad_norm": 0.7257136106491089, + "learning_rate": 4.299119647859144e-05, + "loss": 0.6559, + "step": 3513 + }, + { + "epoch": 4.49792, + "grad_norm": 0.7039051651954651, + "learning_rate": 4.298919567827131e-05, + "loss": 0.6605, + "step": 3514 + }, + { + "epoch": 4.4992, + "grad_norm": 0.7470154762268066, + "learning_rate": 4.298719487795118e-05, + "loss": 0.6569, + "step": 3515 + }, + { + "epoch": 4.50048, + "grad_norm": 0.7026588916778564, + "learning_rate": 4.298519407763105e-05, + "loss": 0.6678, + "step": 3516 + }, + { + "epoch": 4.50176, + "grad_norm": 0.7094910144805908, + "learning_rate": 4.298319327731093e-05, + "loss": 0.6832, + "step": 3517 + }, + { + "epoch": 4.50304, + "grad_norm": 0.701600968837738, + "learning_rate": 4.29811924769908e-05, + "loss": 0.6583, + "step": 3518 + }, + { + "epoch": 4.50432, + "grad_norm": 
0.6822985410690308, + "learning_rate": 4.297919167667067e-05, + "loss": 0.6335, + "step": 3519 + }, + { + "epoch": 4.5056, + "grad_norm": 0.7292996048927307, + "learning_rate": 4.2977190876350545e-05, + "loss": 0.6355, + "step": 3520 + }, + { + "epoch": 4.50688, + "grad_norm": 0.6827433705329895, + "learning_rate": 4.297519007603042e-05, + "loss": 0.6288, + "step": 3521 + }, + { + "epoch": 4.50816, + "grad_norm": 0.6858897805213928, + "learning_rate": 4.297318927571028e-05, + "loss": 0.6333, + "step": 3522 + }, + { + "epoch": 4.50944, + "grad_norm": 0.7058318853378296, + "learning_rate": 4.2971188475390154e-05, + "loss": 0.6742, + "step": 3523 + }, + { + "epoch": 4.51072, + "grad_norm": 0.7074992060661316, + "learning_rate": 4.296918767507003e-05, + "loss": 0.6459, + "step": 3524 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.7430311441421509, + "learning_rate": 4.2967186874749904e-05, + "loss": 0.6278, + "step": 3525 + }, + { + "epoch": 4.51328, + "grad_norm": 0.6989936828613281, + "learning_rate": 4.2965186074429776e-05, + "loss": 0.6302, + "step": 3526 + }, + { + "epoch": 4.51456, + "grad_norm": 0.7324742674827576, + "learning_rate": 4.296318527410965e-05, + "loss": 0.6347, + "step": 3527 + }, + { + "epoch": 4.51584, + "grad_norm": 0.8047772645950317, + "learning_rate": 4.296118447378952e-05, + "loss": 0.6524, + "step": 3528 + }, + { + "epoch": 4.51712, + "grad_norm": 0.7454036474227905, + "learning_rate": 4.295918367346939e-05, + "loss": 0.6636, + "step": 3529 + }, + { + "epoch": 4.5184, + "grad_norm": 0.6896642446517944, + "learning_rate": 4.295718287314926e-05, + "loss": 0.6145, + "step": 3530 + }, + { + "epoch": 4.51968, + "grad_norm": 0.7129753828048706, + "learning_rate": 4.295518207282913e-05, + "loss": 0.6929, + "step": 3531 + }, + { + "epoch": 4.52096, + "grad_norm": 0.6888583898544312, + "learning_rate": 4.295318127250901e-05, + "loss": 0.6411, + "step": 3532 + }, + { + "epoch": 4.52224, + "grad_norm": 0.7414783239364624, + "learning_rate": 
4.295118047218888e-05, + "loss": 0.7311, + "step": 3533 + }, + { + "epoch": 4.5235199999999995, + "grad_norm": 0.7330256104469299, + "learning_rate": 4.294917967186875e-05, + "loss": 0.6781, + "step": 3534 + }, + { + "epoch": 4.5248, + "grad_norm": 0.6906652450561523, + "learning_rate": 4.294717887154862e-05, + "loss": 0.6435, + "step": 3535 + }, + { + "epoch": 4.52608, + "grad_norm": 0.677453875541687, + "learning_rate": 4.2945178071228495e-05, + "loss": 0.6618, + "step": 3536 + }, + { + "epoch": 4.52736, + "grad_norm": 0.7467184066772461, + "learning_rate": 4.2943177270908366e-05, + "loss": 0.6991, + "step": 3537 + }, + { + "epoch": 4.52864, + "grad_norm": 0.7100308537483215, + "learning_rate": 4.294117647058823e-05, + "loss": 0.6298, + "step": 3538 + }, + { + "epoch": 4.52992, + "grad_norm": 0.6982895135879517, + "learning_rate": 4.293917567026811e-05, + "loss": 0.6826, + "step": 3539 + }, + { + "epoch": 4.5312, + "grad_norm": 0.6765713095664978, + "learning_rate": 4.293717486994798e-05, + "loss": 0.6307, + "step": 3540 + }, + { + "epoch": 4.53248, + "grad_norm": 0.6806417107582092, + "learning_rate": 4.2935174069627854e-05, + "loss": 0.6495, + "step": 3541 + }, + { + "epoch": 4.53376, + "grad_norm": 0.7208904027938843, + "learning_rate": 4.2933173269307726e-05, + "loss": 0.6566, + "step": 3542 + }, + { + "epoch": 4.53504, + "grad_norm": 0.7589842677116394, + "learning_rate": 4.29311724689876e-05, + "loss": 0.7152, + "step": 3543 + }, + { + "epoch": 4.53632, + "grad_norm": 0.7217769622802734, + "learning_rate": 4.292917166866747e-05, + "loss": 0.6694, + "step": 3544 + }, + { + "epoch": 4.5376, + "grad_norm": 0.681645393371582, + "learning_rate": 4.292717086834734e-05, + "loss": 0.6182, + "step": 3545 + }, + { + "epoch": 4.53888, + "grad_norm": 0.76906818151474, + "learning_rate": 4.292517006802721e-05, + "loss": 0.6863, + "step": 3546 + }, + { + "epoch": 4.54016, + "grad_norm": 0.6864603161811829, + "learning_rate": 4.2923169267707085e-05, + "loss": 0.5935, + 
"step": 3547 + }, + { + "epoch": 4.54144, + "grad_norm": 0.7130399942398071, + "learning_rate": 4.292116846738696e-05, + "loss": 0.6145, + "step": 3548 + }, + { + "epoch": 4.54272, + "grad_norm": 0.7169683575630188, + "learning_rate": 4.291916766706683e-05, + "loss": 0.6562, + "step": 3549 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.7166752815246582, + "learning_rate": 4.29171668667467e-05, + "loss": 0.6539, + "step": 3550 + }, + { + "epoch": 4.54528, + "grad_norm": 0.681341290473938, + "learning_rate": 4.291516606642657e-05, + "loss": 0.6518, + "step": 3551 + }, + { + "epoch": 4.54656, + "grad_norm": 0.6968530416488647, + "learning_rate": 4.2913165266106444e-05, + "loss": 0.6345, + "step": 3552 + }, + { + "epoch": 4.54784, + "grad_norm": 0.7314958572387695, + "learning_rate": 4.2911164465786316e-05, + "loss": 0.7076, + "step": 3553 + }, + { + "epoch": 4.54912, + "grad_norm": 0.7102290987968445, + "learning_rate": 4.290916366546619e-05, + "loss": 0.6552, + "step": 3554 + }, + { + "epoch": 4.5504, + "grad_norm": 0.7079906463623047, + "learning_rate": 4.290716286514606e-05, + "loss": 0.6323, + "step": 3555 + }, + { + "epoch": 4.55168, + "grad_norm": 0.6791037917137146, + "learning_rate": 4.290516206482593e-05, + "loss": 0.6335, + "step": 3556 + }, + { + "epoch": 4.55296, + "grad_norm": 0.6969685554504395, + "learning_rate": 4.2903161264505804e-05, + "loss": 0.6266, + "step": 3557 + }, + { + "epoch": 4.55424, + "grad_norm": 0.7246193885803223, + "learning_rate": 4.2901160464185675e-05, + "loss": 0.6518, + "step": 3558 + }, + { + "epoch": 4.55552, + "grad_norm": 0.679791271686554, + "learning_rate": 4.289915966386555e-05, + "loss": 0.6229, + "step": 3559 + }, + { + "epoch": 4.5568, + "grad_norm": 0.7322858572006226, + "learning_rate": 4.289715886354542e-05, + "loss": 0.6321, + "step": 3560 + }, + { + "epoch": 4.55808, + "grad_norm": 0.6904955506324768, + "learning_rate": 4.289515806322529e-05, + "loss": 0.6772, + "step": 3561 + }, + { + "epoch": 4.55936, + 
"grad_norm": 0.746525764465332, + "learning_rate": 4.289315726290516e-05, + "loss": 0.6534, + "step": 3562 + }, + { + "epoch": 4.56064, + "grad_norm": 0.7122935056686401, + "learning_rate": 4.2891156462585035e-05, + "loss": 0.6796, + "step": 3563 + }, + { + "epoch": 4.56192, + "grad_norm": 0.7172386050224304, + "learning_rate": 4.2889155662264907e-05, + "loss": 0.6768, + "step": 3564 + }, + { + "epoch": 4.5632, + "grad_norm": 0.7053015232086182, + "learning_rate": 4.288715486194478e-05, + "loss": 0.5911, + "step": 3565 + }, + { + "epoch": 4.56448, + "grad_norm": 0.6957263946533203, + "learning_rate": 4.288515406162465e-05, + "loss": 0.6538, + "step": 3566 + }, + { + "epoch": 4.56576, + "grad_norm": 0.6875421404838562, + "learning_rate": 4.288315326130453e-05, + "loss": 0.6839, + "step": 3567 + }, + { + "epoch": 4.56704, + "grad_norm": 0.7036089301109314, + "learning_rate": 4.2881152460984394e-05, + "loss": 0.6077, + "step": 3568 + }, + { + "epoch": 4.56832, + "grad_norm": 0.6996341943740845, + "learning_rate": 4.2879151660664266e-05, + "loss": 0.6304, + "step": 3569 + }, + { + "epoch": 4.5696, + "grad_norm": 0.7085795402526855, + "learning_rate": 4.287715086034414e-05, + "loss": 0.6497, + "step": 3570 + }, + { + "epoch": 4.57088, + "grad_norm": 0.6893650889396667, + "learning_rate": 4.287515006002401e-05, + "loss": 0.6503, + "step": 3571 + }, + { + "epoch": 4.57216, + "grad_norm": 0.6926917433738708, + "learning_rate": 4.287314925970388e-05, + "loss": 0.6245, + "step": 3572 + }, + { + "epoch": 4.57344, + "grad_norm": 0.7077941298484802, + "learning_rate": 4.287114845938375e-05, + "loss": 0.6522, + "step": 3573 + }, + { + "epoch": 4.57472, + "grad_norm": 0.7533298134803772, + "learning_rate": 4.286914765906363e-05, + "loss": 0.6637, + "step": 3574 + }, + { + "epoch": 4.576, + "grad_norm": 0.7265738844871521, + "learning_rate": 4.2867146858743504e-05, + "loss": 0.6432, + "step": 3575 + }, + { + "epoch": 4.57728, + "grad_norm": 0.7495373487472534, + "learning_rate": 
4.286514605842337e-05, + "loss": 0.6487, + "step": 3576 + }, + { + "epoch": 4.5785599999999995, + "grad_norm": 0.7241270542144775, + "learning_rate": 4.286314525810324e-05, + "loss": 0.6962, + "step": 3577 + }, + { + "epoch": 4.57984, + "grad_norm": 0.6941715478897095, + "learning_rate": 4.286114445778311e-05, + "loss": 0.6421, + "step": 3578 + }, + { + "epoch": 4.58112, + "grad_norm": 0.7148897647857666, + "learning_rate": 4.2859143657462984e-05, + "loss": 0.6584, + "step": 3579 + }, + { + "epoch": 4.5824, + "grad_norm": 0.7130563855171204, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.6817, + "step": 3580 + }, + { + "epoch": 4.58368, + "grad_norm": 0.7111728191375732, + "learning_rate": 4.2855142056822735e-05, + "loss": 0.6443, + "step": 3581 + }, + { + "epoch": 4.58496, + "grad_norm": 0.7034164667129517, + "learning_rate": 4.285314125650261e-05, + "loss": 0.6662, + "step": 3582 + }, + { + "epoch": 4.58624, + "grad_norm": 0.6821185946464539, + "learning_rate": 4.285114045618248e-05, + "loss": 0.6381, + "step": 3583 + }, + { + "epoch": 4.58752, + "grad_norm": 0.6892850995063782, + "learning_rate": 4.2849139655862344e-05, + "loss": 0.6515, + "step": 3584 + }, + { + "epoch": 4.5888, + "grad_norm": 0.7385504245758057, + "learning_rate": 4.2847138855542216e-05, + "loss": 0.6632, + "step": 3585 + }, + { + "epoch": 4.59008, + "grad_norm": 0.6548694372177124, + "learning_rate": 4.284513805522209e-05, + "loss": 0.609, + "step": 3586 + }, + { + "epoch": 4.59136, + "grad_norm": 0.7125710248947144, + "learning_rate": 4.284313725490196e-05, + "loss": 0.6506, + "step": 3587 + }, + { + "epoch": 4.59264, + "grad_norm": 0.7181362509727478, + "learning_rate": 4.284113645458184e-05, + "loss": 0.5887, + "step": 3588 + }, + { + "epoch": 4.59392, + "grad_norm": 0.6801038980484009, + "learning_rate": 4.283913565426171e-05, + "loss": 0.6322, + "step": 3589 + }, + { + "epoch": 4.5952, + "grad_norm": 0.7283415198326111, + "learning_rate": 4.283713485394158e-05, + "loss": 0.6793, 
+ "step": 3590 + }, + { + "epoch": 4.59648, + "grad_norm": 0.7576428055763245, + "learning_rate": 4.2835134053621453e-05, + "loss": 0.6391, + "step": 3591 + }, + { + "epoch": 4.59776, + "grad_norm": 0.7399201393127441, + "learning_rate": 4.283313325330132e-05, + "loss": 0.6719, + "step": 3592 + }, + { + "epoch": 4.5990400000000005, + "grad_norm": 0.7038607001304626, + "learning_rate": 4.283113245298119e-05, + "loss": 0.6885, + "step": 3593 + }, + { + "epoch": 4.60032, + "grad_norm": 0.7106660604476929, + "learning_rate": 4.282913165266106e-05, + "loss": 0.6382, + "step": 3594 + }, + { + "epoch": 4.6016, + "grad_norm": 0.7221371531486511, + "learning_rate": 4.282713085234094e-05, + "loss": 0.6697, + "step": 3595 + }, + { + "epoch": 4.60288, + "grad_norm": 0.702472984790802, + "learning_rate": 4.282513005202081e-05, + "loss": 0.6555, + "step": 3596 + }, + { + "epoch": 4.60416, + "grad_norm": 0.6985082626342773, + "learning_rate": 4.2823129251700685e-05, + "loss": 0.6591, + "step": 3597 + }, + { + "epoch": 4.60544, + "grad_norm": 0.6914473176002502, + "learning_rate": 4.2821128451380556e-05, + "loss": 0.6284, + "step": 3598 + }, + { + "epoch": 4.60672, + "grad_norm": 0.7164996266365051, + "learning_rate": 4.281912765106043e-05, + "loss": 0.6409, + "step": 3599 + }, + { + "epoch": 4.608, + "grad_norm": 0.7067193388938904, + "learning_rate": 4.2817126850740293e-05, + "loss": 0.6891, + "step": 3600 + }, + { + "epoch": 4.60928, + "grad_norm": 0.672056257724762, + "learning_rate": 4.2815126050420165e-05, + "loss": 0.6533, + "step": 3601 + }, + { + "epoch": 4.6105599999999995, + "grad_norm": 0.6592856049537659, + "learning_rate": 4.2813125250100044e-05, + "loss": 0.5726, + "step": 3602 + }, + { + "epoch": 4.61184, + "grad_norm": 0.7327611446380615, + "learning_rate": 4.2811124449779916e-05, + "loss": 0.6566, + "step": 3603 + }, + { + "epoch": 4.61312, + "grad_norm": 0.7556269764900208, + "learning_rate": 4.280912364945979e-05, + "loss": 0.7096, + "step": 3604 + }, + { + 
"epoch": 4.6144, + "grad_norm": 0.7422052025794983, + "learning_rate": 4.280712284913966e-05, + "loss": 0.7039, + "step": 3605 + }, + { + "epoch": 4.61568, + "grad_norm": 0.7220710515975952, + "learning_rate": 4.280512204881953e-05, + "loss": 0.6709, + "step": 3606 + }, + { + "epoch": 4.61696, + "grad_norm": 0.7694050669670105, + "learning_rate": 4.28031212484994e-05, + "loss": 0.6938, + "step": 3607 + }, + { + "epoch": 4.61824, + "grad_norm": 0.7003862261772156, + "learning_rate": 4.280112044817927e-05, + "loss": 0.6526, + "step": 3608 + }, + { + "epoch": 4.61952, + "grad_norm": 0.7154176831245422, + "learning_rate": 4.279911964785915e-05, + "loss": 0.6106, + "step": 3609 + }, + { + "epoch": 4.6208, + "grad_norm": 0.6721168160438538, + "learning_rate": 4.279711884753902e-05, + "loss": 0.6439, + "step": 3610 + }, + { + "epoch": 4.62208, + "grad_norm": 0.7174552083015442, + "learning_rate": 4.279511804721889e-05, + "loss": 0.6373, + "step": 3611 + }, + { + "epoch": 4.62336, + "grad_norm": 0.7415494918823242, + "learning_rate": 4.279311724689876e-05, + "loss": 0.6797, + "step": 3612 + }, + { + "epoch": 4.62464, + "grad_norm": 0.6769813299179077, + "learning_rate": 4.2791116446578634e-05, + "loss": 0.5868, + "step": 3613 + }, + { + "epoch": 4.62592, + "grad_norm": 0.6615371108055115, + "learning_rate": 4.2789115646258506e-05, + "loss": 0.5537, + "step": 3614 + }, + { + "epoch": 4.6272, + "grad_norm": 0.7274612188339233, + "learning_rate": 4.278711484593838e-05, + "loss": 0.6443, + "step": 3615 + }, + { + "epoch": 4.62848, + "grad_norm": 0.7486844658851624, + "learning_rate": 4.278511404561825e-05, + "loss": 0.6367, + "step": 3616 + }, + { + "epoch": 4.62976, + "grad_norm": 0.7376373410224915, + "learning_rate": 4.278311324529812e-05, + "loss": 0.6621, + "step": 3617 + }, + { + "epoch": 4.6310400000000005, + "grad_norm": 0.6872167587280273, + "learning_rate": 4.2781112444977994e-05, + "loss": 0.6174, + "step": 3618 + }, + { + "epoch": 4.63232, + "grad_norm": 
0.7116283774375916, + "learning_rate": 4.2779111644657865e-05, + "loss": 0.6833, + "step": 3619 + }, + { + "epoch": 4.6336, + "grad_norm": 0.7109349966049194, + "learning_rate": 4.277711084433774e-05, + "loss": 0.6226, + "step": 3620 + }, + { + "epoch": 4.63488, + "grad_norm": 0.6518250703811646, + "learning_rate": 4.277511004401761e-05, + "loss": 0.6072, + "step": 3621 + }, + { + "epoch": 4.63616, + "grad_norm": 0.7209012508392334, + "learning_rate": 4.277310924369748e-05, + "loss": 0.6737, + "step": 3622 + }, + { + "epoch": 4.63744, + "grad_norm": 0.7458158731460571, + "learning_rate": 4.277110844337735e-05, + "loss": 0.702, + "step": 3623 + }, + { + "epoch": 4.63872, + "grad_norm": 0.7005342245101929, + "learning_rate": 4.2769107643057225e-05, + "loss": 0.6082, + "step": 3624 + }, + { + "epoch": 4.64, + "grad_norm": 0.7359268665313721, + "learning_rate": 4.27671068427371e-05, + "loss": 0.6727, + "step": 3625 + }, + { + "epoch": 4.64128, + "grad_norm": 0.7191507816314697, + "learning_rate": 4.276510604241697e-05, + "loss": 0.6743, + "step": 3626 + }, + { + "epoch": 4.64256, + "grad_norm": 0.7470155954360962, + "learning_rate": 4.276310524209684e-05, + "loss": 0.6833, + "step": 3627 + }, + { + "epoch": 4.64384, + "grad_norm": 0.7260749936103821, + "learning_rate": 4.276110444177671e-05, + "loss": 0.6241, + "step": 3628 + }, + { + "epoch": 4.64512, + "grad_norm": 0.7276033759117126, + "learning_rate": 4.2759103641456584e-05, + "loss": 0.6384, + "step": 3629 + }, + { + "epoch": 4.6464, + "grad_norm": 0.7192178964614868, + "learning_rate": 4.275710284113646e-05, + "loss": 0.6837, + "step": 3630 + }, + { + "epoch": 4.64768, + "grad_norm": 0.7400460839271545, + "learning_rate": 4.275510204081633e-05, + "loss": 0.6588, + "step": 3631 + }, + { + "epoch": 4.64896, + "grad_norm": 0.7040890455245972, + "learning_rate": 4.27531012404962e-05, + "loss": 0.6355, + "step": 3632 + }, + { + "epoch": 4.65024, + "grad_norm": 0.7764822840690613, + "learning_rate": 
4.275110044017607e-05, + "loss": 0.6686, + "step": 3633 + }, + { + "epoch": 4.65152, + "grad_norm": 0.72344571352005, + "learning_rate": 4.274909963985594e-05, + "loss": 0.6094, + "step": 3634 + }, + { + "epoch": 4.6528, + "grad_norm": 0.7050177454948425, + "learning_rate": 4.2747098839535815e-05, + "loss": 0.6337, + "step": 3635 + }, + { + "epoch": 4.65408, + "grad_norm": 0.7142139673233032, + "learning_rate": 4.274509803921569e-05, + "loss": 0.6422, + "step": 3636 + }, + { + "epoch": 4.65536, + "grad_norm": 0.7412136793136597, + "learning_rate": 4.274309723889556e-05, + "loss": 0.6324, + "step": 3637 + }, + { + "epoch": 4.65664, + "grad_norm": 0.7400050163269043, + "learning_rate": 4.274109643857544e-05, + "loss": 0.6371, + "step": 3638 + }, + { + "epoch": 4.65792, + "grad_norm": 0.6862472295761108, + "learning_rate": 4.27390956382553e-05, + "loss": 0.6114, + "step": 3639 + }, + { + "epoch": 4.6592, + "grad_norm": 0.6816151142120361, + "learning_rate": 4.2737094837935174e-05, + "loss": 0.6169, + "step": 3640 + }, + { + "epoch": 4.66048, + "grad_norm": 0.7262451648712158, + "learning_rate": 4.2735094037615046e-05, + "loss": 0.6691, + "step": 3641 + }, + { + "epoch": 4.66176, + "grad_norm": 0.6490700244903564, + "learning_rate": 4.273309323729492e-05, + "loss": 0.5776, + "step": 3642 + }, + { + "epoch": 4.66304, + "grad_norm": 0.701505184173584, + "learning_rate": 4.273109243697479e-05, + "loss": 0.6501, + "step": 3643 + }, + { + "epoch": 4.66432, + "grad_norm": 0.731262743473053, + "learning_rate": 4.272909163665466e-05, + "loss": 0.6893, + "step": 3644 + }, + { + "epoch": 4.6655999999999995, + "grad_norm": 0.6694958209991455, + "learning_rate": 4.272709083633454e-05, + "loss": 0.6047, + "step": 3645 + }, + { + "epoch": 4.66688, + "grad_norm": 0.7336912155151367, + "learning_rate": 4.272509003601441e-05, + "loss": 0.6718, + "step": 3646 + }, + { + "epoch": 4.66816, + "grad_norm": 0.7041752338409424, + "learning_rate": 4.272308923569428e-05, + "loss": 0.6913, + 
"step": 3647 + }, + { + "epoch": 4.66944, + "grad_norm": 0.7393541932106018, + "learning_rate": 4.272108843537415e-05, + "loss": 0.6946, + "step": 3648 + }, + { + "epoch": 4.67072, + "grad_norm": 0.6891679167747498, + "learning_rate": 4.271908763505402e-05, + "loss": 0.5869, + "step": 3649 + }, + { + "epoch": 4.672, + "grad_norm": 0.6804254055023193, + "learning_rate": 4.271708683473389e-05, + "loss": 0.6375, + "step": 3650 + }, + { + "epoch": 4.67328, + "grad_norm": 0.7413250803947449, + "learning_rate": 4.2715086034413765e-05, + "loss": 0.6526, + "step": 3651 + }, + { + "epoch": 4.67456, + "grad_norm": 0.7123025059700012, + "learning_rate": 4.2713085234093644e-05, + "loss": 0.6193, + "step": 3652 + }, + { + "epoch": 4.67584, + "grad_norm": 0.6694692373275757, + "learning_rate": 4.2711084433773515e-05, + "loss": 0.6437, + "step": 3653 + }, + { + "epoch": 4.67712, + "grad_norm": 0.744706392288208, + "learning_rate": 4.270908363345339e-05, + "loss": 0.648, + "step": 3654 + }, + { + "epoch": 4.6784, + "grad_norm": 0.7176006436347961, + "learning_rate": 4.270708283313325e-05, + "loss": 0.6077, + "step": 3655 + }, + { + "epoch": 4.67968, + "grad_norm": 0.7272926568984985, + "learning_rate": 4.2705082032813124e-05, + "loss": 0.6571, + "step": 3656 + }, + { + "epoch": 4.68096, + "grad_norm": 0.6795682907104492, + "learning_rate": 4.2703081232492996e-05, + "loss": 0.6645, + "step": 3657 + }, + { + "epoch": 4.68224, + "grad_norm": 0.7655096650123596, + "learning_rate": 4.270108043217287e-05, + "loss": 0.6812, + "step": 3658 + }, + { + "epoch": 4.68352, + "grad_norm": 0.7039996981620789, + "learning_rate": 4.2699079631852747e-05, + "loss": 0.6863, + "step": 3659 + }, + { + "epoch": 4.6848, + "grad_norm": 0.7228350639343262, + "learning_rate": 4.269707883153262e-05, + "loss": 0.6519, + "step": 3660 + }, + { + "epoch": 4.6860800000000005, + "grad_norm": 0.658035397529602, + "learning_rate": 4.269507803121249e-05, + "loss": 0.6366, + "step": 3661 + }, + { + "epoch": 4.68736, + 
"grad_norm": 0.7360743880271912, + "learning_rate": 4.269307723089236e-05, + "loss": 0.6543, + "step": 3662 + }, + { + "epoch": 4.68864, + "grad_norm": 0.6831920146942139, + "learning_rate": 4.269107643057223e-05, + "loss": 0.6334, + "step": 3663 + }, + { + "epoch": 4.68992, + "grad_norm": 0.7199382185935974, + "learning_rate": 4.26890756302521e-05, + "loss": 0.6388, + "step": 3664 + }, + { + "epoch": 4.6912, + "grad_norm": 0.6735736131668091, + "learning_rate": 4.268707482993197e-05, + "loss": 0.6249, + "step": 3665 + }, + { + "epoch": 4.69248, + "grad_norm": 0.7085421681404114, + "learning_rate": 4.268507402961185e-05, + "loss": 0.6752, + "step": 3666 + }, + { + "epoch": 4.69376, + "grad_norm": 0.7214930057525635, + "learning_rate": 4.268307322929172e-05, + "loss": 0.6651, + "step": 3667 + }, + { + "epoch": 4.69504, + "grad_norm": 0.7339044213294983, + "learning_rate": 4.268107242897159e-05, + "loss": 0.7262, + "step": 3668 + }, + { + "epoch": 4.69632, + "grad_norm": 0.707331120967865, + "learning_rate": 4.2679071628651465e-05, + "loss": 0.5875, + "step": 3669 + }, + { + "epoch": 4.6975999999999996, + "grad_norm": 0.7132313847541809, + "learning_rate": 4.267707082833134e-05, + "loss": 0.6412, + "step": 3670 + }, + { + "epoch": 4.69888, + "grad_norm": 0.6542635560035706, + "learning_rate": 4.26750700280112e-05, + "loss": 0.6243, + "step": 3671 + }, + { + "epoch": 4.70016, + "grad_norm": 0.6929904818534851, + "learning_rate": 4.2673069227691074e-05, + "loss": 0.6621, + "step": 3672 + }, + { + "epoch": 4.70144, + "grad_norm": 0.6636141538619995, + "learning_rate": 4.267106842737095e-05, + "loss": 0.6205, + "step": 3673 + }, + { + "epoch": 4.70272, + "grad_norm": 0.7498050332069397, + "learning_rate": 4.2669067627050824e-05, + "loss": 0.6606, + "step": 3674 + }, + { + "epoch": 4.704, + "grad_norm": 0.6959406137466431, + "learning_rate": 4.2667066826730696e-05, + "loss": 0.627, + "step": 3675 + }, + { + "epoch": 4.70528, + "grad_norm": 0.7800446152687073, + 
"learning_rate": 4.266506602641057e-05, + "loss": 0.6928, + "step": 3676 + }, + { + "epoch": 4.70656, + "grad_norm": 0.7218673825263977, + "learning_rate": 4.266306522609044e-05, + "loss": 0.6479, + "step": 3677 + }, + { + "epoch": 4.70784, + "grad_norm": 0.6821399331092834, + "learning_rate": 4.266106442577031e-05, + "loss": 0.6622, + "step": 3678 + }, + { + "epoch": 4.70912, + "grad_norm": 0.6904776692390442, + "learning_rate": 4.265906362545018e-05, + "loss": 0.6627, + "step": 3679 + }, + { + "epoch": 4.7104, + "grad_norm": 0.6957124471664429, + "learning_rate": 4.2657062825130056e-05, + "loss": 0.6226, + "step": 3680 + }, + { + "epoch": 4.71168, + "grad_norm": 0.7129904627799988, + "learning_rate": 4.265506202480993e-05, + "loss": 0.6552, + "step": 3681 + }, + { + "epoch": 4.71296, + "grad_norm": 0.7037960886955261, + "learning_rate": 4.26530612244898e-05, + "loss": 0.6661, + "step": 3682 + }, + { + "epoch": 4.71424, + "grad_norm": 0.7117835283279419, + "learning_rate": 4.265106042416967e-05, + "loss": 0.6382, + "step": 3683 + }, + { + "epoch": 4.71552, + "grad_norm": 0.7245675325393677, + "learning_rate": 4.264905962384954e-05, + "loss": 0.625, + "step": 3684 + }, + { + "epoch": 4.7168, + "grad_norm": 0.7148410677909851, + "learning_rate": 4.2647058823529415e-05, + "loss": 0.6358, + "step": 3685 + }, + { + "epoch": 4.7180800000000005, + "grad_norm": 0.7175400257110596, + "learning_rate": 4.264505802320929e-05, + "loss": 0.6145, + "step": 3686 + }, + { + "epoch": 4.71936, + "grad_norm": 0.7203708291053772, + "learning_rate": 4.264305722288916e-05, + "loss": 0.6923, + "step": 3687 + }, + { + "epoch": 4.7206399999999995, + "grad_norm": 0.7105974555015564, + "learning_rate": 4.264105642256903e-05, + "loss": 0.6218, + "step": 3688 + }, + { + "epoch": 4.72192, + "grad_norm": 0.7175532579421997, + "learning_rate": 4.26390556222489e-05, + "loss": 0.6648, + "step": 3689 + }, + { + "epoch": 4.7232, + "grad_norm": 0.7026408910751343, + "learning_rate": 
4.2637054821928774e-05, + "loss": 0.6619, + "step": 3690 + }, + { + "epoch": 4.72448, + "grad_norm": 0.7409774661064148, + "learning_rate": 4.2635054021608646e-05, + "loss": 0.7161, + "step": 3691 + }, + { + "epoch": 4.72576, + "grad_norm": 0.6791418194770813, + "learning_rate": 4.263305322128852e-05, + "loss": 0.6204, + "step": 3692 + }, + { + "epoch": 4.72704, + "grad_norm": 0.6994455456733704, + "learning_rate": 4.263105242096839e-05, + "loss": 0.6185, + "step": 3693 + }, + { + "epoch": 4.72832, + "grad_norm": 0.7309840321540833, + "learning_rate": 4.262905162064826e-05, + "loss": 0.7147, + "step": 3694 + }, + { + "epoch": 4.7296, + "grad_norm": 0.6696488261222839, + "learning_rate": 4.262705082032813e-05, + "loss": 0.5993, + "step": 3695 + }, + { + "epoch": 4.73088, + "grad_norm": 0.7723877429962158, + "learning_rate": 4.2625050020008005e-05, + "loss": 0.6397, + "step": 3696 + }, + { + "epoch": 4.73216, + "grad_norm": 0.6929299831390381, + "learning_rate": 4.262304921968788e-05, + "loss": 0.6304, + "step": 3697 + }, + { + "epoch": 4.73344, + "grad_norm": 0.6986497640609741, + "learning_rate": 4.262104841936775e-05, + "loss": 0.6173, + "step": 3698 + }, + { + "epoch": 4.73472, + "grad_norm": 0.7609420418739319, + "learning_rate": 4.261904761904762e-05, + "loss": 0.6912, + "step": 3699 + }, + { + "epoch": 4.736, + "grad_norm": 0.7289432287216187, + "learning_rate": 4.261704681872749e-05, + "loss": 0.6354, + "step": 3700 + }, + { + "epoch": 4.73728, + "grad_norm": 0.7391143441200256, + "learning_rate": 4.2615046018407365e-05, + "loss": 0.6755, + "step": 3701 + }, + { + "epoch": 4.73856, + "grad_norm": 0.7171617746353149, + "learning_rate": 4.2613045218087236e-05, + "loss": 0.6639, + "step": 3702 + }, + { + "epoch": 4.73984, + "grad_norm": 0.7035807371139526, + "learning_rate": 4.261104441776711e-05, + "loss": 0.6142, + "step": 3703 + }, + { + "epoch": 4.7411200000000004, + "grad_norm": 0.6329298615455627, + "learning_rate": 4.260904361744698e-05, + "loss": 0.6249, 
+ "step": 3704 + }, + { + "epoch": 4.7424, + "grad_norm": 0.6702702045440674, + "learning_rate": 4.260704281712685e-05, + "loss": 0.5998, + "step": 3705 + }, + { + "epoch": 4.74368, + "grad_norm": 0.7044887542724609, + "learning_rate": 4.2605042016806724e-05, + "loss": 0.6308, + "step": 3706 + }, + { + "epoch": 4.74496, + "grad_norm": 0.6639620065689087, + "learning_rate": 4.2603041216486596e-05, + "loss": 0.6565, + "step": 3707 + }, + { + "epoch": 4.74624, + "grad_norm": 0.72665935754776, + "learning_rate": 4.2601040416166474e-05, + "loss": 0.6819, + "step": 3708 + }, + { + "epoch": 4.74752, + "grad_norm": 0.7186412811279297, + "learning_rate": 4.259903961584634e-05, + "loss": 0.6577, + "step": 3709 + }, + { + "epoch": 4.7488, + "grad_norm": 0.6942617893218994, + "learning_rate": 4.259703881552621e-05, + "loss": 0.6603, + "step": 3710 + }, + { + "epoch": 4.75008, + "grad_norm": 0.6832451224327087, + "learning_rate": 4.259503801520608e-05, + "loss": 0.6013, + "step": 3711 + }, + { + "epoch": 4.75136, + "grad_norm": 0.765796959400177, + "learning_rate": 4.2593037214885955e-05, + "loss": 0.6965, + "step": 3712 + }, + { + "epoch": 4.7526399999999995, + "grad_norm": 0.7410770654678345, + "learning_rate": 4.259103641456583e-05, + "loss": 0.6428, + "step": 3713 + }, + { + "epoch": 4.75392, + "grad_norm": 0.7163513898849487, + "learning_rate": 4.25890356142457e-05, + "loss": 0.6708, + "step": 3714 + }, + { + "epoch": 4.7552, + "grad_norm": 0.7363733053207397, + "learning_rate": 4.258703481392558e-05, + "loss": 0.6724, + "step": 3715 + }, + { + "epoch": 4.75648, + "grad_norm": 0.6866661310195923, + "learning_rate": 4.258503401360545e-05, + "loss": 0.6056, + "step": 3716 + }, + { + "epoch": 4.75776, + "grad_norm": 0.723304271697998, + "learning_rate": 4.2583033213285314e-05, + "loss": 0.6742, + "step": 3717 + }, + { + "epoch": 4.75904, + "grad_norm": 0.730629026889801, + "learning_rate": 4.2581032412965186e-05, + "loss": 0.6486, + "step": 3718 + }, + { + "epoch": 4.76032, + 
"grad_norm": 0.6811598539352417, + "learning_rate": 4.257903161264506e-05, + "loss": 0.6246, + "step": 3719 + }, + { + "epoch": 4.7616, + "grad_norm": 0.7193059921264648, + "learning_rate": 4.257703081232493e-05, + "loss": 0.6695, + "step": 3720 + }, + { + "epoch": 4.76288, + "grad_norm": 0.7182266116142273, + "learning_rate": 4.25750300120048e-05, + "loss": 0.6379, + "step": 3721 + }, + { + "epoch": 4.76416, + "grad_norm": 0.6965410113334656, + "learning_rate": 4.257302921168468e-05, + "loss": 0.6782, + "step": 3722 + }, + { + "epoch": 4.76544, + "grad_norm": 0.7410017251968384, + "learning_rate": 4.257102841136455e-05, + "loss": 0.6226, + "step": 3723 + }, + { + "epoch": 4.76672, + "grad_norm": 0.6835784316062927, + "learning_rate": 4.2569027611044424e-05, + "loss": 0.6244, + "step": 3724 + }, + { + "epoch": 4.768, + "grad_norm": 0.7124270796775818, + "learning_rate": 4.256702681072429e-05, + "loss": 0.6385, + "step": 3725 + }, + { + "epoch": 4.76928, + "grad_norm": 0.6720713376998901, + "learning_rate": 4.256502601040416e-05, + "loss": 0.6582, + "step": 3726 + }, + { + "epoch": 4.77056, + "grad_norm": 0.690466046333313, + "learning_rate": 4.256302521008403e-05, + "loss": 0.6574, + "step": 3727 + }, + { + "epoch": 4.77184, + "grad_norm": 0.7075904011726379, + "learning_rate": 4.2561024409763905e-05, + "loss": 0.6623, + "step": 3728 + }, + { + "epoch": 4.7731200000000005, + "grad_norm": 0.7090795636177063, + "learning_rate": 4.255902360944378e-05, + "loss": 0.6237, + "step": 3729 + }, + { + "epoch": 4.7744, + "grad_norm": 0.6967689394950867, + "learning_rate": 4.2557022809123655e-05, + "loss": 0.6401, + "step": 3730 + }, + { + "epoch": 4.77568, + "grad_norm": 0.7227605581283569, + "learning_rate": 4.255502200880353e-05, + "loss": 0.6841, + "step": 3731 + }, + { + "epoch": 4.77696, + "grad_norm": 0.7367991209030151, + "learning_rate": 4.25530212084834e-05, + "loss": 0.6267, + "step": 3732 + }, + { + "epoch": 4.77824, + "grad_norm": 0.655534565448761, + 
"learning_rate": 4.2551020408163264e-05, + "loss": 0.5815, + "step": 3733 + }, + { + "epoch": 4.77952, + "grad_norm": 0.7497329115867615, + "learning_rate": 4.2549019607843136e-05, + "loss": 0.6308, + "step": 3734 + }, + { + "epoch": 4.7808, + "grad_norm": 0.7356445789337158, + "learning_rate": 4.254701880752301e-05, + "loss": 0.6944, + "step": 3735 + }, + { + "epoch": 4.78208, + "grad_norm": 0.7261192202568054, + "learning_rate": 4.2545018007202886e-05, + "loss": 0.6922, + "step": 3736 + }, + { + "epoch": 4.78336, + "grad_norm": 0.727255642414093, + "learning_rate": 4.254301720688276e-05, + "loss": 0.6582, + "step": 3737 + }, + { + "epoch": 4.78464, + "grad_norm": 0.7310218811035156, + "learning_rate": 4.254101640656263e-05, + "loss": 0.6671, + "step": 3738 + }, + { + "epoch": 4.78592, + "grad_norm": 0.7008660435676575, + "learning_rate": 4.25390156062425e-05, + "loss": 0.6518, + "step": 3739 + }, + { + "epoch": 4.7872, + "grad_norm": 0.6646680235862732, + "learning_rate": 4.2537014805922374e-05, + "loss": 0.6003, + "step": 3740 + }, + { + "epoch": 4.78848, + "grad_norm": 0.6931464076042175, + "learning_rate": 4.253501400560224e-05, + "loss": 0.624, + "step": 3741 + }, + { + "epoch": 4.78976, + "grad_norm": 0.7339210510253906, + "learning_rate": 4.253301320528211e-05, + "loss": 0.6385, + "step": 3742 + }, + { + "epoch": 4.79104, + "grad_norm": 0.7324719429016113, + "learning_rate": 4.253101240496199e-05, + "loss": 0.6915, + "step": 3743 + }, + { + "epoch": 4.79232, + "grad_norm": 0.7005221843719482, + "learning_rate": 4.252901160464186e-05, + "loss": 0.6452, + "step": 3744 + }, + { + "epoch": 4.7936, + "grad_norm": 0.7293890118598938, + "learning_rate": 4.252701080432173e-05, + "loss": 0.6334, + "step": 3745 + }, + { + "epoch": 4.79488, + "grad_norm": 0.709510862827301, + "learning_rate": 4.2525010004001605e-05, + "loss": 0.6718, + "step": 3746 + }, + { + "epoch": 4.79616, + "grad_norm": 0.7241347432136536, + "learning_rate": 4.252300920368148e-05, + "loss": 
0.6326, + "step": 3747 + }, + { + "epoch": 4.79744, + "grad_norm": 0.6769399642944336, + "learning_rate": 4.252100840336135e-05, + "loss": 0.6442, + "step": 3748 + }, + { + "epoch": 4.79872, + "grad_norm": 0.7203426361083984, + "learning_rate": 4.2519007603041214e-05, + "loss": 0.644, + "step": 3749 + }, + { + "epoch": 4.8, + "grad_norm": 0.734929084777832, + "learning_rate": 4.2517006802721085e-05, + "loss": 0.6644, + "step": 3750 + }, + { + "epoch": 4.80128, + "grad_norm": 0.7082551717758179, + "learning_rate": 4.2515006002400964e-05, + "loss": 0.6084, + "step": 3751 + }, + { + "epoch": 4.80256, + "grad_norm": 0.6955980062484741, + "learning_rate": 4.2513005202080836e-05, + "loss": 0.6303, + "step": 3752 + }, + { + "epoch": 4.80384, + "grad_norm": 0.6883904933929443, + "learning_rate": 4.251100440176071e-05, + "loss": 0.6464, + "step": 3753 + }, + { + "epoch": 4.80512, + "grad_norm": 0.6787061095237732, + "learning_rate": 4.250900360144058e-05, + "loss": 0.6351, + "step": 3754 + }, + { + "epoch": 4.8064, + "grad_norm": 0.6913610696792603, + "learning_rate": 4.250700280112045e-05, + "loss": 0.5745, + "step": 3755 + }, + { + "epoch": 4.8076799999999995, + "grad_norm": 0.7253073453903198, + "learning_rate": 4.2505002000800323e-05, + "loss": 0.6662, + "step": 3756 + }, + { + "epoch": 4.80896, + "grad_norm": 0.7172198295593262, + "learning_rate": 4.250300120048019e-05, + "loss": 0.6726, + "step": 3757 + }, + { + "epoch": 4.81024, + "grad_norm": 0.7208426594734192, + "learning_rate": 4.250100040016007e-05, + "loss": 0.6478, + "step": 3758 + }, + { + "epoch": 4.81152, + "grad_norm": 0.6597497463226318, + "learning_rate": 4.249899959983994e-05, + "loss": 0.6372, + "step": 3759 + }, + { + "epoch": 4.8128, + "grad_norm": 0.7280398011207581, + "learning_rate": 4.249699879951981e-05, + "loss": 0.6449, + "step": 3760 + }, + { + "epoch": 4.81408, + "grad_norm": 0.7056030035018921, + "learning_rate": 4.249499799919968e-05, + "loss": 0.6638, + "step": 3761 + }, + { + "epoch": 
4.81536, + "grad_norm": 0.7291399836540222, + "learning_rate": 4.2492997198879555e-05, + "loss": 0.6717, + "step": 3762 + }, + { + "epoch": 4.81664, + "grad_norm": 0.7240449786186218, + "learning_rate": 4.2490996398559426e-05, + "loss": 0.6896, + "step": 3763 + }, + { + "epoch": 4.81792, + "grad_norm": 0.6927871704101562, + "learning_rate": 4.24889955982393e-05, + "loss": 0.6627, + "step": 3764 + }, + { + "epoch": 4.8192, + "grad_norm": 0.6421213150024414, + "learning_rate": 4.248699479791917e-05, + "loss": 0.6177, + "step": 3765 + }, + { + "epoch": 4.82048, + "grad_norm": 0.6733725666999817, + "learning_rate": 4.248499399759904e-05, + "loss": 0.5987, + "step": 3766 + }, + { + "epoch": 4.82176, + "grad_norm": 0.7016422152519226, + "learning_rate": 4.2482993197278914e-05, + "loss": 0.6563, + "step": 3767 + }, + { + "epoch": 4.82304, + "grad_norm": 0.662580132484436, + "learning_rate": 4.2480992396958786e-05, + "loss": 0.6167, + "step": 3768 + }, + { + "epoch": 4.82432, + "grad_norm": 0.6980850696563721, + "learning_rate": 4.247899159663866e-05, + "loss": 0.5772, + "step": 3769 + }, + { + "epoch": 4.8256, + "grad_norm": 0.6765894293785095, + "learning_rate": 4.247699079631853e-05, + "loss": 0.6503, + "step": 3770 + }, + { + "epoch": 4.82688, + "grad_norm": 0.739733099937439, + "learning_rate": 4.24749899959984e-05, + "loss": 0.7014, + "step": 3771 + }, + { + "epoch": 4.8281600000000005, + "grad_norm": 0.706731915473938, + "learning_rate": 4.247298919567827e-05, + "loss": 0.6687, + "step": 3772 + }, + { + "epoch": 4.82944, + "grad_norm": 0.7055926322937012, + "learning_rate": 4.2470988395358145e-05, + "loss": 0.6354, + "step": 3773 + }, + { + "epoch": 4.83072, + "grad_norm": 0.7980726361274719, + "learning_rate": 4.246898759503802e-05, + "loss": 0.7241, + "step": 3774 + }, + { + "epoch": 4.832, + "grad_norm": 0.6939478516578674, + "learning_rate": 4.246698679471789e-05, + "loss": 0.621, + "step": 3775 + }, + { + "epoch": 4.83328, + "grad_norm": 0.6984978318214417, + 
"learning_rate": 4.246498599439776e-05, + "loss": 0.591, + "step": 3776 + }, + { + "epoch": 4.83456, + "grad_norm": 0.7327542901039124, + "learning_rate": 4.246298519407763e-05, + "loss": 0.6889, + "step": 3777 + }, + { + "epoch": 4.83584, + "grad_norm": 0.7143282890319824, + "learning_rate": 4.2460984393757504e-05, + "loss": 0.6558, + "step": 3778 + }, + { + "epoch": 4.83712, + "grad_norm": 0.7296421527862549, + "learning_rate": 4.2458983593437376e-05, + "loss": 0.6599, + "step": 3779 + }, + { + "epoch": 4.8384, + "grad_norm": 0.7196743488311768, + "learning_rate": 4.245698279311725e-05, + "loss": 0.6421, + "step": 3780 + }, + { + "epoch": 4.8396799999999995, + "grad_norm": 0.6790671944618225, + "learning_rate": 4.245498199279712e-05, + "loss": 0.6075, + "step": 3781 + }, + { + "epoch": 4.84096, + "grad_norm": 0.7062405347824097, + "learning_rate": 4.245298119247699e-05, + "loss": 0.6633, + "step": 3782 + }, + { + "epoch": 4.84224, + "grad_norm": 0.766210675239563, + "learning_rate": 4.2450980392156864e-05, + "loss": 0.6746, + "step": 3783 + }, + { + "epoch": 4.84352, + "grad_norm": 0.6822754740715027, + "learning_rate": 4.2448979591836735e-05, + "loss": 0.6352, + "step": 3784 + }, + { + "epoch": 4.8448, + "grad_norm": 0.6726470589637756, + "learning_rate": 4.244697879151661e-05, + "loss": 0.6153, + "step": 3785 + }, + { + "epoch": 4.84608, + "grad_norm": 0.7688694596290588, + "learning_rate": 4.2444977991196486e-05, + "loss": 0.7037, + "step": 3786 + }, + { + "epoch": 4.84736, + "grad_norm": 0.6525343060493469, + "learning_rate": 4.244297719087635e-05, + "loss": 0.6385, + "step": 3787 + }, + { + "epoch": 4.84864, + "grad_norm": 0.7031245827674866, + "learning_rate": 4.244097639055622e-05, + "loss": 0.6051, + "step": 3788 + }, + { + "epoch": 4.84992, + "grad_norm": 0.7337315082550049, + "learning_rate": 4.2438975590236095e-05, + "loss": 0.6638, + "step": 3789 + }, + { + "epoch": 4.8512, + "grad_norm": 0.7109333276748657, + "learning_rate": 4.2436974789915967e-05, 
+ "loss": 0.6656, + "step": 3790 + }, + { + "epoch": 4.85248, + "grad_norm": 0.7319778800010681, + "learning_rate": 4.243497398959584e-05, + "loss": 0.6261, + "step": 3791 + }, + { + "epoch": 4.85376, + "grad_norm": 0.7297192811965942, + "learning_rate": 4.243297318927571e-05, + "loss": 0.703, + "step": 3792 + }, + { + "epoch": 4.85504, + "grad_norm": 0.7348429560661316, + "learning_rate": 4.243097238895559e-05, + "loss": 0.6583, + "step": 3793 + }, + { + "epoch": 4.85632, + "grad_norm": 0.7114925384521484, + "learning_rate": 4.242897158863546e-05, + "loss": 0.6621, + "step": 3794 + }, + { + "epoch": 4.8576, + "grad_norm": 0.679014265537262, + "learning_rate": 4.2426970788315326e-05, + "loss": 0.615, + "step": 3795 + }, + { + "epoch": 4.85888, + "grad_norm": 0.7080205678939819, + "learning_rate": 4.24249699879952e-05, + "loss": 0.653, + "step": 3796 + }, + { + "epoch": 4.8601600000000005, + "grad_norm": 0.7360637187957764, + "learning_rate": 4.242296918767507e-05, + "loss": 0.6624, + "step": 3797 + }, + { + "epoch": 4.86144, + "grad_norm": 0.7157953977584839, + "learning_rate": 4.242096838735494e-05, + "loss": 0.6692, + "step": 3798 + }, + { + "epoch": 4.86272, + "grad_norm": 0.7022566199302673, + "learning_rate": 4.241896758703481e-05, + "loss": 0.6586, + "step": 3799 + }, + { + "epoch": 4.864, + "grad_norm": 0.7275310158729553, + "learning_rate": 4.241696678671469e-05, + "loss": 0.6602, + "step": 3800 + }, + { + "epoch": 4.86528, + "grad_norm": 0.7159553170204163, + "learning_rate": 4.2414965986394564e-05, + "loss": 0.6299, + "step": 3801 + }, + { + "epoch": 4.86656, + "grad_norm": 0.7265822887420654, + "learning_rate": 4.2412965186074436e-05, + "loss": 0.5922, + "step": 3802 + }, + { + "epoch": 4.86784, + "grad_norm": 0.6696851253509521, + "learning_rate": 4.24109643857543e-05, + "loss": 0.6128, + "step": 3803 + }, + { + "epoch": 4.86912, + "grad_norm": 0.7184013724327087, + "learning_rate": 4.240896358543417e-05, + "loss": 0.6601, + "step": 3804 + }, + { + 
"epoch": 4.8704, + "grad_norm": 0.7109546661376953, + "learning_rate": 4.2406962785114044e-05, + "loss": 0.6546, + "step": 3805 + }, + { + "epoch": 4.87168, + "grad_norm": 0.7122465372085571, + "learning_rate": 4.2404961984793916e-05, + "loss": 0.6324, + "step": 3806 + }, + { + "epoch": 4.87296, + "grad_norm": 0.7610680460929871, + "learning_rate": 4.2402961184473795e-05, + "loss": 0.6355, + "step": 3807 + }, + { + "epoch": 4.87424, + "grad_norm": 0.7229766845703125, + "learning_rate": 4.240096038415367e-05, + "loss": 0.6396, + "step": 3808 + }, + { + "epoch": 4.87552, + "grad_norm": 0.7144048810005188, + "learning_rate": 4.239895958383354e-05, + "loss": 0.6485, + "step": 3809 + }, + { + "epoch": 4.8768, + "grad_norm": 0.7222286462783813, + "learning_rate": 4.239695878351341e-05, + "loss": 0.6919, + "step": 3810 + }, + { + "epoch": 4.87808, + "grad_norm": 0.700534462928772, + "learning_rate": 4.2394957983193276e-05, + "loss": 0.6468, + "step": 3811 + }, + { + "epoch": 4.87936, + "grad_norm": 0.696043074131012, + "learning_rate": 4.239295718287315e-05, + "loss": 0.589, + "step": 3812 + }, + { + "epoch": 4.88064, + "grad_norm": 0.6939215064048767, + "learning_rate": 4.239095638255302e-05, + "loss": 0.6358, + "step": 3813 + }, + { + "epoch": 4.88192, + "grad_norm": 0.6854797601699829, + "learning_rate": 4.23889555822329e-05, + "loss": 0.6574, + "step": 3814 + }, + { + "epoch": 4.8832, + "grad_norm": 0.7024257183074951, + "learning_rate": 4.238695478191277e-05, + "loss": 0.6688, + "step": 3815 + }, + { + "epoch": 4.88448, + "grad_norm": 0.7299116849899292, + "learning_rate": 4.238495398159264e-05, + "loss": 0.6921, + "step": 3816 + }, + { + "epoch": 4.88576, + "grad_norm": 0.7275465130805969, + "learning_rate": 4.2382953181272513e-05, + "loss": 0.6454, + "step": 3817 + }, + { + "epoch": 4.88704, + "grad_norm": 0.7085564136505127, + "learning_rate": 4.2380952380952385e-05, + "loss": 0.6523, + "step": 3818 + }, + { + "epoch": 4.88832, + "grad_norm": 0.7188389897346497, + 
"learning_rate": 4.237895158063225e-05, + "loss": 0.6946, + "step": 3819 + }, + { + "epoch": 4.8896, + "grad_norm": 0.72255939245224, + "learning_rate": 4.237695078031212e-05, + "loss": 0.6166, + "step": 3820 + }, + { + "epoch": 4.89088, + "grad_norm": 0.6671111583709717, + "learning_rate": 4.2374949979992e-05, + "loss": 0.6173, + "step": 3821 + }, + { + "epoch": 4.89216, + "grad_norm": 0.7809231877326965, + "learning_rate": 4.237294917967187e-05, + "loss": 0.7262, + "step": 3822 + }, + { + "epoch": 4.89344, + "grad_norm": 0.6766269207000732, + "learning_rate": 4.2370948379351745e-05, + "loss": 0.6216, + "step": 3823 + }, + { + "epoch": 4.8947199999999995, + "grad_norm": 0.7346859574317932, + "learning_rate": 4.2368947579031616e-05, + "loss": 0.6529, + "step": 3824 + }, + { + "epoch": 4.896, + "grad_norm": 0.7254279851913452, + "learning_rate": 4.236694677871149e-05, + "loss": 0.6319, + "step": 3825 + }, + { + "epoch": 4.89728, + "grad_norm": 0.7144622802734375, + "learning_rate": 4.236494597839136e-05, + "loss": 0.6586, + "step": 3826 + }, + { + "epoch": 4.89856, + "grad_norm": 0.7005541324615479, + "learning_rate": 4.2362945178071225e-05, + "loss": 0.67, + "step": 3827 + }, + { + "epoch": 4.89984, + "grad_norm": 0.7724364995956421, + "learning_rate": 4.2360944377751104e-05, + "loss": 0.663, + "step": 3828 + }, + { + "epoch": 4.90112, + "grad_norm": 0.7261497974395752, + "learning_rate": 4.2358943577430976e-05, + "loss": 0.6781, + "step": 3829 + }, + { + "epoch": 4.9024, + "grad_norm": 0.673323929309845, + "learning_rate": 4.235694277711085e-05, + "loss": 0.6236, + "step": 3830 + }, + { + "epoch": 4.90368, + "grad_norm": 0.6874783635139465, + "learning_rate": 4.235494197679072e-05, + "loss": 0.644, + "step": 3831 + }, + { + "epoch": 4.90496, + "grad_norm": 0.7447608709335327, + "learning_rate": 4.235294117647059e-05, + "loss": 0.7078, + "step": 3832 + }, + { + "epoch": 4.90624, + "grad_norm": 0.7306890487670898, + "learning_rate": 4.235094037615046e-05, + "loss": 
0.649, + "step": 3833 + }, + { + "epoch": 4.90752, + "grad_norm": 0.7219198942184448, + "learning_rate": 4.2348939575830335e-05, + "loss": 0.6817, + "step": 3834 + }, + { + "epoch": 4.9088, + "grad_norm": 0.7355913519859314, + "learning_rate": 4.234693877551021e-05, + "loss": 0.6291, + "step": 3835 + }, + { + "epoch": 4.91008, + "grad_norm": 0.7445414662361145, + "learning_rate": 4.234493797519008e-05, + "loss": 0.6903, + "step": 3836 + }, + { + "epoch": 4.91136, + "grad_norm": 0.6713249087333679, + "learning_rate": 4.234293717486995e-05, + "loss": 0.6459, + "step": 3837 + }, + { + "epoch": 4.91264, + "grad_norm": 0.7056488990783691, + "learning_rate": 4.234093637454982e-05, + "loss": 0.6648, + "step": 3838 + }, + { + "epoch": 4.91392, + "grad_norm": 0.6989180445671082, + "learning_rate": 4.2338935574229694e-05, + "loss": 0.705, + "step": 3839 + }, + { + "epoch": 4.9152000000000005, + "grad_norm": 0.753159761428833, + "learning_rate": 4.2336934773909566e-05, + "loss": 0.6622, + "step": 3840 + }, + { + "epoch": 4.91648, + "grad_norm": 0.7392045855522156, + "learning_rate": 4.233493397358944e-05, + "loss": 0.6205, + "step": 3841 + }, + { + "epoch": 4.91776, + "grad_norm": 0.7536989450454712, + "learning_rate": 4.233293317326931e-05, + "loss": 0.6748, + "step": 3842 + }, + { + "epoch": 4.91904, + "grad_norm": 0.7107492089271545, + "learning_rate": 4.233093237294918e-05, + "loss": 0.6372, + "step": 3843 + }, + { + "epoch": 4.92032, + "grad_norm": 0.6968305110931396, + "learning_rate": 4.2328931572629054e-05, + "loss": 0.6463, + "step": 3844 + }, + { + "epoch": 4.9216, + "grad_norm": 0.7517611980438232, + "learning_rate": 4.2326930772308925e-05, + "loss": 0.6505, + "step": 3845 + }, + { + "epoch": 4.92288, + "grad_norm": 0.7284044623374939, + "learning_rate": 4.23249299719888e-05, + "loss": 0.6767, + "step": 3846 + }, + { + "epoch": 4.92416, + "grad_norm": 0.676810085773468, + "learning_rate": 4.232292917166867e-05, + "loss": 0.6505, + "step": 3847 + }, + { + "epoch": 
4.92544, + "grad_norm": 0.7427054643630981, + "learning_rate": 4.232092837134854e-05, + "loss": 0.684, + "step": 3848 + }, + { + "epoch": 4.9267199999999995, + "grad_norm": 0.7146655917167664, + "learning_rate": 4.231892757102841e-05, + "loss": 0.6527, + "step": 3849 + }, + { + "epoch": 4.928, + "grad_norm": 0.690930187702179, + "learning_rate": 4.2316926770708285e-05, + "loss": 0.6584, + "step": 3850 + }, + { + "epoch": 4.92928, + "grad_norm": 0.742597222328186, + "learning_rate": 4.2314925970388157e-05, + "loss": 0.6644, + "step": 3851 + }, + { + "epoch": 4.93056, + "grad_norm": 0.7021815776824951, + "learning_rate": 4.231292517006803e-05, + "loss": 0.6765, + "step": 3852 + }, + { + "epoch": 4.93184, + "grad_norm": 0.7302278876304626, + "learning_rate": 4.23109243697479e-05, + "loss": 0.6875, + "step": 3853 + }, + { + "epoch": 4.93312, + "grad_norm": 0.7111605405807495, + "learning_rate": 4.230892356942777e-05, + "loss": 0.6808, + "step": 3854 + }, + { + "epoch": 4.9344, + "grad_norm": 0.7146978378295898, + "learning_rate": 4.2306922769107644e-05, + "loss": 0.598, + "step": 3855 + }, + { + "epoch": 4.93568, + "grad_norm": 0.6885932683944702, + "learning_rate": 4.230492196878752e-05, + "loss": 0.6331, + "step": 3856 + }, + { + "epoch": 4.93696, + "grad_norm": 0.7933337092399597, + "learning_rate": 4.230292116846739e-05, + "loss": 0.7657, + "step": 3857 + }, + { + "epoch": 4.93824, + "grad_norm": 0.7346404790878296, + "learning_rate": 4.230092036814726e-05, + "loss": 0.7045, + "step": 3858 + }, + { + "epoch": 4.93952, + "grad_norm": 0.7179169058799744, + "learning_rate": 4.229891956782713e-05, + "loss": 0.6378, + "step": 3859 + }, + { + "epoch": 4.9408, + "grad_norm": 0.7097733020782471, + "learning_rate": 4.2296918767507e-05, + "loss": 0.6371, + "step": 3860 + }, + { + "epoch": 4.94208, + "grad_norm": 0.7483527064323425, + "learning_rate": 4.2294917967186875e-05, + "loss": 0.6334, + "step": 3861 + }, + { + "epoch": 4.94336, + "grad_norm": 0.7159331440925598, + 
"learning_rate": 4.229291716686675e-05, + "loss": 0.6409, + "step": 3862 + }, + { + "epoch": 4.94464, + "grad_norm": 0.7627670764923096, + "learning_rate": 4.229091636654662e-05, + "loss": 0.6671, + "step": 3863 + }, + { + "epoch": 4.94592, + "grad_norm": 0.7190044522285461, + "learning_rate": 4.22889155662265e-05, + "loss": 0.6906, + "step": 3864 + }, + { + "epoch": 4.9472000000000005, + "grad_norm": 0.7264943718910217, + "learning_rate": 4.228691476590636e-05, + "loss": 0.6485, + "step": 3865 + }, + { + "epoch": 4.94848, + "grad_norm": 0.7001417875289917, + "learning_rate": 4.2284913965586234e-05, + "loss": 0.6528, + "step": 3866 + }, + { + "epoch": 4.94976, + "grad_norm": 0.7295548319816589, + "learning_rate": 4.2282913165266106e-05, + "loss": 0.6047, + "step": 3867 + }, + { + "epoch": 4.95104, + "grad_norm": 0.7287285327911377, + "learning_rate": 4.228091236494598e-05, + "loss": 0.6203, + "step": 3868 + }, + { + "epoch": 4.95232, + "grad_norm": 0.6717093586921692, + "learning_rate": 4.227891156462585e-05, + "loss": 0.642, + "step": 3869 + }, + { + "epoch": 4.9536, + "grad_norm": 0.7274335622787476, + "learning_rate": 4.227691076430572e-05, + "loss": 0.7194, + "step": 3870 + }, + { + "epoch": 4.95488, + "grad_norm": 0.7339951395988464, + "learning_rate": 4.22749099639856e-05, + "loss": 0.6511, + "step": 3871 + }, + { + "epoch": 4.95616, + "grad_norm": 0.7088412046432495, + "learning_rate": 4.227290916366547e-05, + "loss": 0.6424, + "step": 3872 + }, + { + "epoch": 4.95744, + "grad_norm": 0.6983089447021484, + "learning_rate": 4.227090836334534e-05, + "loss": 0.6186, + "step": 3873 + }, + { + "epoch": 4.95872, + "grad_norm": 0.727528989315033, + "learning_rate": 4.226890756302521e-05, + "loss": 0.616, + "step": 3874 + }, + { + "epoch": 4.96, + "grad_norm": 0.732072651386261, + "learning_rate": 4.226690676270508e-05, + "loss": 0.6259, + "step": 3875 + }, + { + "epoch": 4.96128, + "grad_norm": 0.6999890208244324, + "learning_rate": 4.226490596238495e-05, + "loss": 
0.677, + "step": 3876 + }, + { + "epoch": 4.96256, + "grad_norm": 0.6825425028800964, + "learning_rate": 4.2262905162064825e-05, + "loss": 0.6053, + "step": 3877 + }, + { + "epoch": 4.96384, + "grad_norm": 0.7285329699516296, + "learning_rate": 4.2260904361744703e-05, + "loss": 0.696, + "step": 3878 + }, + { + "epoch": 4.96512, + "grad_norm": 0.6932041645050049, + "learning_rate": 4.2258903561424575e-05, + "loss": 0.6412, + "step": 3879 + }, + { + "epoch": 4.9664, + "grad_norm": 0.7156229019165039, + "learning_rate": 4.225690276110445e-05, + "loss": 0.648, + "step": 3880 + }, + { + "epoch": 4.96768, + "grad_norm": 0.7821218371391296, + "learning_rate": 4.225490196078431e-05, + "loss": 0.72, + "step": 3881 + }, + { + "epoch": 4.96896, + "grad_norm": 0.6983861327171326, + "learning_rate": 4.2252901160464184e-05, + "loss": 0.6516, + "step": 3882 + }, + { + "epoch": 4.97024, + "grad_norm": 0.7392165660858154, + "learning_rate": 4.2250900360144056e-05, + "loss": 0.6855, + "step": 3883 + }, + { + "epoch": 4.97152, + "grad_norm": 0.7489770650863647, + "learning_rate": 4.224889955982393e-05, + "loss": 0.658, + "step": 3884 + }, + { + "epoch": 4.9728, + "grad_norm": 0.6763161420822144, + "learning_rate": 4.2246898759503806e-05, + "loss": 0.6016, + "step": 3885 + }, + { + "epoch": 4.97408, + "grad_norm": 0.7260777354240417, + "learning_rate": 4.224489795918368e-05, + "loss": 0.6908, + "step": 3886 + }, + { + "epoch": 4.97536, + "grad_norm": 0.7529964447021484, + "learning_rate": 4.224289715886355e-05, + "loss": 0.6736, + "step": 3887 + }, + { + "epoch": 4.97664, + "grad_norm": 0.6947912573814392, + "learning_rate": 4.224089635854342e-05, + "loss": 0.6518, + "step": 3888 + }, + { + "epoch": 4.97792, + "grad_norm": 0.6871297955513, + "learning_rate": 4.223889555822329e-05, + "loss": 0.6536, + "step": 3889 + }, + { + "epoch": 4.9792, + "grad_norm": 0.7125989198684692, + "learning_rate": 4.223689475790316e-05, + "loss": 0.7032, + "step": 3890 + }, + { + "epoch": 4.98048, + 
"grad_norm": 0.7379283308982849, + "learning_rate": 4.223489395758303e-05, + "loss": 0.6589, + "step": 3891 + }, + { + "epoch": 4.9817599999999995, + "grad_norm": 0.7186259031295776, + "learning_rate": 4.223289315726291e-05, + "loss": 0.6572, + "step": 3892 + }, + { + "epoch": 4.98304, + "grad_norm": 0.7020161151885986, + "learning_rate": 4.223089235694278e-05, + "loss": 0.6087, + "step": 3893 + }, + { + "epoch": 4.98432, + "grad_norm": 0.7505249977111816, + "learning_rate": 4.222889155662265e-05, + "loss": 0.7485, + "step": 3894 + }, + { + "epoch": 4.9856, + "grad_norm": 0.7452868223190308, + "learning_rate": 4.2226890756302525e-05, + "loss": 0.6414, + "step": 3895 + }, + { + "epoch": 4.98688, + "grad_norm": 0.7297542691230774, + "learning_rate": 4.22248899559824e-05, + "loss": 0.6662, + "step": 3896 + }, + { + "epoch": 4.98816, + "grad_norm": 0.7233591079711914, + "learning_rate": 4.222288915566226e-05, + "loss": 0.6443, + "step": 3897 + }, + { + "epoch": 4.98944, + "grad_norm": 0.6811563968658447, + "learning_rate": 4.2220888355342134e-05, + "loss": 0.6349, + "step": 3898 + }, + { + "epoch": 4.99072, + "grad_norm": 0.7110039591789246, + "learning_rate": 4.221888755502201e-05, + "loss": 0.5868, + "step": 3899 + }, + { + "epoch": 4.992, + "grad_norm": 0.685947060585022, + "learning_rate": 4.2216886754701884e-05, + "loss": 0.6747, + "step": 3900 + }, + { + "epoch": 4.99328, + "grad_norm": 0.6793825626373291, + "learning_rate": 4.2214885954381756e-05, + "loss": 0.6335, + "step": 3901 + }, + { + "epoch": 4.99456, + "grad_norm": 0.7450732588768005, + "learning_rate": 4.221288515406163e-05, + "loss": 0.6845, + "step": 3902 + }, + { + "epoch": 4.99584, + "grad_norm": 0.7151440978050232, + "learning_rate": 4.22108843537415e-05, + "loss": 0.6479, + "step": 3903 + }, + { + "epoch": 4.99712, + "grad_norm": 0.7348343133926392, + "learning_rate": 4.220888355342137e-05, + "loss": 0.6799, + "step": 3904 + }, + { + "epoch": 4.9984, + "grad_norm": 0.6973733305931091, + 
"learning_rate": 4.220688275310124e-05, + "loss": 0.6555, + "step": 3905 + }, + { + "epoch": 4.99968, + "grad_norm": 0.7058578133583069, + "learning_rate": 4.2204881952781115e-05, + "loss": 0.6924, + "step": 3906 + }, + { + "epoch": 5.00096, + "grad_norm": 1.5164827108383179, + "learning_rate": 4.220288115246099e-05, + "loss": 1.0917, + "step": 3907 + }, + { + "epoch": 5.00224, + "grad_norm": 0.7053406834602356, + "learning_rate": 4.220088035214086e-05, + "loss": 0.6068, + "step": 3908 + }, + { + "epoch": 5.00352, + "grad_norm": 0.6764478087425232, + "learning_rate": 4.219887955182073e-05, + "loss": 0.6047, + "step": 3909 + }, + { + "epoch": 5.0048, + "grad_norm": 0.6720277070999146, + "learning_rate": 4.21968787515006e-05, + "loss": 0.6328, + "step": 3910 + }, + { + "epoch": 5.00608, + "grad_norm": 0.715262234210968, + "learning_rate": 4.2194877951180475e-05, + "loss": 0.6962, + "step": 3911 + }, + { + "epoch": 5.00736, + "grad_norm": 0.6942645907402039, + "learning_rate": 4.219287715086035e-05, + "loss": 0.6243, + "step": 3912 + }, + { + "epoch": 5.00864, + "grad_norm": 0.7017389535903931, + "learning_rate": 4.219087635054022e-05, + "loss": 0.6112, + "step": 3913 + }, + { + "epoch": 5.00992, + "grad_norm": 0.7179716229438782, + "learning_rate": 4.218887555022009e-05, + "loss": 0.6935, + "step": 3914 + }, + { + "epoch": 5.0112, + "grad_norm": 0.7195164561271667, + "learning_rate": 4.218687474989996e-05, + "loss": 0.6682, + "step": 3915 + }, + { + "epoch": 5.01248, + "grad_norm": 0.7197250723838806, + "learning_rate": 4.2184873949579834e-05, + "loss": 0.6436, + "step": 3916 + }, + { + "epoch": 5.01376, + "grad_norm": 0.7025637626647949, + "learning_rate": 4.2182873149259706e-05, + "loss": 0.6336, + "step": 3917 + }, + { + "epoch": 5.01504, + "grad_norm": 0.7276149392127991, + "learning_rate": 4.218087234893958e-05, + "loss": 0.6207, + "step": 3918 + }, + { + "epoch": 5.01632, + "grad_norm": 0.7081112265586853, + "learning_rate": 4.217887154861945e-05, + "loss": 
0.6396, + "step": 3919 + }, + { + "epoch": 5.0176, + "grad_norm": 0.7482070922851562, + "learning_rate": 4.217687074829932e-05, + "loss": 0.632, + "step": 3920 + }, + { + "epoch": 5.01888, + "grad_norm": 0.7229052186012268, + "learning_rate": 4.217486994797919e-05, + "loss": 0.5806, + "step": 3921 + }, + { + "epoch": 5.02016, + "grad_norm": 0.678459644317627, + "learning_rate": 4.2172869147659065e-05, + "loss": 0.5644, + "step": 3922 + }, + { + "epoch": 5.02144, + "grad_norm": 0.7244837880134583, + "learning_rate": 4.217086834733894e-05, + "loss": 0.6354, + "step": 3923 + }, + { + "epoch": 5.02272, + "grad_norm": 0.752399206161499, + "learning_rate": 4.216886754701881e-05, + "loss": 0.6326, + "step": 3924 + }, + { + "epoch": 5.024, + "grad_norm": 0.7514381408691406, + "learning_rate": 4.216686674669868e-05, + "loss": 0.6534, + "step": 3925 + }, + { + "epoch": 5.02528, + "grad_norm": 0.720856249332428, + "learning_rate": 4.216486594637855e-05, + "loss": 0.6045, + "step": 3926 + }, + { + "epoch": 5.02656, + "grad_norm": 0.7397941946983337, + "learning_rate": 4.2162865146058424e-05, + "loss": 0.6208, + "step": 3927 + }, + { + "epoch": 5.02784, + "grad_norm": 0.7684844732284546, + "learning_rate": 4.2160864345738296e-05, + "loss": 0.6398, + "step": 3928 + }, + { + "epoch": 5.02912, + "grad_norm": 0.7364361882209778, + "learning_rate": 4.215886354541817e-05, + "loss": 0.6167, + "step": 3929 + }, + { + "epoch": 5.0304, + "grad_norm": 0.6819130182266235, + "learning_rate": 4.215686274509804e-05, + "loss": 0.5566, + "step": 3930 + }, + { + "epoch": 5.03168, + "grad_norm": 0.7268734574317932, + "learning_rate": 4.215486194477791e-05, + "loss": 0.6476, + "step": 3931 + }, + { + "epoch": 5.03296, + "grad_norm": 0.7218887805938721, + "learning_rate": 4.2152861144457784e-05, + "loss": 0.6149, + "step": 3932 + }, + { + "epoch": 5.03424, + "grad_norm": 0.7900970578193665, + "learning_rate": 4.2150860344137656e-05, + "loss": 0.6828, + "step": 3933 + }, + { + "epoch": 5.03552, + 
"grad_norm": 0.6771689653396606, + "learning_rate": 4.2148859543817534e-05, + "loss": 0.5821, + "step": 3934 + }, + { + "epoch": 5.0368, + "grad_norm": 0.7154511213302612, + "learning_rate": 4.21468587434974e-05, + "loss": 0.5964, + "step": 3935 + }, + { + "epoch": 5.03808, + "grad_norm": 0.7221072912216187, + "learning_rate": 4.214485794317727e-05, + "loss": 0.6088, + "step": 3936 + }, + { + "epoch": 5.03936, + "grad_norm": 0.7826116681098938, + "learning_rate": 4.214285714285714e-05, + "loss": 0.6514, + "step": 3937 + }, + { + "epoch": 5.04064, + "grad_norm": 0.7515047192573547, + "learning_rate": 4.2140856342537015e-05, + "loss": 0.6776, + "step": 3938 + }, + { + "epoch": 5.04192, + "grad_norm": 0.7475928664207458, + "learning_rate": 4.213885554221689e-05, + "loss": 0.6478, + "step": 3939 + }, + { + "epoch": 5.0432, + "grad_norm": 0.7153226733207703, + "learning_rate": 4.213685474189676e-05, + "loss": 0.6474, + "step": 3940 + }, + { + "epoch": 5.04448, + "grad_norm": 0.6863680481910706, + "learning_rate": 4.213485394157664e-05, + "loss": 0.6281, + "step": 3941 + }, + { + "epoch": 5.04576, + "grad_norm": 0.6896254420280457, + "learning_rate": 4.213285314125651e-05, + "loss": 0.6058, + "step": 3942 + }, + { + "epoch": 5.04704, + "grad_norm": 0.7509266138076782, + "learning_rate": 4.2130852340936374e-05, + "loss": 0.6445, + "step": 3943 + }, + { + "epoch": 5.04832, + "grad_norm": 0.7177383303642273, + "learning_rate": 4.2128851540616246e-05, + "loss": 0.6522, + "step": 3944 + }, + { + "epoch": 5.0496, + "grad_norm": 0.7128506302833557, + "learning_rate": 4.212685074029612e-05, + "loss": 0.6011, + "step": 3945 + }, + { + "epoch": 5.05088, + "grad_norm": 0.6913168430328369, + "learning_rate": 4.212484993997599e-05, + "loss": 0.6598, + "step": 3946 + }, + { + "epoch": 5.05216, + "grad_norm": 0.740969181060791, + "learning_rate": 4.212284913965586e-05, + "loss": 0.6409, + "step": 3947 + }, + { + "epoch": 5.05344, + "grad_norm": 0.7747688293457031, + "learning_rate": 
4.212084833933574e-05, + "loss": 0.6727, + "step": 3948 + }, + { + "epoch": 5.05472, + "grad_norm": 0.7710026502609253, + "learning_rate": 4.211884753901561e-05, + "loss": 0.6585, + "step": 3949 + }, + { + "epoch": 5.056, + "grad_norm": 0.713817834854126, + "learning_rate": 4.2116846738695484e-05, + "loss": 0.6174, + "step": 3950 + }, + { + "epoch": 5.05728, + "grad_norm": 0.7268139123916626, + "learning_rate": 4.211484593837535e-05, + "loss": 0.6679, + "step": 3951 + }, + { + "epoch": 5.05856, + "grad_norm": 0.7151246666908264, + "learning_rate": 4.211284513805522e-05, + "loss": 0.5802, + "step": 3952 + }, + { + "epoch": 5.05984, + "grad_norm": 0.6851016283035278, + "learning_rate": 4.211084433773509e-05, + "loss": 0.6258, + "step": 3953 + }, + { + "epoch": 5.06112, + "grad_norm": 0.7177205681800842, + "learning_rate": 4.2108843537414965e-05, + "loss": 0.6276, + "step": 3954 + }, + { + "epoch": 5.0624, + "grad_norm": 0.7391200661659241, + "learning_rate": 4.210684273709484e-05, + "loss": 0.6175, + "step": 3955 + }, + { + "epoch": 5.06368, + "grad_norm": 0.7397950887680054, + "learning_rate": 4.2104841936774715e-05, + "loss": 0.6149, + "step": 3956 + }, + { + "epoch": 5.06496, + "grad_norm": 0.7378024458885193, + "learning_rate": 4.210284113645459e-05, + "loss": 0.6524, + "step": 3957 + }, + { + "epoch": 5.06624, + "grad_norm": 0.7197895050048828, + "learning_rate": 4.210084033613446e-05, + "loss": 0.6747, + "step": 3958 + }, + { + "epoch": 5.06752, + "grad_norm": 0.7385547161102295, + "learning_rate": 4.2098839535814324e-05, + "loss": 0.6381, + "step": 3959 + }, + { + "epoch": 5.0688, + "grad_norm": 0.7747622728347778, + "learning_rate": 4.2096838735494196e-05, + "loss": 0.6198, + "step": 3960 + }, + { + "epoch": 5.07008, + "grad_norm": 0.7808424830436707, + "learning_rate": 4.209483793517407e-05, + "loss": 0.6496, + "step": 3961 + }, + { + "epoch": 5.07136, + "grad_norm": 0.7174487709999084, + "learning_rate": 4.2092837134853946e-05, + "loss": 0.5677, + "step": 
3962 + }, + { + "epoch": 5.07264, + "grad_norm": 0.7417564392089844, + "learning_rate": 4.209083633453382e-05, + "loss": 0.6681, + "step": 3963 + }, + { + "epoch": 5.07392, + "grad_norm": 0.654253363609314, + "learning_rate": 4.208883553421369e-05, + "loss": 0.5682, + "step": 3964 + }, + { + "epoch": 5.0752, + "grad_norm": 0.7092409133911133, + "learning_rate": 4.208683473389356e-05, + "loss": 0.6277, + "step": 3965 + }, + { + "epoch": 5.07648, + "grad_norm": 0.7257412672042847, + "learning_rate": 4.2084833933573434e-05, + "loss": 0.6708, + "step": 3966 + }, + { + "epoch": 5.07776, + "grad_norm": 0.7401608228683472, + "learning_rate": 4.20828331332533e-05, + "loss": 0.6563, + "step": 3967 + }, + { + "epoch": 5.07904, + "grad_norm": 0.7241030335426331, + "learning_rate": 4.208083233293317e-05, + "loss": 0.6504, + "step": 3968 + }, + { + "epoch": 5.08032, + "grad_norm": 0.7237942218780518, + "learning_rate": 4.207883153261305e-05, + "loss": 0.6548, + "step": 3969 + }, + { + "epoch": 5.0816, + "grad_norm": 0.7459864616394043, + "learning_rate": 4.207683073229292e-05, + "loss": 0.6404, + "step": 3970 + }, + { + "epoch": 5.08288, + "grad_norm": 0.7194232940673828, + "learning_rate": 4.207482993197279e-05, + "loss": 0.6405, + "step": 3971 + }, + { + "epoch": 5.08416, + "grad_norm": 0.7170876860618591, + "learning_rate": 4.2072829131652665e-05, + "loss": 0.6667, + "step": 3972 + }, + { + "epoch": 5.08544, + "grad_norm": 0.7066750526428223, + "learning_rate": 4.207082833133254e-05, + "loss": 0.6285, + "step": 3973 + }, + { + "epoch": 5.08672, + "grad_norm": 0.7601994276046753, + "learning_rate": 4.206882753101241e-05, + "loss": 0.7017, + "step": 3974 + }, + { + "epoch": 5.088, + "grad_norm": 0.7173680663108826, + "learning_rate": 4.2066826730692274e-05, + "loss": 0.6869, + "step": 3975 + }, + { + "epoch": 5.08928, + "grad_norm": 0.7139931321144104, + "learning_rate": 4.2064825930372145e-05, + "loss": 0.6175, + "step": 3976 + }, + { + "epoch": 5.09056, + "grad_norm": 
0.7550209760665894, + "learning_rate": 4.2062825130052024e-05, + "loss": 0.6521, + "step": 3977 + }, + { + "epoch": 5.09184, + "grad_norm": 0.7335852384567261, + "learning_rate": 4.2060824329731896e-05, + "loss": 0.6432, + "step": 3978 + }, + { + "epoch": 5.09312, + "grad_norm": 0.7130416035652161, + "learning_rate": 4.205882352941177e-05, + "loss": 0.616, + "step": 3979 + }, + { + "epoch": 5.0944, + "grad_norm": 0.7009579539299011, + "learning_rate": 4.205682272909164e-05, + "loss": 0.6022, + "step": 3980 + }, + { + "epoch": 5.09568, + "grad_norm": 0.7348793148994446, + "learning_rate": 4.205482192877151e-05, + "loss": 0.6118, + "step": 3981 + }, + { + "epoch": 5.09696, + "grad_norm": 0.7403352856636047, + "learning_rate": 4.205282112845138e-05, + "loss": 0.6397, + "step": 3982 + }, + { + "epoch": 5.09824, + "grad_norm": 0.7160905003547668, + "learning_rate": 4.205082032813125e-05, + "loss": 0.6391, + "step": 3983 + }, + { + "epoch": 5.09952, + "grad_norm": 0.7039486765861511, + "learning_rate": 4.204881952781113e-05, + "loss": 0.6062, + "step": 3984 + }, + { + "epoch": 5.1008, + "grad_norm": 0.752713680267334, + "learning_rate": 4.2046818727491e-05, + "loss": 0.6827, + "step": 3985 + }, + { + "epoch": 5.10208, + "grad_norm": 0.7083015441894531, + "learning_rate": 4.204481792717087e-05, + "loss": 0.6033, + "step": 3986 + }, + { + "epoch": 5.10336, + "grad_norm": 0.7464533448219299, + "learning_rate": 4.204281712685074e-05, + "loss": 0.6603, + "step": 3987 + }, + { + "epoch": 5.10464, + "grad_norm": 0.713887095451355, + "learning_rate": 4.2040816326530615e-05, + "loss": 0.6253, + "step": 3988 + }, + { + "epoch": 5.10592, + "grad_norm": 0.7454844117164612, + "learning_rate": 4.2038815526210486e-05, + "loss": 0.6518, + "step": 3989 + }, + { + "epoch": 5.1072, + "grad_norm": 0.770743191242218, + "learning_rate": 4.203681472589036e-05, + "loss": 0.6907, + "step": 3990 + }, + { + "epoch": 5.10848, + "grad_norm": 0.7465303540229797, + "learning_rate": 
4.203481392557023e-05, + "loss": 0.7005, + "step": 3991 + }, + { + "epoch": 5.10976, + "grad_norm": 0.745933473110199, + "learning_rate": 4.20328131252501e-05, + "loss": 0.6115, + "step": 3992 + }, + { + "epoch": 5.11104, + "grad_norm": 0.6949446797370911, + "learning_rate": 4.2030812324929974e-05, + "loss": 0.5874, + "step": 3993 + }, + { + "epoch": 5.11232, + "grad_norm": 0.7319580912590027, + "learning_rate": 4.2028811524609846e-05, + "loss": 0.6294, + "step": 3994 + }, + { + "epoch": 5.1136, + "grad_norm": 0.7834770679473877, + "learning_rate": 4.202681072428972e-05, + "loss": 0.7396, + "step": 3995 + }, + { + "epoch": 5.11488, + "grad_norm": 0.7234769463539124, + "learning_rate": 4.202480992396959e-05, + "loss": 0.6397, + "step": 3996 + }, + { + "epoch": 5.11616, + "grad_norm": 0.7542564868927002, + "learning_rate": 4.202280912364946e-05, + "loss": 0.6028, + "step": 3997 + }, + { + "epoch": 5.11744, + "grad_norm": 0.7732244729995728, + "learning_rate": 4.202080832332933e-05, + "loss": 0.6938, + "step": 3998 + }, + { + "epoch": 5.11872, + "grad_norm": 0.6997548937797546, + "learning_rate": 4.2018807523009205e-05, + "loss": 0.6206, + "step": 3999 + }, + { + "epoch": 5.12, + "grad_norm": 0.7281977534294128, + "learning_rate": 4.201680672268908e-05, + "loss": 0.6279, + "step": 4000 + }, + { + "epoch": 5.12128, + "grad_norm": 0.7627147436141968, + "learning_rate": 4.201480592236895e-05, + "loss": 0.6297, + "step": 4001 + }, + { + "epoch": 5.12256, + "grad_norm": 0.7351235151290894, + "learning_rate": 4.201280512204882e-05, + "loss": 0.6132, + "step": 4002 + }, + { + "epoch": 5.12384, + "grad_norm": 0.7031277418136597, + "learning_rate": 4.201080432172869e-05, + "loss": 0.6561, + "step": 4003 + }, + { + "epoch": 5.12512, + "grad_norm": 0.7446048259735107, + "learning_rate": 4.2008803521408564e-05, + "loss": 0.6526, + "step": 4004 + }, + { + "epoch": 5.1264, + "grad_norm": 0.7449467778205872, + "learning_rate": 4.200680272108844e-05, + "loss": 0.6027, + "step": 4005 
+ }, + { + "epoch": 5.12768, + "grad_norm": 0.8109471201896667, + "learning_rate": 4.200480192076831e-05, + "loss": 0.6602, + "step": 4006 + }, + { + "epoch": 5.12896, + "grad_norm": 0.7701667547225952, + "learning_rate": 4.200280112044818e-05, + "loss": 0.6192, + "step": 4007 + }, + { + "epoch": 5.13024, + "grad_norm": 0.7170472741127014, + "learning_rate": 4.200080032012805e-05, + "loss": 0.5924, + "step": 4008 + }, + { + "epoch": 5.13152, + "grad_norm": 0.7401517033576965, + "learning_rate": 4.1998799519807924e-05, + "loss": 0.6098, + "step": 4009 + }, + { + "epoch": 5.1328, + "grad_norm": 0.7323094606399536, + "learning_rate": 4.1996798719487795e-05, + "loss": 0.6426, + "step": 4010 + }, + { + "epoch": 5.13408, + "grad_norm": 0.73775315284729, + "learning_rate": 4.199479791916767e-05, + "loss": 0.6091, + "step": 4011 + }, + { + "epoch": 5.13536, + "grad_norm": 0.7261705994606018, + "learning_rate": 4.1992797118847546e-05, + "loss": 0.6217, + "step": 4012 + }, + { + "epoch": 5.13664, + "grad_norm": 0.7077757716178894, + "learning_rate": 4.199079631852742e-05, + "loss": 0.6288, + "step": 4013 + }, + { + "epoch": 5.13792, + "grad_norm": 0.7402573823928833, + "learning_rate": 4.198879551820728e-05, + "loss": 0.6528, + "step": 4014 + }, + { + "epoch": 5.1392, + "grad_norm": 0.7423076629638672, + "learning_rate": 4.1986794717887155e-05, + "loss": 0.6715, + "step": 4015 + }, + { + "epoch": 5.14048, + "grad_norm": 0.76578289270401, + "learning_rate": 4.1984793917567026e-05, + "loss": 0.6479, + "step": 4016 + }, + { + "epoch": 5.14176, + "grad_norm": 0.723196268081665, + "learning_rate": 4.19827931172469e-05, + "loss": 0.6415, + "step": 4017 + }, + { + "epoch": 5.14304, + "grad_norm": 0.7332058548927307, + "learning_rate": 4.198079231692677e-05, + "loss": 0.6373, + "step": 4018 + }, + { + "epoch": 5.1443200000000004, + "grad_norm": 0.7015236616134644, + "learning_rate": 4.197879151660665e-05, + "loss": 0.5996, + "step": 4019 + }, + { + "epoch": 5.1456, + "grad_norm": 
0.743124783039093, + "learning_rate": 4.197679071628652e-05, + "loss": 0.5971, + "step": 4020 + }, + { + "epoch": 5.14688, + "grad_norm": 0.7382779717445374, + "learning_rate": 4.197478991596639e-05, + "loss": 0.6285, + "step": 4021 + }, + { + "epoch": 5.14816, + "grad_norm": 0.7928086519241333, + "learning_rate": 4.197278911564626e-05, + "loss": 0.6687, + "step": 4022 + }, + { + "epoch": 5.14944, + "grad_norm": 0.7693018913269043, + "learning_rate": 4.197078831532613e-05, + "loss": 0.6553, + "step": 4023 + }, + { + "epoch": 5.15072, + "grad_norm": 0.7531712055206299, + "learning_rate": 4.1968787515006e-05, + "loss": 0.642, + "step": 4024 + }, + { + "epoch": 5.152, + "grad_norm": 0.7181296944618225, + "learning_rate": 4.196678671468587e-05, + "loss": 0.5862, + "step": 4025 + }, + { + "epoch": 5.15328, + "grad_norm": 0.724727988243103, + "learning_rate": 4.196478591436575e-05, + "loss": 0.6467, + "step": 4026 + }, + { + "epoch": 5.15456, + "grad_norm": 0.7461944222450256, + "learning_rate": 4.1962785114045624e-05, + "loss": 0.6054, + "step": 4027 + }, + { + "epoch": 5.15584, + "grad_norm": 0.7383855581283569, + "learning_rate": 4.1960784313725496e-05, + "loss": 0.621, + "step": 4028 + }, + { + "epoch": 5.15712, + "grad_norm": 0.6796069145202637, + "learning_rate": 4.195878351340537e-05, + "loss": 0.5679, + "step": 4029 + }, + { + "epoch": 5.1584, + "grad_norm": 0.7139230370521545, + "learning_rate": 4.195678271308523e-05, + "loss": 0.5988, + "step": 4030 + }, + { + "epoch": 5.15968, + "grad_norm": 0.7442138195037842, + "learning_rate": 4.1954781912765104e-05, + "loss": 0.5973, + "step": 4031 + }, + { + "epoch": 5.16096, + "grad_norm": 0.7515295147895813, + "learning_rate": 4.1952781112444976e-05, + "loss": 0.5769, + "step": 4032 + }, + { + "epoch": 5.16224, + "grad_norm": 0.7667818665504456, + "learning_rate": 4.1950780312124855e-05, + "loss": 0.6609, + "step": 4033 + }, + { + "epoch": 5.16352, + "grad_norm": 0.7336476445198059, + "learning_rate": 
4.194877951180473e-05, + "loss": 0.6163, + "step": 4034 + }, + { + "epoch": 5.1648, + "grad_norm": 0.6899265646934509, + "learning_rate": 4.19467787114846e-05, + "loss": 0.5985, + "step": 4035 + }, + { + "epoch": 5.16608, + "grad_norm": 0.6779667735099792, + "learning_rate": 4.194477791116447e-05, + "loss": 0.57, + "step": 4036 + }, + { + "epoch": 5.16736, + "grad_norm": 0.6793948411941528, + "learning_rate": 4.194277711084434e-05, + "loss": 0.5694, + "step": 4037 + }, + { + "epoch": 5.16864, + "grad_norm": 0.763278067111969, + "learning_rate": 4.194077631052421e-05, + "loss": 0.6917, + "step": 4038 + }, + { + "epoch": 5.16992, + "grad_norm": 0.699497640132904, + "learning_rate": 4.193877551020408e-05, + "loss": 0.5704, + "step": 4039 + }, + { + "epoch": 5.1712, + "grad_norm": 0.7154186964035034, + "learning_rate": 4.193677470988396e-05, + "loss": 0.6079, + "step": 4040 + }, + { + "epoch": 5.17248, + "grad_norm": 0.7278900146484375, + "learning_rate": 4.193477390956383e-05, + "loss": 0.6887, + "step": 4041 + }, + { + "epoch": 5.17376, + "grad_norm": 0.7596189379692078, + "learning_rate": 4.19327731092437e-05, + "loss": 0.7018, + "step": 4042 + }, + { + "epoch": 5.17504, + "grad_norm": 0.7543389201164246, + "learning_rate": 4.1930772308923573e-05, + "loss": 0.6161, + "step": 4043 + }, + { + "epoch": 5.17632, + "grad_norm": 0.7368336915969849, + "learning_rate": 4.1928771508603445e-05, + "loss": 0.642, + "step": 4044 + }, + { + "epoch": 5.1776, + "grad_norm": 0.7439759969711304, + "learning_rate": 4.192677070828332e-05, + "loss": 0.6999, + "step": 4045 + }, + { + "epoch": 5.17888, + "grad_norm": 0.695544958114624, + "learning_rate": 4.192476990796318e-05, + "loss": 0.6256, + "step": 4046 + }, + { + "epoch": 5.18016, + "grad_norm": 0.7475451231002808, + "learning_rate": 4.192276910764306e-05, + "loss": 0.648, + "step": 4047 + }, + { + "epoch": 5.18144, + "grad_norm": 0.740695059299469, + "learning_rate": 4.192076830732293e-05, + "loss": 0.598, + "step": 4048 + }, + { 
+ "epoch": 5.18272, + "grad_norm": 0.7205529808998108, + "learning_rate": 4.1918767507002805e-05, + "loss": 0.6135, + "step": 4049 + }, + { + "epoch": 5.184, + "grad_norm": 0.6971973776817322, + "learning_rate": 4.1916766706682676e-05, + "loss": 0.6237, + "step": 4050 + }, + { + "epoch": 5.18528, + "grad_norm": 0.7059743404388428, + "learning_rate": 4.191476590636255e-05, + "loss": 0.6197, + "step": 4051 + }, + { + "epoch": 5.18656, + "grad_norm": 0.7417049407958984, + "learning_rate": 4.191276510604242e-05, + "loss": 0.6321, + "step": 4052 + }, + { + "epoch": 5.18784, + "grad_norm": 0.726333498954773, + "learning_rate": 4.191076430572229e-05, + "loss": 0.5983, + "step": 4053 + }, + { + "epoch": 5.18912, + "grad_norm": 0.7621744275093079, + "learning_rate": 4.1908763505402164e-05, + "loss": 0.675, + "step": 4054 + }, + { + "epoch": 5.1904, + "grad_norm": 0.7966973185539246, + "learning_rate": 4.1906762705082036e-05, + "loss": 0.6198, + "step": 4055 + }, + { + "epoch": 5.19168, + "grad_norm": 0.7543626427650452, + "learning_rate": 4.190476190476191e-05, + "loss": 0.6282, + "step": 4056 + }, + { + "epoch": 5.19296, + "grad_norm": 0.7392343282699585, + "learning_rate": 4.190276110444178e-05, + "loss": 0.6496, + "step": 4057 + }, + { + "epoch": 5.19424, + "grad_norm": 0.7553439140319824, + "learning_rate": 4.190076030412165e-05, + "loss": 0.6604, + "step": 4058 + }, + { + "epoch": 5.19552, + "grad_norm": 0.7107070088386536, + "learning_rate": 4.189875950380152e-05, + "loss": 0.5846, + "step": 4059 + }, + { + "epoch": 5.1968, + "grad_norm": 0.7690718173980713, + "learning_rate": 4.1896758703481395e-05, + "loss": 0.6315, + "step": 4060 + }, + { + "epoch": 5.19808, + "grad_norm": 0.7583000063896179, + "learning_rate": 4.189475790316127e-05, + "loss": 0.6565, + "step": 4061 + }, + { + "epoch": 5.19936, + "grad_norm": 0.735431432723999, + "learning_rate": 4.189275710284114e-05, + "loss": 0.6382, + "step": 4062 + }, + { + "epoch": 5.20064, + "grad_norm": 0.749560534954071, + 
"learning_rate": 4.189075630252101e-05, + "loss": 0.6199, + "step": 4063 + }, + { + "epoch": 5.20192, + "grad_norm": 0.7637155055999756, + "learning_rate": 4.188875550220088e-05, + "loss": 0.6392, + "step": 4064 + }, + { + "epoch": 5.2032, + "grad_norm": 0.7183476686477661, + "learning_rate": 4.1886754701880754e-05, + "loss": 0.6105, + "step": 4065 + }, + { + "epoch": 5.20448, + "grad_norm": 0.7134882807731628, + "learning_rate": 4.1884753901560626e-05, + "loss": 0.6176, + "step": 4066 + }, + { + "epoch": 5.20576, + "grad_norm": 0.6925643682479858, + "learning_rate": 4.18827531012405e-05, + "loss": 0.6148, + "step": 4067 + }, + { + "epoch": 5.20704, + "grad_norm": 0.6939037442207336, + "learning_rate": 4.188075230092037e-05, + "loss": 0.5917, + "step": 4068 + }, + { + "epoch": 5.20832, + "grad_norm": 0.7910222411155701, + "learning_rate": 4.187875150060024e-05, + "loss": 0.7064, + "step": 4069 + }, + { + "epoch": 5.2096, + "grad_norm": 0.7466183304786682, + "learning_rate": 4.1876750700280114e-05, + "loss": 0.6069, + "step": 4070 + }, + { + "epoch": 5.21088, + "grad_norm": 0.7494035363197327, + "learning_rate": 4.1874749899959985e-05, + "loss": 0.6614, + "step": 4071 + }, + { + "epoch": 5.21216, + "grad_norm": 0.7295969724655151, + "learning_rate": 4.187274909963986e-05, + "loss": 0.6364, + "step": 4072 + }, + { + "epoch": 5.21344, + "grad_norm": 0.7598521709442139, + "learning_rate": 4.187074829931973e-05, + "loss": 0.6466, + "step": 4073 + }, + { + "epoch": 5.21472, + "grad_norm": 0.7255356907844543, + "learning_rate": 4.18687474989996e-05, + "loss": 0.6205, + "step": 4074 + }, + { + "epoch": 5.216, + "grad_norm": 0.7490739822387695, + "learning_rate": 4.186674669867948e-05, + "loss": 0.6438, + "step": 4075 + }, + { + "epoch": 5.21728, + "grad_norm": 0.7887585759162903, + "learning_rate": 4.1864745898359345e-05, + "loss": 0.5891, + "step": 4076 + }, + { + "epoch": 5.21856, + "grad_norm": 0.7423108220100403, + "learning_rate": 4.1862745098039217e-05, + "loss": 
0.6278, + "step": 4077 + }, + { + "epoch": 5.21984, + "grad_norm": 0.747302234172821, + "learning_rate": 4.186074429771909e-05, + "loss": 0.6105, + "step": 4078 + }, + { + "epoch": 5.22112, + "grad_norm": 0.7417611479759216, + "learning_rate": 4.185874349739896e-05, + "loss": 0.6077, + "step": 4079 + }, + { + "epoch": 5.2224, + "grad_norm": 0.7022831439971924, + "learning_rate": 4.185674269707883e-05, + "loss": 0.5837, + "step": 4080 + }, + { + "epoch": 5.22368, + "grad_norm": 0.7441087365150452, + "learning_rate": 4.1854741896758704e-05, + "loss": 0.6113, + "step": 4081 + }, + { + "epoch": 5.22496, + "grad_norm": 0.7969503998756409, + "learning_rate": 4.185274109643858e-05, + "loss": 0.6077, + "step": 4082 + }, + { + "epoch": 5.22624, + "grad_norm": 0.7303327918052673, + "learning_rate": 4.1850740296118454e-05, + "loss": 0.5847, + "step": 4083 + }, + { + "epoch": 5.22752, + "grad_norm": 0.7135294675827026, + "learning_rate": 4.184873949579832e-05, + "loss": 0.5898, + "step": 4084 + }, + { + "epoch": 5.2288, + "grad_norm": 0.6808853149414062, + "learning_rate": 4.184673869547819e-05, + "loss": 0.6139, + "step": 4085 + }, + { + "epoch": 5.23008, + "grad_norm": 0.7298513054847717, + "learning_rate": 4.184473789515806e-05, + "loss": 0.6338, + "step": 4086 + }, + { + "epoch": 5.2313600000000005, + "grad_norm": 0.7220539450645447, + "learning_rate": 4.1842737094837935e-05, + "loss": 0.5868, + "step": 4087 + }, + { + "epoch": 5.23264, + "grad_norm": 0.7567940354347229, + "learning_rate": 4.184073629451781e-05, + "loss": 0.626, + "step": 4088 + }, + { + "epoch": 5.23392, + "grad_norm": 0.7327399253845215, + "learning_rate": 4.183873549419768e-05, + "loss": 0.6274, + "step": 4089 + }, + { + "epoch": 5.2352, + "grad_norm": 0.7534685134887695, + "learning_rate": 4.183673469387756e-05, + "loss": 0.6031, + "step": 4090 + }, + { + "epoch": 5.23648, + "grad_norm": 0.7302762269973755, + "learning_rate": 4.183473389355743e-05, + "loss": 0.5695, + "step": 4091 + }, + { + "epoch": 
5.23776, + "grad_norm": 0.737391471862793, + "learning_rate": 4.1832733093237294e-05, + "loss": 0.6894, + "step": 4092 + }, + { + "epoch": 5.23904, + "grad_norm": 0.7553057670593262, + "learning_rate": 4.1830732292917166e-05, + "loss": 0.6557, + "step": 4093 + }, + { + "epoch": 5.24032, + "grad_norm": 0.7426884770393372, + "learning_rate": 4.182873149259704e-05, + "loss": 0.6648, + "step": 4094 + }, + { + "epoch": 5.2416, + "grad_norm": 0.7494625449180603, + "learning_rate": 4.182673069227691e-05, + "loss": 0.6473, + "step": 4095 + }, + { + "epoch": 5.24288, + "grad_norm": 0.6882435083389282, + "learning_rate": 4.182472989195678e-05, + "loss": 0.5835, + "step": 4096 + }, + { + "epoch": 5.24416, + "grad_norm": 0.7787857055664062, + "learning_rate": 4.182272909163666e-05, + "loss": 0.6566, + "step": 4097 + }, + { + "epoch": 5.24544, + "grad_norm": 0.8027628660202026, + "learning_rate": 4.182072829131653e-05, + "loss": 0.6464, + "step": 4098 + }, + { + "epoch": 5.24672, + "grad_norm": 0.7405392527580261, + "learning_rate": 4.1818727490996404e-05, + "loss": 0.6006, + "step": 4099 + }, + { + "epoch": 5.248, + "grad_norm": 0.7443488836288452, + "learning_rate": 4.181672669067627e-05, + "loss": 0.646, + "step": 4100 + }, + { + "epoch": 5.24928, + "grad_norm": 0.8020368218421936, + "learning_rate": 4.181472589035614e-05, + "loss": 0.6962, + "step": 4101 + }, + { + "epoch": 5.25056, + "grad_norm": 0.7345948219299316, + "learning_rate": 4.181272509003601e-05, + "loss": 0.6171, + "step": 4102 + }, + { + "epoch": 5.25184, + "grad_norm": 0.7310644388198853, + "learning_rate": 4.1810724289715885e-05, + "loss": 0.6516, + "step": 4103 + }, + { + "epoch": 5.25312, + "grad_norm": 0.6996403336524963, + "learning_rate": 4.1808723489395763e-05, + "loss": 0.5719, + "step": 4104 + }, + { + "epoch": 5.2544, + "grad_norm": 0.683502197265625, + "learning_rate": 4.1806722689075635e-05, + "loss": 0.5978, + "step": 4105 + }, + { + "epoch": 5.25568, + "grad_norm": 0.7105028629302979, + 
"learning_rate": 4.180472188875551e-05, + "loss": 0.6489, + "step": 4106 + }, + { + "epoch": 5.25696, + "grad_norm": 0.7716310620307922, + "learning_rate": 4.180272108843538e-05, + "loss": 0.6468, + "step": 4107 + }, + { + "epoch": 5.25824, + "grad_norm": 0.7370600700378418, + "learning_rate": 4.1800720288115244e-05, + "loss": 0.5911, + "step": 4108 + }, + { + "epoch": 5.25952, + "grad_norm": 0.7595857381820679, + "learning_rate": 4.1798719487795116e-05, + "loss": 0.6627, + "step": 4109 + }, + { + "epoch": 5.2608, + "grad_norm": 0.7154218554496765, + "learning_rate": 4.179671868747499e-05, + "loss": 0.6439, + "step": 4110 + }, + { + "epoch": 5.26208, + "grad_norm": 0.7370497584342957, + "learning_rate": 4.1794717887154866e-05, + "loss": 0.6511, + "step": 4111 + }, + { + "epoch": 5.26336, + "grad_norm": 0.7522472143173218, + "learning_rate": 4.179271708683474e-05, + "loss": 0.6171, + "step": 4112 + }, + { + "epoch": 5.26464, + "grad_norm": 0.7183180451393127, + "learning_rate": 4.179071628651461e-05, + "loss": 0.6296, + "step": 4113 + }, + { + "epoch": 5.26592, + "grad_norm": 0.7467809319496155, + "learning_rate": 4.178871548619448e-05, + "loss": 0.6705, + "step": 4114 + }, + { + "epoch": 5.2672, + "grad_norm": 0.7425380349159241, + "learning_rate": 4.1786714685874354e-05, + "loss": 0.6152, + "step": 4115 + }, + { + "epoch": 5.26848, + "grad_norm": 0.729131281375885, + "learning_rate": 4.178471388555422e-05, + "loss": 0.6485, + "step": 4116 + }, + { + "epoch": 5.26976, + "grad_norm": 0.7531429529190063, + "learning_rate": 4.178271308523409e-05, + "loss": 0.702, + "step": 4117 + }, + { + "epoch": 5.27104, + "grad_norm": 0.6844903826713562, + "learning_rate": 4.178071228491397e-05, + "loss": 0.5557, + "step": 4118 + }, + { + "epoch": 5.27232, + "grad_norm": 0.6600164175033569, + "learning_rate": 4.177871148459384e-05, + "loss": 0.5779, + "step": 4119 + }, + { + "epoch": 5.2736, + "grad_norm": 0.7316297888755798, + "learning_rate": 4.177671068427371e-05, + "loss": 
0.6198, + "step": 4120 + }, + { + "epoch": 5.27488, + "grad_norm": 0.7616431713104248, + "learning_rate": 4.1774709883953585e-05, + "loss": 0.6581, + "step": 4121 + }, + { + "epoch": 5.27616, + "grad_norm": 0.7071192264556885, + "learning_rate": 4.177270908363346e-05, + "loss": 0.616, + "step": 4122 + }, + { + "epoch": 5.27744, + "grad_norm": 0.717705249786377, + "learning_rate": 4.177070828331333e-05, + "loss": 0.6081, + "step": 4123 + }, + { + "epoch": 5.27872, + "grad_norm": 0.7297554612159729, + "learning_rate": 4.1768707482993194e-05, + "loss": 0.6139, + "step": 4124 + }, + { + "epoch": 5.28, + "grad_norm": 0.7143238186836243, + "learning_rate": 4.176670668267307e-05, + "loss": 0.6375, + "step": 4125 + }, + { + "epoch": 5.28128, + "grad_norm": 0.701789915561676, + "learning_rate": 4.1764705882352944e-05, + "loss": 0.5887, + "step": 4126 + }, + { + "epoch": 5.28256, + "grad_norm": 0.7739312052726746, + "learning_rate": 4.1762705082032816e-05, + "loss": 0.6815, + "step": 4127 + }, + { + "epoch": 5.28384, + "grad_norm": 0.714019238948822, + "learning_rate": 4.176070428171269e-05, + "loss": 0.6148, + "step": 4128 + }, + { + "epoch": 5.28512, + "grad_norm": 0.7301841378211975, + "learning_rate": 4.175870348139256e-05, + "loss": 0.6268, + "step": 4129 + }, + { + "epoch": 5.2864, + "grad_norm": 0.7254041433334351, + "learning_rate": 4.175670268107243e-05, + "loss": 0.6933, + "step": 4130 + }, + { + "epoch": 5.28768, + "grad_norm": 0.7699323892593384, + "learning_rate": 4.1754701880752304e-05, + "loss": 0.7007, + "step": 4131 + }, + { + "epoch": 5.28896, + "grad_norm": 0.7059085965156555, + "learning_rate": 4.1752701080432175e-05, + "loss": 0.6194, + "step": 4132 + }, + { + "epoch": 5.29024, + "grad_norm": 0.7336937785148621, + "learning_rate": 4.175070028011205e-05, + "loss": 0.6656, + "step": 4133 + }, + { + "epoch": 5.29152, + "grad_norm": 0.7493408918380737, + "learning_rate": 4.174869947979192e-05, + "loss": 0.6766, + "step": 4134 + }, + { + "epoch": 5.2928, + 
"grad_norm": 0.7297886610031128, + "learning_rate": 4.174669867947179e-05, + "loss": 0.6352, + "step": 4135 + }, + { + "epoch": 5.29408, + "grad_norm": 0.7195264101028442, + "learning_rate": 4.174469787915166e-05, + "loss": 0.6102, + "step": 4136 + }, + { + "epoch": 5.29536, + "grad_norm": 0.7068727612495422, + "learning_rate": 4.1742697078831535e-05, + "loss": 0.556, + "step": 4137 + }, + { + "epoch": 5.29664, + "grad_norm": 0.7307595014572144, + "learning_rate": 4.1740696278511407e-05, + "loss": 0.6653, + "step": 4138 + }, + { + "epoch": 5.29792, + "grad_norm": 0.7298409938812256, + "learning_rate": 4.173869547819128e-05, + "loss": 0.6587, + "step": 4139 + }, + { + "epoch": 5.2992, + "grad_norm": 0.7297065258026123, + "learning_rate": 4.173669467787115e-05, + "loss": 0.63, + "step": 4140 + }, + { + "epoch": 5.30048, + "grad_norm": 0.7633817791938782, + "learning_rate": 4.173469387755102e-05, + "loss": 0.6555, + "step": 4141 + }, + { + "epoch": 5.30176, + "grad_norm": 0.7700405716896057, + "learning_rate": 4.1732693077230894e-05, + "loss": 0.6576, + "step": 4142 + }, + { + "epoch": 5.30304, + "grad_norm": 0.7559324502944946, + "learning_rate": 4.1730692276910766e-05, + "loss": 0.6633, + "step": 4143 + }, + { + "epoch": 5.30432, + "grad_norm": 0.782576322555542, + "learning_rate": 4.172869147659064e-05, + "loss": 0.639, + "step": 4144 + }, + { + "epoch": 5.3056, + "grad_norm": 0.7763882875442505, + "learning_rate": 4.172669067627051e-05, + "loss": 0.6591, + "step": 4145 + }, + { + "epoch": 5.30688, + "grad_norm": 0.7644296288490295, + "learning_rate": 4.172468987595038e-05, + "loss": 0.5824, + "step": 4146 + }, + { + "epoch": 5.30816, + "grad_norm": 0.7676565647125244, + "learning_rate": 4.172268907563025e-05, + "loss": 0.6401, + "step": 4147 + }, + { + "epoch": 5.30944, + "grad_norm": 0.7719744443893433, + "learning_rate": 4.1720688275310125e-05, + "loss": 0.701, + "step": 4148 + }, + { + "epoch": 5.31072, + "grad_norm": 0.7409470677375793, + "learning_rate": 
4.171868747499e-05, + "loss": 0.596, + "step": 4149 + }, + { + "epoch": 5.312, + "grad_norm": 0.7712857723236084, + "learning_rate": 4.171668667466987e-05, + "loss": 0.6428, + "step": 4150 + }, + { + "epoch": 5.31328, + "grad_norm": 0.8165924549102783, + "learning_rate": 4.171468587434974e-05, + "loss": 0.6534, + "step": 4151 + }, + { + "epoch": 5.31456, + "grad_norm": 0.7560482025146484, + "learning_rate": 4.171268507402961e-05, + "loss": 0.6532, + "step": 4152 + }, + { + "epoch": 5.31584, + "grad_norm": 0.740051805973053, + "learning_rate": 4.171068427370949e-05, + "loss": 0.5908, + "step": 4153 + }, + { + "epoch": 5.31712, + "grad_norm": 0.7237354516983032, + "learning_rate": 4.1708683473389356e-05, + "loss": 0.6267, + "step": 4154 + }, + { + "epoch": 5.3184000000000005, + "grad_norm": 0.7441720962524414, + "learning_rate": 4.170668267306923e-05, + "loss": 0.5965, + "step": 4155 + }, + { + "epoch": 5.31968, + "grad_norm": 0.770770788192749, + "learning_rate": 4.17046818727491e-05, + "loss": 0.6421, + "step": 4156 + }, + { + "epoch": 5.32096, + "grad_norm": 0.7334791421890259, + "learning_rate": 4.170268107242897e-05, + "loss": 0.6324, + "step": 4157 + }, + { + "epoch": 5.32224, + "grad_norm": 0.7998023629188538, + "learning_rate": 4.1700680272108844e-05, + "loss": 0.6521, + "step": 4158 + }, + { + "epoch": 5.32352, + "grad_norm": 0.7167679071426392, + "learning_rate": 4.1698679471788716e-05, + "loss": 0.6568, + "step": 4159 + }, + { + "epoch": 5.3248, + "grad_norm": 0.6926052570343018, + "learning_rate": 4.1696678671468594e-05, + "loss": 0.5946, + "step": 4160 + }, + { + "epoch": 5.32608, + "grad_norm": 0.704468846321106, + "learning_rate": 4.1694677871148466e-05, + "loss": 0.5969, + "step": 4161 + }, + { + "epoch": 5.32736, + "grad_norm": 0.7154397368431091, + "learning_rate": 4.169267707082833e-05, + "loss": 0.6305, + "step": 4162 + }, + { + "epoch": 5.32864, + "grad_norm": 0.7278268337249756, + "learning_rate": 4.16906762705082e-05, + "loss": 0.6246, + 
"step": 4163 + }, + { + "epoch": 5.3299199999999995, + "grad_norm": 0.7837611436843872, + "learning_rate": 4.1688675470188075e-05, + "loss": 0.6799, + "step": 4164 + }, + { + "epoch": 5.3312, + "grad_norm": 0.7822878360748291, + "learning_rate": 4.168667466986795e-05, + "loss": 0.6387, + "step": 4165 + }, + { + "epoch": 5.33248, + "grad_norm": 0.7381898760795593, + "learning_rate": 4.168467386954782e-05, + "loss": 0.6071, + "step": 4166 + }, + { + "epoch": 5.33376, + "grad_norm": 0.7611213326454163, + "learning_rate": 4.16826730692277e-05, + "loss": 0.5542, + "step": 4167 + }, + { + "epoch": 5.33504, + "grad_norm": 0.7357604503631592, + "learning_rate": 4.168067226890757e-05, + "loss": 0.6474, + "step": 4168 + }, + { + "epoch": 5.33632, + "grad_norm": 0.7721554636955261, + "learning_rate": 4.167867146858744e-05, + "loss": 0.61, + "step": 4169 + }, + { + "epoch": 5.3376, + "grad_norm": 0.7680994868278503, + "learning_rate": 4.1676670668267306e-05, + "loss": 0.672, + "step": 4170 + }, + { + "epoch": 5.33888, + "grad_norm": 0.7215784192085266, + "learning_rate": 4.167466986794718e-05, + "loss": 0.6394, + "step": 4171 + }, + { + "epoch": 5.34016, + "grad_norm": 0.7135915160179138, + "learning_rate": 4.167266906762705e-05, + "loss": 0.6101, + "step": 4172 + }, + { + "epoch": 5.34144, + "grad_norm": 0.7465267181396484, + "learning_rate": 4.167066826730692e-05, + "loss": 0.6649, + "step": 4173 + }, + { + "epoch": 5.34272, + "grad_norm": 0.7213309407234192, + "learning_rate": 4.16686674669868e-05, + "loss": 0.595, + "step": 4174 + }, + { + "epoch": 5.344, + "grad_norm": 0.7122322916984558, + "learning_rate": 4.166666666666667e-05, + "loss": 0.643, + "step": 4175 + }, + { + "epoch": 5.34528, + "grad_norm": 0.7451832890510559, + "learning_rate": 4.1664665866346544e-05, + "loss": 0.6457, + "step": 4176 + }, + { + "epoch": 5.34656, + "grad_norm": 0.7480894923210144, + "learning_rate": 4.1662665066026416e-05, + "loss": 0.6788, + "step": 4177 + }, + { + "epoch": 5.34784, + 
"grad_norm": 0.742680013179779, + "learning_rate": 4.166066426570628e-05, + "loss": 0.6303, + "step": 4178 + }, + { + "epoch": 5.34912, + "grad_norm": 0.7494868636131287, + "learning_rate": 4.165866346538615e-05, + "loss": 0.6294, + "step": 4179 + }, + { + "epoch": 5.3504, + "grad_norm": 0.7331695556640625, + "learning_rate": 4.1656662665066025e-05, + "loss": 0.6088, + "step": 4180 + }, + { + "epoch": 5.35168, + "grad_norm": 0.7193648219108582, + "learning_rate": 4.16546618647459e-05, + "loss": 0.624, + "step": 4181 + }, + { + "epoch": 5.35296, + "grad_norm": 0.7384651303291321, + "learning_rate": 4.1652661064425775e-05, + "loss": 0.6541, + "step": 4182 + }, + { + "epoch": 5.35424, + "grad_norm": 0.7119219899177551, + "learning_rate": 4.165066026410565e-05, + "loss": 0.5827, + "step": 4183 + }, + { + "epoch": 5.35552, + "grad_norm": 0.7343322038650513, + "learning_rate": 4.164865946378552e-05, + "loss": 0.5935, + "step": 4184 + }, + { + "epoch": 5.3568, + "grad_norm": 0.7368735671043396, + "learning_rate": 4.164665866346539e-05, + "loss": 0.6512, + "step": 4185 + }, + { + "epoch": 5.35808, + "grad_norm": 0.7362673282623291, + "learning_rate": 4.1644657863145256e-05, + "loss": 0.5961, + "step": 4186 + }, + { + "epoch": 5.35936, + "grad_norm": 0.7422880530357361, + "learning_rate": 4.164265706282513e-05, + "loss": 0.5847, + "step": 4187 + }, + { + "epoch": 5.36064, + "grad_norm": 0.7207329869270325, + "learning_rate": 4.1640656262505006e-05, + "loss": 0.645, + "step": 4188 + }, + { + "epoch": 5.36192, + "grad_norm": 0.7351892590522766, + "learning_rate": 4.163865546218488e-05, + "loss": 0.6344, + "step": 4189 + }, + { + "epoch": 5.3632, + "grad_norm": 0.7149160504341125, + "learning_rate": 4.163665466186475e-05, + "loss": 0.6381, + "step": 4190 + }, + { + "epoch": 5.36448, + "grad_norm": 0.7507062554359436, + "learning_rate": 4.163465386154462e-05, + "loss": 0.6487, + "step": 4191 + }, + { + "epoch": 5.36576, + "grad_norm": 0.718169629573822, + "learning_rate": 
4.1632653061224494e-05, + "loss": 0.5903, + "step": 4192 + }, + { + "epoch": 5.36704, + "grad_norm": 0.6719139218330383, + "learning_rate": 4.1630652260904365e-05, + "loss": 0.5836, + "step": 4193 + }, + { + "epoch": 5.36832, + "grad_norm": 0.704992413520813, + "learning_rate": 4.162865146058423e-05, + "loss": 0.6264, + "step": 4194 + }, + { + "epoch": 5.3696, + "grad_norm": 0.7397636771202087, + "learning_rate": 4.16266506602641e-05, + "loss": 0.6152, + "step": 4195 + }, + { + "epoch": 5.37088, + "grad_norm": 0.7924968600273132, + "learning_rate": 4.162464985994398e-05, + "loss": 0.6927, + "step": 4196 + }, + { + "epoch": 5.37216, + "grad_norm": 0.7544575929641724, + "learning_rate": 4.162264905962385e-05, + "loss": 0.6515, + "step": 4197 + }, + { + "epoch": 5.37344, + "grad_norm": 0.743186891078949, + "learning_rate": 4.1620648259303725e-05, + "loss": 0.5953, + "step": 4198 + }, + { + "epoch": 5.37472, + "grad_norm": 0.708406925201416, + "learning_rate": 4.16186474589836e-05, + "loss": 0.6146, + "step": 4199 + }, + { + "epoch": 5.376, + "grad_norm": 0.7445898652076721, + "learning_rate": 4.161664665866347e-05, + "loss": 0.6632, + "step": 4200 + }, + { + "epoch": 5.37728, + "grad_norm": 0.7561652660369873, + "learning_rate": 4.161464585834334e-05, + "loss": 0.678, + "step": 4201 + }, + { + "epoch": 5.37856, + "grad_norm": 0.7312771081924438, + "learning_rate": 4.1612645058023205e-05, + "loss": 0.5748, + "step": 4202 + }, + { + "epoch": 5.37984, + "grad_norm": 0.7716453671455383, + "learning_rate": 4.1610644257703084e-05, + "loss": 0.6249, + "step": 4203 + }, + { + "epoch": 5.38112, + "grad_norm": 0.7312607765197754, + "learning_rate": 4.1608643457382956e-05, + "loss": 0.66, + "step": 4204 + }, + { + "epoch": 5.3824, + "grad_norm": 0.7225040197372437, + "learning_rate": 4.160664265706283e-05, + "loss": 0.6272, + "step": 4205 + }, + { + "epoch": 5.38368, + "grad_norm": 0.7225370407104492, + "learning_rate": 4.16046418567427e-05, + "loss": 0.6611, + "step": 4206 + }, 
+ { + "epoch": 5.38496, + "grad_norm": 0.7278191447257996, + "learning_rate": 4.160264105642257e-05, + "loss": 0.562, + "step": 4207 + }, + { + "epoch": 5.38624, + "grad_norm": 0.7607426047325134, + "learning_rate": 4.160064025610244e-05, + "loss": 0.6992, + "step": 4208 + }, + { + "epoch": 5.38752, + "grad_norm": 0.7173566818237305, + "learning_rate": 4.1598639455782315e-05, + "loss": 0.6655, + "step": 4209 + }, + { + "epoch": 5.3888, + "grad_norm": 0.6946642398834229, + "learning_rate": 4.159663865546219e-05, + "loss": 0.5802, + "step": 4210 + }, + { + "epoch": 5.39008, + "grad_norm": 0.7513164281845093, + "learning_rate": 4.159463785514206e-05, + "loss": 0.6538, + "step": 4211 + }, + { + "epoch": 5.39136, + "grad_norm": 0.7534035444259644, + "learning_rate": 4.159263705482193e-05, + "loss": 0.6571, + "step": 4212 + }, + { + "epoch": 5.39264, + "grad_norm": 0.7139204740524292, + "learning_rate": 4.15906362545018e-05, + "loss": 0.5496, + "step": 4213 + }, + { + "epoch": 5.39392, + "grad_norm": 0.7388843894004822, + "learning_rate": 4.1588635454181674e-05, + "loss": 0.6338, + "step": 4214 + }, + { + "epoch": 5.3952, + "grad_norm": 0.7447206974029541, + "learning_rate": 4.1586634653861546e-05, + "loss": 0.5863, + "step": 4215 + }, + { + "epoch": 5.39648, + "grad_norm": 0.7396455407142639, + "learning_rate": 4.158463385354142e-05, + "loss": 0.6237, + "step": 4216 + }, + { + "epoch": 5.39776, + "grad_norm": 0.7606258392333984, + "learning_rate": 4.158263305322129e-05, + "loss": 0.66, + "step": 4217 + }, + { + "epoch": 5.39904, + "grad_norm": 0.8394240140914917, + "learning_rate": 4.158063225290116e-05, + "loss": 0.6537, + "step": 4218 + }, + { + "epoch": 5.40032, + "grad_norm": 0.7464375495910645, + "learning_rate": 4.1578631452581034e-05, + "loss": 0.6092, + "step": 4219 + }, + { + "epoch": 5.4016, + "grad_norm": 0.7182469367980957, + "learning_rate": 4.1576630652260906e-05, + "loss": 0.6298, + "step": 4220 + }, + { + "epoch": 5.40288, + "grad_norm": 
0.772256076335907, + "learning_rate": 4.157462985194078e-05, + "loss": 0.6736, + "step": 4221 + }, + { + "epoch": 5.40416, + "grad_norm": 0.7597767114639282, + "learning_rate": 4.157262905162065e-05, + "loss": 0.6706, + "step": 4222 + }, + { + "epoch": 5.4054400000000005, + "grad_norm": 0.7585070729255676, + "learning_rate": 4.157062825130052e-05, + "loss": 0.6311, + "step": 4223 + }, + { + "epoch": 5.40672, + "grad_norm": 0.7339140772819519, + "learning_rate": 4.156862745098039e-05, + "loss": 0.6276, + "step": 4224 + }, + { + "epoch": 5.408, + "grad_norm": 0.7652080059051514, + "learning_rate": 4.1566626650660265e-05, + "loss": 0.6573, + "step": 4225 + }, + { + "epoch": 5.40928, + "grad_norm": 0.7553825974464417, + "learning_rate": 4.156462585034014e-05, + "loss": 0.6619, + "step": 4226 + }, + { + "epoch": 5.41056, + "grad_norm": 0.753115177154541, + "learning_rate": 4.156262505002001e-05, + "loss": 0.6916, + "step": 4227 + }, + { + "epoch": 5.41184, + "grad_norm": 0.7845631241798401, + "learning_rate": 4.156062424969988e-05, + "loss": 0.6693, + "step": 4228 + }, + { + "epoch": 5.41312, + "grad_norm": 0.7271414995193481, + "learning_rate": 4.155862344937975e-05, + "loss": 0.5909, + "step": 4229 + }, + { + "epoch": 5.4144, + "grad_norm": 0.7683008313179016, + "learning_rate": 4.1556622649059624e-05, + "loss": 0.617, + "step": 4230 + }, + { + "epoch": 5.41568, + "grad_norm": 0.7922500967979431, + "learning_rate": 4.15546218487395e-05, + "loss": 0.6502, + "step": 4231 + }, + { + "epoch": 5.4169599999999996, + "grad_norm": 0.7644626498222351, + "learning_rate": 4.155262104841937e-05, + "loss": 0.6364, + "step": 4232 + }, + { + "epoch": 5.41824, + "grad_norm": 0.7368267774581909, + "learning_rate": 4.155062024809924e-05, + "loss": 0.6217, + "step": 4233 + }, + { + "epoch": 5.41952, + "grad_norm": 0.7373183369636536, + "learning_rate": 4.154861944777911e-05, + "loss": 0.6186, + "step": 4234 + }, + { + "epoch": 5.4208, + "grad_norm": 0.7321687936782837, + 
"learning_rate": 4.1546618647458983e-05, + "loss": 0.6367, + "step": 4235 + }, + { + "epoch": 5.42208, + "grad_norm": 0.7199650406837463, + "learning_rate": 4.1544617847138855e-05, + "loss": 0.5642, + "step": 4236 + }, + { + "epoch": 5.42336, + "grad_norm": 0.7745632529258728, + "learning_rate": 4.154261704681873e-05, + "loss": 0.6663, + "step": 4237 + }, + { + "epoch": 5.42464, + "grad_norm": 0.7231164574623108, + "learning_rate": 4.1540616246498606e-05, + "loss": 0.5956, + "step": 4238 + }, + { + "epoch": 5.42592, + "grad_norm": 0.7279819250106812, + "learning_rate": 4.153861544617848e-05, + "loss": 0.6507, + "step": 4239 + }, + { + "epoch": 5.4272, + "grad_norm": 0.7030388116836548, + "learning_rate": 4.153661464585834e-05, + "loss": 0.616, + "step": 4240 + }, + { + "epoch": 5.42848, + "grad_norm": 0.7477138042449951, + "learning_rate": 4.1534613845538215e-05, + "loss": 0.6875, + "step": 4241 + }, + { + "epoch": 5.42976, + "grad_norm": 0.6878876686096191, + "learning_rate": 4.1532613045218086e-05, + "loss": 0.5856, + "step": 4242 + }, + { + "epoch": 5.43104, + "grad_norm": 0.7027300596237183, + "learning_rate": 4.153061224489796e-05, + "loss": 0.6155, + "step": 4243 + }, + { + "epoch": 5.43232, + "grad_norm": 0.7319774627685547, + "learning_rate": 4.152861144457783e-05, + "loss": 0.6515, + "step": 4244 + }, + { + "epoch": 5.4336, + "grad_norm": 0.7408891916275024, + "learning_rate": 4.152661064425771e-05, + "loss": 0.6834, + "step": 4245 + }, + { + "epoch": 5.43488, + "grad_norm": 0.7682157754898071, + "learning_rate": 4.152460984393758e-05, + "loss": 0.6244, + "step": 4246 + }, + { + "epoch": 5.43616, + "grad_norm": 0.7219270467758179, + "learning_rate": 4.152260904361745e-05, + "loss": 0.631, + "step": 4247 + }, + { + "epoch": 5.43744, + "grad_norm": 0.7537044882774353, + "learning_rate": 4.152060824329732e-05, + "loss": 0.6456, + "step": 4248 + }, + { + "epoch": 5.43872, + "grad_norm": 0.7409535050392151, + "learning_rate": 4.151860744297719e-05, + "loss": 
0.6145, + "step": 4249 + }, + { + "epoch": 5.44, + "grad_norm": 0.7161388993263245, + "learning_rate": 4.151660664265706e-05, + "loss": 0.608, + "step": 4250 + }, + { + "epoch": 5.44128, + "grad_norm": 0.7184132933616638, + "learning_rate": 4.151460584233693e-05, + "loss": 0.6225, + "step": 4251 + }, + { + "epoch": 5.44256, + "grad_norm": 0.7418175935745239, + "learning_rate": 4.151260504201681e-05, + "loss": 0.6434, + "step": 4252 + }, + { + "epoch": 5.44384, + "grad_norm": 0.791469395160675, + "learning_rate": 4.1510604241696684e-05, + "loss": 0.7016, + "step": 4253 + }, + { + "epoch": 5.44512, + "grad_norm": 0.7191916108131409, + "learning_rate": 4.1508603441376556e-05, + "loss": 0.5732, + "step": 4254 + }, + { + "epoch": 5.4464, + "grad_norm": 0.723224937915802, + "learning_rate": 4.150660264105643e-05, + "loss": 0.6206, + "step": 4255 + }, + { + "epoch": 5.44768, + "grad_norm": 0.7462792992591858, + "learning_rate": 4.150460184073629e-05, + "loss": 0.6278, + "step": 4256 + }, + { + "epoch": 5.44896, + "grad_norm": 0.7277225255966187, + "learning_rate": 4.1502601040416164e-05, + "loss": 0.6239, + "step": 4257 + }, + { + "epoch": 5.45024, + "grad_norm": 0.7746177911758423, + "learning_rate": 4.1500600240096036e-05, + "loss": 0.6549, + "step": 4258 + }, + { + "epoch": 5.45152, + "grad_norm": 0.7281146049499512, + "learning_rate": 4.1498599439775915e-05, + "loss": 0.6261, + "step": 4259 + }, + { + "epoch": 5.4528, + "grad_norm": 0.751801609992981, + "learning_rate": 4.149659863945579e-05, + "loss": 0.6514, + "step": 4260 + }, + { + "epoch": 5.45408, + "grad_norm": 0.7374839782714844, + "learning_rate": 4.149459783913566e-05, + "loss": 0.6498, + "step": 4261 + }, + { + "epoch": 5.45536, + "grad_norm": 0.7305448055267334, + "learning_rate": 4.149259703881553e-05, + "loss": 0.5913, + "step": 4262 + }, + { + "epoch": 5.45664, + "grad_norm": 0.7178943753242493, + "learning_rate": 4.14905962384954e-05, + "loss": 0.6412, + "step": 4263 + }, + { + "epoch": 5.45792, + 
"grad_norm": 0.7705517411231995, + "learning_rate": 4.148859543817527e-05, + "loss": 0.6451, + "step": 4264 + }, + { + "epoch": 5.4592, + "grad_norm": 0.7343666553497314, + "learning_rate": 4.148659463785514e-05, + "loss": 0.632, + "step": 4265 + }, + { + "epoch": 5.4604800000000004, + "grad_norm": 0.7479756474494934, + "learning_rate": 4.148459383753502e-05, + "loss": 0.6502, + "step": 4266 + }, + { + "epoch": 5.46176, + "grad_norm": 0.7750630378723145, + "learning_rate": 4.148259303721489e-05, + "loss": 0.6519, + "step": 4267 + }, + { + "epoch": 5.46304, + "grad_norm": 0.7741780281066895, + "learning_rate": 4.148059223689476e-05, + "loss": 0.6251, + "step": 4268 + }, + { + "epoch": 5.46432, + "grad_norm": 0.7677576541900635, + "learning_rate": 4.147859143657463e-05, + "loss": 0.6463, + "step": 4269 + }, + { + "epoch": 5.4656, + "grad_norm": 0.7549638748168945, + "learning_rate": 4.1476590636254505e-05, + "loss": 0.6795, + "step": 4270 + }, + { + "epoch": 5.46688, + "grad_norm": 0.7305397391319275, + "learning_rate": 4.147458983593438e-05, + "loss": 0.6243, + "step": 4271 + }, + { + "epoch": 5.46816, + "grad_norm": 0.7237127423286438, + "learning_rate": 4.147258903561424e-05, + "loss": 0.6224, + "step": 4272 + }, + { + "epoch": 5.46944, + "grad_norm": 0.6945486068725586, + "learning_rate": 4.147058823529412e-05, + "loss": 0.6099, + "step": 4273 + }, + { + "epoch": 5.47072, + "grad_norm": 0.7361446619033813, + "learning_rate": 4.146858743497399e-05, + "loss": 0.5984, + "step": 4274 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 0.6917027235031128, + "learning_rate": 4.1466586634653865e-05, + "loss": 0.5837, + "step": 4275 + }, + { + "epoch": 5.47328, + "grad_norm": 0.7172991633415222, + "learning_rate": 4.1464585834333736e-05, + "loss": 0.6568, + "step": 4276 + }, + { + "epoch": 5.47456, + "grad_norm": 0.6892058253288269, + "learning_rate": 4.146258503401361e-05, + "loss": 0.6219, + "step": 4277 + }, + { + "epoch": 5.47584, + "grad_norm": 
0.7407313585281372, + "learning_rate": 4.146058423369348e-05, + "loss": 0.6509, + "step": 4278 + }, + { + "epoch": 5.47712, + "grad_norm": 0.7310953736305237, + "learning_rate": 4.145858343337335e-05, + "loss": 0.6227, + "step": 4279 + }, + { + "epoch": 5.4784, + "grad_norm": 0.7653133273124695, + "learning_rate": 4.1456582633053224e-05, + "loss": 0.6519, + "step": 4280 + }, + { + "epoch": 5.47968, + "grad_norm": 0.7355585098266602, + "learning_rate": 4.1454581832733096e-05, + "loss": 0.6163, + "step": 4281 + }, + { + "epoch": 5.48096, + "grad_norm": 0.7199105620384216, + "learning_rate": 4.145258103241297e-05, + "loss": 0.6104, + "step": 4282 + }, + { + "epoch": 5.48224, + "grad_norm": 0.7551198601722717, + "learning_rate": 4.145058023209284e-05, + "loss": 0.6599, + "step": 4283 + }, + { + "epoch": 5.48352, + "grad_norm": 0.7141267657279968, + "learning_rate": 4.144857943177271e-05, + "loss": 0.5802, + "step": 4284 + }, + { + "epoch": 5.4848, + "grad_norm": 0.7375865578651428, + "learning_rate": 4.144657863145258e-05, + "loss": 0.6395, + "step": 4285 + }, + { + "epoch": 5.48608, + "grad_norm": 0.7291966676712036, + "learning_rate": 4.1444577831132455e-05, + "loss": 0.6074, + "step": 4286 + }, + { + "epoch": 5.48736, + "grad_norm": 0.7387133836746216, + "learning_rate": 4.144257703081233e-05, + "loss": 0.6659, + "step": 4287 + }, + { + "epoch": 5.48864, + "grad_norm": 0.7290977835655212, + "learning_rate": 4.14405762304922e-05, + "loss": 0.5999, + "step": 4288 + }, + { + "epoch": 5.48992, + "grad_norm": 0.7163506746292114, + "learning_rate": 4.143857543017207e-05, + "loss": 0.5901, + "step": 4289 + }, + { + "epoch": 5.4912, + "grad_norm": 0.7519278526306152, + "learning_rate": 4.143657462985194e-05, + "loss": 0.6746, + "step": 4290 + }, + { + "epoch": 5.49248, + "grad_norm": 0.7682675123214722, + "learning_rate": 4.1434573829531814e-05, + "loss": 0.6694, + "step": 4291 + }, + { + "epoch": 5.49376, + "grad_norm": 0.7485939264297485, + "learning_rate": 
4.1432573029211686e-05, + "loss": 0.6073, + "step": 4292 + }, + { + "epoch": 5.49504, + "grad_norm": 0.7766785621643066, + "learning_rate": 4.143057222889156e-05, + "loss": 0.5799, + "step": 4293 + }, + { + "epoch": 5.49632, + "grad_norm": 0.7536609768867493, + "learning_rate": 4.1428571428571437e-05, + "loss": 0.6169, + "step": 4294 + }, + { + "epoch": 5.4976, + "grad_norm": 0.725071907043457, + "learning_rate": 4.14265706282513e-05, + "loss": 0.6393, + "step": 4295 + }, + { + "epoch": 5.49888, + "grad_norm": 0.7046527862548828, + "learning_rate": 4.1424569827931173e-05, + "loss": 0.5988, + "step": 4296 + }, + { + "epoch": 5.50016, + "grad_norm": 0.7888898849487305, + "learning_rate": 4.1422569027611045e-05, + "loss": 0.6408, + "step": 4297 + }, + { + "epoch": 5.50144, + "grad_norm": 0.7429561614990234, + "learning_rate": 4.142056822729092e-05, + "loss": 0.6463, + "step": 4298 + }, + { + "epoch": 5.50272, + "grad_norm": 0.7440699934959412, + "learning_rate": 4.141856742697079e-05, + "loss": 0.652, + "step": 4299 + }, + { + "epoch": 5.504, + "grad_norm": 0.7715908885002136, + "learning_rate": 4.141656662665066e-05, + "loss": 0.6796, + "step": 4300 + }, + { + "epoch": 5.50528, + "grad_norm": 0.714020311832428, + "learning_rate": 4.141456582633054e-05, + "loss": 0.5743, + "step": 4301 + }, + { + "epoch": 5.50656, + "grad_norm": 0.7244009375572205, + "learning_rate": 4.141256502601041e-05, + "loss": 0.6498, + "step": 4302 + }, + { + "epoch": 5.50784, + "grad_norm": 0.7662729620933533, + "learning_rate": 4.1410564225690276e-05, + "loss": 0.6456, + "step": 4303 + }, + { + "epoch": 5.50912, + "grad_norm": 0.6930511593818665, + "learning_rate": 4.140856342537015e-05, + "loss": 0.5915, + "step": 4304 + }, + { + "epoch": 5.5104, + "grad_norm": 0.7424132227897644, + "learning_rate": 4.140656262505002e-05, + "loss": 0.6082, + "step": 4305 + }, + { + "epoch": 5.51168, + "grad_norm": 0.7727630734443665, + "learning_rate": 4.140456182472989e-05, + "loss": 0.6735, + "step": 4306 
+ }, + { + "epoch": 5.51296, + "grad_norm": 0.6813573837280273, + "learning_rate": 4.1402561024409764e-05, + "loss": 0.5995, + "step": 4307 + }, + { + "epoch": 5.51424, + "grad_norm": 0.7176674008369446, + "learning_rate": 4.1400560224089636e-05, + "loss": 0.6404, + "step": 4308 + }, + { + "epoch": 5.51552, + "grad_norm": 0.74003666639328, + "learning_rate": 4.1398559423769514e-05, + "loss": 0.6219, + "step": 4309 + }, + { + "epoch": 5.5168, + "grad_norm": 0.705366313457489, + "learning_rate": 4.1396558623449386e-05, + "loss": 0.6144, + "step": 4310 + }, + { + "epoch": 5.51808, + "grad_norm": 0.70579594373703, + "learning_rate": 4.139455782312925e-05, + "loss": 0.6345, + "step": 4311 + }, + { + "epoch": 5.51936, + "grad_norm": 0.776077926158905, + "learning_rate": 4.139255702280912e-05, + "loss": 0.6859, + "step": 4312 + }, + { + "epoch": 5.52064, + "grad_norm": 0.7283969521522522, + "learning_rate": 4.1390556222488995e-05, + "loss": 0.5956, + "step": 4313 + }, + { + "epoch": 5.52192, + "grad_norm": 0.7156528234481812, + "learning_rate": 4.138855542216887e-05, + "loss": 0.6284, + "step": 4314 + }, + { + "epoch": 5.5232, + "grad_norm": 0.7174420356750488, + "learning_rate": 4.138655462184874e-05, + "loss": 0.5826, + "step": 4315 + }, + { + "epoch": 5.52448, + "grad_norm": 0.7152808904647827, + "learning_rate": 4.138455382152862e-05, + "loss": 0.5983, + "step": 4316 + }, + { + "epoch": 5.52576, + "grad_norm": 0.7197295427322388, + "learning_rate": 4.138255302120849e-05, + "loss": 0.6156, + "step": 4317 + }, + { + "epoch": 5.5270399999999995, + "grad_norm": 0.7645232677459717, + "learning_rate": 4.138055222088836e-05, + "loss": 0.6525, + "step": 4318 + }, + { + "epoch": 5.52832, + "grad_norm": 0.6872177720069885, + "learning_rate": 4.1378551420568226e-05, + "loss": 0.6044, + "step": 4319 + }, + { + "epoch": 5.5296, + "grad_norm": 0.7118226289749146, + "learning_rate": 4.13765506202481e-05, + "loss": 0.6176, + "step": 4320 + }, + { + "epoch": 5.53088, + "grad_norm": 
0.7357558608055115, + "learning_rate": 4.137454981992797e-05, + "loss": 0.6369, + "step": 4321 + }, + { + "epoch": 5.53216, + "grad_norm": 0.7684040069580078, + "learning_rate": 4.137254901960784e-05, + "loss": 0.639, + "step": 4322 + }, + { + "epoch": 5.53344, + "grad_norm": 0.7693250179290771, + "learning_rate": 4.137054821928772e-05, + "loss": 0.6801, + "step": 4323 + }, + { + "epoch": 5.53472, + "grad_norm": 0.6828309893608093, + "learning_rate": 4.136854741896759e-05, + "loss": 0.6232, + "step": 4324 + }, + { + "epoch": 5.536, + "grad_norm": 0.7307888865470886, + "learning_rate": 4.1366546618647464e-05, + "loss": 0.5854, + "step": 4325 + }, + { + "epoch": 5.53728, + "grad_norm": 0.7778929471969604, + "learning_rate": 4.1364545818327336e-05, + "loss": 0.6335, + "step": 4326 + }, + { + "epoch": 5.53856, + "grad_norm": 0.7381669878959656, + "learning_rate": 4.13625450180072e-05, + "loss": 0.6113, + "step": 4327 + }, + { + "epoch": 5.53984, + "grad_norm": 0.7228230834007263, + "learning_rate": 4.136054421768707e-05, + "loss": 0.6167, + "step": 4328 + }, + { + "epoch": 5.54112, + "grad_norm": 0.7991678714752197, + "learning_rate": 4.1358543417366945e-05, + "loss": 0.6445, + "step": 4329 + }, + { + "epoch": 5.5424, + "grad_norm": 0.7551771402359009, + "learning_rate": 4.1356542617046823e-05, + "loss": 0.6068, + "step": 4330 + }, + { + "epoch": 5.54368, + "grad_norm": 0.7122798562049866, + "learning_rate": 4.1354541816726695e-05, + "loss": 0.6084, + "step": 4331 + }, + { + "epoch": 5.54496, + "grad_norm": 0.7480770349502563, + "learning_rate": 4.135254101640657e-05, + "loss": 0.6652, + "step": 4332 + }, + { + "epoch": 5.54624, + "grad_norm": 0.7483392953872681, + "learning_rate": 4.135054021608644e-05, + "loss": 0.6547, + "step": 4333 + }, + { + "epoch": 5.5475200000000005, + "grad_norm": 0.7390307784080505, + "learning_rate": 4.134853941576631e-05, + "loss": 0.6251, + "step": 4334 + }, + { + "epoch": 5.5488, + "grad_norm": 0.7169946432113647, + "learning_rate": 
4.1346538615446176e-05, + "loss": 0.5833, + "step": 4335 + }, + { + "epoch": 5.55008, + "grad_norm": 0.799982488155365, + "learning_rate": 4.134453781512605e-05, + "loss": 0.6722, + "step": 4336 + }, + { + "epoch": 5.55136, + "grad_norm": 0.7108681201934814, + "learning_rate": 4.1342537014805926e-05, + "loss": 0.5692, + "step": 4337 + }, + { + "epoch": 5.55264, + "grad_norm": 0.7432281374931335, + "learning_rate": 4.13405362144858e-05, + "loss": 0.6642, + "step": 4338 + }, + { + "epoch": 5.55392, + "grad_norm": 0.7395031452178955, + "learning_rate": 4.133853541416567e-05, + "loss": 0.6601, + "step": 4339 + }, + { + "epoch": 5.5552, + "grad_norm": 0.7047814130783081, + "learning_rate": 4.133653461384554e-05, + "loss": 0.6566, + "step": 4340 + }, + { + "epoch": 5.55648, + "grad_norm": 0.7198272347450256, + "learning_rate": 4.1334533813525414e-05, + "loss": 0.6494, + "step": 4341 + }, + { + "epoch": 5.55776, + "grad_norm": 0.690862238407135, + "learning_rate": 4.1332533013205286e-05, + "loss": 0.5978, + "step": 4342 + }, + { + "epoch": 5.5590399999999995, + "grad_norm": 0.7230415940284729, + "learning_rate": 4.133053221288515e-05, + "loss": 0.6449, + "step": 4343 + }, + { + "epoch": 5.56032, + "grad_norm": 0.7639696002006531, + "learning_rate": 4.132853141256503e-05, + "loss": 0.659, + "step": 4344 + }, + { + "epoch": 5.5616, + "grad_norm": 0.7363530397415161, + "learning_rate": 4.13265306122449e-05, + "loss": 0.5586, + "step": 4345 + }, + { + "epoch": 5.56288, + "grad_norm": 0.7204784750938416, + "learning_rate": 4.132452981192477e-05, + "loss": 0.6629, + "step": 4346 + }, + { + "epoch": 5.56416, + "grad_norm": 0.787613570690155, + "learning_rate": 4.1322529011604645e-05, + "loss": 0.6596, + "step": 4347 + }, + { + "epoch": 5.56544, + "grad_norm": 0.7643619179725647, + "learning_rate": 4.132052821128452e-05, + "loss": 0.6955, + "step": 4348 + }, + { + "epoch": 5.56672, + "grad_norm": 0.7102790474891663, + "learning_rate": 4.131852741096439e-05, + "loss": 0.6573, + 
"step": 4349 + }, + { + "epoch": 5.568, + "grad_norm": 0.7346861362457275, + "learning_rate": 4.131652661064426e-05, + "loss": 0.638, + "step": 4350 + }, + { + "epoch": 5.56928, + "grad_norm": 0.7016168832778931, + "learning_rate": 4.131452581032413e-05, + "loss": 0.641, + "step": 4351 + }, + { + "epoch": 5.57056, + "grad_norm": 0.7079355120658875, + "learning_rate": 4.1312525010004004e-05, + "loss": 0.6284, + "step": 4352 + }, + { + "epoch": 5.57184, + "grad_norm": 0.7500919699668884, + "learning_rate": 4.1310524209683876e-05, + "loss": 0.6272, + "step": 4353 + }, + { + "epoch": 5.57312, + "grad_norm": 0.708759605884552, + "learning_rate": 4.130852340936375e-05, + "loss": 0.5711, + "step": 4354 + }, + { + "epoch": 5.5744, + "grad_norm": 0.7522294521331787, + "learning_rate": 4.130652260904362e-05, + "loss": 0.6849, + "step": 4355 + }, + { + "epoch": 5.57568, + "grad_norm": 0.7236372828483582, + "learning_rate": 4.130452180872349e-05, + "loss": 0.612, + "step": 4356 + }, + { + "epoch": 5.57696, + "grad_norm": 0.7170467972755432, + "learning_rate": 4.1302521008403364e-05, + "loss": 0.6623, + "step": 4357 + }, + { + "epoch": 5.57824, + "grad_norm": 0.7366024255752563, + "learning_rate": 4.1300520208083235e-05, + "loss": 0.6025, + "step": 4358 + }, + { + "epoch": 5.5795200000000005, + "grad_norm": 0.7273461222648621, + "learning_rate": 4.129851940776311e-05, + "loss": 0.6107, + "step": 4359 + }, + { + "epoch": 5.5808, + "grad_norm": 0.7220586538314819, + "learning_rate": 4.129651860744298e-05, + "loss": 0.5956, + "step": 4360 + }, + { + "epoch": 5.58208, + "grad_norm": 0.6963488459587097, + "learning_rate": 4.129451780712285e-05, + "loss": 0.6292, + "step": 4361 + }, + { + "epoch": 5.58336, + "grad_norm": 0.7539929747581482, + "learning_rate": 4.129251700680272e-05, + "loss": 0.6307, + "step": 4362 + }, + { + "epoch": 5.58464, + "grad_norm": 0.7906867265701294, + "learning_rate": 4.1290516206482595e-05, + "loss": 0.6993, + "step": 4363 + }, + { + "epoch": 5.58592, + 
"grad_norm": 0.7354369163513184, + "learning_rate": 4.1288515406162467e-05, + "loss": 0.6817, + "step": 4364 + }, + { + "epoch": 5.5872, + "grad_norm": 0.7213559150695801, + "learning_rate": 4.128651460584234e-05, + "loss": 0.601, + "step": 4365 + }, + { + "epoch": 5.58848, + "grad_norm": 0.782337486743927, + "learning_rate": 4.128451380552221e-05, + "loss": 0.6586, + "step": 4366 + }, + { + "epoch": 5.58976, + "grad_norm": 0.720512330532074, + "learning_rate": 4.128251300520208e-05, + "loss": 0.5993, + "step": 4367 + }, + { + "epoch": 5.59104, + "grad_norm": 0.7558559775352478, + "learning_rate": 4.1280512204881954e-05, + "loss": 0.6544, + "step": 4368 + }, + { + "epoch": 5.59232, + "grad_norm": 0.7189001441001892, + "learning_rate": 4.1278511404561826e-05, + "loss": 0.5955, + "step": 4369 + }, + { + "epoch": 5.5936, + "grad_norm": 0.7266587018966675, + "learning_rate": 4.12765106042417e-05, + "loss": 0.6626, + "step": 4370 + }, + { + "epoch": 5.59488, + "grad_norm": 0.7241358160972595, + "learning_rate": 4.127450980392157e-05, + "loss": 0.5832, + "step": 4371 + }, + { + "epoch": 5.59616, + "grad_norm": 0.732850968837738, + "learning_rate": 4.127250900360145e-05, + "loss": 0.6341, + "step": 4372 + }, + { + "epoch": 5.59744, + "grad_norm": 0.7497940063476562, + "learning_rate": 4.127050820328131e-05, + "loss": 0.6735, + "step": 4373 + }, + { + "epoch": 5.59872, + "grad_norm": 0.7972435355186462, + "learning_rate": 4.1268507402961185e-05, + "loss": 0.6529, + "step": 4374 + }, + { + "epoch": 5.6, + "grad_norm": 0.7178497314453125, + "learning_rate": 4.126650660264106e-05, + "loss": 0.5903, + "step": 4375 + }, + { + "epoch": 5.60128, + "grad_norm": 0.7551731467247009, + "learning_rate": 4.126450580232093e-05, + "loss": 0.6286, + "step": 4376 + }, + { + "epoch": 5.60256, + "grad_norm": 0.7458837032318115, + "learning_rate": 4.12625050020008e-05, + "loss": 0.6696, + "step": 4377 + }, + { + "epoch": 5.60384, + "grad_norm": 0.7230226993560791, + "learning_rate": 
4.126050420168067e-05, + "loss": 0.6121, + "step": 4378 + }, + { + "epoch": 5.60512, + "grad_norm": 0.714648425579071, + "learning_rate": 4.125850340136055e-05, + "loss": 0.6175, + "step": 4379 + }, + { + "epoch": 5.6064, + "grad_norm": 0.713045060634613, + "learning_rate": 4.125650260104042e-05, + "loss": 0.6181, + "step": 4380 + }, + { + "epoch": 5.60768, + "grad_norm": 0.7239891886711121, + "learning_rate": 4.125450180072029e-05, + "loss": 0.6454, + "step": 4381 + }, + { + "epoch": 5.60896, + "grad_norm": 0.7390968799591064, + "learning_rate": 4.125250100040016e-05, + "loss": 0.598, + "step": 4382 + }, + { + "epoch": 5.61024, + "grad_norm": 0.7643166780471802, + "learning_rate": 4.125050020008003e-05, + "loss": 0.6299, + "step": 4383 + }, + { + "epoch": 5.61152, + "grad_norm": 0.7632148861885071, + "learning_rate": 4.1248499399759904e-05, + "loss": 0.6601, + "step": 4384 + }, + { + "epoch": 5.6128, + "grad_norm": 0.74639493227005, + "learning_rate": 4.1246498599439776e-05, + "loss": 0.7099, + "step": 4385 + }, + { + "epoch": 5.6140799999999995, + "grad_norm": 0.7428092956542969, + "learning_rate": 4.1244497799119654e-05, + "loss": 0.6596, + "step": 4386 + }, + { + "epoch": 5.61536, + "grad_norm": 0.7230287790298462, + "learning_rate": 4.1242496998799526e-05, + "loss": 0.603, + "step": 4387 + }, + { + "epoch": 5.61664, + "grad_norm": 0.7466754913330078, + "learning_rate": 4.12404961984794e-05, + "loss": 0.702, + "step": 4388 + }, + { + "epoch": 5.61792, + "grad_norm": 0.7360090017318726, + "learning_rate": 4.123849539815926e-05, + "loss": 0.6036, + "step": 4389 + }, + { + "epoch": 5.6192, + "grad_norm": 0.7444376945495605, + "learning_rate": 4.1236494597839135e-05, + "loss": 0.6554, + "step": 4390 + }, + { + "epoch": 5.62048, + "grad_norm": 0.7407775521278381, + "learning_rate": 4.123449379751901e-05, + "loss": 0.6132, + "step": 4391 + }, + { + "epoch": 5.62176, + "grad_norm": 0.6796741485595703, + "learning_rate": 4.123249299719888e-05, + "loss": 0.5356, + 
"step": 4392 + }, + { + "epoch": 5.62304, + "grad_norm": 0.7613147497177124, + "learning_rate": 4.123049219687876e-05, + "loss": 0.6522, + "step": 4393 + }, + { + "epoch": 5.62432, + "grad_norm": 0.7880995273590088, + "learning_rate": 4.122849139655863e-05, + "loss": 0.6211, + "step": 4394 + }, + { + "epoch": 5.6256, + "grad_norm": 0.7633781433105469, + "learning_rate": 4.12264905962385e-05, + "loss": 0.6276, + "step": 4395 + }, + { + "epoch": 5.62688, + "grad_norm": 0.7847978472709656, + "learning_rate": 4.122448979591837e-05, + "loss": 0.6798, + "step": 4396 + }, + { + "epoch": 5.62816, + "grad_norm": 0.7588475346565247, + "learning_rate": 4.122248899559824e-05, + "loss": 0.642, + "step": 4397 + }, + { + "epoch": 5.62944, + "grad_norm": 0.7453870177268982, + "learning_rate": 4.122048819527811e-05, + "loss": 0.623, + "step": 4398 + }, + { + "epoch": 5.63072, + "grad_norm": 0.7075188755989075, + "learning_rate": 4.121848739495798e-05, + "loss": 0.6164, + "step": 4399 + }, + { + "epoch": 5.632, + "grad_norm": 0.7136649489402771, + "learning_rate": 4.121648659463786e-05, + "loss": 0.6288, + "step": 4400 + }, + { + "epoch": 5.63328, + "grad_norm": 0.7545991539955139, + "learning_rate": 4.121448579431773e-05, + "loss": 0.6107, + "step": 4401 + }, + { + "epoch": 5.6345600000000005, + "grad_norm": 0.7157647013664246, + "learning_rate": 4.1212484993997604e-05, + "loss": 0.6143, + "step": 4402 + }, + { + "epoch": 5.63584, + "grad_norm": 0.6989755630493164, + "learning_rate": 4.1210484193677476e-05, + "loss": 0.5937, + "step": 4403 + }, + { + "epoch": 5.63712, + "grad_norm": 0.7509260177612305, + "learning_rate": 4.120848339335735e-05, + "loss": 0.6746, + "step": 4404 + }, + { + "epoch": 5.6384, + "grad_norm": 0.7333345413208008, + "learning_rate": 4.120648259303721e-05, + "loss": 0.6164, + "step": 4405 + }, + { + "epoch": 5.63968, + "grad_norm": 0.739132285118103, + "learning_rate": 4.1204481792717085e-05, + "loss": 0.6531, + "step": 4406 + }, + { + "epoch": 5.64096, + 
"grad_norm": 0.7224278450012207, + "learning_rate": 4.120248099239696e-05, + "loss": 0.5819, + "step": 4407 + }, + { + "epoch": 5.64224, + "grad_norm": 0.7341471910476685, + "learning_rate": 4.1200480192076835e-05, + "loss": 0.607, + "step": 4408 + }, + { + "epoch": 5.64352, + "grad_norm": 0.7280572056770325, + "learning_rate": 4.119847939175671e-05, + "loss": 0.6057, + "step": 4409 + }, + { + "epoch": 5.6448, + "grad_norm": 0.759056568145752, + "learning_rate": 4.119647859143658e-05, + "loss": 0.629, + "step": 4410 + }, + { + "epoch": 5.6460799999999995, + "grad_norm": 0.7521056532859802, + "learning_rate": 4.119447779111645e-05, + "loss": 0.6231, + "step": 4411 + }, + { + "epoch": 5.64736, + "grad_norm": 0.7467377185821533, + "learning_rate": 4.119247699079632e-05, + "loss": 0.6442, + "step": 4412 + }, + { + "epoch": 5.64864, + "grad_norm": 0.7342444658279419, + "learning_rate": 4.119047619047619e-05, + "loss": 0.6292, + "step": 4413 + }, + { + "epoch": 5.64992, + "grad_norm": 0.7450172901153564, + "learning_rate": 4.1188475390156066e-05, + "loss": 0.5998, + "step": 4414 + }, + { + "epoch": 5.6512, + "grad_norm": 0.7140272855758667, + "learning_rate": 4.118647458983594e-05, + "loss": 0.6113, + "step": 4415 + }, + { + "epoch": 5.65248, + "grad_norm": 0.6929570436477661, + "learning_rate": 4.118447378951581e-05, + "loss": 0.6088, + "step": 4416 + }, + { + "epoch": 5.65376, + "grad_norm": 0.7283996939659119, + "learning_rate": 4.118247298919568e-05, + "loss": 0.6608, + "step": 4417 + }, + { + "epoch": 5.65504, + "grad_norm": 0.7235706448554993, + "learning_rate": 4.1180472188875554e-05, + "loss": 0.653, + "step": 4418 + }, + { + "epoch": 5.65632, + "grad_norm": 0.7216994166374207, + "learning_rate": 4.1178471388555425e-05, + "loss": 0.6264, + "step": 4419 + }, + { + "epoch": 5.6576, + "grad_norm": 0.7781568169593811, + "learning_rate": 4.11764705882353e-05, + "loss": 0.6419, + "step": 4420 + }, + { + "epoch": 5.65888, + "grad_norm": 0.8012334704399109, + 
"learning_rate": 4.117446978791516e-05, + "loss": 0.6591, + "step": 4421 + }, + { + "epoch": 5.66016, + "grad_norm": 0.7411988973617554, + "learning_rate": 4.117246898759504e-05, + "loss": 0.6353, + "step": 4422 + }, + { + "epoch": 5.66144, + "grad_norm": 0.7188833355903625, + "learning_rate": 4.117046818727491e-05, + "loss": 0.5684, + "step": 4423 + }, + { + "epoch": 5.66272, + "grad_norm": 0.7834116816520691, + "learning_rate": 4.1168467386954785e-05, + "loss": 0.6605, + "step": 4424 + }, + { + "epoch": 5.664, + "grad_norm": 0.7423588037490845, + "learning_rate": 4.1166466586634657e-05, + "loss": 0.6259, + "step": 4425 + }, + { + "epoch": 5.66528, + "grad_norm": 0.732661783695221, + "learning_rate": 4.116446578631453e-05, + "loss": 0.5846, + "step": 4426 + }, + { + "epoch": 5.6665600000000005, + "grad_norm": 0.6886220574378967, + "learning_rate": 4.11624649859944e-05, + "loss": 0.5771, + "step": 4427 + }, + { + "epoch": 5.66784, + "grad_norm": 0.7514188885688782, + "learning_rate": 4.116046418567427e-05, + "loss": 0.6466, + "step": 4428 + }, + { + "epoch": 5.66912, + "grad_norm": 0.7303938269615173, + "learning_rate": 4.1158463385354144e-05, + "loss": 0.6405, + "step": 4429 + }, + { + "epoch": 5.6704, + "grad_norm": 0.7757360935211182, + "learning_rate": 4.1156462585034016e-05, + "loss": 0.6433, + "step": 4430 + }, + { + "epoch": 5.67168, + "grad_norm": 0.7613855004310608, + "learning_rate": 4.115446178471389e-05, + "loss": 0.6125, + "step": 4431 + }, + { + "epoch": 5.67296, + "grad_norm": 0.739494800567627, + "learning_rate": 4.115246098439376e-05, + "loss": 0.5834, + "step": 4432 + }, + { + "epoch": 5.67424, + "grad_norm": 0.7127769589424133, + "learning_rate": 4.115046018407363e-05, + "loss": 0.5963, + "step": 4433 + }, + { + "epoch": 5.67552, + "grad_norm": 0.7135427594184875, + "learning_rate": 4.11484593837535e-05, + "loss": 0.6307, + "step": 4434 + }, + { + "epoch": 5.6768, + "grad_norm": 0.7762594819068909, + "learning_rate": 4.1146458583433375e-05, + 
"loss": 0.666, + "step": 4435 + }, + { + "epoch": 5.67808, + "grad_norm": 0.7698591351509094, + "learning_rate": 4.114445778311325e-05, + "loss": 0.5811, + "step": 4436 + }, + { + "epoch": 5.67936, + "grad_norm": 0.7814648151397705, + "learning_rate": 4.114245698279312e-05, + "loss": 0.6203, + "step": 4437 + }, + { + "epoch": 5.68064, + "grad_norm": 0.7443010210990906, + "learning_rate": 4.114045618247299e-05, + "loss": 0.6657, + "step": 4438 + }, + { + "epoch": 5.68192, + "grad_norm": 0.691595196723938, + "learning_rate": 4.113845538215286e-05, + "loss": 0.6411, + "step": 4439 + }, + { + "epoch": 5.6832, + "grad_norm": 0.7466495037078857, + "learning_rate": 4.1136454581832734e-05, + "loss": 0.6843, + "step": 4440 + }, + { + "epoch": 5.68448, + "grad_norm": 0.7032425999641418, + "learning_rate": 4.1134453781512606e-05, + "loss": 0.5646, + "step": 4441 + }, + { + "epoch": 5.68576, + "grad_norm": 0.7595518827438354, + "learning_rate": 4.113245298119248e-05, + "loss": 0.6562, + "step": 4442 + }, + { + "epoch": 5.68704, + "grad_norm": 0.7430517077445984, + "learning_rate": 4.113045218087235e-05, + "loss": 0.6283, + "step": 4443 + }, + { + "epoch": 5.68832, + "grad_norm": 0.7672387957572937, + "learning_rate": 4.112845138055222e-05, + "loss": 0.6236, + "step": 4444 + }, + { + "epoch": 5.6896, + "grad_norm": 0.7632207274436951, + "learning_rate": 4.1126450580232094e-05, + "loss": 0.6524, + "step": 4445 + }, + { + "epoch": 5.69088, + "grad_norm": 0.7333277463912964, + "learning_rate": 4.1124449779911966e-05, + "loss": 0.6461, + "step": 4446 + }, + { + "epoch": 5.69216, + "grad_norm": 0.7459308505058289, + "learning_rate": 4.112244897959184e-05, + "loss": 0.6702, + "step": 4447 + }, + { + "epoch": 5.69344, + "grad_norm": 0.7271379232406616, + "learning_rate": 4.112044817927171e-05, + "loss": 0.6392, + "step": 4448 + }, + { + "epoch": 5.69472, + "grad_norm": 0.7518323659896851, + "learning_rate": 4.111844737895158e-05, + "loss": 0.6591, + "step": 4449 + }, + { + "epoch": 
5.696, + "grad_norm": 0.7861558794975281, + "learning_rate": 4.111644657863146e-05, + "loss": 0.6533, + "step": 4450 + }, + { + "epoch": 5.69728, + "grad_norm": 0.7315780520439148, + "learning_rate": 4.1114445778311325e-05, + "loss": 0.6908, + "step": 4451 + }, + { + "epoch": 5.69856, + "grad_norm": 0.7502129673957825, + "learning_rate": 4.11124449779912e-05, + "loss": 0.6766, + "step": 4452 + }, + { + "epoch": 5.69984, + "grad_norm": 0.7338371872901917, + "learning_rate": 4.111044417767107e-05, + "loss": 0.6115, + "step": 4453 + }, + { + "epoch": 5.7011199999999995, + "grad_norm": 0.7228209972381592, + "learning_rate": 4.110844337735094e-05, + "loss": 0.6202, + "step": 4454 + }, + { + "epoch": 5.7024, + "grad_norm": 0.7461691498756409, + "learning_rate": 4.110644257703081e-05, + "loss": 0.6349, + "step": 4455 + }, + { + "epoch": 5.70368, + "grad_norm": 0.7166462540626526, + "learning_rate": 4.1104441776710684e-05, + "loss": 0.6712, + "step": 4456 + }, + { + "epoch": 5.70496, + "grad_norm": 0.7871848940849304, + "learning_rate": 4.110244097639056e-05, + "loss": 0.6437, + "step": 4457 + }, + { + "epoch": 5.70624, + "grad_norm": 0.7596803307533264, + "learning_rate": 4.1100440176070435e-05, + "loss": 0.6566, + "step": 4458 + }, + { + "epoch": 5.70752, + "grad_norm": 0.7356720566749573, + "learning_rate": 4.10984393757503e-05, + "loss": 0.6138, + "step": 4459 + }, + { + "epoch": 5.7088, + "grad_norm": 0.7193202376365662, + "learning_rate": 4.109643857543017e-05, + "loss": 0.6187, + "step": 4460 + }, + { + "epoch": 5.71008, + "grad_norm": 0.7889663577079773, + "learning_rate": 4.1094437775110043e-05, + "loss": 0.6713, + "step": 4461 + }, + { + "epoch": 5.71136, + "grad_norm": 0.7016068696975708, + "learning_rate": 4.1092436974789915e-05, + "loss": 0.5817, + "step": 4462 + }, + { + "epoch": 5.71264, + "grad_norm": 0.7515783905982971, + "learning_rate": 4.109043617446979e-05, + "loss": 0.6497, + "step": 4463 + }, + { + "epoch": 5.71392, + "grad_norm": 0.7173634171485901, 
+ "learning_rate": 4.1088435374149666e-05, + "loss": 0.5583, + "step": 4464 + }, + { + "epoch": 5.7152, + "grad_norm": 0.7629589438438416, + "learning_rate": 4.108643457382954e-05, + "loss": 0.6278, + "step": 4465 + }, + { + "epoch": 5.71648, + "grad_norm": 0.7493870854377747, + "learning_rate": 4.108443377350941e-05, + "loss": 0.6373, + "step": 4466 + }, + { + "epoch": 5.71776, + "grad_norm": 0.7296896576881409, + "learning_rate": 4.1082432973189275e-05, + "loss": 0.6011, + "step": 4467 + }, + { + "epoch": 5.71904, + "grad_norm": 0.7143272757530212, + "learning_rate": 4.1080432172869146e-05, + "loss": 0.6313, + "step": 4468 + }, + { + "epoch": 5.72032, + "grad_norm": 0.7367825508117676, + "learning_rate": 4.107843137254902e-05, + "loss": 0.6217, + "step": 4469 + }, + { + "epoch": 5.7216000000000005, + "grad_norm": 0.7655727863311768, + "learning_rate": 4.107643057222889e-05, + "loss": 0.6714, + "step": 4470 + }, + { + "epoch": 5.72288, + "grad_norm": 0.7177988290786743, + "learning_rate": 4.107442977190877e-05, + "loss": 0.5762, + "step": 4471 + }, + { + "epoch": 5.72416, + "grad_norm": 0.7632842063903809, + "learning_rate": 4.107242897158864e-05, + "loss": 0.6587, + "step": 4472 + }, + { + "epoch": 5.72544, + "grad_norm": 0.7221323251724243, + "learning_rate": 4.107042817126851e-05, + "loss": 0.6062, + "step": 4473 + }, + { + "epoch": 5.72672, + "grad_norm": 0.7250041365623474, + "learning_rate": 4.1068427370948384e-05, + "loss": 0.609, + "step": 4474 + }, + { + "epoch": 5.728, + "grad_norm": 0.7287482619285583, + "learning_rate": 4.106642657062825e-05, + "loss": 0.6225, + "step": 4475 + }, + { + "epoch": 5.72928, + "grad_norm": 0.7629007697105408, + "learning_rate": 4.106442577030812e-05, + "loss": 0.6546, + "step": 4476 + }, + { + "epoch": 5.73056, + "grad_norm": 0.7124512791633606, + "learning_rate": 4.106242496998799e-05, + "loss": 0.6365, + "step": 4477 + }, + { + "epoch": 5.73184, + "grad_norm": 0.7133152484893799, + "learning_rate": 4.106042416966787e-05, 
+ "loss": 0.6063, + "step": 4478 + }, + { + "epoch": 5.7331199999999995, + "grad_norm": 0.7545233964920044, + "learning_rate": 4.1058423369347744e-05, + "loss": 0.6563, + "step": 4479 + }, + { + "epoch": 5.7344, + "grad_norm": 0.755072832107544, + "learning_rate": 4.1056422569027615e-05, + "loss": 0.6586, + "step": 4480 + }, + { + "epoch": 5.73568, + "grad_norm": 0.7463574409484863, + "learning_rate": 4.105442176870749e-05, + "loss": 0.6427, + "step": 4481 + }, + { + "epoch": 5.73696, + "grad_norm": 0.7250651121139526, + "learning_rate": 4.105242096838736e-05, + "loss": 0.6494, + "step": 4482 + }, + { + "epoch": 5.73824, + "grad_norm": 0.7188123464584351, + "learning_rate": 4.1050420168067224e-05, + "loss": 0.6417, + "step": 4483 + }, + { + "epoch": 5.73952, + "grad_norm": 0.758231520652771, + "learning_rate": 4.1048419367747096e-05, + "loss": 0.6194, + "step": 4484 + }, + { + "epoch": 5.7408, + "grad_norm": 0.7206960320472717, + "learning_rate": 4.1046418567426975e-05, + "loss": 0.6222, + "step": 4485 + }, + { + "epoch": 5.74208, + "grad_norm": 0.7939612865447998, + "learning_rate": 4.1044417767106847e-05, + "loss": 0.6451, + "step": 4486 + }, + { + "epoch": 5.74336, + "grad_norm": 0.7785453796386719, + "learning_rate": 4.104241696678672e-05, + "loss": 0.6497, + "step": 4487 + }, + { + "epoch": 5.74464, + "grad_norm": 0.7556130290031433, + "learning_rate": 4.104041616646659e-05, + "loss": 0.6284, + "step": 4488 + }, + { + "epoch": 5.74592, + "grad_norm": 0.6738141775131226, + "learning_rate": 4.103841536614646e-05, + "loss": 0.6161, + "step": 4489 + }, + { + "epoch": 5.7472, + "grad_norm": 0.7317538857460022, + "learning_rate": 4.1036414565826334e-05, + "loss": 0.6297, + "step": 4490 + }, + { + "epoch": 5.74848, + "grad_norm": 0.7428886890411377, + "learning_rate": 4.10344137655062e-05, + "loss": 0.5963, + "step": 4491 + }, + { + "epoch": 5.74976, + "grad_norm": 0.7947216629981995, + "learning_rate": 4.103241296518608e-05, + "loss": 0.695, + "step": 4492 + }, + { 
+ "epoch": 5.75104, + "grad_norm": 0.8010162115097046, + "learning_rate": 4.103041216486595e-05, + "loss": 0.7321, + "step": 4493 + }, + { + "epoch": 5.75232, + "grad_norm": 0.7321368455886841, + "learning_rate": 4.102841136454582e-05, + "loss": 0.6789, + "step": 4494 + }, + { + "epoch": 5.7536000000000005, + "grad_norm": 0.7535264492034912, + "learning_rate": 4.102641056422569e-05, + "loss": 0.6474, + "step": 4495 + }, + { + "epoch": 5.75488, + "grad_norm": 0.7041172385215759, + "learning_rate": 4.1024409763905565e-05, + "loss": 0.5921, + "step": 4496 + }, + { + "epoch": 5.75616, + "grad_norm": 0.7587586641311646, + "learning_rate": 4.102240896358544e-05, + "loss": 0.6006, + "step": 4497 + }, + { + "epoch": 5.75744, + "grad_norm": 0.7653579115867615, + "learning_rate": 4.102040816326531e-05, + "loss": 0.6526, + "step": 4498 + }, + { + "epoch": 5.75872, + "grad_norm": 0.7497640252113342, + "learning_rate": 4.101840736294518e-05, + "loss": 0.6286, + "step": 4499 + }, + { + "epoch": 5.76, + "grad_norm": 0.7529040575027466, + "learning_rate": 4.101640656262505e-05, + "loss": 0.6229, + "step": 4500 + }, + { + "epoch": 5.76128, + "grad_norm": 0.7931720018386841, + "learning_rate": 4.1014405762304924e-05, + "loss": 0.6423, + "step": 4501 + }, + { + "epoch": 5.76256, + "grad_norm": 0.7556605935096741, + "learning_rate": 4.1012404961984796e-05, + "loss": 0.5925, + "step": 4502 + }, + { + "epoch": 5.76384, + "grad_norm": 0.7954467535018921, + "learning_rate": 4.101040416166467e-05, + "loss": 0.6246, + "step": 4503 + }, + { + "epoch": 5.76512, + "grad_norm": 0.7651516199111938, + "learning_rate": 4.100840336134454e-05, + "loss": 0.6147, + "step": 4504 + }, + { + "epoch": 5.7664, + "grad_norm": 0.758766770362854, + "learning_rate": 4.100640256102441e-05, + "loss": 0.6635, + "step": 4505 + }, + { + "epoch": 5.76768, + "grad_norm": 0.7414392828941345, + "learning_rate": 4.1004401760704284e-05, + "loss": 0.6027, + "step": 4506 + }, + { + "epoch": 5.76896, + "grad_norm": 
0.7388911247253418, + "learning_rate": 4.1002400960384156e-05, + "loss": 0.6582, + "step": 4507 + }, + { + "epoch": 5.77024, + "grad_norm": 0.7597686052322388, + "learning_rate": 4.100040016006403e-05, + "loss": 0.6223, + "step": 4508 + }, + { + "epoch": 5.77152, + "grad_norm": 0.7913456559181213, + "learning_rate": 4.09983993597439e-05, + "loss": 0.6457, + "step": 4509 + }, + { + "epoch": 5.7728, + "grad_norm": 0.7759582996368408, + "learning_rate": 4.099639855942377e-05, + "loss": 0.6386, + "step": 4510 + }, + { + "epoch": 5.77408, + "grad_norm": 0.7403820753097534, + "learning_rate": 4.099439775910364e-05, + "loss": 0.6217, + "step": 4511 + }, + { + "epoch": 5.77536, + "grad_norm": 0.7305609583854675, + "learning_rate": 4.0992396958783515e-05, + "loss": 0.6371, + "step": 4512 + }, + { + "epoch": 5.77664, + "grad_norm": 0.7057348489761353, + "learning_rate": 4.099039615846339e-05, + "loss": 0.62, + "step": 4513 + }, + { + "epoch": 5.77792, + "grad_norm": 0.7573639750480652, + "learning_rate": 4.098839535814326e-05, + "loss": 0.639, + "step": 4514 + }, + { + "epoch": 5.7792, + "grad_norm": 0.7069675326347351, + "learning_rate": 4.098639455782313e-05, + "loss": 0.5887, + "step": 4515 + }, + { + "epoch": 5.78048, + "grad_norm": 0.7522568702697754, + "learning_rate": 4.0984393757503e-05, + "loss": 0.6604, + "step": 4516 + }, + { + "epoch": 5.78176, + "grad_norm": 0.7098150253295898, + "learning_rate": 4.0982392957182874e-05, + "loss": 0.6172, + "step": 4517 + }, + { + "epoch": 5.78304, + "grad_norm": 0.7658361792564392, + "learning_rate": 4.0980392156862746e-05, + "loss": 0.7318, + "step": 4518 + }, + { + "epoch": 5.78432, + "grad_norm": 0.7216033935546875, + "learning_rate": 4.097839135654262e-05, + "loss": 0.6195, + "step": 4519 + }, + { + "epoch": 5.7856, + "grad_norm": 0.7166069149971008, + "learning_rate": 4.0976390556222497e-05, + "loss": 0.5746, + "step": 4520 + }, + { + "epoch": 5.78688, + "grad_norm": 0.7510432004928589, + "learning_rate": 
4.097438975590236e-05, + "loss": 0.6373, + "step": 4521 + }, + { + "epoch": 5.7881599999999995, + "grad_norm": 0.700448751449585, + "learning_rate": 4.0972388955582233e-05, + "loss": 0.5698, + "step": 4522 + }, + { + "epoch": 5.78944, + "grad_norm": 0.7070586681365967, + "learning_rate": 4.0970388155262105e-05, + "loss": 0.6247, + "step": 4523 + }, + { + "epoch": 5.79072, + "grad_norm": 0.6846281886100769, + "learning_rate": 4.096838735494198e-05, + "loss": 0.625, + "step": 4524 + }, + { + "epoch": 5.792, + "grad_norm": 0.745806872844696, + "learning_rate": 4.096638655462185e-05, + "loss": 0.6629, + "step": 4525 + }, + { + "epoch": 5.79328, + "grad_norm": 0.7132704854011536, + "learning_rate": 4.096438575430172e-05, + "loss": 0.6222, + "step": 4526 + }, + { + "epoch": 5.79456, + "grad_norm": 0.751122236251831, + "learning_rate": 4.09623849539816e-05, + "loss": 0.6406, + "step": 4527 + }, + { + "epoch": 5.79584, + "grad_norm": 0.7574059367179871, + "learning_rate": 4.096038415366147e-05, + "loss": 0.6449, + "step": 4528 + }, + { + "epoch": 5.79712, + "grad_norm": 0.6792533993721008, + "learning_rate": 4.0958383353341336e-05, + "loss": 0.5987, + "step": 4529 + }, + { + "epoch": 5.7984, + "grad_norm": 0.6878370046615601, + "learning_rate": 4.095638255302121e-05, + "loss": 0.5901, + "step": 4530 + }, + { + "epoch": 5.79968, + "grad_norm": 0.7586196660995483, + "learning_rate": 4.095438175270108e-05, + "loss": 0.6344, + "step": 4531 + }, + { + "epoch": 5.80096, + "grad_norm": 0.7338054180145264, + "learning_rate": 4.095238095238095e-05, + "loss": 0.6223, + "step": 4532 + }, + { + "epoch": 5.80224, + "grad_norm": 0.7303552627563477, + "learning_rate": 4.0950380152060824e-05, + "loss": 0.5915, + "step": 4533 + }, + { + "epoch": 5.80352, + "grad_norm": 0.7648298740386963, + "learning_rate": 4.0948379351740696e-05, + "loss": 0.615, + "step": 4534 + }, + { + "epoch": 5.8048, + "grad_norm": 0.8030759692192078, + "learning_rate": 4.0946378551420574e-05, + "loss": 0.6604, + 
"step": 4535 + }, + { + "epoch": 5.80608, + "grad_norm": 0.8049156069755554, + "learning_rate": 4.0944377751100446e-05, + "loss": 0.7182, + "step": 4536 + }, + { + "epoch": 5.80736, + "grad_norm": 0.7284945845603943, + "learning_rate": 4.094237695078031e-05, + "loss": 0.6061, + "step": 4537 + }, + { + "epoch": 5.8086400000000005, + "grad_norm": 0.6970748901367188, + "learning_rate": 4.094037615046018e-05, + "loss": 0.6016, + "step": 4538 + }, + { + "epoch": 5.80992, + "grad_norm": 0.7719080448150635, + "learning_rate": 4.0938375350140055e-05, + "loss": 0.7058, + "step": 4539 + }, + { + "epoch": 5.8112, + "grad_norm": 0.7578266859054565, + "learning_rate": 4.093637454981993e-05, + "loss": 0.6276, + "step": 4540 + }, + { + "epoch": 5.81248, + "grad_norm": 0.7241922616958618, + "learning_rate": 4.09343737494998e-05, + "loss": 0.6167, + "step": 4541 + }, + { + "epoch": 5.81376, + "grad_norm": 0.8004947304725647, + "learning_rate": 4.093237294917968e-05, + "loss": 0.7253, + "step": 4542 + }, + { + "epoch": 5.81504, + "grad_norm": 0.7189851999282837, + "learning_rate": 4.093037214885955e-05, + "loss": 0.6462, + "step": 4543 + }, + { + "epoch": 5.81632, + "grad_norm": 0.7319324016571045, + "learning_rate": 4.092837134853942e-05, + "loss": 0.6575, + "step": 4544 + }, + { + "epoch": 5.8176, + "grad_norm": 0.7121530175209045, + "learning_rate": 4.0926370548219286e-05, + "loss": 0.6186, + "step": 4545 + }, + { + "epoch": 5.81888, + "grad_norm": 0.7266112565994263, + "learning_rate": 4.092436974789916e-05, + "loss": 0.6121, + "step": 4546 + }, + { + "epoch": 5.82016, + "grad_norm": 0.7524951696395874, + "learning_rate": 4.092236894757903e-05, + "loss": 0.6035, + "step": 4547 + }, + { + "epoch": 5.82144, + "grad_norm": 0.7526006698608398, + "learning_rate": 4.09203681472589e-05, + "loss": 0.6494, + "step": 4548 + }, + { + "epoch": 5.82272, + "grad_norm": 0.7238689064979553, + "learning_rate": 4.091836734693878e-05, + "loss": 0.6186, + "step": 4549 + }, + { + "epoch": 5.824, + 
"grad_norm": 0.7418443560600281, + "learning_rate": 4.091636654661865e-05, + "loss": 0.6611, + "step": 4550 + }, + { + "epoch": 5.82528, + "grad_norm": 0.7511699795722961, + "learning_rate": 4.0914365746298524e-05, + "loss": 0.6564, + "step": 4551 + }, + { + "epoch": 5.82656, + "grad_norm": 0.6964154243469238, + "learning_rate": 4.0912364945978396e-05, + "loss": 0.6026, + "step": 4552 + }, + { + "epoch": 5.82784, + "grad_norm": 0.7538017630577087, + "learning_rate": 4.091036414565826e-05, + "loss": 0.7241, + "step": 4553 + }, + { + "epoch": 5.82912, + "grad_norm": 0.7111532092094421, + "learning_rate": 4.090836334533813e-05, + "loss": 0.6199, + "step": 4554 + }, + { + "epoch": 5.8304, + "grad_norm": 0.7004513740539551, + "learning_rate": 4.0906362545018005e-05, + "loss": 0.6281, + "step": 4555 + }, + { + "epoch": 5.83168, + "grad_norm": 0.743412971496582, + "learning_rate": 4.090436174469788e-05, + "loss": 0.6546, + "step": 4556 + }, + { + "epoch": 5.83296, + "grad_norm": 0.7199406623840332, + "learning_rate": 4.0902360944377755e-05, + "loss": 0.6265, + "step": 4557 + }, + { + "epoch": 5.83424, + "grad_norm": 0.7323152422904968, + "learning_rate": 4.090036014405763e-05, + "loss": 0.6174, + "step": 4558 + }, + { + "epoch": 5.83552, + "grad_norm": 0.7763957381248474, + "learning_rate": 4.08983593437375e-05, + "loss": 0.6362, + "step": 4559 + }, + { + "epoch": 5.8368, + "grad_norm": 0.7239270806312561, + "learning_rate": 4.089635854341737e-05, + "loss": 0.594, + "step": 4560 + }, + { + "epoch": 5.83808, + "grad_norm": 0.7253373265266418, + "learning_rate": 4.0894357743097236e-05, + "loss": 0.6342, + "step": 4561 + }, + { + "epoch": 5.83936, + "grad_norm": 0.7696877717971802, + "learning_rate": 4.089235694277711e-05, + "loss": 0.6545, + "step": 4562 + }, + { + "epoch": 5.8406400000000005, + "grad_norm": 0.7727090716362, + "learning_rate": 4.0890356142456986e-05, + "loss": 0.6611, + "step": 4563 + }, + { + "epoch": 5.84192, + "grad_norm": 0.6735355854034424, + 
"learning_rate": 4.088835534213686e-05, + "loss": 0.5948, + "step": 4564 + }, + { + "epoch": 5.8431999999999995, + "grad_norm": 0.7272473573684692, + "learning_rate": 4.088635454181673e-05, + "loss": 0.6286, + "step": 4565 + }, + { + "epoch": 5.84448, + "grad_norm": 0.760036289691925, + "learning_rate": 4.08843537414966e-05, + "loss": 0.6463, + "step": 4566 + }, + { + "epoch": 5.84576, + "grad_norm": 0.7171865701675415, + "learning_rate": 4.0882352941176474e-05, + "loss": 0.6485, + "step": 4567 + }, + { + "epoch": 5.84704, + "grad_norm": 0.7382311820983887, + "learning_rate": 4.0880352140856346e-05, + "loss": 0.6391, + "step": 4568 + }, + { + "epoch": 5.84832, + "grad_norm": 0.7612113356590271, + "learning_rate": 4.087835134053621e-05, + "loss": 0.6734, + "step": 4569 + }, + { + "epoch": 5.8496, + "grad_norm": 0.7614792585372925, + "learning_rate": 4.087635054021609e-05, + "loss": 0.635, + "step": 4570 + }, + { + "epoch": 5.85088, + "grad_norm": 0.8073767423629761, + "learning_rate": 4.087434973989596e-05, + "loss": 0.6494, + "step": 4571 + }, + { + "epoch": 5.85216, + "grad_norm": 0.7585006952285767, + "learning_rate": 4.087234893957583e-05, + "loss": 0.6522, + "step": 4572 + }, + { + "epoch": 5.85344, + "grad_norm": 0.7650215029716492, + "learning_rate": 4.0870348139255705e-05, + "loss": 0.6874, + "step": 4573 + }, + { + "epoch": 5.85472, + "grad_norm": 0.7307962775230408, + "learning_rate": 4.086834733893558e-05, + "loss": 0.6017, + "step": 4574 + }, + { + "epoch": 5.856, + "grad_norm": 0.7022853493690491, + "learning_rate": 4.086634653861545e-05, + "loss": 0.6255, + "step": 4575 + }, + { + "epoch": 5.85728, + "grad_norm": 0.7452864050865173, + "learning_rate": 4.086434573829532e-05, + "loss": 0.6482, + "step": 4576 + }, + { + "epoch": 5.85856, + "grad_norm": 0.686421811580658, + "learning_rate": 4.086234493797519e-05, + "loss": 0.5918, + "step": 4577 + }, + { + "epoch": 5.85984, + "grad_norm": 0.7287219166755676, + "learning_rate": 4.0860344137655064e-05, + 
"loss": 0.6691, + "step": 4578 + }, + { + "epoch": 5.86112, + "grad_norm": 0.7395564913749695, + "learning_rate": 4.0858343337334936e-05, + "loss": 0.6626, + "step": 4579 + }, + { + "epoch": 5.8624, + "grad_norm": 0.727815568447113, + "learning_rate": 4.085634253701481e-05, + "loss": 0.6524, + "step": 4580 + }, + { + "epoch": 5.8636800000000004, + "grad_norm": 0.7862719893455505, + "learning_rate": 4.085434173669468e-05, + "loss": 0.6451, + "step": 4581 + }, + { + "epoch": 5.86496, + "grad_norm": 0.7924978137016296, + "learning_rate": 4.085234093637455e-05, + "loss": 0.6689, + "step": 4582 + }, + { + "epoch": 5.86624, + "grad_norm": 0.7216591835021973, + "learning_rate": 4.0850340136054423e-05, + "loss": 0.6297, + "step": 4583 + }, + { + "epoch": 5.86752, + "grad_norm": 0.7293557524681091, + "learning_rate": 4.0848339335734295e-05, + "loss": 0.65, + "step": 4584 + }, + { + "epoch": 5.8688, + "grad_norm": 0.7389594912528992, + "learning_rate": 4.084633853541417e-05, + "loss": 0.6201, + "step": 4585 + }, + { + "epoch": 5.87008, + "grad_norm": 0.7208465337753296, + "learning_rate": 4.084433773509404e-05, + "loss": 0.5968, + "step": 4586 + }, + { + "epoch": 5.87136, + "grad_norm": 0.7426589131355286, + "learning_rate": 4.084233693477391e-05, + "loss": 0.6313, + "step": 4587 + }, + { + "epoch": 5.87264, + "grad_norm": 0.7646098732948303, + "learning_rate": 4.084033613445378e-05, + "loss": 0.7042, + "step": 4588 + }, + { + "epoch": 5.87392, + "grad_norm": 0.7568240165710449, + "learning_rate": 4.0838335334133655e-05, + "loss": 0.6097, + "step": 4589 + }, + { + "epoch": 5.8751999999999995, + "grad_norm": 0.7447047829627991, + "learning_rate": 4.0836334533813526e-05, + "loss": 0.6462, + "step": 4590 + }, + { + "epoch": 5.87648, + "grad_norm": 0.7372913360595703, + "learning_rate": 4.0834333733493405e-05, + "loss": 0.6434, + "step": 4591 + }, + { + "epoch": 5.87776, + "grad_norm": 0.7013605833053589, + "learning_rate": 4.083233293317327e-05, + "loss": 0.6086, + "step": 4592 
+ }, + { + "epoch": 5.87904, + "grad_norm": 0.7706699967384338, + "learning_rate": 4.083033213285314e-05, + "loss": 0.6739, + "step": 4593 + }, + { + "epoch": 5.88032, + "grad_norm": 0.7354162931442261, + "learning_rate": 4.0828331332533014e-05, + "loss": 0.6037, + "step": 4594 + }, + { + "epoch": 5.8816, + "grad_norm": 0.7437216639518738, + "learning_rate": 4.0826330532212886e-05, + "loss": 0.6416, + "step": 4595 + }, + { + "epoch": 5.88288, + "grad_norm": 0.7383934259414673, + "learning_rate": 4.082432973189276e-05, + "loss": 0.6556, + "step": 4596 + }, + { + "epoch": 5.88416, + "grad_norm": 0.7448809146881104, + "learning_rate": 4.082232893157263e-05, + "loss": 0.6923, + "step": 4597 + }, + { + "epoch": 5.88544, + "grad_norm": 0.7986266613006592, + "learning_rate": 4.082032813125251e-05, + "loss": 0.6663, + "step": 4598 + }, + { + "epoch": 5.88672, + "grad_norm": 0.7175683975219727, + "learning_rate": 4.081832733093238e-05, + "loss": 0.6453, + "step": 4599 + }, + { + "epoch": 5.888, + "grad_norm": 0.7115585803985596, + "learning_rate": 4.0816326530612245e-05, + "loss": 0.5905, + "step": 4600 + }, + { + "epoch": 5.88928, + "grad_norm": 0.6863852739334106, + "learning_rate": 4.081432573029212e-05, + "loss": 0.5642, + "step": 4601 + }, + { + "epoch": 5.89056, + "grad_norm": 0.6771440505981445, + "learning_rate": 4.081232492997199e-05, + "loss": 0.565, + "step": 4602 + }, + { + "epoch": 5.89184, + "grad_norm": 0.6936319470405579, + "learning_rate": 4.081032412965186e-05, + "loss": 0.5993, + "step": 4603 + }, + { + "epoch": 5.89312, + "grad_norm": 0.7660918831825256, + "learning_rate": 4.080832332933173e-05, + "loss": 0.6337, + "step": 4604 + }, + { + "epoch": 5.8944, + "grad_norm": 0.7964367270469666, + "learning_rate": 4.080632252901161e-05, + "loss": 0.6357, + "step": 4605 + }, + { + "epoch": 5.8956800000000005, + "grad_norm": 0.7397758364677429, + "learning_rate": 4.080432172869148e-05, + "loss": 0.5938, + "step": 4606 + }, + { + "epoch": 5.89696, + "grad_norm": 
0.7152984738349915, + "learning_rate": 4.0802320928371355e-05, + "loss": 0.5801, + "step": 4607 + }, + { + "epoch": 5.89824, + "grad_norm": 0.7661724090576172, + "learning_rate": 4.080032012805122e-05, + "loss": 0.6988, + "step": 4608 + }, + { + "epoch": 5.89952, + "grad_norm": 0.7536041140556335, + "learning_rate": 4.079831932773109e-05, + "loss": 0.6331, + "step": 4609 + }, + { + "epoch": 5.9008, + "grad_norm": 0.7481626868247986, + "learning_rate": 4.0796318527410964e-05, + "loss": 0.6484, + "step": 4610 + }, + { + "epoch": 5.90208, + "grad_norm": 0.7638240456581116, + "learning_rate": 4.0794317727090835e-05, + "loss": 0.6959, + "step": 4611 + }, + { + "epoch": 5.90336, + "grad_norm": 0.7237250804901123, + "learning_rate": 4.0792316926770714e-05, + "loss": 0.605, + "step": 4612 + }, + { + "epoch": 5.90464, + "grad_norm": 0.7751877903938293, + "learning_rate": 4.0790316126450586e-05, + "loss": 0.6425, + "step": 4613 + }, + { + "epoch": 5.90592, + "grad_norm": 0.7242938280105591, + "learning_rate": 4.078831532613046e-05, + "loss": 0.616, + "step": 4614 + }, + { + "epoch": 5.9072, + "grad_norm": 0.7469364404678345, + "learning_rate": 4.078631452581033e-05, + "loss": 0.6072, + "step": 4615 + }, + { + "epoch": 5.90848, + "grad_norm": 0.728795051574707, + "learning_rate": 4.0784313725490195e-05, + "loss": 0.6552, + "step": 4616 + }, + { + "epoch": 5.90976, + "grad_norm": 0.7057138085365295, + "learning_rate": 4.078231292517007e-05, + "loss": 0.6434, + "step": 4617 + }, + { + "epoch": 5.91104, + "grad_norm": 0.7345725893974304, + "learning_rate": 4.078031212484994e-05, + "loss": 0.6371, + "step": 4618 + }, + { + "epoch": 5.91232, + "grad_norm": 0.7135785818099976, + "learning_rate": 4.077831132452982e-05, + "loss": 0.6094, + "step": 4619 + }, + { + "epoch": 5.9136, + "grad_norm": 0.7003117799758911, + "learning_rate": 4.077631052420969e-05, + "loss": 0.6538, + "step": 4620 + }, + { + "epoch": 5.91488, + "grad_norm": 0.7230803966522217, + "learning_rate": 
4.077430972388956e-05, + "loss": 0.6399, + "step": 4621 + }, + { + "epoch": 5.91616, + "grad_norm": 0.7477660179138184, + "learning_rate": 4.077230892356943e-05, + "loss": 0.6334, + "step": 4622 + }, + { + "epoch": 5.91744, + "grad_norm": 0.7172017693519592, + "learning_rate": 4.0770308123249305e-05, + "loss": 0.6714, + "step": 4623 + }, + { + "epoch": 5.91872, + "grad_norm": 0.7592531442642212, + "learning_rate": 4.076830732292917e-05, + "loss": 0.6453, + "step": 4624 + }, + { + "epoch": 5.92, + "grad_norm": 0.6898649334907532, + "learning_rate": 4.076630652260904e-05, + "loss": 0.6417, + "step": 4625 + }, + { + "epoch": 5.92128, + "grad_norm": 0.7369380593299866, + "learning_rate": 4.076430572228892e-05, + "loss": 0.6391, + "step": 4626 + }, + { + "epoch": 5.92256, + "grad_norm": 0.7180305123329163, + "learning_rate": 4.076230492196879e-05, + "loss": 0.5914, + "step": 4627 + }, + { + "epoch": 5.92384, + "grad_norm": 0.7108614444732666, + "learning_rate": 4.0760304121648664e-05, + "loss": 0.629, + "step": 4628 + }, + { + "epoch": 5.92512, + "grad_norm": 0.753671407699585, + "learning_rate": 4.0758303321328536e-05, + "loss": 0.6227, + "step": 4629 + }, + { + "epoch": 5.9264, + "grad_norm": 0.7389430403709412, + "learning_rate": 4.075630252100841e-05, + "loss": 0.6287, + "step": 4630 + }, + { + "epoch": 5.92768, + "grad_norm": 0.7096794843673706, + "learning_rate": 4.075430172068828e-05, + "loss": 0.6025, + "step": 4631 + }, + { + "epoch": 5.92896, + "grad_norm": 0.7167031764984131, + "learning_rate": 4.0752300920368144e-05, + "loss": 0.5903, + "step": 4632 + }, + { + "epoch": 5.9302399999999995, + "grad_norm": 0.7445734143257141, + "learning_rate": 4.075030012004802e-05, + "loss": 0.6246, + "step": 4633 + }, + { + "epoch": 5.93152, + "grad_norm": 0.7259752750396729, + "learning_rate": 4.0748299319727895e-05, + "loss": 0.6677, + "step": 4634 + }, + { + "epoch": 5.9328, + "grad_norm": 0.7165055871009827, + "learning_rate": 4.074629851940777e-05, + "loss": 0.6135, + 
"step": 4635 + }, + { + "epoch": 5.93408, + "grad_norm": 0.7381249666213989, + "learning_rate": 4.074429771908764e-05, + "loss": 0.616, + "step": 4636 + }, + { + "epoch": 5.93536, + "grad_norm": 0.7970700263977051, + "learning_rate": 4.074229691876751e-05, + "loss": 0.7101, + "step": 4637 + }, + { + "epoch": 5.93664, + "grad_norm": 0.7447617053985596, + "learning_rate": 4.074029611844738e-05, + "loss": 0.6006, + "step": 4638 + }, + { + "epoch": 5.93792, + "grad_norm": 0.7397119402885437, + "learning_rate": 4.0738295318127254e-05, + "loss": 0.6462, + "step": 4639 + }, + { + "epoch": 5.9392, + "grad_norm": 0.6990648508071899, + "learning_rate": 4.073629451780712e-05, + "loss": 0.6027, + "step": 4640 + }, + { + "epoch": 5.94048, + "grad_norm": 0.739801824092865, + "learning_rate": 4.0734293717487e-05, + "loss": 0.6539, + "step": 4641 + }, + { + "epoch": 5.94176, + "grad_norm": 0.7326043248176575, + "learning_rate": 4.073229291716687e-05, + "loss": 0.6548, + "step": 4642 + }, + { + "epoch": 5.94304, + "grad_norm": 0.733765184879303, + "learning_rate": 4.073029211684674e-05, + "loss": 0.6622, + "step": 4643 + }, + { + "epoch": 5.94432, + "grad_norm": 0.7281367778778076, + "learning_rate": 4.0728291316526614e-05, + "loss": 0.6045, + "step": 4644 + }, + { + "epoch": 5.9456, + "grad_norm": 0.7234017848968506, + "learning_rate": 4.0726290516206485e-05, + "loss": 0.5911, + "step": 4645 + }, + { + "epoch": 5.94688, + "grad_norm": 0.7438223958015442, + "learning_rate": 4.072428971588636e-05, + "loss": 0.6348, + "step": 4646 + }, + { + "epoch": 5.94816, + "grad_norm": 0.7301828861236572, + "learning_rate": 4.072228891556623e-05, + "loss": 0.6204, + "step": 4647 + }, + { + "epoch": 5.94944, + "grad_norm": 0.7404515743255615, + "learning_rate": 4.07202881152461e-05, + "loss": 0.6279, + "step": 4648 + }, + { + "epoch": 5.9507200000000005, + "grad_norm": 0.6974015831947327, + "learning_rate": 4.071828731492597e-05, + "loss": 0.6196, + "step": 4649 + }, + { + "epoch": 5.952, + 
"grad_norm": 0.7563602924346924, + "learning_rate": 4.0716286514605845e-05, + "loss": 0.6058, + "step": 4650 + }, + { + "epoch": 5.95328, + "grad_norm": 0.7107272744178772, + "learning_rate": 4.0714285714285717e-05, + "loss": 0.6388, + "step": 4651 + }, + { + "epoch": 5.95456, + "grad_norm": 0.7473395466804504, + "learning_rate": 4.071228491396559e-05, + "loss": 0.586, + "step": 4652 + }, + { + "epoch": 5.95584, + "grad_norm": 0.6987488865852356, + "learning_rate": 4.071028411364546e-05, + "loss": 0.5951, + "step": 4653 + }, + { + "epoch": 5.95712, + "grad_norm": 0.7390433549880981, + "learning_rate": 4.070828331332533e-05, + "loss": 0.6402, + "step": 4654 + }, + { + "epoch": 5.9584, + "grad_norm": 0.7470793128013611, + "learning_rate": 4.0706282513005204e-05, + "loss": 0.6361, + "step": 4655 + }, + { + "epoch": 5.95968, + "grad_norm": 0.7746595740318298, + "learning_rate": 4.0704281712685076e-05, + "loss": 0.667, + "step": 4656 + }, + { + "epoch": 5.96096, + "grad_norm": 0.7226885557174683, + "learning_rate": 4.070228091236495e-05, + "loss": 0.6213, + "step": 4657 + }, + { + "epoch": 5.9622399999999995, + "grad_norm": 0.7263903021812439, + "learning_rate": 4.070028011204482e-05, + "loss": 0.6263, + "step": 4658 + }, + { + "epoch": 5.96352, + "grad_norm": 0.7590324282646179, + "learning_rate": 4.069827931172469e-05, + "loss": 0.6146, + "step": 4659 + }, + { + "epoch": 5.9648, + "grad_norm": 0.7753758430480957, + "learning_rate": 4.069627851140456e-05, + "loss": 0.6689, + "step": 4660 + }, + { + "epoch": 5.96608, + "grad_norm": 0.7328972220420837, + "learning_rate": 4.0694277711084435e-05, + "loss": 0.6338, + "step": 4661 + }, + { + "epoch": 5.96736, + "grad_norm": 0.7797688841819763, + "learning_rate": 4.069227691076431e-05, + "loss": 0.6802, + "step": 4662 + }, + { + "epoch": 5.96864, + "grad_norm": 0.741576075553894, + "learning_rate": 4.069027611044418e-05, + "loss": 0.6109, + "step": 4663 + }, + { + "epoch": 5.96992, + "grad_norm": 0.7838340997695923, + 
"learning_rate": 4.068827531012405e-05, + "loss": 0.6586, + "step": 4664 + }, + { + "epoch": 5.9712, + "grad_norm": 0.7517712712287903, + "learning_rate": 4.068627450980392e-05, + "loss": 0.6261, + "step": 4665 + }, + { + "epoch": 5.97248, + "grad_norm": 0.803371787071228, + "learning_rate": 4.0684273709483794e-05, + "loss": 0.6969, + "step": 4666 + }, + { + "epoch": 5.97376, + "grad_norm": 0.7221397161483765, + "learning_rate": 4.0682272909163666e-05, + "loss": 0.6129, + "step": 4667 + }, + { + "epoch": 5.97504, + "grad_norm": 0.7079233527183533, + "learning_rate": 4.068027210884354e-05, + "loss": 0.6356, + "step": 4668 + }, + { + "epoch": 5.97632, + "grad_norm": 0.6837298274040222, + "learning_rate": 4.067827130852342e-05, + "loss": 0.6188, + "step": 4669 + }, + { + "epoch": 5.9776, + "grad_norm": 0.7204573154449463, + "learning_rate": 4.067627050820328e-05, + "loss": 0.6068, + "step": 4670 + }, + { + "epoch": 5.97888, + "grad_norm": 0.7004908919334412, + "learning_rate": 4.0674269707883154e-05, + "loss": 0.5853, + "step": 4671 + }, + { + "epoch": 5.98016, + "grad_norm": 0.6996192932128906, + "learning_rate": 4.0672268907563026e-05, + "loss": 0.5958, + "step": 4672 + }, + { + "epoch": 5.98144, + "grad_norm": 0.7411110401153564, + "learning_rate": 4.06702681072429e-05, + "loss": 0.5999, + "step": 4673 + }, + { + "epoch": 5.9827200000000005, + "grad_norm": 0.7705451250076294, + "learning_rate": 4.066826730692277e-05, + "loss": 0.7017, + "step": 4674 + }, + { + "epoch": 5.984, + "grad_norm": 0.7225644588470459, + "learning_rate": 4.066626650660264e-05, + "loss": 0.5662, + "step": 4675 + }, + { + "epoch": 5.98528, + "grad_norm": 0.7521904706954956, + "learning_rate": 4.066426570628252e-05, + "loss": 0.6083, + "step": 4676 + }, + { + "epoch": 5.98656, + "grad_norm": 0.756615161895752, + "learning_rate": 4.066226490596239e-05, + "loss": 0.6185, + "step": 4677 + }, + { + "epoch": 5.98784, + "grad_norm": 0.733359158039093, + "learning_rate": 4.066026410564226e-05, + 
"loss": 0.6312, + "step": 4678 + }, + { + "epoch": 5.98912, + "grad_norm": 0.774350643157959, + "learning_rate": 4.065826330532213e-05, + "loss": 0.6328, + "step": 4679 + }, + { + "epoch": 5.9904, + "grad_norm": 0.7676503658294678, + "learning_rate": 4.0656262505002e-05, + "loss": 0.6359, + "step": 4680 + }, + { + "epoch": 5.99168, + "grad_norm": 0.7357990741729736, + "learning_rate": 4.065426170468187e-05, + "loss": 0.6147, + "step": 4681 + }, + { + "epoch": 5.99296, + "grad_norm": 0.7393483519554138, + "learning_rate": 4.0652260904361744e-05, + "loss": 0.6331, + "step": 4682 + }, + { + "epoch": 5.99424, + "grad_norm": 0.7861780524253845, + "learning_rate": 4.065026010404162e-05, + "loss": 0.6724, + "step": 4683 + }, + { + "epoch": 5.99552, + "grad_norm": 0.739687979221344, + "learning_rate": 4.0648259303721495e-05, + "loss": 0.6299, + "step": 4684 + }, + { + "epoch": 5.9968, + "grad_norm": 0.7430374622344971, + "learning_rate": 4.0646258503401366e-05, + "loss": 0.645, + "step": 4685 + }, + { + "epoch": 5.99808, + "grad_norm": 0.7406571507453918, + "learning_rate": 4.064425770308123e-05, + "loss": 0.6604, + "step": 4686 + }, + { + "epoch": 5.99936, + "grad_norm": 0.729308009147644, + "learning_rate": 4.06422569027611e-05, + "loss": 0.6399, + "step": 4687 + }, + { + "epoch": 6.00064, + "grad_norm": 1.4821006059646606, + "learning_rate": 4.0640256102440975e-05, + "loss": 1.1371, + "step": 4688 + }, + { + "epoch": 6.00192, + "grad_norm": 0.7121221423149109, + "learning_rate": 4.063825530212085e-05, + "loss": 0.6295, + "step": 4689 + }, + { + "epoch": 6.0032, + "grad_norm": 0.7151453495025635, + "learning_rate": 4.0636254501800726e-05, + "loss": 0.6071, + "step": 4690 + }, + { + "epoch": 6.00448, + "grad_norm": 0.7339667677879333, + "learning_rate": 4.06342537014806e-05, + "loss": 0.6585, + "step": 4691 + }, + { + "epoch": 6.00576, + "grad_norm": 0.7404779195785522, + "learning_rate": 4.063225290116047e-05, + "loss": 0.6038, + "step": 4692 + }, + { + "epoch": 6.00704, 
+ "grad_norm": 0.7343531250953674, + "learning_rate": 4.063025210084034e-05, + "loss": 0.6758, + "step": 4693 + }, + { + "epoch": 6.00832, + "grad_norm": 0.7415891885757446, + "learning_rate": 4.0628251300520206e-05, + "loss": 0.6148, + "step": 4694 + }, + { + "epoch": 6.0096, + "grad_norm": 0.7926338911056519, + "learning_rate": 4.062625050020008e-05, + "loss": 0.6918, + "step": 4695 + }, + { + "epoch": 6.01088, + "grad_norm": 0.7479919791221619, + "learning_rate": 4.062424969987995e-05, + "loss": 0.6289, + "step": 4696 + }, + { + "epoch": 6.01216, + "grad_norm": 0.7507800459861755, + "learning_rate": 4.062224889955983e-05, + "loss": 0.6093, + "step": 4697 + }, + { + "epoch": 6.01344, + "grad_norm": 0.7076855301856995, + "learning_rate": 4.06202480992397e-05, + "loss": 0.5928, + "step": 4698 + }, + { + "epoch": 6.01472, + "grad_norm": 0.7334170341491699, + "learning_rate": 4.061824729891957e-05, + "loss": 0.6195, + "step": 4699 + }, + { + "epoch": 6.016, + "grad_norm": 0.7351385951042175, + "learning_rate": 4.0616246498599444e-05, + "loss": 0.6166, + "step": 4700 + }, + { + "epoch": 6.01728, + "grad_norm": 0.7694382071495056, + "learning_rate": 4.0614245698279316e-05, + "loss": 0.6092, + "step": 4701 + }, + { + "epoch": 6.01856, + "grad_norm": 0.7713647484779358, + "learning_rate": 4.061224489795918e-05, + "loss": 0.6331, + "step": 4702 + }, + { + "epoch": 6.01984, + "grad_norm": 0.7566975355148315, + "learning_rate": 4.061024409763905e-05, + "loss": 0.6276, + "step": 4703 + }, + { + "epoch": 6.02112, + "grad_norm": 0.6920115351676941, + "learning_rate": 4.060824329731893e-05, + "loss": 0.5527, + "step": 4704 + }, + { + "epoch": 6.0224, + "grad_norm": 0.7560464143753052, + "learning_rate": 4.0606242496998804e-05, + "loss": 0.6171, + "step": 4705 + }, + { + "epoch": 6.02368, + "grad_norm": 0.8139127492904663, + "learning_rate": 4.0604241696678675e-05, + "loss": 0.6635, + "step": 4706 + }, + { + "epoch": 6.02496, + "grad_norm": 0.7818084955215454, + "learning_rate": 
4.060224089635855e-05, + "loss": 0.6318, + "step": 4707 + }, + { + "epoch": 6.02624, + "grad_norm": 0.7569645047187805, + "learning_rate": 4.060024009603842e-05, + "loss": 0.6523, + "step": 4708 + }, + { + "epoch": 6.02752, + "grad_norm": 0.7755677103996277, + "learning_rate": 4.059823929571829e-05, + "loss": 0.6469, + "step": 4709 + }, + { + "epoch": 6.0288, + "grad_norm": 0.762385904788971, + "learning_rate": 4.0596238495398156e-05, + "loss": 0.616, + "step": 4710 + }, + { + "epoch": 6.03008, + "grad_norm": 0.7784088253974915, + "learning_rate": 4.0594237695078035e-05, + "loss": 0.6624, + "step": 4711 + }, + { + "epoch": 6.03136, + "grad_norm": 0.7360923886299133, + "learning_rate": 4.0592236894757907e-05, + "loss": 0.5984, + "step": 4712 + }, + { + "epoch": 6.03264, + "grad_norm": 0.7154642343521118, + "learning_rate": 4.059023609443778e-05, + "loss": 0.6125, + "step": 4713 + }, + { + "epoch": 6.03392, + "grad_norm": 0.7827834486961365, + "learning_rate": 4.058823529411765e-05, + "loss": 0.5754, + "step": 4714 + }, + { + "epoch": 6.0352, + "grad_norm": 0.7146974205970764, + "learning_rate": 4.058623449379752e-05, + "loss": 0.584, + "step": 4715 + }, + { + "epoch": 6.03648, + "grad_norm": 0.7260125875473022, + "learning_rate": 4.0584233693477394e-05, + "loss": 0.6057, + "step": 4716 + }, + { + "epoch": 6.03776, + "grad_norm": 0.7503035068511963, + "learning_rate": 4.0582232893157266e-05, + "loss": 0.6242, + "step": 4717 + }, + { + "epoch": 6.03904, + "grad_norm": 0.7332813143730164, + "learning_rate": 4.058023209283714e-05, + "loss": 0.5681, + "step": 4718 + }, + { + "epoch": 6.04032, + "grad_norm": 0.7520899772644043, + "learning_rate": 4.057823129251701e-05, + "loss": 0.6185, + "step": 4719 + }, + { + "epoch": 6.0416, + "grad_norm": 0.7898109555244446, + "learning_rate": 4.057623049219688e-05, + "loss": 0.6065, + "step": 4720 + }, + { + "epoch": 6.04288, + "grad_norm": 0.7639850378036499, + "learning_rate": 4.057422969187675e-05, + "loss": 0.6498, + "step": 
4721 + }, + { + "epoch": 6.04416, + "grad_norm": 0.7639596462249756, + "learning_rate": 4.0572228891556625e-05, + "loss": 0.6272, + "step": 4722 + }, + { + "epoch": 6.04544, + "grad_norm": 0.7639127969741821, + "learning_rate": 4.05702280912365e-05, + "loss": 0.6034, + "step": 4723 + }, + { + "epoch": 6.04672, + "grad_norm": 0.7917043566703796, + "learning_rate": 4.056822729091637e-05, + "loss": 0.6657, + "step": 4724 + }, + { + "epoch": 6.048, + "grad_norm": 0.7585735321044922, + "learning_rate": 4.056622649059624e-05, + "loss": 0.5913, + "step": 4725 + }, + { + "epoch": 6.04928, + "grad_norm": 0.7555089592933655, + "learning_rate": 4.056422569027611e-05, + "loss": 0.6295, + "step": 4726 + }, + { + "epoch": 6.05056, + "grad_norm": 0.7650047540664673, + "learning_rate": 4.0562224889955984e-05, + "loss": 0.5904, + "step": 4727 + }, + { + "epoch": 6.05184, + "grad_norm": 0.7548428177833557, + "learning_rate": 4.0560224089635856e-05, + "loss": 0.6117, + "step": 4728 + }, + { + "epoch": 6.05312, + "grad_norm": 0.7310490012168884, + "learning_rate": 4.055822328931573e-05, + "loss": 0.6228, + "step": 4729 + }, + { + "epoch": 6.0544, + "grad_norm": 0.7640831470489502, + "learning_rate": 4.05562224889956e-05, + "loss": 0.6002, + "step": 4730 + }, + { + "epoch": 6.05568, + "grad_norm": 0.7306190133094788, + "learning_rate": 4.055422168867547e-05, + "loss": 0.5617, + "step": 4731 + }, + { + "epoch": 6.05696, + "grad_norm": 0.767707109451294, + "learning_rate": 4.0552220888355344e-05, + "loss": 0.6073, + "step": 4732 + }, + { + "epoch": 6.05824, + "grad_norm": 0.7210080027580261, + "learning_rate": 4.0550220088035216e-05, + "loss": 0.5746, + "step": 4733 + }, + { + "epoch": 6.05952, + "grad_norm": 0.7745203971862793, + "learning_rate": 4.054821928771509e-05, + "loss": 0.6033, + "step": 4734 + }, + { + "epoch": 6.0608, + "grad_norm": 0.7430739998817444, + "learning_rate": 4.054621848739496e-05, + "loss": 0.6069, + "step": 4735 + }, + { + "epoch": 6.06208, + "grad_norm": 
0.7837331891059875, + "learning_rate": 4.054421768707483e-05, + "loss": 0.6397, + "step": 4736 + }, + { + "epoch": 6.06336, + "grad_norm": 0.7566797137260437, + "learning_rate": 4.05422168867547e-05, + "loss": 0.6293, + "step": 4737 + }, + { + "epoch": 6.06464, + "grad_norm": 0.7480584979057312, + "learning_rate": 4.0540216086434575e-05, + "loss": 0.621, + "step": 4738 + }, + { + "epoch": 6.06592, + "grad_norm": 0.797978401184082, + "learning_rate": 4.0538215286114453e-05, + "loss": 0.6476, + "step": 4739 + }, + { + "epoch": 6.0672, + "grad_norm": 0.7844875454902649, + "learning_rate": 4.053621448579432e-05, + "loss": 0.6381, + "step": 4740 + }, + { + "epoch": 6.06848, + "grad_norm": 0.7457473278045654, + "learning_rate": 4.053421368547419e-05, + "loss": 0.6175, + "step": 4741 + }, + { + "epoch": 6.06976, + "grad_norm": 0.7767608761787415, + "learning_rate": 4.053221288515406e-05, + "loss": 0.6053, + "step": 4742 + }, + { + "epoch": 6.07104, + "grad_norm": 0.7705170512199402, + "learning_rate": 4.0530212084833934e-05, + "loss": 0.6364, + "step": 4743 + }, + { + "epoch": 6.07232, + "grad_norm": 0.7489408254623413, + "learning_rate": 4.0528211284513806e-05, + "loss": 0.6041, + "step": 4744 + }, + { + "epoch": 6.0736, + "grad_norm": 0.7729488611221313, + "learning_rate": 4.052621048419368e-05, + "loss": 0.6407, + "step": 4745 + }, + { + "epoch": 6.07488, + "grad_norm": 0.7681214213371277, + "learning_rate": 4.0524209683873556e-05, + "loss": 0.6271, + "step": 4746 + }, + { + "epoch": 6.07616, + "grad_norm": 0.7444661855697632, + "learning_rate": 4.052220888355343e-05, + "loss": 0.6305, + "step": 4747 + }, + { + "epoch": 6.07744, + "grad_norm": 0.7598939538002014, + "learning_rate": 4.0520208083233293e-05, + "loss": 0.6313, + "step": 4748 + }, + { + "epoch": 6.07872, + "grad_norm": 0.7359374761581421, + "learning_rate": 4.0518207282913165e-05, + "loss": 0.5767, + "step": 4749 + }, + { + "epoch": 6.08, + "grad_norm": 0.711103081703186, + "learning_rate": 
4.051620648259304e-05, + "loss": 0.5656, + "step": 4750 + }, + { + "epoch": 6.08128, + "grad_norm": 0.7639884352684021, + "learning_rate": 4.051420568227291e-05, + "loss": 0.6345, + "step": 4751 + }, + { + "epoch": 6.08256, + "grad_norm": 0.772283673286438, + "learning_rate": 4.051220488195278e-05, + "loss": 0.6508, + "step": 4752 + }, + { + "epoch": 6.08384, + "grad_norm": 0.7561479210853577, + "learning_rate": 4.051020408163265e-05, + "loss": 0.6016, + "step": 4753 + }, + { + "epoch": 6.08512, + "grad_norm": 0.7787504196166992, + "learning_rate": 4.050820328131253e-05, + "loss": 0.6232, + "step": 4754 + }, + { + "epoch": 6.0864, + "grad_norm": 0.7278993129730225, + "learning_rate": 4.05062024809924e-05, + "loss": 0.6724, + "step": 4755 + }, + { + "epoch": 6.08768, + "grad_norm": 0.7617911100387573, + "learning_rate": 4.050420168067227e-05, + "loss": 0.6296, + "step": 4756 + }, + { + "epoch": 6.08896, + "grad_norm": 0.7467544674873352, + "learning_rate": 4.050220088035214e-05, + "loss": 0.5833, + "step": 4757 + }, + { + "epoch": 6.09024, + "grad_norm": 0.8187177777290344, + "learning_rate": 4.050020008003201e-05, + "loss": 0.6642, + "step": 4758 + }, + { + "epoch": 6.09152, + "grad_norm": 0.7508170008659363, + "learning_rate": 4.0498199279711884e-05, + "loss": 0.5794, + "step": 4759 + }, + { + "epoch": 6.0928, + "grad_norm": 0.7760382294654846, + "learning_rate": 4.0496198479391756e-05, + "loss": 0.6437, + "step": 4760 + }, + { + "epoch": 6.09408, + "grad_norm": 0.762019693851471, + "learning_rate": 4.0494197679071634e-05, + "loss": 0.5662, + "step": 4761 + }, + { + "epoch": 6.09536, + "grad_norm": 0.7687118053436279, + "learning_rate": 4.0492196878751506e-05, + "loss": 0.6324, + "step": 4762 + }, + { + "epoch": 6.09664, + "grad_norm": 0.7786553502082825, + "learning_rate": 4.049019607843138e-05, + "loss": 0.5897, + "step": 4763 + }, + { + "epoch": 6.09792, + "grad_norm": 0.7534022927284241, + "learning_rate": 4.048819527811124e-05, + "loss": 0.5713, + "step": 
4764 + }, + { + "epoch": 6.0992, + "grad_norm": 0.786078929901123, + "learning_rate": 4.0486194477791115e-05, + "loss": 0.5774, + "step": 4765 + }, + { + "epoch": 6.10048, + "grad_norm": 0.7434951663017273, + "learning_rate": 4.048419367747099e-05, + "loss": 0.6195, + "step": 4766 + }, + { + "epoch": 6.10176, + "grad_norm": 0.7410392761230469, + "learning_rate": 4.048219287715086e-05, + "loss": 0.6466, + "step": 4767 + }, + { + "epoch": 6.10304, + "grad_norm": 0.7867603302001953, + "learning_rate": 4.048019207683074e-05, + "loss": 0.6098, + "step": 4768 + }, + { + "epoch": 6.10432, + "grad_norm": 0.7912724614143372, + "learning_rate": 4.047819127651061e-05, + "loss": 0.66, + "step": 4769 + }, + { + "epoch": 6.1056, + "grad_norm": 0.6925088167190552, + "learning_rate": 4.047619047619048e-05, + "loss": 0.5844, + "step": 4770 + }, + { + "epoch": 6.10688, + "grad_norm": 0.7406015992164612, + "learning_rate": 4.047418967587035e-05, + "loss": 0.6517, + "step": 4771 + }, + { + "epoch": 6.10816, + "grad_norm": 0.7543079257011414, + "learning_rate": 4.047218887555022e-05, + "loss": 0.5988, + "step": 4772 + }, + { + "epoch": 6.10944, + "grad_norm": 0.7825755476951599, + "learning_rate": 4.047018807523009e-05, + "loss": 0.6316, + "step": 4773 + }, + { + "epoch": 6.11072, + "grad_norm": 0.7861376404762268, + "learning_rate": 4.046818727490996e-05, + "loss": 0.6427, + "step": 4774 + }, + { + "epoch": 6.112, + "grad_norm": 0.7529568672180176, + "learning_rate": 4.046618647458984e-05, + "loss": 0.6156, + "step": 4775 + }, + { + "epoch": 6.11328, + "grad_norm": 0.7438963055610657, + "learning_rate": 4.046418567426971e-05, + "loss": 0.6108, + "step": 4776 + }, + { + "epoch": 6.11456, + "grad_norm": 0.7456843852996826, + "learning_rate": 4.0462184873949584e-05, + "loss": 0.6296, + "step": 4777 + }, + { + "epoch": 6.11584, + "grad_norm": 0.7504041194915771, + "learning_rate": 4.0460184073629456e-05, + "loss": 0.5838, + "step": 4778 + }, + { + "epoch": 6.11712, + "grad_norm": 
0.7819758653640747, + "learning_rate": 4.045818327330933e-05, + "loss": 0.5966, + "step": 4779 + }, + { + "epoch": 6.1184, + "grad_norm": 0.7668342590332031, + "learning_rate": 4.045618247298919e-05, + "loss": 0.6158, + "step": 4780 + }, + { + "epoch": 6.11968, + "grad_norm": 0.7753564715385437, + "learning_rate": 4.0454181672669065e-05, + "loss": 0.642, + "step": 4781 + }, + { + "epoch": 6.12096, + "grad_norm": 0.8022623062133789, + "learning_rate": 4.045218087234894e-05, + "loss": 0.6358, + "step": 4782 + }, + { + "epoch": 6.12224, + "grad_norm": 0.7484129667282104, + "learning_rate": 4.0450180072028815e-05, + "loss": 0.5627, + "step": 4783 + }, + { + "epoch": 6.12352, + "grad_norm": 0.7918726205825806, + "learning_rate": 4.044817927170869e-05, + "loss": 0.6377, + "step": 4784 + }, + { + "epoch": 6.1248, + "grad_norm": 0.7575948238372803, + "learning_rate": 4.044617847138856e-05, + "loss": 0.589, + "step": 4785 + }, + { + "epoch": 6.12608, + "grad_norm": 0.7408061623573303, + "learning_rate": 4.044417767106843e-05, + "loss": 0.6352, + "step": 4786 + }, + { + "epoch": 6.12736, + "grad_norm": 0.7478961944580078, + "learning_rate": 4.04421768707483e-05, + "loss": 0.5852, + "step": 4787 + }, + { + "epoch": 6.12864, + "grad_norm": 0.7341578006744385, + "learning_rate": 4.044017607042817e-05, + "loss": 0.6114, + "step": 4788 + }, + { + "epoch": 6.12992, + "grad_norm": 0.7750809192657471, + "learning_rate": 4.0438175270108046e-05, + "loss": 0.6121, + "step": 4789 + }, + { + "epoch": 6.1312, + "grad_norm": 0.7869114875793457, + "learning_rate": 4.043617446978792e-05, + "loss": 0.6637, + "step": 4790 + }, + { + "epoch": 6.13248, + "grad_norm": 0.7512006163597107, + "learning_rate": 4.043417366946779e-05, + "loss": 0.5679, + "step": 4791 + }, + { + "epoch": 6.13376, + "grad_norm": 0.7822096347808838, + "learning_rate": 4.043217286914766e-05, + "loss": 0.6191, + "step": 4792 + }, + { + "epoch": 6.13504, + "grad_norm": 0.7540210485458374, + "learning_rate": 
4.0430172068827534e-05, + "loss": 0.6011, + "step": 4793 + }, + { + "epoch": 6.1363199999999996, + "grad_norm": 0.7164133191108704, + "learning_rate": 4.0428171268507406e-05, + "loss": 0.5114, + "step": 4794 + }, + { + "epoch": 6.1376, + "grad_norm": 0.8122145533561707, + "learning_rate": 4.042617046818728e-05, + "loss": 0.6451, + "step": 4795 + }, + { + "epoch": 6.13888, + "grad_norm": 0.8163143396377563, + "learning_rate": 4.042416966786715e-05, + "loss": 0.6086, + "step": 4796 + }, + { + "epoch": 6.14016, + "grad_norm": 0.7786663770675659, + "learning_rate": 4.042216886754702e-05, + "loss": 0.6447, + "step": 4797 + }, + { + "epoch": 6.14144, + "grad_norm": 0.7091884016990662, + "learning_rate": 4.042016806722689e-05, + "loss": 0.5588, + "step": 4798 + }, + { + "epoch": 6.14272, + "grad_norm": 0.7768360376358032, + "learning_rate": 4.0418167266906765e-05, + "loss": 0.6151, + "step": 4799 + }, + { + "epoch": 6.144, + "grad_norm": 0.7751429677009583, + "learning_rate": 4.041616646658664e-05, + "loss": 0.6076, + "step": 4800 + }, + { + "epoch": 6.14528, + "grad_norm": 0.7756529450416565, + "learning_rate": 4.041416566626651e-05, + "loss": 0.6237, + "step": 4801 + }, + { + "epoch": 6.14656, + "grad_norm": 0.7572968006134033, + "learning_rate": 4.041216486594638e-05, + "loss": 0.6407, + "step": 4802 + }, + { + "epoch": 6.14784, + "grad_norm": 0.7175725698471069, + "learning_rate": 4.041016406562625e-05, + "loss": 0.588, + "step": 4803 + }, + { + "epoch": 6.14912, + "grad_norm": 0.7753421068191528, + "learning_rate": 4.0408163265306124e-05, + "loss": 0.628, + "step": 4804 + }, + { + "epoch": 6.1504, + "grad_norm": 0.777458131313324, + "learning_rate": 4.0406162464985996e-05, + "loss": 0.5963, + "step": 4805 + }, + { + "epoch": 6.15168, + "grad_norm": 0.7486628890037537, + "learning_rate": 4.040416166466587e-05, + "loss": 0.5623, + "step": 4806 + }, + { + "epoch": 6.15296, + "grad_norm": 0.7569031715393066, + "learning_rate": 4.040216086434574e-05, + "loss": 0.6062, + 
"step": 4807 + }, + { + "epoch": 6.15424, + "grad_norm": 0.7664810419082642, + "learning_rate": 4.040016006402561e-05, + "loss": 0.6069, + "step": 4808 + }, + { + "epoch": 6.15552, + "grad_norm": 0.7901382446289062, + "learning_rate": 4.0398159263705483e-05, + "loss": 0.6281, + "step": 4809 + }, + { + "epoch": 6.1568, + "grad_norm": 0.7850843667984009, + "learning_rate": 4.0396158463385355e-05, + "loss": 0.6222, + "step": 4810 + }, + { + "epoch": 6.15808, + "grad_norm": 0.77830570936203, + "learning_rate": 4.039415766306523e-05, + "loss": 0.6369, + "step": 4811 + }, + { + "epoch": 6.15936, + "grad_norm": 0.7659288048744202, + "learning_rate": 4.03921568627451e-05, + "loss": 0.6099, + "step": 4812 + }, + { + "epoch": 6.16064, + "grad_norm": 0.7957006096839905, + "learning_rate": 4.039015606242497e-05, + "loss": 0.6153, + "step": 4813 + }, + { + "epoch": 6.16192, + "grad_norm": 0.7597688436508179, + "learning_rate": 4.038815526210484e-05, + "loss": 0.632, + "step": 4814 + }, + { + "epoch": 6.1632, + "grad_norm": 0.7524228692054749, + "learning_rate": 4.0386154461784715e-05, + "loss": 0.5995, + "step": 4815 + }, + { + "epoch": 6.16448, + "grad_norm": 0.7459036111831665, + "learning_rate": 4.0384153661464586e-05, + "loss": 0.6228, + "step": 4816 + }, + { + "epoch": 6.16576, + "grad_norm": 0.7470423579216003, + "learning_rate": 4.0382152861144465e-05, + "loss": 0.6378, + "step": 4817 + }, + { + "epoch": 6.16704, + "grad_norm": 0.7385801076889038, + "learning_rate": 4.038015206082433e-05, + "loss": 0.6017, + "step": 4818 + }, + { + "epoch": 6.16832, + "grad_norm": 0.7643359303474426, + "learning_rate": 4.03781512605042e-05, + "loss": 0.6293, + "step": 4819 + }, + { + "epoch": 6.1696, + "grad_norm": 0.7111765146255493, + "learning_rate": 4.0376150460184074e-05, + "loss": 0.582, + "step": 4820 + }, + { + "epoch": 6.17088, + "grad_norm": 0.780404806137085, + "learning_rate": 4.0374149659863946e-05, + "loss": 0.6365, + "step": 4821 + }, + { + "epoch": 6.17216, + "grad_norm": 
0.7903060913085938, + "learning_rate": 4.037214885954382e-05, + "loss": 0.7153, + "step": 4822 + }, + { + "epoch": 6.17344, + "grad_norm": 0.7531589865684509, + "learning_rate": 4.037014805922369e-05, + "loss": 0.5813, + "step": 4823 + }, + { + "epoch": 6.17472, + "grad_norm": 0.7753871083259583, + "learning_rate": 4.036814725890357e-05, + "loss": 0.6235, + "step": 4824 + }, + { + "epoch": 6.176, + "grad_norm": 0.7611346244812012, + "learning_rate": 4.036614645858344e-05, + "loss": 0.6133, + "step": 4825 + }, + { + "epoch": 6.17728, + "grad_norm": 0.7633965611457825, + "learning_rate": 4.0364145658263305e-05, + "loss": 0.591, + "step": 4826 + }, + { + "epoch": 6.17856, + "grad_norm": 0.8028787970542908, + "learning_rate": 4.036214485794318e-05, + "loss": 0.6118, + "step": 4827 + }, + { + "epoch": 6.17984, + "grad_norm": 0.7600660920143127, + "learning_rate": 4.036014405762305e-05, + "loss": 0.6283, + "step": 4828 + }, + { + "epoch": 6.18112, + "grad_norm": 0.754428505897522, + "learning_rate": 4.035814325730292e-05, + "loss": 0.6278, + "step": 4829 + }, + { + "epoch": 6.1824, + "grad_norm": 0.7525807619094849, + "learning_rate": 4.035614245698279e-05, + "loss": 0.5935, + "step": 4830 + }, + { + "epoch": 6.18368, + "grad_norm": 0.7541505694389343, + "learning_rate": 4.035414165666267e-05, + "loss": 0.6048, + "step": 4831 + }, + { + "epoch": 6.18496, + "grad_norm": 0.7696326375007629, + "learning_rate": 4.035214085634254e-05, + "loss": 0.6422, + "step": 4832 + }, + { + "epoch": 6.18624, + "grad_norm": 0.7631934881210327, + "learning_rate": 4.0350140056022415e-05, + "loss": 0.6121, + "step": 4833 + }, + { + "epoch": 6.18752, + "grad_norm": 0.7651000022888184, + "learning_rate": 4.034813925570228e-05, + "loss": 0.5951, + "step": 4834 + }, + { + "epoch": 6.1888, + "grad_norm": 0.7359318137168884, + "learning_rate": 4.034613845538215e-05, + "loss": 0.5828, + "step": 4835 + }, + { + "epoch": 6.19008, + "grad_norm": 0.7644410729408264, + "learning_rate": 
4.0344137655062024e-05, + "loss": 0.6218, + "step": 4836 + }, + { + "epoch": 6.19136, + "grad_norm": 0.7283311486244202, + "learning_rate": 4.0342136854741895e-05, + "loss": 0.5594, + "step": 4837 + }, + { + "epoch": 6.19264, + "grad_norm": 0.7427007555961609, + "learning_rate": 4.0340136054421774e-05, + "loss": 0.5902, + "step": 4838 + }, + { + "epoch": 6.19392, + "grad_norm": 0.7666735649108887, + "learning_rate": 4.0338135254101646e-05, + "loss": 0.6229, + "step": 4839 + }, + { + "epoch": 6.1952, + "grad_norm": 0.7470300197601318, + "learning_rate": 4.033613445378152e-05, + "loss": 0.6134, + "step": 4840 + }, + { + "epoch": 6.19648, + "grad_norm": 0.7347720265388489, + "learning_rate": 4.033413365346139e-05, + "loss": 0.5755, + "step": 4841 + }, + { + "epoch": 6.19776, + "grad_norm": 0.7179660797119141, + "learning_rate": 4.0332132853141255e-05, + "loss": 0.54, + "step": 4842 + }, + { + "epoch": 6.19904, + "grad_norm": 0.7514517903327942, + "learning_rate": 4.0330132052821127e-05, + "loss": 0.6363, + "step": 4843 + }, + { + "epoch": 6.20032, + "grad_norm": 0.7676318883895874, + "learning_rate": 4.0328131252501e-05, + "loss": 0.6533, + "step": 4844 + }, + { + "epoch": 6.2016, + "grad_norm": 0.830500602722168, + "learning_rate": 4.032613045218088e-05, + "loss": 0.7654, + "step": 4845 + }, + { + "epoch": 6.20288, + "grad_norm": 0.7603785395622253, + "learning_rate": 4.032412965186075e-05, + "loss": 0.5547, + "step": 4846 + }, + { + "epoch": 6.20416, + "grad_norm": 0.7514224648475647, + "learning_rate": 4.032212885154062e-05, + "loss": 0.5948, + "step": 4847 + }, + { + "epoch": 6.20544, + "grad_norm": 0.757417619228363, + "learning_rate": 4.032012805122049e-05, + "loss": 0.5691, + "step": 4848 + }, + { + "epoch": 6.20672, + "grad_norm": 0.719901978969574, + "learning_rate": 4.0318127250900364e-05, + "loss": 0.6315, + "step": 4849 + }, + { + "epoch": 6.208, + "grad_norm": 0.7711414098739624, + "learning_rate": 4.031612645058023e-05, + "loss": 0.5801, + "step": 4850 + 
}, + { + "epoch": 6.20928, + "grad_norm": 0.7761551737785339, + "learning_rate": 4.03141256502601e-05, + "loss": 0.5992, + "step": 4851 + }, + { + "epoch": 6.21056, + "grad_norm": 0.7462658286094666, + "learning_rate": 4.031212484993998e-05, + "loss": 0.6378, + "step": 4852 + }, + { + "epoch": 6.21184, + "grad_norm": 0.7466690540313721, + "learning_rate": 4.031012404961985e-05, + "loss": 0.6151, + "step": 4853 + }, + { + "epoch": 6.21312, + "grad_norm": 0.8062450885772705, + "learning_rate": 4.0308123249299724e-05, + "loss": 0.6893, + "step": 4854 + }, + { + "epoch": 6.2144, + "grad_norm": 0.8179064989089966, + "learning_rate": 4.0306122448979596e-05, + "loss": 0.6184, + "step": 4855 + }, + { + "epoch": 6.21568, + "grad_norm": 0.7624944448471069, + "learning_rate": 4.030412164865947e-05, + "loss": 0.6016, + "step": 4856 + }, + { + "epoch": 6.21696, + "grad_norm": 0.7305824756622314, + "learning_rate": 4.030212084833934e-05, + "loss": 0.6336, + "step": 4857 + }, + { + "epoch": 6.21824, + "grad_norm": 0.7892305850982666, + "learning_rate": 4.0300120048019204e-05, + "loss": 0.6435, + "step": 4858 + }, + { + "epoch": 6.21952, + "grad_norm": 0.7659081220626831, + "learning_rate": 4.029811924769908e-05, + "loss": 0.6336, + "step": 4859 + }, + { + "epoch": 6.2208, + "grad_norm": 0.729543149471283, + "learning_rate": 4.0296118447378955e-05, + "loss": 0.6063, + "step": 4860 + }, + { + "epoch": 6.22208, + "grad_norm": 0.7288227677345276, + "learning_rate": 4.029411764705883e-05, + "loss": 0.5951, + "step": 4861 + }, + { + "epoch": 6.22336, + "grad_norm": 0.7727043032646179, + "learning_rate": 4.02921168467387e-05, + "loss": 0.6412, + "step": 4862 + }, + { + "epoch": 6.22464, + "grad_norm": 0.7801682949066162, + "learning_rate": 4.029011604641857e-05, + "loss": 0.6209, + "step": 4863 + }, + { + "epoch": 6.22592, + "grad_norm": 0.7641760110855103, + "learning_rate": 4.028811524609844e-05, + "loss": 0.6342, + "step": 4864 + }, + { + "epoch": 6.2272, + "grad_norm": 
0.7689356803894043, + "learning_rate": 4.0286114445778314e-05, + "loss": 0.6232, + "step": 4865 + }, + { + "epoch": 6.22848, + "grad_norm": 0.7414539456367493, + "learning_rate": 4.028411364545818e-05, + "loss": 0.6114, + "step": 4866 + }, + { + "epoch": 6.22976, + "grad_norm": 0.7376709580421448, + "learning_rate": 4.028211284513806e-05, + "loss": 0.5968, + "step": 4867 + }, + { + "epoch": 6.23104, + "grad_norm": 0.785620391368866, + "learning_rate": 4.028011204481793e-05, + "loss": 0.6389, + "step": 4868 + }, + { + "epoch": 6.23232, + "grad_norm": 0.768951416015625, + "learning_rate": 4.02781112444978e-05, + "loss": 0.5874, + "step": 4869 + }, + { + "epoch": 6.2336, + "grad_norm": 0.7968980669975281, + "learning_rate": 4.0276110444177673e-05, + "loss": 0.6834, + "step": 4870 + }, + { + "epoch": 6.23488, + "grad_norm": 0.769282341003418, + "learning_rate": 4.0274109643857545e-05, + "loss": 0.6257, + "step": 4871 + }, + { + "epoch": 6.23616, + "grad_norm": 0.7746549844741821, + "learning_rate": 4.027210884353742e-05, + "loss": 0.6179, + "step": 4872 + }, + { + "epoch": 6.23744, + "grad_norm": 0.7487301230430603, + "learning_rate": 4.027010804321729e-05, + "loss": 0.6463, + "step": 4873 + }, + { + "epoch": 6.23872, + "grad_norm": 0.7296451330184937, + "learning_rate": 4.026810724289716e-05, + "loss": 0.6646, + "step": 4874 + }, + { + "epoch": 6.24, + "grad_norm": 0.7273462414741516, + "learning_rate": 4.026610644257703e-05, + "loss": 0.5974, + "step": 4875 + }, + { + "epoch": 6.24128, + "grad_norm": 0.7543179392814636, + "learning_rate": 4.0264105642256905e-05, + "loss": 0.6767, + "step": 4876 + }, + { + "epoch": 6.24256, + "grad_norm": 0.7834622859954834, + "learning_rate": 4.0262104841936776e-05, + "loss": 0.6277, + "step": 4877 + }, + { + "epoch": 6.24384, + "grad_norm": 0.7536554932594299, + "learning_rate": 4.026010404161665e-05, + "loss": 0.5714, + "step": 4878 + }, + { + "epoch": 6.24512, + "grad_norm": 0.7422727942466736, + "learning_rate": 
4.025810324129652e-05, + "loss": 0.5911, + "step": 4879 + }, + { + "epoch": 6.2464, + "grad_norm": 0.7820923924446106, + "learning_rate": 4.025610244097639e-05, + "loss": 0.6481, + "step": 4880 + }, + { + "epoch": 6.24768, + "grad_norm": 0.6983485221862793, + "learning_rate": 4.0254101640656264e-05, + "loss": 0.5551, + "step": 4881 + }, + { + "epoch": 6.24896, + "grad_norm": 0.7593510746955872, + "learning_rate": 4.0252100840336136e-05, + "loss": 0.6049, + "step": 4882 + }, + { + "epoch": 6.25024, + "grad_norm": 0.803278386592865, + "learning_rate": 4.025010004001601e-05, + "loss": 0.6087, + "step": 4883 + }, + { + "epoch": 6.25152, + "grad_norm": 0.7537515163421631, + "learning_rate": 4.024809923969588e-05, + "loss": 0.5856, + "step": 4884 + }, + { + "epoch": 6.2528, + "grad_norm": 0.7161216735839844, + "learning_rate": 4.024609843937575e-05, + "loss": 0.5604, + "step": 4885 + }, + { + "epoch": 6.25408, + "grad_norm": 0.7833333015441895, + "learning_rate": 4.024409763905562e-05, + "loss": 0.6248, + "step": 4886 + }, + { + "epoch": 6.25536, + "grad_norm": 0.7813183069229126, + "learning_rate": 4.0242096838735495e-05, + "loss": 0.6205, + "step": 4887 + }, + { + "epoch": 6.25664, + "grad_norm": 0.8205659985542297, + "learning_rate": 4.024009603841537e-05, + "loss": 0.582, + "step": 4888 + }, + { + "epoch": 6.25792, + "grad_norm": 0.7965737581253052, + "learning_rate": 4.023809523809524e-05, + "loss": 0.6541, + "step": 4889 + }, + { + "epoch": 6.2592, + "grad_norm": 0.7237341403961182, + "learning_rate": 4.023609443777511e-05, + "loss": 0.6384, + "step": 4890 + }, + { + "epoch": 6.26048, + "grad_norm": 0.7347778081893921, + "learning_rate": 4.023409363745498e-05, + "loss": 0.6078, + "step": 4891 + }, + { + "epoch": 6.26176, + "grad_norm": 0.7442294359207153, + "learning_rate": 4.0232092837134854e-05, + "loss": 0.6534, + "step": 4892 + }, + { + "epoch": 6.26304, + "grad_norm": 0.7912835478782654, + "learning_rate": 4.0230092036814726e-05, + "loss": 0.5893, + "step": 
4893 + }, + { + "epoch": 6.26432, + "grad_norm": 0.7610131502151489, + "learning_rate": 4.02280912364946e-05, + "loss": 0.6303, + "step": 4894 + }, + { + "epoch": 6.2656, + "grad_norm": 0.7433664798736572, + "learning_rate": 4.022609043617448e-05, + "loss": 0.6046, + "step": 4895 + }, + { + "epoch": 6.2668800000000005, + "grad_norm": 0.7865737080574036, + "learning_rate": 4.022408963585434e-05, + "loss": 0.6401, + "step": 4896 + }, + { + "epoch": 6.26816, + "grad_norm": 0.7448906898498535, + "learning_rate": 4.0222088835534214e-05, + "loss": 0.6217, + "step": 4897 + }, + { + "epoch": 6.26944, + "grad_norm": 0.7785893678665161, + "learning_rate": 4.0220088035214085e-05, + "loss": 0.6827, + "step": 4898 + }, + { + "epoch": 6.27072, + "grad_norm": 0.7872070670127869, + "learning_rate": 4.021808723489396e-05, + "loss": 0.6266, + "step": 4899 + }, + { + "epoch": 6.272, + "grad_norm": 0.7371118068695068, + "learning_rate": 4.021608643457383e-05, + "loss": 0.6029, + "step": 4900 + }, + { + "epoch": 6.27328, + "grad_norm": 0.7747698426246643, + "learning_rate": 4.02140856342537e-05, + "loss": 0.6047, + "step": 4901 + }, + { + "epoch": 6.27456, + "grad_norm": 0.7733140587806702, + "learning_rate": 4.021208483393358e-05, + "loss": 0.6443, + "step": 4902 + }, + { + "epoch": 6.27584, + "grad_norm": 0.7372586727142334, + "learning_rate": 4.021008403361345e-05, + "loss": 0.6, + "step": 4903 + }, + { + "epoch": 6.27712, + "grad_norm": 0.807420015335083, + "learning_rate": 4.020808323329332e-05, + "loss": 0.6279, + "step": 4904 + }, + { + "epoch": 6.2783999999999995, + "grad_norm": 0.7645445466041565, + "learning_rate": 4.020608243297319e-05, + "loss": 0.6336, + "step": 4905 + }, + { + "epoch": 6.27968, + "grad_norm": 0.7852516770362854, + "learning_rate": 4.020408163265306e-05, + "loss": 0.6219, + "step": 4906 + }, + { + "epoch": 6.28096, + "grad_norm": 0.7741815447807312, + "learning_rate": 4.020208083233293e-05, + "loss": 0.6526, + "step": 4907 + }, + { + "epoch": 6.28224, + 
"grad_norm": 0.7737951278686523, + "learning_rate": 4.0200080032012804e-05, + "loss": 0.6533, + "step": 4908 + }, + { + "epoch": 6.28352, + "grad_norm": 0.7810575366020203, + "learning_rate": 4.019807923169268e-05, + "loss": 0.6677, + "step": 4909 + }, + { + "epoch": 6.2848, + "grad_norm": 0.7242299914360046, + "learning_rate": 4.0196078431372555e-05, + "loss": 0.5908, + "step": 4910 + }, + { + "epoch": 6.28608, + "grad_norm": 0.7727022767066956, + "learning_rate": 4.0194077631052426e-05, + "loss": 0.592, + "step": 4911 + }, + { + "epoch": 6.28736, + "grad_norm": 0.7893999814987183, + "learning_rate": 4.019207683073229e-05, + "loss": 0.6288, + "step": 4912 + }, + { + "epoch": 6.28864, + "grad_norm": 0.7878124713897705, + "learning_rate": 4.019007603041216e-05, + "loss": 0.6547, + "step": 4913 + }, + { + "epoch": 6.28992, + "grad_norm": 0.7988905906677246, + "learning_rate": 4.0188075230092035e-05, + "loss": 0.6392, + "step": 4914 + }, + { + "epoch": 6.2912, + "grad_norm": 0.7619364261627197, + "learning_rate": 4.018607442977191e-05, + "loss": 0.6155, + "step": 4915 + }, + { + "epoch": 6.29248, + "grad_norm": 0.8226683735847473, + "learning_rate": 4.0184073629451786e-05, + "loss": 0.6236, + "step": 4916 + }, + { + "epoch": 6.29376, + "grad_norm": 0.77449631690979, + "learning_rate": 4.018207282913166e-05, + "loss": 0.5779, + "step": 4917 + }, + { + "epoch": 6.29504, + "grad_norm": 0.7637079358100891, + "learning_rate": 4.018007202881153e-05, + "loss": 0.5976, + "step": 4918 + }, + { + "epoch": 6.29632, + "grad_norm": 0.7683913707733154, + "learning_rate": 4.01780712284914e-05, + "loss": 0.6022, + "step": 4919 + }, + { + "epoch": 6.2976, + "grad_norm": 0.7907748818397522, + "learning_rate": 4.0176070428171266e-05, + "loss": 0.5968, + "step": 4920 + }, + { + "epoch": 6.29888, + "grad_norm": 0.7320314049720764, + "learning_rate": 4.017406962785114e-05, + "loss": 0.5732, + "step": 4921 + }, + { + "epoch": 6.30016, + "grad_norm": 0.8165443539619446, + "learning_rate": 
4.017206882753101e-05, + "loss": 0.6384, + "step": 4922 + }, + { + "epoch": 6.30144, + "grad_norm": 0.8002688884735107, + "learning_rate": 4.017006802721089e-05, + "loss": 0.5705, + "step": 4923 + }, + { + "epoch": 6.30272, + "grad_norm": 0.7913237810134888, + "learning_rate": 4.016806722689076e-05, + "loss": 0.6072, + "step": 4924 + }, + { + "epoch": 6.304, + "grad_norm": 0.7377355098724365, + "learning_rate": 4.016606642657063e-05, + "loss": 0.6112, + "step": 4925 + }, + { + "epoch": 6.30528, + "grad_norm": 0.680272102355957, + "learning_rate": 4.0164065626250504e-05, + "loss": 0.5723, + "step": 4926 + }, + { + "epoch": 6.30656, + "grad_norm": 0.68758225440979, + "learning_rate": 4.0162064825930376e-05, + "loss": 0.5122, + "step": 4927 + }, + { + "epoch": 6.30784, + "grad_norm": 0.8090822100639343, + "learning_rate": 4.016006402561024e-05, + "loss": 0.6358, + "step": 4928 + }, + { + "epoch": 6.30912, + "grad_norm": 0.7640751600265503, + "learning_rate": 4.015806322529011e-05, + "loss": 0.6516, + "step": 4929 + }, + { + "epoch": 6.3104, + "grad_norm": 0.7689977288246155, + "learning_rate": 4.015606242496999e-05, + "loss": 0.602, + "step": 4930 + }, + { + "epoch": 6.31168, + "grad_norm": 0.7796737551689148, + "learning_rate": 4.0154061624649864e-05, + "loss": 0.5984, + "step": 4931 + }, + { + "epoch": 6.31296, + "grad_norm": 0.7571960687637329, + "learning_rate": 4.0152060824329735e-05, + "loss": 0.6333, + "step": 4932 + }, + { + "epoch": 6.31424, + "grad_norm": 0.773962676525116, + "learning_rate": 4.015006002400961e-05, + "loss": 0.5782, + "step": 4933 + }, + { + "epoch": 6.31552, + "grad_norm": 0.8556650280952454, + "learning_rate": 4.014805922368948e-05, + "loss": 0.692, + "step": 4934 + }, + { + "epoch": 6.3168, + "grad_norm": 0.7910232543945312, + "learning_rate": 4.014605842336935e-05, + "loss": 0.6353, + "step": 4935 + }, + { + "epoch": 6.31808, + "grad_norm": 0.7409533858299255, + "learning_rate": 4.0144057623049216e-05, + "loss": 0.5861, + "step": 4936 + 
}, + { + "epoch": 6.31936, + "grad_norm": 0.7902359366416931, + "learning_rate": 4.0142056822729095e-05, + "loss": 0.6325, + "step": 4937 + }, + { + "epoch": 6.32064, + "grad_norm": 0.7420294880867004, + "learning_rate": 4.0140056022408967e-05, + "loss": 0.5833, + "step": 4938 + }, + { + "epoch": 6.32192, + "grad_norm": 0.7837445139884949, + "learning_rate": 4.013805522208884e-05, + "loss": 0.6377, + "step": 4939 + }, + { + "epoch": 6.3232, + "grad_norm": 0.7776581645011902, + "learning_rate": 4.013605442176871e-05, + "loss": 0.6328, + "step": 4940 + }, + { + "epoch": 6.32448, + "grad_norm": 0.7739065885543823, + "learning_rate": 4.013405362144858e-05, + "loss": 0.5716, + "step": 4941 + }, + { + "epoch": 6.32576, + "grad_norm": 0.7609522342681885, + "learning_rate": 4.0132052821128454e-05, + "loss": 0.5454, + "step": 4942 + }, + { + "epoch": 6.32704, + "grad_norm": 0.7803019285202026, + "learning_rate": 4.0130052020808326e-05, + "loss": 0.6351, + "step": 4943 + }, + { + "epoch": 6.32832, + "grad_norm": 0.7924274206161499, + "learning_rate": 4.01280512204882e-05, + "loss": 0.6234, + "step": 4944 + }, + { + "epoch": 6.3296, + "grad_norm": 0.7934367060661316, + "learning_rate": 4.012605042016807e-05, + "loss": 0.632, + "step": 4945 + }, + { + "epoch": 6.33088, + "grad_norm": 0.7212319374084473, + "learning_rate": 4.012404961984794e-05, + "loss": 0.6216, + "step": 4946 + }, + { + "epoch": 6.33216, + "grad_norm": 0.7968099117279053, + "learning_rate": 4.012204881952781e-05, + "loss": 0.6441, + "step": 4947 + }, + { + "epoch": 6.33344, + "grad_norm": 0.7849327921867371, + "learning_rate": 4.0120048019207685e-05, + "loss": 0.6567, + "step": 4948 + }, + { + "epoch": 6.33472, + "grad_norm": 0.7716954350471497, + "learning_rate": 4.011804721888756e-05, + "loss": 0.6313, + "step": 4949 + }, + { + "epoch": 6.336, + "grad_norm": 0.7645626664161682, + "learning_rate": 4.011604641856743e-05, + "loss": 0.6403, + "step": 4950 + }, + { + "epoch": 6.33728, + "grad_norm": 
0.7575722932815552, + "learning_rate": 4.01140456182473e-05, + "loss": 0.5777, + "step": 4951 + }, + { + "epoch": 6.33856, + "grad_norm": 0.7596458196640015, + "learning_rate": 4.011204481792717e-05, + "loss": 0.6296, + "step": 4952 + }, + { + "epoch": 6.33984, + "grad_norm": 0.7410492897033691, + "learning_rate": 4.0110044017607044e-05, + "loss": 0.5836, + "step": 4953 + }, + { + "epoch": 6.34112, + "grad_norm": 0.8087933659553528, + "learning_rate": 4.0108043217286916e-05, + "loss": 0.6035, + "step": 4954 + }, + { + "epoch": 6.3424, + "grad_norm": 0.7586066126823425, + "learning_rate": 4.010604241696679e-05, + "loss": 0.6121, + "step": 4955 + }, + { + "epoch": 6.34368, + "grad_norm": 0.7876436114311218, + "learning_rate": 4.010404161664666e-05, + "loss": 0.6206, + "step": 4956 + }, + { + "epoch": 6.34496, + "grad_norm": 0.7594673037528992, + "learning_rate": 4.010204081632653e-05, + "loss": 0.6085, + "step": 4957 + }, + { + "epoch": 6.34624, + "grad_norm": 0.7009496092796326, + "learning_rate": 4.010004001600641e-05, + "loss": 0.5725, + "step": 4958 + }, + { + "epoch": 6.34752, + "grad_norm": 0.7607222199440002, + "learning_rate": 4.0098039215686276e-05, + "loss": 0.6079, + "step": 4959 + }, + { + "epoch": 6.3488, + "grad_norm": 0.8146457076072693, + "learning_rate": 4.009603841536615e-05, + "loss": 0.6636, + "step": 4960 + }, + { + "epoch": 6.35008, + "grad_norm": 0.757946252822876, + "learning_rate": 4.009403761504602e-05, + "loss": 0.6275, + "step": 4961 + }, + { + "epoch": 6.35136, + "grad_norm": 0.7005823850631714, + "learning_rate": 4.009203681472589e-05, + "loss": 0.547, + "step": 4962 + }, + { + "epoch": 6.35264, + "grad_norm": 0.7344463467597961, + "learning_rate": 4.009003601440576e-05, + "loss": 0.6197, + "step": 4963 + }, + { + "epoch": 6.3539200000000005, + "grad_norm": 0.7441943883895874, + "learning_rate": 4.0088035214085635e-05, + "loss": 0.6031, + "step": 4964 + }, + { + "epoch": 6.3552, + "grad_norm": 0.7756435871124268, + "learning_rate": 
4.0086034413765513e-05, + "loss": 0.6007, + "step": 4965 + }, + { + "epoch": 6.35648, + "grad_norm": 0.771628737449646, + "learning_rate": 4.0084033613445385e-05, + "loss": 0.6, + "step": 4966 + }, + { + "epoch": 6.35776, + "grad_norm": 0.7456242442131042, + "learning_rate": 4.008203281312525e-05, + "loss": 0.5856, + "step": 4967 + }, + { + "epoch": 6.35904, + "grad_norm": 0.7749770879745483, + "learning_rate": 4.008003201280512e-05, + "loss": 0.6364, + "step": 4968 + }, + { + "epoch": 6.36032, + "grad_norm": 0.7612026929855347, + "learning_rate": 4.0078031212484994e-05, + "loss": 0.5996, + "step": 4969 + }, + { + "epoch": 6.3616, + "grad_norm": 0.7761407494544983, + "learning_rate": 4.0076030412164866e-05, + "loss": 0.6143, + "step": 4970 + }, + { + "epoch": 6.36288, + "grad_norm": 0.7732192873954773, + "learning_rate": 4.007402961184474e-05, + "loss": 0.6296, + "step": 4971 + }, + { + "epoch": 6.36416, + "grad_norm": 0.7865023016929626, + "learning_rate": 4.0072028811524616e-05, + "loss": 0.6159, + "step": 4972 + }, + { + "epoch": 6.3654399999999995, + "grad_norm": 0.7385578155517578, + "learning_rate": 4.007002801120449e-05, + "loss": 0.6024, + "step": 4973 + }, + { + "epoch": 6.36672, + "grad_norm": 0.7962034344673157, + "learning_rate": 4.006802721088436e-05, + "loss": 0.6887, + "step": 4974 + }, + { + "epoch": 6.368, + "grad_norm": 0.7819473147392273, + "learning_rate": 4.0066026410564225e-05, + "loss": 0.6156, + "step": 4975 + }, + { + "epoch": 6.36928, + "grad_norm": 0.7631332278251648, + "learning_rate": 4.00640256102441e-05, + "loss": 0.6126, + "step": 4976 + }, + { + "epoch": 6.37056, + "grad_norm": 0.7788332104682922, + "learning_rate": 4.006202480992397e-05, + "loss": 0.6718, + "step": 4977 + }, + { + "epoch": 6.37184, + "grad_norm": 0.7646488547325134, + "learning_rate": 4.006002400960384e-05, + "loss": 0.6265, + "step": 4978 + }, + { + "epoch": 6.37312, + "grad_norm": 0.7828549742698669, + "learning_rate": 4.005802320928371e-05, + "loss": 0.6718, + 
"step": 4979 + }, + { + "epoch": 6.3744, + "grad_norm": 0.7134932279586792, + "learning_rate": 4.005602240896359e-05, + "loss": 0.5275, + "step": 4980 + }, + { + "epoch": 6.37568, + "grad_norm": 0.7558833360671997, + "learning_rate": 4.005402160864346e-05, + "loss": 0.6531, + "step": 4981 + }, + { + "epoch": 6.37696, + "grad_norm": 0.7762933373451233, + "learning_rate": 4.0052020808323335e-05, + "loss": 0.5951, + "step": 4982 + }, + { + "epoch": 6.37824, + "grad_norm": 0.7445576786994934, + "learning_rate": 4.00500200080032e-05, + "loss": 0.5966, + "step": 4983 + }, + { + "epoch": 6.37952, + "grad_norm": 0.7995836734771729, + "learning_rate": 4.004801920768307e-05, + "loss": 0.6623, + "step": 4984 + }, + { + "epoch": 6.3808, + "grad_norm": 0.7452435493469238, + "learning_rate": 4.0046018407362944e-05, + "loss": 0.5924, + "step": 4985 + }, + { + "epoch": 6.38208, + "grad_norm": 0.8057966828346252, + "learning_rate": 4.0044017607042816e-05, + "loss": 0.6282, + "step": 4986 + }, + { + "epoch": 6.38336, + "grad_norm": 0.8227620124816895, + "learning_rate": 4.0042016806722694e-05, + "loss": 0.6227, + "step": 4987 + }, + { + "epoch": 6.38464, + "grad_norm": 0.8170056343078613, + "learning_rate": 4.0040016006402566e-05, + "loss": 0.6289, + "step": 4988 + }, + { + "epoch": 6.38592, + "grad_norm": 0.7389938831329346, + "learning_rate": 4.003801520608244e-05, + "loss": 0.639, + "step": 4989 + }, + { + "epoch": 6.3872, + "grad_norm": 0.7633137106895447, + "learning_rate": 4.003601440576231e-05, + "loss": 0.6226, + "step": 4990 + }, + { + "epoch": 6.38848, + "grad_norm": 0.7744930982589722, + "learning_rate": 4.0034013605442175e-05, + "loss": 0.6296, + "step": 4991 + }, + { + "epoch": 6.38976, + "grad_norm": 0.7420474886894226, + "learning_rate": 4.003201280512205e-05, + "loss": 0.6408, + "step": 4992 + }, + { + "epoch": 6.39104, + "grad_norm": 0.7508412003517151, + "learning_rate": 4.003001200480192e-05, + "loss": 0.5994, + "step": 4993 + }, + { + "epoch": 6.39232, + 
"grad_norm": 0.783890962600708, + "learning_rate": 4.00280112044818e-05, + "loss": 0.6633, + "step": 4994 + }, + { + "epoch": 6.3936, + "grad_norm": 0.7705927491188049, + "learning_rate": 4.002601040416167e-05, + "loss": 0.6623, + "step": 4995 + }, + { + "epoch": 6.39488, + "grad_norm": 0.7970103621482849, + "learning_rate": 4.002400960384154e-05, + "loss": 0.6653, + "step": 4996 + }, + { + "epoch": 6.39616, + "grad_norm": 0.8254691958427429, + "learning_rate": 4.002200880352141e-05, + "loss": 0.6138, + "step": 4997 + }, + { + "epoch": 6.39744, + "grad_norm": 0.7935686707496643, + "learning_rate": 4.0020008003201285e-05, + "loss": 0.6288, + "step": 4998 + }, + { + "epoch": 6.39872, + "grad_norm": 0.7537709474563599, + "learning_rate": 4.001800720288115e-05, + "loss": 0.5739, + "step": 4999 + }, + { + "epoch": 6.4, + "grad_norm": 0.7888927459716797, + "learning_rate": 4.001600640256102e-05, + "loss": 0.6501, + "step": 5000 + }, + { + "epoch": 6.40128, + "grad_norm": 0.7110080718994141, + "learning_rate": 4.00140056022409e-05, + "loss": 0.5607, + "step": 5001 + }, + { + "epoch": 6.40256, + "grad_norm": 0.7363867163658142, + "learning_rate": 4.001200480192077e-05, + "loss": 0.6134, + "step": 5002 + }, + { + "epoch": 6.40384, + "grad_norm": 0.7256652116775513, + "learning_rate": 4.0010004001600644e-05, + "loss": 0.5868, + "step": 5003 + }, + { + "epoch": 6.40512, + "grad_norm": 0.7670110464096069, + "learning_rate": 4.0008003201280516e-05, + "loss": 0.5823, + "step": 5004 + }, + { + "epoch": 6.4064, + "grad_norm": 0.778918445110321, + "learning_rate": 4.000600240096039e-05, + "loss": 0.6436, + "step": 5005 + }, + { + "epoch": 6.40768, + "grad_norm": 0.7200184464454651, + "learning_rate": 4.000400160064026e-05, + "loss": 0.5635, + "step": 5006 + }, + { + "epoch": 6.40896, + "grad_norm": 0.7693450450897217, + "learning_rate": 4.0002000800320125e-05, + "loss": 0.6088, + "step": 5007 + }, + { + "epoch": 6.41024, + "grad_norm": 0.7648208737373352, + "learning_rate": 4e-05, 
+ "loss": 0.6234, + "step": 5008 + }, + { + "epoch": 6.41152, + "grad_norm": 0.784186840057373, + "learning_rate": 3.9997999199679875e-05, + "loss": 0.6424, + "step": 5009 + }, + { + "epoch": 6.4128, + "grad_norm": 0.7577515244483948, + "learning_rate": 3.999599839935975e-05, + "loss": 0.6087, + "step": 5010 + }, + { + "epoch": 6.41408, + "grad_norm": 0.7444120645523071, + "learning_rate": 3.999399759903962e-05, + "loss": 0.5809, + "step": 5011 + }, + { + "epoch": 6.41536, + "grad_norm": 0.7742969393730164, + "learning_rate": 3.999199679871949e-05, + "loss": 0.6226, + "step": 5012 + }, + { + "epoch": 6.41664, + "grad_norm": 0.7459928393363953, + "learning_rate": 3.998999599839936e-05, + "loss": 0.5572, + "step": 5013 + }, + { + "epoch": 6.41792, + "grad_norm": 0.7630308270454407, + "learning_rate": 3.9987995198079234e-05, + "loss": 0.6129, + "step": 5014 + }, + { + "epoch": 6.4192, + "grad_norm": 0.7499691843986511, + "learning_rate": 3.9985994397759106e-05, + "loss": 0.5652, + "step": 5015 + }, + { + "epoch": 6.42048, + "grad_norm": 0.7551511526107788, + "learning_rate": 3.998399359743898e-05, + "loss": 0.6122, + "step": 5016 + }, + { + "epoch": 6.42176, + "grad_norm": 0.8000392913818359, + "learning_rate": 3.998199279711885e-05, + "loss": 0.6467, + "step": 5017 + }, + { + "epoch": 6.42304, + "grad_norm": 0.7956501245498657, + "learning_rate": 3.997999199679872e-05, + "loss": 0.6459, + "step": 5018 + }, + { + "epoch": 6.42432, + "grad_norm": 0.7600113153457642, + "learning_rate": 3.9977991196478594e-05, + "loss": 0.6714, + "step": 5019 + }, + { + "epoch": 6.4256, + "grad_norm": 0.7289563417434692, + "learning_rate": 3.9975990396158466e-05, + "loss": 0.5878, + "step": 5020 + }, + { + "epoch": 6.42688, + "grad_norm": 0.7440637350082397, + "learning_rate": 3.997398959583834e-05, + "loss": 0.6087, + "step": 5021 + }, + { + "epoch": 6.42816, + "grad_norm": 0.8092576265335083, + "learning_rate": 3.997198879551821e-05, + "loss": 0.6633, + "step": 5022 + }, + { + "epoch": 
6.42944, + "grad_norm": 0.7994837164878845, + "learning_rate": 3.996998799519808e-05, + "loss": 0.5933, + "step": 5023 + }, + { + "epoch": 6.43072, + "grad_norm": 0.7767707109451294, + "learning_rate": 3.996798719487795e-05, + "loss": 0.653, + "step": 5024 + }, + { + "epoch": 6.432, + "grad_norm": 0.780889093875885, + "learning_rate": 3.9965986394557825e-05, + "loss": 0.7127, + "step": 5025 + }, + { + "epoch": 6.43328, + "grad_norm": 0.7731380462646484, + "learning_rate": 3.99639855942377e-05, + "loss": 0.6146, + "step": 5026 + }, + { + "epoch": 6.43456, + "grad_norm": 0.7348530292510986, + "learning_rate": 3.996198479391757e-05, + "loss": 0.5598, + "step": 5027 + }, + { + "epoch": 6.43584, + "grad_norm": 0.7755183577537537, + "learning_rate": 3.995998399359744e-05, + "loss": 0.6158, + "step": 5028 + }, + { + "epoch": 6.43712, + "grad_norm": 0.7527555227279663, + "learning_rate": 3.995798319327731e-05, + "loss": 0.6186, + "step": 5029 + }, + { + "epoch": 6.4384, + "grad_norm": 0.7150756120681763, + "learning_rate": 3.9955982392957184e-05, + "loss": 0.5473, + "step": 5030 + }, + { + "epoch": 6.43968, + "grad_norm": 0.7636802792549133, + "learning_rate": 3.9953981592637056e-05, + "loss": 0.6463, + "step": 5031 + }, + { + "epoch": 6.4409600000000005, + "grad_norm": 0.8033157587051392, + "learning_rate": 3.995198079231693e-05, + "loss": 0.6095, + "step": 5032 + }, + { + "epoch": 6.44224, + "grad_norm": 0.7744354009628296, + "learning_rate": 3.99499799919968e-05, + "loss": 0.6034, + "step": 5033 + }, + { + "epoch": 6.44352, + "grad_norm": 0.7521213293075562, + "learning_rate": 3.994797919167667e-05, + "loss": 0.6075, + "step": 5034 + }, + { + "epoch": 6.4448, + "grad_norm": 0.7580350041389465, + "learning_rate": 3.9945978391356543e-05, + "loss": 0.6328, + "step": 5035 + }, + { + "epoch": 6.44608, + "grad_norm": 0.7581676244735718, + "learning_rate": 3.994397759103642e-05, + "loss": 0.5721, + "step": 5036 + }, + { + "epoch": 6.44736, + "grad_norm": 0.7432277202606201, + 
"learning_rate": 3.994197679071629e-05, + "loss": 0.5626, + "step": 5037 + }, + { + "epoch": 6.44864, + "grad_norm": 0.7694210410118103, + "learning_rate": 3.993997599039616e-05, + "loss": 0.6688, + "step": 5038 + }, + { + "epoch": 6.44992, + "grad_norm": 0.7114573121070862, + "learning_rate": 3.993797519007603e-05, + "loss": 0.5948, + "step": 5039 + }, + { + "epoch": 6.4512, + "grad_norm": 0.7279742360115051, + "learning_rate": 3.99359743897559e-05, + "loss": 0.5615, + "step": 5040 + }, + { + "epoch": 6.4524799999999995, + "grad_norm": 0.7480107545852661, + "learning_rate": 3.9933973589435775e-05, + "loss": 0.605, + "step": 5041 + }, + { + "epoch": 6.45376, + "grad_norm": 0.7534326314926147, + "learning_rate": 3.9931972789115646e-05, + "loss": 0.5966, + "step": 5042 + }, + { + "epoch": 6.45504, + "grad_norm": 0.7341785430908203, + "learning_rate": 3.9929971988795525e-05, + "loss": 0.5677, + "step": 5043 + }, + { + "epoch": 6.45632, + "grad_norm": 0.7698660492897034, + "learning_rate": 3.99279711884754e-05, + "loss": 0.6088, + "step": 5044 + }, + { + "epoch": 6.4576, + "grad_norm": 0.777713418006897, + "learning_rate": 3.992597038815526e-05, + "loss": 0.6248, + "step": 5045 + }, + { + "epoch": 6.45888, + "grad_norm": 0.7565256357192993, + "learning_rate": 3.9923969587835134e-05, + "loss": 0.6141, + "step": 5046 + }, + { + "epoch": 6.46016, + "grad_norm": 0.7639655470848083, + "learning_rate": 3.9921968787515006e-05, + "loss": 0.6114, + "step": 5047 + }, + { + "epoch": 6.46144, + "grad_norm": 0.7447236776351929, + "learning_rate": 3.991996798719488e-05, + "loss": 0.6114, + "step": 5048 + }, + { + "epoch": 6.46272, + "grad_norm": 0.7928009033203125, + "learning_rate": 3.991796718687475e-05, + "loss": 0.6378, + "step": 5049 + }, + { + "epoch": 6.464, + "grad_norm": 0.8144562244415283, + "learning_rate": 3.991596638655463e-05, + "loss": 0.6235, + "step": 5050 + }, + { + "epoch": 6.46528, + "grad_norm": 0.6955716609954834, + "learning_rate": 3.99139655862345e-05, + 
"loss": 0.5668, + "step": 5051 + }, + { + "epoch": 6.46656, + "grad_norm": 0.756885290145874, + "learning_rate": 3.991196478591437e-05, + "loss": 0.6575, + "step": 5052 + }, + { + "epoch": 6.46784, + "grad_norm": 0.7933226227760315, + "learning_rate": 3.990996398559424e-05, + "loss": 0.6359, + "step": 5053 + }, + { + "epoch": 6.46912, + "grad_norm": 0.764266848564148, + "learning_rate": 3.990796318527411e-05, + "loss": 0.6044, + "step": 5054 + }, + { + "epoch": 6.4704, + "grad_norm": 0.770219087600708, + "learning_rate": 3.990596238495398e-05, + "loss": 0.6511, + "step": 5055 + }, + { + "epoch": 6.47168, + "grad_norm": 0.8092508912086487, + "learning_rate": 3.990396158463385e-05, + "loss": 0.6976, + "step": 5056 + }, + { + "epoch": 6.47296, + "grad_norm": 0.7517983913421631, + "learning_rate": 3.990196078431373e-05, + "loss": 0.5846, + "step": 5057 + }, + { + "epoch": 6.47424, + "grad_norm": 0.7917564511299133, + "learning_rate": 3.98999599839936e-05, + "loss": 0.652, + "step": 5058 + }, + { + "epoch": 6.47552, + "grad_norm": 0.7756854295730591, + "learning_rate": 3.9897959183673475e-05, + "loss": 0.5847, + "step": 5059 + }, + { + "epoch": 6.4768, + "grad_norm": 0.7966377139091492, + "learning_rate": 3.9895958383353347e-05, + "loss": 0.5864, + "step": 5060 + }, + { + "epoch": 6.47808, + "grad_norm": 0.793876051902771, + "learning_rate": 3.989395758303321e-05, + "loss": 0.5858, + "step": 5061 + }, + { + "epoch": 6.47936, + "grad_norm": 0.7717666029930115, + "learning_rate": 3.9891956782713084e-05, + "loss": 0.6202, + "step": 5062 + }, + { + "epoch": 6.48064, + "grad_norm": 0.7384968400001526, + "learning_rate": 3.9889955982392955e-05, + "loss": 0.6262, + "step": 5063 + }, + { + "epoch": 6.48192, + "grad_norm": 0.7273770570755005, + "learning_rate": 3.9887955182072834e-05, + "loss": 0.5539, + "step": 5064 + }, + { + "epoch": 6.4832, + "grad_norm": 0.7145727276802063, + "learning_rate": 3.9885954381752706e-05, + "loss": 0.5845, + "step": 5065 + }, + { + "epoch": 
6.48448, + "grad_norm": 0.7780152559280396, + "learning_rate": 3.988395358143258e-05, + "loss": 0.596, + "step": 5066 + }, + { + "epoch": 6.48576, + "grad_norm": 0.8011901378631592, + "learning_rate": 3.988195278111245e-05, + "loss": 0.6526, + "step": 5067 + }, + { + "epoch": 6.48704, + "grad_norm": 0.780985414981842, + "learning_rate": 3.987995198079232e-05, + "loss": 0.5757, + "step": 5068 + }, + { + "epoch": 6.48832, + "grad_norm": 0.7850869297981262, + "learning_rate": 3.9877951180472187e-05, + "loss": 0.639, + "step": 5069 + }, + { + "epoch": 6.4896, + "grad_norm": 0.7978724837303162, + "learning_rate": 3.987595038015206e-05, + "loss": 0.5953, + "step": 5070 + }, + { + "epoch": 6.49088, + "grad_norm": 0.7666551470756531, + "learning_rate": 3.987394957983194e-05, + "loss": 0.5911, + "step": 5071 + }, + { + "epoch": 6.49216, + "grad_norm": 0.8040878176689148, + "learning_rate": 3.987194877951181e-05, + "loss": 0.6198, + "step": 5072 + }, + { + "epoch": 6.49344, + "grad_norm": 0.7172399759292603, + "learning_rate": 3.986994797919168e-05, + "loss": 0.6161, + "step": 5073 + }, + { + "epoch": 6.49472, + "grad_norm": 0.7347850203514099, + "learning_rate": 3.986794717887155e-05, + "loss": 0.6082, + "step": 5074 + }, + { + "epoch": 6.496, + "grad_norm": 0.7927334904670715, + "learning_rate": 3.9865946378551424e-05, + "loss": 0.6269, + "step": 5075 + }, + { + "epoch": 6.49728, + "grad_norm": 0.7884413003921509, + "learning_rate": 3.9863945578231296e-05, + "loss": 0.6417, + "step": 5076 + }, + { + "epoch": 6.49856, + "grad_norm": 0.7497519850730896, + "learning_rate": 3.986194477791116e-05, + "loss": 0.5945, + "step": 5077 + }, + { + "epoch": 6.49984, + "grad_norm": 0.8451282382011414, + "learning_rate": 3.985994397759104e-05, + "loss": 0.6606, + "step": 5078 + }, + { + "epoch": 6.50112, + "grad_norm": 0.8098703026771545, + "learning_rate": 3.985794317727091e-05, + "loss": 0.6289, + "step": 5079 + }, + { + "epoch": 6.5024, + "grad_norm": 0.8050368428230286, + 
"learning_rate": 3.9855942376950784e-05, + "loss": 0.6251, + "step": 5080 + }, + { + "epoch": 6.50368, + "grad_norm": 0.7607022523880005, + "learning_rate": 3.9853941576630656e-05, + "loss": 0.5769, + "step": 5081 + }, + { + "epoch": 6.50496, + "grad_norm": 0.7871085405349731, + "learning_rate": 3.985194077631053e-05, + "loss": 0.6361, + "step": 5082 + }, + { + "epoch": 6.50624, + "grad_norm": 0.7968281507492065, + "learning_rate": 3.98499399759904e-05, + "loss": 0.6802, + "step": 5083 + }, + { + "epoch": 6.5075199999999995, + "grad_norm": 0.7657204270362854, + "learning_rate": 3.984793917567027e-05, + "loss": 0.6098, + "step": 5084 + }, + { + "epoch": 6.5088, + "grad_norm": 0.7545149922370911, + "learning_rate": 3.984593837535014e-05, + "loss": 0.6096, + "step": 5085 + }, + { + "epoch": 6.51008, + "grad_norm": 0.7787616848945618, + "learning_rate": 3.9843937575030015e-05, + "loss": 0.6258, + "step": 5086 + }, + { + "epoch": 6.51136, + "grad_norm": 0.795444905757904, + "learning_rate": 3.984193677470989e-05, + "loss": 0.6293, + "step": 5087 + }, + { + "epoch": 6.51264, + "grad_norm": 0.7523403763771057, + "learning_rate": 3.983993597438976e-05, + "loss": 0.6098, + "step": 5088 + }, + { + "epoch": 6.51392, + "grad_norm": 0.7614408731460571, + "learning_rate": 3.983793517406963e-05, + "loss": 0.5827, + "step": 5089 + }, + { + "epoch": 6.5152, + "grad_norm": 0.7404950857162476, + "learning_rate": 3.98359343737495e-05, + "loss": 0.6255, + "step": 5090 + }, + { + "epoch": 6.51648, + "grad_norm": 0.7416658997535706, + "learning_rate": 3.9833933573429374e-05, + "loss": 0.6169, + "step": 5091 + }, + { + "epoch": 6.51776, + "grad_norm": 0.7457707524299622, + "learning_rate": 3.9831932773109246e-05, + "loss": 0.5626, + "step": 5092 + }, + { + "epoch": 6.51904, + "grad_norm": 0.7906910181045532, + "learning_rate": 3.982993197278912e-05, + "loss": 0.6424, + "step": 5093 + }, + { + "epoch": 6.52032, + "grad_norm": 0.798050045967102, + "learning_rate": 3.982793117246899e-05, + 
"loss": 0.6044, + "step": 5094 + }, + { + "epoch": 6.5216, + "grad_norm": 0.7886822819709778, + "learning_rate": 3.982593037214886e-05, + "loss": 0.637, + "step": 5095 + }, + { + "epoch": 6.52288, + "grad_norm": 0.7528558373451233, + "learning_rate": 3.9823929571828733e-05, + "loss": 0.5701, + "step": 5096 + }, + { + "epoch": 6.52416, + "grad_norm": 0.78963303565979, + "learning_rate": 3.9821928771508605e-05, + "loss": 0.6763, + "step": 5097 + }, + { + "epoch": 6.52544, + "grad_norm": 0.7659367918968201, + "learning_rate": 3.981992797118848e-05, + "loss": 0.6181, + "step": 5098 + }, + { + "epoch": 6.52672, + "grad_norm": 0.7585764527320862, + "learning_rate": 3.981792717086835e-05, + "loss": 0.609, + "step": 5099 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 0.7747802138328552, + "learning_rate": 3.981592637054822e-05, + "loss": 0.6033, + "step": 5100 + }, + { + "epoch": 6.52928, + "grad_norm": 0.7253979444503784, + "learning_rate": 3.981392557022809e-05, + "loss": 0.5512, + "step": 5101 + }, + { + "epoch": 6.53056, + "grad_norm": 0.7542280554771423, + "learning_rate": 3.9811924769907965e-05, + "loss": 0.6486, + "step": 5102 + }, + { + "epoch": 6.53184, + "grad_norm": 0.7517480254173279, + "learning_rate": 3.9809923969587836e-05, + "loss": 0.6327, + "step": 5103 + }, + { + "epoch": 6.53312, + "grad_norm": 0.7435446381568909, + "learning_rate": 3.980792316926771e-05, + "loss": 0.6079, + "step": 5104 + }, + { + "epoch": 6.5344, + "grad_norm": 0.7595201730728149, + "learning_rate": 3.980592236894758e-05, + "loss": 0.604, + "step": 5105 + }, + { + "epoch": 6.53568, + "grad_norm": 0.7278428673744202, + "learning_rate": 3.980392156862745e-05, + "loss": 0.6111, + "step": 5106 + }, + { + "epoch": 6.53696, + "grad_norm": 0.7786182165145874, + "learning_rate": 3.9801920768307324e-05, + "loss": 0.6558, + "step": 5107 + }, + { + "epoch": 6.53824, + "grad_norm": 0.8050516247749329, + "learning_rate": 3.9799919967987196e-05, + "loss": 0.6563, + "step": 5108 + }, + { + 
"epoch": 6.5395199999999996, + "grad_norm": 0.7350043058395386, + "learning_rate": 3.979791916766707e-05, + "loss": 0.6049, + "step": 5109 + }, + { + "epoch": 6.5408, + "grad_norm": 0.7202991843223572, + "learning_rate": 3.979591836734694e-05, + "loss": 0.5954, + "step": 5110 + }, + { + "epoch": 6.54208, + "grad_norm": 0.7836583256721497, + "learning_rate": 3.979391756702681e-05, + "loss": 0.6103, + "step": 5111 + }, + { + "epoch": 6.54336, + "grad_norm": 0.7699684500694275, + "learning_rate": 3.979191676670668e-05, + "loss": 0.6243, + "step": 5112 + }, + { + "epoch": 6.54464, + "grad_norm": 0.8052793145179749, + "learning_rate": 3.9789915966386555e-05, + "loss": 0.6234, + "step": 5113 + }, + { + "epoch": 6.54592, + "grad_norm": 0.7986263632774353, + "learning_rate": 3.9787915166066434e-05, + "loss": 0.6337, + "step": 5114 + }, + { + "epoch": 6.5472, + "grad_norm": 0.7898523807525635, + "learning_rate": 3.97859143657463e-05, + "loss": 0.6222, + "step": 5115 + }, + { + "epoch": 6.54848, + "grad_norm": 0.7434815764427185, + "learning_rate": 3.978391356542617e-05, + "loss": 0.6167, + "step": 5116 + }, + { + "epoch": 6.54976, + "grad_norm": 0.7909148931503296, + "learning_rate": 3.978191276510604e-05, + "loss": 0.6231, + "step": 5117 + }, + { + "epoch": 6.55104, + "grad_norm": 0.7848829030990601, + "learning_rate": 3.9779911964785914e-05, + "loss": 0.6086, + "step": 5118 + }, + { + "epoch": 6.55232, + "grad_norm": 0.7340389490127563, + "learning_rate": 3.9777911164465786e-05, + "loss": 0.5672, + "step": 5119 + }, + { + "epoch": 6.5536, + "grad_norm": 0.8030784726142883, + "learning_rate": 3.977591036414566e-05, + "loss": 0.5965, + "step": 5120 + }, + { + "epoch": 6.55488, + "grad_norm": 0.783450186252594, + "learning_rate": 3.977390956382554e-05, + "loss": 0.6128, + "step": 5121 + }, + { + "epoch": 6.55616, + "grad_norm": 0.7899596095085144, + "learning_rate": 3.977190876350541e-05, + "loss": 0.6482, + "step": 5122 + }, + { + "epoch": 6.55744, + "grad_norm": 
0.7937018275260925, + "learning_rate": 3.9769907963185274e-05, + "loss": 0.6245, + "step": 5123 + }, + { + "epoch": 6.55872, + "grad_norm": 0.7522284388542175, + "learning_rate": 3.9767907162865145e-05, + "loss": 0.5998, + "step": 5124 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 0.7983690500259399, + "learning_rate": 3.976590636254502e-05, + "loss": 0.6299, + "step": 5125 + }, + { + "epoch": 6.56128, + "grad_norm": 0.7660322785377502, + "learning_rate": 3.976390556222489e-05, + "loss": 0.6138, + "step": 5126 + }, + { + "epoch": 6.5625599999999995, + "grad_norm": 0.8127145767211914, + "learning_rate": 3.976190476190476e-05, + "loss": 0.5622, + "step": 5127 + }, + { + "epoch": 6.56384, + "grad_norm": 0.8016258478164673, + "learning_rate": 3.975990396158464e-05, + "loss": 0.6375, + "step": 5128 + }, + { + "epoch": 6.56512, + "grad_norm": 0.756418764591217, + "learning_rate": 3.975790316126451e-05, + "loss": 0.6423, + "step": 5129 + }, + { + "epoch": 6.5664, + "grad_norm": 0.7691940069198608, + "learning_rate": 3.975590236094438e-05, + "loss": 0.6323, + "step": 5130 + }, + { + "epoch": 6.56768, + "grad_norm": 0.7534066438674927, + "learning_rate": 3.975390156062425e-05, + "loss": 0.6174, + "step": 5131 + }, + { + "epoch": 6.56896, + "grad_norm": 0.8069169521331787, + "learning_rate": 3.975190076030412e-05, + "loss": 0.6324, + "step": 5132 + }, + { + "epoch": 6.57024, + "grad_norm": 0.7579085230827332, + "learning_rate": 3.974989995998399e-05, + "loss": 0.6137, + "step": 5133 + }, + { + "epoch": 6.57152, + "grad_norm": 0.759647011756897, + "learning_rate": 3.9747899159663864e-05, + "loss": 0.597, + "step": 5134 + }, + { + "epoch": 6.5728, + "grad_norm": 0.7449114918708801, + "learning_rate": 3.974589835934374e-05, + "loss": 0.5865, + "step": 5135 + }, + { + "epoch": 6.57408, + "grad_norm": 0.7468611001968384, + "learning_rate": 3.9743897559023614e-05, + "loss": 0.5255, + "step": 5136 + }, + { + "epoch": 6.57536, + "grad_norm": 0.8116476535797119, + 
"learning_rate": 3.9741896758703486e-05, + "loss": 0.6977, + "step": 5137 + }, + { + "epoch": 6.57664, + "grad_norm": 0.7768375873565674, + "learning_rate": 3.973989595838336e-05, + "loss": 0.6375, + "step": 5138 + }, + { + "epoch": 6.57792, + "grad_norm": 0.751890242099762, + "learning_rate": 3.973789515806322e-05, + "loss": 0.5899, + "step": 5139 + }, + { + "epoch": 6.5792, + "grad_norm": 0.7473777532577515, + "learning_rate": 3.9735894357743095e-05, + "loss": 0.5912, + "step": 5140 + }, + { + "epoch": 6.58048, + "grad_norm": 0.7794525623321533, + "learning_rate": 3.973389355742297e-05, + "loss": 0.6331, + "step": 5141 + }, + { + "epoch": 6.58176, + "grad_norm": 0.7751368880271912, + "learning_rate": 3.9731892757102846e-05, + "loss": 0.627, + "step": 5142 + }, + { + "epoch": 6.5830400000000004, + "grad_norm": 0.7366233468055725, + "learning_rate": 3.972989195678272e-05, + "loss": 0.6061, + "step": 5143 + }, + { + "epoch": 6.58432, + "grad_norm": 0.7520456910133362, + "learning_rate": 3.972789115646259e-05, + "loss": 0.6021, + "step": 5144 + }, + { + "epoch": 6.5856, + "grad_norm": 0.7733432650566101, + "learning_rate": 3.972589035614246e-05, + "loss": 0.5873, + "step": 5145 + }, + { + "epoch": 6.58688, + "grad_norm": 0.7415776252746582, + "learning_rate": 3.972388955582233e-05, + "loss": 0.5845, + "step": 5146 + }, + { + "epoch": 6.58816, + "grad_norm": 0.7746995091438293, + "learning_rate": 3.97218887555022e-05, + "loss": 0.5892, + "step": 5147 + }, + { + "epoch": 6.58944, + "grad_norm": 0.736659049987793, + "learning_rate": 3.971988795518207e-05, + "loss": 0.5841, + "step": 5148 + }, + { + "epoch": 6.59072, + "grad_norm": 0.7600561380386353, + "learning_rate": 3.971788715486195e-05, + "loss": 0.5765, + "step": 5149 + }, + { + "epoch": 6.592, + "grad_norm": 0.7675446271896362, + "learning_rate": 3.971588635454182e-05, + "loss": 0.6319, + "step": 5150 + }, + { + "epoch": 6.59328, + "grad_norm": 0.7852813601493835, + "learning_rate": 3.971388555422169e-05, + 
"loss": 0.6026, + "step": 5151 + }, + { + "epoch": 6.5945599999999995, + "grad_norm": 0.7649344801902771, + "learning_rate": 3.9711884753901564e-05, + "loss": 0.6173, + "step": 5152 + }, + { + "epoch": 6.59584, + "grad_norm": 0.7568708062171936, + "learning_rate": 3.9709883953581436e-05, + "loss": 0.6124, + "step": 5153 + }, + { + "epoch": 6.59712, + "grad_norm": 0.7136748433113098, + "learning_rate": 3.970788315326131e-05, + "loss": 0.6117, + "step": 5154 + }, + { + "epoch": 6.5984, + "grad_norm": 0.8341229557991028, + "learning_rate": 3.970588235294117e-05, + "loss": 0.6245, + "step": 5155 + }, + { + "epoch": 6.59968, + "grad_norm": 0.8061261773109436, + "learning_rate": 3.970388155262105e-05, + "loss": 0.6325, + "step": 5156 + }, + { + "epoch": 6.60096, + "grad_norm": 0.7360426187515259, + "learning_rate": 3.9701880752300923e-05, + "loss": 0.6087, + "step": 5157 + }, + { + "epoch": 6.60224, + "grad_norm": 0.7481448650360107, + "learning_rate": 3.9699879951980795e-05, + "loss": 0.5674, + "step": 5158 + }, + { + "epoch": 6.60352, + "grad_norm": 0.7844505310058594, + "learning_rate": 3.969787915166067e-05, + "loss": 0.6471, + "step": 5159 + }, + { + "epoch": 6.6048, + "grad_norm": 0.8189229965209961, + "learning_rate": 3.969587835134054e-05, + "loss": 0.6488, + "step": 5160 + }, + { + "epoch": 6.60608, + "grad_norm": 0.7833296656608582, + "learning_rate": 3.969387755102041e-05, + "loss": 0.6595, + "step": 5161 + }, + { + "epoch": 6.60736, + "grad_norm": 0.7521294951438904, + "learning_rate": 3.969187675070028e-05, + "loss": 0.6137, + "step": 5162 + }, + { + "epoch": 6.60864, + "grad_norm": 0.7876133918762207, + "learning_rate": 3.9689875950380155e-05, + "loss": 0.6154, + "step": 5163 + }, + { + "epoch": 6.60992, + "grad_norm": 0.7582443952560425, + "learning_rate": 3.9687875150060026e-05, + "loss": 0.5877, + "step": 5164 + }, + { + "epoch": 6.6112, + "grad_norm": 0.7658904790878296, + "learning_rate": 3.96858743497399e-05, + "loss": 0.647, + "step": 5165 + }, + { + 
"epoch": 6.61248, + "grad_norm": 0.7374891638755798, + "learning_rate": 3.968387354941977e-05, + "loss": 0.6219, + "step": 5166 + }, + { + "epoch": 6.61376, + "grad_norm": 0.7132728695869446, + "learning_rate": 3.968187274909964e-05, + "loss": 0.5317, + "step": 5167 + }, + { + "epoch": 6.6150400000000005, + "grad_norm": 0.7678626775741577, + "learning_rate": 3.9679871948779514e-05, + "loss": 0.5931, + "step": 5168 + }, + { + "epoch": 6.61632, + "grad_norm": 0.8119401335716248, + "learning_rate": 3.9677871148459386e-05, + "loss": 0.6636, + "step": 5169 + }, + { + "epoch": 6.6176, + "grad_norm": 0.7630581259727478, + "learning_rate": 3.967587034813926e-05, + "loss": 0.6401, + "step": 5170 + }, + { + "epoch": 6.61888, + "grad_norm": 0.7518944144248962, + "learning_rate": 3.967386954781913e-05, + "loss": 0.6174, + "step": 5171 + }, + { + "epoch": 6.62016, + "grad_norm": 0.7786079049110413, + "learning_rate": 3.9671868747499e-05, + "loss": 0.6526, + "step": 5172 + }, + { + "epoch": 6.62144, + "grad_norm": 0.8287822604179382, + "learning_rate": 3.966986794717887e-05, + "loss": 0.713, + "step": 5173 + }, + { + "epoch": 6.62272, + "grad_norm": 0.7201914191246033, + "learning_rate": 3.9667867146858745e-05, + "loss": 0.5696, + "step": 5174 + }, + { + "epoch": 6.624, + "grad_norm": 0.7631100416183472, + "learning_rate": 3.966586634653862e-05, + "loss": 0.6302, + "step": 5175 + }, + { + "epoch": 6.62528, + "grad_norm": 0.769008994102478, + "learning_rate": 3.966386554621849e-05, + "loss": 0.5921, + "step": 5176 + }, + { + "epoch": 6.62656, + "grad_norm": 0.8017969727516174, + "learning_rate": 3.966186474589836e-05, + "loss": 0.6579, + "step": 5177 + }, + { + "epoch": 6.62784, + "grad_norm": 0.7824108600616455, + "learning_rate": 3.965986394557823e-05, + "loss": 0.6594, + "step": 5178 + }, + { + "epoch": 6.62912, + "grad_norm": 0.7433846592903137, + "learning_rate": 3.9657863145258104e-05, + "loss": 0.624, + "step": 5179 + }, + { + "epoch": 6.6304, + "grad_norm": 
0.7518701553344727, + "learning_rate": 3.9655862344937976e-05, + "loss": 0.6058, + "step": 5180 + }, + { + "epoch": 6.63168, + "grad_norm": 0.7567682266235352, + "learning_rate": 3.965386154461785e-05, + "loss": 0.6081, + "step": 5181 + }, + { + "epoch": 6.63296, + "grad_norm": 0.7529020309448242, + "learning_rate": 3.965186074429772e-05, + "loss": 0.6016, + "step": 5182 + }, + { + "epoch": 6.63424, + "grad_norm": 0.7778772115707397, + "learning_rate": 3.964985994397759e-05, + "loss": 0.6077, + "step": 5183 + }, + { + "epoch": 6.63552, + "grad_norm": 0.7256116271018982, + "learning_rate": 3.964785914365747e-05, + "loss": 0.5214, + "step": 5184 + }, + { + "epoch": 6.6368, + "grad_norm": 0.820444643497467, + "learning_rate": 3.9645858343337335e-05, + "loss": 0.6515, + "step": 5185 + }, + { + "epoch": 6.63808, + "grad_norm": 0.7796686291694641, + "learning_rate": 3.964385754301721e-05, + "loss": 0.6436, + "step": 5186 + }, + { + "epoch": 6.63936, + "grad_norm": 0.7803254127502441, + "learning_rate": 3.964185674269708e-05, + "loss": 0.6102, + "step": 5187 + }, + { + "epoch": 6.64064, + "grad_norm": 0.7486667037010193, + "learning_rate": 3.963985594237695e-05, + "loss": 0.65, + "step": 5188 + }, + { + "epoch": 6.64192, + "grad_norm": 0.7695333361625671, + "learning_rate": 3.963785514205682e-05, + "loss": 0.6523, + "step": 5189 + }, + { + "epoch": 6.6432, + "grad_norm": 0.7668389678001404, + "learning_rate": 3.9635854341736695e-05, + "loss": 0.6391, + "step": 5190 + }, + { + "epoch": 6.64448, + "grad_norm": 0.7639618515968323, + "learning_rate": 3.963385354141657e-05, + "loss": 0.6011, + "step": 5191 + }, + { + "epoch": 6.64576, + "grad_norm": 0.7803028225898743, + "learning_rate": 3.9631852741096445e-05, + "loss": 0.6203, + "step": 5192 + }, + { + "epoch": 6.64704, + "grad_norm": 0.7438964247703552, + "learning_rate": 3.962985194077631e-05, + "loss": 0.5911, + "step": 5193 + }, + { + "epoch": 6.64832, + "grad_norm": 0.7312869429588318, + "learning_rate": 
3.962785114045618e-05, + "loss": 0.5731, + "step": 5194 + }, + { + "epoch": 6.6495999999999995, + "grad_norm": 0.7577333450317383, + "learning_rate": 3.9625850340136054e-05, + "loss": 0.6012, + "step": 5195 + }, + { + "epoch": 6.65088, + "grad_norm": 0.7712530493736267, + "learning_rate": 3.9623849539815926e-05, + "loss": 0.6512, + "step": 5196 + }, + { + "epoch": 6.65216, + "grad_norm": 0.7147469520568848, + "learning_rate": 3.96218487394958e-05, + "loss": 0.6153, + "step": 5197 + }, + { + "epoch": 6.65344, + "grad_norm": 0.7817697525024414, + "learning_rate": 3.961984793917567e-05, + "loss": 0.6256, + "step": 5198 + }, + { + "epoch": 6.65472, + "grad_norm": 0.7692977786064148, + "learning_rate": 3.961784713885555e-05, + "loss": 0.5991, + "step": 5199 + }, + { + "epoch": 6.656, + "grad_norm": 0.7339797616004944, + "learning_rate": 3.961584633853542e-05, + "loss": 0.5563, + "step": 5200 + }, + { + "epoch": 6.65728, + "grad_norm": 0.7555981278419495, + "learning_rate": 3.9613845538215285e-05, + "loss": 0.6133, + "step": 5201 + }, + { + "epoch": 6.65856, + "grad_norm": 0.7478959560394287, + "learning_rate": 3.961184473789516e-05, + "loss": 0.614, + "step": 5202 + }, + { + "epoch": 6.65984, + "grad_norm": 0.7598313689231873, + "learning_rate": 3.960984393757503e-05, + "loss": 0.6197, + "step": 5203 + }, + { + "epoch": 6.66112, + "grad_norm": 0.7408673763275146, + "learning_rate": 3.96078431372549e-05, + "loss": 0.5736, + "step": 5204 + }, + { + "epoch": 6.6624, + "grad_norm": 0.8013281226158142, + "learning_rate": 3.960584233693477e-05, + "loss": 0.6783, + "step": 5205 + }, + { + "epoch": 6.66368, + "grad_norm": 0.7539663910865784, + "learning_rate": 3.960384153661465e-05, + "loss": 0.5824, + "step": 5206 + }, + { + "epoch": 6.66496, + "grad_norm": 0.7687118649482727, + "learning_rate": 3.960184073629452e-05, + "loss": 0.6185, + "step": 5207 + }, + { + "epoch": 6.66624, + "grad_norm": 0.803480327129364, + "learning_rate": 3.9599839935974395e-05, + "loss": 0.5955, + 
"step": 5208 + }, + { + "epoch": 6.66752, + "grad_norm": 0.7527867555618286, + "learning_rate": 3.959783913565426e-05, + "loss": 0.6128, + "step": 5209 + }, + { + "epoch": 6.6688, + "grad_norm": 0.7492735981941223, + "learning_rate": 3.959583833533413e-05, + "loss": 0.6112, + "step": 5210 + }, + { + "epoch": 6.6700800000000005, + "grad_norm": 0.757946252822876, + "learning_rate": 3.9593837535014004e-05, + "loss": 0.57, + "step": 5211 + }, + { + "epoch": 6.67136, + "grad_norm": 0.7378002405166626, + "learning_rate": 3.9591836734693876e-05, + "loss": 0.5691, + "step": 5212 + }, + { + "epoch": 6.67264, + "grad_norm": 0.7848967909812927, + "learning_rate": 3.9589835934373754e-05, + "loss": 0.6498, + "step": 5213 + }, + { + "epoch": 6.67392, + "grad_norm": 0.7497094869613647, + "learning_rate": 3.9587835134053626e-05, + "loss": 0.5919, + "step": 5214 + }, + { + "epoch": 6.6752, + "grad_norm": 0.7710285782814026, + "learning_rate": 3.95858343337335e-05, + "loss": 0.623, + "step": 5215 + }, + { + "epoch": 6.67648, + "grad_norm": 0.7714881300926208, + "learning_rate": 3.958383353341337e-05, + "loss": 0.6167, + "step": 5216 + }, + { + "epoch": 6.67776, + "grad_norm": 0.8070691823959351, + "learning_rate": 3.9581832733093235e-05, + "loss": 0.627, + "step": 5217 + }, + { + "epoch": 6.67904, + "grad_norm": 0.7932981848716736, + "learning_rate": 3.957983193277311e-05, + "loss": 0.6538, + "step": 5218 + }, + { + "epoch": 6.68032, + "grad_norm": 0.7265870571136475, + "learning_rate": 3.957783113245298e-05, + "loss": 0.5791, + "step": 5219 + }, + { + "epoch": 6.6815999999999995, + "grad_norm": 0.7722101211547852, + "learning_rate": 3.957583033213286e-05, + "loss": 0.6188, + "step": 5220 + }, + { + "epoch": 6.68288, + "grad_norm": 0.7940667271614075, + "learning_rate": 3.957382953181273e-05, + "loss": 0.5663, + "step": 5221 + }, + { + "epoch": 6.68416, + "grad_norm": 0.7684674263000488, + "learning_rate": 3.95718287314926e-05, + "loss": 0.6121, + "step": 5222 + }, + { + "epoch": 
6.68544, + "grad_norm": 0.7839207053184509, + "learning_rate": 3.956982793117247e-05, + "loss": 0.6378, + "step": 5223 + }, + { + "epoch": 6.68672, + "grad_norm": 0.6971110701560974, + "learning_rate": 3.9567827130852345e-05, + "loss": 0.5632, + "step": 5224 + }, + { + "epoch": 6.688, + "grad_norm": 0.7478116750717163, + "learning_rate": 3.956582633053221e-05, + "loss": 0.6001, + "step": 5225 + }, + { + "epoch": 6.68928, + "grad_norm": 0.7529265880584717, + "learning_rate": 3.956382553021208e-05, + "loss": 0.647, + "step": 5226 + }, + { + "epoch": 6.69056, + "grad_norm": 0.7169869542121887, + "learning_rate": 3.956182472989196e-05, + "loss": 0.609, + "step": 5227 + }, + { + "epoch": 6.69184, + "grad_norm": 0.7155503034591675, + "learning_rate": 3.955982392957183e-05, + "loss": 0.5816, + "step": 5228 + }, + { + "epoch": 6.69312, + "grad_norm": 0.7647979855537415, + "learning_rate": 3.9557823129251704e-05, + "loss": 0.6386, + "step": 5229 + }, + { + "epoch": 6.6944, + "grad_norm": 0.717627227306366, + "learning_rate": 3.9555822328931576e-05, + "loss": 0.6124, + "step": 5230 + }, + { + "epoch": 6.69568, + "grad_norm": 0.7740793824195862, + "learning_rate": 3.955382152861145e-05, + "loss": 0.6819, + "step": 5231 + }, + { + "epoch": 6.69696, + "grad_norm": 0.7482873797416687, + "learning_rate": 3.955182072829132e-05, + "loss": 0.5816, + "step": 5232 + }, + { + "epoch": 6.69824, + "grad_norm": 0.8073453903198242, + "learning_rate": 3.9549819927971185e-05, + "loss": 0.6147, + "step": 5233 + }, + { + "epoch": 6.69952, + "grad_norm": 0.775709867477417, + "learning_rate": 3.954781912765106e-05, + "loss": 0.6217, + "step": 5234 + }, + { + "epoch": 6.7008, + "grad_norm": 0.8085676431655884, + "learning_rate": 3.9545818327330935e-05, + "loss": 0.6666, + "step": 5235 + }, + { + "epoch": 6.7020800000000005, + "grad_norm": 0.7595890164375305, + "learning_rate": 3.954381752701081e-05, + "loss": 0.64, + "step": 5236 + }, + { + "epoch": 6.70336, + "grad_norm": 0.7835769653320312, + 
"learning_rate": 3.954181672669068e-05, + "loss": 0.663, + "step": 5237 + }, + { + "epoch": 6.70464, + "grad_norm": 0.797363817691803, + "learning_rate": 3.953981592637055e-05, + "loss": 0.6874, + "step": 5238 + }, + { + "epoch": 6.70592, + "grad_norm": 0.7739813923835754, + "learning_rate": 3.953781512605042e-05, + "loss": 0.5998, + "step": 5239 + }, + { + "epoch": 6.7072, + "grad_norm": 0.7902897000312805, + "learning_rate": 3.9535814325730294e-05, + "loss": 0.6097, + "step": 5240 + }, + { + "epoch": 6.70848, + "grad_norm": 0.756549060344696, + "learning_rate": 3.9533813525410166e-05, + "loss": 0.6184, + "step": 5241 + }, + { + "epoch": 6.70976, + "grad_norm": 0.73802250623703, + "learning_rate": 3.953181272509004e-05, + "loss": 0.5682, + "step": 5242 + }, + { + "epoch": 6.71104, + "grad_norm": 0.8189078569412231, + "learning_rate": 3.952981192476991e-05, + "loss": 0.651, + "step": 5243 + }, + { + "epoch": 6.71232, + "grad_norm": 0.7824820280075073, + "learning_rate": 3.952781112444978e-05, + "loss": 0.6341, + "step": 5244 + }, + { + "epoch": 6.7136, + "grad_norm": 0.788250207901001, + "learning_rate": 3.9525810324129654e-05, + "loss": 0.5819, + "step": 5245 + }, + { + "epoch": 6.71488, + "grad_norm": 0.7493855953216553, + "learning_rate": 3.9523809523809526e-05, + "loss": 0.6157, + "step": 5246 + }, + { + "epoch": 6.71616, + "grad_norm": 0.7709637880325317, + "learning_rate": 3.95218087234894e-05, + "loss": 0.6511, + "step": 5247 + }, + { + "epoch": 6.71744, + "grad_norm": 0.7652736306190491, + "learning_rate": 3.951980792316927e-05, + "loss": 0.5601, + "step": 5248 + }, + { + "epoch": 6.71872, + "grad_norm": 0.8294394612312317, + "learning_rate": 3.951780712284914e-05, + "loss": 0.6263, + "step": 5249 + }, + { + "epoch": 6.72, + "grad_norm": 0.7636884450912476, + "learning_rate": 3.951580632252901e-05, + "loss": 0.5633, + "step": 5250 + }, + { + "epoch": 6.72128, + "grad_norm": 0.7744940519332886, + "learning_rate": 3.9513805522208885e-05, + "loss": 0.6129, + 
"step": 5251 + }, + { + "epoch": 6.72256, + "grad_norm": 0.8224226236343384, + "learning_rate": 3.951180472188876e-05, + "loss": 0.6453, + "step": 5252 + }, + { + "epoch": 6.72384, + "grad_norm": 0.7555708885192871, + "learning_rate": 3.950980392156863e-05, + "loss": 0.6153, + "step": 5253 + }, + { + "epoch": 6.72512, + "grad_norm": 0.7619531750679016, + "learning_rate": 3.95078031212485e-05, + "loss": 0.6237, + "step": 5254 + }, + { + "epoch": 6.7264, + "grad_norm": 0.7484665513038635, + "learning_rate": 3.950580232092838e-05, + "loss": 0.5713, + "step": 5255 + }, + { + "epoch": 6.72768, + "grad_norm": 0.8010841608047485, + "learning_rate": 3.9503801520608244e-05, + "loss": 0.6829, + "step": 5256 + }, + { + "epoch": 6.72896, + "grad_norm": 0.7087874412536621, + "learning_rate": 3.9501800720288116e-05, + "loss": 0.6001, + "step": 5257 + }, + { + "epoch": 6.73024, + "grad_norm": 0.7901442050933838, + "learning_rate": 3.949979991996799e-05, + "loss": 0.6637, + "step": 5258 + }, + { + "epoch": 6.73152, + "grad_norm": 0.78165602684021, + "learning_rate": 3.949779911964786e-05, + "loss": 0.654, + "step": 5259 + }, + { + "epoch": 6.7328, + "grad_norm": 0.768568754196167, + "learning_rate": 3.949579831932773e-05, + "loss": 0.6469, + "step": 5260 + }, + { + "epoch": 6.73408, + "grad_norm": 0.7904379963874817, + "learning_rate": 3.94937975190076e-05, + "loss": 0.6348, + "step": 5261 + }, + { + "epoch": 6.73536, + "grad_norm": 0.7367870211601257, + "learning_rate": 3.949179671868748e-05, + "loss": 0.5939, + "step": 5262 + }, + { + "epoch": 6.7366399999999995, + "grad_norm": 0.7719533443450928, + "learning_rate": 3.9489795918367354e-05, + "loss": 0.6251, + "step": 5263 + }, + { + "epoch": 6.73792, + "grad_norm": 0.7805905938148499, + "learning_rate": 3.948779511804722e-05, + "loss": 0.5785, + "step": 5264 + }, + { + "epoch": 6.7392, + "grad_norm": 0.7767636775970459, + "learning_rate": 3.948579431772709e-05, + "loss": 0.5854, + "step": 5265 + }, + { + "epoch": 6.74048, + 
"grad_norm": 0.7528958916664124, + "learning_rate": 3.948379351740696e-05, + "loss": 0.5972, + "step": 5266 + }, + { + "epoch": 6.74176, + "grad_norm": 0.7544768452644348, + "learning_rate": 3.9481792717086835e-05, + "loss": 0.6143, + "step": 5267 + }, + { + "epoch": 6.74304, + "grad_norm": 0.8174731731414795, + "learning_rate": 3.9479791916766706e-05, + "loss": 0.6503, + "step": 5268 + }, + { + "epoch": 6.74432, + "grad_norm": 0.7854768633842468, + "learning_rate": 3.9477791116446585e-05, + "loss": 0.6503, + "step": 5269 + }, + { + "epoch": 6.7456, + "grad_norm": 0.7965456247329712, + "learning_rate": 3.947579031612646e-05, + "loss": 0.6077, + "step": 5270 + }, + { + "epoch": 6.74688, + "grad_norm": 0.7633908987045288, + "learning_rate": 3.947378951580633e-05, + "loss": 0.6433, + "step": 5271 + }, + { + "epoch": 6.74816, + "grad_norm": 0.7727089524269104, + "learning_rate": 3.9471788715486194e-05, + "loss": 0.6058, + "step": 5272 + }, + { + "epoch": 6.74944, + "grad_norm": 0.7607753276824951, + "learning_rate": 3.9469787915166066e-05, + "loss": 0.6032, + "step": 5273 + }, + { + "epoch": 6.75072, + "grad_norm": 0.7452098727226257, + "learning_rate": 3.946778711484594e-05, + "loss": 0.581, + "step": 5274 + }, + { + "epoch": 6.752, + "grad_norm": 0.7961545586585999, + "learning_rate": 3.946578631452581e-05, + "loss": 0.6433, + "step": 5275 + }, + { + "epoch": 6.75328, + "grad_norm": 0.7878684401512146, + "learning_rate": 3.946378551420569e-05, + "loss": 0.6343, + "step": 5276 + }, + { + "epoch": 6.75456, + "grad_norm": 0.7796621322631836, + "learning_rate": 3.946178471388556e-05, + "loss": 0.6007, + "step": 5277 + }, + { + "epoch": 6.75584, + "grad_norm": 0.7304415106773376, + "learning_rate": 3.945978391356543e-05, + "loss": 0.5637, + "step": 5278 + }, + { + "epoch": 6.7571200000000005, + "grad_norm": 0.8092875480651855, + "learning_rate": 3.9457783113245304e-05, + "loss": 0.6487, + "step": 5279 + }, + { + "epoch": 6.7584, + "grad_norm": 0.7983285188674927, + 
"learning_rate": 3.945578231292517e-05, + "loss": 0.6291, + "step": 5280 + }, + { + "epoch": 6.75968, + "grad_norm": 0.7523446679115295, + "learning_rate": 3.945378151260504e-05, + "loss": 0.5886, + "step": 5281 + }, + { + "epoch": 6.76096, + "grad_norm": 0.7880832552909851, + "learning_rate": 3.945178071228491e-05, + "loss": 0.6386, + "step": 5282 + }, + { + "epoch": 6.76224, + "grad_norm": 0.7898958921432495, + "learning_rate": 3.944977991196479e-05, + "loss": 0.616, + "step": 5283 + }, + { + "epoch": 6.76352, + "grad_norm": 0.7464529275894165, + "learning_rate": 3.944777911164466e-05, + "loss": 0.5919, + "step": 5284 + }, + { + "epoch": 6.7648, + "grad_norm": 0.7837912440299988, + "learning_rate": 3.9445778311324535e-05, + "loss": 0.6378, + "step": 5285 + }, + { + "epoch": 6.76608, + "grad_norm": 0.7559735774993896, + "learning_rate": 3.9443777511004407e-05, + "loss": 0.6033, + "step": 5286 + }, + { + "epoch": 6.76736, + "grad_norm": 0.7761905193328857, + "learning_rate": 3.944177671068428e-05, + "loss": 0.6288, + "step": 5287 + }, + { + "epoch": 6.7686399999999995, + "grad_norm": 0.7161477208137512, + "learning_rate": 3.9439775910364143e-05, + "loss": 0.595, + "step": 5288 + }, + { + "epoch": 6.76992, + "grad_norm": 0.7817723155021667, + "learning_rate": 3.9437775110044015e-05, + "loss": 0.6474, + "step": 5289 + }, + { + "epoch": 6.7712, + "grad_norm": 0.7308269739151001, + "learning_rate": 3.9435774309723894e-05, + "loss": 0.6068, + "step": 5290 + }, + { + "epoch": 6.77248, + "grad_norm": 0.8274186253547668, + "learning_rate": 3.9433773509403766e-05, + "loss": 0.6928, + "step": 5291 + }, + { + "epoch": 6.77376, + "grad_norm": 0.7699143886566162, + "learning_rate": 3.943177270908364e-05, + "loss": 0.636, + "step": 5292 + }, + { + "epoch": 6.77504, + "grad_norm": 0.7616050243377686, + "learning_rate": 3.942977190876351e-05, + "loss": 0.6021, + "step": 5293 + }, + { + "epoch": 6.77632, + "grad_norm": 0.7531759142875671, + "learning_rate": 3.942777110844338e-05, + 
"loss": 0.5899, + "step": 5294 + }, + { + "epoch": 6.7776, + "grad_norm": 0.7871730327606201, + "learning_rate": 3.942577030812325e-05, + "loss": 0.6186, + "step": 5295 + }, + { + "epoch": 6.77888, + "grad_norm": 0.7265611290931702, + "learning_rate": 3.942376950780312e-05, + "loss": 0.5459, + "step": 5296 + }, + { + "epoch": 6.78016, + "grad_norm": 0.7594027519226074, + "learning_rate": 3.9421768707483e-05, + "loss": 0.6046, + "step": 5297 + }, + { + "epoch": 6.78144, + "grad_norm": 0.792248010635376, + "learning_rate": 3.941976790716287e-05, + "loss": 0.6421, + "step": 5298 + }, + { + "epoch": 6.78272, + "grad_norm": 0.8279687166213989, + "learning_rate": 3.941776710684274e-05, + "loss": 0.6959, + "step": 5299 + }, + { + "epoch": 6.784, + "grad_norm": 0.764121413230896, + "learning_rate": 3.941576630652261e-05, + "loss": 0.6368, + "step": 5300 + }, + { + "epoch": 6.78528, + "grad_norm": 0.7647132873535156, + "learning_rate": 3.9413765506202484e-05, + "loss": 0.6331, + "step": 5301 + }, + { + "epoch": 6.78656, + "grad_norm": 0.7814575433731079, + "learning_rate": 3.9411764705882356e-05, + "loss": 0.647, + "step": 5302 + }, + { + "epoch": 6.78784, + "grad_norm": 0.7337337732315063, + "learning_rate": 3.940976390556223e-05, + "loss": 0.5798, + "step": 5303 + }, + { + "epoch": 6.7891200000000005, + "grad_norm": 0.7567817568778992, + "learning_rate": 3.94077631052421e-05, + "loss": 0.6858, + "step": 5304 + }, + { + "epoch": 6.7904, + "grad_norm": 0.7290568351745605, + "learning_rate": 3.940576230492197e-05, + "loss": 0.6001, + "step": 5305 + }, + { + "epoch": 6.79168, + "grad_norm": 0.7724208235740662, + "learning_rate": 3.9403761504601844e-05, + "loss": 0.6107, + "step": 5306 + }, + { + "epoch": 6.79296, + "grad_norm": 0.7930365800857544, + "learning_rate": 3.9401760704281716e-05, + "loss": 0.6773, + "step": 5307 + }, + { + "epoch": 6.79424, + "grad_norm": 0.7742063999176025, + "learning_rate": 3.939975990396159e-05, + "loss": 0.6405, + "step": 5308 + }, + { + 
"epoch": 6.79552, + "grad_norm": 0.808784008026123, + "learning_rate": 3.939775910364146e-05, + "loss": 0.6853, + "step": 5309 + }, + { + "epoch": 6.7968, + "grad_norm": 0.7806289196014404, + "learning_rate": 3.939575830332133e-05, + "loss": 0.6544, + "step": 5310 + }, + { + "epoch": 6.79808, + "grad_norm": 0.7165932655334473, + "learning_rate": 3.93937575030012e-05, + "loss": 0.5988, + "step": 5311 + }, + { + "epoch": 6.79936, + "grad_norm": 0.7418404221534729, + "learning_rate": 3.9391756702681075e-05, + "loss": 0.6256, + "step": 5312 + }, + { + "epoch": 6.80064, + "grad_norm": 0.7737796902656555, + "learning_rate": 3.938975590236095e-05, + "loss": 0.5891, + "step": 5313 + }, + { + "epoch": 6.80192, + "grad_norm": 0.7775099873542786, + "learning_rate": 3.938775510204082e-05, + "loss": 0.6219, + "step": 5314 + }, + { + "epoch": 6.8032, + "grad_norm": 0.7535067796707153, + "learning_rate": 3.938575430172069e-05, + "loss": 0.5918, + "step": 5315 + }, + { + "epoch": 6.80448, + "grad_norm": 0.7628635764122009, + "learning_rate": 3.938375350140056e-05, + "loss": 0.5637, + "step": 5316 + }, + { + "epoch": 6.80576, + "grad_norm": 0.7356747984886169, + "learning_rate": 3.9381752701080434e-05, + "loss": 0.5817, + "step": 5317 + }, + { + "epoch": 6.80704, + "grad_norm": 0.7762408256530762, + "learning_rate": 3.9379751900760306e-05, + "loss": 0.6193, + "step": 5318 + }, + { + "epoch": 6.80832, + "grad_norm": 0.74337238073349, + "learning_rate": 3.937775110044018e-05, + "loss": 0.6156, + "step": 5319 + }, + { + "epoch": 6.8096, + "grad_norm": 0.7793139815330505, + "learning_rate": 3.937575030012005e-05, + "loss": 0.6768, + "step": 5320 + }, + { + "epoch": 6.81088, + "grad_norm": 0.7772572040557861, + "learning_rate": 3.937374949979992e-05, + "loss": 0.6463, + "step": 5321 + }, + { + "epoch": 6.81216, + "grad_norm": 0.7629466652870178, + "learning_rate": 3.9371748699479793e-05, + "loss": 0.6277, + "step": 5322 + }, + { + "epoch": 6.81344, + "grad_norm": 0.7419812083244324, + 
"learning_rate": 3.9369747899159665e-05, + "loss": 0.5655, + "step": 5323 + }, + { + "epoch": 6.81472, + "grad_norm": 0.7378270030021667, + "learning_rate": 3.936774709883954e-05, + "loss": 0.5853, + "step": 5324 + }, + { + "epoch": 6.816, + "grad_norm": 0.7700802683830261, + "learning_rate": 3.936574629851941e-05, + "loss": 0.6331, + "step": 5325 + }, + { + "epoch": 6.81728, + "grad_norm": 0.7285279631614685, + "learning_rate": 3.936374549819928e-05, + "loss": 0.6299, + "step": 5326 + }, + { + "epoch": 6.81856, + "grad_norm": 0.7254111766815186, + "learning_rate": 3.936174469787915e-05, + "loss": 0.6142, + "step": 5327 + }, + { + "epoch": 6.81984, + "grad_norm": 0.7612648606300354, + "learning_rate": 3.9359743897559025e-05, + "loss": 0.6056, + "step": 5328 + }, + { + "epoch": 6.82112, + "grad_norm": 0.7452360391616821, + "learning_rate": 3.9357743097238896e-05, + "loss": 0.6107, + "step": 5329 + }, + { + "epoch": 6.8224, + "grad_norm": 0.7449943423271179, + "learning_rate": 3.935574229691877e-05, + "loss": 0.6078, + "step": 5330 + }, + { + "epoch": 6.8236799999999995, + "grad_norm": 0.728469729423523, + "learning_rate": 3.935374149659864e-05, + "loss": 0.593, + "step": 5331 + }, + { + "epoch": 6.82496, + "grad_norm": 0.8360829949378967, + "learning_rate": 3.935174069627851e-05, + "loss": 0.6543, + "step": 5332 + }, + { + "epoch": 6.82624, + "grad_norm": 0.8084598779678345, + "learning_rate": 3.934973989595839e-05, + "loss": 0.662, + "step": 5333 + }, + { + "epoch": 6.82752, + "grad_norm": 0.758673369884491, + "learning_rate": 3.9347739095638256e-05, + "loss": 0.6426, + "step": 5334 + }, + { + "epoch": 6.8288, + "grad_norm": 0.7647375464439392, + "learning_rate": 3.934573829531813e-05, + "loss": 0.5502, + "step": 5335 + }, + { + "epoch": 6.83008, + "grad_norm": 0.7250184416770935, + "learning_rate": 3.9343737494998e-05, + "loss": 0.6173, + "step": 5336 + }, + { + "epoch": 6.83136, + "grad_norm": 0.7183948159217834, + "learning_rate": 3.934173669467787e-05, + 
"loss": 0.5485, + "step": 5337 + }, + { + "epoch": 6.83264, + "grad_norm": 0.7605459690093994, + "learning_rate": 3.933973589435774e-05, + "loss": 0.5859, + "step": 5338 + }, + { + "epoch": 6.83392, + "grad_norm": 0.7369337677955627, + "learning_rate": 3.9337735094037615e-05, + "loss": 0.6079, + "step": 5339 + }, + { + "epoch": 6.8352, + "grad_norm": 0.7497186660766602, + "learning_rate": 3.9335734293717494e-05, + "loss": 0.5758, + "step": 5340 + }, + { + "epoch": 6.83648, + "grad_norm": 0.76474928855896, + "learning_rate": 3.9333733493397365e-05, + "loss": 0.645, + "step": 5341 + }, + { + "epoch": 6.83776, + "grad_norm": 0.7687287926673889, + "learning_rate": 3.933173269307723e-05, + "loss": 0.6172, + "step": 5342 + }, + { + "epoch": 6.83904, + "grad_norm": 0.7535450458526611, + "learning_rate": 3.93297318927571e-05, + "loss": 0.5974, + "step": 5343 + }, + { + "epoch": 6.84032, + "grad_norm": 0.7823262214660645, + "learning_rate": 3.9327731092436974e-05, + "loss": 0.5917, + "step": 5344 + }, + { + "epoch": 6.8416, + "grad_norm": 0.765173077583313, + "learning_rate": 3.9325730292116846e-05, + "loss": 0.67, + "step": 5345 + }, + { + "epoch": 6.84288, + "grad_norm": 0.7360157370567322, + "learning_rate": 3.932372949179672e-05, + "loss": 0.6097, + "step": 5346 + }, + { + "epoch": 6.8441600000000005, + "grad_norm": 0.7717430591583252, + "learning_rate": 3.9321728691476597e-05, + "loss": 0.6088, + "step": 5347 + }, + { + "epoch": 6.84544, + "grad_norm": 0.7661362886428833, + "learning_rate": 3.931972789115647e-05, + "loss": 0.6418, + "step": 5348 + }, + { + "epoch": 6.84672, + "grad_norm": 0.7785105109214783, + "learning_rate": 3.931772709083634e-05, + "loss": 0.603, + "step": 5349 + }, + { + "epoch": 6.848, + "grad_norm": 0.7660150527954102, + "learning_rate": 3.9315726290516205e-05, + "loss": 0.6095, + "step": 5350 + }, + { + "epoch": 6.84928, + "grad_norm": 0.7388876676559448, + "learning_rate": 3.931372549019608e-05, + "loss": 0.5756, + "step": 5351 + }, + { + 
"epoch": 6.85056, + "grad_norm": 0.7814899682998657, + "learning_rate": 3.931172468987595e-05, + "loss": 0.6265, + "step": 5352 + }, + { + "epoch": 6.85184, + "grad_norm": 0.7103585600852966, + "learning_rate": 3.930972388955582e-05, + "loss": 0.5408, + "step": 5353 + }, + { + "epoch": 6.85312, + "grad_norm": 0.7650427222251892, + "learning_rate": 3.93077230892357e-05, + "loss": 0.6405, + "step": 5354 + }, + { + "epoch": 6.8544, + "grad_norm": 0.7595013380050659, + "learning_rate": 3.930572228891557e-05, + "loss": 0.6052, + "step": 5355 + }, + { + "epoch": 6.8556799999999996, + "grad_norm": 0.7787649631500244, + "learning_rate": 3.930372148859544e-05, + "loss": 0.6362, + "step": 5356 + }, + { + "epoch": 6.85696, + "grad_norm": 0.7371984720230103, + "learning_rate": 3.9301720688275315e-05, + "loss": 0.6212, + "step": 5357 + }, + { + "epoch": 6.85824, + "grad_norm": 0.755862295627594, + "learning_rate": 3.929971988795518e-05, + "loss": 0.5907, + "step": 5358 + }, + { + "epoch": 6.85952, + "grad_norm": 0.7287248969078064, + "learning_rate": 3.929771908763505e-05, + "loss": 0.5912, + "step": 5359 + }, + { + "epoch": 6.8608, + "grad_norm": 0.7712188363075256, + "learning_rate": 3.9295718287314924e-05, + "loss": 0.6261, + "step": 5360 + }, + { + "epoch": 6.86208, + "grad_norm": 0.8140777945518494, + "learning_rate": 3.92937174869948e-05, + "loss": 0.6052, + "step": 5361 + }, + { + "epoch": 6.86336, + "grad_norm": 0.7377832531929016, + "learning_rate": 3.9291716686674674e-05, + "loss": 0.5631, + "step": 5362 + }, + { + "epoch": 6.86464, + "grad_norm": 0.75379878282547, + "learning_rate": 3.9289715886354546e-05, + "loss": 0.6071, + "step": 5363 + }, + { + "epoch": 6.86592, + "grad_norm": 0.7823106646537781, + "learning_rate": 3.928771508603442e-05, + "loss": 0.6651, + "step": 5364 + }, + { + "epoch": 6.8672, + "grad_norm": 0.7541218400001526, + "learning_rate": 3.928571428571429e-05, + "loss": 0.6412, + "step": 5365 + }, + { + "epoch": 6.86848, + "grad_norm": 
0.7208373546600342, + "learning_rate": 3.9283713485394155e-05, + "loss": 0.605, + "step": 5366 + }, + { + "epoch": 6.86976, + "grad_norm": 0.7709495425224304, + "learning_rate": 3.928171268507403e-05, + "loss": 0.6041, + "step": 5367 + }, + { + "epoch": 6.87104, + "grad_norm": 0.8122734427452087, + "learning_rate": 3.9279711884753906e-05, + "loss": 0.6647, + "step": 5368 + }, + { + "epoch": 6.87232, + "grad_norm": 0.7816949486732483, + "learning_rate": 3.927771108443378e-05, + "loss": 0.6798, + "step": 5369 + }, + { + "epoch": 6.8736, + "grad_norm": 0.7514051198959351, + "learning_rate": 3.927571028411365e-05, + "loss": 0.5785, + "step": 5370 + }, + { + "epoch": 6.87488, + "grad_norm": 0.8315825462341309, + "learning_rate": 3.927370948379352e-05, + "loss": 0.6399, + "step": 5371 + }, + { + "epoch": 6.8761600000000005, + "grad_norm": 0.7676405310630798, + "learning_rate": 3.927170868347339e-05, + "loss": 0.6142, + "step": 5372 + }, + { + "epoch": 6.87744, + "grad_norm": 0.7865943312644958, + "learning_rate": 3.9269707883153265e-05, + "loss": 0.6319, + "step": 5373 + }, + { + "epoch": 6.8787199999999995, + "grad_norm": 0.7144403457641602, + "learning_rate": 3.926770708283313e-05, + "loss": 0.5566, + "step": 5374 + }, + { + "epoch": 6.88, + "grad_norm": 0.8031983971595764, + "learning_rate": 3.926570628251301e-05, + "loss": 0.6461, + "step": 5375 + }, + { + "epoch": 6.88128, + "grad_norm": 0.761948823928833, + "learning_rate": 3.926370548219288e-05, + "loss": 0.6207, + "step": 5376 + }, + { + "epoch": 6.88256, + "grad_norm": 0.7399935126304626, + "learning_rate": 3.926170468187275e-05, + "loss": 0.6265, + "step": 5377 + }, + { + "epoch": 6.88384, + "grad_norm": 0.7543573975563049, + "learning_rate": 3.9259703881552624e-05, + "loss": 0.6648, + "step": 5378 + }, + { + "epoch": 6.88512, + "grad_norm": 0.7618049383163452, + "learning_rate": 3.9257703081232496e-05, + "loss": 0.6135, + "step": 5379 + }, + { + "epoch": 6.8864, + "grad_norm": 0.7822875380516052, + 
"learning_rate": 3.925570228091237e-05, + "loss": 0.6238, + "step": 5380 + }, + { + "epoch": 6.88768, + "grad_norm": 0.7895389795303345, + "learning_rate": 3.925370148059224e-05, + "loss": 0.613, + "step": 5381 + }, + { + "epoch": 6.88896, + "grad_norm": 0.7635522484779358, + "learning_rate": 3.925170068027211e-05, + "loss": 0.581, + "step": 5382 + }, + { + "epoch": 6.89024, + "grad_norm": 0.761661171913147, + "learning_rate": 3.9249699879951983e-05, + "loss": 0.5799, + "step": 5383 + }, + { + "epoch": 6.89152, + "grad_norm": 0.7741929888725281, + "learning_rate": 3.9247699079631855e-05, + "loss": 0.6283, + "step": 5384 + }, + { + "epoch": 6.8928, + "grad_norm": 0.7595140933990479, + "learning_rate": 3.924569827931173e-05, + "loss": 0.6398, + "step": 5385 + }, + { + "epoch": 6.89408, + "grad_norm": 0.7731903195381165, + "learning_rate": 3.92436974789916e-05, + "loss": 0.5852, + "step": 5386 + }, + { + "epoch": 6.89536, + "grad_norm": 0.7807347774505615, + "learning_rate": 3.924169667867147e-05, + "loss": 0.6452, + "step": 5387 + }, + { + "epoch": 6.89664, + "grad_norm": 0.7398995757102966, + "learning_rate": 3.923969587835134e-05, + "loss": 0.6283, + "step": 5388 + }, + { + "epoch": 6.89792, + "grad_norm": 0.7703590393066406, + "learning_rate": 3.9237695078031215e-05, + "loss": 0.5941, + "step": 5389 + }, + { + "epoch": 6.8992, + "grad_norm": 0.8060065507888794, + "learning_rate": 3.9235694277711086e-05, + "loss": 0.6347, + "step": 5390 + }, + { + "epoch": 6.90048, + "grad_norm": 0.7907054424285889, + "learning_rate": 3.923369347739096e-05, + "loss": 0.6734, + "step": 5391 + }, + { + "epoch": 6.90176, + "grad_norm": 0.7742048501968384, + "learning_rate": 3.923169267707083e-05, + "loss": 0.6362, + "step": 5392 + }, + { + "epoch": 6.90304, + "grad_norm": 0.7606136798858643, + "learning_rate": 3.92296918767507e-05, + "loss": 0.6501, + "step": 5393 + }, + { + "epoch": 6.90432, + "grad_norm": 0.7519256472587585, + "learning_rate": 3.9227691076430574e-05, + "loss": 
0.6411, + "step": 5394 + }, + { + "epoch": 6.9056, + "grad_norm": 0.7524474859237671, + "learning_rate": 3.9225690276110446e-05, + "loss": 0.6171, + "step": 5395 + }, + { + "epoch": 6.90688, + "grad_norm": 0.8051052093505859, + "learning_rate": 3.922368947579032e-05, + "loss": 0.6251, + "step": 5396 + }, + { + "epoch": 6.90816, + "grad_norm": 0.7711687684059143, + "learning_rate": 3.922168867547019e-05, + "loss": 0.6428, + "step": 5397 + }, + { + "epoch": 6.90944, + "grad_norm": 0.7528000473976135, + "learning_rate": 3.921968787515006e-05, + "loss": 0.6054, + "step": 5398 + }, + { + "epoch": 6.9107199999999995, + "grad_norm": 0.7490524649620056, + "learning_rate": 3.921768707482993e-05, + "loss": 0.5886, + "step": 5399 + }, + { + "epoch": 6.912, + "grad_norm": 0.7919133901596069, + "learning_rate": 3.9215686274509805e-05, + "loss": 0.5964, + "step": 5400 + }, + { + "epoch": 6.91328, + "grad_norm": 0.8205950856208801, + "learning_rate": 3.921368547418968e-05, + "loss": 0.6063, + "step": 5401 + }, + { + "epoch": 6.91456, + "grad_norm": 0.7822876572608948, + "learning_rate": 3.921168467386955e-05, + "loss": 0.5971, + "step": 5402 + }, + { + "epoch": 6.91584, + "grad_norm": 0.7439231276512146, + "learning_rate": 3.920968387354943e-05, + "loss": 0.5844, + "step": 5403 + }, + { + "epoch": 6.91712, + "grad_norm": 0.7648040056228638, + "learning_rate": 3.920768307322929e-05, + "loss": 0.6627, + "step": 5404 + }, + { + "epoch": 6.9184, + "grad_norm": 0.7816339135169983, + "learning_rate": 3.9205682272909164e-05, + "loss": 0.6154, + "step": 5405 + }, + { + "epoch": 6.91968, + "grad_norm": 0.7725468873977661, + "learning_rate": 3.9203681472589036e-05, + "loss": 0.6032, + "step": 5406 + }, + { + "epoch": 6.92096, + "grad_norm": 0.7589234709739685, + "learning_rate": 3.920168067226891e-05, + "loss": 0.6349, + "step": 5407 + }, + { + "epoch": 6.92224, + "grad_norm": 0.7441187500953674, + "learning_rate": 3.919967987194878e-05, + "loss": 0.5901, + "step": 5408 + }, + { + "epoch": 
6.92352, + "grad_norm": 0.8013135194778442, + "learning_rate": 3.919767907162865e-05, + "loss": 0.6455, + "step": 5409 + }, + { + "epoch": 6.9248, + "grad_norm": 0.7458381652832031, + "learning_rate": 3.919567827130853e-05, + "loss": 0.589, + "step": 5410 + }, + { + "epoch": 6.92608, + "grad_norm": 0.7533714771270752, + "learning_rate": 3.91936774709884e-05, + "loss": 0.5634, + "step": 5411 + }, + { + "epoch": 6.92736, + "grad_norm": 0.7798910140991211, + "learning_rate": 3.919167667066827e-05, + "loss": 0.6227, + "step": 5412 + }, + { + "epoch": 6.92864, + "grad_norm": 0.7700549960136414, + "learning_rate": 3.918967587034814e-05, + "loss": 0.6561, + "step": 5413 + }, + { + "epoch": 6.92992, + "grad_norm": 0.7472692728042603, + "learning_rate": 3.918767507002801e-05, + "loss": 0.6014, + "step": 5414 + }, + { + "epoch": 6.9312000000000005, + "grad_norm": 0.7239599227905273, + "learning_rate": 3.918567426970788e-05, + "loss": 0.5716, + "step": 5415 + }, + { + "epoch": 6.93248, + "grad_norm": 0.7691959738731384, + "learning_rate": 3.9183673469387755e-05, + "loss": 0.6603, + "step": 5416 + }, + { + "epoch": 6.93376, + "grad_norm": 0.8300781846046448, + "learning_rate": 3.918167266906763e-05, + "loss": 0.6178, + "step": 5417 + }, + { + "epoch": 6.93504, + "grad_norm": 0.7766384482383728, + "learning_rate": 3.9179671868747505e-05, + "loss": 0.6278, + "step": 5418 + }, + { + "epoch": 6.93632, + "grad_norm": 0.7828888297080994, + "learning_rate": 3.917767106842738e-05, + "loss": 0.6175, + "step": 5419 + }, + { + "epoch": 6.9376, + "grad_norm": 0.7322559356689453, + "learning_rate": 3.917567026810724e-05, + "loss": 0.5805, + "step": 5420 + }, + { + "epoch": 6.93888, + "grad_norm": 0.7959531545639038, + "learning_rate": 3.9173669467787114e-05, + "loss": 0.6813, + "step": 5421 + }, + { + "epoch": 6.94016, + "grad_norm": 0.7419981360435486, + "learning_rate": 3.9171668667466986e-05, + "loss": 0.6288, + "step": 5422 + }, + { + "epoch": 6.94144, + "grad_norm": 
0.7409440875053406, + "learning_rate": 3.916966786714686e-05, + "loss": 0.5965, + "step": 5423 + }, + { + "epoch": 6.94272, + "grad_norm": 0.7452877759933472, + "learning_rate": 3.916766706682673e-05, + "loss": 0.6342, + "step": 5424 + }, + { + "epoch": 6.944, + "grad_norm": 0.7688354849815369, + "learning_rate": 3.916566626650661e-05, + "loss": 0.611, + "step": 5425 + }, + { + "epoch": 6.94528, + "grad_norm": 0.7577010989189148, + "learning_rate": 3.916366546618648e-05, + "loss": 0.6594, + "step": 5426 + }, + { + "epoch": 6.94656, + "grad_norm": 0.7698047161102295, + "learning_rate": 3.916166466586635e-05, + "loss": 0.6134, + "step": 5427 + }, + { + "epoch": 6.94784, + "grad_norm": 0.7376213073730469, + "learning_rate": 3.915966386554622e-05, + "loss": 0.6034, + "step": 5428 + }, + { + "epoch": 6.94912, + "grad_norm": 0.7267906665802002, + "learning_rate": 3.915766306522609e-05, + "loss": 0.5529, + "step": 5429 + }, + { + "epoch": 6.9504, + "grad_norm": 0.7915275692939758, + "learning_rate": 3.915566226490596e-05, + "loss": 0.6388, + "step": 5430 + }, + { + "epoch": 6.95168, + "grad_norm": 0.8539730906486511, + "learning_rate": 3.915366146458583e-05, + "loss": 0.6788, + "step": 5431 + }, + { + "epoch": 6.95296, + "grad_norm": 0.808468222618103, + "learning_rate": 3.915166066426571e-05, + "loss": 0.6558, + "step": 5432 + }, + { + "epoch": 6.95424, + "grad_norm": 0.8144415616989136, + "learning_rate": 3.914965986394558e-05, + "loss": 0.6735, + "step": 5433 + }, + { + "epoch": 6.95552, + "grad_norm": 0.7447432279586792, + "learning_rate": 3.9147659063625455e-05, + "loss": 0.5727, + "step": 5434 + }, + { + "epoch": 6.9568, + "grad_norm": 0.7698988914489746, + "learning_rate": 3.914565826330533e-05, + "loss": 0.6138, + "step": 5435 + }, + { + "epoch": 6.95808, + "grad_norm": 0.8165004849433899, + "learning_rate": 3.914365746298519e-05, + "loss": 0.6303, + "step": 5436 + }, + { + "epoch": 6.95936, + "grad_norm": 0.771243155002594, + "learning_rate": 
3.9141656662665064e-05, + "loss": 0.6554, + "step": 5437 + }, + { + "epoch": 6.96064, + "grad_norm": 0.7581053376197815, + "learning_rate": 3.9139655862344936e-05, + "loss": 0.6171, + "step": 5438 + }, + { + "epoch": 6.96192, + "grad_norm": 0.7608379125595093, + "learning_rate": 3.9137655062024814e-05, + "loss": 0.5761, + "step": 5439 + }, + { + "epoch": 6.9632, + "grad_norm": 0.7522656321525574, + "learning_rate": 3.9135654261704686e-05, + "loss": 0.5646, + "step": 5440 + }, + { + "epoch": 6.96448, + "grad_norm": 0.7819010019302368, + "learning_rate": 3.913365346138456e-05, + "loss": 0.6509, + "step": 5441 + }, + { + "epoch": 6.9657599999999995, + "grad_norm": 0.783852756023407, + "learning_rate": 3.913165266106443e-05, + "loss": 0.6482, + "step": 5442 + }, + { + "epoch": 6.96704, + "grad_norm": 0.7638618350028992, + "learning_rate": 3.91296518607443e-05, + "loss": 0.6488, + "step": 5443 + }, + { + "epoch": 6.96832, + "grad_norm": 0.7872533798217773, + "learning_rate": 3.912765106042417e-05, + "loss": 0.6425, + "step": 5444 + }, + { + "epoch": 6.9696, + "grad_norm": 0.8017598986625671, + "learning_rate": 3.912565026010404e-05, + "loss": 0.6332, + "step": 5445 + }, + { + "epoch": 6.97088, + "grad_norm": 0.784069836139679, + "learning_rate": 3.912364945978392e-05, + "loss": 0.6307, + "step": 5446 + }, + { + "epoch": 6.97216, + "grad_norm": 0.8013083338737488, + "learning_rate": 3.912164865946379e-05, + "loss": 0.6808, + "step": 5447 + }, + { + "epoch": 6.97344, + "grad_norm": 0.7447139024734497, + "learning_rate": 3.911964785914366e-05, + "loss": 0.6048, + "step": 5448 + }, + { + "epoch": 6.97472, + "grad_norm": 0.7573011517524719, + "learning_rate": 3.911764705882353e-05, + "loss": 0.5808, + "step": 5449 + }, + { + "epoch": 6.976, + "grad_norm": 0.7706106901168823, + "learning_rate": 3.9115646258503405e-05, + "loss": 0.5773, + "step": 5450 + }, + { + "epoch": 6.97728, + "grad_norm": 0.8079164624214172, + "learning_rate": 3.9113645458183276e-05, + "loss": 0.6569, + 
"step": 5451 + }, + { + "epoch": 6.97856, + "grad_norm": 0.720331609249115, + "learning_rate": 3.911164465786314e-05, + "loss": 0.6154, + "step": 5452 + }, + { + "epoch": 6.97984, + "grad_norm": 0.7962987422943115, + "learning_rate": 3.910964385754302e-05, + "loss": 0.6097, + "step": 5453 + }, + { + "epoch": 6.98112, + "grad_norm": 0.7449235916137695, + "learning_rate": 3.910764305722289e-05, + "loss": 0.585, + "step": 5454 + }, + { + "epoch": 6.9824, + "grad_norm": 0.7487616539001465, + "learning_rate": 3.9105642256902764e-05, + "loss": 0.6123, + "step": 5455 + }, + { + "epoch": 6.98368, + "grad_norm": 0.775963544845581, + "learning_rate": 3.9103641456582636e-05, + "loss": 0.6661, + "step": 5456 + }, + { + "epoch": 6.98496, + "grad_norm": 0.710671603679657, + "learning_rate": 3.910164065626251e-05, + "loss": 0.5817, + "step": 5457 + }, + { + "epoch": 6.9862400000000004, + "grad_norm": 0.7331653237342834, + "learning_rate": 3.909963985594238e-05, + "loss": 0.5741, + "step": 5458 + }, + { + "epoch": 6.98752, + "grad_norm": 0.7193625569343567, + "learning_rate": 3.909763905562225e-05, + "loss": 0.5515, + "step": 5459 + }, + { + "epoch": 6.9888, + "grad_norm": 0.7541481852531433, + "learning_rate": 3.909563825530212e-05, + "loss": 0.595, + "step": 5460 + }, + { + "epoch": 6.99008, + "grad_norm": 0.7617225646972656, + "learning_rate": 3.9093637454981995e-05, + "loss": 0.6112, + "step": 5461 + }, + { + "epoch": 6.99136, + "grad_norm": 0.7736883163452148, + "learning_rate": 3.909163665466187e-05, + "loss": 0.6292, + "step": 5462 + }, + { + "epoch": 6.99264, + "grad_norm": 0.796480655670166, + "learning_rate": 3.908963585434174e-05, + "loss": 0.6122, + "step": 5463 + }, + { + "epoch": 6.99392, + "grad_norm": 0.7853350043296814, + "learning_rate": 3.908763505402161e-05, + "loss": 0.6188, + "step": 5464 + }, + { + "epoch": 6.9952, + "grad_norm": 0.7956370115280151, + "learning_rate": 3.908563425370148e-05, + "loss": 0.6699, + "step": 5465 + }, + { + "epoch": 6.99648, + 
"grad_norm": 0.7397189736366272, + "learning_rate": 3.9083633453381354e-05, + "loss": 0.647, + "step": 5466 + }, + { + "epoch": 6.9977599999999995, + "grad_norm": 0.6985254287719727, + "learning_rate": 3.9081632653061226e-05, + "loss": 0.5586, + "step": 5467 + }, + { + "epoch": 6.99904, + "grad_norm": 0.8187921047210693, + "learning_rate": 3.90796318527411e-05, + "loss": 0.6705, + "step": 5468 + }, + { + "epoch": 7.00032, + "grad_norm": 1.61716628074646, + "learning_rate": 3.907763105242097e-05, + "loss": 1.0948, + "step": 5469 + }, + { + "epoch": 7.0016, + "grad_norm": 0.7572510838508606, + "learning_rate": 3.907563025210084e-05, + "loss": 0.5787, + "step": 5470 + }, + { + "epoch": 7.00288, + "grad_norm": 0.7337214350700378, + "learning_rate": 3.9073629451780714e-05, + "loss": 0.579, + "step": 5471 + }, + { + "epoch": 7.00416, + "grad_norm": 0.7118316888809204, + "learning_rate": 3.9071628651460585e-05, + "loss": 0.5866, + "step": 5472 + }, + { + "epoch": 7.00544, + "grad_norm": 0.7408485412597656, + "learning_rate": 3.906962785114046e-05, + "loss": 0.6048, + "step": 5473 + }, + { + "epoch": 7.00672, + "grad_norm": 0.7442348003387451, + "learning_rate": 3.906762705082033e-05, + "loss": 0.5916, + "step": 5474 + }, + { + "epoch": 7.008, + "grad_norm": 0.7323760390281677, + "learning_rate": 3.90656262505002e-05, + "loss": 0.6146, + "step": 5475 + }, + { + "epoch": 7.00928, + "grad_norm": 0.7773112654685974, + "learning_rate": 3.906362545018007e-05, + "loss": 0.6156, + "step": 5476 + }, + { + "epoch": 7.01056, + "grad_norm": 0.7394047975540161, + "learning_rate": 3.9061624649859945e-05, + "loss": 0.5846, + "step": 5477 + }, + { + "epoch": 7.01184, + "grad_norm": 0.7753980755805969, + "learning_rate": 3.9059623849539817e-05, + "loss": 0.6151, + "step": 5478 + }, + { + "epoch": 7.01312, + "grad_norm": 0.7161932587623596, + "learning_rate": 3.905762304921969e-05, + "loss": 0.5535, + "step": 5479 + }, + { + "epoch": 7.0144, + "grad_norm": 0.7652511596679688, + 
"learning_rate": 3.905562224889956e-05, + "loss": 0.6409, + "step": 5480 + }, + { + "epoch": 7.01568, + "grad_norm": 0.7910155653953552, + "learning_rate": 3.905362144857944e-05, + "loss": 0.6257, + "step": 5481 + }, + { + "epoch": 7.01696, + "grad_norm": 0.7763440012931824, + "learning_rate": 3.9051620648259304e-05, + "loss": 0.5843, + "step": 5482 + }, + { + "epoch": 7.01824, + "grad_norm": 0.806786298751831, + "learning_rate": 3.9049619847939176e-05, + "loss": 0.5773, + "step": 5483 + }, + { + "epoch": 7.01952, + "grad_norm": 0.7779539227485657, + "learning_rate": 3.904761904761905e-05, + "loss": 0.6031, + "step": 5484 + }, + { + "epoch": 7.0208, + "grad_norm": 0.7639070153236389, + "learning_rate": 3.904561824729892e-05, + "loss": 0.5415, + "step": 5485 + }, + { + "epoch": 7.02208, + "grad_norm": 0.7557491064071655, + "learning_rate": 3.904361744697879e-05, + "loss": 0.5762, + "step": 5486 + }, + { + "epoch": 7.02336, + "grad_norm": 0.7613850235939026, + "learning_rate": 3.904161664665866e-05, + "loss": 0.6029, + "step": 5487 + }, + { + "epoch": 7.02464, + "grad_norm": 0.8083841800689697, + "learning_rate": 3.903961584633854e-05, + "loss": 0.6372, + "step": 5488 + }, + { + "epoch": 7.02592, + "grad_norm": 0.7833201885223389, + "learning_rate": 3.9037615046018414e-05, + "loss": 0.6152, + "step": 5489 + }, + { + "epoch": 7.0272, + "grad_norm": 0.7935197949409485, + "learning_rate": 3.903561424569828e-05, + "loss": 0.5762, + "step": 5490 + }, + { + "epoch": 7.02848, + "grad_norm": 0.7885215878486633, + "learning_rate": 3.903361344537815e-05, + "loss": 0.6361, + "step": 5491 + }, + { + "epoch": 7.02976, + "grad_norm": 0.770488440990448, + "learning_rate": 3.903161264505802e-05, + "loss": 0.6102, + "step": 5492 + }, + { + "epoch": 7.03104, + "grad_norm": 0.7559887170791626, + "learning_rate": 3.9029611844737894e-05, + "loss": 0.601, + "step": 5493 + }, + { + "epoch": 7.03232, + "grad_norm": 0.8430263996124268, + "learning_rate": 3.9027611044417766e-05, + "loss": 
0.6871, + "step": 5494 + }, + { + "epoch": 7.0336, + "grad_norm": 0.729887068271637, + "learning_rate": 3.9025610244097645e-05, + "loss": 0.5806, + "step": 5495 + }, + { + "epoch": 7.03488, + "grad_norm": 0.7851121425628662, + "learning_rate": 3.902360944377752e-05, + "loss": 0.5748, + "step": 5496 + }, + { + "epoch": 7.03616, + "grad_norm": 0.7651329040527344, + "learning_rate": 3.902160864345739e-05, + "loss": 0.6004, + "step": 5497 + }, + { + "epoch": 7.03744, + "grad_norm": 0.7575582265853882, + "learning_rate": 3.9019607843137254e-05, + "loss": 0.6509, + "step": 5498 + }, + { + "epoch": 7.03872, + "grad_norm": 0.7376059293746948, + "learning_rate": 3.9017607042817126e-05, + "loss": 0.5799, + "step": 5499 + }, + { + "epoch": 7.04, + "grad_norm": 0.723619282245636, + "learning_rate": 3.9015606242497e-05, + "loss": 0.6004, + "step": 5500 + }, + { + "epoch": 7.04128, + "grad_norm": 0.803107738494873, + "learning_rate": 3.901360544217687e-05, + "loss": 0.5998, + "step": 5501 + }, + { + "epoch": 7.04256, + "grad_norm": 0.8073145151138306, + "learning_rate": 3.901160464185675e-05, + "loss": 0.5843, + "step": 5502 + }, + { + "epoch": 7.04384, + "grad_norm": 0.8166995644569397, + "learning_rate": 3.900960384153662e-05, + "loss": 0.6536, + "step": 5503 + }, + { + "epoch": 7.04512, + "grad_norm": 0.7663634419441223, + "learning_rate": 3.900760304121649e-05, + "loss": 0.6081, + "step": 5504 + }, + { + "epoch": 7.0464, + "grad_norm": 0.8213377594947815, + "learning_rate": 3.9005602240896364e-05, + "loss": 0.5922, + "step": 5505 + }, + { + "epoch": 7.04768, + "grad_norm": 0.7828342318534851, + "learning_rate": 3.900360144057623e-05, + "loss": 0.6146, + "step": 5506 + }, + { + "epoch": 7.04896, + "grad_norm": 0.8070378303527832, + "learning_rate": 3.90016006402561e-05, + "loss": 0.6231, + "step": 5507 + }, + { + "epoch": 7.05024, + "grad_norm": 0.7503546476364136, + "learning_rate": 3.899959983993597e-05, + "loss": 0.5776, + "step": 5508 + }, + { + "epoch": 7.05152, + 
"grad_norm": 0.8065857291221619, + "learning_rate": 3.899759903961585e-05, + "loss": 0.6244, + "step": 5509 + }, + { + "epoch": 7.0528, + "grad_norm": 0.7999621629714966, + "learning_rate": 3.899559823929572e-05, + "loss": 0.5803, + "step": 5510 + }, + { + "epoch": 7.05408, + "grad_norm": 0.7979943752288818, + "learning_rate": 3.8993597438975595e-05, + "loss": 0.5895, + "step": 5511 + }, + { + "epoch": 7.05536, + "grad_norm": 0.8131271004676819, + "learning_rate": 3.8991596638655467e-05, + "loss": 0.615, + "step": 5512 + }, + { + "epoch": 7.05664, + "grad_norm": 0.7797995209693909, + "learning_rate": 3.898959583833534e-05, + "loss": 0.5712, + "step": 5513 + }, + { + "epoch": 7.05792, + "grad_norm": 0.8302449584007263, + "learning_rate": 3.8987595038015203e-05, + "loss": 0.6598, + "step": 5514 + }, + { + "epoch": 7.0592, + "grad_norm": 0.817923367023468, + "learning_rate": 3.8985594237695075e-05, + "loss": 0.627, + "step": 5515 + }, + { + "epoch": 7.06048, + "grad_norm": 0.8435928821563721, + "learning_rate": 3.8983593437374954e-05, + "loss": 0.6078, + "step": 5516 + }, + { + "epoch": 7.06176, + "grad_norm": 0.7841606736183167, + "learning_rate": 3.8981592637054826e-05, + "loss": 0.6069, + "step": 5517 + }, + { + "epoch": 7.06304, + "grad_norm": 0.7932061553001404, + "learning_rate": 3.89795918367347e-05, + "loss": 0.5993, + "step": 5518 + }, + { + "epoch": 7.06432, + "grad_norm": 0.8158665895462036, + "learning_rate": 3.897759103641457e-05, + "loss": 0.6412, + "step": 5519 + }, + { + "epoch": 7.0656, + "grad_norm": 0.7837883830070496, + "learning_rate": 3.897559023609444e-05, + "loss": 0.6011, + "step": 5520 + }, + { + "epoch": 7.06688, + "grad_norm": 0.8409795761108398, + "learning_rate": 3.897358943577431e-05, + "loss": 0.5954, + "step": 5521 + }, + { + "epoch": 7.06816, + "grad_norm": 0.818547785282135, + "learning_rate": 3.897158863545418e-05, + "loss": 0.6266, + "step": 5522 + }, + { + "epoch": 7.06944, + "grad_norm": 0.8386947512626648, + "learning_rate": 
3.896958783513406e-05, + "loss": 0.6724, + "step": 5523 + }, + { + "epoch": 7.07072, + "grad_norm": 0.7939981818199158, + "learning_rate": 3.896758703481393e-05, + "loss": 0.571, + "step": 5524 + }, + { + "epoch": 7.072, + "grad_norm": 0.8104544281959534, + "learning_rate": 3.89655862344938e-05, + "loss": 0.5906, + "step": 5525 + }, + { + "epoch": 7.07328, + "grad_norm": 0.805988609790802, + "learning_rate": 3.896358543417367e-05, + "loss": 0.6188, + "step": 5526 + }, + { + "epoch": 7.07456, + "grad_norm": 0.7390158176422119, + "learning_rate": 3.8961584633853544e-05, + "loss": 0.5718, + "step": 5527 + }, + { + "epoch": 7.07584, + "grad_norm": 0.798747181892395, + "learning_rate": 3.8959583833533416e-05, + "loss": 0.5949, + "step": 5528 + }, + { + "epoch": 7.07712, + "grad_norm": 0.8065986633300781, + "learning_rate": 3.895758303321329e-05, + "loss": 0.6211, + "step": 5529 + }, + { + "epoch": 7.0784, + "grad_norm": 0.8075457215309143, + "learning_rate": 3.895558223289316e-05, + "loss": 0.5692, + "step": 5530 + }, + { + "epoch": 7.07968, + "grad_norm": 0.8485049605369568, + "learning_rate": 3.895358143257303e-05, + "loss": 0.6845, + "step": 5531 + }, + { + "epoch": 7.08096, + "grad_norm": 0.7971506118774414, + "learning_rate": 3.8951580632252904e-05, + "loss": 0.615, + "step": 5532 + }, + { + "epoch": 7.08224, + "grad_norm": 0.7371413707733154, + "learning_rate": 3.8949579831932776e-05, + "loss": 0.5338, + "step": 5533 + }, + { + "epoch": 7.08352, + "grad_norm": 0.7684740424156189, + "learning_rate": 3.894757903161265e-05, + "loss": 0.5948, + "step": 5534 + }, + { + "epoch": 7.0848, + "grad_norm": 0.8026983737945557, + "learning_rate": 3.894557823129252e-05, + "loss": 0.5749, + "step": 5535 + }, + { + "epoch": 7.08608, + "grad_norm": 0.8087561726570129, + "learning_rate": 3.894357743097239e-05, + "loss": 0.604, + "step": 5536 + }, + { + "epoch": 7.08736, + "grad_norm": 0.7562788724899292, + "learning_rate": 3.894157663065226e-05, + "loss": 0.5552, + "step": 5537 + 
}, + { + "epoch": 7.08864, + "grad_norm": 0.8130525946617126, + "learning_rate": 3.8939575830332135e-05, + "loss": 0.6516, + "step": 5538 + }, + { + "epoch": 7.08992, + "grad_norm": 0.7703153491020203, + "learning_rate": 3.893757503001201e-05, + "loss": 0.5952, + "step": 5539 + }, + { + "epoch": 7.0912, + "grad_norm": 0.8727670311927795, + "learning_rate": 3.893557422969188e-05, + "loss": 0.6897, + "step": 5540 + }, + { + "epoch": 7.09248, + "grad_norm": 0.7629865407943726, + "learning_rate": 3.893357342937175e-05, + "loss": 0.6019, + "step": 5541 + }, + { + "epoch": 7.09376, + "grad_norm": 0.7899230122566223, + "learning_rate": 3.893157262905162e-05, + "loss": 0.6197, + "step": 5542 + }, + { + "epoch": 7.09504, + "grad_norm": 0.7999109625816345, + "learning_rate": 3.8929571828731494e-05, + "loss": 0.6019, + "step": 5543 + }, + { + "epoch": 7.09632, + "grad_norm": 0.7331894040107727, + "learning_rate": 3.8927571028411366e-05, + "loss": 0.5198, + "step": 5544 + }, + { + "epoch": 7.0976, + "grad_norm": 0.8195477724075317, + "learning_rate": 3.892557022809124e-05, + "loss": 0.6371, + "step": 5545 + }, + { + "epoch": 7.09888, + "grad_norm": 0.8432417511940002, + "learning_rate": 3.892356942777111e-05, + "loss": 0.6299, + "step": 5546 + }, + { + "epoch": 7.10016, + "grad_norm": 0.8083561062812805, + "learning_rate": 3.892156862745098e-05, + "loss": 0.587, + "step": 5547 + }, + { + "epoch": 7.10144, + "grad_norm": 0.8173826932907104, + "learning_rate": 3.891956782713085e-05, + "loss": 0.6131, + "step": 5548 + }, + { + "epoch": 7.10272, + "grad_norm": 0.786679208278656, + "learning_rate": 3.8917567026810725e-05, + "loss": 0.5952, + "step": 5549 + }, + { + "epoch": 7.104, + "grad_norm": 0.7349452376365662, + "learning_rate": 3.89155662264906e-05, + "loss": 0.5611, + "step": 5550 + }, + { + "epoch": 7.10528, + "grad_norm": 0.7959557771682739, + "learning_rate": 3.891356542617047e-05, + "loss": 0.6117, + "step": 5551 + }, + { + "epoch": 7.10656, + "grad_norm": 
0.7580567002296448, + "learning_rate": 3.891156462585034e-05, + "loss": 0.6021, + "step": 5552 + }, + { + "epoch": 7.10784, + "grad_norm": 0.797226071357727, + "learning_rate": 3.890956382553021e-05, + "loss": 0.6083, + "step": 5553 + }, + { + "epoch": 7.10912, + "grad_norm": 0.8558824062347412, + "learning_rate": 3.8907563025210084e-05, + "loss": 0.6534, + "step": 5554 + }, + { + "epoch": 7.1104, + "grad_norm": 0.7972661256790161, + "learning_rate": 3.8905562224889956e-05, + "loss": 0.6106, + "step": 5555 + }, + { + "epoch": 7.11168, + "grad_norm": 0.8266356587409973, + "learning_rate": 3.890356142456983e-05, + "loss": 0.6189, + "step": 5556 + }, + { + "epoch": 7.11296, + "grad_norm": 0.8113858103752136, + "learning_rate": 3.89015606242497e-05, + "loss": 0.5916, + "step": 5557 + }, + { + "epoch": 7.11424, + "grad_norm": 0.7940702438354492, + "learning_rate": 3.889955982392957e-05, + "loss": 0.6358, + "step": 5558 + }, + { + "epoch": 7.11552, + "grad_norm": 0.8410483598709106, + "learning_rate": 3.889755902360945e-05, + "loss": 0.6397, + "step": 5559 + }, + { + "epoch": 7.1168, + "grad_norm": 0.897203266620636, + "learning_rate": 3.8895558223289316e-05, + "loss": 0.6482, + "step": 5560 + }, + { + "epoch": 7.11808, + "grad_norm": 0.781025230884552, + "learning_rate": 3.889355742296919e-05, + "loss": 0.5471, + "step": 5561 + }, + { + "epoch": 7.11936, + "grad_norm": 0.7567470073699951, + "learning_rate": 3.889155662264906e-05, + "loss": 0.5391, + "step": 5562 + }, + { + "epoch": 7.12064, + "grad_norm": 0.8244337439537048, + "learning_rate": 3.888955582232893e-05, + "loss": 0.6146, + "step": 5563 + }, + { + "epoch": 7.12192, + "grad_norm": 0.8109177947044373, + "learning_rate": 3.88875550220088e-05, + "loss": 0.5691, + "step": 5564 + }, + { + "epoch": 7.1232, + "grad_norm": 0.7812331914901733, + "learning_rate": 3.8885554221688675e-05, + "loss": 0.6075, + "step": 5565 + }, + { + "epoch": 7.12448, + "grad_norm": 0.7723196148872375, + "learning_rate": 
3.8883553421368554e-05, + "loss": 0.6153, + "step": 5566 + }, + { + "epoch": 7.12576, + "grad_norm": 0.790398120880127, + "learning_rate": 3.8881552621048425e-05, + "loss": 0.6043, + "step": 5567 + }, + { + "epoch": 7.12704, + "grad_norm": 0.7210956811904907, + "learning_rate": 3.887955182072829e-05, + "loss": 0.5861, + "step": 5568 + }, + { + "epoch": 7.12832, + "grad_norm": 0.7791348695755005, + "learning_rate": 3.887755102040816e-05, + "loss": 0.618, + "step": 5569 + }, + { + "epoch": 7.1296, + "grad_norm": 0.793732225894928, + "learning_rate": 3.8875550220088034e-05, + "loss": 0.6127, + "step": 5570 + }, + { + "epoch": 7.13088, + "grad_norm": 0.7834117412567139, + "learning_rate": 3.8873549419767906e-05, + "loss": 0.5885, + "step": 5571 + }, + { + "epoch": 7.13216, + "grad_norm": 0.8224747180938721, + "learning_rate": 3.887154861944778e-05, + "loss": 0.6073, + "step": 5572 + }, + { + "epoch": 7.13344, + "grad_norm": 0.7827395796775818, + "learning_rate": 3.8869547819127657e-05, + "loss": 0.5888, + "step": 5573 + }, + { + "epoch": 7.13472, + "grad_norm": 0.7737107872962952, + "learning_rate": 3.886754701880753e-05, + "loss": 0.6167, + "step": 5574 + }, + { + "epoch": 7.136, + "grad_norm": 0.8110948801040649, + "learning_rate": 3.88655462184874e-05, + "loss": 0.6296, + "step": 5575 + }, + { + "epoch": 7.13728, + "grad_norm": 0.7978016138076782, + "learning_rate": 3.8863545418167265e-05, + "loss": 0.6459, + "step": 5576 + }, + { + "epoch": 7.13856, + "grad_norm": 0.7949130535125732, + "learning_rate": 3.886154461784714e-05, + "loss": 0.622, + "step": 5577 + }, + { + "epoch": 7.13984, + "grad_norm": 0.7578536868095398, + "learning_rate": 3.885954381752701e-05, + "loss": 0.5487, + "step": 5578 + }, + { + "epoch": 7.14112, + "grad_norm": 0.778209388256073, + "learning_rate": 3.885754301720688e-05, + "loss": 0.6221, + "step": 5579 + }, + { + "epoch": 7.1424, + "grad_norm": 0.7351183891296387, + "learning_rate": 3.885554221688676e-05, + "loss": 0.5419, + "step": 5580 + 
}, + { + "epoch": 7.14368, + "grad_norm": 0.811617374420166, + "learning_rate": 3.885354141656663e-05, + "loss": 0.6545, + "step": 5581 + }, + { + "epoch": 7.14496, + "grad_norm": 0.7293868064880371, + "learning_rate": 3.88515406162465e-05, + "loss": 0.5763, + "step": 5582 + }, + { + "epoch": 7.14624, + "grad_norm": 0.7808999419212341, + "learning_rate": 3.8849539815926375e-05, + "loss": 0.5435, + "step": 5583 + }, + { + "epoch": 7.14752, + "grad_norm": 0.7816929221153259, + "learning_rate": 3.884753901560624e-05, + "loss": 0.6055, + "step": 5584 + }, + { + "epoch": 7.1488, + "grad_norm": 0.8008720278739929, + "learning_rate": 3.884553821528611e-05, + "loss": 0.6337, + "step": 5585 + }, + { + "epoch": 7.15008, + "grad_norm": 0.7841701507568359, + "learning_rate": 3.8843537414965984e-05, + "loss": 0.6371, + "step": 5586 + }, + { + "epoch": 7.15136, + "grad_norm": 0.7534654140472412, + "learning_rate": 3.884153661464586e-05, + "loss": 0.6057, + "step": 5587 + }, + { + "epoch": 7.15264, + "grad_norm": 0.7844194769859314, + "learning_rate": 3.8839535814325734e-05, + "loss": 0.6094, + "step": 5588 + }, + { + "epoch": 7.15392, + "grad_norm": 0.8300028443336487, + "learning_rate": 3.8837535014005606e-05, + "loss": 0.6388, + "step": 5589 + }, + { + "epoch": 7.1552, + "grad_norm": 0.7904089093208313, + "learning_rate": 3.883553421368548e-05, + "loss": 0.601, + "step": 5590 + }, + { + "epoch": 7.15648, + "grad_norm": 0.7863451838493347, + "learning_rate": 3.883353341336535e-05, + "loss": 0.6357, + "step": 5591 + }, + { + "epoch": 7.15776, + "grad_norm": 0.7628374695777893, + "learning_rate": 3.8831532613045215e-05, + "loss": 0.5687, + "step": 5592 + }, + { + "epoch": 7.15904, + "grad_norm": 0.8106046915054321, + "learning_rate": 3.882953181272509e-05, + "loss": 0.642, + "step": 5593 + }, + { + "epoch": 7.16032, + "grad_norm": 0.7651321887969971, + "learning_rate": 3.8827531012404966e-05, + "loss": 0.608, + "step": 5594 + }, + { + "epoch": 7.1616, + "grad_norm": 
0.7789268493652344, + "learning_rate": 3.882553021208484e-05, + "loss": 0.6187, + "step": 5595 + }, + { + "epoch": 7.16288, + "grad_norm": 0.7903891801834106, + "learning_rate": 3.882352941176471e-05, + "loss": 0.5915, + "step": 5596 + }, + { + "epoch": 7.16416, + "grad_norm": 0.7863123416900635, + "learning_rate": 3.882152861144458e-05, + "loss": 0.6097, + "step": 5597 + }, + { + "epoch": 7.16544, + "grad_norm": 0.7845132350921631, + "learning_rate": 3.881952781112445e-05, + "loss": 0.5963, + "step": 5598 + }, + { + "epoch": 7.16672, + "grad_norm": 0.7682058811187744, + "learning_rate": 3.8817527010804325e-05, + "loss": 0.5804, + "step": 5599 + }, + { + "epoch": 7.168, + "grad_norm": 0.7563507556915283, + "learning_rate": 3.881552621048419e-05, + "loss": 0.5514, + "step": 5600 + }, + { + "epoch": 7.16928, + "grad_norm": 0.7621744275093079, + "learning_rate": 3.881352541016407e-05, + "loss": 0.5595, + "step": 5601 + }, + { + "epoch": 7.17056, + "grad_norm": 0.773902416229248, + "learning_rate": 3.881152460984394e-05, + "loss": 0.5483, + "step": 5602 + }, + { + "epoch": 7.1718399999999995, + "grad_norm": 0.7652880549430847, + "learning_rate": 3.880952380952381e-05, + "loss": 0.5439, + "step": 5603 + }, + { + "epoch": 7.17312, + "grad_norm": 0.8013029098510742, + "learning_rate": 3.8807523009203684e-05, + "loss": 0.583, + "step": 5604 + }, + { + "epoch": 7.1744, + "grad_norm": 0.7590806484222412, + "learning_rate": 3.8805522208883556e-05, + "loss": 0.569, + "step": 5605 + }, + { + "epoch": 7.17568, + "grad_norm": 0.8396809697151184, + "learning_rate": 3.880352140856343e-05, + "loss": 0.6549, + "step": 5606 + }, + { + "epoch": 7.17696, + "grad_norm": 0.796314537525177, + "learning_rate": 3.88015206082433e-05, + "loss": 0.6438, + "step": 5607 + }, + { + "epoch": 7.17824, + "grad_norm": 0.7749351859092712, + "learning_rate": 3.879951980792317e-05, + "loss": 0.5674, + "step": 5608 + }, + { + "epoch": 7.17952, + "grad_norm": 0.7739863395690918, + "learning_rate": 
3.8797519007603043e-05, + "loss": 0.5771, + "step": 5609 + }, + { + "epoch": 7.1808, + "grad_norm": 0.808387279510498, + "learning_rate": 3.8795518207282915e-05, + "loss": 0.5839, + "step": 5610 + }, + { + "epoch": 7.18208, + "grad_norm": 0.7463328838348389, + "learning_rate": 3.879351740696279e-05, + "loss": 0.5557, + "step": 5611 + }, + { + "epoch": 7.18336, + "grad_norm": 0.7860255837440491, + "learning_rate": 3.879151660664266e-05, + "loss": 0.5787, + "step": 5612 + }, + { + "epoch": 7.18464, + "grad_norm": 0.841917097568512, + "learning_rate": 3.878951580632253e-05, + "loss": 0.6375, + "step": 5613 + }, + { + "epoch": 7.18592, + "grad_norm": 0.8427509665489197, + "learning_rate": 3.87875150060024e-05, + "loss": 0.6356, + "step": 5614 + }, + { + "epoch": 7.1872, + "grad_norm": 0.8131973147392273, + "learning_rate": 3.8785514205682275e-05, + "loss": 0.6143, + "step": 5615 + }, + { + "epoch": 7.18848, + "grad_norm": 0.8563538193702698, + "learning_rate": 3.8783513405362146e-05, + "loss": 0.6376, + "step": 5616 + }, + { + "epoch": 7.18976, + "grad_norm": 0.8263671398162842, + "learning_rate": 3.878151260504202e-05, + "loss": 0.5873, + "step": 5617 + }, + { + "epoch": 7.19104, + "grad_norm": 0.7478400468826294, + "learning_rate": 3.877951180472189e-05, + "loss": 0.5703, + "step": 5618 + }, + { + "epoch": 7.19232, + "grad_norm": 0.7577512860298157, + "learning_rate": 3.877751100440176e-05, + "loss": 0.5557, + "step": 5619 + }, + { + "epoch": 7.1936, + "grad_norm": 0.7820968627929688, + "learning_rate": 3.8775510204081634e-05, + "loss": 0.5891, + "step": 5620 + }, + { + "epoch": 7.19488, + "grad_norm": 0.7701216340065002, + "learning_rate": 3.8773509403761506e-05, + "loss": 0.6057, + "step": 5621 + }, + { + "epoch": 7.19616, + "grad_norm": 0.7920276522636414, + "learning_rate": 3.8771508603441384e-05, + "loss": 0.5919, + "step": 5622 + }, + { + "epoch": 7.19744, + "grad_norm": 0.8216814398765564, + "learning_rate": 3.876950780312125e-05, + "loss": 0.6177, + "step": 
5623 + }, + { + "epoch": 7.19872, + "grad_norm": 0.8043842911720276, + "learning_rate": 3.876750700280112e-05, + "loss": 0.5935, + "step": 5624 + }, + { + "epoch": 7.2, + "grad_norm": 0.7516394853591919, + "learning_rate": 3.876550620248099e-05, + "loss": 0.5912, + "step": 5625 + }, + { + "epoch": 7.20128, + "grad_norm": 0.7623422145843506, + "learning_rate": 3.8763505402160865e-05, + "loss": 0.5907, + "step": 5626 + }, + { + "epoch": 7.20256, + "grad_norm": 0.8073017597198486, + "learning_rate": 3.876150460184074e-05, + "loss": 0.6344, + "step": 5627 + }, + { + "epoch": 7.20384, + "grad_norm": 0.7998178005218506, + "learning_rate": 3.875950380152061e-05, + "loss": 0.6338, + "step": 5628 + }, + { + "epoch": 7.20512, + "grad_norm": 0.8303821086883545, + "learning_rate": 3.875750300120049e-05, + "loss": 0.6152, + "step": 5629 + }, + { + "epoch": 7.2064, + "grad_norm": 0.758705735206604, + "learning_rate": 3.875550220088036e-05, + "loss": 0.5706, + "step": 5630 + }, + { + "epoch": 7.20768, + "grad_norm": 0.7329617738723755, + "learning_rate": 3.8753501400560224e-05, + "loss": 0.5728, + "step": 5631 + }, + { + "epoch": 7.20896, + "grad_norm": 0.7714441418647766, + "learning_rate": 3.8751500600240096e-05, + "loss": 0.6087, + "step": 5632 + }, + { + "epoch": 7.21024, + "grad_norm": 0.8715619444847107, + "learning_rate": 3.874949979991997e-05, + "loss": 0.6635, + "step": 5633 + }, + { + "epoch": 7.21152, + "grad_norm": 0.7906916737556458, + "learning_rate": 3.874749899959984e-05, + "loss": 0.5903, + "step": 5634 + }, + { + "epoch": 7.2128, + "grad_norm": 0.7834184765815735, + "learning_rate": 3.874549819927971e-05, + "loss": 0.6234, + "step": 5635 + }, + { + "epoch": 7.21408, + "grad_norm": 0.7641972899436951, + "learning_rate": 3.874349739895959e-05, + "loss": 0.5475, + "step": 5636 + }, + { + "epoch": 7.21536, + "grad_norm": 0.7720842957496643, + "learning_rate": 3.874149659863946e-05, + "loss": 0.5907, + "step": 5637 + }, + { + "epoch": 7.21664, + "grad_norm": 
0.8000057339668274, + "learning_rate": 3.8739495798319334e-05, + "loss": 0.6073, + "step": 5638 + }, + { + "epoch": 7.21792, + "grad_norm": 0.7984986901283264, + "learning_rate": 3.87374949979992e-05, + "loss": 0.5555, + "step": 5639 + }, + { + "epoch": 7.2192, + "grad_norm": 0.7842923998832703, + "learning_rate": 3.873549419767907e-05, + "loss": 0.5406, + "step": 5640 + }, + { + "epoch": 7.22048, + "grad_norm": 0.7952711582183838, + "learning_rate": 3.873349339735894e-05, + "loss": 0.6319, + "step": 5641 + }, + { + "epoch": 7.22176, + "grad_norm": 0.7931072115898132, + "learning_rate": 3.8731492597038815e-05, + "loss": 0.623, + "step": 5642 + }, + { + "epoch": 7.22304, + "grad_norm": 0.8495784997940063, + "learning_rate": 3.872949179671869e-05, + "loss": 0.6935, + "step": 5643 + }, + { + "epoch": 7.22432, + "grad_norm": 0.7872684001922607, + "learning_rate": 3.8727490996398565e-05, + "loss": 0.608, + "step": 5644 + }, + { + "epoch": 7.2256, + "grad_norm": 0.7937207221984863, + "learning_rate": 3.872549019607844e-05, + "loss": 0.6136, + "step": 5645 + }, + { + "epoch": 7.22688, + "grad_norm": 0.8002104163169861, + "learning_rate": 3.872348939575831e-05, + "loss": 0.6445, + "step": 5646 + }, + { + "epoch": 7.22816, + "grad_norm": 0.7604816555976868, + "learning_rate": 3.8721488595438174e-05, + "loss": 0.5837, + "step": 5647 + }, + { + "epoch": 7.22944, + "grad_norm": 0.804574728012085, + "learning_rate": 3.8719487795118046e-05, + "loss": 0.6667, + "step": 5648 + }, + { + "epoch": 7.23072, + "grad_norm": 0.7907571792602539, + "learning_rate": 3.871748699479792e-05, + "loss": 0.6271, + "step": 5649 + }, + { + "epoch": 7.232, + "grad_norm": 0.7569406628608704, + "learning_rate": 3.871548619447779e-05, + "loss": 0.5681, + "step": 5650 + }, + { + "epoch": 7.23328, + "grad_norm": 0.7395563721656799, + "learning_rate": 3.871348539415767e-05, + "loss": 0.5615, + "step": 5651 + }, + { + "epoch": 7.23456, + "grad_norm": 0.7570566534996033, + "learning_rate": 
3.871148459383754e-05, + "loss": 0.5757, + "step": 5652 + }, + { + "epoch": 7.23584, + "grad_norm": 0.7983739376068115, + "learning_rate": 3.870948379351741e-05, + "loss": 0.5978, + "step": 5653 + }, + { + "epoch": 7.23712, + "grad_norm": 0.787769079208374, + "learning_rate": 3.8707482993197284e-05, + "loss": 0.5987, + "step": 5654 + }, + { + "epoch": 7.2384, + "grad_norm": 0.8369766473770142, + "learning_rate": 3.870548219287715e-05, + "loss": 0.6217, + "step": 5655 + }, + { + "epoch": 7.23968, + "grad_norm": 0.7803592681884766, + "learning_rate": 3.870348139255702e-05, + "loss": 0.5908, + "step": 5656 + }, + { + "epoch": 7.24096, + "grad_norm": 0.7458162307739258, + "learning_rate": 3.870148059223689e-05, + "loss": 0.5794, + "step": 5657 + }, + { + "epoch": 7.24224, + "grad_norm": 0.7700108885765076, + "learning_rate": 3.869947979191677e-05, + "loss": 0.6299, + "step": 5658 + }, + { + "epoch": 7.24352, + "grad_norm": 0.7748832702636719, + "learning_rate": 3.869747899159664e-05, + "loss": 0.5857, + "step": 5659 + }, + { + "epoch": 7.2448, + "grad_norm": 0.7574285864830017, + "learning_rate": 3.8695478191276515e-05, + "loss": 0.5838, + "step": 5660 + }, + { + "epoch": 7.24608, + "grad_norm": 0.8172944188117981, + "learning_rate": 3.869347739095639e-05, + "loss": 0.6803, + "step": 5661 + }, + { + "epoch": 7.24736, + "grad_norm": 0.79927659034729, + "learning_rate": 3.869147659063626e-05, + "loss": 0.5985, + "step": 5662 + }, + { + "epoch": 7.24864, + "grad_norm": 0.7518450617790222, + "learning_rate": 3.8689475790316124e-05, + "loss": 0.5637, + "step": 5663 + }, + { + "epoch": 7.24992, + "grad_norm": 0.7577493190765381, + "learning_rate": 3.8687474989995996e-05, + "loss": 0.5303, + "step": 5664 + }, + { + "epoch": 7.2512, + "grad_norm": 0.771763801574707, + "learning_rate": 3.8685474189675874e-05, + "loss": 0.5408, + "step": 5665 + }, + { + "epoch": 7.25248, + "grad_norm": 0.8028543591499329, + "learning_rate": 3.8683473389355746e-05, + "loss": 0.5607, + "step": 
5666 + }, + { + "epoch": 7.25376, + "grad_norm": 0.8177222013473511, + "learning_rate": 3.868147258903562e-05, + "loss": 0.5871, + "step": 5667 + }, + { + "epoch": 7.25504, + "grad_norm": 0.772655189037323, + "learning_rate": 3.867947178871549e-05, + "loss": 0.5901, + "step": 5668 + }, + { + "epoch": 7.25632, + "grad_norm": 0.8063568472862244, + "learning_rate": 3.867747098839536e-05, + "loss": 0.5927, + "step": 5669 + }, + { + "epoch": 7.2576, + "grad_norm": 0.8081983327865601, + "learning_rate": 3.8675470188075233e-05, + "loss": 0.6183, + "step": 5670 + }, + { + "epoch": 7.2588799999999996, + "grad_norm": 0.7956660985946655, + "learning_rate": 3.86734693877551e-05, + "loss": 0.5789, + "step": 5671 + }, + { + "epoch": 7.26016, + "grad_norm": 0.7728344798088074, + "learning_rate": 3.867146858743498e-05, + "loss": 0.5848, + "step": 5672 + }, + { + "epoch": 7.26144, + "grad_norm": 0.7772632837295532, + "learning_rate": 3.866946778711485e-05, + "loss": 0.5777, + "step": 5673 + }, + { + "epoch": 7.26272, + "grad_norm": 0.7768100500106812, + "learning_rate": 3.866746698679472e-05, + "loss": 0.5562, + "step": 5674 + }, + { + "epoch": 7.264, + "grad_norm": 0.7781060934066772, + "learning_rate": 3.866546618647459e-05, + "loss": 0.5728, + "step": 5675 + }, + { + "epoch": 7.26528, + "grad_norm": 0.810567319393158, + "learning_rate": 3.8663465386154465e-05, + "loss": 0.5866, + "step": 5676 + }, + { + "epoch": 7.26656, + "grad_norm": 0.8131520748138428, + "learning_rate": 3.8661464585834336e-05, + "loss": 0.6461, + "step": 5677 + }, + { + "epoch": 7.26784, + "grad_norm": 0.8413848876953125, + "learning_rate": 3.865946378551421e-05, + "loss": 0.6273, + "step": 5678 + }, + { + "epoch": 7.26912, + "grad_norm": 0.7927761673927307, + "learning_rate": 3.865746298519408e-05, + "loss": 0.5788, + "step": 5679 + }, + { + "epoch": 7.2704, + "grad_norm": 0.7904192805290222, + "learning_rate": 3.865546218487395e-05, + "loss": 0.5934, + "step": 5680 + }, + { + "epoch": 7.27168, + 
"grad_norm": 0.8133504986763, + "learning_rate": 3.8653461384553824e-05, + "loss": 0.5641, + "step": 5681 + }, + { + "epoch": 7.27296, + "grad_norm": 0.8121780157089233, + "learning_rate": 3.8651460584233696e-05, + "loss": 0.6126, + "step": 5682 + }, + { + "epoch": 7.27424, + "grad_norm": 0.850824773311615, + "learning_rate": 3.864945978391357e-05, + "loss": 0.6447, + "step": 5683 + }, + { + "epoch": 7.27552, + "grad_norm": 0.7701993584632874, + "learning_rate": 3.864745898359344e-05, + "loss": 0.5702, + "step": 5684 + }, + { + "epoch": 7.2768, + "grad_norm": 0.8420401811599731, + "learning_rate": 3.864545818327331e-05, + "loss": 0.6527, + "step": 5685 + }, + { + "epoch": 7.27808, + "grad_norm": 0.7517603039741516, + "learning_rate": 3.864345738295318e-05, + "loss": 0.5792, + "step": 5686 + }, + { + "epoch": 7.27936, + "grad_norm": 0.877324640750885, + "learning_rate": 3.8641456582633055e-05, + "loss": 0.6342, + "step": 5687 + }, + { + "epoch": 7.28064, + "grad_norm": 0.7491464614868164, + "learning_rate": 3.863945578231293e-05, + "loss": 0.5504, + "step": 5688 + }, + { + "epoch": 7.28192, + "grad_norm": 0.7788039445877075, + "learning_rate": 3.86374549819928e-05, + "loss": 0.6107, + "step": 5689 + }, + { + "epoch": 7.2832, + "grad_norm": 0.8021331429481506, + "learning_rate": 3.863545418167267e-05, + "loss": 0.6289, + "step": 5690 + }, + { + "epoch": 7.28448, + "grad_norm": 0.7916176915168762, + "learning_rate": 3.863345338135254e-05, + "loss": 0.5962, + "step": 5691 + }, + { + "epoch": 7.28576, + "grad_norm": 0.7986366152763367, + "learning_rate": 3.8631452581032414e-05, + "loss": 0.5942, + "step": 5692 + }, + { + "epoch": 7.28704, + "grad_norm": 0.8029093742370605, + "learning_rate": 3.8629451780712286e-05, + "loss": 0.6185, + "step": 5693 + }, + { + "epoch": 7.28832, + "grad_norm": 0.7949596047401428, + "learning_rate": 3.862745098039216e-05, + "loss": 0.6337, + "step": 5694 + }, + { + "epoch": 7.2896, + "grad_norm": 0.7814388871192932, + "learning_rate": 
3.862545018007203e-05, + "loss": 0.6188, + "step": 5695 + }, + { + "epoch": 7.29088, + "grad_norm": 0.8712118268013, + "learning_rate": 3.86234493797519e-05, + "loss": 0.6191, + "step": 5696 + }, + { + "epoch": 7.29216, + "grad_norm": 0.769758939743042, + "learning_rate": 3.8621448579431774e-05, + "loss": 0.5759, + "step": 5697 + }, + { + "epoch": 7.29344, + "grad_norm": 0.7669387459754944, + "learning_rate": 3.8619447779111645e-05, + "loss": 0.5509, + "step": 5698 + }, + { + "epoch": 7.29472, + "grad_norm": 0.7945777177810669, + "learning_rate": 3.861744697879152e-05, + "loss": 0.6417, + "step": 5699 + }, + { + "epoch": 7.296, + "grad_norm": 0.764313280582428, + "learning_rate": 3.8615446178471396e-05, + "loss": 0.5851, + "step": 5700 + }, + { + "epoch": 7.29728, + "grad_norm": 0.7636333107948303, + "learning_rate": 3.861344537815126e-05, + "loss": 0.5799, + "step": 5701 + }, + { + "epoch": 7.29856, + "grad_norm": 0.7744512557983398, + "learning_rate": 3.861144457783113e-05, + "loss": 0.5415, + "step": 5702 + }, + { + "epoch": 7.29984, + "grad_norm": 0.7718966007232666, + "learning_rate": 3.8609443777511005e-05, + "loss": 0.6007, + "step": 5703 + }, + { + "epoch": 7.30112, + "grad_norm": 0.7986489534378052, + "learning_rate": 3.8607442977190877e-05, + "loss": 0.6244, + "step": 5704 + }, + { + "epoch": 7.3024000000000004, + "grad_norm": 0.7697789072990417, + "learning_rate": 3.860544217687075e-05, + "loss": 0.6394, + "step": 5705 + }, + { + "epoch": 7.30368, + "grad_norm": 0.781538188457489, + "learning_rate": 3.860344137655062e-05, + "loss": 0.6007, + "step": 5706 + }, + { + "epoch": 7.30496, + "grad_norm": 0.7859370112419128, + "learning_rate": 3.86014405762305e-05, + "loss": 0.6103, + "step": 5707 + }, + { + "epoch": 7.30624, + "grad_norm": 0.8621463775634766, + "learning_rate": 3.859943977591037e-05, + "loss": 0.6401, + "step": 5708 + }, + { + "epoch": 7.30752, + "grad_norm": 0.8109675645828247, + "learning_rate": 3.8597438975590236e-05, + "loss": 0.611, + 
"step": 5709 + }, + { + "epoch": 7.3088, + "grad_norm": 0.7787231206893921, + "learning_rate": 3.859543817527011e-05, + "loss": 0.5673, + "step": 5710 + }, + { + "epoch": 7.31008, + "grad_norm": 0.7826237678527832, + "learning_rate": 3.859343737494998e-05, + "loss": 0.6092, + "step": 5711 + }, + { + "epoch": 7.31136, + "grad_norm": 0.8043519258499146, + "learning_rate": 3.859143657462985e-05, + "loss": 0.6405, + "step": 5712 + }, + { + "epoch": 7.31264, + "grad_norm": 0.7601256370544434, + "learning_rate": 3.858943577430972e-05, + "loss": 0.5687, + "step": 5713 + }, + { + "epoch": 7.3139199999999995, + "grad_norm": 0.8053227663040161, + "learning_rate": 3.85874349739896e-05, + "loss": 0.6097, + "step": 5714 + }, + { + "epoch": 7.3152, + "grad_norm": 0.7957972288131714, + "learning_rate": 3.8585434173669474e-05, + "loss": 0.6215, + "step": 5715 + }, + { + "epoch": 7.31648, + "grad_norm": 0.7682824730873108, + "learning_rate": 3.8583433373349346e-05, + "loss": 0.6002, + "step": 5716 + }, + { + "epoch": 7.31776, + "grad_norm": 0.7858462333679199, + "learning_rate": 3.858143257302921e-05, + "loss": 0.6155, + "step": 5717 + }, + { + "epoch": 7.31904, + "grad_norm": 0.7957653999328613, + "learning_rate": 3.857943177270908e-05, + "loss": 0.5728, + "step": 5718 + }, + { + "epoch": 7.32032, + "grad_norm": 0.7708908915519714, + "learning_rate": 3.8577430972388954e-05, + "loss": 0.6013, + "step": 5719 + }, + { + "epoch": 7.3216, + "grad_norm": 0.7898252010345459, + "learning_rate": 3.8575430172068826e-05, + "loss": 0.6068, + "step": 5720 + }, + { + "epoch": 7.32288, + "grad_norm": 0.8056557178497314, + "learning_rate": 3.8573429371748705e-05, + "loss": 0.6074, + "step": 5721 + }, + { + "epoch": 7.32416, + "grad_norm": 0.7734809517860413, + "learning_rate": 3.857142857142858e-05, + "loss": 0.6006, + "step": 5722 + }, + { + "epoch": 7.32544, + "grad_norm": 0.8016178607940674, + "learning_rate": 3.856942777110845e-05, + "loss": 0.6086, + "step": 5723 + }, + { + "epoch": 7.32672, 
+ "grad_norm": 0.8026044964790344, + "learning_rate": 3.856742697078832e-05, + "loss": 0.6075, + "step": 5724 + }, + { + "epoch": 7.328, + "grad_norm": 0.7688097953796387, + "learning_rate": 3.8565426170468186e-05, + "loss": 0.5736, + "step": 5725 + }, + { + "epoch": 7.32928, + "grad_norm": 0.7733714580535889, + "learning_rate": 3.856342537014806e-05, + "loss": 0.5723, + "step": 5726 + }, + { + "epoch": 7.33056, + "grad_norm": 0.7920982837677002, + "learning_rate": 3.856142456982793e-05, + "loss": 0.6015, + "step": 5727 + }, + { + "epoch": 7.33184, + "grad_norm": 0.8239418864250183, + "learning_rate": 3.855942376950781e-05, + "loss": 0.5999, + "step": 5728 + }, + { + "epoch": 7.33312, + "grad_norm": 0.7626153826713562, + "learning_rate": 3.855742296918768e-05, + "loss": 0.5769, + "step": 5729 + }, + { + "epoch": 7.3344, + "grad_norm": 0.7836630344390869, + "learning_rate": 3.855542216886755e-05, + "loss": 0.5713, + "step": 5730 + }, + { + "epoch": 7.33568, + "grad_norm": 0.8354891538619995, + "learning_rate": 3.8553421368547423e-05, + "loss": 0.5959, + "step": 5731 + }, + { + "epoch": 7.33696, + "grad_norm": 0.8115443587303162, + "learning_rate": 3.8551420568227295e-05, + "loss": 0.581, + "step": 5732 + }, + { + "epoch": 7.33824, + "grad_norm": 0.787783682346344, + "learning_rate": 3.854941976790716e-05, + "loss": 0.6043, + "step": 5733 + }, + { + "epoch": 7.33952, + "grad_norm": 0.7985721230506897, + "learning_rate": 3.854741896758703e-05, + "loss": 0.6119, + "step": 5734 + }, + { + "epoch": 7.3408, + "grad_norm": 0.8209099173545837, + "learning_rate": 3.854541816726691e-05, + "loss": 0.6216, + "step": 5735 + }, + { + "epoch": 7.34208, + "grad_norm": 0.7716638445854187, + "learning_rate": 3.854341736694678e-05, + "loss": 0.5866, + "step": 5736 + }, + { + "epoch": 7.34336, + "grad_norm": 0.7463488578796387, + "learning_rate": 3.8541416566626655e-05, + "loss": 0.5584, + "step": 5737 + }, + { + "epoch": 7.34464, + "grad_norm": 0.7812283635139465, + "learning_rate": 
3.8539415766306526e-05, + "loss": 0.6046, + "step": 5738 + }, + { + "epoch": 7.34592, + "grad_norm": 0.830808699131012, + "learning_rate": 3.85374149659864e-05, + "loss": 0.6134, + "step": 5739 + }, + { + "epoch": 7.3472, + "grad_norm": 0.802952766418457, + "learning_rate": 3.853541416566627e-05, + "loss": 0.6095, + "step": 5740 + }, + { + "epoch": 7.34848, + "grad_norm": 0.8335705995559692, + "learning_rate": 3.8533413365346135e-05, + "loss": 0.6473, + "step": 5741 + }, + { + "epoch": 7.34976, + "grad_norm": 0.8194312453269958, + "learning_rate": 3.8531412565026014e-05, + "loss": 0.6392, + "step": 5742 + }, + { + "epoch": 7.35104, + "grad_norm": 0.8065541386604309, + "learning_rate": 3.8529411764705886e-05, + "loss": 0.5786, + "step": 5743 + }, + { + "epoch": 7.35232, + "grad_norm": 0.8424248695373535, + "learning_rate": 3.852741096438576e-05, + "loss": 0.6275, + "step": 5744 + }, + { + "epoch": 7.3536, + "grad_norm": 0.8039935827255249, + "learning_rate": 3.852541016406563e-05, + "loss": 0.595, + "step": 5745 + }, + { + "epoch": 7.35488, + "grad_norm": 0.8151816129684448, + "learning_rate": 3.85234093637455e-05, + "loss": 0.6375, + "step": 5746 + }, + { + "epoch": 7.35616, + "grad_norm": 0.7974145412445068, + "learning_rate": 3.852140856342537e-05, + "loss": 0.605, + "step": 5747 + }, + { + "epoch": 7.35744, + "grad_norm": 0.8081220984458923, + "learning_rate": 3.8519407763105245e-05, + "loss": 0.6031, + "step": 5748 + }, + { + "epoch": 7.35872, + "grad_norm": 0.7766823768615723, + "learning_rate": 3.851740696278512e-05, + "loss": 0.6233, + "step": 5749 + }, + { + "epoch": 7.36, + "grad_norm": 0.7977601885795593, + "learning_rate": 3.851540616246499e-05, + "loss": 0.6105, + "step": 5750 + }, + { + "epoch": 7.36128, + "grad_norm": 0.7957237958908081, + "learning_rate": 3.851340536214486e-05, + "loss": 0.6086, + "step": 5751 + }, + { + "epoch": 7.36256, + "grad_norm": 0.7621792554855347, + "learning_rate": 3.851140456182473e-05, + "loss": 0.6113, + "step": 5752 + 
}, + { + "epoch": 7.36384, + "grad_norm": 0.7617015242576599, + "learning_rate": 3.8509403761504604e-05, + "loss": 0.5529, + "step": 5753 + }, + { + "epoch": 7.36512, + "grad_norm": 0.7769597768783569, + "learning_rate": 3.8507402961184476e-05, + "loss": 0.6458, + "step": 5754 + }, + { + "epoch": 7.3664, + "grad_norm": 0.7622930407524109, + "learning_rate": 3.850540216086435e-05, + "loss": 0.5758, + "step": 5755 + }, + { + "epoch": 7.36768, + "grad_norm": 0.8526008129119873, + "learning_rate": 3.850340136054422e-05, + "loss": 0.6412, + "step": 5756 + }, + { + "epoch": 7.36896, + "grad_norm": 0.8075965046882629, + "learning_rate": 3.850140056022409e-05, + "loss": 0.598, + "step": 5757 + }, + { + "epoch": 7.37024, + "grad_norm": 0.859012246131897, + "learning_rate": 3.8499399759903964e-05, + "loss": 0.6844, + "step": 5758 + }, + { + "epoch": 7.37152, + "grad_norm": 0.7848984003067017, + "learning_rate": 3.8497398959583835e-05, + "loss": 0.5879, + "step": 5759 + }, + { + "epoch": 7.3728, + "grad_norm": 0.7643441557884216, + "learning_rate": 3.849539815926371e-05, + "loss": 0.5704, + "step": 5760 + }, + { + "epoch": 7.37408, + "grad_norm": 0.8113600015640259, + "learning_rate": 3.849339735894358e-05, + "loss": 0.6399, + "step": 5761 + }, + { + "epoch": 7.37536, + "grad_norm": 0.7590932846069336, + "learning_rate": 3.849139655862345e-05, + "loss": 0.5771, + "step": 5762 + }, + { + "epoch": 7.37664, + "grad_norm": 0.7780318260192871, + "learning_rate": 3.848939575830332e-05, + "loss": 0.6144, + "step": 5763 + }, + { + "epoch": 7.37792, + "grad_norm": 0.8140581846237183, + "learning_rate": 3.8487394957983195e-05, + "loss": 0.6571, + "step": 5764 + }, + { + "epoch": 7.3792, + "grad_norm": 0.7549552321434021, + "learning_rate": 3.8485394157663067e-05, + "loss": 0.5123, + "step": 5765 + }, + { + "epoch": 7.38048, + "grad_norm": 0.7963429093360901, + "learning_rate": 3.848339335734294e-05, + "loss": 0.5821, + "step": 5766 + }, + { + "epoch": 7.38176, + "grad_norm": 
0.8283103704452515, + "learning_rate": 3.848139255702281e-05, + "loss": 0.627, + "step": 5767 + }, + { + "epoch": 7.38304, + "grad_norm": 0.7662546038627625, + "learning_rate": 3.847939175670268e-05, + "loss": 0.5702, + "step": 5768 + }, + { + "epoch": 7.38432, + "grad_norm": 0.780431866645813, + "learning_rate": 3.8477390956382554e-05, + "loss": 0.6072, + "step": 5769 + }, + { + "epoch": 7.3856, + "grad_norm": 0.8025130033493042, + "learning_rate": 3.8475390156062426e-05, + "loss": 0.5984, + "step": 5770 + }, + { + "epoch": 7.38688, + "grad_norm": 0.7883024215698242, + "learning_rate": 3.84733893557423e-05, + "loss": 0.5951, + "step": 5771 + }, + { + "epoch": 7.38816, + "grad_norm": 0.8061680197715759, + "learning_rate": 3.847138855542217e-05, + "loss": 0.6058, + "step": 5772 + }, + { + "epoch": 7.3894400000000005, + "grad_norm": 0.7774277925491333, + "learning_rate": 3.846938775510204e-05, + "loss": 0.5983, + "step": 5773 + }, + { + "epoch": 7.39072, + "grad_norm": 0.787751317024231, + "learning_rate": 3.846738695478191e-05, + "loss": 0.5822, + "step": 5774 + }, + { + "epoch": 7.392, + "grad_norm": 0.830544650554657, + "learning_rate": 3.8465386154461785e-05, + "loss": 0.6172, + "step": 5775 + }, + { + "epoch": 7.39328, + "grad_norm": 0.8001280426979065, + "learning_rate": 3.846338535414166e-05, + "loss": 0.6249, + "step": 5776 + }, + { + "epoch": 7.39456, + "grad_norm": 0.7862173914909363, + "learning_rate": 3.846138455382153e-05, + "loss": 0.5628, + "step": 5777 + }, + { + "epoch": 7.39584, + "grad_norm": 0.8081876039505005, + "learning_rate": 3.845938375350141e-05, + "loss": 0.6007, + "step": 5778 + }, + { + "epoch": 7.39712, + "grad_norm": 0.7967897057533264, + "learning_rate": 3.845738295318127e-05, + "loss": 0.5906, + "step": 5779 + }, + { + "epoch": 7.3984, + "grad_norm": 0.8136133551597595, + "learning_rate": 3.8455382152861144e-05, + "loss": 0.5849, + "step": 5780 + }, + { + "epoch": 7.39968, + "grad_norm": 0.7975330948829651, + "learning_rate": 
3.8453381352541016e-05, + "loss": 0.6159, + "step": 5781 + }, + { + "epoch": 7.4009599999999995, + "grad_norm": 0.8437215089797974, + "learning_rate": 3.845138055222089e-05, + "loss": 0.6324, + "step": 5782 + }, + { + "epoch": 7.40224, + "grad_norm": 0.8381403684616089, + "learning_rate": 3.844937975190076e-05, + "loss": 0.6193, + "step": 5783 + }, + { + "epoch": 7.40352, + "grad_norm": 0.7790165543556213, + "learning_rate": 3.844737895158063e-05, + "loss": 0.6107, + "step": 5784 + }, + { + "epoch": 7.4048, + "grad_norm": 0.8007838726043701, + "learning_rate": 3.844537815126051e-05, + "loss": 0.6304, + "step": 5785 + }, + { + "epoch": 7.40608, + "grad_norm": 0.8209007978439331, + "learning_rate": 3.844337735094038e-05, + "loss": 0.6089, + "step": 5786 + }, + { + "epoch": 7.40736, + "grad_norm": 0.7682439684867859, + "learning_rate": 3.844137655062025e-05, + "loss": 0.5974, + "step": 5787 + }, + { + "epoch": 7.40864, + "grad_norm": 0.7677670121192932, + "learning_rate": 3.843937575030012e-05, + "loss": 0.5678, + "step": 5788 + }, + { + "epoch": 7.40992, + "grad_norm": 0.8111715912818909, + "learning_rate": 3.843737494997999e-05, + "loss": 0.5916, + "step": 5789 + }, + { + "epoch": 7.4112, + "grad_norm": 0.7806997895240784, + "learning_rate": 3.843537414965986e-05, + "loss": 0.6144, + "step": 5790 + }, + { + "epoch": 7.41248, + "grad_norm": 0.7777438759803772, + "learning_rate": 3.8433373349339735e-05, + "loss": 0.5683, + "step": 5791 + }, + { + "epoch": 7.41376, + "grad_norm": 0.751512885093689, + "learning_rate": 3.8431372549019614e-05, + "loss": 0.5775, + "step": 5792 + }, + { + "epoch": 7.41504, + "grad_norm": 0.7952166199684143, + "learning_rate": 3.8429371748699485e-05, + "loss": 0.5878, + "step": 5793 + }, + { + "epoch": 7.41632, + "grad_norm": 0.7829790711402893, + "learning_rate": 3.842737094837936e-05, + "loss": 0.6195, + "step": 5794 + }, + { + "epoch": 7.4176, + "grad_norm": 0.7563794851303101, + "learning_rate": 3.842537014805922e-05, + "loss": 0.5918, + 
"step": 5795 + }, + { + "epoch": 7.41888, + "grad_norm": 0.816536009311676, + "learning_rate": 3.8423369347739094e-05, + "loss": 0.6219, + "step": 5796 + }, + { + "epoch": 7.42016, + "grad_norm": 0.7779079079627991, + "learning_rate": 3.8421368547418966e-05, + "loss": 0.6168, + "step": 5797 + }, + { + "epoch": 7.42144, + "grad_norm": 0.8121761083602905, + "learning_rate": 3.841936774709884e-05, + "loss": 0.625, + "step": 5798 + }, + { + "epoch": 7.42272, + "grad_norm": 0.7565649151802063, + "learning_rate": 3.8417366946778717e-05, + "loss": 0.5407, + "step": 5799 + }, + { + "epoch": 7.424, + "grad_norm": 0.7441564798355103, + "learning_rate": 3.841536614645859e-05, + "loss": 0.5511, + "step": 5800 + }, + { + "epoch": 7.42528, + "grad_norm": 0.8287448883056641, + "learning_rate": 3.841336534613846e-05, + "loss": 0.6299, + "step": 5801 + }, + { + "epoch": 7.42656, + "grad_norm": 0.771754801273346, + "learning_rate": 3.841136454581833e-05, + "loss": 0.5718, + "step": 5802 + }, + { + "epoch": 7.42784, + "grad_norm": 0.7989311814308167, + "learning_rate": 3.84093637454982e-05, + "loss": 0.6068, + "step": 5803 + }, + { + "epoch": 7.42912, + "grad_norm": 0.7287749648094177, + "learning_rate": 3.840736294517807e-05, + "loss": 0.5552, + "step": 5804 + }, + { + "epoch": 7.4304, + "grad_norm": 0.8139730095863342, + "learning_rate": 3.840536214485794e-05, + "loss": 0.623, + "step": 5805 + }, + { + "epoch": 7.43168, + "grad_norm": 0.8090022206306458, + "learning_rate": 3.840336134453782e-05, + "loss": 0.5875, + "step": 5806 + }, + { + "epoch": 7.43296, + "grad_norm": 0.793850302696228, + "learning_rate": 3.840136054421769e-05, + "loss": 0.5948, + "step": 5807 + }, + { + "epoch": 7.43424, + "grad_norm": 0.8325986862182617, + "learning_rate": 3.839935974389756e-05, + "loss": 0.6497, + "step": 5808 + }, + { + "epoch": 7.43552, + "grad_norm": 0.7771223783493042, + "learning_rate": 3.8397358943577435e-05, + "loss": 0.584, + "step": 5809 + }, + { + "epoch": 7.4368, + "grad_norm": 
0.8267020583152771, + "learning_rate": 3.839535814325731e-05, + "loss": 0.6556, + "step": 5810 + }, + { + "epoch": 7.43808, + "grad_norm": 0.8165448904037476, + "learning_rate": 3.839335734293717e-05, + "loss": 0.6132, + "step": 5811 + }, + { + "epoch": 7.43936, + "grad_norm": 0.8215618133544922, + "learning_rate": 3.8391356542617044e-05, + "loss": 0.6034, + "step": 5812 + }, + { + "epoch": 7.44064, + "grad_norm": 0.8251504302024841, + "learning_rate": 3.838935574229692e-05, + "loss": 0.6194, + "step": 5813 + }, + { + "epoch": 7.44192, + "grad_norm": 0.803627073764801, + "learning_rate": 3.8387354941976794e-05, + "loss": 0.5718, + "step": 5814 + }, + { + "epoch": 7.4432, + "grad_norm": 0.7754432559013367, + "learning_rate": 3.8385354141656666e-05, + "loss": 0.5652, + "step": 5815 + }, + { + "epoch": 7.44448, + "grad_norm": 0.7755488753318787, + "learning_rate": 3.838335334133654e-05, + "loss": 0.6237, + "step": 5816 + }, + { + "epoch": 7.44576, + "grad_norm": 0.8018819689750671, + "learning_rate": 3.838135254101641e-05, + "loss": 0.615, + "step": 5817 + }, + { + "epoch": 7.44704, + "grad_norm": 0.7852720618247986, + "learning_rate": 3.837935174069628e-05, + "loss": 0.6045, + "step": 5818 + }, + { + "epoch": 7.44832, + "grad_norm": 0.784041166305542, + "learning_rate": 3.837735094037615e-05, + "loss": 0.6095, + "step": 5819 + }, + { + "epoch": 7.4496, + "grad_norm": 0.8439889550209045, + "learning_rate": 3.8375350140056026e-05, + "loss": 0.7184, + "step": 5820 + }, + { + "epoch": 7.45088, + "grad_norm": 0.8502903580665588, + "learning_rate": 3.83733493397359e-05, + "loss": 0.6358, + "step": 5821 + }, + { + "epoch": 7.45216, + "grad_norm": 0.7948433756828308, + "learning_rate": 3.837134853941577e-05, + "loss": 0.5715, + "step": 5822 + }, + { + "epoch": 7.45344, + "grad_norm": 0.7365151047706604, + "learning_rate": 3.836934773909564e-05, + "loss": 0.567, + "step": 5823 + }, + { + "epoch": 7.45472, + "grad_norm": 0.8041279315948486, + "learning_rate": 
3.836734693877551e-05, + "loss": 0.6025, + "step": 5824 + }, + { + "epoch": 7.456, + "grad_norm": 0.7834358215332031, + "learning_rate": 3.8365346138455385e-05, + "loss": 0.5941, + "step": 5825 + }, + { + "epoch": 7.45728, + "grad_norm": 0.7606781721115112, + "learning_rate": 3.836334533813526e-05, + "loss": 0.5979, + "step": 5826 + }, + { + "epoch": 7.45856, + "grad_norm": 0.7791493535041809, + "learning_rate": 3.836134453781513e-05, + "loss": 0.5739, + "step": 5827 + }, + { + "epoch": 7.45984, + "grad_norm": 0.8152048587799072, + "learning_rate": 3.8359343737495e-05, + "loss": 0.6035, + "step": 5828 + }, + { + "epoch": 7.46112, + "grad_norm": 0.8163161277770996, + "learning_rate": 3.835734293717487e-05, + "loss": 0.6061, + "step": 5829 + }, + { + "epoch": 7.4624, + "grad_norm": 0.7904027104377747, + "learning_rate": 3.8355342136854744e-05, + "loss": 0.6001, + "step": 5830 + }, + { + "epoch": 7.46368, + "grad_norm": 0.8345395922660828, + "learning_rate": 3.8353341336534616e-05, + "loss": 0.6474, + "step": 5831 + }, + { + "epoch": 7.46496, + "grad_norm": 0.764450192451477, + "learning_rate": 3.835134053621449e-05, + "loss": 0.5939, + "step": 5832 + }, + { + "epoch": 7.46624, + "grad_norm": 0.7856200933456421, + "learning_rate": 3.834933973589436e-05, + "loss": 0.5863, + "step": 5833 + }, + { + "epoch": 7.46752, + "grad_norm": 0.756513237953186, + "learning_rate": 3.834733893557423e-05, + "loss": 0.5699, + "step": 5834 + }, + { + "epoch": 7.4688, + "grad_norm": 0.8095502257347107, + "learning_rate": 3.83453381352541e-05, + "loss": 0.6145, + "step": 5835 + }, + { + "epoch": 7.47008, + "grad_norm": 0.7780006527900696, + "learning_rate": 3.8343337334933975e-05, + "loss": 0.622, + "step": 5836 + }, + { + "epoch": 7.47136, + "grad_norm": 0.7881510257720947, + "learning_rate": 3.834133653461385e-05, + "loss": 0.5873, + "step": 5837 + }, + { + "epoch": 7.47264, + "grad_norm": 0.7660511136054993, + "learning_rate": 3.833933573429372e-05, + "loss": 0.6096, + "step": 5838 + 
}, + { + "epoch": 7.47392, + "grad_norm": 0.7153475284576416, + "learning_rate": 3.833733493397359e-05, + "loss": 0.5051, + "step": 5839 + }, + { + "epoch": 7.4752, + "grad_norm": 0.7922013401985168, + "learning_rate": 3.833533413365346e-05, + "loss": 0.6159, + "step": 5840 + }, + { + "epoch": 7.4764800000000005, + "grad_norm": 0.7256450057029724, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.5558, + "step": 5841 + }, + { + "epoch": 7.47776, + "grad_norm": 0.7810547351837158, + "learning_rate": 3.8331332533013206e-05, + "loss": 0.6134, + "step": 5842 + }, + { + "epoch": 7.47904, + "grad_norm": 0.7948379516601562, + "learning_rate": 3.832933173269308e-05, + "loss": 0.6025, + "step": 5843 + }, + { + "epoch": 7.48032, + "grad_norm": 0.7824628353118896, + "learning_rate": 3.832733093237295e-05, + "loss": 0.5658, + "step": 5844 + }, + { + "epoch": 7.4816, + "grad_norm": 0.8361453413963318, + "learning_rate": 3.832533013205282e-05, + "loss": 0.6285, + "step": 5845 + }, + { + "epoch": 7.48288, + "grad_norm": 0.8043081164360046, + "learning_rate": 3.8323329331732694e-05, + "loss": 0.6019, + "step": 5846 + }, + { + "epoch": 7.48416, + "grad_norm": 0.7815614342689514, + "learning_rate": 3.8321328531412566e-05, + "loss": 0.5788, + "step": 5847 + }, + { + "epoch": 7.48544, + "grad_norm": 0.8125666379928589, + "learning_rate": 3.8319327731092444e-05, + "loss": 0.5843, + "step": 5848 + }, + { + "epoch": 7.48672, + "grad_norm": 0.8060123920440674, + "learning_rate": 3.831732693077231e-05, + "loss": 0.605, + "step": 5849 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 0.7951981425285339, + "learning_rate": 3.831532613045218e-05, + "loss": 0.5823, + "step": 5850 + }, + { + "epoch": 7.48928, + "grad_norm": 0.767137885093689, + "learning_rate": 3.831332533013205e-05, + "loss": 0.6463, + "step": 5851 + }, + { + "epoch": 7.49056, + "grad_norm": 0.7987186312675476, + "learning_rate": 3.8311324529811925e-05, + "loss": 0.5696, + "step": 5852 + }, + { + "epoch": 7.49184, + 
"grad_norm": 0.8147026896476746, + "learning_rate": 3.83093237294918e-05, + "loss": 0.6369, + "step": 5853 + }, + { + "epoch": 7.49312, + "grad_norm": 0.7695271968841553, + "learning_rate": 3.830732292917167e-05, + "loss": 0.6022, + "step": 5854 + }, + { + "epoch": 7.4944, + "grad_norm": 0.7904877066612244, + "learning_rate": 3.830532212885155e-05, + "loss": 0.5893, + "step": 5855 + }, + { + "epoch": 7.49568, + "grad_norm": 0.7861725687980652, + "learning_rate": 3.830332132853142e-05, + "loss": 0.6368, + "step": 5856 + }, + { + "epoch": 7.49696, + "grad_norm": 0.7913157343864441, + "learning_rate": 3.8301320528211284e-05, + "loss": 0.6459, + "step": 5857 + }, + { + "epoch": 7.49824, + "grad_norm": 0.7918976545333862, + "learning_rate": 3.8299319727891156e-05, + "loss": 0.5793, + "step": 5858 + }, + { + "epoch": 7.49952, + "grad_norm": 0.8110979199409485, + "learning_rate": 3.829731892757103e-05, + "loss": 0.589, + "step": 5859 + }, + { + "epoch": 7.5008, + "grad_norm": 0.8010942339897156, + "learning_rate": 3.82953181272509e-05, + "loss": 0.5936, + "step": 5860 + }, + { + "epoch": 7.50208, + "grad_norm": 0.8619551062583923, + "learning_rate": 3.829331732693077e-05, + "loss": 0.6526, + "step": 5861 + }, + { + "epoch": 7.50336, + "grad_norm": 0.7884719371795654, + "learning_rate": 3.829131652661065e-05, + "loss": 0.5931, + "step": 5862 + }, + { + "epoch": 7.50464, + "grad_norm": 0.7682931423187256, + "learning_rate": 3.828931572629052e-05, + "loss": 0.5891, + "step": 5863 + }, + { + "epoch": 7.50592, + "grad_norm": 0.7763561606407166, + "learning_rate": 3.8287314925970394e-05, + "loss": 0.5551, + "step": 5864 + }, + { + "epoch": 7.5072, + "grad_norm": 0.7990860342979431, + "learning_rate": 3.828531412565026e-05, + "loss": 0.586, + "step": 5865 + }, + { + "epoch": 7.5084800000000005, + "grad_norm": 0.7618961334228516, + "learning_rate": 3.828331332533013e-05, + "loss": 0.5892, + "step": 5866 + }, + { + "epoch": 7.50976, + "grad_norm": 0.8227397203445435, + 
"learning_rate": 3.828131252501e-05, + "loss": 0.5808, + "step": 5867 + }, + { + "epoch": 7.51104, + "grad_norm": 0.7972336411476135, + "learning_rate": 3.8279311724689875e-05, + "loss": 0.5937, + "step": 5868 + }, + { + "epoch": 7.51232, + "grad_norm": 0.7725977301597595, + "learning_rate": 3.8277310924369746e-05, + "loss": 0.5742, + "step": 5869 + }, + { + "epoch": 7.5136, + "grad_norm": 0.7537288665771484, + "learning_rate": 3.8275310124049625e-05, + "loss": 0.5535, + "step": 5870 + }, + { + "epoch": 7.51488, + "grad_norm": 0.8171656131744385, + "learning_rate": 3.82733093237295e-05, + "loss": 0.5987, + "step": 5871 + }, + { + "epoch": 7.51616, + "grad_norm": 0.8476085662841797, + "learning_rate": 3.827130852340937e-05, + "loss": 0.6341, + "step": 5872 + }, + { + "epoch": 7.51744, + "grad_norm": 0.7884910106658936, + "learning_rate": 3.8269307723089234e-05, + "loss": 0.5655, + "step": 5873 + }, + { + "epoch": 7.51872, + "grad_norm": 0.7917491793632507, + "learning_rate": 3.8267306922769106e-05, + "loss": 0.5747, + "step": 5874 + }, + { + "epoch": 7.52, + "grad_norm": 0.7862417697906494, + "learning_rate": 3.826530612244898e-05, + "loss": 0.5739, + "step": 5875 + }, + { + "epoch": 7.52128, + "grad_norm": 0.7860782146453857, + "learning_rate": 3.826330532212885e-05, + "loss": 0.5958, + "step": 5876 + }, + { + "epoch": 7.52256, + "grad_norm": 0.7811517715454102, + "learning_rate": 3.826130452180873e-05, + "loss": 0.5845, + "step": 5877 + }, + { + "epoch": 7.52384, + "grad_norm": 0.7601850628852844, + "learning_rate": 3.82593037214886e-05, + "loss": 0.5989, + "step": 5878 + }, + { + "epoch": 7.52512, + "grad_norm": 0.727415919303894, + "learning_rate": 3.825730292116847e-05, + "loss": 0.5449, + "step": 5879 + }, + { + "epoch": 7.5264, + "grad_norm": 0.8012895584106445, + "learning_rate": 3.8255302120848344e-05, + "loss": 0.5999, + "step": 5880 + }, + { + "epoch": 7.52768, + "grad_norm": 0.7879264950752258, + "learning_rate": 3.825330132052821e-05, + "loss": 0.5992, 
+ "step": 5881 + }, + { + "epoch": 7.52896, + "grad_norm": 0.7692047953605652, + "learning_rate": 3.825130052020808e-05, + "loss": 0.5393, + "step": 5882 + }, + { + "epoch": 7.53024, + "grad_norm": 0.8411558270454407, + "learning_rate": 3.824929971988795e-05, + "loss": 0.593, + "step": 5883 + }, + { + "epoch": 7.53152, + "grad_norm": 0.8363029956817627, + "learning_rate": 3.824729891956783e-05, + "loss": 0.6034, + "step": 5884 + }, + { + "epoch": 7.5328, + "grad_norm": 0.7915286421775818, + "learning_rate": 3.82452981192477e-05, + "loss": 0.5933, + "step": 5885 + }, + { + "epoch": 7.53408, + "grad_norm": 0.8054776787757874, + "learning_rate": 3.8243297318927575e-05, + "loss": 0.6009, + "step": 5886 + }, + { + "epoch": 7.53536, + "grad_norm": 0.7464303970336914, + "learning_rate": 3.824129651860745e-05, + "loss": 0.5387, + "step": 5887 + }, + { + "epoch": 7.53664, + "grad_norm": 0.7987693548202515, + "learning_rate": 3.823929571828732e-05, + "loss": 0.5608, + "step": 5888 + }, + { + "epoch": 7.53792, + "grad_norm": 0.7968281507492065, + "learning_rate": 3.8237294917967184e-05, + "loss": 0.605, + "step": 5889 + }, + { + "epoch": 7.5392, + "grad_norm": 0.7655385136604309, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.5751, + "step": 5890 + }, + { + "epoch": 7.54048, + "grad_norm": 0.8068759441375732, + "learning_rate": 3.8233293317326934e-05, + "loss": 0.6387, + "step": 5891 + }, + { + "epoch": 7.54176, + "grad_norm": 0.7618523240089417, + "learning_rate": 3.8231292517006806e-05, + "loss": 0.5853, + "step": 5892 + }, + { + "epoch": 7.5430399999999995, + "grad_norm": 0.7742576599121094, + "learning_rate": 3.822929171668668e-05, + "loss": 0.5768, + "step": 5893 + }, + { + "epoch": 7.54432, + "grad_norm": 0.7480475902557373, + "learning_rate": 3.822729091636655e-05, + "loss": 0.5423, + "step": 5894 + }, + { + "epoch": 7.5456, + "grad_norm": 0.780965268611908, + "learning_rate": 3.822529011604642e-05, + "loss": 0.6088, + "step": 5895 + }, + { + "epoch": 7.54688, 
+ "grad_norm": 0.8063462376594543, + "learning_rate": 3.8223289315726293e-05, + "loss": 0.5745, + "step": 5896 + }, + { + "epoch": 7.54816, + "grad_norm": 0.8213686347007751, + "learning_rate": 3.822128851540616e-05, + "loss": 0.6227, + "step": 5897 + }, + { + "epoch": 7.54944, + "grad_norm": 0.8014224767684937, + "learning_rate": 3.821928771508604e-05, + "loss": 0.6174, + "step": 5898 + }, + { + "epoch": 7.55072, + "grad_norm": 0.8072211146354675, + "learning_rate": 3.821728691476591e-05, + "loss": 0.6356, + "step": 5899 + }, + { + "epoch": 7.552, + "grad_norm": 0.8300227522850037, + "learning_rate": 3.821528611444578e-05, + "loss": 0.62, + "step": 5900 + }, + { + "epoch": 7.55328, + "grad_norm": 0.7829582095146179, + "learning_rate": 3.821328531412565e-05, + "loss": 0.5797, + "step": 5901 + }, + { + "epoch": 7.55456, + "grad_norm": 0.8026710152626038, + "learning_rate": 3.8211284513805525e-05, + "loss": 0.573, + "step": 5902 + }, + { + "epoch": 7.55584, + "grad_norm": 0.8413091897964478, + "learning_rate": 3.8209283713485396e-05, + "loss": 0.6193, + "step": 5903 + }, + { + "epoch": 7.55712, + "grad_norm": 0.7783622741699219, + "learning_rate": 3.820728291316527e-05, + "loss": 0.6419, + "step": 5904 + }, + { + "epoch": 7.5584, + "grad_norm": 0.804766058921814, + "learning_rate": 3.820528211284514e-05, + "loss": 0.5698, + "step": 5905 + }, + { + "epoch": 7.55968, + "grad_norm": 0.7991184592247009, + "learning_rate": 3.820328131252501e-05, + "loss": 0.6626, + "step": 5906 + }, + { + "epoch": 7.56096, + "grad_norm": 0.8455173969268799, + "learning_rate": 3.8201280512204884e-05, + "loss": 0.6492, + "step": 5907 + }, + { + "epoch": 7.56224, + "grad_norm": 0.8010911345481873, + "learning_rate": 3.8199279711884756e-05, + "loss": 0.625, + "step": 5908 + }, + { + "epoch": 7.5635200000000005, + "grad_norm": 0.7779704928398132, + "learning_rate": 3.819727891156463e-05, + "loss": 0.5656, + "step": 5909 + }, + { + "epoch": 7.5648, + "grad_norm": 0.7678871750831604, + 
"learning_rate": 3.81952781112445e-05, + "loss": 0.5712, + "step": 5910 + }, + { + "epoch": 7.56608, + "grad_norm": 0.7485666871070862, + "learning_rate": 3.819327731092437e-05, + "loss": 0.5766, + "step": 5911 + }, + { + "epoch": 7.56736, + "grad_norm": 0.8102645874023438, + "learning_rate": 3.819127651060424e-05, + "loss": 0.6139, + "step": 5912 + }, + { + "epoch": 7.56864, + "grad_norm": 0.8365371227264404, + "learning_rate": 3.8189275710284115e-05, + "loss": 0.6378, + "step": 5913 + }, + { + "epoch": 7.56992, + "grad_norm": 0.7460724115371704, + "learning_rate": 3.818727490996399e-05, + "loss": 0.5879, + "step": 5914 + }, + { + "epoch": 7.5712, + "grad_norm": 0.8165127038955688, + "learning_rate": 3.818527410964386e-05, + "loss": 0.5859, + "step": 5915 + }, + { + "epoch": 7.57248, + "grad_norm": 0.8666216135025024, + "learning_rate": 3.818327330932373e-05, + "loss": 0.6287, + "step": 5916 + }, + { + "epoch": 7.57376, + "grad_norm": 0.8045700788497925, + "learning_rate": 3.81812725090036e-05, + "loss": 0.6372, + "step": 5917 + }, + { + "epoch": 7.5750399999999996, + "grad_norm": 0.7294300198554993, + "learning_rate": 3.8179271708683474e-05, + "loss": 0.5485, + "step": 5918 + }, + { + "epoch": 7.57632, + "grad_norm": 0.7684528827667236, + "learning_rate": 3.817727090836335e-05, + "loss": 0.5719, + "step": 5919 + }, + { + "epoch": 7.5776, + "grad_norm": 0.8162046670913696, + "learning_rate": 3.817527010804322e-05, + "loss": 0.6237, + "step": 5920 + }, + { + "epoch": 7.57888, + "grad_norm": 0.7951498031616211, + "learning_rate": 3.817326930772309e-05, + "loss": 0.6499, + "step": 5921 + }, + { + "epoch": 7.58016, + "grad_norm": 0.7112811803817749, + "learning_rate": 3.817126850740296e-05, + "loss": 0.5304, + "step": 5922 + }, + { + "epoch": 7.58144, + "grad_norm": 0.794484555721283, + "learning_rate": 3.8169267707082834e-05, + "loss": 0.6157, + "step": 5923 + }, + { + "epoch": 7.58272, + "grad_norm": 0.769756019115448, + "learning_rate": 3.8167266906762705e-05, + 
"loss": 0.5664, + "step": 5924 + }, + { + "epoch": 7.584, + "grad_norm": 0.8022031784057617, + "learning_rate": 3.816526610644258e-05, + "loss": 0.6011, + "step": 5925 + }, + { + "epoch": 7.58528, + "grad_norm": 0.8301845788955688, + "learning_rate": 3.8163265306122456e-05, + "loss": 0.6288, + "step": 5926 + }, + { + "epoch": 7.58656, + "grad_norm": 0.7498910427093506, + "learning_rate": 3.816126450580233e-05, + "loss": 0.5681, + "step": 5927 + }, + { + "epoch": 7.58784, + "grad_norm": 0.8285461068153381, + "learning_rate": 3.815926370548219e-05, + "loss": 0.6179, + "step": 5928 + }, + { + "epoch": 7.58912, + "grad_norm": 0.795591413974762, + "learning_rate": 3.8157262905162065e-05, + "loss": 0.6189, + "step": 5929 + }, + { + "epoch": 7.5904, + "grad_norm": 0.7645912170410156, + "learning_rate": 3.8155262104841937e-05, + "loss": 0.558, + "step": 5930 + }, + { + "epoch": 7.59168, + "grad_norm": 0.7344127297401428, + "learning_rate": 3.815326130452181e-05, + "loss": 0.5886, + "step": 5931 + }, + { + "epoch": 7.59296, + "grad_norm": 0.7792043685913086, + "learning_rate": 3.815126050420168e-05, + "loss": 0.5778, + "step": 5932 + }, + { + "epoch": 7.59424, + "grad_norm": 0.7682183980941772, + "learning_rate": 3.814925970388156e-05, + "loss": 0.6103, + "step": 5933 + }, + { + "epoch": 7.5955200000000005, + "grad_norm": 0.8293752670288086, + "learning_rate": 3.814725890356143e-05, + "loss": 0.6615, + "step": 5934 + }, + { + "epoch": 7.5968, + "grad_norm": 0.8193676471710205, + "learning_rate": 3.81452581032413e-05, + "loss": 0.6054, + "step": 5935 + }, + { + "epoch": 7.5980799999999995, + "grad_norm": 0.799968957901001, + "learning_rate": 3.814325730292117e-05, + "loss": 0.5967, + "step": 5936 + }, + { + "epoch": 7.59936, + "grad_norm": 0.7881608605384827, + "learning_rate": 3.814125650260104e-05, + "loss": 0.5734, + "step": 5937 + }, + { + "epoch": 7.60064, + "grad_norm": 0.7928145527839661, + "learning_rate": 3.813925570228091e-05, + "loss": 0.62, + "step": 5938 + }, + 
{ + "epoch": 7.60192, + "grad_norm": 0.8052318096160889, + "learning_rate": 3.813725490196078e-05, + "loss": 0.6097, + "step": 5939 + }, + { + "epoch": 7.6032, + "grad_norm": 0.7783374786376953, + "learning_rate": 3.813525410164066e-05, + "loss": 0.624, + "step": 5940 + }, + { + "epoch": 7.60448, + "grad_norm": 0.8174735307693481, + "learning_rate": 3.8133253301320534e-05, + "loss": 0.5862, + "step": 5941 + }, + { + "epoch": 7.60576, + "grad_norm": 0.8126026391983032, + "learning_rate": 3.8131252501000406e-05, + "loss": 0.6185, + "step": 5942 + }, + { + "epoch": 7.60704, + "grad_norm": 0.796207070350647, + "learning_rate": 3.812925170068028e-05, + "loss": 0.5721, + "step": 5943 + }, + { + "epoch": 7.60832, + "grad_norm": 0.8204118013381958, + "learning_rate": 3.812725090036014e-05, + "loss": 0.6101, + "step": 5944 + }, + { + "epoch": 7.6096, + "grad_norm": 0.8275149464607239, + "learning_rate": 3.8125250100040014e-05, + "loss": 0.6486, + "step": 5945 + }, + { + "epoch": 7.61088, + "grad_norm": 0.8166235089302063, + "learning_rate": 3.8123249299719886e-05, + "loss": 0.5877, + "step": 5946 + }, + { + "epoch": 7.61216, + "grad_norm": 0.7479138970375061, + "learning_rate": 3.8121248499399765e-05, + "loss": 0.5492, + "step": 5947 + }, + { + "epoch": 7.61344, + "grad_norm": 0.8284672498703003, + "learning_rate": 3.811924769907964e-05, + "loss": 0.6257, + "step": 5948 + }, + { + "epoch": 7.61472, + "grad_norm": 0.7393089532852173, + "learning_rate": 3.811724689875951e-05, + "loss": 0.5608, + "step": 5949 + }, + { + "epoch": 7.616, + "grad_norm": 0.8117731213569641, + "learning_rate": 3.811524609843938e-05, + "loss": 0.6299, + "step": 5950 + }, + { + "epoch": 7.61728, + "grad_norm": 0.7665390968322754, + "learning_rate": 3.811324529811925e-05, + "loss": 0.5933, + "step": 5951 + }, + { + "epoch": 7.61856, + "grad_norm": 0.7984145879745483, + "learning_rate": 3.811124449779912e-05, + "loss": 0.6006, + "step": 5952 + }, + { + "epoch": 7.61984, + "grad_norm": 
0.7704722881317139, + "learning_rate": 3.810924369747899e-05, + "loss": 0.5863, + "step": 5953 + }, + { + "epoch": 7.62112, + "grad_norm": 0.845035970211029, + "learning_rate": 3.810724289715887e-05, + "loss": 0.6554, + "step": 5954 + }, + { + "epoch": 7.6224, + "grad_norm": 0.8365921378135681, + "learning_rate": 3.810524209683874e-05, + "loss": 0.6329, + "step": 5955 + }, + { + "epoch": 7.62368, + "grad_norm": 0.8249934911727905, + "learning_rate": 3.810324129651861e-05, + "loss": 0.5811, + "step": 5956 + }, + { + "epoch": 7.62496, + "grad_norm": 0.7940043807029724, + "learning_rate": 3.8101240496198483e-05, + "loss": 0.5887, + "step": 5957 + }, + { + "epoch": 7.62624, + "grad_norm": 0.8075708746910095, + "learning_rate": 3.8099239695878355e-05, + "loss": 0.6427, + "step": 5958 + }, + { + "epoch": 7.62752, + "grad_norm": 0.7509844303131104, + "learning_rate": 3.809723889555823e-05, + "loss": 0.5248, + "step": 5959 + }, + { + "epoch": 7.6288, + "grad_norm": 0.8051660060882568, + "learning_rate": 3.809523809523809e-05, + "loss": 0.591, + "step": 5960 + }, + { + "epoch": 7.6300799999999995, + "grad_norm": 0.8326472640037537, + "learning_rate": 3.809323729491797e-05, + "loss": 0.6136, + "step": 5961 + }, + { + "epoch": 7.63136, + "grad_norm": 0.792871356010437, + "learning_rate": 3.809123649459784e-05, + "loss": 0.5931, + "step": 5962 + }, + { + "epoch": 7.63264, + "grad_norm": 0.7740362882614136, + "learning_rate": 3.8089235694277715e-05, + "loss": 0.5771, + "step": 5963 + }, + { + "epoch": 7.63392, + "grad_norm": 0.8323248624801636, + "learning_rate": 3.8087234893957586e-05, + "loss": 0.6383, + "step": 5964 + }, + { + "epoch": 7.6352, + "grad_norm": 0.8095336556434631, + "learning_rate": 3.808523409363746e-05, + "loss": 0.5937, + "step": 5965 + }, + { + "epoch": 7.63648, + "grad_norm": 0.8045695424079895, + "learning_rate": 3.808323329331733e-05, + "loss": 0.5883, + "step": 5966 + }, + { + "epoch": 7.63776, + "grad_norm": 0.825339138507843, + "learning_rate": 
3.80812324929972e-05, + "loss": 0.5761, + "step": 5967 + }, + { + "epoch": 7.63904, + "grad_norm": 0.8067010641098022, + "learning_rate": 3.8079231692677074e-05, + "loss": 0.6219, + "step": 5968 + }, + { + "epoch": 7.64032, + "grad_norm": 0.7915658950805664, + "learning_rate": 3.8077230892356946e-05, + "loss": 0.5803, + "step": 5969 + }, + { + "epoch": 7.6416, + "grad_norm": 0.7723389267921448, + "learning_rate": 3.807523009203682e-05, + "loss": 0.5954, + "step": 5970 + }, + { + "epoch": 7.64288, + "grad_norm": 0.8201357126235962, + "learning_rate": 3.807322929171669e-05, + "loss": 0.627, + "step": 5971 + }, + { + "epoch": 7.64416, + "grad_norm": 0.8042327761650085, + "learning_rate": 3.807122849139656e-05, + "loss": 0.6212, + "step": 5972 + }, + { + "epoch": 7.64544, + "grad_norm": 0.8056513667106628, + "learning_rate": 3.806922769107643e-05, + "loss": 0.5762, + "step": 5973 + }, + { + "epoch": 7.64672, + "grad_norm": 0.7721224427223206, + "learning_rate": 3.8067226890756305e-05, + "loss": 0.6134, + "step": 5974 + }, + { + "epoch": 7.648, + "grad_norm": 0.8109544515609741, + "learning_rate": 3.806522609043618e-05, + "loss": 0.6003, + "step": 5975 + }, + { + "epoch": 7.64928, + "grad_norm": 0.7777767181396484, + "learning_rate": 3.806322529011605e-05, + "loss": 0.6004, + "step": 5976 + }, + { + "epoch": 7.6505600000000005, + "grad_norm": 0.8008784651756287, + "learning_rate": 3.806122448979592e-05, + "loss": 0.6076, + "step": 5977 + }, + { + "epoch": 7.65184, + "grad_norm": 0.8032917976379395, + "learning_rate": 3.805922368947579e-05, + "loss": 0.595, + "step": 5978 + }, + { + "epoch": 7.65312, + "grad_norm": 0.7758605480194092, + "learning_rate": 3.8057222889155664e-05, + "loss": 0.5175, + "step": 5979 + }, + { + "epoch": 7.6544, + "grad_norm": 0.7716826796531677, + "learning_rate": 3.8055222088835536e-05, + "loss": 0.603, + "step": 5980 + }, + { + "epoch": 7.65568, + "grad_norm": 0.7985824346542358, + "learning_rate": 3.805322128851541e-05, + "loss": 0.602, + 
"step": 5981 + }, + { + "epoch": 7.65696, + "grad_norm": 0.8194901347160339, + "learning_rate": 3.805122048819528e-05, + "loss": 0.6157, + "step": 5982 + }, + { + "epoch": 7.65824, + "grad_norm": 0.8596216440200806, + "learning_rate": 3.804921968787515e-05, + "loss": 0.6033, + "step": 5983 + }, + { + "epoch": 7.65952, + "grad_norm": 0.8513696193695068, + "learning_rate": 3.8047218887555024e-05, + "loss": 0.6309, + "step": 5984 + }, + { + "epoch": 7.6608, + "grad_norm": 0.8196579813957214, + "learning_rate": 3.8045218087234895e-05, + "loss": 0.6085, + "step": 5985 + }, + { + "epoch": 7.66208, + "grad_norm": 0.7808190584182739, + "learning_rate": 3.804321728691477e-05, + "loss": 0.6266, + "step": 5986 + }, + { + "epoch": 7.66336, + "grad_norm": 0.792535662651062, + "learning_rate": 3.804121648659464e-05, + "loss": 0.5974, + "step": 5987 + }, + { + "epoch": 7.66464, + "grad_norm": 0.7937912940979004, + "learning_rate": 3.803921568627451e-05, + "loss": 0.6269, + "step": 5988 + }, + { + "epoch": 7.66592, + "grad_norm": 0.7414507269859314, + "learning_rate": 3.803721488595438e-05, + "loss": 0.575, + "step": 5989 + }, + { + "epoch": 7.6672, + "grad_norm": 0.8128142952919006, + "learning_rate": 3.8035214085634255e-05, + "loss": 0.5785, + "step": 5990 + }, + { + "epoch": 7.66848, + "grad_norm": 0.8728705644607544, + "learning_rate": 3.8033213285314127e-05, + "loss": 0.6354, + "step": 5991 + }, + { + "epoch": 7.66976, + "grad_norm": 0.7969766855239868, + "learning_rate": 3.8031212484994e-05, + "loss": 0.6223, + "step": 5992 + }, + { + "epoch": 7.67104, + "grad_norm": 0.8199662566184998, + "learning_rate": 3.802921168467387e-05, + "loss": 0.6278, + "step": 5993 + }, + { + "epoch": 7.67232, + "grad_norm": 0.7900199890136719, + "learning_rate": 3.802721088435374e-05, + "loss": 0.5949, + "step": 5994 + }, + { + "epoch": 7.6736, + "grad_norm": 0.7966949939727783, + "learning_rate": 3.8025210084033614e-05, + "loss": 0.6385, + "step": 5995 + }, + { + "epoch": 7.67488, + 
"grad_norm": 0.7564438581466675, + "learning_rate": 3.8023209283713486e-05, + "loss": 0.5726, + "step": 5996 + }, + { + "epoch": 7.67616, + "grad_norm": 0.8227134943008423, + "learning_rate": 3.8021208483393364e-05, + "loss": 0.6368, + "step": 5997 + }, + { + "epoch": 7.67744, + "grad_norm": 0.8298145532608032, + "learning_rate": 3.801920768307323e-05, + "loss": 0.6483, + "step": 5998 + }, + { + "epoch": 7.67872, + "grad_norm": 0.8414880037307739, + "learning_rate": 3.80172068827531e-05, + "loss": 0.6564, + "step": 5999 + }, + { + "epoch": 7.68, + "grad_norm": 0.7608658075332642, + "learning_rate": 3.801520608243297e-05, + "loss": 0.5326, + "step": 6000 + }, + { + "epoch": 7.68128, + "grad_norm": 0.816177487373352, + "learning_rate": 3.8013205282112845e-05, + "loss": 0.5995, + "step": 6001 + }, + { + "epoch": 7.68256, + "grad_norm": 0.7969012260437012, + "learning_rate": 3.801120448179272e-05, + "loss": 0.576, + "step": 6002 + }, + { + "epoch": 7.68384, + "grad_norm": 0.786906361579895, + "learning_rate": 3.800920368147259e-05, + "loss": 0.6084, + "step": 6003 + }, + { + "epoch": 7.6851199999999995, + "grad_norm": 0.764916718006134, + "learning_rate": 3.800720288115247e-05, + "loss": 0.6095, + "step": 6004 + }, + { + "epoch": 7.6864, + "grad_norm": 0.7754150629043579, + "learning_rate": 3.800520208083234e-05, + "loss": 0.6024, + "step": 6005 + }, + { + "epoch": 7.68768, + "grad_norm": 0.7677636742591858, + "learning_rate": 3.8003201280512204e-05, + "loss": 0.5886, + "step": 6006 + }, + { + "epoch": 7.68896, + "grad_norm": 0.8274985551834106, + "learning_rate": 3.8001200480192076e-05, + "loss": 0.6117, + "step": 6007 + }, + { + "epoch": 7.69024, + "grad_norm": 0.8422303199768066, + "learning_rate": 3.799919967987195e-05, + "loss": 0.6012, + "step": 6008 + }, + { + "epoch": 7.69152, + "grad_norm": 0.7773991227149963, + "learning_rate": 3.799719887955182e-05, + "loss": 0.6188, + "step": 6009 + }, + { + "epoch": 7.6928, + "grad_norm": 0.8241961002349854, + 
"learning_rate": 3.799519807923169e-05, + "loss": 0.6102, + "step": 6010 + }, + { + "epoch": 7.69408, + "grad_norm": 0.7824380397796631, + "learning_rate": 3.799319727891157e-05, + "loss": 0.6254, + "step": 6011 + }, + { + "epoch": 7.69536, + "grad_norm": 0.7920189499855042, + "learning_rate": 3.799119647859144e-05, + "loss": 0.687, + "step": 6012 + }, + { + "epoch": 7.69664, + "grad_norm": 0.7902247309684753, + "learning_rate": 3.7989195678271314e-05, + "loss": 0.5669, + "step": 6013 + }, + { + "epoch": 7.69792, + "grad_norm": 0.8157680630683899, + "learning_rate": 3.798719487795118e-05, + "loss": 0.6307, + "step": 6014 + }, + { + "epoch": 7.6992, + "grad_norm": 0.8133694529533386, + "learning_rate": 3.798519407763105e-05, + "loss": 0.6101, + "step": 6015 + }, + { + "epoch": 7.70048, + "grad_norm": 0.8677835464477539, + "learning_rate": 3.798319327731092e-05, + "loss": 0.6809, + "step": 6016 + }, + { + "epoch": 7.70176, + "grad_norm": 0.7828230261802673, + "learning_rate": 3.7981192476990795e-05, + "loss": 0.5994, + "step": 6017 + }, + { + "epoch": 7.70304, + "grad_norm": 0.8306964039802551, + "learning_rate": 3.7979191676670673e-05, + "loss": 0.6098, + "step": 6018 + }, + { + "epoch": 7.70432, + "grad_norm": 0.7977209687232971, + "learning_rate": 3.7977190876350545e-05, + "loss": 0.6301, + "step": 6019 + }, + { + "epoch": 7.7056000000000004, + "grad_norm": 0.7888930439949036, + "learning_rate": 3.797519007603042e-05, + "loss": 0.6073, + "step": 6020 + }, + { + "epoch": 7.70688, + "grad_norm": 0.813720166683197, + "learning_rate": 3.797318927571029e-05, + "loss": 0.6056, + "step": 6021 + }, + { + "epoch": 7.70816, + "grad_norm": 0.7712581157684326, + "learning_rate": 3.7971188475390154e-05, + "loss": 0.6198, + "step": 6022 + }, + { + "epoch": 7.70944, + "grad_norm": 0.8599300384521484, + "learning_rate": 3.7969187675070026e-05, + "loss": 0.6541, + "step": 6023 + }, + { + "epoch": 7.71072, + "grad_norm": 0.8106684684753418, + "learning_rate": 3.79671868747499e-05, 
+ "loss": 0.625, + "step": 6024 + }, + { + "epoch": 7.712, + "grad_norm": 0.8189576268196106, + "learning_rate": 3.7965186074429776e-05, + "loss": 0.6379, + "step": 6025 + }, + { + "epoch": 7.71328, + "grad_norm": 0.8133044838905334, + "learning_rate": 3.796318527410965e-05, + "loss": 0.6289, + "step": 6026 + }, + { + "epoch": 7.71456, + "grad_norm": 0.7700027227401733, + "learning_rate": 3.796118447378952e-05, + "loss": 0.5445, + "step": 6027 + }, + { + "epoch": 7.71584, + "grad_norm": 0.7435338497161865, + "learning_rate": 3.795918367346939e-05, + "loss": 0.5761, + "step": 6028 + }, + { + "epoch": 7.7171199999999995, + "grad_norm": 0.7466886639595032, + "learning_rate": 3.7957182873149264e-05, + "loss": 0.563, + "step": 6029 + }, + { + "epoch": 7.7184, + "grad_norm": 0.7899182438850403, + "learning_rate": 3.795518207282913e-05, + "loss": 0.5442, + "step": 6030 + }, + { + "epoch": 7.71968, + "grad_norm": 0.8155211806297302, + "learning_rate": 3.7953181272509e-05, + "loss": 0.6445, + "step": 6031 + }, + { + "epoch": 7.72096, + "grad_norm": 0.8087602257728577, + "learning_rate": 3.795118047218888e-05, + "loss": 0.6062, + "step": 6032 + }, + { + "epoch": 7.72224, + "grad_norm": 0.7604132890701294, + "learning_rate": 3.794917967186875e-05, + "loss": 0.6147, + "step": 6033 + }, + { + "epoch": 7.72352, + "grad_norm": 0.8204428553581238, + "learning_rate": 3.794717887154862e-05, + "loss": 0.5717, + "step": 6034 + }, + { + "epoch": 7.7248, + "grad_norm": 0.7669458985328674, + "learning_rate": 3.7945178071228495e-05, + "loss": 0.5662, + "step": 6035 + }, + { + "epoch": 7.72608, + "grad_norm": 0.765567421913147, + "learning_rate": 3.794317727090837e-05, + "loss": 0.5558, + "step": 6036 + }, + { + "epoch": 7.72736, + "grad_norm": 0.8454002141952515, + "learning_rate": 3.794117647058824e-05, + "loss": 0.6204, + "step": 6037 + }, + { + "epoch": 7.72864, + "grad_norm": 0.7641090750694275, + "learning_rate": 3.7939175670268104e-05, + "loss": 0.5612, + "step": 6038 + }, + { + 
"epoch": 7.72992, + "grad_norm": 0.8416698575019836, + "learning_rate": 3.793717486994798e-05, + "loss": 0.63, + "step": 6039 + }, + { + "epoch": 7.7312, + "grad_norm": 0.8227651119232178, + "learning_rate": 3.7935174069627854e-05, + "loss": 0.6666, + "step": 6040 + }, + { + "epoch": 7.73248, + "grad_norm": 0.8543378114700317, + "learning_rate": 3.7933173269307726e-05, + "loss": 0.6023, + "step": 6041 + }, + { + "epoch": 7.73376, + "grad_norm": 0.761553943157196, + "learning_rate": 3.79311724689876e-05, + "loss": 0.583, + "step": 6042 + }, + { + "epoch": 7.73504, + "grad_norm": 0.8009664416313171, + "learning_rate": 3.792917166866747e-05, + "loss": 0.6192, + "step": 6043 + }, + { + "epoch": 7.73632, + "grad_norm": 0.7856703400611877, + "learning_rate": 3.792717086834734e-05, + "loss": 0.5523, + "step": 6044 + }, + { + "epoch": 7.7376000000000005, + "grad_norm": 0.7818490862846375, + "learning_rate": 3.7925170068027214e-05, + "loss": 0.6144, + "step": 6045 + }, + { + "epoch": 7.73888, + "grad_norm": 0.7923960089683533, + "learning_rate": 3.7923169267707085e-05, + "loss": 0.6022, + "step": 6046 + }, + { + "epoch": 7.74016, + "grad_norm": 0.769059956073761, + "learning_rate": 3.792116846738696e-05, + "loss": 0.5452, + "step": 6047 + }, + { + "epoch": 7.74144, + "grad_norm": 0.7857373952865601, + "learning_rate": 3.791916766706683e-05, + "loss": 0.6226, + "step": 6048 + }, + { + "epoch": 7.74272, + "grad_norm": 0.7994586825370789, + "learning_rate": 3.79171668667467e-05, + "loss": 0.5875, + "step": 6049 + }, + { + "epoch": 7.744, + "grad_norm": 0.7892863154411316, + "learning_rate": 3.791516606642657e-05, + "loss": 0.6333, + "step": 6050 + }, + { + "epoch": 7.74528, + "grad_norm": 0.8037270903587341, + "learning_rate": 3.7913165266106445e-05, + "loss": 0.5936, + "step": 6051 + }, + { + "epoch": 7.74656, + "grad_norm": 0.7748731374740601, + "learning_rate": 3.7911164465786317e-05, + "loss": 0.5943, + "step": 6052 + }, + { + "epoch": 7.74784, + "grad_norm": 
0.7597405910491943, + "learning_rate": 3.790916366546619e-05, + "loss": 0.5837, + "step": 6053 + }, + { + "epoch": 7.74912, + "grad_norm": 0.8196899890899658, + "learning_rate": 3.790716286514606e-05, + "loss": 0.6103, + "step": 6054 + }, + { + "epoch": 7.7504, + "grad_norm": 0.7854442000389099, + "learning_rate": 3.790516206482593e-05, + "loss": 0.6141, + "step": 6055 + }, + { + "epoch": 7.75168, + "grad_norm": 0.7270790934562683, + "learning_rate": 3.7903161264505804e-05, + "loss": 0.4887, + "step": 6056 + }, + { + "epoch": 7.75296, + "grad_norm": 0.75218266248703, + "learning_rate": 3.7901160464185676e-05, + "loss": 0.5902, + "step": 6057 + }, + { + "epoch": 7.75424, + "grad_norm": 0.8306524753570557, + "learning_rate": 3.789915966386555e-05, + "loss": 0.6816, + "step": 6058 + }, + { + "epoch": 7.75552, + "grad_norm": 0.7545936703681946, + "learning_rate": 3.789715886354542e-05, + "loss": 0.5668, + "step": 6059 + }, + { + "epoch": 7.7568, + "grad_norm": 0.7960179448127747, + "learning_rate": 3.789515806322529e-05, + "loss": 0.5391, + "step": 6060 + }, + { + "epoch": 7.75808, + "grad_norm": 0.755833625793457, + "learning_rate": 3.789315726290516e-05, + "loss": 0.5984, + "step": 6061 + }, + { + "epoch": 7.75936, + "grad_norm": 0.8086370229721069, + "learning_rate": 3.7891156462585035e-05, + "loss": 0.6226, + "step": 6062 + }, + { + "epoch": 7.76064, + "grad_norm": 0.8106130361557007, + "learning_rate": 3.788915566226491e-05, + "loss": 0.6215, + "step": 6063 + }, + { + "epoch": 7.76192, + "grad_norm": 0.7895246744155884, + "learning_rate": 3.788715486194478e-05, + "loss": 0.6468, + "step": 6064 + }, + { + "epoch": 7.7632, + "grad_norm": 0.8080568313598633, + "learning_rate": 3.788515406162465e-05, + "loss": 0.6082, + "step": 6065 + }, + { + "epoch": 7.76448, + "grad_norm": 0.7731059193611145, + "learning_rate": 3.788315326130452e-05, + "loss": 0.6086, + "step": 6066 + }, + { + "epoch": 7.76576, + "grad_norm": 0.8263415694236755, + "learning_rate": 
3.78811524609844e-05, + "loss": 0.6548, + "step": 6067 + }, + { + "epoch": 7.76704, + "grad_norm": 0.7245672345161438, + "learning_rate": 3.7879151660664266e-05, + "loss": 0.5606, + "step": 6068 + }, + { + "epoch": 7.76832, + "grad_norm": 0.764879047870636, + "learning_rate": 3.787715086034414e-05, + "loss": 0.6432, + "step": 6069 + }, + { + "epoch": 7.7696, + "grad_norm": 0.7973184585571289, + "learning_rate": 3.787515006002401e-05, + "loss": 0.6108, + "step": 6070 + }, + { + "epoch": 7.77088, + "grad_norm": 0.7751659750938416, + "learning_rate": 3.787314925970388e-05, + "loss": 0.6245, + "step": 6071 + }, + { + "epoch": 7.7721599999999995, + "grad_norm": 0.76836758852005, + "learning_rate": 3.7871148459383754e-05, + "loss": 0.5633, + "step": 6072 + }, + { + "epoch": 7.77344, + "grad_norm": 0.7604205012321472, + "learning_rate": 3.7869147659063626e-05, + "loss": 0.6017, + "step": 6073 + }, + { + "epoch": 7.77472, + "grad_norm": 0.8240510821342468, + "learning_rate": 3.7867146858743504e-05, + "loss": 0.5819, + "step": 6074 + }, + { + "epoch": 7.776, + "grad_norm": 0.7578047513961792, + "learning_rate": 3.7865146058423376e-05, + "loss": 0.562, + "step": 6075 + }, + { + "epoch": 7.77728, + "grad_norm": 0.7977784872055054, + "learning_rate": 3.786314525810324e-05, + "loss": 0.6071, + "step": 6076 + }, + { + "epoch": 7.77856, + "grad_norm": 0.7755118608474731, + "learning_rate": 3.786114445778311e-05, + "loss": 0.5861, + "step": 6077 + }, + { + "epoch": 7.77984, + "grad_norm": 0.7646934390068054, + "learning_rate": 3.7859143657462985e-05, + "loss": 0.6142, + "step": 6078 + }, + { + "epoch": 7.78112, + "grad_norm": 0.8070874214172363, + "learning_rate": 3.785714285714286e-05, + "loss": 0.6089, + "step": 6079 + }, + { + "epoch": 7.7824, + "grad_norm": 0.7948157787322998, + "learning_rate": 3.785514205682273e-05, + "loss": 0.6112, + "step": 6080 + }, + { + "epoch": 7.78368, + "grad_norm": 0.7674957513809204, + "learning_rate": 3.785314125650261e-05, + "loss": 0.5461, + 
"step": 6081 + }, + { + "epoch": 7.78496, + "grad_norm": 0.7721199989318848, + "learning_rate": 3.785114045618248e-05, + "loss": 0.5872, + "step": 6082 + }, + { + "epoch": 7.78624, + "grad_norm": 0.8199625015258789, + "learning_rate": 3.784913965586235e-05, + "loss": 0.6066, + "step": 6083 + }, + { + "epoch": 7.78752, + "grad_norm": 0.8024269342422485, + "learning_rate": 3.7847138855542216e-05, + "loss": 0.5541, + "step": 6084 + }, + { + "epoch": 7.7888, + "grad_norm": 0.7629589438438416, + "learning_rate": 3.784513805522209e-05, + "loss": 0.5699, + "step": 6085 + }, + { + "epoch": 7.79008, + "grad_norm": 0.7849573493003845, + "learning_rate": 3.784313725490196e-05, + "loss": 0.5958, + "step": 6086 + }, + { + "epoch": 7.79136, + "grad_norm": 0.8116906881332397, + "learning_rate": 3.784113645458183e-05, + "loss": 0.6552, + "step": 6087 + }, + { + "epoch": 7.7926400000000005, + "grad_norm": 0.7979661822319031, + "learning_rate": 3.783913565426171e-05, + "loss": 0.6307, + "step": 6088 + }, + { + "epoch": 7.79392, + "grad_norm": 0.8299638628959656, + "learning_rate": 3.783713485394158e-05, + "loss": 0.6451, + "step": 6089 + }, + { + "epoch": 7.7952, + "grad_norm": 0.787670910358429, + "learning_rate": 3.7835134053621454e-05, + "loss": 0.6323, + "step": 6090 + }, + { + "epoch": 7.79648, + "grad_norm": 0.8025368452072144, + "learning_rate": 3.7833133253301326e-05, + "loss": 0.6199, + "step": 6091 + }, + { + "epoch": 7.79776, + "grad_norm": 0.7752863168716431, + "learning_rate": 3.783113245298119e-05, + "loss": 0.5975, + "step": 6092 + }, + { + "epoch": 7.79904, + "grad_norm": 0.7841124534606934, + "learning_rate": 3.782913165266106e-05, + "loss": 0.5705, + "step": 6093 + }, + { + "epoch": 7.80032, + "grad_norm": 0.7986131310462952, + "learning_rate": 3.7827130852340935e-05, + "loss": 0.5626, + "step": 6094 + }, + { + "epoch": 7.8016, + "grad_norm": 0.7659592032432556, + "learning_rate": 3.7825130052020806e-05, + "loss": 0.582, + "step": 6095 + }, + { + "epoch": 7.80288, 
+ "grad_norm": 0.7751227617263794, + "learning_rate": 3.7823129251700685e-05, + "loss": 0.589, + "step": 6096 + }, + { + "epoch": 7.8041599999999995, + "grad_norm": 0.810896635055542, + "learning_rate": 3.782112845138056e-05, + "loss": 0.5888, + "step": 6097 + }, + { + "epoch": 7.80544, + "grad_norm": 0.8425770998001099, + "learning_rate": 3.781912765106043e-05, + "loss": 0.6118, + "step": 6098 + }, + { + "epoch": 7.80672, + "grad_norm": 0.81880784034729, + "learning_rate": 3.78171268507403e-05, + "loss": 0.6555, + "step": 6099 + }, + { + "epoch": 7.808, + "grad_norm": 0.8141629099845886, + "learning_rate": 3.7815126050420166e-05, + "loss": 0.6628, + "step": 6100 + }, + { + "epoch": 7.80928, + "grad_norm": 0.8063963651657104, + "learning_rate": 3.781312525010004e-05, + "loss": 0.5981, + "step": 6101 + }, + { + "epoch": 7.81056, + "grad_norm": 0.8052300214767456, + "learning_rate": 3.781112444977991e-05, + "loss": 0.5995, + "step": 6102 + }, + { + "epoch": 7.81184, + "grad_norm": 0.804269552230835, + "learning_rate": 3.780912364945979e-05, + "loss": 0.6108, + "step": 6103 + }, + { + "epoch": 7.81312, + "grad_norm": 0.7562627196311951, + "learning_rate": 3.780712284913966e-05, + "loss": 0.5164, + "step": 6104 + }, + { + "epoch": 7.8144, + "grad_norm": 0.800515353679657, + "learning_rate": 3.780512204881953e-05, + "loss": 0.5988, + "step": 6105 + }, + { + "epoch": 7.81568, + "grad_norm": 0.8121452331542969, + "learning_rate": 3.7803121248499404e-05, + "loss": 0.632, + "step": 6106 + }, + { + "epoch": 7.81696, + "grad_norm": 0.8428505063056946, + "learning_rate": 3.7801120448179275e-05, + "loss": 0.6044, + "step": 6107 + }, + { + "epoch": 7.81824, + "grad_norm": 0.8227953314781189, + "learning_rate": 3.779911964785914e-05, + "loss": 0.6519, + "step": 6108 + }, + { + "epoch": 7.81952, + "grad_norm": 0.7917014360427856, + "learning_rate": 3.779711884753901e-05, + "loss": 0.5866, + "step": 6109 + }, + { + "epoch": 7.8208, + "grad_norm": 0.8045591115951538, + 
"learning_rate": 3.779511804721889e-05, + "loss": 0.6472, + "step": 6110 + }, + { + "epoch": 7.82208, + "grad_norm": 0.772802472114563, + "learning_rate": 3.779311724689876e-05, + "loss": 0.5495, + "step": 6111 + }, + { + "epoch": 7.82336, + "grad_norm": 0.8186236023902893, + "learning_rate": 3.7791116446578635e-05, + "loss": 0.664, + "step": 6112 + }, + { + "epoch": 7.8246400000000005, + "grad_norm": 0.806842029094696, + "learning_rate": 3.778911564625851e-05, + "loss": 0.6216, + "step": 6113 + }, + { + "epoch": 7.82592, + "grad_norm": 0.7762079238891602, + "learning_rate": 3.778711484593838e-05, + "loss": 0.5853, + "step": 6114 + }, + { + "epoch": 7.8272, + "grad_norm": 0.8283485770225525, + "learning_rate": 3.778511404561825e-05, + "loss": 0.5947, + "step": 6115 + }, + { + "epoch": 7.82848, + "grad_norm": 0.8360929489135742, + "learning_rate": 3.7783113245298115e-05, + "loss": 0.6088, + "step": 6116 + }, + { + "epoch": 7.82976, + "grad_norm": 0.7732667922973633, + "learning_rate": 3.7781112444977994e-05, + "loss": 0.6161, + "step": 6117 + }, + { + "epoch": 7.83104, + "grad_norm": 0.8080599308013916, + "learning_rate": 3.7779111644657866e-05, + "loss": 0.6314, + "step": 6118 + }, + { + "epoch": 7.83232, + "grad_norm": 0.8000563979148865, + "learning_rate": 3.777711084433774e-05, + "loss": 0.6548, + "step": 6119 + }, + { + "epoch": 7.8336, + "grad_norm": 0.8152042031288147, + "learning_rate": 3.777511004401761e-05, + "loss": 0.6637, + "step": 6120 + }, + { + "epoch": 7.83488, + "grad_norm": 0.8046169877052307, + "learning_rate": 3.777310924369748e-05, + "loss": 0.6306, + "step": 6121 + }, + { + "epoch": 7.83616, + "grad_norm": 0.7880290746688843, + "learning_rate": 3.777110844337735e-05, + "loss": 0.6418, + "step": 6122 + }, + { + "epoch": 7.83744, + "grad_norm": 0.8029178380966187, + "learning_rate": 3.7769107643057225e-05, + "loss": 0.6183, + "step": 6123 + }, + { + "epoch": 7.83872, + "grad_norm": 0.7654139995574951, + "learning_rate": 3.77671068427371e-05, + 
"loss": 0.5793, + "step": 6124 + }, + { + "epoch": 7.84, + "grad_norm": 0.8131774067878723, + "learning_rate": 3.776510604241697e-05, + "loss": 0.6485, + "step": 6125 + }, + { + "epoch": 7.84128, + "grad_norm": 0.7340885996818542, + "learning_rate": 3.776310524209684e-05, + "loss": 0.5605, + "step": 6126 + }, + { + "epoch": 7.84256, + "grad_norm": 0.7948605418205261, + "learning_rate": 3.776110444177671e-05, + "loss": 0.588, + "step": 6127 + }, + { + "epoch": 7.84384, + "grad_norm": 0.823373019695282, + "learning_rate": 3.7759103641456584e-05, + "loss": 0.5925, + "step": 6128 + }, + { + "epoch": 7.84512, + "grad_norm": 0.8136553764343262, + "learning_rate": 3.7757102841136456e-05, + "loss": 0.6336, + "step": 6129 + }, + { + "epoch": 7.8464, + "grad_norm": 0.7955484390258789, + "learning_rate": 3.775510204081633e-05, + "loss": 0.5678, + "step": 6130 + }, + { + "epoch": 7.84768, + "grad_norm": 0.7093358039855957, + "learning_rate": 3.77531012404962e-05, + "loss": 0.5829, + "step": 6131 + }, + { + "epoch": 7.84896, + "grad_norm": 0.7424338459968567, + "learning_rate": 3.775110044017607e-05, + "loss": 0.5677, + "step": 6132 + }, + { + "epoch": 7.85024, + "grad_norm": 0.7559502720832825, + "learning_rate": 3.7749099639855944e-05, + "loss": 0.5777, + "step": 6133 + }, + { + "epoch": 7.85152, + "grad_norm": 0.8359054923057556, + "learning_rate": 3.7747098839535816e-05, + "loss": 0.5884, + "step": 6134 + }, + { + "epoch": 7.8528, + "grad_norm": 0.8114975094795227, + "learning_rate": 3.774509803921569e-05, + "loss": 0.5916, + "step": 6135 + }, + { + "epoch": 7.85408, + "grad_norm": 0.8038936853408813, + "learning_rate": 3.774309723889556e-05, + "loss": 0.6401, + "step": 6136 + }, + { + "epoch": 7.85536, + "grad_norm": 0.7915340065956116, + "learning_rate": 3.774109643857543e-05, + "loss": 0.574, + "step": 6137 + }, + { + "epoch": 7.85664, + "grad_norm": 0.8028757572174072, + "learning_rate": 3.77390956382553e-05, + "loss": 0.5954, + "step": 6138 + }, + { + "epoch": 7.85792, 
+ "grad_norm": 0.7789997458457947, + "learning_rate": 3.7737094837935175e-05, + "loss": 0.5468, + "step": 6139 + }, + { + "epoch": 7.8591999999999995, + "grad_norm": 0.8444523215293884, + "learning_rate": 3.773509403761505e-05, + "loss": 0.6584, + "step": 6140 + }, + { + "epoch": 7.86048, + "grad_norm": 0.7982633709907532, + "learning_rate": 3.773309323729492e-05, + "loss": 0.6439, + "step": 6141 + }, + { + "epoch": 7.86176, + "grad_norm": 0.8487026691436768, + "learning_rate": 3.773109243697479e-05, + "loss": 0.6307, + "step": 6142 + }, + { + "epoch": 7.86304, + "grad_norm": 0.8426850438117981, + "learning_rate": 3.772909163665466e-05, + "loss": 0.5936, + "step": 6143 + }, + { + "epoch": 7.86432, + "grad_norm": 0.82231205701828, + "learning_rate": 3.7727090836334534e-05, + "loss": 0.5931, + "step": 6144 + }, + { + "epoch": 7.8656, + "grad_norm": 0.8080248832702637, + "learning_rate": 3.772509003601441e-05, + "loss": 0.5993, + "step": 6145 + }, + { + "epoch": 7.86688, + "grad_norm": 0.7820031046867371, + "learning_rate": 3.772308923569428e-05, + "loss": 0.5974, + "step": 6146 + }, + { + "epoch": 7.86816, + "grad_norm": 0.7480020523071289, + "learning_rate": 3.772108843537415e-05, + "loss": 0.5778, + "step": 6147 + }, + { + "epoch": 7.86944, + "grad_norm": 0.7739498019218445, + "learning_rate": 3.771908763505402e-05, + "loss": 0.6069, + "step": 6148 + }, + { + "epoch": 7.87072, + "grad_norm": 0.79982590675354, + "learning_rate": 3.7717086834733893e-05, + "loss": 0.6673, + "step": 6149 + }, + { + "epoch": 7.872, + "grad_norm": 0.7504255175590515, + "learning_rate": 3.7715086034413765e-05, + "loss": 0.5946, + "step": 6150 + }, + { + "epoch": 7.87328, + "grad_norm": 0.7930676341056824, + "learning_rate": 3.771308523409364e-05, + "loss": 0.6112, + "step": 6151 + }, + { + "epoch": 7.87456, + "grad_norm": 0.7735934853553772, + "learning_rate": 3.7711084433773516e-05, + "loss": 0.5914, + "step": 6152 + }, + { + "epoch": 7.87584, + "grad_norm": 0.8146105408668518, + 
"learning_rate": 3.770908363345339e-05, + "loss": 0.6164, + "step": 6153 + }, + { + "epoch": 7.87712, + "grad_norm": 0.7409805655479431, + "learning_rate": 3.770708283313325e-05, + "loss": 0.4994, + "step": 6154 + }, + { + "epoch": 7.8784, + "grad_norm": 0.8767794370651245, + "learning_rate": 3.7705082032813125e-05, + "loss": 0.6723, + "step": 6155 + }, + { + "epoch": 7.8796800000000005, + "grad_norm": 0.7842835187911987, + "learning_rate": 3.7703081232492996e-05, + "loss": 0.5496, + "step": 6156 + }, + { + "epoch": 7.88096, + "grad_norm": 0.8183816075325012, + "learning_rate": 3.770108043217287e-05, + "loss": 0.5986, + "step": 6157 + }, + { + "epoch": 7.88224, + "grad_norm": 0.7943567037582397, + "learning_rate": 3.769907963185274e-05, + "loss": 0.6196, + "step": 6158 + }, + { + "epoch": 7.88352, + "grad_norm": 0.7995247840881348, + "learning_rate": 3.769707883153262e-05, + "loss": 0.6269, + "step": 6159 + }, + { + "epoch": 7.8848, + "grad_norm": 0.798160970211029, + "learning_rate": 3.769507803121249e-05, + "loss": 0.5612, + "step": 6160 + }, + { + "epoch": 7.88608, + "grad_norm": 0.7991229295730591, + "learning_rate": 3.769307723089236e-05, + "loss": 0.5785, + "step": 6161 + }, + { + "epoch": 7.88736, + "grad_norm": 0.7615501880645752, + "learning_rate": 3.769107643057223e-05, + "loss": 0.5869, + "step": 6162 + }, + { + "epoch": 7.88864, + "grad_norm": 0.7592645287513733, + "learning_rate": 3.76890756302521e-05, + "loss": 0.5602, + "step": 6163 + }, + { + "epoch": 7.88992, + "grad_norm": 0.8296216726303101, + "learning_rate": 3.768707482993197e-05, + "loss": 0.6587, + "step": 6164 + }, + { + "epoch": 7.8911999999999995, + "grad_norm": 0.7715473175048828, + "learning_rate": 3.768507402961184e-05, + "loss": 0.6282, + "step": 6165 + }, + { + "epoch": 7.89248, + "grad_norm": 0.786847710609436, + "learning_rate": 3.768307322929172e-05, + "loss": 0.6401, + "step": 6166 + }, + { + "epoch": 7.89376, + "grad_norm": 0.7830649018287659, + "learning_rate": 
3.7681072428971594e-05, + "loss": 0.6306, + "step": 6167 + }, + { + "epoch": 7.89504, + "grad_norm": 0.8743398189544678, + "learning_rate": 3.7679071628651466e-05, + "loss": 0.6588, + "step": 6168 + }, + { + "epoch": 7.89632, + "grad_norm": 0.8453099131584167, + "learning_rate": 3.767707082833134e-05, + "loss": 0.6599, + "step": 6169 + }, + { + "epoch": 7.8976, + "grad_norm": 0.8115950226783752, + "learning_rate": 3.76750700280112e-05, + "loss": 0.6126, + "step": 6170 + }, + { + "epoch": 7.89888, + "grad_norm": 0.8021664619445801, + "learning_rate": 3.7673069227691074e-05, + "loss": 0.5736, + "step": 6171 + }, + { + "epoch": 7.90016, + "grad_norm": 0.7566850781440735, + "learning_rate": 3.7671068427370946e-05, + "loss": 0.5289, + "step": 6172 + }, + { + "epoch": 7.90144, + "grad_norm": 0.8024584054946899, + "learning_rate": 3.7669067627050825e-05, + "loss": 0.6521, + "step": 6173 + }, + { + "epoch": 7.90272, + "grad_norm": 0.7810555100440979, + "learning_rate": 3.76670668267307e-05, + "loss": 0.6171, + "step": 6174 + }, + { + "epoch": 7.904, + "grad_norm": 0.839141309261322, + "learning_rate": 3.766506602641057e-05, + "loss": 0.6384, + "step": 6175 + }, + { + "epoch": 7.90528, + "grad_norm": 0.8140023350715637, + "learning_rate": 3.766306522609044e-05, + "loss": 0.6113, + "step": 6176 + }, + { + "epoch": 7.90656, + "grad_norm": 0.7578498721122742, + "learning_rate": 3.766106442577031e-05, + "loss": 0.5981, + "step": 6177 + }, + { + "epoch": 7.90784, + "grad_norm": 0.850180447101593, + "learning_rate": 3.765906362545018e-05, + "loss": 0.6304, + "step": 6178 + }, + { + "epoch": 7.90912, + "grad_norm": 0.7836924195289612, + "learning_rate": 3.765706282513005e-05, + "loss": 0.6016, + "step": 6179 + }, + { + "epoch": 7.9104, + "grad_norm": 0.7858523726463318, + "learning_rate": 3.765506202480993e-05, + "loss": 0.5774, + "step": 6180 + }, + { + "epoch": 7.9116800000000005, + "grad_norm": 0.8169428706169128, + "learning_rate": 3.76530612244898e-05, + "loss": 0.6141, + 
"step": 6181 + }, + { + "epoch": 7.91296, + "grad_norm": 0.779033362865448, + "learning_rate": 3.765106042416967e-05, + "loss": 0.5847, + "step": 6182 + }, + { + "epoch": 7.91424, + "grad_norm": 0.8012449741363525, + "learning_rate": 3.764905962384954e-05, + "loss": 0.6393, + "step": 6183 + }, + { + "epoch": 7.91552, + "grad_norm": 0.7956286668777466, + "learning_rate": 3.7647058823529415e-05, + "loss": 0.5944, + "step": 6184 + }, + { + "epoch": 7.9168, + "grad_norm": 0.79158616065979, + "learning_rate": 3.764505802320929e-05, + "loss": 0.5891, + "step": 6185 + }, + { + "epoch": 7.91808, + "grad_norm": 0.7679545879364014, + "learning_rate": 3.764305722288915e-05, + "loss": 0.5964, + "step": 6186 + }, + { + "epoch": 7.91936, + "grad_norm": 0.7634429931640625, + "learning_rate": 3.764105642256903e-05, + "loss": 0.6025, + "step": 6187 + }, + { + "epoch": 7.92064, + "grad_norm": 0.840979278087616, + "learning_rate": 3.76390556222489e-05, + "loss": 0.6005, + "step": 6188 + }, + { + "epoch": 7.92192, + "grad_norm": 0.8021059632301331, + "learning_rate": 3.7637054821928775e-05, + "loss": 0.6507, + "step": 6189 + }, + { + "epoch": 7.9232, + "grad_norm": 0.7973845601081848, + "learning_rate": 3.7635054021608646e-05, + "loss": 0.6245, + "step": 6190 + }, + { + "epoch": 7.92448, + "grad_norm": 0.8019742369651794, + "learning_rate": 3.763305322128852e-05, + "loss": 0.6052, + "step": 6191 + }, + { + "epoch": 7.92576, + "grad_norm": 0.8176249265670776, + "learning_rate": 3.763105242096839e-05, + "loss": 0.6816, + "step": 6192 + }, + { + "epoch": 7.92704, + "grad_norm": 0.799094557762146, + "learning_rate": 3.762905162064826e-05, + "loss": 0.5972, + "step": 6193 + }, + { + "epoch": 7.92832, + "grad_norm": 0.7592592835426331, + "learning_rate": 3.7627050820328134e-05, + "loss": 0.5614, + "step": 6194 + }, + { + "epoch": 7.9296, + "grad_norm": 0.7643021941184998, + "learning_rate": 3.7625050020008006e-05, + "loss": 0.618, + "step": 6195 + }, + { + "epoch": 7.93088, + "grad_norm": 
0.8433973789215088, + "learning_rate": 3.762304921968788e-05, + "loss": 0.6484, + "step": 6196 + }, + { + "epoch": 7.93216, + "grad_norm": 0.8006365299224854, + "learning_rate": 3.762104841936775e-05, + "loss": 0.5503, + "step": 6197 + }, + { + "epoch": 7.93344, + "grad_norm": 0.8090243935585022, + "learning_rate": 3.761904761904762e-05, + "loss": 0.6048, + "step": 6198 + }, + { + "epoch": 7.93472, + "grad_norm": 0.8174999356269836, + "learning_rate": 3.761704681872749e-05, + "loss": 0.6092, + "step": 6199 + }, + { + "epoch": 7.936, + "grad_norm": 0.7761614322662354, + "learning_rate": 3.7615046018407365e-05, + "loss": 0.5421, + "step": 6200 + }, + { + "epoch": 7.93728, + "grad_norm": 0.7439830303192139, + "learning_rate": 3.761304521808724e-05, + "loss": 0.6109, + "step": 6201 + }, + { + "epoch": 7.93856, + "grad_norm": 0.8538080453872681, + "learning_rate": 3.761104441776711e-05, + "loss": 0.6092, + "step": 6202 + }, + { + "epoch": 7.93984, + "grad_norm": 0.7983587980270386, + "learning_rate": 3.760904361744698e-05, + "loss": 0.6157, + "step": 6203 + }, + { + "epoch": 7.94112, + "grad_norm": 0.8033997416496277, + "learning_rate": 3.760704281712685e-05, + "loss": 0.6363, + "step": 6204 + }, + { + "epoch": 7.9424, + "grad_norm": 0.8110709190368652, + "learning_rate": 3.7605042016806724e-05, + "loss": 0.5945, + "step": 6205 + }, + { + "epoch": 7.94368, + "grad_norm": 0.8630302548408508, + "learning_rate": 3.7603041216486596e-05, + "loss": 0.5869, + "step": 6206 + }, + { + "epoch": 7.94496, + "grad_norm": 0.865012526512146, + "learning_rate": 3.760104041616647e-05, + "loss": 0.6861, + "step": 6207 + }, + { + "epoch": 7.9462399999999995, + "grad_norm": 0.8047817945480347, + "learning_rate": 3.759903961584634e-05, + "loss": 0.6484, + "step": 6208 + }, + { + "epoch": 7.94752, + "grad_norm": 0.7712967395782471, + "learning_rate": 3.759703881552621e-05, + "loss": 0.6387, + "step": 6209 + }, + { + "epoch": 7.9488, + "grad_norm": 0.7860550284385681, + "learning_rate": 
3.7595038015206084e-05, + "loss": 0.6384, + "step": 6210 + }, + { + "epoch": 7.95008, + "grad_norm": 0.7760273218154907, + "learning_rate": 3.7593037214885955e-05, + "loss": 0.602, + "step": 6211 + }, + { + "epoch": 7.95136, + "grad_norm": 0.7923306822776794, + "learning_rate": 3.759103641456583e-05, + "loss": 0.6, + "step": 6212 + }, + { + "epoch": 7.95264, + "grad_norm": 0.7732347846031189, + "learning_rate": 3.75890356142457e-05, + "loss": 0.5843, + "step": 6213 + }, + { + "epoch": 7.95392, + "grad_norm": 0.7786732912063599, + "learning_rate": 3.758703481392557e-05, + "loss": 0.6319, + "step": 6214 + }, + { + "epoch": 7.9552, + "grad_norm": 0.822283148765564, + "learning_rate": 3.758503401360544e-05, + "loss": 0.638, + "step": 6215 + }, + { + "epoch": 7.95648, + "grad_norm": 0.8353981375694275, + "learning_rate": 3.7583033213285315e-05, + "loss": 0.6462, + "step": 6216 + }, + { + "epoch": 7.95776, + "grad_norm": 0.7998156547546387, + "learning_rate": 3.7581032412965187e-05, + "loss": 0.6271, + "step": 6217 + }, + { + "epoch": 7.95904, + "grad_norm": 0.8036489486694336, + "learning_rate": 3.757903161264506e-05, + "loss": 0.6486, + "step": 6218 + }, + { + "epoch": 7.96032, + "grad_norm": 0.8148389458656311, + "learning_rate": 3.757703081232493e-05, + "loss": 0.5962, + "step": 6219 + }, + { + "epoch": 7.9616, + "grad_norm": 0.7913573980331421, + "learning_rate": 3.75750300120048e-05, + "loss": 0.5983, + "step": 6220 + }, + { + "epoch": 7.96288, + "grad_norm": 0.7523571848869324, + "learning_rate": 3.7573029211684674e-05, + "loss": 0.6503, + "step": 6221 + }, + { + "epoch": 7.96416, + "grad_norm": 0.7958295941352844, + "learning_rate": 3.7571028411364546e-05, + "loss": 0.5914, + "step": 6222 + }, + { + "epoch": 7.96544, + "grad_norm": 0.7768007516860962, + "learning_rate": 3.7569027611044424e-05, + "loss": 0.617, + "step": 6223 + }, + { + "epoch": 7.9667200000000005, + "grad_norm": 0.7701081037521362, + "learning_rate": 3.756702681072429e-05, + "loss": 0.5931, + 
"step": 6224 + }, + { + "epoch": 7.968, + "grad_norm": 0.7834358811378479, + "learning_rate": 3.756502601040416e-05, + "loss": 0.5555, + "step": 6225 + }, + { + "epoch": 7.96928, + "grad_norm": 0.7959411144256592, + "learning_rate": 3.756302521008403e-05, + "loss": 0.63, + "step": 6226 + }, + { + "epoch": 7.97056, + "grad_norm": 0.7836350202560425, + "learning_rate": 3.7561024409763905e-05, + "loss": 0.5953, + "step": 6227 + }, + { + "epoch": 7.97184, + "grad_norm": 0.7769935131072998, + "learning_rate": 3.755902360944378e-05, + "loss": 0.6272, + "step": 6228 + }, + { + "epoch": 7.97312, + "grad_norm": 0.7422361373901367, + "learning_rate": 3.755702280912365e-05, + "loss": 0.5554, + "step": 6229 + }, + { + "epoch": 7.9744, + "grad_norm": 0.8061965703964233, + "learning_rate": 3.755502200880353e-05, + "loss": 0.5511, + "step": 6230 + }, + { + "epoch": 7.97568, + "grad_norm": 0.7815845608711243, + "learning_rate": 3.75530212084834e-05, + "loss": 0.5608, + "step": 6231 + }, + { + "epoch": 7.97696, + "grad_norm": 0.7946850657463074, + "learning_rate": 3.7551020408163264e-05, + "loss": 0.5856, + "step": 6232 + }, + { + "epoch": 7.9782399999999996, + "grad_norm": 0.738746702671051, + "learning_rate": 3.7549019607843136e-05, + "loss": 0.5789, + "step": 6233 + }, + { + "epoch": 7.97952, + "grad_norm": 0.7725175619125366, + "learning_rate": 3.754701880752301e-05, + "loss": 0.6236, + "step": 6234 + }, + { + "epoch": 7.9808, + "grad_norm": 0.8103924989700317, + "learning_rate": 3.754501800720288e-05, + "loss": 0.6221, + "step": 6235 + }, + { + "epoch": 7.98208, + "grad_norm": 0.8287813067436218, + "learning_rate": 3.754301720688275e-05, + "loss": 0.6418, + "step": 6236 + }, + { + "epoch": 7.98336, + "grad_norm": 0.8044477701187134, + "learning_rate": 3.754101640656263e-05, + "loss": 0.5968, + "step": 6237 + }, + { + "epoch": 7.98464, + "grad_norm": 0.7894521951675415, + "learning_rate": 3.75390156062425e-05, + "loss": 0.5945, + "step": 6238 + }, + { + "epoch": 7.98592, + 
"grad_norm": 0.8011317253112793, + "learning_rate": 3.7537014805922374e-05, + "loss": 0.6376, + "step": 6239 + }, + { + "epoch": 7.9872, + "grad_norm": 0.7666401863098145, + "learning_rate": 3.753501400560224e-05, + "loss": 0.5611, + "step": 6240 + }, + { + "epoch": 7.98848, + "grad_norm": 0.8295350670814514, + "learning_rate": 3.753301320528211e-05, + "loss": 0.6115, + "step": 6241 + }, + { + "epoch": 7.98976, + "grad_norm": 0.7832409739494324, + "learning_rate": 3.753101240496198e-05, + "loss": 0.599, + "step": 6242 + }, + { + "epoch": 7.99104, + "grad_norm": 0.8104324340820312, + "learning_rate": 3.7529011604641855e-05, + "loss": 0.6137, + "step": 6243 + }, + { + "epoch": 7.99232, + "grad_norm": 0.7831165790557861, + "learning_rate": 3.7527010804321733e-05, + "loss": 0.5828, + "step": 6244 + }, + { + "epoch": 7.9936, + "grad_norm": 0.8100852370262146, + "learning_rate": 3.7525010004001605e-05, + "loss": 0.5893, + "step": 6245 + }, + { + "epoch": 7.99488, + "grad_norm": 0.8356633186340332, + "learning_rate": 3.752300920368148e-05, + "loss": 0.6308, + "step": 6246 + }, + { + "epoch": 7.99616, + "grad_norm": 0.85598224401474, + "learning_rate": 3.752100840336135e-05, + "loss": 0.6509, + "step": 6247 + }, + { + "epoch": 7.99744, + "grad_norm": 0.8315650224685669, + "learning_rate": 3.7519007603041214e-05, + "loss": 0.6497, + "step": 6248 + }, + { + "epoch": 7.9987200000000005, + "grad_norm": 0.7793347835540771, + "learning_rate": 3.7517006802721086e-05, + "loss": 0.5806, + "step": 6249 + }, + { + "epoch": 8.0, + "grad_norm": 1.704264521598816, + "learning_rate": 3.751500600240096e-05, + "loss": 1.0745, + "step": 6250 + }, + { + "epoch": 8.00128, + "grad_norm": 0.7346014976501465, + "learning_rate": 3.7513005202080836e-05, + "loss": 0.5634, + "step": 6251 + }, + { + "epoch": 8.00256, + "grad_norm": 0.7524464130401611, + "learning_rate": 3.751100440176071e-05, + "loss": 0.5499, + "step": 6252 + }, + { + "epoch": 8.00384, + "grad_norm": 0.8172950148582458, + 
"learning_rate": 3.750900360144058e-05, + "loss": 0.5984, + "step": 6253 + }, + { + "epoch": 8.00512, + "grad_norm": 0.8188893795013428, + "learning_rate": 3.750700280112045e-05, + "loss": 0.6099, + "step": 6254 + }, + { + "epoch": 8.0064, + "grad_norm": 0.825077474117279, + "learning_rate": 3.7505002000800324e-05, + "loss": 0.6207, + "step": 6255 + }, + { + "epoch": 8.00768, + "grad_norm": 0.7807846069335938, + "learning_rate": 3.750300120048019e-05, + "loss": 0.5735, + "step": 6256 + }, + { + "epoch": 8.00896, + "grad_norm": 0.7946782112121582, + "learning_rate": 3.750100040016006e-05, + "loss": 0.6036, + "step": 6257 + }, + { + "epoch": 8.01024, + "grad_norm": 0.8482866883277893, + "learning_rate": 3.749899959983994e-05, + "loss": 0.5998, + "step": 6258 + }, + { + "epoch": 8.01152, + "grad_norm": 0.7960243821144104, + "learning_rate": 3.749699879951981e-05, + "loss": 0.6042, + "step": 6259 + }, + { + "epoch": 8.0128, + "grad_norm": 0.830877959728241, + "learning_rate": 3.749499799919968e-05, + "loss": 0.586, + "step": 6260 + }, + { + "epoch": 8.01408, + "grad_norm": 0.782637357711792, + "learning_rate": 3.7492997198879555e-05, + "loss": 0.5895, + "step": 6261 + }, + { + "epoch": 8.01536, + "grad_norm": 0.8307430148124695, + "learning_rate": 3.749099639855943e-05, + "loss": 0.5734, + "step": 6262 + }, + { + "epoch": 8.01664, + "grad_norm": 0.832324206829071, + "learning_rate": 3.74889955982393e-05, + "loss": 0.5546, + "step": 6263 + }, + { + "epoch": 8.01792, + "grad_norm": 0.8269922733306885, + "learning_rate": 3.7486994797919164e-05, + "loss": 0.5858, + "step": 6264 + }, + { + "epoch": 8.0192, + "grad_norm": 0.8416856527328491, + "learning_rate": 3.748499399759904e-05, + "loss": 0.6297, + "step": 6265 + }, + { + "epoch": 8.02048, + "grad_norm": 0.806425929069519, + "learning_rate": 3.7482993197278914e-05, + "loss": 0.5737, + "step": 6266 + }, + { + "epoch": 8.02176, + "grad_norm": 0.8288019895553589, + "learning_rate": 3.7480992396958786e-05, + "loss": 0.5856, 
+ "step": 6267 + }, + { + "epoch": 8.02304, + "grad_norm": 0.8416863679885864, + "learning_rate": 3.747899159663866e-05, + "loss": 0.589, + "step": 6268 + }, + { + "epoch": 8.02432, + "grad_norm": 0.8179052472114563, + "learning_rate": 3.747699079631853e-05, + "loss": 0.6037, + "step": 6269 + }, + { + "epoch": 8.0256, + "grad_norm": 0.7979174256324768, + "learning_rate": 3.74749899959984e-05, + "loss": 0.5805, + "step": 6270 + }, + { + "epoch": 8.02688, + "grad_norm": 0.7354505658149719, + "learning_rate": 3.7472989195678274e-05, + "loss": 0.5747, + "step": 6271 + }, + { + "epoch": 8.02816, + "grad_norm": 0.8041462898254395, + "learning_rate": 3.7470988395358145e-05, + "loss": 0.5907, + "step": 6272 + }, + { + "epoch": 8.02944, + "grad_norm": 0.7784780263900757, + "learning_rate": 3.746898759503802e-05, + "loss": 0.5735, + "step": 6273 + }, + { + "epoch": 8.03072, + "grad_norm": 0.8290993571281433, + "learning_rate": 3.746698679471789e-05, + "loss": 0.5509, + "step": 6274 + }, + { + "epoch": 8.032, + "grad_norm": 0.7944415807723999, + "learning_rate": 3.746498599439776e-05, + "loss": 0.5914, + "step": 6275 + }, + { + "epoch": 8.03328, + "grad_norm": 0.7763034701347351, + "learning_rate": 3.746298519407763e-05, + "loss": 0.5719, + "step": 6276 + }, + { + "epoch": 8.03456, + "grad_norm": 0.8510763645172119, + "learning_rate": 3.7460984393757505e-05, + "loss": 0.6297, + "step": 6277 + }, + { + "epoch": 8.03584, + "grad_norm": 0.7973195314407349, + "learning_rate": 3.7458983593437377e-05, + "loss": 0.5668, + "step": 6278 + }, + { + "epoch": 8.03712, + "grad_norm": 0.7961564064025879, + "learning_rate": 3.745698279311725e-05, + "loss": 0.5721, + "step": 6279 + }, + { + "epoch": 8.0384, + "grad_norm": 0.7879326343536377, + "learning_rate": 3.745498199279712e-05, + "loss": 0.5706, + "step": 6280 + }, + { + "epoch": 8.03968, + "grad_norm": 0.8085858225822449, + "learning_rate": 3.745298119247699e-05, + "loss": 0.5476, + "step": 6281 + }, + { + "epoch": 8.04096, + 
"grad_norm": 0.8712663054466248, + "learning_rate": 3.7450980392156864e-05, + "loss": 0.6085, + "step": 6282 + }, + { + "epoch": 8.04224, + "grad_norm": 0.7817235589027405, + "learning_rate": 3.7448979591836736e-05, + "loss": 0.5623, + "step": 6283 + }, + { + "epoch": 8.043520000000001, + "grad_norm": 0.8824650049209595, + "learning_rate": 3.744697879151661e-05, + "loss": 0.612, + "step": 6284 + }, + { + "epoch": 8.0448, + "grad_norm": 0.8118883371353149, + "learning_rate": 3.744497799119648e-05, + "loss": 0.5937, + "step": 6285 + }, + { + "epoch": 8.04608, + "grad_norm": 0.8654536604881287, + "learning_rate": 3.744297719087636e-05, + "loss": 0.6412, + "step": 6286 + }, + { + "epoch": 8.04736, + "grad_norm": 0.8161172866821289, + "learning_rate": 3.744097639055622e-05, + "loss": 0.6114, + "step": 6287 + }, + { + "epoch": 8.04864, + "grad_norm": 0.8604544997215271, + "learning_rate": 3.7438975590236095e-05, + "loss": 0.6085, + "step": 6288 + }, + { + "epoch": 8.04992, + "grad_norm": 0.816491425037384, + "learning_rate": 3.743697478991597e-05, + "loss": 0.6371, + "step": 6289 + }, + { + "epoch": 8.0512, + "grad_norm": 0.8112639784812927, + "learning_rate": 3.743497398959584e-05, + "loss": 0.5633, + "step": 6290 + }, + { + "epoch": 8.05248, + "grad_norm": 0.8095521330833435, + "learning_rate": 3.743297318927571e-05, + "loss": 0.6115, + "step": 6291 + }, + { + "epoch": 8.05376, + "grad_norm": 0.8249873518943787, + "learning_rate": 3.743097238895558e-05, + "loss": 0.5975, + "step": 6292 + }, + { + "epoch": 8.05504, + "grad_norm": 0.7858548164367676, + "learning_rate": 3.742897158863546e-05, + "loss": 0.5539, + "step": 6293 + }, + { + "epoch": 8.05632, + "grad_norm": 0.8200925588607788, + "learning_rate": 3.742697078831533e-05, + "loss": 0.5892, + "step": 6294 + }, + { + "epoch": 8.0576, + "grad_norm": 0.7779000997543335, + "learning_rate": 3.74249699879952e-05, + "loss": 0.5372, + "step": 6295 + }, + { + "epoch": 8.05888, + "grad_norm": 0.7932630181312561, + 
"learning_rate": 3.742296918767507e-05, + "loss": 0.5852, + "step": 6296 + }, + { + "epoch": 8.06016, + "grad_norm": 0.8097670078277588, + "learning_rate": 3.742096838735494e-05, + "loss": 0.6018, + "step": 6297 + }, + { + "epoch": 8.06144, + "grad_norm": 0.7957705855369568, + "learning_rate": 3.7418967587034814e-05, + "loss": 0.5866, + "step": 6298 + }, + { + "epoch": 8.06272, + "grad_norm": 0.7967826128005981, + "learning_rate": 3.7416966786714686e-05, + "loss": 0.5785, + "step": 6299 + }, + { + "epoch": 8.064, + "grad_norm": 0.7749722003936768, + "learning_rate": 3.7414965986394564e-05, + "loss": 0.5758, + "step": 6300 + }, + { + "epoch": 8.06528, + "grad_norm": 0.8013430237770081, + "learning_rate": 3.7412965186074436e-05, + "loss": 0.5362, + "step": 6301 + }, + { + "epoch": 8.06656, + "grad_norm": 0.8903826475143433, + "learning_rate": 3.741096438575431e-05, + "loss": 0.6127, + "step": 6302 + }, + { + "epoch": 8.06784, + "grad_norm": 0.7794708013534546, + "learning_rate": 3.740896358543417e-05, + "loss": 0.5785, + "step": 6303 + }, + { + "epoch": 8.06912, + "grad_norm": 0.8395883440971375, + "learning_rate": 3.7406962785114045e-05, + "loss": 0.612, + "step": 6304 + }, + { + "epoch": 8.0704, + "grad_norm": 0.7724868655204773, + "learning_rate": 3.740496198479392e-05, + "loss": 0.5437, + "step": 6305 + }, + { + "epoch": 8.07168, + "grad_norm": 0.8095210790634155, + "learning_rate": 3.740296118447379e-05, + "loss": 0.6093, + "step": 6306 + }, + { + "epoch": 8.07296, + "grad_norm": 0.7821049690246582, + "learning_rate": 3.740096038415367e-05, + "loss": 0.5682, + "step": 6307 + }, + { + "epoch": 8.07424, + "grad_norm": 0.8228663802146912, + "learning_rate": 3.739895958383354e-05, + "loss": 0.5655, + "step": 6308 + }, + { + "epoch": 8.07552, + "grad_norm": 0.7802860140800476, + "learning_rate": 3.739695878351341e-05, + "loss": 0.5646, + "step": 6309 + }, + { + "epoch": 8.0768, + "grad_norm": 0.8123772740364075, + "learning_rate": 3.739495798319328e-05, + "loss": 
0.5666, + "step": 6310 + }, + { + "epoch": 8.07808, + "grad_norm": 0.8669785857200623, + "learning_rate": 3.739295718287315e-05, + "loss": 0.5712, + "step": 6311 + }, + { + "epoch": 8.07936, + "grad_norm": 0.798360288143158, + "learning_rate": 3.739095638255302e-05, + "loss": 0.5568, + "step": 6312 + }, + { + "epoch": 8.08064, + "grad_norm": 0.8681429624557495, + "learning_rate": 3.738895558223289e-05, + "loss": 0.6515, + "step": 6313 + }, + { + "epoch": 8.08192, + "grad_norm": 0.8483299612998962, + "learning_rate": 3.7386954781912763e-05, + "loss": 0.6093, + "step": 6314 + }, + { + "epoch": 8.0832, + "grad_norm": 0.8004496693611145, + "learning_rate": 3.738495398159264e-05, + "loss": 0.5589, + "step": 6315 + }, + { + "epoch": 8.08448, + "grad_norm": 0.8437052369117737, + "learning_rate": 3.7382953181272514e-05, + "loss": 0.6215, + "step": 6316 + }, + { + "epoch": 8.08576, + "grad_norm": 0.7914602756500244, + "learning_rate": 3.7380952380952386e-05, + "loss": 0.5765, + "step": 6317 + }, + { + "epoch": 8.08704, + "grad_norm": 0.8129740357398987, + "learning_rate": 3.737895158063226e-05, + "loss": 0.5463, + "step": 6318 + }, + { + "epoch": 8.08832, + "grad_norm": 0.7785910964012146, + "learning_rate": 3.737695078031212e-05, + "loss": 0.5845, + "step": 6319 + }, + { + "epoch": 8.0896, + "grad_norm": 0.781987726688385, + "learning_rate": 3.7374949979991995e-05, + "loss": 0.5644, + "step": 6320 + }, + { + "epoch": 8.09088, + "grad_norm": 0.7904685139656067, + "learning_rate": 3.7372949179671866e-05, + "loss": 0.5591, + "step": 6321 + }, + { + "epoch": 8.09216, + "grad_norm": 0.7658714652061462, + "learning_rate": 3.7370948379351745e-05, + "loss": 0.5768, + "step": 6322 + }, + { + "epoch": 8.09344, + "grad_norm": 0.8255518674850464, + "learning_rate": 3.736894757903162e-05, + "loss": 0.6142, + "step": 6323 + }, + { + "epoch": 8.09472, + "grad_norm": 0.7947357296943665, + "learning_rate": 3.736694677871149e-05, + "loss": 0.5057, + "step": 6324 + }, + { + "epoch": 8.096, + 
"grad_norm": 0.8527635335922241, + "learning_rate": 3.736494597839136e-05, + "loss": 0.5636, + "step": 6325 + }, + { + "epoch": 8.09728, + "grad_norm": 0.8318933248519897, + "learning_rate": 3.736294517807123e-05, + "loss": 0.6075, + "step": 6326 + }, + { + "epoch": 8.09856, + "grad_norm": 0.8209018707275391, + "learning_rate": 3.73609443777511e-05, + "loss": 0.5532, + "step": 6327 + }, + { + "epoch": 8.09984, + "grad_norm": 0.7837770581245422, + "learning_rate": 3.735894357743097e-05, + "loss": 0.576, + "step": 6328 + }, + { + "epoch": 8.10112, + "grad_norm": 0.7998737096786499, + "learning_rate": 3.735694277711085e-05, + "loss": 0.5571, + "step": 6329 + }, + { + "epoch": 8.1024, + "grad_norm": 0.7599712014198303, + "learning_rate": 3.735494197679072e-05, + "loss": 0.5275, + "step": 6330 + }, + { + "epoch": 8.10368, + "grad_norm": 0.8159705400466919, + "learning_rate": 3.735294117647059e-05, + "loss": 0.5565, + "step": 6331 + }, + { + "epoch": 8.10496, + "grad_norm": 0.8160362243652344, + "learning_rate": 3.7350940376150464e-05, + "loss": 0.5994, + "step": 6332 + }, + { + "epoch": 8.10624, + "grad_norm": 0.7920660972595215, + "learning_rate": 3.7348939575830335e-05, + "loss": 0.6277, + "step": 6333 + }, + { + "epoch": 8.10752, + "grad_norm": 0.8129349946975708, + "learning_rate": 3.734693877551021e-05, + "loss": 0.5953, + "step": 6334 + }, + { + "epoch": 8.1088, + "grad_norm": 0.8045061826705933, + "learning_rate": 3.734493797519007e-05, + "loss": 0.5593, + "step": 6335 + }, + { + "epoch": 8.11008, + "grad_norm": 0.8104987144470215, + "learning_rate": 3.734293717486995e-05, + "loss": 0.587, + "step": 6336 + }, + { + "epoch": 8.11136, + "grad_norm": 0.8352326154708862, + "learning_rate": 3.734093637454982e-05, + "loss": 0.6288, + "step": 6337 + }, + { + "epoch": 8.11264, + "grad_norm": 0.7948821187019348, + "learning_rate": 3.7338935574229695e-05, + "loss": 0.5477, + "step": 6338 + }, + { + "epoch": 8.11392, + "grad_norm": 0.8230301141738892, + "learning_rate": 
3.7336934773909567e-05, + "loss": 0.5805, + "step": 6339 + }, + { + "epoch": 8.1152, + "grad_norm": 0.7984503507614136, + "learning_rate": 3.733493397358944e-05, + "loss": 0.5385, + "step": 6340 + }, + { + "epoch": 8.11648, + "grad_norm": 0.817011833190918, + "learning_rate": 3.733293317326931e-05, + "loss": 0.6042, + "step": 6341 + }, + { + "epoch": 8.11776, + "grad_norm": 0.7629779577255249, + "learning_rate": 3.733093237294918e-05, + "loss": 0.5647, + "step": 6342 + }, + { + "epoch": 8.11904, + "grad_norm": 0.7868363261222839, + "learning_rate": 3.7328931572629054e-05, + "loss": 0.551, + "step": 6343 + }, + { + "epoch": 8.12032, + "grad_norm": 0.8329203128814697, + "learning_rate": 3.7326930772308926e-05, + "loss": 0.6125, + "step": 6344 + }, + { + "epoch": 8.1216, + "grad_norm": 0.835198700428009, + "learning_rate": 3.73249299719888e-05, + "loss": 0.6188, + "step": 6345 + }, + { + "epoch": 8.12288, + "grad_norm": 0.8320775032043457, + "learning_rate": 3.732292917166867e-05, + "loss": 0.5952, + "step": 6346 + }, + { + "epoch": 8.12416, + "grad_norm": 0.8890385627746582, + "learning_rate": 3.732092837134854e-05, + "loss": 0.5802, + "step": 6347 + }, + { + "epoch": 8.12544, + "grad_norm": 0.9071861505508423, + "learning_rate": 3.731892757102841e-05, + "loss": 0.6534, + "step": 6348 + }, + { + "epoch": 8.12672, + "grad_norm": 0.8464532494544983, + "learning_rate": 3.7316926770708285e-05, + "loss": 0.5867, + "step": 6349 + }, + { + "epoch": 8.128, + "grad_norm": 0.8280791640281677, + "learning_rate": 3.731492597038816e-05, + "loss": 0.5995, + "step": 6350 + }, + { + "epoch": 8.12928, + "grad_norm": 0.7893093228340149, + "learning_rate": 3.731292517006803e-05, + "loss": 0.5685, + "step": 6351 + }, + { + "epoch": 8.13056, + "grad_norm": 0.8106672167778015, + "learning_rate": 3.73109243697479e-05, + "loss": 0.5615, + "step": 6352 + }, + { + "epoch": 8.13184, + "grad_norm": 0.8697341084480286, + "learning_rate": 3.730892356942777e-05, + "loss": 0.6196, + "step": 6353 + 
}, + { + "epoch": 8.13312, + "grad_norm": 0.837616503238678, + "learning_rate": 3.7306922769107644e-05, + "loss": 0.5308, + "step": 6354 + }, + { + "epoch": 8.1344, + "grad_norm": 0.7826257944107056, + "learning_rate": 3.7304921968787516e-05, + "loss": 0.5694, + "step": 6355 + }, + { + "epoch": 8.13568, + "grad_norm": 0.8352533578872681, + "learning_rate": 3.730292116846739e-05, + "loss": 0.6239, + "step": 6356 + }, + { + "epoch": 8.13696, + "grad_norm": 0.835431694984436, + "learning_rate": 3.730092036814726e-05, + "loss": 0.6214, + "step": 6357 + }, + { + "epoch": 8.13824, + "grad_norm": 0.8025604486465454, + "learning_rate": 3.729891956782713e-05, + "loss": 0.579, + "step": 6358 + }, + { + "epoch": 8.13952, + "grad_norm": 0.7842774987220764, + "learning_rate": 3.7296918767507004e-05, + "loss": 0.5335, + "step": 6359 + }, + { + "epoch": 8.1408, + "grad_norm": 0.8423367142677307, + "learning_rate": 3.7294917967186876e-05, + "loss": 0.6108, + "step": 6360 + }, + { + "epoch": 8.14208, + "grad_norm": 0.8245922327041626, + "learning_rate": 3.729291716686675e-05, + "loss": 0.5721, + "step": 6361 + }, + { + "epoch": 8.14336, + "grad_norm": 0.8257554173469543, + "learning_rate": 3.729091636654662e-05, + "loss": 0.5574, + "step": 6362 + }, + { + "epoch": 8.14464, + "grad_norm": 0.8311666250228882, + "learning_rate": 3.728891556622649e-05, + "loss": 0.5973, + "step": 6363 + }, + { + "epoch": 8.14592, + "grad_norm": 0.814927339553833, + "learning_rate": 3.728691476590637e-05, + "loss": 0.6139, + "step": 6364 + }, + { + "epoch": 8.1472, + "grad_norm": 0.8726287484169006, + "learning_rate": 3.7284913965586235e-05, + "loss": 0.6423, + "step": 6365 + }, + { + "epoch": 8.14848, + "grad_norm": 0.8697568774223328, + "learning_rate": 3.728291316526611e-05, + "loss": 0.556, + "step": 6366 + }, + { + "epoch": 8.14976, + "grad_norm": 0.7923082113265991, + "learning_rate": 3.728091236494598e-05, + "loss": 0.5776, + "step": 6367 + }, + { + "epoch": 8.15104, + "grad_norm": 
0.8061732053756714, + "learning_rate": 3.727891156462585e-05, + "loss": 0.6374, + "step": 6368 + }, + { + "epoch": 8.15232, + "grad_norm": 0.8172518610954285, + "learning_rate": 3.727691076430572e-05, + "loss": 0.5489, + "step": 6369 + }, + { + "epoch": 8.1536, + "grad_norm": 0.7999323606491089, + "learning_rate": 3.7274909963985594e-05, + "loss": 0.556, + "step": 6370 + }, + { + "epoch": 8.15488, + "grad_norm": 0.8566176891326904, + "learning_rate": 3.727290916366547e-05, + "loss": 0.6124, + "step": 6371 + }, + { + "epoch": 8.15616, + "grad_norm": 0.8322476148605347, + "learning_rate": 3.7270908363345345e-05, + "loss": 0.5665, + "step": 6372 + }, + { + "epoch": 8.15744, + "grad_norm": 0.8234203457832336, + "learning_rate": 3.726890756302521e-05, + "loss": 0.5859, + "step": 6373 + }, + { + "epoch": 8.15872, + "grad_norm": 0.7753047347068787, + "learning_rate": 3.726690676270508e-05, + "loss": 0.5526, + "step": 6374 + }, + { + "epoch": 8.16, + "grad_norm": 0.8442295789718628, + "learning_rate": 3.7264905962384953e-05, + "loss": 0.5812, + "step": 6375 + }, + { + "epoch": 8.16128, + "grad_norm": 0.7989637851715088, + "learning_rate": 3.7262905162064825e-05, + "loss": 0.5454, + "step": 6376 + }, + { + "epoch": 8.16256, + "grad_norm": 0.8231421113014221, + "learning_rate": 3.72609043617447e-05, + "loss": 0.5522, + "step": 6377 + }, + { + "epoch": 8.16384, + "grad_norm": 0.8021591901779175, + "learning_rate": 3.7258903561424576e-05, + "loss": 0.5667, + "step": 6378 + }, + { + "epoch": 8.16512, + "grad_norm": 0.8049442768096924, + "learning_rate": 3.725690276110445e-05, + "loss": 0.5194, + "step": 6379 + }, + { + "epoch": 8.1664, + "grad_norm": 0.8534220457077026, + "learning_rate": 3.725490196078432e-05, + "loss": 0.6475, + "step": 6380 + }, + { + "epoch": 8.16768, + "grad_norm": 0.7750808596611023, + "learning_rate": 3.7252901160464185e-05, + "loss": 0.5171, + "step": 6381 + }, + { + "epoch": 8.16896, + "grad_norm": 0.7859979867935181, + "learning_rate": 
3.7250900360144056e-05, + "loss": 0.5755, + "step": 6382 + }, + { + "epoch": 8.17024, + "grad_norm": 0.9099715948104858, + "learning_rate": 3.724889955982393e-05, + "loss": 0.5664, + "step": 6383 + }, + { + "epoch": 8.17152, + "grad_norm": 0.7696498036384583, + "learning_rate": 3.72468987595038e-05, + "loss": 0.5575, + "step": 6384 + }, + { + "epoch": 8.1728, + "grad_norm": 0.8566391468048096, + "learning_rate": 3.724489795918368e-05, + "loss": 0.6486, + "step": 6385 + }, + { + "epoch": 8.17408, + "grad_norm": 0.7947507500648499, + "learning_rate": 3.724289715886355e-05, + "loss": 0.5506, + "step": 6386 + }, + { + "epoch": 8.17536, + "grad_norm": 0.8360593318939209, + "learning_rate": 3.724089635854342e-05, + "loss": 0.6098, + "step": 6387 + }, + { + "epoch": 8.17664, + "grad_norm": 0.8018707036972046, + "learning_rate": 3.7238895558223294e-05, + "loss": 0.585, + "step": 6388 + }, + { + "epoch": 8.17792, + "grad_norm": 0.8566110134124756, + "learning_rate": 3.723689475790316e-05, + "loss": 0.5698, + "step": 6389 + }, + { + "epoch": 8.1792, + "grad_norm": 0.7939445376396179, + "learning_rate": 3.723489395758303e-05, + "loss": 0.5424, + "step": 6390 + }, + { + "epoch": 8.18048, + "grad_norm": 0.816756010055542, + "learning_rate": 3.72328931572629e-05, + "loss": 0.563, + "step": 6391 + }, + { + "epoch": 8.18176, + "grad_norm": 0.8309502601623535, + "learning_rate": 3.723089235694278e-05, + "loss": 0.5777, + "step": 6392 + }, + { + "epoch": 8.18304, + "grad_norm": 0.8525409698486328, + "learning_rate": 3.7228891556622654e-05, + "loss": 0.6131, + "step": 6393 + }, + { + "epoch": 8.18432, + "grad_norm": 0.8380247354507446, + "learning_rate": 3.7226890756302525e-05, + "loss": 0.5809, + "step": 6394 + }, + { + "epoch": 8.1856, + "grad_norm": 0.7954679131507874, + "learning_rate": 3.72248899559824e-05, + "loss": 0.5846, + "step": 6395 + }, + { + "epoch": 8.18688, + "grad_norm": 0.8677021265029907, + "learning_rate": 3.722288915566227e-05, + "loss": 0.6068, + "step": 6396 + 
}, + { + "epoch": 8.18816, + "grad_norm": 0.8796229362487793, + "learning_rate": 3.7220888355342134e-05, + "loss": 0.637, + "step": 6397 + }, + { + "epoch": 8.18944, + "grad_norm": 0.7766791582107544, + "learning_rate": 3.7218887555022006e-05, + "loss": 0.5808, + "step": 6398 + }, + { + "epoch": 8.19072, + "grad_norm": 0.8018509149551392, + "learning_rate": 3.7216886754701885e-05, + "loss": 0.5721, + "step": 6399 + }, + { + "epoch": 8.192, + "grad_norm": 0.8333677053451538, + "learning_rate": 3.721488595438176e-05, + "loss": 0.5966, + "step": 6400 + }, + { + "epoch": 8.19328, + "grad_norm": 0.8443406224250793, + "learning_rate": 3.721288515406163e-05, + "loss": 0.5973, + "step": 6401 + }, + { + "epoch": 8.19456, + "grad_norm": 0.8139026761054993, + "learning_rate": 3.72108843537415e-05, + "loss": 0.5496, + "step": 6402 + }, + { + "epoch": 8.19584, + "grad_norm": 0.846799373626709, + "learning_rate": 3.720888355342137e-05, + "loss": 0.5896, + "step": 6403 + }, + { + "epoch": 8.19712, + "grad_norm": 0.8184740543365479, + "learning_rate": 3.7206882753101244e-05, + "loss": 0.5586, + "step": 6404 + }, + { + "epoch": 8.1984, + "grad_norm": 0.8557119965553284, + "learning_rate": 3.720488195278111e-05, + "loss": 0.5687, + "step": 6405 + }, + { + "epoch": 8.19968, + "grad_norm": 0.7910146713256836, + "learning_rate": 3.720288115246099e-05, + "loss": 0.5474, + "step": 6406 + }, + { + "epoch": 8.20096, + "grad_norm": 0.8577878475189209, + "learning_rate": 3.720088035214086e-05, + "loss": 0.6, + "step": 6407 + }, + { + "epoch": 8.20224, + "grad_norm": 0.8686122894287109, + "learning_rate": 3.719887955182073e-05, + "loss": 0.6026, + "step": 6408 + }, + { + "epoch": 8.20352, + "grad_norm": 0.7906977534294128, + "learning_rate": 3.71968787515006e-05, + "loss": 0.5427, + "step": 6409 + }, + { + "epoch": 8.2048, + "grad_norm": 0.8151067495346069, + "learning_rate": 3.7194877951180475e-05, + "loss": 0.5795, + "step": 6410 + }, + { + "epoch": 8.20608, + "grad_norm": 
0.8321256041526794, + "learning_rate": 3.719287715086035e-05, + "loss": 0.6022, + "step": 6411 + }, + { + "epoch": 8.20736, + "grad_norm": 0.7891202569007874, + "learning_rate": 3.719087635054022e-05, + "loss": 0.5335, + "step": 6412 + }, + { + "epoch": 8.20864, + "grad_norm": 0.836283802986145, + "learning_rate": 3.718887555022009e-05, + "loss": 0.6082, + "step": 6413 + }, + { + "epoch": 8.20992, + "grad_norm": 0.7881807684898376, + "learning_rate": 3.718687474989996e-05, + "loss": 0.562, + "step": 6414 + }, + { + "epoch": 8.2112, + "grad_norm": 0.7665523886680603, + "learning_rate": 3.7184873949579834e-05, + "loss": 0.571, + "step": 6415 + }, + { + "epoch": 8.21248, + "grad_norm": 0.8602042198181152, + "learning_rate": 3.7182873149259706e-05, + "loss": 0.6157, + "step": 6416 + }, + { + "epoch": 8.21376, + "grad_norm": 0.8061181902885437, + "learning_rate": 3.718087234893958e-05, + "loss": 0.5852, + "step": 6417 + }, + { + "epoch": 8.21504, + "grad_norm": 0.7839178442955017, + "learning_rate": 3.717887154861945e-05, + "loss": 0.5376, + "step": 6418 + }, + { + "epoch": 8.21632, + "grad_norm": 0.8473336696624756, + "learning_rate": 3.717687074829932e-05, + "loss": 0.6365, + "step": 6419 + }, + { + "epoch": 8.2176, + "grad_norm": 0.8579205870628357, + "learning_rate": 3.7174869947979194e-05, + "loss": 0.6029, + "step": 6420 + }, + { + "epoch": 8.21888, + "grad_norm": 0.8451679348945618, + "learning_rate": 3.7172869147659066e-05, + "loss": 0.6307, + "step": 6421 + }, + { + "epoch": 8.22016, + "grad_norm": 0.7627133131027222, + "learning_rate": 3.717086834733894e-05, + "loss": 0.5673, + "step": 6422 + }, + { + "epoch": 8.22144, + "grad_norm": 0.8411123156547546, + "learning_rate": 3.716886754701881e-05, + "loss": 0.5622, + "step": 6423 + }, + { + "epoch": 8.22272, + "grad_norm": 0.8919776678085327, + "learning_rate": 3.716686674669868e-05, + "loss": 0.6082, + "step": 6424 + }, + { + "epoch": 8.224, + "grad_norm": 0.7615383267402649, + "learning_rate": 
3.716486594637855e-05, + "loss": 0.5266, + "step": 6425 + }, + { + "epoch": 8.22528, + "grad_norm": 0.8244317770004272, + "learning_rate": 3.7162865146058425e-05, + "loss": 0.5582, + "step": 6426 + }, + { + "epoch": 8.22656, + "grad_norm": 0.8274043202400208, + "learning_rate": 3.71608643457383e-05, + "loss": 0.5784, + "step": 6427 + }, + { + "epoch": 8.22784, + "grad_norm": 0.8274574279785156, + "learning_rate": 3.715886354541817e-05, + "loss": 0.5993, + "step": 6428 + }, + { + "epoch": 8.22912, + "grad_norm": 0.7799541354179382, + "learning_rate": 3.715686274509804e-05, + "loss": 0.6209, + "step": 6429 + }, + { + "epoch": 8.2304, + "grad_norm": 0.8560000658035278, + "learning_rate": 3.715486194477791e-05, + "loss": 0.6404, + "step": 6430 + }, + { + "epoch": 8.23168, + "grad_norm": 0.8321433663368225, + "learning_rate": 3.7152861144457784e-05, + "loss": 0.6115, + "step": 6431 + }, + { + "epoch": 8.23296, + "grad_norm": 0.7989788055419922, + "learning_rate": 3.7150860344137656e-05, + "loss": 0.5221, + "step": 6432 + }, + { + "epoch": 8.23424, + "grad_norm": 0.8330096006393433, + "learning_rate": 3.714885954381753e-05, + "loss": 0.6045, + "step": 6433 + }, + { + "epoch": 8.23552, + "grad_norm": 0.7904120683670044, + "learning_rate": 3.71468587434974e-05, + "loss": 0.5978, + "step": 6434 + }, + { + "epoch": 8.2368, + "grad_norm": 0.8202242255210876, + "learning_rate": 3.714485794317727e-05, + "loss": 0.5845, + "step": 6435 + }, + { + "epoch": 8.23808, + "grad_norm": 0.8055812120437622, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.5653, + "step": 6436 + }, + { + "epoch": 8.23936, + "grad_norm": 0.8505423665046692, + "learning_rate": 3.7140856342537015e-05, + "loss": 0.6106, + "step": 6437 + }, + { + "epoch": 8.24064, + "grad_norm": 0.8569924235343933, + "learning_rate": 3.713885554221689e-05, + "loss": 0.5948, + "step": 6438 + }, + { + "epoch": 8.24192, + "grad_norm": 0.785232663154602, + "learning_rate": 3.713685474189676e-05, + "loss": 0.5768, + "step": 
6439 + }, + { + "epoch": 8.2432, + "grad_norm": 0.8176567554473877, + "learning_rate": 3.713485394157663e-05, + "loss": 0.5744, + "step": 6440 + }, + { + "epoch": 8.24448, + "grad_norm": 0.8164046406745911, + "learning_rate": 3.71328531412565e-05, + "loss": 0.5559, + "step": 6441 + }, + { + "epoch": 8.24576, + "grad_norm": 0.7764423489570618, + "learning_rate": 3.713085234093638e-05, + "loss": 0.5796, + "step": 6442 + }, + { + "epoch": 8.24704, + "grad_norm": 0.8372679352760315, + "learning_rate": 3.7128851540616246e-05, + "loss": 0.5921, + "step": 6443 + }, + { + "epoch": 8.24832, + "grad_norm": 0.8438205122947693, + "learning_rate": 3.712685074029612e-05, + "loss": 0.6257, + "step": 6444 + }, + { + "epoch": 8.2496, + "grad_norm": 0.810208261013031, + "learning_rate": 3.712484993997599e-05, + "loss": 0.5474, + "step": 6445 + }, + { + "epoch": 8.25088, + "grad_norm": 0.866226315498352, + "learning_rate": 3.712284913965586e-05, + "loss": 0.5989, + "step": 6446 + }, + { + "epoch": 8.25216, + "grad_norm": 0.8516311645507812, + "learning_rate": 3.7120848339335734e-05, + "loss": 0.6025, + "step": 6447 + }, + { + "epoch": 8.25344, + "grad_norm": 0.8606278896331787, + "learning_rate": 3.7118847539015606e-05, + "loss": 0.5967, + "step": 6448 + }, + { + "epoch": 8.25472, + "grad_norm": 0.8150187134742737, + "learning_rate": 3.7116846738695484e-05, + "loss": 0.5981, + "step": 6449 + }, + { + "epoch": 8.256, + "grad_norm": 0.7826385498046875, + "learning_rate": 3.7114845938375356e-05, + "loss": 0.5887, + "step": 6450 + }, + { + "epoch": 8.25728, + "grad_norm": 0.8195633888244629, + "learning_rate": 3.711284513805522e-05, + "loss": 0.587, + "step": 6451 + }, + { + "epoch": 8.25856, + "grad_norm": 0.9077224135398865, + "learning_rate": 3.711084433773509e-05, + "loss": 0.6068, + "step": 6452 + }, + { + "epoch": 8.25984, + "grad_norm": 0.8052554130554199, + "learning_rate": 3.7108843537414965e-05, + "loss": 0.5859, + "step": 6453 + }, + { + "epoch": 8.26112, + "grad_norm": 
0.8076302409172058, + "learning_rate": 3.710684273709484e-05, + "loss": 0.576, + "step": 6454 + }, + { + "epoch": 8.2624, + "grad_norm": 0.8576489090919495, + "learning_rate": 3.710484193677471e-05, + "loss": 0.6078, + "step": 6455 + }, + { + "epoch": 8.26368, + "grad_norm": 0.8286393880844116, + "learning_rate": 3.710284113645459e-05, + "loss": 0.6022, + "step": 6456 + }, + { + "epoch": 8.26496, + "grad_norm": 0.8481602072715759, + "learning_rate": 3.710084033613446e-05, + "loss": 0.5954, + "step": 6457 + }, + { + "epoch": 8.26624, + "grad_norm": 0.8346137404441833, + "learning_rate": 3.709883953581433e-05, + "loss": 0.5913, + "step": 6458 + }, + { + "epoch": 8.26752, + "grad_norm": 0.8080378770828247, + "learning_rate": 3.7096838735494196e-05, + "loss": 0.6009, + "step": 6459 + }, + { + "epoch": 8.2688, + "grad_norm": 0.8473795652389526, + "learning_rate": 3.709483793517407e-05, + "loss": 0.6306, + "step": 6460 + }, + { + "epoch": 8.27008, + "grad_norm": 0.8970816731452942, + "learning_rate": 3.709283713485394e-05, + "loss": 0.5936, + "step": 6461 + }, + { + "epoch": 8.27136, + "grad_norm": 0.8006930947303772, + "learning_rate": 3.709083633453381e-05, + "loss": 0.5833, + "step": 6462 + }, + { + "epoch": 8.272639999999999, + "grad_norm": 0.800212562084198, + "learning_rate": 3.708883553421369e-05, + "loss": 0.5658, + "step": 6463 + }, + { + "epoch": 8.27392, + "grad_norm": 0.8020644783973694, + "learning_rate": 3.708683473389356e-05, + "loss": 0.6084, + "step": 6464 + }, + { + "epoch": 8.2752, + "grad_norm": 0.8160932660102844, + "learning_rate": 3.7084833933573434e-05, + "loss": 0.5927, + "step": 6465 + }, + { + "epoch": 8.27648, + "grad_norm": 0.8528776168823242, + "learning_rate": 3.7082833133253306e-05, + "loss": 0.6092, + "step": 6466 + }, + { + "epoch": 8.27776, + "grad_norm": 0.8029847145080566, + "learning_rate": 3.708083233293317e-05, + "loss": 0.6283, + "step": 6467 + }, + { + "epoch": 8.27904, + "grad_norm": 0.7798078060150146, + "learning_rate": 
3.707883153261304e-05, + "loss": 0.5935, + "step": 6468 + }, + { + "epoch": 8.28032, + "grad_norm": 0.7831089496612549, + "learning_rate": 3.7076830732292915e-05, + "loss": 0.5904, + "step": 6469 + }, + { + "epoch": 8.2816, + "grad_norm": 0.9106951951980591, + "learning_rate": 3.707482993197279e-05, + "loss": 0.6896, + "step": 6470 + }, + { + "epoch": 8.28288, + "grad_norm": 0.8110149502754211, + "learning_rate": 3.7072829131652665e-05, + "loss": 0.5925, + "step": 6471 + }, + { + "epoch": 8.28416, + "grad_norm": 0.7867901921272278, + "learning_rate": 3.707082833133254e-05, + "loss": 0.5726, + "step": 6472 + }, + { + "epoch": 8.28544, + "grad_norm": 0.7895092368125916, + "learning_rate": 3.706882753101241e-05, + "loss": 0.5712, + "step": 6473 + }, + { + "epoch": 8.28672, + "grad_norm": 0.8702477216720581, + "learning_rate": 3.706682673069228e-05, + "loss": 0.6259, + "step": 6474 + }, + { + "epoch": 8.288, + "grad_norm": 0.8143934011459351, + "learning_rate": 3.7064825930372146e-05, + "loss": 0.5903, + "step": 6475 + }, + { + "epoch": 8.28928, + "grad_norm": 0.8529263138771057, + "learning_rate": 3.706282513005202e-05, + "loss": 0.6155, + "step": 6476 + }, + { + "epoch": 8.29056, + "grad_norm": 0.7962032556533813, + "learning_rate": 3.7060824329731896e-05, + "loss": 0.5951, + "step": 6477 + }, + { + "epoch": 8.29184, + "grad_norm": 0.7777064442634583, + "learning_rate": 3.705882352941177e-05, + "loss": 0.5678, + "step": 6478 + }, + { + "epoch": 8.29312, + "grad_norm": 0.8295832872390747, + "learning_rate": 3.705682272909164e-05, + "loss": 0.5486, + "step": 6479 + }, + { + "epoch": 8.2944, + "grad_norm": 0.7815911769866943, + "learning_rate": 3.705482192877151e-05, + "loss": 0.5732, + "step": 6480 + }, + { + "epoch": 8.29568, + "grad_norm": 0.7912229895591736, + "learning_rate": 3.7052821128451384e-05, + "loss": 0.5611, + "step": 6481 + }, + { + "epoch": 8.29696, + "grad_norm": 0.8012306690216064, + "learning_rate": 3.7050820328131256e-05, + "loss": 0.5652, + "step": 
6482 + }, + { + "epoch": 8.29824, + "grad_norm": 0.7731996178627014, + "learning_rate": 3.704881952781112e-05, + "loss": 0.511, + "step": 6483 + }, + { + "epoch": 8.29952, + "grad_norm": 0.858241081237793, + "learning_rate": 3.7046818727491e-05, + "loss": 0.5636, + "step": 6484 + }, + { + "epoch": 8.3008, + "grad_norm": 0.816175639629364, + "learning_rate": 3.704481792717087e-05, + "loss": 0.5677, + "step": 6485 + }, + { + "epoch": 8.30208, + "grad_norm": 0.7947445511817932, + "learning_rate": 3.704281712685074e-05, + "loss": 0.5523, + "step": 6486 + }, + { + "epoch": 8.30336, + "grad_norm": 0.8075773119926453, + "learning_rate": 3.7040816326530615e-05, + "loss": 0.566, + "step": 6487 + }, + { + "epoch": 8.30464, + "grad_norm": 0.8670647740364075, + "learning_rate": 3.703881552621049e-05, + "loss": 0.6355, + "step": 6488 + }, + { + "epoch": 8.30592, + "grad_norm": 0.8332636952400208, + "learning_rate": 3.703681472589036e-05, + "loss": 0.6226, + "step": 6489 + }, + { + "epoch": 8.3072, + "grad_norm": 0.8159798979759216, + "learning_rate": 3.703481392557023e-05, + "loss": 0.5558, + "step": 6490 + }, + { + "epoch": 8.30848, + "grad_norm": 0.8338848948478699, + "learning_rate": 3.70328131252501e-05, + "loss": 0.5839, + "step": 6491 + }, + { + "epoch": 8.30976, + "grad_norm": 0.8652037382125854, + "learning_rate": 3.7030812324929974e-05, + "loss": 0.6033, + "step": 6492 + }, + { + "epoch": 8.31104, + "grad_norm": 0.8078513145446777, + "learning_rate": 3.7028811524609846e-05, + "loss": 0.5618, + "step": 6493 + }, + { + "epoch": 8.31232, + "grad_norm": 0.8094210028648376, + "learning_rate": 3.702681072428972e-05, + "loss": 0.5781, + "step": 6494 + }, + { + "epoch": 8.3136, + "grad_norm": 0.7953526973724365, + "learning_rate": 3.702480992396959e-05, + "loss": 0.6069, + "step": 6495 + }, + { + "epoch": 8.31488, + "grad_norm": 0.8320873379707336, + "learning_rate": 3.702280912364946e-05, + "loss": 0.5769, + "step": 6496 + }, + { + "epoch": 8.31616, + "grad_norm": 
0.8492526412010193, + "learning_rate": 3.7020808323329334e-05, + "loss": 0.6435, + "step": 6497 + }, + { + "epoch": 8.31744, + "grad_norm": 0.8726949691772461, + "learning_rate": 3.7018807523009205e-05, + "loss": 0.5645, + "step": 6498 + }, + { + "epoch": 8.31872, + "grad_norm": 0.8547096252441406, + "learning_rate": 3.701680672268908e-05, + "loss": 0.5968, + "step": 6499 + }, + { + "epoch": 8.32, + "grad_norm": 0.8142231702804565, + "learning_rate": 3.701480592236895e-05, + "loss": 0.6064, + "step": 6500 + }, + { + "epoch": 8.32128, + "grad_norm": 0.8083314895629883, + "learning_rate": 3.701280512204882e-05, + "loss": 0.5691, + "step": 6501 + }, + { + "epoch": 8.32256, + "grad_norm": 0.7994301319122314, + "learning_rate": 3.701080432172869e-05, + "loss": 0.6027, + "step": 6502 + }, + { + "epoch": 8.32384, + "grad_norm": 0.8659411072731018, + "learning_rate": 3.7008803521408565e-05, + "loss": 0.6085, + "step": 6503 + }, + { + "epoch": 8.32512, + "grad_norm": 0.8234634399414062, + "learning_rate": 3.7006802721088437e-05, + "loss": 0.593, + "step": 6504 + }, + { + "epoch": 8.3264, + "grad_norm": 0.8921228051185608, + "learning_rate": 3.700480192076831e-05, + "loss": 0.6614, + "step": 6505 + }, + { + "epoch": 8.32768, + "grad_norm": 0.7711195945739746, + "learning_rate": 3.700280112044818e-05, + "loss": 0.5886, + "step": 6506 + }, + { + "epoch": 8.32896, + "grad_norm": 0.8125150203704834, + "learning_rate": 3.700080032012805e-05, + "loss": 0.58, + "step": 6507 + }, + { + "epoch": 8.33024, + "grad_norm": 0.8450069427490234, + "learning_rate": 3.6998799519807924e-05, + "loss": 0.6523, + "step": 6508 + }, + { + "epoch": 8.33152, + "grad_norm": 0.8026819229125977, + "learning_rate": 3.6996798719487796e-05, + "loss": 0.5938, + "step": 6509 + }, + { + "epoch": 8.3328, + "grad_norm": 0.7969563603401184, + "learning_rate": 3.699479791916767e-05, + "loss": 0.591, + "step": 6510 + }, + { + "epoch": 8.33408, + "grad_norm": 0.8721278309822083, + "learning_rate": 
3.699279711884754e-05, + "loss": 0.636, + "step": 6511 + }, + { + "epoch": 8.33536, + "grad_norm": 0.81974196434021, + "learning_rate": 3.699079631852742e-05, + "loss": 0.6409, + "step": 6512 + }, + { + "epoch": 8.33664, + "grad_norm": 0.7834908366203308, + "learning_rate": 3.698879551820728e-05, + "loss": 0.5799, + "step": 6513 + }, + { + "epoch": 8.33792, + "grad_norm": 0.815528929233551, + "learning_rate": 3.6986794717887155e-05, + "loss": 0.5999, + "step": 6514 + }, + { + "epoch": 8.3392, + "grad_norm": 0.8319604992866516, + "learning_rate": 3.698479391756703e-05, + "loss": 0.6098, + "step": 6515 + }, + { + "epoch": 8.34048, + "grad_norm": 0.7863122820854187, + "learning_rate": 3.69827931172469e-05, + "loss": 0.5897, + "step": 6516 + }, + { + "epoch": 8.34176, + "grad_norm": 0.8292329907417297, + "learning_rate": 3.698079231692677e-05, + "loss": 0.5612, + "step": 6517 + }, + { + "epoch": 8.34304, + "grad_norm": 0.8444699645042419, + "learning_rate": 3.697879151660664e-05, + "loss": 0.6062, + "step": 6518 + }, + { + "epoch": 8.34432, + "grad_norm": 0.8435462117195129, + "learning_rate": 3.697679071628652e-05, + "loss": 0.5952, + "step": 6519 + }, + { + "epoch": 8.3456, + "grad_norm": 0.8049376606941223, + "learning_rate": 3.697478991596639e-05, + "loss": 0.5783, + "step": 6520 + }, + { + "epoch": 8.34688, + "grad_norm": 0.8144257068634033, + "learning_rate": 3.697278911564626e-05, + "loss": 0.6279, + "step": 6521 + }, + { + "epoch": 8.34816, + "grad_norm": 0.8180856108665466, + "learning_rate": 3.697078831532613e-05, + "loss": 0.5911, + "step": 6522 + }, + { + "epoch": 8.34944, + "grad_norm": 0.8317684531211853, + "learning_rate": 3.6968787515006e-05, + "loss": 0.6043, + "step": 6523 + }, + { + "epoch": 8.35072, + "grad_norm": 0.8124802112579346, + "learning_rate": 3.6966786714685874e-05, + "loss": 0.6082, + "step": 6524 + }, + { + "epoch": 8.352, + "grad_norm": 0.8340176939964294, + "learning_rate": 3.6964785914365746e-05, + "loss": 0.6172, + "step": 6525 + }, 
+ { + "epoch": 8.35328, + "grad_norm": 0.8047380447387695, + "learning_rate": 3.6962785114045624e-05, + "loss": 0.5722, + "step": 6526 + }, + { + "epoch": 8.35456, + "grad_norm": 0.8233848810195923, + "learning_rate": 3.6960784313725496e-05, + "loss": 0.6046, + "step": 6527 + }, + { + "epoch": 8.35584, + "grad_norm": 0.7907252311706543, + "learning_rate": 3.695878351340537e-05, + "loss": 0.5304, + "step": 6528 + }, + { + "epoch": 8.35712, + "grad_norm": 0.8269102573394775, + "learning_rate": 3.695678271308523e-05, + "loss": 0.5938, + "step": 6529 + }, + { + "epoch": 8.3584, + "grad_norm": 0.8250780701637268, + "learning_rate": 3.6954781912765105e-05, + "loss": 0.5838, + "step": 6530 + }, + { + "epoch": 8.35968, + "grad_norm": 0.7899469137191772, + "learning_rate": 3.695278111244498e-05, + "loss": 0.5508, + "step": 6531 + }, + { + "epoch": 8.36096, + "grad_norm": 0.8494967818260193, + "learning_rate": 3.695078031212485e-05, + "loss": 0.6586, + "step": 6532 + }, + { + "epoch": 8.36224, + "grad_norm": 0.775540828704834, + "learning_rate": 3.694877951180473e-05, + "loss": 0.5521, + "step": 6533 + }, + { + "epoch": 8.36352, + "grad_norm": 0.7813871502876282, + "learning_rate": 3.69467787114846e-05, + "loss": 0.5531, + "step": 6534 + }, + { + "epoch": 8.3648, + "grad_norm": 0.7743489742279053, + "learning_rate": 3.694477791116447e-05, + "loss": 0.5521, + "step": 6535 + }, + { + "epoch": 8.36608, + "grad_norm": 0.8797667026519775, + "learning_rate": 3.694277711084434e-05, + "loss": 0.5972, + "step": 6536 + }, + { + "epoch": 8.36736, + "grad_norm": 0.8481217622756958, + "learning_rate": 3.694077631052421e-05, + "loss": 0.6185, + "step": 6537 + }, + { + "epoch": 8.36864, + "grad_norm": 0.8754853010177612, + "learning_rate": 3.693877551020408e-05, + "loss": 0.6194, + "step": 6538 + }, + { + "epoch": 8.36992, + "grad_norm": 0.8245212435722351, + "learning_rate": 3.693677470988395e-05, + "loss": 0.5823, + "step": 6539 + }, + { + "epoch": 8.3712, + "grad_norm": 
0.8101980090141296, + "learning_rate": 3.693477390956382e-05, + "loss": 0.5779, + "step": 6540 + }, + { + "epoch": 8.37248, + "grad_norm": 0.8348204493522644, + "learning_rate": 3.69327731092437e-05, + "loss": 0.5794, + "step": 6541 + }, + { + "epoch": 8.37376, + "grad_norm": 0.84105384349823, + "learning_rate": 3.6930772308923574e-05, + "loss": 0.5751, + "step": 6542 + }, + { + "epoch": 8.37504, + "grad_norm": 0.8668518662452698, + "learning_rate": 3.6928771508603446e-05, + "loss": 0.5879, + "step": 6543 + }, + { + "epoch": 8.37632, + "grad_norm": 0.8224269151687622, + "learning_rate": 3.692677070828332e-05, + "loss": 0.5835, + "step": 6544 + }, + { + "epoch": 8.3776, + "grad_norm": 0.8665353059768677, + "learning_rate": 3.692476990796318e-05, + "loss": 0.5756, + "step": 6545 + }, + { + "epoch": 8.37888, + "grad_norm": 0.8154611587524414, + "learning_rate": 3.6922769107643054e-05, + "loss": 0.6244, + "step": 6546 + }, + { + "epoch": 8.38016, + "grad_norm": 0.7815151810646057, + "learning_rate": 3.6920768307322926e-05, + "loss": 0.5823, + "step": 6547 + }, + { + "epoch": 8.38144, + "grad_norm": 0.8650956749916077, + "learning_rate": 3.6918767507002805e-05, + "loss": 0.5881, + "step": 6548 + }, + { + "epoch": 8.38272, + "grad_norm": 0.8099015355110168, + "learning_rate": 3.691676670668268e-05, + "loss": 0.6132, + "step": 6549 + }, + { + "epoch": 8.384, + "grad_norm": 0.8544344902038574, + "learning_rate": 3.691476590636255e-05, + "loss": 0.5674, + "step": 6550 + }, + { + "epoch": 8.38528, + "grad_norm": 0.8150829672813416, + "learning_rate": 3.691276510604242e-05, + "loss": 0.6013, + "step": 6551 + }, + { + "epoch": 8.38656, + "grad_norm": 0.8252568244934082, + "learning_rate": 3.691076430572229e-05, + "loss": 0.5952, + "step": 6552 + }, + { + "epoch": 8.38784, + "grad_norm": 0.8224411010742188, + "learning_rate": 3.690876350540216e-05, + "loss": 0.613, + "step": 6553 + }, + { + "epoch": 8.38912, + "grad_norm": 0.8442636728286743, + "learning_rate": 
3.690676270508203e-05, + "loss": 0.6177, + "step": 6554 + }, + { + "epoch": 8.3904, + "grad_norm": 0.8492255210876465, + "learning_rate": 3.690476190476191e-05, + "loss": 0.5842, + "step": 6555 + }, + { + "epoch": 8.39168, + "grad_norm": 0.8563710451126099, + "learning_rate": 3.690276110444178e-05, + "loss": 0.6015, + "step": 6556 + }, + { + "epoch": 8.39296, + "grad_norm": 0.8145362138748169, + "learning_rate": 3.690076030412165e-05, + "loss": 0.5714, + "step": 6557 + }, + { + "epoch": 8.39424, + "grad_norm": 0.8347020745277405, + "learning_rate": 3.6898759503801524e-05, + "loss": 0.6105, + "step": 6558 + }, + { + "epoch": 8.39552, + "grad_norm": 0.8055539131164551, + "learning_rate": 3.6896758703481395e-05, + "loss": 0.6214, + "step": 6559 + }, + { + "epoch": 8.3968, + "grad_norm": 0.7828962802886963, + "learning_rate": 3.689475790316127e-05, + "loss": 0.6206, + "step": 6560 + }, + { + "epoch": 8.39808, + "grad_norm": 0.8174715042114258, + "learning_rate": 3.689275710284113e-05, + "loss": 0.536, + "step": 6561 + }, + { + "epoch": 8.39936, + "grad_norm": 0.7967883944511414, + "learning_rate": 3.689075630252101e-05, + "loss": 0.6092, + "step": 6562 + }, + { + "epoch": 8.40064, + "grad_norm": 0.8208482265472412, + "learning_rate": 3.688875550220088e-05, + "loss": 0.5818, + "step": 6563 + }, + { + "epoch": 8.40192, + "grad_norm": 0.7751407623291016, + "learning_rate": 3.6886754701880755e-05, + "loss": 0.5362, + "step": 6564 + }, + { + "epoch": 8.4032, + "grad_norm": 0.8539714217185974, + "learning_rate": 3.6884753901560627e-05, + "loss": 0.5734, + "step": 6565 + }, + { + "epoch": 8.40448, + "grad_norm": 0.8336576819419861, + "learning_rate": 3.68827531012405e-05, + "loss": 0.5931, + "step": 6566 + }, + { + "epoch": 8.40576, + "grad_norm": 0.7995145320892334, + "learning_rate": 3.688075230092037e-05, + "loss": 0.6048, + "step": 6567 + }, + { + "epoch": 8.40704, + "grad_norm": 0.8343474268913269, + "learning_rate": 3.687875150060024e-05, + "loss": 0.5621, + "step": 
6568 + }, + { + "epoch": 8.40832, + "grad_norm": 0.8455119132995605, + "learning_rate": 3.6876750700280114e-05, + "loss": 0.597, + "step": 6569 + }, + { + "epoch": 8.4096, + "grad_norm": 0.839838445186615, + "learning_rate": 3.6874749899959986e-05, + "loss": 0.5993, + "step": 6570 + }, + { + "epoch": 8.41088, + "grad_norm": 0.8005172610282898, + "learning_rate": 3.687274909963986e-05, + "loss": 0.5688, + "step": 6571 + }, + { + "epoch": 8.41216, + "grad_norm": 0.8667073249816895, + "learning_rate": 3.687074829931973e-05, + "loss": 0.5971, + "step": 6572 + }, + { + "epoch": 8.41344, + "grad_norm": 0.7542425990104675, + "learning_rate": 3.68687474989996e-05, + "loss": 0.5329, + "step": 6573 + }, + { + "epoch": 8.414719999999999, + "grad_norm": 0.8643616437911987, + "learning_rate": 3.686674669867947e-05, + "loss": 0.6388, + "step": 6574 + }, + { + "epoch": 8.416, + "grad_norm": 0.807478666305542, + "learning_rate": 3.6864745898359345e-05, + "loss": 0.6263, + "step": 6575 + }, + { + "epoch": 8.41728, + "grad_norm": 0.8404189944267273, + "learning_rate": 3.686274509803922e-05, + "loss": 0.6011, + "step": 6576 + }, + { + "epoch": 8.41856, + "grad_norm": 0.8093534111976624, + "learning_rate": 3.686074429771909e-05, + "loss": 0.5659, + "step": 6577 + }, + { + "epoch": 8.41984, + "grad_norm": 0.8560016751289368, + "learning_rate": 3.685874349739896e-05, + "loss": 0.6221, + "step": 6578 + }, + { + "epoch": 8.42112, + "grad_norm": 0.836397647857666, + "learning_rate": 3.685674269707883e-05, + "loss": 0.6274, + "step": 6579 + }, + { + "epoch": 8.4224, + "grad_norm": 0.8117139935493469, + "learning_rate": 3.6854741896758704e-05, + "loss": 0.6296, + "step": 6580 + }, + { + "epoch": 8.42368, + "grad_norm": 0.8044874668121338, + "learning_rate": 3.6852741096438576e-05, + "loss": 0.5879, + "step": 6581 + }, + { + "epoch": 8.42496, + "grad_norm": 0.7988526225090027, + "learning_rate": 3.685074029611845e-05, + "loss": 0.5576, + "step": 6582 + }, + { + "epoch": 8.42624, + 
"grad_norm": 0.7902975678443909, + "learning_rate": 3.684873949579833e-05, + "loss": 0.5694, + "step": 6583 + }, + { + "epoch": 8.42752, + "grad_norm": 0.8768497109413147, + "learning_rate": 3.684673869547819e-05, + "loss": 0.6101, + "step": 6584 + }, + { + "epoch": 8.4288, + "grad_norm": 0.806703507900238, + "learning_rate": 3.6844737895158064e-05, + "loss": 0.576, + "step": 6585 + }, + { + "epoch": 8.43008, + "grad_norm": 0.8013177514076233, + "learning_rate": 3.6842737094837936e-05, + "loss": 0.6118, + "step": 6586 + }, + { + "epoch": 8.43136, + "grad_norm": 0.8201990723609924, + "learning_rate": 3.684073629451781e-05, + "loss": 0.5902, + "step": 6587 + }, + { + "epoch": 8.43264, + "grad_norm": 0.8329010009765625, + "learning_rate": 3.683873549419768e-05, + "loss": 0.5995, + "step": 6588 + }, + { + "epoch": 8.43392, + "grad_norm": 0.7939520478248596, + "learning_rate": 3.683673469387755e-05, + "loss": 0.5849, + "step": 6589 + }, + { + "epoch": 8.4352, + "grad_norm": 0.8510729670524597, + "learning_rate": 3.683473389355743e-05, + "loss": 0.6381, + "step": 6590 + }, + { + "epoch": 8.43648, + "grad_norm": 0.8639024496078491, + "learning_rate": 3.68327330932373e-05, + "loss": 0.6025, + "step": 6591 + }, + { + "epoch": 8.43776, + "grad_norm": 0.81855708360672, + "learning_rate": 3.683073229291717e-05, + "loss": 0.5719, + "step": 6592 + }, + { + "epoch": 8.43904, + "grad_norm": 0.8242548108100891, + "learning_rate": 3.682873149259704e-05, + "loss": 0.5777, + "step": 6593 + }, + { + "epoch": 8.44032, + "grad_norm": 0.8355400562286377, + "learning_rate": 3.682673069227691e-05, + "loss": 0.5496, + "step": 6594 + }, + { + "epoch": 8.4416, + "grad_norm": 0.8287314772605896, + "learning_rate": 3.682472989195678e-05, + "loss": 0.6298, + "step": 6595 + }, + { + "epoch": 8.44288, + "grad_norm": 0.8493402600288391, + "learning_rate": 3.6822729091636654e-05, + "loss": 0.624, + "step": 6596 + }, + { + "epoch": 8.44416, + "grad_norm": 0.786516547203064, + "learning_rate": 
3.682072829131653e-05, + "loss": 0.5663, + "step": 6597 + }, + { + "epoch": 8.44544, + "grad_norm": 0.7651713490486145, + "learning_rate": 3.6818727490996405e-05, + "loss": 0.5248, + "step": 6598 + }, + { + "epoch": 8.44672, + "grad_norm": 0.8335928320884705, + "learning_rate": 3.6816726690676276e-05, + "loss": 0.5722, + "step": 6599 + }, + { + "epoch": 8.448, + "grad_norm": 0.8091786503791809, + "learning_rate": 3.681472589035614e-05, + "loss": 0.554, + "step": 6600 + }, + { + "epoch": 8.44928, + "grad_norm": 0.846765398979187, + "learning_rate": 3.6812725090036013e-05, + "loss": 0.663, + "step": 6601 + }, + { + "epoch": 8.45056, + "grad_norm": 0.8616847991943359, + "learning_rate": 3.6810724289715885e-05, + "loss": 0.6325, + "step": 6602 + }, + { + "epoch": 8.45184, + "grad_norm": 0.7922678589820862, + "learning_rate": 3.680872348939576e-05, + "loss": 0.5701, + "step": 6603 + }, + { + "epoch": 8.45312, + "grad_norm": 0.8290618658065796, + "learning_rate": 3.6806722689075636e-05, + "loss": 0.6524, + "step": 6604 + }, + { + "epoch": 8.4544, + "grad_norm": 0.8202276825904846, + "learning_rate": 3.680472188875551e-05, + "loss": 0.5703, + "step": 6605 + }, + { + "epoch": 8.45568, + "grad_norm": 0.8039751052856445, + "learning_rate": 3.680272108843538e-05, + "loss": 0.5763, + "step": 6606 + }, + { + "epoch": 8.45696, + "grad_norm": 0.7898495197296143, + "learning_rate": 3.680072028811525e-05, + "loss": 0.5615, + "step": 6607 + }, + { + "epoch": 8.45824, + "grad_norm": 0.833078145980835, + "learning_rate": 3.6798719487795116e-05, + "loss": 0.5745, + "step": 6608 + }, + { + "epoch": 8.45952, + "grad_norm": 0.908738911151886, + "learning_rate": 3.679671868747499e-05, + "loss": 0.5853, + "step": 6609 + }, + { + "epoch": 8.4608, + "grad_norm": 0.8550048470497131, + "learning_rate": 3.679471788715486e-05, + "loss": 0.5834, + "step": 6610 + }, + { + "epoch": 8.46208, + "grad_norm": 0.8882399201393127, + "learning_rate": 3.679271708683474e-05, + "loss": 0.6226, + "step": 6611 
+ }, + { + "epoch": 8.46336, + "grad_norm": 0.832423746585846, + "learning_rate": 3.679071628651461e-05, + "loss": 0.5573, + "step": 6612 + }, + { + "epoch": 8.46464, + "grad_norm": 0.842068076133728, + "learning_rate": 3.678871548619448e-05, + "loss": 0.6053, + "step": 6613 + }, + { + "epoch": 8.46592, + "grad_norm": 0.8376506567001343, + "learning_rate": 3.6786714685874354e-05, + "loss": 0.5527, + "step": 6614 + }, + { + "epoch": 8.4672, + "grad_norm": 0.8572778701782227, + "learning_rate": 3.6784713885554226e-05, + "loss": 0.6259, + "step": 6615 + }, + { + "epoch": 8.46848, + "grad_norm": 0.817713737487793, + "learning_rate": 3.678271308523409e-05, + "loss": 0.5689, + "step": 6616 + }, + { + "epoch": 8.46976, + "grad_norm": 0.8326448202133179, + "learning_rate": 3.678071228491396e-05, + "loss": 0.5906, + "step": 6617 + }, + { + "epoch": 8.47104, + "grad_norm": 0.8092232942581177, + "learning_rate": 3.677871148459384e-05, + "loss": 0.5749, + "step": 6618 + }, + { + "epoch": 8.47232, + "grad_norm": 0.8321169018745422, + "learning_rate": 3.6776710684273714e-05, + "loss": 0.5773, + "step": 6619 + }, + { + "epoch": 8.4736, + "grad_norm": 0.8475345373153687, + "learning_rate": 3.6774709883953585e-05, + "loss": 0.6028, + "step": 6620 + }, + { + "epoch": 8.47488, + "grad_norm": 0.8301557898521423, + "learning_rate": 3.677270908363346e-05, + "loss": 0.5961, + "step": 6621 + }, + { + "epoch": 8.47616, + "grad_norm": 0.8141891360282898, + "learning_rate": 3.677070828331333e-05, + "loss": 0.6512, + "step": 6622 + }, + { + "epoch": 8.47744, + "grad_norm": 0.7891356348991394, + "learning_rate": 3.67687074829932e-05, + "loss": 0.539, + "step": 6623 + }, + { + "epoch": 8.47872, + "grad_norm": 0.7641344666481018, + "learning_rate": 3.6766706682673066e-05, + "loss": 0.5497, + "step": 6624 + }, + { + "epoch": 8.48, + "grad_norm": 0.766279935836792, + "learning_rate": 3.6764705882352945e-05, + "loss": 0.5496, + "step": 6625 + }, + { + "epoch": 8.48128, + "grad_norm": 
0.836593508720398, + "learning_rate": 3.6762705082032817e-05, + "loss": 0.6149, + "step": 6626 + }, + { + "epoch": 8.48256, + "grad_norm": 0.8140498399734497, + "learning_rate": 3.676070428171269e-05, + "loss": 0.5873, + "step": 6627 + }, + { + "epoch": 8.48384, + "grad_norm": 0.8106237053871155, + "learning_rate": 3.675870348139256e-05, + "loss": 0.5868, + "step": 6628 + }, + { + "epoch": 8.48512, + "grad_norm": 0.7953768968582153, + "learning_rate": 3.675670268107243e-05, + "loss": 0.5578, + "step": 6629 + }, + { + "epoch": 8.4864, + "grad_norm": 0.8248597383499146, + "learning_rate": 3.6754701880752304e-05, + "loss": 0.6027, + "step": 6630 + }, + { + "epoch": 8.48768, + "grad_norm": 0.8090713024139404, + "learning_rate": 3.6752701080432176e-05, + "loss": 0.5848, + "step": 6631 + }, + { + "epoch": 8.48896, + "grad_norm": 0.8814820647239685, + "learning_rate": 3.675070028011205e-05, + "loss": 0.6117, + "step": 6632 + }, + { + "epoch": 8.49024, + "grad_norm": 0.8482004404067993, + "learning_rate": 3.674869947979192e-05, + "loss": 0.6126, + "step": 6633 + }, + { + "epoch": 8.49152, + "grad_norm": 0.8183587789535522, + "learning_rate": 3.674669867947179e-05, + "loss": 0.5719, + "step": 6634 + }, + { + "epoch": 8.4928, + "grad_norm": 0.8571783304214478, + "learning_rate": 3.674469787915166e-05, + "loss": 0.6063, + "step": 6635 + }, + { + "epoch": 8.49408, + "grad_norm": 0.802609384059906, + "learning_rate": 3.6742697078831535e-05, + "loss": 0.625, + "step": 6636 + }, + { + "epoch": 8.49536, + "grad_norm": 0.8341289758682251, + "learning_rate": 3.674069627851141e-05, + "loss": 0.5994, + "step": 6637 + }, + { + "epoch": 8.49664, + "grad_norm": 0.8001972436904907, + "learning_rate": 3.673869547819128e-05, + "loss": 0.5502, + "step": 6638 + }, + { + "epoch": 8.49792, + "grad_norm": 0.8287181258201599, + "learning_rate": 3.673669467787115e-05, + "loss": 0.602, + "step": 6639 + }, + { + "epoch": 8.4992, + "grad_norm": 0.7981618046760559, + "learning_rate": 
3.673469387755102e-05, + "loss": 0.6023, + "step": 6640 + }, + { + "epoch": 8.50048, + "grad_norm": 0.8224805593490601, + "learning_rate": 3.6732693077230894e-05, + "loss": 0.5563, + "step": 6641 + }, + { + "epoch": 8.50176, + "grad_norm": 0.7999238967895508, + "learning_rate": 3.6730692276910766e-05, + "loss": 0.5549, + "step": 6642 + }, + { + "epoch": 8.50304, + "grad_norm": 0.8282129764556885, + "learning_rate": 3.672869147659064e-05, + "loss": 0.6004, + "step": 6643 + }, + { + "epoch": 8.50432, + "grad_norm": 0.8010920882225037, + "learning_rate": 3.672669067627051e-05, + "loss": 0.5956, + "step": 6644 + }, + { + "epoch": 8.5056, + "grad_norm": 0.7640748023986816, + "learning_rate": 3.672468987595038e-05, + "loss": 0.5638, + "step": 6645 + }, + { + "epoch": 8.50688, + "grad_norm": 0.8032206296920776, + "learning_rate": 3.6722689075630254e-05, + "loss": 0.5797, + "step": 6646 + }, + { + "epoch": 8.50816, + "grad_norm": 0.8864914774894714, + "learning_rate": 3.6720688275310126e-05, + "loss": 0.5798, + "step": 6647 + }, + { + "epoch": 8.50944, + "grad_norm": 0.8176901340484619, + "learning_rate": 3.671868747499e-05, + "loss": 0.6298, + "step": 6648 + }, + { + "epoch": 8.51072, + "grad_norm": 0.8001031279563904, + "learning_rate": 3.671668667466987e-05, + "loss": 0.6211, + "step": 6649 + }, + { + "epoch": 8.512, + "grad_norm": 0.8112642168998718, + "learning_rate": 3.671468587434974e-05, + "loss": 0.6003, + "step": 6650 + }, + { + "epoch": 8.51328, + "grad_norm": 0.8111798763275146, + "learning_rate": 3.671268507402961e-05, + "loss": 0.603, + "step": 6651 + }, + { + "epoch": 8.51456, + "grad_norm": 0.8160041570663452, + "learning_rate": 3.6710684273709485e-05, + "loss": 0.5565, + "step": 6652 + }, + { + "epoch": 8.51584, + "grad_norm": 0.7813338041305542, + "learning_rate": 3.670868347338936e-05, + "loss": 0.5783, + "step": 6653 + }, + { + "epoch": 8.51712, + "grad_norm": 0.8111140131950378, + "learning_rate": 3.670668267306923e-05, + "loss": 0.6126, + "step": 6654 
+ }, + { + "epoch": 8.5184, + "grad_norm": 0.8105128407478333, + "learning_rate": 3.67046818727491e-05, + "loss": 0.5711, + "step": 6655 + }, + { + "epoch": 8.51968, + "grad_norm": 0.8445085287094116, + "learning_rate": 3.670268107242897e-05, + "loss": 0.64, + "step": 6656 + }, + { + "epoch": 8.52096, + "grad_norm": 0.8055734634399414, + "learning_rate": 3.6700680272108844e-05, + "loss": 0.5938, + "step": 6657 + }, + { + "epoch": 8.52224, + "grad_norm": 0.8156753182411194, + "learning_rate": 3.6698679471788716e-05, + "loss": 0.5672, + "step": 6658 + }, + { + "epoch": 8.52352, + "grad_norm": 0.7987210154533386, + "learning_rate": 3.669667867146859e-05, + "loss": 0.5924, + "step": 6659 + }, + { + "epoch": 8.5248, + "grad_norm": 0.878676176071167, + "learning_rate": 3.669467787114846e-05, + "loss": 0.6441, + "step": 6660 + }, + { + "epoch": 8.52608, + "grad_norm": 0.8461551666259766, + "learning_rate": 3.669267707082834e-05, + "loss": 0.5701, + "step": 6661 + }, + { + "epoch": 8.52736, + "grad_norm": 0.7994030117988586, + "learning_rate": 3.6690676270508203e-05, + "loss": 0.5331, + "step": 6662 + }, + { + "epoch": 8.52864, + "grad_norm": 0.8361437320709229, + "learning_rate": 3.6688675470188075e-05, + "loss": 0.6438, + "step": 6663 + }, + { + "epoch": 8.52992, + "grad_norm": 0.8350645899772644, + "learning_rate": 3.668667466986795e-05, + "loss": 0.5887, + "step": 6664 + }, + { + "epoch": 8.5312, + "grad_norm": 0.8255120515823364, + "learning_rate": 3.668467386954782e-05, + "loss": 0.5805, + "step": 6665 + }, + { + "epoch": 8.53248, + "grad_norm": 0.8830844759941101, + "learning_rate": 3.668267306922769e-05, + "loss": 0.6237, + "step": 6666 + }, + { + "epoch": 8.533760000000001, + "grad_norm": 0.8089183568954468, + "learning_rate": 3.668067226890756e-05, + "loss": 0.5825, + "step": 6667 + }, + { + "epoch": 8.53504, + "grad_norm": 0.7940279245376587, + "learning_rate": 3.667867146858744e-05, + "loss": 0.5835, + "step": 6668 + }, + { + "epoch": 8.53632, + "grad_norm": 
0.8011153340339661, + "learning_rate": 3.667667066826731e-05, + "loss": 0.5635, + "step": 6669 + }, + { + "epoch": 8.5376, + "grad_norm": 0.8229313492774963, + "learning_rate": 3.667466986794718e-05, + "loss": 0.553, + "step": 6670 + }, + { + "epoch": 8.53888, + "grad_norm": 0.8413478136062622, + "learning_rate": 3.667266906762705e-05, + "loss": 0.5636, + "step": 6671 + }, + { + "epoch": 8.54016, + "grad_norm": 0.8253154754638672, + "learning_rate": 3.667066826730692e-05, + "loss": 0.6305, + "step": 6672 + }, + { + "epoch": 8.54144, + "grad_norm": 0.8086980581283569, + "learning_rate": 3.6668667466986794e-05, + "loss": 0.6359, + "step": 6673 + }, + { + "epoch": 8.54272, + "grad_norm": 0.8422557711601257, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.5517, + "step": 6674 + }, + { + "epoch": 8.544, + "grad_norm": 0.7937127947807312, + "learning_rate": 3.6664665866346544e-05, + "loss": 0.6099, + "step": 6675 + }, + { + "epoch": 8.54528, + "grad_norm": 0.8104080557823181, + "learning_rate": 3.6662665066026416e-05, + "loss": 0.647, + "step": 6676 + }, + { + "epoch": 8.54656, + "grad_norm": 0.7860154509544373, + "learning_rate": 3.666066426570629e-05, + "loss": 0.5656, + "step": 6677 + }, + { + "epoch": 8.54784, + "grad_norm": 0.8154928684234619, + "learning_rate": 3.665866346538615e-05, + "loss": 0.5811, + "step": 6678 + }, + { + "epoch": 8.54912, + "grad_norm": 0.8200475573539734, + "learning_rate": 3.6656662665066025e-05, + "loss": 0.5985, + "step": 6679 + }, + { + "epoch": 8.5504, + "grad_norm": 0.7977676391601562, + "learning_rate": 3.66546618647459e-05, + "loss": 0.535, + "step": 6680 + }, + { + "epoch": 8.55168, + "grad_norm": 0.7876496315002441, + "learning_rate": 3.665266106442577e-05, + "loss": 0.582, + "step": 6681 + }, + { + "epoch": 8.55296, + "grad_norm": 0.7595927119255066, + "learning_rate": 3.665066026410565e-05, + "loss": 0.5749, + "step": 6682 + }, + { + "epoch": 8.55424, + "grad_norm": 0.8560596108436584, + "learning_rate": 
3.664865946378552e-05, + "loss": 0.6061, + "step": 6683 + }, + { + "epoch": 8.55552, + "grad_norm": 0.7883579134941101, + "learning_rate": 3.664665866346539e-05, + "loss": 0.5673, + "step": 6684 + }, + { + "epoch": 8.556799999999999, + "grad_norm": 0.8544105887413025, + "learning_rate": 3.664465786314526e-05, + "loss": 0.644, + "step": 6685 + }, + { + "epoch": 8.55808, + "grad_norm": 0.8229860663414001, + "learning_rate": 3.664265706282513e-05, + "loss": 0.5651, + "step": 6686 + }, + { + "epoch": 8.55936, + "grad_norm": 0.8209177851676941, + "learning_rate": 3.6640656262505e-05, + "loss": 0.6064, + "step": 6687 + }, + { + "epoch": 8.56064, + "grad_norm": 0.867435097694397, + "learning_rate": 3.663865546218487e-05, + "loss": 0.568, + "step": 6688 + }, + { + "epoch": 8.56192, + "grad_norm": 0.8385619521141052, + "learning_rate": 3.663665466186475e-05, + "loss": 0.5947, + "step": 6689 + }, + { + "epoch": 8.5632, + "grad_norm": 0.7875993847846985, + "learning_rate": 3.663465386154462e-05, + "loss": 0.5768, + "step": 6690 + }, + { + "epoch": 8.56448, + "grad_norm": 0.8218605518341064, + "learning_rate": 3.6632653061224494e-05, + "loss": 0.5482, + "step": 6691 + }, + { + "epoch": 8.565760000000001, + "grad_norm": 0.7455148696899414, + "learning_rate": 3.6630652260904366e-05, + "loss": 0.5475, + "step": 6692 + }, + { + "epoch": 8.56704, + "grad_norm": 0.7991039156913757, + "learning_rate": 3.662865146058424e-05, + "loss": 0.6022, + "step": 6693 + }, + { + "epoch": 8.56832, + "grad_norm": 0.8934693932533264, + "learning_rate": 3.66266506602641e-05, + "loss": 0.6481, + "step": 6694 + }, + { + "epoch": 8.5696, + "grad_norm": 0.8251892328262329, + "learning_rate": 3.6624649859943975e-05, + "loss": 0.6115, + "step": 6695 + }, + { + "epoch": 8.57088, + "grad_norm": 0.7520719766616821, + "learning_rate": 3.662264905962385e-05, + "loss": 0.5618, + "step": 6696 + }, + { + "epoch": 8.57216, + "grad_norm": 0.7765500545501709, + "learning_rate": 3.6620648259303725e-05, + "loss": 
0.5914, + "step": 6697 + }, + { + "epoch": 8.57344, + "grad_norm": 0.8249855637550354, + "learning_rate": 3.66186474589836e-05, + "loss": 0.5604, + "step": 6698 + }, + { + "epoch": 8.57472, + "grad_norm": 0.8163955211639404, + "learning_rate": 3.661664665866347e-05, + "loss": 0.5495, + "step": 6699 + }, + { + "epoch": 8.576, + "grad_norm": 0.7878378033638, + "learning_rate": 3.661464585834334e-05, + "loss": 0.5725, + "step": 6700 + }, + { + "epoch": 8.57728, + "grad_norm": 0.8135743141174316, + "learning_rate": 3.661264505802321e-05, + "loss": 0.5855, + "step": 6701 + }, + { + "epoch": 8.57856, + "grad_norm": 0.8749912977218628, + "learning_rate": 3.661064425770308e-05, + "loss": 0.6184, + "step": 6702 + }, + { + "epoch": 8.57984, + "grad_norm": 0.7891753315925598, + "learning_rate": 3.6608643457382956e-05, + "loss": 0.5873, + "step": 6703 + }, + { + "epoch": 8.58112, + "grad_norm": 0.8346560001373291, + "learning_rate": 3.660664265706283e-05, + "loss": 0.5899, + "step": 6704 + }, + { + "epoch": 8.5824, + "grad_norm": 0.874178946018219, + "learning_rate": 3.66046418567427e-05, + "loss": 0.6579, + "step": 6705 + }, + { + "epoch": 8.58368, + "grad_norm": 0.7875180244445801, + "learning_rate": 3.660264105642257e-05, + "loss": 0.5685, + "step": 6706 + }, + { + "epoch": 8.58496, + "grad_norm": 0.8282900452613831, + "learning_rate": 3.6600640256102444e-05, + "loss": 0.5997, + "step": 6707 + }, + { + "epoch": 8.58624, + "grad_norm": 0.8672759532928467, + "learning_rate": 3.6598639455782316e-05, + "loss": 0.5734, + "step": 6708 + }, + { + "epoch": 8.58752, + "grad_norm": 0.8106910586357117, + "learning_rate": 3.659663865546219e-05, + "loss": 0.6093, + "step": 6709 + }, + { + "epoch": 8.588799999999999, + "grad_norm": 0.8382178544998169, + "learning_rate": 3.659463785514206e-05, + "loss": 0.6188, + "step": 6710 + }, + { + "epoch": 8.59008, + "grad_norm": 0.8015230298042297, + "learning_rate": 3.659263705482193e-05, + "loss": 0.5521, + "step": 6711 + }, + { + "epoch": 
8.59136, + "grad_norm": 0.8438718914985657, + "learning_rate": 3.65906362545018e-05, + "loss": 0.6076, + "step": 6712 + }, + { + "epoch": 8.59264, + "grad_norm": 0.8490085601806641, + "learning_rate": 3.6588635454181675e-05, + "loss": 0.6349, + "step": 6713 + }, + { + "epoch": 8.59392, + "grad_norm": 0.7863012552261353, + "learning_rate": 3.658663465386155e-05, + "loss": 0.5578, + "step": 6714 + }, + { + "epoch": 8.5952, + "grad_norm": 0.7872239947319031, + "learning_rate": 3.658463385354142e-05, + "loss": 0.5816, + "step": 6715 + }, + { + "epoch": 8.59648, + "grad_norm": 0.7982747554779053, + "learning_rate": 3.658263305322129e-05, + "loss": 0.5745, + "step": 6716 + }, + { + "epoch": 8.59776, + "grad_norm": 0.7870101928710938, + "learning_rate": 3.658063225290116e-05, + "loss": 0.5575, + "step": 6717 + }, + { + "epoch": 8.59904, + "grad_norm": 0.8285995125770569, + "learning_rate": 3.6578631452581034e-05, + "loss": 0.5857, + "step": 6718 + }, + { + "epoch": 8.60032, + "grad_norm": 0.8226214051246643, + "learning_rate": 3.6576630652260906e-05, + "loss": 0.5893, + "step": 6719 + }, + { + "epoch": 8.6016, + "grad_norm": 0.8384042382240295, + "learning_rate": 3.657462985194078e-05, + "loss": 0.5664, + "step": 6720 + }, + { + "epoch": 8.60288, + "grad_norm": 0.8193085789680481, + "learning_rate": 3.657262905162065e-05, + "loss": 0.5688, + "step": 6721 + }, + { + "epoch": 8.60416, + "grad_norm": 0.8433443307876587, + "learning_rate": 3.657062825130052e-05, + "loss": 0.6191, + "step": 6722 + }, + { + "epoch": 8.60544, + "grad_norm": 0.7860081791877747, + "learning_rate": 3.6568627450980393e-05, + "loss": 0.547, + "step": 6723 + }, + { + "epoch": 8.60672, + "grad_norm": 0.7903274893760681, + "learning_rate": 3.6566626650660265e-05, + "loss": 0.5653, + "step": 6724 + }, + { + "epoch": 8.608, + "grad_norm": 0.8736501336097717, + "learning_rate": 3.656462585034014e-05, + "loss": 0.5936, + "step": 6725 + }, + { + "epoch": 8.60928, + "grad_norm": 0.8456231355667114, + 
"learning_rate": 3.656262505002001e-05, + "loss": 0.5829, + "step": 6726 + }, + { + "epoch": 8.61056, + "grad_norm": 0.7985215187072754, + "learning_rate": 3.656062424969988e-05, + "loss": 0.5847, + "step": 6727 + }, + { + "epoch": 8.61184, + "grad_norm": 0.8238279223442078, + "learning_rate": 3.655862344937975e-05, + "loss": 0.5796, + "step": 6728 + }, + { + "epoch": 8.61312, + "grad_norm": 0.826208233833313, + "learning_rate": 3.6556622649059625e-05, + "loss": 0.5839, + "step": 6729 + }, + { + "epoch": 8.6144, + "grad_norm": 0.7756603956222534, + "learning_rate": 3.6554621848739496e-05, + "loss": 0.544, + "step": 6730 + }, + { + "epoch": 8.61568, + "grad_norm": 0.7803246974945068, + "learning_rate": 3.6552621048419375e-05, + "loss": 0.5523, + "step": 6731 + }, + { + "epoch": 8.61696, + "grad_norm": 0.8132696747779846, + "learning_rate": 3.655062024809924e-05, + "loss": 0.5963, + "step": 6732 + }, + { + "epoch": 8.61824, + "grad_norm": 0.8277651071548462, + "learning_rate": 3.654861944777911e-05, + "loss": 0.6203, + "step": 6733 + }, + { + "epoch": 8.61952, + "grad_norm": 0.8409605622291565, + "learning_rate": 3.6546618647458984e-05, + "loss": 0.6087, + "step": 6734 + }, + { + "epoch": 8.6208, + "grad_norm": 0.8315775990486145, + "learning_rate": 3.6544617847138856e-05, + "loss": 0.5775, + "step": 6735 + }, + { + "epoch": 8.62208, + "grad_norm": 0.8047201037406921, + "learning_rate": 3.654261704681873e-05, + "loss": 0.6142, + "step": 6736 + }, + { + "epoch": 8.62336, + "grad_norm": 0.7707719802856445, + "learning_rate": 3.65406162464986e-05, + "loss": 0.5723, + "step": 6737 + }, + { + "epoch": 8.62464, + "grad_norm": 0.7818657755851746, + "learning_rate": 3.653861544617848e-05, + "loss": 0.6173, + "step": 6738 + }, + { + "epoch": 8.62592, + "grad_norm": 0.8295494914054871, + "learning_rate": 3.653661464585835e-05, + "loss": 0.598, + "step": 6739 + }, + { + "epoch": 8.6272, + "grad_norm": 0.8243419528007507, + "learning_rate": 3.6534613845538215e-05, + "loss": 
0.5859, + "step": 6740 + }, + { + "epoch": 8.62848, + "grad_norm": 0.8508108258247375, + "learning_rate": 3.653261304521809e-05, + "loss": 0.6116, + "step": 6741 + }, + { + "epoch": 8.62976, + "grad_norm": 0.8881233334541321, + "learning_rate": 3.653061224489796e-05, + "loss": 0.6642, + "step": 6742 + }, + { + "epoch": 8.63104, + "grad_norm": 0.8283036351203918, + "learning_rate": 3.652861144457783e-05, + "loss": 0.5926, + "step": 6743 + }, + { + "epoch": 8.63232, + "grad_norm": 0.7696643471717834, + "learning_rate": 3.65266106442577e-05, + "loss": 0.5894, + "step": 6744 + }, + { + "epoch": 8.6336, + "grad_norm": 0.8470257520675659, + "learning_rate": 3.652460984393758e-05, + "loss": 0.5801, + "step": 6745 + }, + { + "epoch": 8.63488, + "grad_norm": 0.8444271087646484, + "learning_rate": 3.652260904361745e-05, + "loss": 0.5986, + "step": 6746 + }, + { + "epoch": 8.63616, + "grad_norm": 0.7664961814880371, + "learning_rate": 3.6520608243297325e-05, + "loss": 0.553, + "step": 6747 + }, + { + "epoch": 8.63744, + "grad_norm": 0.8508731722831726, + "learning_rate": 3.651860744297719e-05, + "loss": 0.6113, + "step": 6748 + }, + { + "epoch": 8.63872, + "grad_norm": 0.7583442330360413, + "learning_rate": 3.651660664265706e-05, + "loss": 0.533, + "step": 6749 + }, + { + "epoch": 8.64, + "grad_norm": 0.812756359577179, + "learning_rate": 3.6514605842336934e-05, + "loss": 0.5829, + "step": 6750 + }, + { + "epoch": 8.64128, + "grad_norm": 0.8092144727706909, + "learning_rate": 3.6512605042016805e-05, + "loss": 0.5675, + "step": 6751 + }, + { + "epoch": 8.64256, + "grad_norm": 0.8396515846252441, + "learning_rate": 3.6510604241696684e-05, + "loss": 0.6065, + "step": 6752 + }, + { + "epoch": 8.64384, + "grad_norm": 0.7741784453392029, + "learning_rate": 3.6508603441376556e-05, + "loss": 0.5881, + "step": 6753 + }, + { + "epoch": 8.64512, + "grad_norm": 0.8222891092300415, + "learning_rate": 3.650660264105643e-05, + "loss": 0.6125, + "step": 6754 + }, + { + "epoch": 8.6464, + 
"grad_norm": 0.8462512493133545, + "learning_rate": 3.65046018407363e-05, + "loss": 0.6372, + "step": 6755 + }, + { + "epoch": 8.64768, + "grad_norm": 0.8327202796936035, + "learning_rate": 3.6502601040416165e-05, + "loss": 0.6419, + "step": 6756 + }, + { + "epoch": 8.64896, + "grad_norm": 0.7695246338844299, + "learning_rate": 3.6500600240096037e-05, + "loss": 0.5385, + "step": 6757 + }, + { + "epoch": 8.65024, + "grad_norm": 0.8170539736747742, + "learning_rate": 3.649859943977591e-05, + "loss": 0.5818, + "step": 6758 + }, + { + "epoch": 8.65152, + "grad_norm": 0.8147903680801392, + "learning_rate": 3.649659863945579e-05, + "loss": 0.616, + "step": 6759 + }, + { + "epoch": 8.6528, + "grad_norm": 0.7922413945198059, + "learning_rate": 3.649459783913566e-05, + "loss": 0.6042, + "step": 6760 + }, + { + "epoch": 8.65408, + "grad_norm": 0.7604812979698181, + "learning_rate": 3.649259703881553e-05, + "loss": 0.5312, + "step": 6761 + }, + { + "epoch": 8.65536, + "grad_norm": 0.855582594871521, + "learning_rate": 3.64905962384954e-05, + "loss": 0.6038, + "step": 6762 + }, + { + "epoch": 8.65664, + "grad_norm": 0.8385626077651978, + "learning_rate": 3.6488595438175275e-05, + "loss": 0.569, + "step": 6763 + }, + { + "epoch": 8.65792, + "grad_norm": 0.8010948300361633, + "learning_rate": 3.648659463785514e-05, + "loss": 0.5846, + "step": 6764 + }, + { + "epoch": 8.6592, + "grad_norm": 0.8321779370307922, + "learning_rate": 3.648459383753501e-05, + "loss": 0.6176, + "step": 6765 + }, + { + "epoch": 8.66048, + "grad_norm": 0.8671482801437378, + "learning_rate": 3.648259303721488e-05, + "loss": 0.6062, + "step": 6766 + }, + { + "epoch": 8.66176, + "grad_norm": 0.8177146911621094, + "learning_rate": 3.648059223689476e-05, + "loss": 0.5906, + "step": 6767 + }, + { + "epoch": 8.66304, + "grad_norm": 0.8458163738250732, + "learning_rate": 3.6478591436574634e-05, + "loss": 0.6042, + "step": 6768 + }, + { + "epoch": 8.66432, + "grad_norm": 0.7744457125663757, + "learning_rate": 
3.6476590636254506e-05, + "loss": 0.5768, + "step": 6769 + }, + { + "epoch": 8.6656, + "grad_norm": 0.8034748435020447, + "learning_rate": 3.647458983593438e-05, + "loss": 0.5837, + "step": 6770 + }, + { + "epoch": 8.66688, + "grad_norm": 0.7622728943824768, + "learning_rate": 3.647258903561425e-05, + "loss": 0.5629, + "step": 6771 + }, + { + "epoch": 8.66816, + "grad_norm": 0.8260266780853271, + "learning_rate": 3.6470588235294114e-05, + "loss": 0.5742, + "step": 6772 + }, + { + "epoch": 8.66944, + "grad_norm": 0.8134649395942688, + "learning_rate": 3.6468587434973986e-05, + "loss": 0.5833, + "step": 6773 + }, + { + "epoch": 8.67072, + "grad_norm": 0.827939510345459, + "learning_rate": 3.6466586634653865e-05, + "loss": 0.5361, + "step": 6774 + }, + { + "epoch": 8.672, + "grad_norm": 0.8437283635139465, + "learning_rate": 3.646458583433374e-05, + "loss": 0.6519, + "step": 6775 + }, + { + "epoch": 8.67328, + "grad_norm": 0.836875319480896, + "learning_rate": 3.646258503401361e-05, + "loss": 0.605, + "step": 6776 + }, + { + "epoch": 8.67456, + "grad_norm": 0.8690876364707947, + "learning_rate": 3.646058423369348e-05, + "loss": 0.6817, + "step": 6777 + }, + { + "epoch": 8.67584, + "grad_norm": 0.80400550365448, + "learning_rate": 3.645858343337335e-05, + "loss": 0.5805, + "step": 6778 + }, + { + "epoch": 8.67712, + "grad_norm": 0.7860267162322998, + "learning_rate": 3.6456582633053224e-05, + "loss": 0.5937, + "step": 6779 + }, + { + "epoch": 8.6784, + "grad_norm": 0.8096144199371338, + "learning_rate": 3.645458183273309e-05, + "loss": 0.5387, + "step": 6780 + }, + { + "epoch": 8.67968, + "grad_norm": 0.7942869067192078, + "learning_rate": 3.645258103241297e-05, + "loss": 0.5842, + "step": 6781 + }, + { + "epoch": 8.68096, + "grad_norm": 0.768929123878479, + "learning_rate": 3.645058023209284e-05, + "loss": 0.6049, + "step": 6782 + }, + { + "epoch": 8.68224, + "grad_norm": 0.8190042972564697, + "learning_rate": 3.644857943177271e-05, + "loss": 0.5947, + "step": 6783 + 
}, + { + "epoch": 8.68352, + "grad_norm": 0.847685694694519, + "learning_rate": 3.6446578631452584e-05, + "loss": 0.6109, + "step": 6784 + }, + { + "epoch": 8.6848, + "grad_norm": 0.7972142696380615, + "learning_rate": 3.6444577831132455e-05, + "loss": 0.5764, + "step": 6785 + }, + { + "epoch": 8.68608, + "grad_norm": 0.8239948153495789, + "learning_rate": 3.644257703081233e-05, + "loss": 0.6277, + "step": 6786 + }, + { + "epoch": 8.68736, + "grad_norm": 0.7374285459518433, + "learning_rate": 3.64405762304922e-05, + "loss": 0.5331, + "step": 6787 + }, + { + "epoch": 8.68864, + "grad_norm": 0.7998077273368835, + "learning_rate": 3.643857543017207e-05, + "loss": 0.5555, + "step": 6788 + }, + { + "epoch": 8.68992, + "grad_norm": 0.8306496739387512, + "learning_rate": 3.643657462985194e-05, + "loss": 0.5906, + "step": 6789 + }, + { + "epoch": 8.6912, + "grad_norm": 0.8808903694152832, + "learning_rate": 3.6434573829531815e-05, + "loss": 0.6105, + "step": 6790 + }, + { + "epoch": 8.69248, + "grad_norm": 0.7989248037338257, + "learning_rate": 3.6432573029211687e-05, + "loss": 0.5822, + "step": 6791 + }, + { + "epoch": 8.69376, + "grad_norm": 0.7795215845108032, + "learning_rate": 3.643057222889156e-05, + "loss": 0.5552, + "step": 6792 + }, + { + "epoch": 8.69504, + "grad_norm": 0.8287959098815918, + "learning_rate": 3.642857142857143e-05, + "loss": 0.6018, + "step": 6793 + }, + { + "epoch": 8.69632, + "grad_norm": 0.8223719000816345, + "learning_rate": 3.64265706282513e-05, + "loss": 0.5816, + "step": 6794 + }, + { + "epoch": 8.6976, + "grad_norm": 0.8280214667320251, + "learning_rate": 3.6424569827931174e-05, + "loss": 0.6346, + "step": 6795 + }, + { + "epoch": 8.698879999999999, + "grad_norm": 0.795192539691925, + "learning_rate": 3.6422569027611046e-05, + "loss": 0.6239, + "step": 6796 + }, + { + "epoch": 8.70016, + "grad_norm": 0.7667868137359619, + "learning_rate": 3.642056822729092e-05, + "loss": 0.5148, + "step": 6797 + }, + { + "epoch": 8.70144, + "grad_norm": 
0.8211344480514526, + "learning_rate": 3.641856742697079e-05, + "loss": 0.5715, + "step": 6798 + }, + { + "epoch": 8.70272, + "grad_norm": 0.8203229904174805, + "learning_rate": 3.641656662665066e-05, + "loss": 0.6021, + "step": 6799 + }, + { + "epoch": 8.704, + "grad_norm": 0.8265122175216675, + "learning_rate": 3.641456582633053e-05, + "loss": 0.6179, + "step": 6800 + }, + { + "epoch": 8.70528, + "grad_norm": 0.9294067025184631, + "learning_rate": 3.6412565026010405e-05, + "loss": 0.6312, + "step": 6801 + }, + { + "epoch": 8.70656, + "grad_norm": 0.8427034020423889, + "learning_rate": 3.641056422569028e-05, + "loss": 0.5626, + "step": 6802 + }, + { + "epoch": 8.707840000000001, + "grad_norm": 0.8450182676315308, + "learning_rate": 3.640856342537015e-05, + "loss": 0.6435, + "step": 6803 + }, + { + "epoch": 8.70912, + "grad_norm": 0.8336526155471802, + "learning_rate": 3.640656262505002e-05, + "loss": 0.5936, + "step": 6804 + }, + { + "epoch": 8.7104, + "grad_norm": 0.8691603541374207, + "learning_rate": 3.640456182472989e-05, + "loss": 0.605, + "step": 6805 + }, + { + "epoch": 8.71168, + "grad_norm": 0.7679282426834106, + "learning_rate": 3.6402561024409764e-05, + "loss": 0.5439, + "step": 6806 + }, + { + "epoch": 8.71296, + "grad_norm": 0.8385738730430603, + "learning_rate": 3.6400560224089636e-05, + "loss": 0.6184, + "step": 6807 + }, + { + "epoch": 8.71424, + "grad_norm": 0.8129993677139282, + "learning_rate": 3.639855942376951e-05, + "loss": 0.5656, + "step": 6808 + }, + { + "epoch": 8.71552, + "grad_norm": 0.7901842594146729, + "learning_rate": 3.639655862344939e-05, + "loss": 0.573, + "step": 6809 + }, + { + "epoch": 8.7168, + "grad_norm": 0.8075759410858154, + "learning_rate": 3.639455782312925e-05, + "loss": 0.5685, + "step": 6810 + }, + { + "epoch": 8.71808, + "grad_norm": 0.7919057011604309, + "learning_rate": 3.6392557022809124e-05, + "loss": 0.58, + "step": 6811 + }, + { + "epoch": 8.71936, + "grad_norm": 0.8212895393371582, + "learning_rate": 
3.6390556222488995e-05, + "loss": 0.5962, + "step": 6812 + }, + { + "epoch": 8.72064, + "grad_norm": 0.8056111931800842, + "learning_rate": 3.638855542216887e-05, + "loss": 0.5887, + "step": 6813 + }, + { + "epoch": 8.72192, + "grad_norm": 0.794323742389679, + "learning_rate": 3.638655462184874e-05, + "loss": 0.5535, + "step": 6814 + }, + { + "epoch": 8.7232, + "grad_norm": 0.8696872591972351, + "learning_rate": 3.638455382152861e-05, + "loss": 0.5984, + "step": 6815 + }, + { + "epoch": 8.72448, + "grad_norm": 0.8847552537918091, + "learning_rate": 3.638255302120849e-05, + "loss": 0.6501, + "step": 6816 + }, + { + "epoch": 8.72576, + "grad_norm": 0.8174288272857666, + "learning_rate": 3.638055222088836e-05, + "loss": 0.6027, + "step": 6817 + }, + { + "epoch": 8.72704, + "grad_norm": 0.8479911088943481, + "learning_rate": 3.637855142056823e-05, + "loss": 0.6216, + "step": 6818 + }, + { + "epoch": 8.72832, + "grad_norm": 0.8264278173446655, + "learning_rate": 3.63765506202481e-05, + "loss": 0.615, + "step": 6819 + }, + { + "epoch": 8.7296, + "grad_norm": 0.8488075137138367, + "learning_rate": 3.637454981992797e-05, + "loss": 0.5891, + "step": 6820 + }, + { + "epoch": 8.730879999999999, + "grad_norm": 0.8699356317520142, + "learning_rate": 3.637254901960784e-05, + "loss": 0.6251, + "step": 6821 + }, + { + "epoch": 8.73216, + "grad_norm": 0.8555402755737305, + "learning_rate": 3.6370548219287714e-05, + "loss": 0.635, + "step": 6822 + }, + { + "epoch": 8.73344, + "grad_norm": 0.7735991477966309, + "learning_rate": 3.636854741896759e-05, + "loss": 0.5566, + "step": 6823 + }, + { + "epoch": 8.73472, + "grad_norm": 0.8107903599739075, + "learning_rate": 3.6366546618647465e-05, + "loss": 0.5747, + "step": 6824 + }, + { + "epoch": 8.736, + "grad_norm": 0.7944985032081604, + "learning_rate": 3.6364545818327336e-05, + "loss": 0.5683, + "step": 6825 + }, + { + "epoch": 8.73728, + "grad_norm": 0.8392530083656311, + "learning_rate": 3.63625450180072e-05, + "loss": 0.5869, + 
"step": 6826 + }, + { + "epoch": 8.73856, + "grad_norm": 0.8549109101295471, + "learning_rate": 3.636054421768707e-05, + "loss": 0.5996, + "step": 6827 + }, + { + "epoch": 8.739840000000001, + "grad_norm": 0.8116100430488586, + "learning_rate": 3.6358543417366945e-05, + "loss": 0.5885, + "step": 6828 + }, + { + "epoch": 8.74112, + "grad_norm": 0.81255042552948, + "learning_rate": 3.635654261704682e-05, + "loss": 0.5747, + "step": 6829 + }, + { + "epoch": 8.7424, + "grad_norm": 0.8412413597106934, + "learning_rate": 3.6354541816726696e-05, + "loss": 0.595, + "step": 6830 + }, + { + "epoch": 8.74368, + "grad_norm": 0.8014543056488037, + "learning_rate": 3.635254101640657e-05, + "loss": 0.5669, + "step": 6831 + }, + { + "epoch": 8.74496, + "grad_norm": 0.7754011750221252, + "learning_rate": 3.635054021608644e-05, + "loss": 0.5427, + "step": 6832 + }, + { + "epoch": 8.74624, + "grad_norm": 0.8052383065223694, + "learning_rate": 3.634853941576631e-05, + "loss": 0.576, + "step": 6833 + }, + { + "epoch": 8.74752, + "grad_norm": 0.8288300633430481, + "learning_rate": 3.6346538615446176e-05, + "loss": 0.6057, + "step": 6834 + }, + { + "epoch": 8.7488, + "grad_norm": 0.8074236512184143, + "learning_rate": 3.634453781512605e-05, + "loss": 0.5821, + "step": 6835 + }, + { + "epoch": 8.75008, + "grad_norm": 0.7861261963844299, + "learning_rate": 3.634253701480592e-05, + "loss": 0.5665, + "step": 6836 + }, + { + "epoch": 8.75136, + "grad_norm": 0.8360334634780884, + "learning_rate": 3.63405362144858e-05, + "loss": 0.6376, + "step": 6837 + }, + { + "epoch": 8.75264, + "grad_norm": 0.7639442086219788, + "learning_rate": 3.633853541416567e-05, + "loss": 0.5622, + "step": 6838 + }, + { + "epoch": 8.75392, + "grad_norm": 0.7803179025650024, + "learning_rate": 3.633653461384554e-05, + "loss": 0.5281, + "step": 6839 + }, + { + "epoch": 8.7552, + "grad_norm": 0.7837743163108826, + "learning_rate": 3.6334533813525414e-05, + "loss": 0.5903, + "step": 6840 + }, + { + "epoch": 8.75648, + 
"grad_norm": 0.7992691397666931, + "learning_rate": 3.6332533013205286e-05, + "loss": 0.5786, + "step": 6841 + }, + { + "epoch": 8.75776, + "grad_norm": 0.8223332762718201, + "learning_rate": 3.633053221288515e-05, + "loss": 0.5554, + "step": 6842 + }, + { + "epoch": 8.75904, + "grad_norm": 0.8041775226593018, + "learning_rate": 3.632853141256502e-05, + "loss": 0.5898, + "step": 6843 + }, + { + "epoch": 8.76032, + "grad_norm": 0.8420320153236389, + "learning_rate": 3.63265306122449e-05, + "loss": 0.5924, + "step": 6844 + }, + { + "epoch": 8.7616, + "grad_norm": 0.7990514039993286, + "learning_rate": 3.6324529811924774e-05, + "loss": 0.6265, + "step": 6845 + }, + { + "epoch": 8.76288, + "grad_norm": 0.7773391008377075, + "learning_rate": 3.6322529011604645e-05, + "loss": 0.5407, + "step": 6846 + }, + { + "epoch": 8.76416, + "grad_norm": 0.8004972338676453, + "learning_rate": 3.632052821128452e-05, + "loss": 0.6135, + "step": 6847 + }, + { + "epoch": 8.76544, + "grad_norm": 0.7751970887184143, + "learning_rate": 3.631852741096439e-05, + "loss": 0.5591, + "step": 6848 + }, + { + "epoch": 8.76672, + "grad_norm": 0.7655860185623169, + "learning_rate": 3.631652661064426e-05, + "loss": 0.5736, + "step": 6849 + }, + { + "epoch": 8.768, + "grad_norm": 0.8234750628471375, + "learning_rate": 3.6314525810324126e-05, + "loss": 0.5844, + "step": 6850 + }, + { + "epoch": 8.76928, + "grad_norm": 0.8051683306694031, + "learning_rate": 3.6312525010004005e-05, + "loss": 0.5872, + "step": 6851 + }, + { + "epoch": 8.77056, + "grad_norm": 0.9070225954055786, + "learning_rate": 3.6310524209683877e-05, + "loss": 0.6276, + "step": 6852 + }, + { + "epoch": 8.77184, + "grad_norm": 0.8074741959571838, + "learning_rate": 3.630852340936375e-05, + "loss": 0.5962, + "step": 6853 + }, + { + "epoch": 8.77312, + "grad_norm": 0.8040851354598999, + "learning_rate": 3.630652260904362e-05, + "loss": 0.5674, + "step": 6854 + }, + { + "epoch": 8.7744, + "grad_norm": 0.8116294741630554, + "learning_rate": 
3.630452180872349e-05, + "loss": 0.5863, + "step": 6855 + }, + { + "epoch": 8.77568, + "grad_norm": 0.9247410297393799, + "learning_rate": 3.6302521008403364e-05, + "loss": 0.6572, + "step": 6856 + }, + { + "epoch": 8.77696, + "grad_norm": 0.8132933378219604, + "learning_rate": 3.6300520208083236e-05, + "loss": 0.5931, + "step": 6857 + }, + { + "epoch": 8.77824, + "grad_norm": 0.8136230707168579, + "learning_rate": 3.629851940776311e-05, + "loss": 0.579, + "step": 6858 + }, + { + "epoch": 8.77952, + "grad_norm": 0.8221474885940552, + "learning_rate": 3.629651860744298e-05, + "loss": 0.6021, + "step": 6859 + }, + { + "epoch": 8.7808, + "grad_norm": 0.8838080763816833, + "learning_rate": 3.629451780712285e-05, + "loss": 0.5825, + "step": 6860 + }, + { + "epoch": 8.78208, + "grad_norm": 0.8534195423126221, + "learning_rate": 3.629251700680272e-05, + "loss": 0.6262, + "step": 6861 + }, + { + "epoch": 8.78336, + "grad_norm": 0.8286737203598022, + "learning_rate": 3.6290516206482595e-05, + "loss": 0.5569, + "step": 6862 + }, + { + "epoch": 8.78464, + "grad_norm": 0.7863860130310059, + "learning_rate": 3.628851540616247e-05, + "loss": 0.5827, + "step": 6863 + }, + { + "epoch": 8.78592, + "grad_norm": 0.8157966136932373, + "learning_rate": 3.628651460584234e-05, + "loss": 0.6109, + "step": 6864 + }, + { + "epoch": 8.7872, + "grad_norm": 0.7991543412208557, + "learning_rate": 3.628451380552221e-05, + "loss": 0.5703, + "step": 6865 + }, + { + "epoch": 8.78848, + "grad_norm": 0.7719929814338684, + "learning_rate": 3.628251300520208e-05, + "loss": 0.5925, + "step": 6866 + }, + { + "epoch": 8.78976, + "grad_norm": 0.8264296054840088, + "learning_rate": 3.6280512204881954e-05, + "loss": 0.584, + "step": 6867 + }, + { + "epoch": 8.79104, + "grad_norm": 0.8640742897987366, + "learning_rate": 3.6278511404561826e-05, + "loss": 0.6214, + "step": 6868 + }, + { + "epoch": 8.79232, + "grad_norm": 0.7543335556983948, + "learning_rate": 3.62765106042417e-05, + "loss": 0.5316, + "step": 
6869 + }, + { + "epoch": 8.7936, + "grad_norm": 0.8252272605895996, + "learning_rate": 3.627450980392157e-05, + "loss": 0.6209, + "step": 6870 + }, + { + "epoch": 8.79488, + "grad_norm": 0.7842090129852295, + "learning_rate": 3.627250900360144e-05, + "loss": 0.5349, + "step": 6871 + }, + { + "epoch": 8.79616, + "grad_norm": 0.794221818447113, + "learning_rate": 3.6270508203281314e-05, + "loss": 0.6018, + "step": 6872 + }, + { + "epoch": 8.79744, + "grad_norm": 0.7998900413513184, + "learning_rate": 3.6268507402961186e-05, + "loss": 0.5694, + "step": 6873 + }, + { + "epoch": 8.79872, + "grad_norm": 0.8280250430107117, + "learning_rate": 3.626650660264106e-05, + "loss": 0.5735, + "step": 6874 + }, + { + "epoch": 8.8, + "grad_norm": 0.8256487846374512, + "learning_rate": 3.626450580232093e-05, + "loss": 0.5985, + "step": 6875 + }, + { + "epoch": 8.80128, + "grad_norm": 0.86505526304245, + "learning_rate": 3.62625050020008e-05, + "loss": 0.6511, + "step": 6876 + }, + { + "epoch": 8.80256, + "grad_norm": 0.7898805141448975, + "learning_rate": 3.626050420168067e-05, + "loss": 0.5708, + "step": 6877 + }, + { + "epoch": 8.80384, + "grad_norm": 0.8345022797584534, + "learning_rate": 3.6258503401360545e-05, + "loss": 0.5572, + "step": 6878 + }, + { + "epoch": 8.80512, + "grad_norm": 0.8210886120796204, + "learning_rate": 3.625650260104042e-05, + "loss": 0.581, + "step": 6879 + }, + { + "epoch": 8.8064, + "grad_norm": 0.7753137350082397, + "learning_rate": 3.625450180072029e-05, + "loss": 0.6006, + "step": 6880 + }, + { + "epoch": 8.80768, + "grad_norm": 0.8361926078796387, + "learning_rate": 3.625250100040016e-05, + "loss": 0.5687, + "step": 6881 + }, + { + "epoch": 8.80896, + "grad_norm": 0.8121229410171509, + "learning_rate": 3.625050020008003e-05, + "loss": 0.5764, + "step": 6882 + }, + { + "epoch": 8.81024, + "grad_norm": 0.8940417766571045, + "learning_rate": 3.6248499399759904e-05, + "loss": 0.6222, + "step": 6883 + }, + { + "epoch": 8.81152, + "grad_norm": 
0.7658796906471252, + "learning_rate": 3.6246498599439776e-05, + "loss": 0.5626, + "step": 6884 + }, + { + "epoch": 8.8128, + "grad_norm": 0.8226578831672668, + "learning_rate": 3.624449779911965e-05, + "loss": 0.5823, + "step": 6885 + }, + { + "epoch": 8.81408, + "grad_norm": 0.8413184881210327, + "learning_rate": 3.624249699879952e-05, + "loss": 0.6244, + "step": 6886 + }, + { + "epoch": 8.81536, + "grad_norm": 0.8740143775939941, + "learning_rate": 3.62404961984794e-05, + "loss": 0.6014, + "step": 6887 + }, + { + "epoch": 8.81664, + "grad_norm": 0.7891989946365356, + "learning_rate": 3.6238495398159263e-05, + "loss": 0.5616, + "step": 6888 + }, + { + "epoch": 8.81792, + "grad_norm": 0.8135474920272827, + "learning_rate": 3.6236494597839135e-05, + "loss": 0.5674, + "step": 6889 + }, + { + "epoch": 8.8192, + "grad_norm": 0.7830061316490173, + "learning_rate": 3.623449379751901e-05, + "loss": 0.5726, + "step": 6890 + }, + { + "epoch": 8.82048, + "grad_norm": 0.8667339086532593, + "learning_rate": 3.623249299719888e-05, + "loss": 0.5673, + "step": 6891 + }, + { + "epoch": 8.82176, + "grad_norm": 0.7781888246536255, + "learning_rate": 3.623049219687875e-05, + "loss": 0.5379, + "step": 6892 + }, + { + "epoch": 8.82304, + "grad_norm": 0.8315311670303345, + "learning_rate": 3.622849139655862e-05, + "loss": 0.5652, + "step": 6893 + }, + { + "epoch": 8.82432, + "grad_norm": 0.8577895164489746, + "learning_rate": 3.62264905962385e-05, + "loss": 0.631, + "step": 6894 + }, + { + "epoch": 8.8256, + "grad_norm": 0.8391826152801514, + "learning_rate": 3.622448979591837e-05, + "loss": 0.6021, + "step": 6895 + }, + { + "epoch": 8.82688, + "grad_norm": 0.8212698698043823, + "learning_rate": 3.622248899559824e-05, + "loss": 0.6771, + "step": 6896 + }, + { + "epoch": 8.82816, + "grad_norm": 0.7723982334136963, + "learning_rate": 3.622048819527811e-05, + "loss": 0.5674, + "step": 6897 + }, + { + "epoch": 8.82944, + "grad_norm": 0.807265043258667, + "learning_rate": 
3.621848739495798e-05, + "loss": 0.6185, + "step": 6898 + }, + { + "epoch": 8.83072, + "grad_norm": 0.8213419318199158, + "learning_rate": 3.6216486594637854e-05, + "loss": 0.6491, + "step": 6899 + }, + { + "epoch": 8.832, + "grad_norm": 0.8153632283210754, + "learning_rate": 3.6214485794317726e-05, + "loss": 0.6082, + "step": 6900 + }, + { + "epoch": 8.83328, + "grad_norm": 0.80569988489151, + "learning_rate": 3.6212484993997604e-05, + "loss": 0.6263, + "step": 6901 + }, + { + "epoch": 8.83456, + "grad_norm": 0.8200774192810059, + "learning_rate": 3.6210484193677476e-05, + "loss": 0.6107, + "step": 6902 + }, + { + "epoch": 8.83584, + "grad_norm": 0.783087432384491, + "learning_rate": 3.620848339335735e-05, + "loss": 0.5873, + "step": 6903 + }, + { + "epoch": 8.83712, + "grad_norm": 0.8208640813827515, + "learning_rate": 3.620648259303721e-05, + "loss": 0.6105, + "step": 6904 + }, + { + "epoch": 8.8384, + "grad_norm": 0.8038390278816223, + "learning_rate": 3.6204481792717085e-05, + "loss": 0.6263, + "step": 6905 + }, + { + "epoch": 8.83968, + "grad_norm": 0.7997908592224121, + "learning_rate": 3.620248099239696e-05, + "loss": 0.631, + "step": 6906 + }, + { + "epoch": 8.84096, + "grad_norm": 0.8349906802177429, + "learning_rate": 3.620048019207683e-05, + "loss": 0.5978, + "step": 6907 + }, + { + "epoch": 8.84224, + "grad_norm": 0.8827357292175293, + "learning_rate": 3.619847939175671e-05, + "loss": 0.6295, + "step": 6908 + }, + { + "epoch": 8.84352, + "grad_norm": 0.8000686168670654, + "learning_rate": 3.619647859143658e-05, + "loss": 0.6055, + "step": 6909 + }, + { + "epoch": 8.8448, + "grad_norm": 0.8262300491333008, + "learning_rate": 3.619447779111645e-05, + "loss": 0.5929, + "step": 6910 + }, + { + "epoch": 8.84608, + "grad_norm": 0.8049215078353882, + "learning_rate": 3.619247699079632e-05, + "loss": 0.5946, + "step": 6911 + }, + { + "epoch": 8.84736, + "grad_norm": 0.8154301643371582, + "learning_rate": 3.619047619047619e-05, + "loss": 0.5801, + "step": 6912 
+ }, + { + "epoch": 8.84864, + "grad_norm": 0.8478993773460388, + "learning_rate": 3.618847539015606e-05, + "loss": 0.6509, + "step": 6913 + }, + { + "epoch": 8.849920000000001, + "grad_norm": 0.8522371649742126, + "learning_rate": 3.618647458983593e-05, + "loss": 0.5853, + "step": 6914 + }, + { + "epoch": 8.8512, + "grad_norm": 0.8328931331634521, + "learning_rate": 3.618447378951581e-05, + "loss": 0.6368, + "step": 6915 + }, + { + "epoch": 8.85248, + "grad_norm": 0.7752423882484436, + "learning_rate": 3.618247298919568e-05, + "loss": 0.558, + "step": 6916 + }, + { + "epoch": 8.85376, + "grad_norm": 0.7981879115104675, + "learning_rate": 3.6180472188875554e-05, + "loss": 0.5696, + "step": 6917 + }, + { + "epoch": 8.85504, + "grad_norm": 0.837726354598999, + "learning_rate": 3.6178471388555426e-05, + "loss": 0.6013, + "step": 6918 + }, + { + "epoch": 8.85632, + "grad_norm": 0.8261789679527283, + "learning_rate": 3.61764705882353e-05, + "loss": 0.6242, + "step": 6919 + }, + { + "epoch": 8.8576, + "grad_norm": 0.8679850697517395, + "learning_rate": 3.617446978791516e-05, + "loss": 0.6112, + "step": 6920 + }, + { + "epoch": 8.85888, + "grad_norm": 0.8656749129295349, + "learning_rate": 3.6172468987595035e-05, + "loss": 0.6052, + "step": 6921 + }, + { + "epoch": 8.86016, + "grad_norm": 0.7777904272079468, + "learning_rate": 3.617046818727491e-05, + "loss": 0.5814, + "step": 6922 + }, + { + "epoch": 8.86144, + "grad_norm": 0.8450097441673279, + "learning_rate": 3.6168467386954785e-05, + "loss": 0.5964, + "step": 6923 + }, + { + "epoch": 8.86272, + "grad_norm": 0.865833580493927, + "learning_rate": 3.616646658663466e-05, + "loss": 0.6279, + "step": 6924 + }, + { + "epoch": 8.864, + "grad_norm": 0.8091690540313721, + "learning_rate": 3.616446578631453e-05, + "loss": 0.5425, + "step": 6925 + }, + { + "epoch": 8.86528, + "grad_norm": 0.8185449838638306, + "learning_rate": 3.61624649859944e-05, + "loss": 0.606, + "step": 6926 + }, + { + "epoch": 8.86656, + "grad_norm": 
0.8065361976623535, + "learning_rate": 3.616046418567427e-05, + "loss": 0.5938, + "step": 6927 + }, + { + "epoch": 8.86784, + "grad_norm": 0.8017386198043823, + "learning_rate": 3.615846338535414e-05, + "loss": 0.5727, + "step": 6928 + }, + { + "epoch": 8.86912, + "grad_norm": 0.8515564799308777, + "learning_rate": 3.6156462585034016e-05, + "loss": 0.6308, + "step": 6929 + }, + { + "epoch": 8.8704, + "grad_norm": 0.837009608745575, + "learning_rate": 3.615446178471389e-05, + "loss": 0.6016, + "step": 6930 + }, + { + "epoch": 8.87168, + "grad_norm": 0.8977416157722473, + "learning_rate": 3.615246098439376e-05, + "loss": 0.6323, + "step": 6931 + }, + { + "epoch": 8.872959999999999, + "grad_norm": 0.8525516390800476, + "learning_rate": 3.615046018407363e-05, + "loss": 0.627, + "step": 6932 + }, + { + "epoch": 8.87424, + "grad_norm": 0.838636577129364, + "learning_rate": 3.6148459383753504e-05, + "loss": 0.6056, + "step": 6933 + }, + { + "epoch": 8.87552, + "grad_norm": 0.8404485583305359, + "learning_rate": 3.6146458583433376e-05, + "loss": 0.6425, + "step": 6934 + }, + { + "epoch": 8.8768, + "grad_norm": 0.8780763745307922, + "learning_rate": 3.614445778311325e-05, + "loss": 0.677, + "step": 6935 + }, + { + "epoch": 8.87808, + "grad_norm": 0.7928383350372314, + "learning_rate": 3.614245698279312e-05, + "loss": 0.584, + "step": 6936 + }, + { + "epoch": 8.87936, + "grad_norm": 0.8395443558692932, + "learning_rate": 3.614045618247299e-05, + "loss": 0.6165, + "step": 6937 + }, + { + "epoch": 8.88064, + "grad_norm": 0.8272031545639038, + "learning_rate": 3.613845538215286e-05, + "loss": 0.5808, + "step": 6938 + }, + { + "epoch": 8.881920000000001, + "grad_norm": 0.8448715806007385, + "learning_rate": 3.6136454581832735e-05, + "loss": 0.6293, + "step": 6939 + }, + { + "epoch": 8.8832, + "grad_norm": 0.8742750287055969, + "learning_rate": 3.613445378151261e-05, + "loss": 0.5869, + "step": 6940 + }, + { + "epoch": 8.88448, + "grad_norm": 0.7730932235717773, + 
"learning_rate": 3.613245298119248e-05, + "loss": 0.5513, + "step": 6941 + }, + { + "epoch": 8.88576, + "grad_norm": 0.83644700050354, + "learning_rate": 3.613045218087235e-05, + "loss": 0.6132, + "step": 6942 + }, + { + "epoch": 8.88704, + "grad_norm": 0.8572961688041687, + "learning_rate": 3.612845138055222e-05, + "loss": 0.6198, + "step": 6943 + }, + { + "epoch": 8.88832, + "grad_norm": 0.8267669081687927, + "learning_rate": 3.6126450580232094e-05, + "loss": 0.5856, + "step": 6944 + }, + { + "epoch": 8.8896, + "grad_norm": 0.7987276315689087, + "learning_rate": 3.6124449779911966e-05, + "loss": 0.5852, + "step": 6945 + }, + { + "epoch": 8.89088, + "grad_norm": 0.8039761781692505, + "learning_rate": 3.612244897959184e-05, + "loss": 0.5562, + "step": 6946 + }, + { + "epoch": 8.89216, + "grad_norm": 0.8594339489936829, + "learning_rate": 3.612044817927171e-05, + "loss": 0.5928, + "step": 6947 + }, + { + "epoch": 8.89344, + "grad_norm": 0.7759758234024048, + "learning_rate": 3.611844737895158e-05, + "loss": 0.5363, + "step": 6948 + }, + { + "epoch": 8.89472, + "grad_norm": 0.7741097211837769, + "learning_rate": 3.6116446578631453e-05, + "loss": 0.5417, + "step": 6949 + }, + { + "epoch": 8.896, + "grad_norm": 0.800906777381897, + "learning_rate": 3.611444577831133e-05, + "loss": 0.5737, + "step": 6950 + }, + { + "epoch": 8.89728, + "grad_norm": 0.8281027674674988, + "learning_rate": 3.61124449779912e-05, + "loss": 0.5661, + "step": 6951 + }, + { + "epoch": 8.89856, + "grad_norm": 0.7662954926490784, + "learning_rate": 3.611044417767107e-05, + "loss": 0.559, + "step": 6952 + }, + { + "epoch": 8.89984, + "grad_norm": 0.8090856671333313, + "learning_rate": 3.610844337735094e-05, + "loss": 0.5599, + "step": 6953 + }, + { + "epoch": 8.90112, + "grad_norm": 0.8325416445732117, + "learning_rate": 3.610644257703081e-05, + "loss": 0.5671, + "step": 6954 + }, + { + "epoch": 8.9024, + "grad_norm": 0.8077730536460876, + "learning_rate": 3.6104441776710685e-05, + "loss": 0.6039, 
+ "step": 6955 + }, + { + "epoch": 8.90368, + "grad_norm": 0.8278130888938904, + "learning_rate": 3.6102440976390556e-05, + "loss": 0.5771, + "step": 6956 + }, + { + "epoch": 8.904959999999999, + "grad_norm": 0.7988932132720947, + "learning_rate": 3.6100440176070435e-05, + "loss": 0.6022, + "step": 6957 + }, + { + "epoch": 8.90624, + "grad_norm": 0.8389017581939697, + "learning_rate": 3.609843937575031e-05, + "loss": 0.6001, + "step": 6958 + }, + { + "epoch": 8.90752, + "grad_norm": 0.896360456943512, + "learning_rate": 3.609643857543017e-05, + "loss": 0.6543, + "step": 6959 + }, + { + "epoch": 8.9088, + "grad_norm": 0.8242748379707336, + "learning_rate": 3.6094437775110044e-05, + "loss": 0.5962, + "step": 6960 + }, + { + "epoch": 8.91008, + "grad_norm": 0.8101686835289001, + "learning_rate": 3.6092436974789916e-05, + "loss": 0.6084, + "step": 6961 + }, + { + "epoch": 8.91136, + "grad_norm": 0.8416367769241333, + "learning_rate": 3.609043617446979e-05, + "loss": 0.5865, + "step": 6962 + }, + { + "epoch": 8.91264, + "grad_norm": 0.8185626864433289, + "learning_rate": 3.608843537414966e-05, + "loss": 0.6199, + "step": 6963 + }, + { + "epoch": 8.91392, + "grad_norm": 0.8309803009033203, + "learning_rate": 3.608643457382954e-05, + "loss": 0.6395, + "step": 6964 + }, + { + "epoch": 8.9152, + "grad_norm": 0.7879054546356201, + "learning_rate": 3.608443377350941e-05, + "loss": 0.5667, + "step": 6965 + }, + { + "epoch": 8.91648, + "grad_norm": 0.8321192860603333, + "learning_rate": 3.608243297318928e-05, + "loss": 0.6035, + "step": 6966 + }, + { + "epoch": 8.91776, + "grad_norm": 0.8073901534080505, + "learning_rate": 3.608043217286915e-05, + "loss": 0.5635, + "step": 6967 + }, + { + "epoch": 8.91904, + "grad_norm": 0.805634081363678, + "learning_rate": 3.607843137254902e-05, + "loss": 0.5788, + "step": 6968 + }, + { + "epoch": 8.92032, + "grad_norm": 0.7749924063682556, + "learning_rate": 3.607643057222889e-05, + "loss": 0.5697, + "step": 6969 + }, + { + "epoch": 8.9216, 
+ "grad_norm": 0.7843794226646423, + "learning_rate": 3.607442977190876e-05, + "loss": 0.5763, + "step": 6970 + }, + { + "epoch": 8.92288, + "grad_norm": 0.8274489641189575, + "learning_rate": 3.607242897158864e-05, + "loss": 0.5843, + "step": 6971 + }, + { + "epoch": 8.92416, + "grad_norm": 0.834439754486084, + "learning_rate": 3.607042817126851e-05, + "loss": 0.5806, + "step": 6972 + }, + { + "epoch": 8.92544, + "grad_norm": 0.8098054528236389, + "learning_rate": 3.6068427370948385e-05, + "loss": 0.5754, + "step": 6973 + }, + { + "epoch": 8.92672, + "grad_norm": 0.8164894580841064, + "learning_rate": 3.606642657062826e-05, + "loss": 0.6191, + "step": 6974 + }, + { + "epoch": 8.928, + "grad_norm": 0.7646953463554382, + "learning_rate": 3.606442577030812e-05, + "loss": 0.5733, + "step": 6975 + }, + { + "epoch": 8.92928, + "grad_norm": 0.7695797085762024, + "learning_rate": 3.6062424969987994e-05, + "loss": 0.5586, + "step": 6976 + }, + { + "epoch": 8.93056, + "grad_norm": 0.8515230417251587, + "learning_rate": 3.6060424169667865e-05, + "loss": 0.6997, + "step": 6977 + }, + { + "epoch": 8.93184, + "grad_norm": 0.7551551461219788, + "learning_rate": 3.6058423369347744e-05, + "loss": 0.5588, + "step": 6978 + }, + { + "epoch": 8.93312, + "grad_norm": 0.8263706564903259, + "learning_rate": 3.6056422569027616e-05, + "loss": 0.6269, + "step": 6979 + }, + { + "epoch": 8.9344, + "grad_norm": 0.819164514541626, + "learning_rate": 3.605442176870749e-05, + "loss": 0.5782, + "step": 6980 + }, + { + "epoch": 8.93568, + "grad_norm": 0.875464916229248, + "learning_rate": 3.605242096838736e-05, + "loss": 0.6204, + "step": 6981 + }, + { + "epoch": 8.93696, + "grad_norm": 0.807060718536377, + "learning_rate": 3.605042016806723e-05, + "loss": 0.5681, + "step": 6982 + }, + { + "epoch": 8.93824, + "grad_norm": 0.824044942855835, + "learning_rate": 3.6048419367747097e-05, + "loss": 0.6358, + "step": 6983 + }, + { + "epoch": 8.93952, + "grad_norm": 0.7487238645553589, + "learning_rate": 
3.604641856742697e-05, + "loss": 0.5667, + "step": 6984 + }, + { + "epoch": 8.9408, + "grad_norm": 0.8306474685668945, + "learning_rate": 3.604441776710684e-05, + "loss": 0.5704, + "step": 6985 + }, + { + "epoch": 8.94208, + "grad_norm": 0.7934936285018921, + "learning_rate": 3.604241696678672e-05, + "loss": 0.5734, + "step": 6986 + }, + { + "epoch": 8.94336, + "grad_norm": 0.8018743991851807, + "learning_rate": 3.604041616646659e-05, + "loss": 0.5555, + "step": 6987 + }, + { + "epoch": 8.94464, + "grad_norm": 0.8124516606330872, + "learning_rate": 3.603841536614646e-05, + "loss": 0.5972, + "step": 6988 + }, + { + "epoch": 8.94592, + "grad_norm": 0.8345234394073486, + "learning_rate": 3.6036414565826334e-05, + "loss": 0.6231, + "step": 6989 + }, + { + "epoch": 8.9472, + "grad_norm": 0.8359727263450623, + "learning_rate": 3.6034413765506206e-05, + "loss": 0.6152, + "step": 6990 + }, + { + "epoch": 8.94848, + "grad_norm": 0.7563595771789551, + "learning_rate": 3.603241296518607e-05, + "loss": 0.5544, + "step": 6991 + }, + { + "epoch": 8.94976, + "grad_norm": 0.8067274689674377, + "learning_rate": 3.603041216486594e-05, + "loss": 0.5755, + "step": 6992 + }, + { + "epoch": 8.95104, + "grad_norm": 0.8166893720626831, + "learning_rate": 3.602841136454582e-05, + "loss": 0.6486, + "step": 6993 + }, + { + "epoch": 8.95232, + "grad_norm": 0.7941920161247253, + "learning_rate": 3.6026410564225694e-05, + "loss": 0.6155, + "step": 6994 + }, + { + "epoch": 8.9536, + "grad_norm": 0.7989978790283203, + "learning_rate": 3.6024409763905566e-05, + "loss": 0.5686, + "step": 6995 + }, + { + "epoch": 8.95488, + "grad_norm": 0.8766310214996338, + "learning_rate": 3.602240896358544e-05, + "loss": 0.6189, + "step": 6996 + }, + { + "epoch": 8.95616, + "grad_norm": 0.8384754061698914, + "learning_rate": 3.602040816326531e-05, + "loss": 0.6313, + "step": 6997 + }, + { + "epoch": 8.95744, + "grad_norm": 0.8599010705947876, + "learning_rate": 3.601840736294518e-05, + "loss": 0.6274, + "step": 
6998 + }, + { + "epoch": 8.95872, + "grad_norm": 0.8240278363227844, + "learning_rate": 3.6016406562625046e-05, + "loss": 0.6005, + "step": 6999 + }, + { + "epoch": 8.96, + "grad_norm": 0.8268716335296631, + "learning_rate": 3.6014405762304925e-05, + "loss": 0.6159, + "step": 7000 + }, + { + "epoch": 8.96128, + "grad_norm": 0.8283771872520447, + "learning_rate": 3.60124049619848e-05, + "loss": 0.6447, + "step": 7001 + }, + { + "epoch": 8.96256, + "grad_norm": 0.8202353119850159, + "learning_rate": 3.601040416166467e-05, + "loss": 0.5554, + "step": 7002 + }, + { + "epoch": 8.96384, + "grad_norm": 0.8381731510162354, + "learning_rate": 3.600840336134454e-05, + "loss": 0.6646, + "step": 7003 + }, + { + "epoch": 8.96512, + "grad_norm": 0.8112981915473938, + "learning_rate": 3.600640256102441e-05, + "loss": 0.5944, + "step": 7004 + }, + { + "epoch": 8.9664, + "grad_norm": 0.8326748013496399, + "learning_rate": 3.6004401760704284e-05, + "loss": 0.6004, + "step": 7005 + }, + { + "epoch": 8.96768, + "grad_norm": 0.8164408206939697, + "learning_rate": 3.6002400960384156e-05, + "loss": 0.5891, + "step": 7006 + }, + { + "epoch": 8.96896, + "grad_norm": 0.7985873818397522, + "learning_rate": 3.600040016006403e-05, + "loss": 0.5516, + "step": 7007 + }, + { + "epoch": 8.97024, + "grad_norm": 0.8269280195236206, + "learning_rate": 3.59983993597439e-05, + "loss": 0.5673, + "step": 7008 + }, + { + "epoch": 8.97152, + "grad_norm": 0.7575619220733643, + "learning_rate": 3.599639855942377e-05, + "loss": 0.5324, + "step": 7009 + }, + { + "epoch": 8.9728, + "grad_norm": 0.8653445243835449, + "learning_rate": 3.5994397759103643e-05, + "loss": 0.6027, + "step": 7010 + }, + { + "epoch": 8.97408, + "grad_norm": 0.8530913591384888, + "learning_rate": 3.5992396958783515e-05, + "loss": 0.6428, + "step": 7011 + }, + { + "epoch": 8.97536, + "grad_norm": 0.8212682604789734, + "learning_rate": 3.599039615846339e-05, + "loss": 0.5799, + "step": 7012 + }, + { + "epoch": 8.97664, + "grad_norm": 
0.8439937233924866, + "learning_rate": 3.598839535814326e-05, + "loss": 0.6195, + "step": 7013 + }, + { + "epoch": 8.97792, + "grad_norm": 0.8312199711799622, + "learning_rate": 3.598639455782313e-05, + "loss": 0.5808, + "step": 7014 + }, + { + "epoch": 8.9792, + "grad_norm": 0.8415351510047913, + "learning_rate": 3.5984393757503e-05, + "loss": 0.6143, + "step": 7015 + }, + { + "epoch": 8.98048, + "grad_norm": 0.8534009456634521, + "learning_rate": 3.5982392957182875e-05, + "loss": 0.6133, + "step": 7016 + }, + { + "epoch": 8.98176, + "grad_norm": 0.819942057132721, + "learning_rate": 3.5980392156862746e-05, + "loss": 0.62, + "step": 7017 + }, + { + "epoch": 8.98304, + "grad_norm": 0.8175826668739319, + "learning_rate": 3.597839135654262e-05, + "loss": 0.5802, + "step": 7018 + }, + { + "epoch": 8.98432, + "grad_norm": 0.8342677354812622, + "learning_rate": 3.597639055622249e-05, + "loss": 0.5873, + "step": 7019 + }, + { + "epoch": 8.9856, + "grad_norm": 0.7667478919029236, + "learning_rate": 3.597438975590236e-05, + "loss": 0.557, + "step": 7020 + }, + { + "epoch": 8.98688, + "grad_norm": 0.7987193465232849, + "learning_rate": 3.5972388955582234e-05, + "loss": 0.565, + "step": 7021 + }, + { + "epoch": 8.98816, + "grad_norm": 0.8105794787406921, + "learning_rate": 3.5970388155262106e-05, + "loss": 0.6025, + "step": 7022 + }, + { + "epoch": 8.98944, + "grad_norm": 0.8362847566604614, + "learning_rate": 3.596838735494198e-05, + "loss": 0.5946, + "step": 7023 + }, + { + "epoch": 8.99072, + "grad_norm": 0.7905120253562927, + "learning_rate": 3.596638655462185e-05, + "loss": 0.5964, + "step": 7024 + }, + { + "epoch": 8.992, + "grad_norm": 0.8283672332763672, + "learning_rate": 3.596438575430172e-05, + "loss": 0.596, + "step": 7025 + }, + { + "epoch": 8.99328, + "grad_norm": 0.8208228945732117, + "learning_rate": 3.596238495398159e-05, + "loss": 0.6088, + "step": 7026 + }, + { + "epoch": 8.99456, + "grad_norm": 0.7455883026123047, + "learning_rate": 
3.5960384153661465e-05, + "loss": 0.5508, + "step": 7027 + }, + { + "epoch": 8.99584, + "grad_norm": 0.8352407217025757, + "learning_rate": 3.5958383353341344e-05, + "loss": 0.628, + "step": 7028 + }, + { + "epoch": 8.99712, + "grad_norm": 0.8079842925071716, + "learning_rate": 3.595638255302121e-05, + "loss": 0.596, + "step": 7029 + }, + { + "epoch": 8.9984, + "grad_norm": 0.8588855862617493, + "learning_rate": 3.595438175270108e-05, + "loss": 0.6203, + "step": 7030 + }, + { + "epoch": 8.99968, + "grad_norm": 0.8400841355323792, + "learning_rate": 3.595238095238095e-05, + "loss": 0.5663, + "step": 7031 + }, + { + "epoch": 9.00096, + "grad_norm": 1.7468935251235962, + "learning_rate": 3.5950380152060824e-05, + "loss": 1.0437, + "step": 7032 + }, + { + "epoch": 9.00224, + "grad_norm": 0.7875801920890808, + "learning_rate": 3.5948379351740696e-05, + "loss": 0.556, + "step": 7033 + }, + { + "epoch": 9.00352, + "grad_norm": 0.8308430910110474, + "learning_rate": 3.594637855142057e-05, + "loss": 0.5821, + "step": 7034 + }, + { + "epoch": 9.0048, + "grad_norm": 0.7866562008857727, + "learning_rate": 3.594437775110045e-05, + "loss": 0.529, + "step": 7035 + }, + { + "epoch": 9.00608, + "grad_norm": 0.8387976884841919, + "learning_rate": 3.594237695078032e-05, + "loss": 0.5738, + "step": 7036 + }, + { + "epoch": 9.00736, + "grad_norm": 0.8457949757575989, + "learning_rate": 3.5940376150460184e-05, + "loss": 0.5637, + "step": 7037 + }, + { + "epoch": 9.00864, + "grad_norm": 0.8400401473045349, + "learning_rate": 3.5938375350140055e-05, + "loss": 0.6053, + "step": 7038 + }, + { + "epoch": 9.00992, + "grad_norm": 0.8000286221504211, + "learning_rate": 3.593637454981993e-05, + "loss": 0.5456, + "step": 7039 + }, + { + "epoch": 9.0112, + "grad_norm": 0.8358906507492065, + "learning_rate": 3.59343737494998e-05, + "loss": 0.5632, + "step": 7040 + }, + { + "epoch": 9.01248, + "grad_norm": 0.8368620872497559, + "learning_rate": 3.593237294917967e-05, + "loss": 0.5464, + "step": 7041 
+ }, + { + "epoch": 9.01376, + "grad_norm": 0.8233303427696228, + "learning_rate": 3.593037214885955e-05, + "loss": 0.5641, + "step": 7042 + }, + { + "epoch": 9.01504, + "grad_norm": 0.8248091340065002, + "learning_rate": 3.592837134853942e-05, + "loss": 0.5642, + "step": 7043 + }, + { + "epoch": 9.01632, + "grad_norm": 0.7816410660743713, + "learning_rate": 3.592637054821929e-05, + "loss": 0.557, + "step": 7044 + }, + { + "epoch": 9.0176, + "grad_norm": 0.8148441314697266, + "learning_rate": 3.592436974789916e-05, + "loss": 0.5754, + "step": 7045 + }, + { + "epoch": 9.01888, + "grad_norm": 0.846493661403656, + "learning_rate": 3.592236894757903e-05, + "loss": 0.5328, + "step": 7046 + }, + { + "epoch": 9.02016, + "grad_norm": 0.7609879374504089, + "learning_rate": 3.59203681472589e-05, + "loss": 0.5445, + "step": 7047 + }, + { + "epoch": 9.02144, + "grad_norm": 0.7646995186805725, + "learning_rate": 3.5918367346938774e-05, + "loss": 0.5248, + "step": 7048 + }, + { + "epoch": 9.02272, + "grad_norm": 0.9177650809288025, + "learning_rate": 3.591636654661865e-05, + "loss": 0.6154, + "step": 7049 + }, + { + "epoch": 9.024, + "grad_norm": 0.8574452996253967, + "learning_rate": 3.5914365746298525e-05, + "loss": 0.5763, + "step": 7050 + }, + { + "epoch": 9.02528, + "grad_norm": 0.845577597618103, + "learning_rate": 3.5912364945978396e-05, + "loss": 0.5505, + "step": 7051 + }, + { + "epoch": 9.02656, + "grad_norm": 0.8447266817092896, + "learning_rate": 3.591036414565827e-05, + "loss": 0.5625, + "step": 7052 + }, + { + "epoch": 9.02784, + "grad_norm": 0.8364947438240051, + "learning_rate": 3.590836334533813e-05, + "loss": 0.6222, + "step": 7053 + }, + { + "epoch": 9.02912, + "grad_norm": 0.7808526158332825, + "learning_rate": 3.5906362545018005e-05, + "loss": 0.5635, + "step": 7054 + }, + { + "epoch": 9.0304, + "grad_norm": 0.8134317994117737, + "learning_rate": 3.590436174469788e-05, + "loss": 0.5632, + "step": 7055 + }, + { + "epoch": 9.03168, + "grad_norm": 
0.833970844745636, + "learning_rate": 3.5902360944377756e-05, + "loss": 0.5108, + "step": 7056 + }, + { + "epoch": 9.03296, + "grad_norm": 0.8089005947113037, + "learning_rate": 3.590036014405763e-05, + "loss": 0.548, + "step": 7057 + }, + { + "epoch": 9.03424, + "grad_norm": 0.8224313855171204, + "learning_rate": 3.58983593437375e-05, + "loss": 0.5738, + "step": 7058 + }, + { + "epoch": 9.03552, + "grad_norm": 0.8225283026695251, + "learning_rate": 3.589635854341737e-05, + "loss": 0.5791, + "step": 7059 + }, + { + "epoch": 9.0368, + "grad_norm": 0.8561593890190125, + "learning_rate": 3.589435774309724e-05, + "loss": 0.6185, + "step": 7060 + }, + { + "epoch": 9.03808, + "grad_norm": 0.803023636341095, + "learning_rate": 3.589235694277711e-05, + "loss": 0.5655, + "step": 7061 + }, + { + "epoch": 9.03936, + "grad_norm": 0.8006875514984131, + "learning_rate": 3.589035614245698e-05, + "loss": 0.5293, + "step": 7062 + }, + { + "epoch": 9.04064, + "grad_norm": 0.7813231348991394, + "learning_rate": 3.588835534213686e-05, + "loss": 0.5739, + "step": 7063 + }, + { + "epoch": 9.04192, + "grad_norm": 0.7644641995429993, + "learning_rate": 3.588635454181673e-05, + "loss": 0.524, + "step": 7064 + }, + { + "epoch": 9.0432, + "grad_norm": 0.8366073369979858, + "learning_rate": 3.58843537414966e-05, + "loss": 0.5816, + "step": 7065 + }, + { + "epoch": 9.04448, + "grad_norm": 0.8413146138191223, + "learning_rate": 3.5882352941176474e-05, + "loss": 0.575, + "step": 7066 + }, + { + "epoch": 9.04576, + "grad_norm": 0.868990957736969, + "learning_rate": 3.5880352140856346e-05, + "loss": 0.5821, + "step": 7067 + }, + { + "epoch": 9.04704, + "grad_norm": 0.8294727206230164, + "learning_rate": 3.587835134053622e-05, + "loss": 0.5557, + "step": 7068 + }, + { + "epoch": 9.04832, + "grad_norm": 0.8696639537811279, + "learning_rate": 3.587635054021608e-05, + "loss": 0.6255, + "step": 7069 + }, + { + "epoch": 9.0496, + "grad_norm": 0.8515434861183167, + "learning_rate": 3.587434973989596e-05, 
+ "loss": 0.625, + "step": 7070 + }, + { + "epoch": 9.05088, + "grad_norm": 0.8410640358924866, + "learning_rate": 3.5872348939575834e-05, + "loss": 0.6161, + "step": 7071 + }, + { + "epoch": 9.05216, + "grad_norm": 0.8619793653488159, + "learning_rate": 3.5870348139255705e-05, + "loss": 0.5977, + "step": 7072 + }, + { + "epoch": 9.05344, + "grad_norm": 0.8171403408050537, + "learning_rate": 3.586834733893558e-05, + "loss": 0.5569, + "step": 7073 + }, + { + "epoch": 9.05472, + "grad_norm": 0.823085367679596, + "learning_rate": 3.586634653861545e-05, + "loss": 0.5687, + "step": 7074 + }, + { + "epoch": 9.056, + "grad_norm": 0.8016168475151062, + "learning_rate": 3.586434573829532e-05, + "loss": 0.5622, + "step": 7075 + }, + { + "epoch": 9.05728, + "grad_norm": 0.8238281607627869, + "learning_rate": 3.586234493797519e-05, + "loss": 0.5507, + "step": 7076 + }, + { + "epoch": 9.05856, + "grad_norm": 0.859733521938324, + "learning_rate": 3.5860344137655065e-05, + "loss": 0.581, + "step": 7077 + }, + { + "epoch": 9.05984, + "grad_norm": 0.8221707940101624, + "learning_rate": 3.5858343337334937e-05, + "loss": 0.5824, + "step": 7078 + }, + { + "epoch": 9.06112, + "grad_norm": 0.818545401096344, + "learning_rate": 3.585634253701481e-05, + "loss": 0.5601, + "step": 7079 + }, + { + "epoch": 9.0624, + "grad_norm": 0.8852583765983582, + "learning_rate": 3.585434173669468e-05, + "loss": 0.584, + "step": 7080 + }, + { + "epoch": 9.06368, + "grad_norm": 0.8168684244155884, + "learning_rate": 3.585234093637455e-05, + "loss": 0.5746, + "step": 7081 + }, + { + "epoch": 9.06496, + "grad_norm": 0.77977454662323, + "learning_rate": 3.5850340136054424e-05, + "loss": 0.5343, + "step": 7082 + }, + { + "epoch": 9.06624, + "grad_norm": 0.8777265548706055, + "learning_rate": 3.5848339335734296e-05, + "loss": 0.6393, + "step": 7083 + }, + { + "epoch": 9.06752, + "grad_norm": 0.8020537495613098, + "learning_rate": 3.584633853541417e-05, + "loss": 0.5444, + "step": 7084 + }, + { + "epoch": 
9.0688, + "grad_norm": 0.7844178080558777, + "learning_rate": 3.584433773509404e-05, + "loss": 0.5409, + "step": 7085 + }, + { + "epoch": 9.07008, + "grad_norm": 0.8328168392181396, + "learning_rate": 3.584233693477391e-05, + "loss": 0.5712, + "step": 7086 + }, + { + "epoch": 9.07136, + "grad_norm": 0.8295521140098572, + "learning_rate": 3.584033613445378e-05, + "loss": 0.5724, + "step": 7087 + }, + { + "epoch": 9.07264, + "grad_norm": 0.8165435791015625, + "learning_rate": 3.5838335334133655e-05, + "loss": 0.5741, + "step": 7088 + }, + { + "epoch": 9.07392, + "grad_norm": 0.7974668145179749, + "learning_rate": 3.583633453381353e-05, + "loss": 0.5751, + "step": 7089 + }, + { + "epoch": 9.0752, + "grad_norm": 0.8819810152053833, + "learning_rate": 3.58343337334934e-05, + "loss": 0.5696, + "step": 7090 + }, + { + "epoch": 9.07648, + "grad_norm": 0.8275978565216064, + "learning_rate": 3.583233293317327e-05, + "loss": 0.5667, + "step": 7091 + }, + { + "epoch": 9.07776, + "grad_norm": 0.8336565494537354, + "learning_rate": 3.583033213285314e-05, + "loss": 0.5487, + "step": 7092 + }, + { + "epoch": 9.079039999999999, + "grad_norm": 0.8257220983505249, + "learning_rate": 3.5828331332533014e-05, + "loss": 0.5878, + "step": 7093 + }, + { + "epoch": 9.08032, + "grad_norm": 0.8719106316566467, + "learning_rate": 3.5826330532212886e-05, + "loss": 0.5884, + "step": 7094 + }, + { + "epoch": 9.0816, + "grad_norm": 0.822557270526886, + "learning_rate": 3.582432973189276e-05, + "loss": 0.5495, + "step": 7095 + }, + { + "epoch": 9.08288, + "grad_norm": 0.8632369637489319, + "learning_rate": 3.582232893157263e-05, + "loss": 0.631, + "step": 7096 + }, + { + "epoch": 9.08416, + "grad_norm": 0.7848061919212341, + "learning_rate": 3.58203281312525e-05, + "loss": 0.5497, + "step": 7097 + }, + { + "epoch": 9.08544, + "grad_norm": 0.8202498555183411, + "learning_rate": 3.5818327330932374e-05, + "loss": 0.5686, + "step": 7098 + }, + { + "epoch": 9.08672, + "grad_norm": 0.8682575225830078, + 
"learning_rate": 3.5816326530612245e-05, + "loss": 0.5808, + "step": 7099 + }, + { + "epoch": 9.088, + "grad_norm": 0.8932950496673584, + "learning_rate": 3.581432573029212e-05, + "loss": 0.5941, + "step": 7100 + }, + { + "epoch": 9.08928, + "grad_norm": 0.8102815747261047, + "learning_rate": 3.581232492997199e-05, + "loss": 0.5704, + "step": 7101 + }, + { + "epoch": 9.09056, + "grad_norm": 0.848307192325592, + "learning_rate": 3.581032412965186e-05, + "loss": 0.5954, + "step": 7102 + }, + { + "epoch": 9.09184, + "grad_norm": 0.8251248598098755, + "learning_rate": 3.580832332933173e-05, + "loss": 0.599, + "step": 7103 + }, + { + "epoch": 9.09312, + "grad_norm": 0.8306258916854858, + "learning_rate": 3.5806322529011605e-05, + "loss": 0.5746, + "step": 7104 + }, + { + "epoch": 9.0944, + "grad_norm": 0.8358504176139832, + "learning_rate": 3.580432172869148e-05, + "loss": 0.5811, + "step": 7105 + }, + { + "epoch": 9.09568, + "grad_norm": 0.8394206762313843, + "learning_rate": 3.5802320928371355e-05, + "loss": 0.5633, + "step": 7106 + }, + { + "epoch": 9.09696, + "grad_norm": 0.7698180079460144, + "learning_rate": 3.580032012805122e-05, + "loss": 0.5287, + "step": 7107 + }, + { + "epoch": 9.09824, + "grad_norm": 0.8235006928443909, + "learning_rate": 3.579831932773109e-05, + "loss": 0.575, + "step": 7108 + }, + { + "epoch": 9.09952, + "grad_norm": 0.8552541136741638, + "learning_rate": 3.5796318527410964e-05, + "loss": 0.5888, + "step": 7109 + }, + { + "epoch": 9.1008, + "grad_norm": 0.7659354209899902, + "learning_rate": 3.5794317727090836e-05, + "loss": 0.5523, + "step": 7110 + }, + { + "epoch": 9.10208, + "grad_norm": 0.8290457129478455, + "learning_rate": 3.579231692677071e-05, + "loss": 0.5812, + "step": 7111 + }, + { + "epoch": 9.10336, + "grad_norm": 0.8734808564186096, + "learning_rate": 3.579031612645058e-05, + "loss": 0.5864, + "step": 7112 + }, + { + "epoch": 9.10464, + "grad_norm": 0.8360257148742676, + "learning_rate": 3.578831532613046e-05, + "loss": 
0.6079, + "step": 7113 + }, + { + "epoch": 9.10592, + "grad_norm": 0.8263709545135498, + "learning_rate": 3.578631452581033e-05, + "loss": 0.6076, + "step": 7114 + }, + { + "epoch": 9.1072, + "grad_norm": 0.8358266949653625, + "learning_rate": 3.5784313725490195e-05, + "loss": 0.5644, + "step": 7115 + }, + { + "epoch": 9.10848, + "grad_norm": 0.9020827412605286, + "learning_rate": 3.578231292517007e-05, + "loss": 0.6079, + "step": 7116 + }, + { + "epoch": 9.10976, + "grad_norm": 0.8776836395263672, + "learning_rate": 3.578031212484994e-05, + "loss": 0.5905, + "step": 7117 + }, + { + "epoch": 9.11104, + "grad_norm": 0.8753827214241028, + "learning_rate": 3.577831132452981e-05, + "loss": 0.5962, + "step": 7118 + }, + { + "epoch": 9.11232, + "grad_norm": 0.8712731599807739, + "learning_rate": 3.577631052420968e-05, + "loss": 0.5731, + "step": 7119 + }, + { + "epoch": 9.1136, + "grad_norm": 0.8524981141090393, + "learning_rate": 3.577430972388956e-05, + "loss": 0.5583, + "step": 7120 + }, + { + "epoch": 9.11488, + "grad_norm": 0.8630044460296631, + "learning_rate": 3.577230892356943e-05, + "loss": 0.5547, + "step": 7121 + }, + { + "epoch": 9.11616, + "grad_norm": 0.7815329432487488, + "learning_rate": 3.5770308123249305e-05, + "loss": 0.5294, + "step": 7122 + }, + { + "epoch": 9.11744, + "grad_norm": 0.8376869559288025, + "learning_rate": 3.576830732292917e-05, + "loss": 0.599, + "step": 7123 + }, + { + "epoch": 9.11872, + "grad_norm": 0.8631175756454468, + "learning_rate": 3.576630652260904e-05, + "loss": 0.5887, + "step": 7124 + }, + { + "epoch": 9.12, + "grad_norm": 0.8864824175834656, + "learning_rate": 3.5764305722288914e-05, + "loss": 0.571, + "step": 7125 + }, + { + "epoch": 9.12128, + "grad_norm": 0.8389583230018616, + "learning_rate": 3.5762304921968786e-05, + "loss": 0.5336, + "step": 7126 + }, + { + "epoch": 9.12256, + "grad_norm": 0.838886022567749, + "learning_rate": 3.5760304121648664e-05, + "loss": 0.5679, + "step": 7127 + }, + { + "epoch": 9.12384, + 
"grad_norm": 0.850913941860199, + "learning_rate": 3.5758303321328536e-05, + "loss": 0.5632, + "step": 7128 + }, + { + "epoch": 9.12512, + "grad_norm": 0.8992368578910828, + "learning_rate": 3.575630252100841e-05, + "loss": 0.598, + "step": 7129 + }, + { + "epoch": 9.1264, + "grad_norm": 0.8634032011032104, + "learning_rate": 3.575430172068828e-05, + "loss": 0.6198, + "step": 7130 + }, + { + "epoch": 9.12768, + "grad_norm": 0.8508159518241882, + "learning_rate": 3.5752300920368145e-05, + "loss": 0.5757, + "step": 7131 + }, + { + "epoch": 9.12896, + "grad_norm": 0.8230028748512268, + "learning_rate": 3.575030012004802e-05, + "loss": 0.5853, + "step": 7132 + }, + { + "epoch": 9.13024, + "grad_norm": 0.8228382468223572, + "learning_rate": 3.574829931972789e-05, + "loss": 0.5304, + "step": 7133 + }, + { + "epoch": 9.13152, + "grad_norm": 0.8506357669830322, + "learning_rate": 3.574629851940777e-05, + "loss": 0.5977, + "step": 7134 + }, + { + "epoch": 9.1328, + "grad_norm": 0.8166884779930115, + "learning_rate": 3.574429771908764e-05, + "loss": 0.5696, + "step": 7135 + }, + { + "epoch": 9.13408, + "grad_norm": 0.7951632142066956, + "learning_rate": 3.574229691876751e-05, + "loss": 0.5402, + "step": 7136 + }, + { + "epoch": 9.13536, + "grad_norm": 0.8789847493171692, + "learning_rate": 3.574029611844738e-05, + "loss": 0.5845, + "step": 7137 + }, + { + "epoch": 9.13664, + "grad_norm": 0.8529048562049866, + "learning_rate": 3.5738295318127255e-05, + "loss": 0.584, + "step": 7138 + }, + { + "epoch": 9.13792, + "grad_norm": 0.8400517702102661, + "learning_rate": 3.573629451780712e-05, + "loss": 0.585, + "step": 7139 + }, + { + "epoch": 9.1392, + "grad_norm": 0.8045387268066406, + "learning_rate": 3.573429371748699e-05, + "loss": 0.576, + "step": 7140 + }, + { + "epoch": 9.14048, + "grad_norm": 0.8386081457138062, + "learning_rate": 3.573229291716687e-05, + "loss": 0.6016, + "step": 7141 + }, + { + "epoch": 9.14176, + "grad_norm": 0.8390840291976929, + "learning_rate": 
3.573029211684674e-05, + "loss": 0.5581, + "step": 7142 + }, + { + "epoch": 9.14304, + "grad_norm": 0.8972001671791077, + "learning_rate": 3.5728291316526614e-05, + "loss": 0.593, + "step": 7143 + }, + { + "epoch": 9.14432, + "grad_norm": 0.8241517543792725, + "learning_rate": 3.5726290516206486e-05, + "loss": 0.5142, + "step": 7144 + }, + { + "epoch": 9.1456, + "grad_norm": 0.8427436947822571, + "learning_rate": 3.572428971588636e-05, + "loss": 0.6082, + "step": 7145 + }, + { + "epoch": 9.14688, + "grad_norm": 0.8868518471717834, + "learning_rate": 3.572228891556623e-05, + "loss": 0.5601, + "step": 7146 + }, + { + "epoch": 9.14816, + "grad_norm": 0.8616020679473877, + "learning_rate": 3.5720288115246095e-05, + "loss": 0.589, + "step": 7147 + }, + { + "epoch": 9.14944, + "grad_norm": 0.8238210082054138, + "learning_rate": 3.571828731492597e-05, + "loss": 0.5611, + "step": 7148 + }, + { + "epoch": 9.15072, + "grad_norm": 0.7928246259689331, + "learning_rate": 3.5716286514605845e-05, + "loss": 0.5399, + "step": 7149 + }, + { + "epoch": 9.152, + "grad_norm": 0.8270377516746521, + "learning_rate": 3.571428571428572e-05, + "loss": 0.5782, + "step": 7150 + }, + { + "epoch": 9.15328, + "grad_norm": 0.8852272033691406, + "learning_rate": 3.571228491396559e-05, + "loss": 0.5984, + "step": 7151 + }, + { + "epoch": 9.15456, + "grad_norm": 0.8285141587257385, + "learning_rate": 3.571028411364546e-05, + "loss": 0.5448, + "step": 7152 + }, + { + "epoch": 9.15584, + "grad_norm": 0.8800240159034729, + "learning_rate": 3.570828331332533e-05, + "loss": 0.6208, + "step": 7153 + }, + { + "epoch": 9.15712, + "grad_norm": 0.8323574662208557, + "learning_rate": 3.5706282513005204e-05, + "loss": 0.5727, + "step": 7154 + }, + { + "epoch": 9.1584, + "grad_norm": 0.798857569694519, + "learning_rate": 3.5704281712685076e-05, + "loss": 0.5719, + "step": 7155 + }, + { + "epoch": 9.15968, + "grad_norm": 0.8088046312332153, + "learning_rate": 3.570228091236495e-05, + "loss": 0.545, + "step": 7156 
+ }, + { + "epoch": 9.16096, + "grad_norm": 0.8132175207138062, + "learning_rate": 3.570028011204482e-05, + "loss": 0.5349, + "step": 7157 + }, + { + "epoch": 9.16224, + "grad_norm": 0.8076733946800232, + "learning_rate": 3.569827931172469e-05, + "loss": 0.549, + "step": 7158 + }, + { + "epoch": 9.16352, + "grad_norm": 0.8983336687088013, + "learning_rate": 3.5696278511404564e-05, + "loss": 0.5902, + "step": 7159 + }, + { + "epoch": 9.1648, + "grad_norm": 0.8289273381233215, + "learning_rate": 3.5694277711084436e-05, + "loss": 0.5484, + "step": 7160 + }, + { + "epoch": 9.166080000000001, + "grad_norm": 0.8771588206291199, + "learning_rate": 3.569227691076431e-05, + "loss": 0.5911, + "step": 7161 + }, + { + "epoch": 9.16736, + "grad_norm": 0.8836865425109863, + "learning_rate": 3.569027611044418e-05, + "loss": 0.5916, + "step": 7162 + }, + { + "epoch": 9.16864, + "grad_norm": 0.8692944049835205, + "learning_rate": 3.568827531012405e-05, + "loss": 0.6283, + "step": 7163 + }, + { + "epoch": 9.16992, + "grad_norm": 0.8750563263893127, + "learning_rate": 3.568627450980392e-05, + "loss": 0.5961, + "step": 7164 + }, + { + "epoch": 9.1712, + "grad_norm": 0.9316475987434387, + "learning_rate": 3.5684273709483795e-05, + "loss": 0.6031, + "step": 7165 + }, + { + "epoch": 9.17248, + "grad_norm": 0.9066860675811768, + "learning_rate": 3.568227290916367e-05, + "loss": 0.5978, + "step": 7166 + }, + { + "epoch": 9.17376, + "grad_norm": 0.8408805727958679, + "learning_rate": 3.568027210884354e-05, + "loss": 0.5614, + "step": 7167 + }, + { + "epoch": 9.17504, + "grad_norm": 0.9087552428245544, + "learning_rate": 3.567827130852341e-05, + "loss": 0.6356, + "step": 7168 + }, + { + "epoch": 9.17632, + "grad_norm": 0.8625233769416809, + "learning_rate": 3.567627050820328e-05, + "loss": 0.5493, + "step": 7169 + }, + { + "epoch": 9.1776, + "grad_norm": 0.9364339709281921, + "learning_rate": 3.5674269707883154e-05, + "loss": 0.5999, + "step": 7170 + }, + { + "epoch": 9.17888, + "grad_norm": 
0.895353376865387, + "learning_rate": 3.5672268907563026e-05, + "loss": 0.5865, + "step": 7171 + }, + { + "epoch": 9.18016, + "grad_norm": 0.8568089008331299, + "learning_rate": 3.56702681072429e-05, + "loss": 0.5691, + "step": 7172 + }, + { + "epoch": 9.18144, + "grad_norm": 0.9034045934677124, + "learning_rate": 3.566826730692277e-05, + "loss": 0.6174, + "step": 7173 + }, + { + "epoch": 9.18272, + "grad_norm": 0.8543916344642639, + "learning_rate": 3.566626650660264e-05, + "loss": 0.5903, + "step": 7174 + }, + { + "epoch": 9.184, + "grad_norm": 0.7896568775177002, + "learning_rate": 3.566426570628251e-05, + "loss": 0.5389, + "step": 7175 + }, + { + "epoch": 9.18528, + "grad_norm": 0.8801631927490234, + "learning_rate": 3.566226490596239e-05, + "loss": 0.5925, + "step": 7176 + }, + { + "epoch": 9.18656, + "grad_norm": 0.8208610415458679, + "learning_rate": 3.566026410564226e-05, + "loss": 0.5339, + "step": 7177 + }, + { + "epoch": 9.18784, + "grad_norm": 0.8739574551582336, + "learning_rate": 3.565826330532213e-05, + "loss": 0.5946, + "step": 7178 + }, + { + "epoch": 9.18912, + "grad_norm": 0.8522581458091736, + "learning_rate": 3.5656262505002e-05, + "loss": 0.604, + "step": 7179 + }, + { + "epoch": 9.1904, + "grad_norm": 0.8527406454086304, + "learning_rate": 3.565426170468187e-05, + "loss": 0.5349, + "step": 7180 + }, + { + "epoch": 9.19168, + "grad_norm": 0.796159029006958, + "learning_rate": 3.5652260904361745e-05, + "loss": 0.5852, + "step": 7181 + }, + { + "epoch": 9.19296, + "grad_norm": 0.8322233557701111, + "learning_rate": 3.5650260104041616e-05, + "loss": 0.5491, + "step": 7182 + }, + { + "epoch": 9.19424, + "grad_norm": 0.8693201541900635, + "learning_rate": 3.5648259303721495e-05, + "loss": 0.6308, + "step": 7183 + }, + { + "epoch": 9.19552, + "grad_norm": 0.8879520297050476, + "learning_rate": 3.564625850340137e-05, + "loss": 0.5455, + "step": 7184 + }, + { + "epoch": 9.1968, + "grad_norm": 0.8827615976333618, + "learning_rate": 
3.564425770308123e-05, + "loss": 0.5712, + "step": 7185 + }, + { + "epoch": 9.19808, + "grad_norm": 0.8380709886550903, + "learning_rate": 3.5642256902761104e-05, + "loss": 0.5884, + "step": 7186 + }, + { + "epoch": 9.19936, + "grad_norm": 0.8361532092094421, + "learning_rate": 3.5640256102440976e-05, + "loss": 0.5282, + "step": 7187 + }, + { + "epoch": 9.20064, + "grad_norm": 0.8010978102684021, + "learning_rate": 3.563825530212085e-05, + "loss": 0.5372, + "step": 7188 + }, + { + "epoch": 9.20192, + "grad_norm": 0.8207326531410217, + "learning_rate": 3.563625450180072e-05, + "loss": 0.5496, + "step": 7189 + }, + { + "epoch": 9.2032, + "grad_norm": 0.9393238425254822, + "learning_rate": 3.56342537014806e-05, + "loss": 0.6691, + "step": 7190 + }, + { + "epoch": 9.20448, + "grad_norm": 0.9167293310165405, + "learning_rate": 3.563225290116047e-05, + "loss": 0.577, + "step": 7191 + }, + { + "epoch": 9.20576, + "grad_norm": 0.8757715821266174, + "learning_rate": 3.563025210084034e-05, + "loss": 0.56, + "step": 7192 + }, + { + "epoch": 9.20704, + "grad_norm": 0.8311654329299927, + "learning_rate": 3.562825130052021e-05, + "loss": 0.5641, + "step": 7193 + }, + { + "epoch": 9.20832, + "grad_norm": 0.8445587754249573, + "learning_rate": 3.562625050020008e-05, + "loss": 0.5876, + "step": 7194 + }, + { + "epoch": 9.2096, + "grad_norm": 0.805767834186554, + "learning_rate": 3.562424969987995e-05, + "loss": 0.5486, + "step": 7195 + }, + { + "epoch": 9.21088, + "grad_norm": 0.8612878918647766, + "learning_rate": 3.562224889955982e-05, + "loss": 0.5913, + "step": 7196 + }, + { + "epoch": 9.21216, + "grad_norm": 0.7950903177261353, + "learning_rate": 3.56202480992397e-05, + "loss": 0.5245, + "step": 7197 + }, + { + "epoch": 9.21344, + "grad_norm": 0.8570892214775085, + "learning_rate": 3.561824729891957e-05, + "loss": 0.5713, + "step": 7198 + }, + { + "epoch": 9.21472, + "grad_norm": 0.8819652795791626, + "learning_rate": 3.5616246498599445e-05, + "loss": 0.6082, + "step": 7199 + 
}, + { + "epoch": 9.216, + "grad_norm": 0.8352935910224915, + "learning_rate": 3.5614245698279317e-05, + "loss": 0.5274, + "step": 7200 + }, + { + "epoch": 9.21728, + "grad_norm": 0.8423967957496643, + "learning_rate": 3.561224489795918e-05, + "loss": 0.5569, + "step": 7201 + }, + { + "epoch": 9.21856, + "grad_norm": 0.8680324554443359, + "learning_rate": 3.5610244097639054e-05, + "loss": 0.5971, + "step": 7202 + }, + { + "epoch": 9.21984, + "grad_norm": 0.8493099808692932, + "learning_rate": 3.5608243297318925e-05, + "loss": 0.5817, + "step": 7203 + }, + { + "epoch": 9.22112, + "grad_norm": 0.8210243582725525, + "learning_rate": 3.5606242496998804e-05, + "loss": 0.5623, + "step": 7204 + }, + { + "epoch": 9.2224, + "grad_norm": 0.8306945562362671, + "learning_rate": 3.5604241696678676e-05, + "loss": 0.5609, + "step": 7205 + }, + { + "epoch": 9.22368, + "grad_norm": 0.8076730370521545, + "learning_rate": 3.560224089635855e-05, + "loss": 0.5814, + "step": 7206 + }, + { + "epoch": 9.22496, + "grad_norm": 0.8856160640716553, + "learning_rate": 3.560024009603842e-05, + "loss": 0.614, + "step": 7207 + }, + { + "epoch": 9.22624, + "grad_norm": 0.8176262974739075, + "learning_rate": 3.559823929571829e-05, + "loss": 0.5156, + "step": 7208 + }, + { + "epoch": 9.22752, + "grad_norm": 0.8237152099609375, + "learning_rate": 3.5596238495398157e-05, + "loss": 0.6039, + "step": 7209 + }, + { + "epoch": 9.2288, + "grad_norm": 0.8996145129203796, + "learning_rate": 3.559423769507803e-05, + "loss": 0.5799, + "step": 7210 + }, + { + "epoch": 9.23008, + "grad_norm": 0.8621578216552734, + "learning_rate": 3.55922368947579e-05, + "loss": 0.5983, + "step": 7211 + }, + { + "epoch": 9.23136, + "grad_norm": 0.834199845790863, + "learning_rate": 3.559023609443778e-05, + "loss": 0.584, + "step": 7212 + }, + { + "epoch": 9.23264, + "grad_norm": 0.8422051072120667, + "learning_rate": 3.558823529411765e-05, + "loss": 0.6015, + "step": 7213 + }, + { + "epoch": 9.23392, + "grad_norm": 
0.8835949301719666, + "learning_rate": 3.558623449379752e-05, + "loss": 0.6181, + "step": 7214 + }, + { + "epoch": 9.2352, + "grad_norm": 0.8236482739448547, + "learning_rate": 3.5584233693477394e-05, + "loss": 0.5114, + "step": 7215 + }, + { + "epoch": 9.23648, + "grad_norm": 0.8392600417137146, + "learning_rate": 3.5582232893157266e-05, + "loss": 0.5606, + "step": 7216 + }, + { + "epoch": 9.23776, + "grad_norm": 0.8492880463600159, + "learning_rate": 3.558023209283713e-05, + "loss": 0.5595, + "step": 7217 + }, + { + "epoch": 9.23904, + "grad_norm": 0.8469056487083435, + "learning_rate": 3.5578231292517e-05, + "loss": 0.5619, + "step": 7218 + }, + { + "epoch": 9.24032, + "grad_norm": 0.8622716069221497, + "learning_rate": 3.557623049219688e-05, + "loss": 0.5764, + "step": 7219 + }, + { + "epoch": 9.2416, + "grad_norm": 0.8697147369384766, + "learning_rate": 3.5574229691876754e-05, + "loss": 0.6487, + "step": 7220 + }, + { + "epoch": 9.24288, + "grad_norm": 0.8444170355796814, + "learning_rate": 3.5572228891556626e-05, + "loss": 0.5976, + "step": 7221 + }, + { + "epoch": 9.24416, + "grad_norm": 0.9171527028083801, + "learning_rate": 3.55702280912365e-05, + "loss": 0.5975, + "step": 7222 + }, + { + "epoch": 9.24544, + "grad_norm": 0.8756519556045532, + "learning_rate": 3.556822729091637e-05, + "loss": 0.6635, + "step": 7223 + }, + { + "epoch": 9.24672, + "grad_norm": 0.8335146903991699, + "learning_rate": 3.556622649059624e-05, + "loss": 0.5707, + "step": 7224 + }, + { + "epoch": 9.248, + "grad_norm": 0.8439629673957825, + "learning_rate": 3.5564225690276106e-05, + "loss": 0.5443, + "step": 7225 + }, + { + "epoch": 9.24928, + "grad_norm": 0.8886339664459229, + "learning_rate": 3.5562224889955985e-05, + "loss": 0.6433, + "step": 7226 + }, + { + "epoch": 9.25056, + "grad_norm": 0.8498263359069824, + "learning_rate": 3.556022408963586e-05, + "loss": 0.5432, + "step": 7227 + }, + { + "epoch": 9.25184, + "grad_norm": 0.8150326609611511, + "learning_rate": 
3.555822328931573e-05, + "loss": 0.5489, + "step": 7228 + }, + { + "epoch": 9.25312, + "grad_norm": 0.8487164974212646, + "learning_rate": 3.55562224889956e-05, + "loss": 0.6123, + "step": 7229 + }, + { + "epoch": 9.2544, + "grad_norm": 0.8371261358261108, + "learning_rate": 3.555422168867547e-05, + "loss": 0.586, + "step": 7230 + }, + { + "epoch": 9.25568, + "grad_norm": 0.7906522154808044, + "learning_rate": 3.5552220888355344e-05, + "loss": 0.5475, + "step": 7231 + }, + { + "epoch": 9.25696, + "grad_norm": 0.8361690640449524, + "learning_rate": 3.5550220088035216e-05, + "loss": 0.5587, + "step": 7232 + }, + { + "epoch": 9.25824, + "grad_norm": 0.8770825266838074, + "learning_rate": 3.554821928771509e-05, + "loss": 0.5292, + "step": 7233 + }, + { + "epoch": 9.25952, + "grad_norm": 0.8710092306137085, + "learning_rate": 3.554621848739496e-05, + "loss": 0.5879, + "step": 7234 + }, + { + "epoch": 9.2608, + "grad_norm": 0.9107195734977722, + "learning_rate": 3.554421768707483e-05, + "loss": 0.6258, + "step": 7235 + }, + { + "epoch": 9.26208, + "grad_norm": 0.8514677286148071, + "learning_rate": 3.5542216886754703e-05, + "loss": 0.5595, + "step": 7236 + }, + { + "epoch": 9.26336, + "grad_norm": 0.806038498878479, + "learning_rate": 3.5540216086434575e-05, + "loss": 0.5441, + "step": 7237 + }, + { + "epoch": 9.26464, + "grad_norm": 0.8175530433654785, + "learning_rate": 3.553821528611445e-05, + "loss": 0.5597, + "step": 7238 + }, + { + "epoch": 9.26592, + "grad_norm": 0.8429561257362366, + "learning_rate": 3.553621448579432e-05, + "loss": 0.6029, + "step": 7239 + }, + { + "epoch": 9.2672, + "grad_norm": 0.7659432888031006, + "learning_rate": 3.553421368547419e-05, + "loss": 0.4918, + "step": 7240 + }, + { + "epoch": 9.26848, + "grad_norm": 0.8414883613586426, + "learning_rate": 3.553221288515406e-05, + "loss": 0.5624, + "step": 7241 + }, + { + "epoch": 9.26976, + "grad_norm": 0.8417079448699951, + "learning_rate": 3.5530212084833935e-05, + "loss": 0.5585, + "step": 
7242 + }, + { + "epoch": 9.27104, + "grad_norm": 0.8089073300361633, + "learning_rate": 3.5528211284513806e-05, + "loss": 0.5623, + "step": 7243 + }, + { + "epoch": 9.27232, + "grad_norm": 0.8453049063682556, + "learning_rate": 3.552621048419368e-05, + "loss": 0.5725, + "step": 7244 + }, + { + "epoch": 9.2736, + "grad_norm": 0.8788660168647766, + "learning_rate": 3.552420968387355e-05, + "loss": 0.6243, + "step": 7245 + }, + { + "epoch": 9.27488, + "grad_norm": 0.8520131707191467, + "learning_rate": 3.552220888355342e-05, + "loss": 0.5707, + "step": 7246 + }, + { + "epoch": 9.27616, + "grad_norm": 0.8138136863708496, + "learning_rate": 3.55202080832333e-05, + "loss": 0.5788, + "step": 7247 + }, + { + "epoch": 9.27744, + "grad_norm": 0.8127238750457764, + "learning_rate": 3.5518207282913166e-05, + "loss": 0.5925, + "step": 7248 + }, + { + "epoch": 9.27872, + "grad_norm": 0.7981357574462891, + "learning_rate": 3.551620648259304e-05, + "loss": 0.5755, + "step": 7249 + }, + { + "epoch": 9.28, + "grad_norm": 0.8368809223175049, + "learning_rate": 3.551420568227291e-05, + "loss": 0.5962, + "step": 7250 + }, + { + "epoch": 9.28128, + "grad_norm": 0.832737386226654, + "learning_rate": 3.551220488195278e-05, + "loss": 0.5504, + "step": 7251 + }, + { + "epoch": 9.28256, + "grad_norm": 0.9122929573059082, + "learning_rate": 3.551020408163265e-05, + "loss": 0.6593, + "step": 7252 + }, + { + "epoch": 9.28384, + "grad_norm": 0.8156841397285461, + "learning_rate": 3.5508203281312525e-05, + "loss": 0.5478, + "step": 7253 + }, + { + "epoch": 9.28512, + "grad_norm": 0.8393061757087708, + "learning_rate": 3.5506202480992404e-05, + "loss": 0.5744, + "step": 7254 + }, + { + "epoch": 9.2864, + "grad_norm": 0.8752142786979675, + "learning_rate": 3.5504201680672275e-05, + "loss": 0.6099, + "step": 7255 + }, + { + "epoch": 9.28768, + "grad_norm": 0.8940780162811279, + "learning_rate": 3.550220088035214e-05, + "loss": 0.6174, + "step": 7256 + }, + { + "epoch": 9.28896, + "grad_norm": 
0.8913589119911194, + "learning_rate": 3.550020008003201e-05, + "loss": 0.5515, + "step": 7257 + }, + { + "epoch": 9.29024, + "grad_norm": 0.8833445310592651, + "learning_rate": 3.5498199279711884e-05, + "loss": 0.6179, + "step": 7258 + }, + { + "epoch": 9.29152, + "grad_norm": 0.8273428678512573, + "learning_rate": 3.5496198479391756e-05, + "loss": 0.5577, + "step": 7259 + }, + { + "epoch": 9.2928, + "grad_norm": 0.8162719011306763, + "learning_rate": 3.549419767907163e-05, + "loss": 0.5771, + "step": 7260 + }, + { + "epoch": 9.29408, + "grad_norm": 0.846421480178833, + "learning_rate": 3.549219687875151e-05, + "loss": 0.5642, + "step": 7261 + }, + { + "epoch": 9.29536, + "grad_norm": 0.8472880125045776, + "learning_rate": 3.549019607843138e-05, + "loss": 0.6061, + "step": 7262 + }, + { + "epoch": 9.29664, + "grad_norm": 0.8349382877349854, + "learning_rate": 3.548819527811125e-05, + "loss": 0.6095, + "step": 7263 + }, + { + "epoch": 9.29792, + "grad_norm": 0.8894979953765869, + "learning_rate": 3.5486194477791115e-05, + "loss": 0.6172, + "step": 7264 + }, + { + "epoch": 9.2992, + "grad_norm": 0.8571387529373169, + "learning_rate": 3.548419367747099e-05, + "loss": 0.5449, + "step": 7265 + }, + { + "epoch": 9.30048, + "grad_norm": 0.898956835269928, + "learning_rate": 3.548219287715086e-05, + "loss": 0.64, + "step": 7266 + }, + { + "epoch": 9.30176, + "grad_norm": 0.8103107213973999, + "learning_rate": 3.548019207683073e-05, + "loss": 0.5698, + "step": 7267 + }, + { + "epoch": 9.30304, + "grad_norm": 0.8312423825263977, + "learning_rate": 3.547819127651061e-05, + "loss": 0.612, + "step": 7268 + }, + { + "epoch": 9.30432, + "grad_norm": 0.7823015451431274, + "learning_rate": 3.547619047619048e-05, + "loss": 0.5439, + "step": 7269 + }, + { + "epoch": 9.3056, + "grad_norm": 0.8299819231033325, + "learning_rate": 3.547418967587035e-05, + "loss": 0.5612, + "step": 7270 + }, + { + "epoch": 9.30688, + "grad_norm": 0.8400328159332275, + "learning_rate": 
3.5472188875550225e-05, + "loss": 0.5443, + "step": 7271 + }, + { + "epoch": 9.30816, + "grad_norm": 0.7800350189208984, + "learning_rate": 3.547018807523009e-05, + "loss": 0.5261, + "step": 7272 + }, + { + "epoch": 9.30944, + "grad_norm": 0.847671389579773, + "learning_rate": 3.546818727490996e-05, + "loss": 0.5819, + "step": 7273 + }, + { + "epoch": 9.31072, + "grad_norm": 0.8879785537719727, + "learning_rate": 3.5466186474589834e-05, + "loss": 0.5905, + "step": 7274 + }, + { + "epoch": 9.312, + "grad_norm": 0.8334875702857971, + "learning_rate": 3.546418567426971e-05, + "loss": 0.5622, + "step": 7275 + }, + { + "epoch": 9.31328, + "grad_norm": 0.8510679006576538, + "learning_rate": 3.5462184873949584e-05, + "loss": 0.5825, + "step": 7276 + }, + { + "epoch": 9.31456, + "grad_norm": 0.8725741505622864, + "learning_rate": 3.5460184073629456e-05, + "loss": 0.6079, + "step": 7277 + }, + { + "epoch": 9.31584, + "grad_norm": 0.8539466261863708, + "learning_rate": 3.545818327330933e-05, + "loss": 0.5574, + "step": 7278 + }, + { + "epoch": 9.31712, + "grad_norm": 0.8324747681617737, + "learning_rate": 3.54561824729892e-05, + "loss": 0.5647, + "step": 7279 + }, + { + "epoch": 9.3184, + "grad_norm": 0.8533186912536621, + "learning_rate": 3.5454181672669065e-05, + "loss": 0.527, + "step": 7280 + }, + { + "epoch": 9.31968, + "grad_norm": 0.8378028273582458, + "learning_rate": 3.545218087234894e-05, + "loss": 0.5808, + "step": 7281 + }, + { + "epoch": 9.32096, + "grad_norm": 0.8582174777984619, + "learning_rate": 3.5450180072028816e-05, + "loss": 0.6096, + "step": 7282 + }, + { + "epoch": 9.32224, + "grad_norm": 0.8167418837547302, + "learning_rate": 3.544817927170869e-05, + "loss": 0.5209, + "step": 7283 + }, + { + "epoch": 9.32352, + "grad_norm": 0.8674497604370117, + "learning_rate": 3.544617847138856e-05, + "loss": 0.5823, + "step": 7284 + }, + { + "epoch": 9.3248, + "grad_norm": 0.8791519999504089, + "learning_rate": 3.544417767106843e-05, + "loss": 0.5943, + "step": 
7285 + }, + { + "epoch": 9.32608, + "grad_norm": 0.8264080882072449, + "learning_rate": 3.54421768707483e-05, + "loss": 0.5731, + "step": 7286 + }, + { + "epoch": 9.32736, + "grad_norm": 0.8929938673973083, + "learning_rate": 3.5440176070428175e-05, + "loss": 0.606, + "step": 7287 + }, + { + "epoch": 9.32864, + "grad_norm": 0.8631495237350464, + "learning_rate": 3.543817527010804e-05, + "loss": 0.5336, + "step": 7288 + }, + { + "epoch": 9.32992, + "grad_norm": 0.8779585957527161, + "learning_rate": 3.543617446978792e-05, + "loss": 0.5883, + "step": 7289 + }, + { + "epoch": 9.3312, + "grad_norm": 0.8582602739334106, + "learning_rate": 3.543417366946779e-05, + "loss": 0.5969, + "step": 7290 + }, + { + "epoch": 9.33248, + "grad_norm": 0.9014308452606201, + "learning_rate": 3.543217286914766e-05, + "loss": 0.5873, + "step": 7291 + }, + { + "epoch": 9.33376, + "grad_norm": 0.8432881832122803, + "learning_rate": 3.5430172068827534e-05, + "loss": 0.5814, + "step": 7292 + }, + { + "epoch": 9.33504, + "grad_norm": 0.8336278200149536, + "learning_rate": 3.5428171268507406e-05, + "loss": 0.528, + "step": 7293 + }, + { + "epoch": 9.33632, + "grad_norm": 0.8923518657684326, + "learning_rate": 3.542617046818728e-05, + "loss": 0.6088, + "step": 7294 + }, + { + "epoch": 9.3376, + "grad_norm": 0.843570351600647, + "learning_rate": 3.542416966786715e-05, + "loss": 0.5867, + "step": 7295 + }, + { + "epoch": 9.33888, + "grad_norm": 0.8679114580154419, + "learning_rate": 3.542216886754702e-05, + "loss": 0.5836, + "step": 7296 + }, + { + "epoch": 9.340160000000001, + "grad_norm": 0.8755765557289124, + "learning_rate": 3.5420168067226893e-05, + "loss": 0.6218, + "step": 7297 + }, + { + "epoch": 9.34144, + "grad_norm": 0.8723339438438416, + "learning_rate": 3.5418167266906765e-05, + "loss": 0.5534, + "step": 7298 + }, + { + "epoch": 9.34272, + "grad_norm": 0.8771824240684509, + "learning_rate": 3.541616646658664e-05, + "loss": 0.6106, + "step": 7299 + }, + { + "epoch": 9.344, + 
"grad_norm": 0.7724134922027588, + "learning_rate": 3.541416566626651e-05, + "loss": 0.5511, + "step": 7300 + }, + { + "epoch": 9.34528, + "grad_norm": 0.8475801348686218, + "learning_rate": 3.541216486594638e-05, + "loss": 0.5752, + "step": 7301 + }, + { + "epoch": 9.34656, + "grad_norm": 0.8831236958503723, + "learning_rate": 3.541016406562625e-05, + "loss": 0.6072, + "step": 7302 + }, + { + "epoch": 9.34784, + "grad_norm": 0.8372252583503723, + "learning_rate": 3.5408163265306125e-05, + "loss": 0.5337, + "step": 7303 + }, + { + "epoch": 9.34912, + "grad_norm": 0.810483992099762, + "learning_rate": 3.5406162464985996e-05, + "loss": 0.6066, + "step": 7304 + }, + { + "epoch": 9.3504, + "grad_norm": 0.8280951380729675, + "learning_rate": 3.540416166466587e-05, + "loss": 0.5439, + "step": 7305 + }, + { + "epoch": 9.35168, + "grad_norm": 0.8466097712516785, + "learning_rate": 3.540216086434574e-05, + "loss": 0.6007, + "step": 7306 + }, + { + "epoch": 9.35296, + "grad_norm": 0.8468445539474487, + "learning_rate": 3.540016006402561e-05, + "loss": 0.5211, + "step": 7307 + }, + { + "epoch": 9.35424, + "grad_norm": 0.8270046710968018, + "learning_rate": 3.5398159263705484e-05, + "loss": 0.6036, + "step": 7308 + }, + { + "epoch": 9.35552, + "grad_norm": 0.8338344693183899, + "learning_rate": 3.5396158463385356e-05, + "loss": 0.5664, + "step": 7309 + }, + { + "epoch": 9.3568, + "grad_norm": 0.8763896822929382, + "learning_rate": 3.539415766306523e-05, + "loss": 0.5601, + "step": 7310 + }, + { + "epoch": 9.35808, + "grad_norm": 0.8396816849708557, + "learning_rate": 3.53921568627451e-05, + "loss": 0.5643, + "step": 7311 + }, + { + "epoch": 9.35936, + "grad_norm": 0.8239136338233948, + "learning_rate": 3.539015606242497e-05, + "loss": 0.5644, + "step": 7312 + }, + { + "epoch": 9.36064, + "grad_norm": 0.8570470809936523, + "learning_rate": 3.538815526210484e-05, + "loss": 0.5655, + "step": 7313 + }, + { + "epoch": 9.36192, + "grad_norm": 0.8580249547958374, + "learning_rate": 
3.5386154461784715e-05, + "loss": 0.6384, + "step": 7314 + }, + { + "epoch": 9.3632, + "grad_norm": 0.8450345993041992, + "learning_rate": 3.538415366146459e-05, + "loss": 0.5922, + "step": 7315 + }, + { + "epoch": 9.36448, + "grad_norm": 0.8073413968086243, + "learning_rate": 3.538215286114446e-05, + "loss": 0.5208, + "step": 7316 + }, + { + "epoch": 9.36576, + "grad_norm": 0.8269434571266174, + "learning_rate": 3.538015206082434e-05, + "loss": 0.5674, + "step": 7317 + }, + { + "epoch": 9.36704, + "grad_norm": 0.831274688243866, + "learning_rate": 3.53781512605042e-05, + "loss": 0.5477, + "step": 7318 + }, + { + "epoch": 9.36832, + "grad_norm": 0.8366057872772217, + "learning_rate": 3.5376150460184074e-05, + "loss": 0.561, + "step": 7319 + }, + { + "epoch": 9.3696, + "grad_norm": 0.8297872543334961, + "learning_rate": 3.5374149659863946e-05, + "loss": 0.5819, + "step": 7320 + }, + { + "epoch": 9.37088, + "grad_norm": 0.8322229981422424, + "learning_rate": 3.537214885954382e-05, + "loss": 0.5315, + "step": 7321 + }, + { + "epoch": 9.37216, + "grad_norm": 0.8755327463150024, + "learning_rate": 3.537014805922369e-05, + "loss": 0.5705, + "step": 7322 + }, + { + "epoch": 9.37344, + "grad_norm": 0.8848744630813599, + "learning_rate": 3.536814725890356e-05, + "loss": 0.6089, + "step": 7323 + }, + { + "epoch": 9.37472, + "grad_norm": 0.7823454737663269, + "learning_rate": 3.5366146458583434e-05, + "loss": 0.5527, + "step": 7324 + }, + { + "epoch": 9.376, + "grad_norm": 0.8572680950164795, + "learning_rate": 3.536414565826331e-05, + "loss": 0.6162, + "step": 7325 + }, + { + "epoch": 9.37728, + "grad_norm": 0.8322939872741699, + "learning_rate": 3.536214485794318e-05, + "loss": 0.5732, + "step": 7326 + }, + { + "epoch": 9.37856, + "grad_norm": 0.87436842918396, + "learning_rate": 3.536014405762305e-05, + "loss": 0.6265, + "step": 7327 + }, + { + "epoch": 9.37984, + "grad_norm": 0.8274515867233276, + "learning_rate": 3.535814325730292e-05, + "loss": 0.5288, + "step": 7328 + 
}, + { + "epoch": 9.38112, + "grad_norm": 0.9130957126617432, + "learning_rate": 3.535614245698279e-05, + "loss": 0.6091, + "step": 7329 + }, + { + "epoch": 9.3824, + "grad_norm": 0.8239238858222961, + "learning_rate": 3.5354141656662665e-05, + "loss": 0.5281, + "step": 7330 + }, + { + "epoch": 9.38368, + "grad_norm": 0.8236826658248901, + "learning_rate": 3.5352140856342537e-05, + "loss": 0.5264, + "step": 7331 + }, + { + "epoch": 9.38496, + "grad_norm": 0.8105927109718323, + "learning_rate": 3.5350140056022415e-05, + "loss": 0.5647, + "step": 7332 + }, + { + "epoch": 9.38624, + "grad_norm": 0.8803057670593262, + "learning_rate": 3.534813925570229e-05, + "loss": 0.5641, + "step": 7333 + }, + { + "epoch": 9.38752, + "grad_norm": 0.8564363121986389, + "learning_rate": 3.534613845538215e-05, + "loss": 0.5985, + "step": 7334 + }, + { + "epoch": 9.3888, + "grad_norm": 0.8307512998580933, + "learning_rate": 3.5344137655062024e-05, + "loss": 0.5664, + "step": 7335 + }, + { + "epoch": 9.39008, + "grad_norm": 0.9065113067626953, + "learning_rate": 3.5342136854741896e-05, + "loss": 0.633, + "step": 7336 + }, + { + "epoch": 9.39136, + "grad_norm": 0.8214498162269592, + "learning_rate": 3.534013605442177e-05, + "loss": 0.5621, + "step": 7337 + }, + { + "epoch": 9.39264, + "grad_norm": 0.8665810823440552, + "learning_rate": 3.533813525410164e-05, + "loss": 0.6223, + "step": 7338 + }, + { + "epoch": 9.39392, + "grad_norm": 0.9425195455551147, + "learning_rate": 3.533613445378152e-05, + "loss": 0.6163, + "step": 7339 + }, + { + "epoch": 9.395199999999999, + "grad_norm": 0.8307021260261536, + "learning_rate": 3.533413365346139e-05, + "loss": 0.5604, + "step": 7340 + }, + { + "epoch": 9.39648, + "grad_norm": 0.8683647513389587, + "learning_rate": 3.533213285314126e-05, + "loss": 0.5811, + "step": 7341 + }, + { + "epoch": 9.39776, + "grad_norm": 0.8393630385398865, + "learning_rate": 3.533013205282113e-05, + "loss": 0.5444, + "step": 7342 + }, + { + "epoch": 9.39904, + "grad_norm": 
0.809565544128418, + "learning_rate": 3.5328131252501e-05, + "loss": 0.5328, + "step": 7343 + }, + { + "epoch": 9.40032, + "grad_norm": 0.8294006586074829, + "learning_rate": 3.532613045218087e-05, + "loss": 0.5563, + "step": 7344 + }, + { + "epoch": 9.4016, + "grad_norm": 0.8536190390586853, + "learning_rate": 3.532412965186074e-05, + "loss": 0.5752, + "step": 7345 + }, + { + "epoch": 9.40288, + "grad_norm": 0.8586856722831726, + "learning_rate": 3.532212885154062e-05, + "loss": 0.6095, + "step": 7346 + }, + { + "epoch": 9.40416, + "grad_norm": 0.9051749110221863, + "learning_rate": 3.532012805122049e-05, + "loss": 0.6253, + "step": 7347 + }, + { + "epoch": 9.40544, + "grad_norm": 0.8567177057266235, + "learning_rate": 3.5318127250900365e-05, + "loss": 0.6243, + "step": 7348 + }, + { + "epoch": 9.40672, + "grad_norm": 0.8297619819641113, + "learning_rate": 3.531612645058024e-05, + "loss": 0.6051, + "step": 7349 + }, + { + "epoch": 9.408, + "grad_norm": 0.8204346895217896, + "learning_rate": 3.53141256502601e-05, + "loss": 0.5123, + "step": 7350 + }, + { + "epoch": 9.40928, + "grad_norm": 0.8355170488357544, + "learning_rate": 3.5312124849939974e-05, + "loss": 0.5724, + "step": 7351 + }, + { + "epoch": 9.41056, + "grad_norm": 0.7993974089622498, + "learning_rate": 3.5310124049619846e-05, + "loss": 0.5733, + "step": 7352 + }, + { + "epoch": 9.41184, + "grad_norm": 0.846606969833374, + "learning_rate": 3.5308123249299724e-05, + "loss": 0.5533, + "step": 7353 + }, + { + "epoch": 9.41312, + "grad_norm": 0.8249667882919312, + "learning_rate": 3.5306122448979596e-05, + "loss": 0.5561, + "step": 7354 + }, + { + "epoch": 9.4144, + "grad_norm": 0.8175135850906372, + "learning_rate": 3.530412164865947e-05, + "loss": 0.557, + "step": 7355 + }, + { + "epoch": 9.41568, + "grad_norm": 0.855964183807373, + "learning_rate": 3.530212084833934e-05, + "loss": 0.5509, + "step": 7356 + }, + { + "epoch": 9.41696, + "grad_norm": 0.8171284198760986, + "learning_rate": 
3.530012004801921e-05, + "loss": 0.5227, + "step": 7357 + }, + { + "epoch": 9.41824, + "grad_norm": 0.8054320812225342, + "learning_rate": 3.529811924769908e-05, + "loss": 0.6215, + "step": 7358 + }, + { + "epoch": 9.41952, + "grad_norm": 0.8339996933937073, + "learning_rate": 3.529611844737895e-05, + "loss": 0.5524, + "step": 7359 + }, + { + "epoch": 9.4208, + "grad_norm": 0.8672724366188049, + "learning_rate": 3.529411764705883e-05, + "loss": 0.5946, + "step": 7360 + }, + { + "epoch": 9.42208, + "grad_norm": 0.799502968788147, + "learning_rate": 3.52921168467387e-05, + "loss": 0.5152, + "step": 7361 + }, + { + "epoch": 9.42336, + "grad_norm": 0.8360862731933594, + "learning_rate": 3.529011604641857e-05, + "loss": 0.5564, + "step": 7362 + }, + { + "epoch": 9.42464, + "grad_norm": 0.8385033011436462, + "learning_rate": 3.528811524609844e-05, + "loss": 0.5551, + "step": 7363 + }, + { + "epoch": 9.42592, + "grad_norm": 0.8228257894515991, + "learning_rate": 3.5286114445778315e-05, + "loss": 0.5351, + "step": 7364 + }, + { + "epoch": 9.4272, + "grad_norm": 0.8925015330314636, + "learning_rate": 3.5284113645458186e-05, + "loss": 0.6315, + "step": 7365 + }, + { + "epoch": 9.42848, + "grad_norm": 0.9042237997055054, + "learning_rate": 3.528211284513805e-05, + "loss": 0.6171, + "step": 7366 + }, + { + "epoch": 9.42976, + "grad_norm": 0.8735190033912659, + "learning_rate": 3.528011204481793e-05, + "loss": 0.581, + "step": 7367 + }, + { + "epoch": 9.43104, + "grad_norm": 0.8328735828399658, + "learning_rate": 3.52781112444978e-05, + "loss": 0.5541, + "step": 7368 + }, + { + "epoch": 9.43232, + "grad_norm": 0.8481590151786804, + "learning_rate": 3.5276110444177674e-05, + "loss": 0.5868, + "step": 7369 + }, + { + "epoch": 9.4336, + "grad_norm": 0.7884271144866943, + "learning_rate": 3.5274109643857546e-05, + "loss": 0.5346, + "step": 7370 + }, + { + "epoch": 9.43488, + "grad_norm": 0.8754736185073853, + "learning_rate": 3.527210884353742e-05, + "loss": 0.6171, + "step": 7371 
+ }, + { + "epoch": 9.43616, + "grad_norm": 0.8703823685646057, + "learning_rate": 3.527010804321729e-05, + "loss": 0.5973, + "step": 7372 + }, + { + "epoch": 9.43744, + "grad_norm": 0.869755208492279, + "learning_rate": 3.526810724289716e-05, + "loss": 0.5851, + "step": 7373 + }, + { + "epoch": 9.43872, + "grad_norm": 0.8053287267684937, + "learning_rate": 3.526610644257703e-05, + "loss": 0.5132, + "step": 7374 + }, + { + "epoch": 9.44, + "grad_norm": 0.853288471698761, + "learning_rate": 3.5264105642256905e-05, + "loss": 0.5325, + "step": 7375 + }, + { + "epoch": 9.44128, + "grad_norm": 0.8669641613960266, + "learning_rate": 3.526210484193678e-05, + "loss": 0.5699, + "step": 7376 + }, + { + "epoch": 9.44256, + "grad_norm": 0.8685311079025269, + "learning_rate": 3.526010404161665e-05, + "loss": 0.6092, + "step": 7377 + }, + { + "epoch": 9.44384, + "grad_norm": 0.8137316703796387, + "learning_rate": 3.525810324129652e-05, + "loss": 0.5549, + "step": 7378 + }, + { + "epoch": 9.44512, + "grad_norm": 0.8705141544342041, + "learning_rate": 3.525610244097639e-05, + "loss": 0.5925, + "step": 7379 + }, + { + "epoch": 9.4464, + "grad_norm": 0.7994008660316467, + "learning_rate": 3.5254101640656264e-05, + "loss": 0.5114, + "step": 7380 + }, + { + "epoch": 9.44768, + "grad_norm": 0.8135007619857788, + "learning_rate": 3.5252100840336136e-05, + "loss": 0.5546, + "step": 7381 + }, + { + "epoch": 9.44896, + "grad_norm": 0.8146666288375854, + "learning_rate": 3.525010004001601e-05, + "loss": 0.5699, + "step": 7382 + }, + { + "epoch": 9.45024, + "grad_norm": 0.8331019878387451, + "learning_rate": 3.524809923969588e-05, + "loss": 0.6037, + "step": 7383 + }, + { + "epoch": 9.45152, + "grad_norm": 0.8365830183029175, + "learning_rate": 3.524609843937575e-05, + "loss": 0.5718, + "step": 7384 + }, + { + "epoch": 9.4528, + "grad_norm": 0.8438559770584106, + "learning_rate": 3.5244097639055624e-05, + "loss": 0.5881, + "step": 7385 + }, + { + "epoch": 9.45408, + "grad_norm": 
0.8334947228431702, + "learning_rate": 3.5242096838735495e-05, + "loss": 0.5361, + "step": 7386 + }, + { + "epoch": 9.45536, + "grad_norm": 0.8178591132164001, + "learning_rate": 3.524009603841537e-05, + "loss": 0.5467, + "step": 7387 + }, + { + "epoch": 9.45664, + "grad_norm": 0.8228579759597778, + "learning_rate": 3.523809523809524e-05, + "loss": 0.549, + "step": 7388 + }, + { + "epoch": 9.45792, + "grad_norm": 0.8397416472434998, + "learning_rate": 3.523609443777511e-05, + "loss": 0.574, + "step": 7389 + }, + { + "epoch": 9.4592, + "grad_norm": 0.8239879012107849, + "learning_rate": 3.523409363745498e-05, + "loss": 0.5415, + "step": 7390 + }, + { + "epoch": 9.46048, + "grad_norm": 0.8690317273139954, + "learning_rate": 3.5232092837134855e-05, + "loss": 0.6039, + "step": 7391 + }, + { + "epoch": 9.46176, + "grad_norm": 0.7893775105476379, + "learning_rate": 3.523009203681473e-05, + "loss": 0.555, + "step": 7392 + }, + { + "epoch": 9.46304, + "grad_norm": 0.8786347508430481, + "learning_rate": 3.52280912364946e-05, + "loss": 0.5882, + "step": 7393 + }, + { + "epoch": 9.46432, + "grad_norm": 0.8203712105751038, + "learning_rate": 3.522609043617447e-05, + "loss": 0.5633, + "step": 7394 + }, + { + "epoch": 9.4656, + "grad_norm": 0.8264157176017761, + "learning_rate": 3.522408963585435e-05, + "loss": 0.548, + "step": 7395 + }, + { + "epoch": 9.46688, + "grad_norm": 0.7980100512504578, + "learning_rate": 3.5222088835534214e-05, + "loss": 0.5698, + "step": 7396 + }, + { + "epoch": 9.46816, + "grad_norm": 0.8063759803771973, + "learning_rate": 3.5220088035214086e-05, + "loss": 0.5541, + "step": 7397 + }, + { + "epoch": 9.46944, + "grad_norm": 0.8606681227684021, + "learning_rate": 3.521808723489396e-05, + "loss": 0.5892, + "step": 7398 + }, + { + "epoch": 9.47072, + "grad_norm": 0.8731750249862671, + "learning_rate": 3.521608643457383e-05, + "loss": 0.6104, + "step": 7399 + }, + { + "epoch": 9.472, + "grad_norm": 0.8435729742050171, + "learning_rate": 
3.52140856342537e-05, + "loss": 0.5474, + "step": 7400 + }, + { + "epoch": 9.47328, + "grad_norm": 0.8424503207206726, + "learning_rate": 3.521208483393357e-05, + "loss": 0.6079, + "step": 7401 + }, + { + "epoch": 9.47456, + "grad_norm": 0.83389812707901, + "learning_rate": 3.521008403361345e-05, + "loss": 0.5787, + "step": 7402 + }, + { + "epoch": 9.47584, + "grad_norm": 0.8542918562889099, + "learning_rate": 3.5208083233293324e-05, + "loss": 0.5622, + "step": 7403 + }, + { + "epoch": 9.47712, + "grad_norm": 0.8500902652740479, + "learning_rate": 3.520608243297319e-05, + "loss": 0.5805, + "step": 7404 + }, + { + "epoch": 9.4784, + "grad_norm": 0.8857991099357605, + "learning_rate": 3.520408163265306e-05, + "loss": 0.5924, + "step": 7405 + }, + { + "epoch": 9.47968, + "grad_norm": 0.8328712582588196, + "learning_rate": 3.520208083233293e-05, + "loss": 0.5427, + "step": 7406 + }, + { + "epoch": 9.48096, + "grad_norm": 0.8790886998176575, + "learning_rate": 3.5200080032012804e-05, + "loss": 0.6283, + "step": 7407 + }, + { + "epoch": 9.482240000000001, + "grad_norm": 0.8296170234680176, + "learning_rate": 3.5198079231692676e-05, + "loss": 0.5678, + "step": 7408 + }, + { + "epoch": 9.48352, + "grad_norm": 0.8877446055412292, + "learning_rate": 3.5196078431372555e-05, + "loss": 0.5913, + "step": 7409 + }, + { + "epoch": 9.4848, + "grad_norm": 0.8364286422729492, + "learning_rate": 3.519407763105243e-05, + "loss": 0.5662, + "step": 7410 + }, + { + "epoch": 9.48608, + "grad_norm": 0.8386290669441223, + "learning_rate": 3.51920768307323e-05, + "loss": 0.5586, + "step": 7411 + }, + { + "epoch": 9.48736, + "grad_norm": 0.8200079798698425, + "learning_rate": 3.5190076030412164e-05, + "loss": 0.5446, + "step": 7412 + }, + { + "epoch": 9.48864, + "grad_norm": 0.8406325578689575, + "learning_rate": 3.5188075230092036e-05, + "loss": 0.5378, + "step": 7413 + }, + { + "epoch": 9.48992, + "grad_norm": 0.8740529417991638, + "learning_rate": 3.518607442977191e-05, + "loss": 0.557, + 
"step": 7414 + }, + { + "epoch": 9.4912, + "grad_norm": 0.8621671199798584, + "learning_rate": 3.518407362945178e-05, + "loss": 0.5946, + "step": 7415 + }, + { + "epoch": 9.49248, + "grad_norm": 0.812969446182251, + "learning_rate": 3.518207282913166e-05, + "loss": 0.5282, + "step": 7416 + }, + { + "epoch": 9.49376, + "grad_norm": 0.8162618279457092, + "learning_rate": 3.518007202881153e-05, + "loss": 0.5539, + "step": 7417 + }, + { + "epoch": 9.49504, + "grad_norm": 0.879939079284668, + "learning_rate": 3.51780712284914e-05, + "loss": 0.5933, + "step": 7418 + }, + { + "epoch": 9.49632, + "grad_norm": 0.8177329897880554, + "learning_rate": 3.5176070428171274e-05, + "loss": 0.5697, + "step": 7419 + }, + { + "epoch": 9.4976, + "grad_norm": 0.7988497018814087, + "learning_rate": 3.517406962785114e-05, + "loss": 0.4975, + "step": 7420 + }, + { + "epoch": 9.49888, + "grad_norm": 0.8101351261138916, + "learning_rate": 3.517206882753101e-05, + "loss": 0.5755, + "step": 7421 + }, + { + "epoch": 9.50016, + "grad_norm": 0.9145906567573547, + "learning_rate": 3.517006802721088e-05, + "loss": 0.6038, + "step": 7422 + }, + { + "epoch": 9.50144, + "grad_norm": 0.8204926252365112, + "learning_rate": 3.516806722689076e-05, + "loss": 0.536, + "step": 7423 + }, + { + "epoch": 9.50272, + "grad_norm": 0.8429908156394958, + "learning_rate": 3.516606642657063e-05, + "loss": 0.5637, + "step": 7424 + }, + { + "epoch": 9.504, + "grad_norm": 0.8640322685241699, + "learning_rate": 3.5164065626250505e-05, + "loss": 0.6141, + "step": 7425 + }, + { + "epoch": 9.505279999999999, + "grad_norm": 0.9030240774154663, + "learning_rate": 3.5162064825930377e-05, + "loss": 0.5929, + "step": 7426 + }, + { + "epoch": 9.50656, + "grad_norm": 0.8568662405014038, + "learning_rate": 3.516006402561025e-05, + "loss": 0.5649, + "step": 7427 + }, + { + "epoch": 9.50784, + "grad_norm": 0.8583548069000244, + "learning_rate": 3.5158063225290113e-05, + "loss": 0.5824, + "step": 7428 + }, + { + "epoch": 9.50912, + 
"grad_norm": 0.8739598989486694, + "learning_rate": 3.5156062424969985e-05, + "loss": 0.5895, + "step": 7429 + }, + { + "epoch": 9.5104, + "grad_norm": 0.9026753306388855, + "learning_rate": 3.515406162464986e-05, + "loss": 0.6056, + "step": 7430 + }, + { + "epoch": 9.51168, + "grad_norm": 0.8631276488304138, + "learning_rate": 3.5152060824329736e-05, + "loss": 0.5616, + "step": 7431 + }, + { + "epoch": 9.51296, + "grad_norm": 0.833394467830658, + "learning_rate": 3.515006002400961e-05, + "loss": 0.5742, + "step": 7432 + }, + { + "epoch": 9.514240000000001, + "grad_norm": 0.8503471612930298, + "learning_rate": 3.514805922368948e-05, + "loss": 0.5484, + "step": 7433 + }, + { + "epoch": 9.51552, + "grad_norm": 0.8320133686065674, + "learning_rate": 3.514605842336935e-05, + "loss": 0.5725, + "step": 7434 + }, + { + "epoch": 9.5168, + "grad_norm": 0.8564555048942566, + "learning_rate": 3.514405762304922e-05, + "loss": 0.5475, + "step": 7435 + }, + { + "epoch": 9.51808, + "grad_norm": 0.8562284708023071, + "learning_rate": 3.514205682272909e-05, + "loss": 0.6121, + "step": 7436 + }, + { + "epoch": 9.51936, + "grad_norm": 0.8364126682281494, + "learning_rate": 3.514005602240896e-05, + "loss": 0.57, + "step": 7437 + }, + { + "epoch": 9.52064, + "grad_norm": 0.8533650636672974, + "learning_rate": 3.513805522208884e-05, + "loss": 0.6145, + "step": 7438 + }, + { + "epoch": 9.52192, + "grad_norm": 0.8603209257125854, + "learning_rate": 3.513605442176871e-05, + "loss": 0.5859, + "step": 7439 + }, + { + "epoch": 9.5232, + "grad_norm": 0.8335322141647339, + "learning_rate": 3.513405362144858e-05, + "loss": 0.5862, + "step": 7440 + }, + { + "epoch": 9.52448, + "grad_norm": 0.848399817943573, + "learning_rate": 3.5132052821128454e-05, + "loss": 0.572, + "step": 7441 + }, + { + "epoch": 9.52576, + "grad_norm": 0.7791329026222229, + "learning_rate": 3.5130052020808326e-05, + "loss": 0.5761, + "step": 7442 + }, + { + "epoch": 9.52704, + "grad_norm": 0.8725833892822266, + 
"learning_rate": 3.51280512204882e-05, + "loss": 0.6316, + "step": 7443 + }, + { + "epoch": 9.52832, + "grad_norm": 0.8655521869659424, + "learning_rate": 3.512605042016806e-05, + "loss": 0.5992, + "step": 7444 + }, + { + "epoch": 9.5296, + "grad_norm": 0.862616777420044, + "learning_rate": 3.512404961984794e-05, + "loss": 0.5501, + "step": 7445 + }, + { + "epoch": 9.53088, + "grad_norm": 0.8571741580963135, + "learning_rate": 3.5122048819527814e-05, + "loss": 0.5958, + "step": 7446 + }, + { + "epoch": 9.53216, + "grad_norm": 0.8498180508613586, + "learning_rate": 3.5120048019207686e-05, + "loss": 0.5891, + "step": 7447 + }, + { + "epoch": 9.53344, + "grad_norm": 0.829552412033081, + "learning_rate": 3.511804721888756e-05, + "loss": 0.5575, + "step": 7448 + }, + { + "epoch": 9.53472, + "grad_norm": 0.8693800568580627, + "learning_rate": 3.511604641856743e-05, + "loss": 0.6149, + "step": 7449 + }, + { + "epoch": 9.536, + "grad_norm": 0.7874047160148621, + "learning_rate": 3.51140456182473e-05, + "loss": 0.5286, + "step": 7450 + }, + { + "epoch": 9.537279999999999, + "grad_norm": 0.8254282474517822, + "learning_rate": 3.511204481792717e-05, + "loss": 0.581, + "step": 7451 + }, + { + "epoch": 9.53856, + "grad_norm": 0.8394695520401001, + "learning_rate": 3.5110044017607045e-05, + "loss": 0.5244, + "step": 7452 + }, + { + "epoch": 9.53984, + "grad_norm": 0.8637860417366028, + "learning_rate": 3.510804321728692e-05, + "loss": 0.6142, + "step": 7453 + }, + { + "epoch": 9.54112, + "grad_norm": 0.8084328770637512, + "learning_rate": 3.510604241696679e-05, + "loss": 0.529, + "step": 7454 + }, + { + "epoch": 9.5424, + "grad_norm": 0.8361181616783142, + "learning_rate": 3.510404161664666e-05, + "loss": 0.5863, + "step": 7455 + }, + { + "epoch": 9.54368, + "grad_norm": 0.8431722521781921, + "learning_rate": 3.510204081632653e-05, + "loss": 0.5477, + "step": 7456 + }, + { + "epoch": 9.54496, + "grad_norm": 0.868948221206665, + "learning_rate": 3.5100040016006404e-05, + "loss": 
0.6217, + "step": 7457 + }, + { + "epoch": 9.54624, + "grad_norm": 0.8233850002288818, + "learning_rate": 3.5098039215686276e-05, + "loss": 0.5369, + "step": 7458 + }, + { + "epoch": 9.54752, + "grad_norm": 0.8040074110031128, + "learning_rate": 3.509603841536615e-05, + "loss": 0.5127, + "step": 7459 + }, + { + "epoch": 9.5488, + "grad_norm": 0.8325415849685669, + "learning_rate": 3.509403761504602e-05, + "loss": 0.5572, + "step": 7460 + }, + { + "epoch": 9.55008, + "grad_norm": 0.8861798644065857, + "learning_rate": 3.509203681472589e-05, + "loss": 0.6159, + "step": 7461 + }, + { + "epoch": 9.55136, + "grad_norm": 0.8803839087486267, + "learning_rate": 3.509003601440576e-05, + "loss": 0.586, + "step": 7462 + }, + { + "epoch": 9.55264, + "grad_norm": 0.8698328733444214, + "learning_rate": 3.5088035214085635e-05, + "loss": 0.57, + "step": 7463 + }, + { + "epoch": 9.55392, + "grad_norm": 0.8476325273513794, + "learning_rate": 3.508603441376551e-05, + "loss": 0.5661, + "step": 7464 + }, + { + "epoch": 9.5552, + "grad_norm": 0.789214015007019, + "learning_rate": 3.508403361344538e-05, + "loss": 0.5412, + "step": 7465 + }, + { + "epoch": 9.55648, + "grad_norm": 0.851634681224823, + "learning_rate": 3.508203281312525e-05, + "loss": 0.5722, + "step": 7466 + }, + { + "epoch": 9.55776, + "grad_norm": 0.8242769241333008, + "learning_rate": 3.508003201280512e-05, + "loss": 0.5649, + "step": 7467 + }, + { + "epoch": 9.55904, + "grad_norm": 0.8163190484046936, + "learning_rate": 3.5078031212484995e-05, + "loss": 0.5694, + "step": 7468 + }, + { + "epoch": 9.56032, + "grad_norm": 0.802607536315918, + "learning_rate": 3.5076030412164866e-05, + "loss": 0.5583, + "step": 7469 + }, + { + "epoch": 9.5616, + "grad_norm": 0.8320533633232117, + "learning_rate": 3.507402961184474e-05, + "loss": 0.5443, + "step": 7470 + }, + { + "epoch": 9.56288, + "grad_norm": 0.8149722218513489, + "learning_rate": 3.507202881152461e-05, + "loss": 0.5603, + "step": 7471 + }, + { + "epoch": 9.56416, + 
"grad_norm": 0.8735612630844116, + "learning_rate": 3.507002801120448e-05, + "loss": 0.5986, + "step": 7472 + }, + { + "epoch": 9.56544, + "grad_norm": 0.8785853385925293, + "learning_rate": 3.506802721088436e-05, + "loss": 0.6058, + "step": 7473 + }, + { + "epoch": 9.56672, + "grad_norm": 0.8740175366401672, + "learning_rate": 3.5066026410564226e-05, + "loss": 0.5921, + "step": 7474 + }, + { + "epoch": 9.568, + "grad_norm": 0.8479311466217041, + "learning_rate": 3.50640256102441e-05, + "loss": 0.5683, + "step": 7475 + }, + { + "epoch": 9.56928, + "grad_norm": 0.786037266254425, + "learning_rate": 3.506202480992397e-05, + "loss": 0.5639, + "step": 7476 + }, + { + "epoch": 9.57056, + "grad_norm": 0.8478996753692627, + "learning_rate": 3.506002400960384e-05, + "loss": 0.5613, + "step": 7477 + }, + { + "epoch": 9.57184, + "grad_norm": 0.8028008341789246, + "learning_rate": 3.505802320928371e-05, + "loss": 0.4897, + "step": 7478 + }, + { + "epoch": 9.57312, + "grad_norm": 0.844845175743103, + "learning_rate": 3.5056022408963585e-05, + "loss": 0.5955, + "step": 7479 + }, + { + "epoch": 9.5744, + "grad_norm": 0.8595391511917114, + "learning_rate": 3.5054021608643464e-05, + "loss": 0.5998, + "step": 7480 + }, + { + "epoch": 9.57568, + "grad_norm": 0.860565185546875, + "learning_rate": 3.5052020808323335e-05, + "loss": 0.5901, + "step": 7481 + }, + { + "epoch": 9.57696, + "grad_norm": 0.8473759293556213, + "learning_rate": 3.50500200080032e-05, + "loss": 0.5797, + "step": 7482 + }, + { + "epoch": 9.57824, + "grad_norm": 0.8636260032653809, + "learning_rate": 3.504801920768307e-05, + "loss": 0.5983, + "step": 7483 + }, + { + "epoch": 9.57952, + "grad_norm": 0.8650203943252563, + "learning_rate": 3.5046018407362944e-05, + "loss": 0.5942, + "step": 7484 + }, + { + "epoch": 9.5808, + "grad_norm": 0.9003608226776123, + "learning_rate": 3.5044017607042816e-05, + "loss": 0.5908, + "step": 7485 + }, + { + "epoch": 9.58208, + "grad_norm": 0.8681960105895996, + "learning_rate": 
3.504201680672269e-05, + "loss": 0.6542, + "step": 7486 + }, + { + "epoch": 9.58336, + "grad_norm": 0.8483841419219971, + "learning_rate": 3.5040016006402567e-05, + "loss": 0.5657, + "step": 7487 + }, + { + "epoch": 9.58464, + "grad_norm": 0.894481897354126, + "learning_rate": 3.503801520608244e-05, + "loss": 0.6173, + "step": 7488 + }, + { + "epoch": 9.58592, + "grad_norm": 0.7843658328056335, + "learning_rate": 3.503601440576231e-05, + "loss": 0.527, + "step": 7489 + }, + { + "epoch": 9.5872, + "grad_norm": 0.8325011134147644, + "learning_rate": 3.5034013605442175e-05, + "loss": 0.5687, + "step": 7490 + }, + { + "epoch": 9.58848, + "grad_norm": 0.8706384301185608, + "learning_rate": 3.503201280512205e-05, + "loss": 0.6229, + "step": 7491 + }, + { + "epoch": 9.58976, + "grad_norm": 0.8439401388168335, + "learning_rate": 3.503001200480192e-05, + "loss": 0.565, + "step": 7492 + }, + { + "epoch": 9.59104, + "grad_norm": 0.8194774985313416, + "learning_rate": 3.502801120448179e-05, + "loss": 0.5327, + "step": 7493 + }, + { + "epoch": 9.59232, + "grad_norm": 0.8847520351409912, + "learning_rate": 3.502601040416167e-05, + "loss": 0.6109, + "step": 7494 + }, + { + "epoch": 9.5936, + "grad_norm": 0.8577123284339905, + "learning_rate": 3.502400960384154e-05, + "loss": 0.6078, + "step": 7495 + }, + { + "epoch": 9.59488, + "grad_norm": 0.8352939486503601, + "learning_rate": 3.502200880352141e-05, + "loss": 0.5655, + "step": 7496 + }, + { + "epoch": 9.59616, + "grad_norm": 0.8935215473175049, + "learning_rate": 3.5020008003201285e-05, + "loss": 0.6405, + "step": 7497 + }, + { + "epoch": 9.59744, + "grad_norm": 0.8329689502716064, + "learning_rate": 3.501800720288115e-05, + "loss": 0.5859, + "step": 7498 + }, + { + "epoch": 9.59872, + "grad_norm": 0.8347263932228088, + "learning_rate": 3.501600640256102e-05, + "loss": 0.5564, + "step": 7499 + }, + { + "epoch": 9.6, + "grad_norm": 0.8732603192329407, + "learning_rate": 3.5014005602240894e-05, + "loss": 0.6078, + "step": 7500 + 
}, + { + "epoch": 9.60128, + "grad_norm": 0.8412352800369263, + "learning_rate": 3.501200480192077e-05, + "loss": 0.5728, + "step": 7501 + }, + { + "epoch": 9.60256, + "grad_norm": 0.8667293787002563, + "learning_rate": 3.5010004001600644e-05, + "loss": 0.5968, + "step": 7502 + }, + { + "epoch": 9.60384, + "grad_norm": 0.8245180249214172, + "learning_rate": 3.5008003201280516e-05, + "loss": 0.5847, + "step": 7503 + }, + { + "epoch": 9.60512, + "grad_norm": 0.8028945326805115, + "learning_rate": 3.500600240096039e-05, + "loss": 0.5471, + "step": 7504 + }, + { + "epoch": 9.6064, + "grad_norm": 0.844236433506012, + "learning_rate": 3.500400160064026e-05, + "loss": 0.5414, + "step": 7505 + }, + { + "epoch": 9.60768, + "grad_norm": 0.8215577006340027, + "learning_rate": 3.5002000800320125e-05, + "loss": 0.5612, + "step": 7506 + }, + { + "epoch": 9.60896, + "grad_norm": 0.8602308034896851, + "learning_rate": 3.5e-05, + "loss": 0.6125, + "step": 7507 + }, + { + "epoch": 9.61024, + "grad_norm": 0.8574222922325134, + "learning_rate": 3.4997999199679876e-05, + "loss": 0.6139, + "step": 7508 + }, + { + "epoch": 9.61152, + "grad_norm": 0.8892394304275513, + "learning_rate": 3.499599839935975e-05, + "loss": 0.6059, + "step": 7509 + }, + { + "epoch": 9.6128, + "grad_norm": 0.8644453287124634, + "learning_rate": 3.499399759903962e-05, + "loss": 0.6204, + "step": 7510 + }, + { + "epoch": 9.61408, + "grad_norm": 0.8173637390136719, + "learning_rate": 3.499199679871949e-05, + "loss": 0.5308, + "step": 7511 + }, + { + "epoch": 9.61536, + "grad_norm": 0.8753309845924377, + "learning_rate": 3.498999599839936e-05, + "loss": 0.6066, + "step": 7512 + }, + { + "epoch": 9.61664, + "grad_norm": 0.8645505309104919, + "learning_rate": 3.4987995198079235e-05, + "loss": 0.5772, + "step": 7513 + }, + { + "epoch": 9.61792, + "grad_norm": 0.8549172878265381, + "learning_rate": 3.49859943977591e-05, + "loss": 0.5876, + "step": 7514 + }, + { + "epoch": 9.6192, + "grad_norm": 0.8866432309150696, + 
"learning_rate": 3.498399359743898e-05, + "loss": 0.5682, + "step": 7515 + }, + { + "epoch": 9.62048, + "grad_norm": 0.8328559398651123, + "learning_rate": 3.498199279711885e-05, + "loss": 0.568, + "step": 7516 + }, + { + "epoch": 9.62176, + "grad_norm": 0.8284956216812134, + "learning_rate": 3.497999199679872e-05, + "loss": 0.5592, + "step": 7517 + }, + { + "epoch": 9.62304, + "grad_norm": 0.8239821195602417, + "learning_rate": 3.4977991196478594e-05, + "loss": 0.6051, + "step": 7518 + }, + { + "epoch": 9.62432, + "grad_norm": 0.8453178405761719, + "learning_rate": 3.4975990396158466e-05, + "loss": 0.5632, + "step": 7519 + }, + { + "epoch": 9.6256, + "grad_norm": 0.8655393123626709, + "learning_rate": 3.497398959583834e-05, + "loss": 0.6542, + "step": 7520 + }, + { + "epoch": 9.62688, + "grad_norm": 0.8494473099708557, + "learning_rate": 3.497198879551821e-05, + "loss": 0.584, + "step": 7521 + }, + { + "epoch": 9.62816, + "grad_norm": 0.8517120480537415, + "learning_rate": 3.496998799519808e-05, + "loss": 0.5515, + "step": 7522 + }, + { + "epoch": 9.62944, + "grad_norm": 0.8242863416671753, + "learning_rate": 3.4967987194877953e-05, + "loss": 0.5267, + "step": 7523 + }, + { + "epoch": 9.63072, + "grad_norm": 0.8850992321968079, + "learning_rate": 3.4965986394557825e-05, + "loss": 0.6003, + "step": 7524 + }, + { + "epoch": 9.632, + "grad_norm": 0.8448317050933838, + "learning_rate": 3.49639855942377e-05, + "loss": 0.6187, + "step": 7525 + }, + { + "epoch": 9.63328, + "grad_norm": 0.8723029494285583, + "learning_rate": 3.496198479391757e-05, + "loss": 0.6129, + "step": 7526 + }, + { + "epoch": 9.63456, + "grad_norm": 0.8665981292724609, + "learning_rate": 3.495998399359744e-05, + "loss": 0.5949, + "step": 7527 + }, + { + "epoch": 9.63584, + "grad_norm": 0.8512018322944641, + "learning_rate": 3.495798319327731e-05, + "loss": 0.5528, + "step": 7528 + }, + { + "epoch": 9.63712, + "grad_norm": 0.8611388802528381, + "learning_rate": 3.4955982392957185e-05, + "loss": 
0.5936, + "step": 7529 + }, + { + "epoch": 9.6384, + "grad_norm": 0.865940272808075, + "learning_rate": 3.4953981592637056e-05, + "loss": 0.6168, + "step": 7530 + }, + { + "epoch": 9.63968, + "grad_norm": 0.8968199491500854, + "learning_rate": 3.495198079231693e-05, + "loss": 0.6163, + "step": 7531 + }, + { + "epoch": 9.64096, + "grad_norm": 0.8725362420082092, + "learning_rate": 3.49499799919968e-05, + "loss": 0.6209, + "step": 7532 + }, + { + "epoch": 9.64224, + "grad_norm": 0.8303427696228027, + "learning_rate": 3.494797919167667e-05, + "loss": 0.5585, + "step": 7533 + }, + { + "epoch": 9.64352, + "grad_norm": 0.8281154036521912, + "learning_rate": 3.4945978391356544e-05, + "loss": 0.5885, + "step": 7534 + }, + { + "epoch": 9.6448, + "grad_norm": 0.8495305776596069, + "learning_rate": 3.4943977591036416e-05, + "loss": 0.6193, + "step": 7535 + }, + { + "epoch": 9.64608, + "grad_norm": 0.8367983102798462, + "learning_rate": 3.494197679071629e-05, + "loss": 0.5942, + "step": 7536 + }, + { + "epoch": 9.64736, + "grad_norm": 0.8438671231269836, + "learning_rate": 3.493997599039616e-05, + "loss": 0.5856, + "step": 7537 + }, + { + "epoch": 9.64864, + "grad_norm": 0.8848779797554016, + "learning_rate": 3.493797519007603e-05, + "loss": 0.6449, + "step": 7538 + }, + { + "epoch": 9.64992, + "grad_norm": 0.7978121042251587, + "learning_rate": 3.49359743897559e-05, + "loss": 0.573, + "step": 7539 + }, + { + "epoch": 9.6512, + "grad_norm": 0.807237446308136, + "learning_rate": 3.4933973589435775e-05, + "loss": 0.5753, + "step": 7540 + }, + { + "epoch": 9.65248, + "grad_norm": 0.7662572264671326, + "learning_rate": 3.493197278911565e-05, + "loss": 0.5361, + "step": 7541 + }, + { + "epoch": 9.65376, + "grad_norm": 0.8328047394752502, + "learning_rate": 3.492997198879552e-05, + "loss": 0.5792, + "step": 7542 + }, + { + "epoch": 9.65504, + "grad_norm": 0.7806389331817627, + "learning_rate": 3.492797118847539e-05, + "loss": 0.5287, + "step": 7543 + }, + { + "epoch": 
9.656320000000001, + "grad_norm": 0.8597953915596008, + "learning_rate": 3.492597038815526e-05, + "loss": 0.5512, + "step": 7544 + }, + { + "epoch": 9.6576, + "grad_norm": 0.8092467784881592, + "learning_rate": 3.4923969587835134e-05, + "loss": 0.557, + "step": 7545 + }, + { + "epoch": 9.65888, + "grad_norm": 0.8465959429740906, + "learning_rate": 3.4921968787515006e-05, + "loss": 0.5901, + "step": 7546 + }, + { + "epoch": 9.66016, + "grad_norm": 0.8790938258171082, + "learning_rate": 3.491996798719488e-05, + "loss": 0.5985, + "step": 7547 + }, + { + "epoch": 9.66144, + "grad_norm": 0.8606868982315063, + "learning_rate": 3.491796718687475e-05, + "loss": 0.6082, + "step": 7548 + }, + { + "epoch": 9.66272, + "grad_norm": 0.8462215065956116, + "learning_rate": 3.491596638655462e-05, + "loss": 0.6326, + "step": 7549 + }, + { + "epoch": 9.664, + "grad_norm": 0.8208281993865967, + "learning_rate": 3.4913965586234494e-05, + "loss": 0.5717, + "step": 7550 + }, + { + "epoch": 9.66528, + "grad_norm": 0.8237935900688171, + "learning_rate": 3.491196478591437e-05, + "loss": 0.5861, + "step": 7551 + }, + { + "epoch": 9.66656, + "grad_norm": 0.8763028979301453, + "learning_rate": 3.490996398559424e-05, + "loss": 0.5965, + "step": 7552 + }, + { + "epoch": 9.66784, + "grad_norm": 0.7981544733047485, + "learning_rate": 3.490796318527411e-05, + "loss": 0.51, + "step": 7553 + }, + { + "epoch": 9.66912, + "grad_norm": 0.8098160624504089, + "learning_rate": 3.490596238495398e-05, + "loss": 0.5858, + "step": 7554 + }, + { + "epoch": 9.6704, + "grad_norm": 0.8530492782592773, + "learning_rate": 3.490396158463385e-05, + "loss": 0.5727, + "step": 7555 + }, + { + "epoch": 9.67168, + "grad_norm": 0.8604266047477722, + "learning_rate": 3.4901960784313725e-05, + "loss": 0.6063, + "step": 7556 + }, + { + "epoch": 9.67296, + "grad_norm": 0.8214526772499084, + "learning_rate": 3.4899959983993597e-05, + "loss": 0.549, + "step": 7557 + }, + { + "epoch": 9.67424, + "grad_norm": 0.7836440205574036, + 
"learning_rate": 3.4897959183673475e-05, + "loss": 0.5446, + "step": 7558 + }, + { + "epoch": 9.67552, + "grad_norm": 0.8219870924949646, + "learning_rate": 3.489595838335335e-05, + "loss": 0.5766, + "step": 7559 + }, + { + "epoch": 9.6768, + "grad_norm": 0.8548493385314941, + "learning_rate": 3.489395758303321e-05, + "loss": 0.5894, + "step": 7560 + }, + { + "epoch": 9.67808, + "grad_norm": 0.8581690788269043, + "learning_rate": 3.4891956782713084e-05, + "loss": 0.549, + "step": 7561 + }, + { + "epoch": 9.679359999999999, + "grad_norm": 0.858836829662323, + "learning_rate": 3.4889955982392956e-05, + "loss": 0.5783, + "step": 7562 + }, + { + "epoch": 9.68064, + "grad_norm": 0.840040385723114, + "learning_rate": 3.488795518207283e-05, + "loss": 0.6081, + "step": 7563 + }, + { + "epoch": 9.68192, + "grad_norm": 0.8569283485412598, + "learning_rate": 3.48859543817527e-05, + "loss": 0.61, + "step": 7564 + }, + { + "epoch": 9.6832, + "grad_norm": 0.843917191028595, + "learning_rate": 3.488395358143258e-05, + "loss": 0.5821, + "step": 7565 + }, + { + "epoch": 9.68448, + "grad_norm": 0.8357688784599304, + "learning_rate": 3.488195278111245e-05, + "loss": 0.5624, + "step": 7566 + }, + { + "epoch": 9.68576, + "grad_norm": 0.8503354787826538, + "learning_rate": 3.487995198079232e-05, + "loss": 0.6566, + "step": 7567 + }, + { + "epoch": 9.68704, + "grad_norm": 0.8254701495170593, + "learning_rate": 3.487795118047219e-05, + "loss": 0.5381, + "step": 7568 + }, + { + "epoch": 9.688320000000001, + "grad_norm": 0.899823784828186, + "learning_rate": 3.487595038015206e-05, + "loss": 0.578, + "step": 7569 + }, + { + "epoch": 9.6896, + "grad_norm": 0.8426713943481445, + "learning_rate": 3.487394957983193e-05, + "loss": 0.5818, + "step": 7570 + }, + { + "epoch": 9.69088, + "grad_norm": 0.846393346786499, + "learning_rate": 3.48719487795118e-05, + "loss": 0.5821, + "step": 7571 + }, + { + "epoch": 9.69216, + "grad_norm": 0.8248345255851746, + "learning_rate": 3.486994797919168e-05, + 
"loss": 0.5701, + "step": 7572 + }, + { + "epoch": 9.69344, + "grad_norm": 0.8395203948020935, + "learning_rate": 3.486794717887155e-05, + "loss": 0.5862, + "step": 7573 + }, + { + "epoch": 9.69472, + "grad_norm": 0.8293047547340393, + "learning_rate": 3.4865946378551425e-05, + "loss": 0.5585, + "step": 7574 + }, + { + "epoch": 9.696, + "grad_norm": 0.848100483417511, + "learning_rate": 3.48639455782313e-05, + "loss": 0.5687, + "step": 7575 + }, + { + "epoch": 9.69728, + "grad_norm": 0.8797847032546997, + "learning_rate": 3.486194477791116e-05, + "loss": 0.6331, + "step": 7576 + }, + { + "epoch": 9.69856, + "grad_norm": 0.7995453476905823, + "learning_rate": 3.4859943977591034e-05, + "loss": 0.5394, + "step": 7577 + }, + { + "epoch": 9.69984, + "grad_norm": 0.8336317539215088, + "learning_rate": 3.4857943177270906e-05, + "loss": 0.6146, + "step": 7578 + }, + { + "epoch": 9.70112, + "grad_norm": 0.8199133276939392, + "learning_rate": 3.4855942376950784e-05, + "loss": 0.596, + "step": 7579 + }, + { + "epoch": 9.7024, + "grad_norm": 0.8550775647163391, + "learning_rate": 3.4853941576630656e-05, + "loss": 0.5881, + "step": 7580 + }, + { + "epoch": 9.70368, + "grad_norm": 0.8737037777900696, + "learning_rate": 3.485194077631053e-05, + "loss": 0.6125, + "step": 7581 + }, + { + "epoch": 9.70496, + "grad_norm": 0.8511414527893066, + "learning_rate": 3.48499399759904e-05, + "loss": 0.5995, + "step": 7582 + }, + { + "epoch": 9.70624, + "grad_norm": 0.8362718224525452, + "learning_rate": 3.484793917567027e-05, + "loss": 0.5418, + "step": 7583 + }, + { + "epoch": 9.70752, + "grad_norm": 0.8965070247650146, + "learning_rate": 3.484593837535014e-05, + "loss": 0.6493, + "step": 7584 + }, + { + "epoch": 9.7088, + "grad_norm": 0.8419719338417053, + "learning_rate": 3.484393757503001e-05, + "loss": 0.5813, + "step": 7585 + }, + { + "epoch": 9.71008, + "grad_norm": 0.903337299823761, + "learning_rate": 3.484193677470989e-05, + "loss": 0.6155, + "step": 7586 + }, + { + "epoch": 
9.711359999999999, + "grad_norm": 0.8451673984527588, + "learning_rate": 3.483993597438976e-05, + "loss": 0.5438, + "step": 7587 + }, + { + "epoch": 9.71264, + "grad_norm": 0.8772710561752319, + "learning_rate": 3.483793517406963e-05, + "loss": 0.5555, + "step": 7588 + }, + { + "epoch": 9.71392, + "grad_norm": 0.8326647877693176, + "learning_rate": 3.48359343737495e-05, + "loss": 0.6242, + "step": 7589 + }, + { + "epoch": 9.7152, + "grad_norm": 0.8710974454879761, + "learning_rate": 3.4833933573429375e-05, + "loss": 0.6274, + "step": 7590 + }, + { + "epoch": 9.71648, + "grad_norm": 0.8127773404121399, + "learning_rate": 3.4831932773109246e-05, + "loss": 0.6033, + "step": 7591 + }, + { + "epoch": 9.71776, + "grad_norm": 0.7887704372406006, + "learning_rate": 3.482993197278911e-05, + "loss": 0.5284, + "step": 7592 + }, + { + "epoch": 9.71904, + "grad_norm": 0.8618784546852112, + "learning_rate": 3.482793117246899e-05, + "loss": 0.6029, + "step": 7593 + }, + { + "epoch": 9.72032, + "grad_norm": 0.8521010875701904, + "learning_rate": 3.482593037214886e-05, + "loss": 0.5912, + "step": 7594 + }, + { + "epoch": 9.7216, + "grad_norm": 0.8535512685775757, + "learning_rate": 3.4823929571828734e-05, + "loss": 0.5922, + "step": 7595 + }, + { + "epoch": 9.72288, + "grad_norm": 0.8280578255653381, + "learning_rate": 3.4821928771508606e-05, + "loss": 0.5867, + "step": 7596 + }, + { + "epoch": 9.72416, + "grad_norm": 0.7792149782180786, + "learning_rate": 3.481992797118848e-05, + "loss": 0.5363, + "step": 7597 + }, + { + "epoch": 9.72544, + "grad_norm": 0.7704604864120483, + "learning_rate": 3.481792717086835e-05, + "loss": 0.5437, + "step": 7598 + }, + { + "epoch": 9.72672, + "grad_norm": 0.819395899772644, + "learning_rate": 3.481592637054822e-05, + "loss": 0.5802, + "step": 7599 + }, + { + "epoch": 9.728, + "grad_norm": 0.83621746301651, + "learning_rate": 3.481392557022809e-05, + "loss": 0.579, + "step": 7600 + }, + { + "epoch": 9.72928, + "grad_norm": 0.8944879770278931, + 
"learning_rate": 3.4811924769907965e-05, + "loss": 0.6284, + "step": 7601 + }, + { + "epoch": 9.73056, + "grad_norm": 0.8720524311065674, + "learning_rate": 3.480992396958784e-05, + "loss": 0.5917, + "step": 7602 + }, + { + "epoch": 9.73184, + "grad_norm": 0.877223789691925, + "learning_rate": 3.480792316926771e-05, + "loss": 0.591, + "step": 7603 + }, + { + "epoch": 9.73312, + "grad_norm": 0.9013481140136719, + "learning_rate": 3.480592236894758e-05, + "loss": 0.6077, + "step": 7604 + }, + { + "epoch": 9.7344, + "grad_norm": 0.8513638973236084, + "learning_rate": 3.480392156862745e-05, + "loss": 0.5542, + "step": 7605 + }, + { + "epoch": 9.73568, + "grad_norm": 0.862791121006012, + "learning_rate": 3.4801920768307324e-05, + "loss": 0.5792, + "step": 7606 + }, + { + "epoch": 9.73696, + "grad_norm": 0.8830035328865051, + "learning_rate": 3.4799919967987196e-05, + "loss": 0.5655, + "step": 7607 + }, + { + "epoch": 9.73824, + "grad_norm": 0.8427029252052307, + "learning_rate": 3.479791916766707e-05, + "loss": 0.5665, + "step": 7608 + }, + { + "epoch": 9.73952, + "grad_norm": 0.8503885269165039, + "learning_rate": 3.479591836734694e-05, + "loss": 0.5891, + "step": 7609 + }, + { + "epoch": 9.7408, + "grad_norm": 0.8855993151664734, + "learning_rate": 3.479391756702681e-05, + "loss": 0.603, + "step": 7610 + }, + { + "epoch": 9.74208, + "grad_norm": 0.8236071467399597, + "learning_rate": 3.4791916766706684e-05, + "loss": 0.5416, + "step": 7611 + }, + { + "epoch": 9.74336, + "grad_norm": 0.8632338643074036, + "learning_rate": 3.4789915966386555e-05, + "loss": 0.572, + "step": 7612 + }, + { + "epoch": 9.74464, + "grad_norm": 0.8438730239868164, + "learning_rate": 3.478791516606643e-05, + "loss": 0.5588, + "step": 7613 + }, + { + "epoch": 9.74592, + "grad_norm": 0.8554427027702332, + "learning_rate": 3.4785914365746306e-05, + "loss": 0.596, + "step": 7614 + }, + { + "epoch": 9.7472, + "grad_norm": 0.8517599701881409, + "learning_rate": 3.478391356542617e-05, + "loss": 
0.5597, + "step": 7615 + }, + { + "epoch": 9.74848, + "grad_norm": 0.8286646604537964, + "learning_rate": 3.478191276510604e-05, + "loss": 0.5932, + "step": 7616 + }, + { + "epoch": 9.74976, + "grad_norm": 0.7939943075180054, + "learning_rate": 3.4779911964785915e-05, + "loss": 0.5925, + "step": 7617 + }, + { + "epoch": 9.75104, + "grad_norm": 0.8272203803062439, + "learning_rate": 3.4777911164465787e-05, + "loss": 0.6005, + "step": 7618 + }, + { + "epoch": 9.75232, + "grad_norm": 0.8649276494979858, + "learning_rate": 3.477591036414566e-05, + "loss": 0.5847, + "step": 7619 + }, + { + "epoch": 9.7536, + "grad_norm": 0.7952982783317566, + "learning_rate": 3.477390956382553e-05, + "loss": 0.5359, + "step": 7620 + }, + { + "epoch": 9.75488, + "grad_norm": 0.8271358609199524, + "learning_rate": 3.477190876350541e-05, + "loss": 0.5964, + "step": 7621 + }, + { + "epoch": 9.75616, + "grad_norm": 0.8045120239257812, + "learning_rate": 3.476990796318528e-05, + "loss": 0.5676, + "step": 7622 + }, + { + "epoch": 9.75744, + "grad_norm": 0.7775219082832336, + "learning_rate": 3.4767907162865146e-05, + "loss": 0.5519, + "step": 7623 + }, + { + "epoch": 9.75872, + "grad_norm": 0.8189329504966736, + "learning_rate": 3.476590636254502e-05, + "loss": 0.5834, + "step": 7624 + }, + { + "epoch": 9.76, + "grad_norm": 0.8617966771125793, + "learning_rate": 3.476390556222489e-05, + "loss": 0.5863, + "step": 7625 + }, + { + "epoch": 9.76128, + "grad_norm": 0.8166400790214539, + "learning_rate": 3.476190476190476e-05, + "loss": 0.5754, + "step": 7626 + }, + { + "epoch": 9.76256, + "grad_norm": 0.8374921679496765, + "learning_rate": 3.475990396158463e-05, + "loss": 0.557, + "step": 7627 + }, + { + "epoch": 9.76384, + "grad_norm": 0.7682600617408752, + "learning_rate": 3.475790316126451e-05, + "loss": 0.5077, + "step": 7628 + }, + { + "epoch": 9.76512, + "grad_norm": 0.8810880780220032, + "learning_rate": 3.4755902360944384e-05, + "loss": 0.6657, + "step": 7629 + }, + { + "epoch": 9.7664, + 
"grad_norm": 0.8713139295578003, + "learning_rate": 3.4753901560624256e-05, + "loss": 0.6198, + "step": 7630 + }, + { + "epoch": 9.76768, + "grad_norm": 0.858737051486969, + "learning_rate": 3.475190076030412e-05, + "loss": 0.5409, + "step": 7631 + }, + { + "epoch": 9.76896, + "grad_norm": 0.7983132004737854, + "learning_rate": 3.474989995998399e-05, + "loss": 0.5621, + "step": 7632 + }, + { + "epoch": 9.77024, + "grad_norm": 0.8178649544715881, + "learning_rate": 3.4747899159663864e-05, + "loss": 0.5632, + "step": 7633 + }, + { + "epoch": 9.77152, + "grad_norm": 0.8813498020172119, + "learning_rate": 3.4745898359343736e-05, + "loss": 0.6399, + "step": 7634 + }, + { + "epoch": 9.7728, + "grad_norm": 0.8513997197151184, + "learning_rate": 3.4743897559023615e-05, + "loss": 0.5805, + "step": 7635 + }, + { + "epoch": 9.77408, + "grad_norm": 0.8140328526496887, + "learning_rate": 3.474189675870349e-05, + "loss": 0.534, + "step": 7636 + }, + { + "epoch": 9.77536, + "grad_norm": 0.8354536890983582, + "learning_rate": 3.473989595838336e-05, + "loss": 0.5771, + "step": 7637 + }, + { + "epoch": 9.77664, + "grad_norm": 0.8029643297195435, + "learning_rate": 3.473789515806323e-05, + "loss": 0.527, + "step": 7638 + }, + { + "epoch": 9.77792, + "grad_norm": 0.8642273545265198, + "learning_rate": 3.4735894357743096e-05, + "loss": 0.591, + "step": 7639 + }, + { + "epoch": 9.7792, + "grad_norm": 0.8148753643035889, + "learning_rate": 3.473389355742297e-05, + "loss": 0.5682, + "step": 7640 + }, + { + "epoch": 9.78048, + "grad_norm": 0.8324840664863586, + "learning_rate": 3.473189275710284e-05, + "loss": 0.5479, + "step": 7641 + }, + { + "epoch": 9.78176, + "grad_norm": 0.911037027835846, + "learning_rate": 3.472989195678272e-05, + "loss": 0.6622, + "step": 7642 + }, + { + "epoch": 9.78304, + "grad_norm": 0.847952663898468, + "learning_rate": 3.472789115646259e-05, + "loss": 0.5716, + "step": 7643 + }, + { + "epoch": 9.78432, + "grad_norm": 0.8314871191978455, + "learning_rate": 
3.472589035614246e-05, + "loss": 0.626, + "step": 7644 + }, + { + "epoch": 9.7856, + "grad_norm": 0.8641082048416138, + "learning_rate": 3.4723889555822333e-05, + "loss": 0.5465, + "step": 7645 + }, + { + "epoch": 9.78688, + "grad_norm": 0.834967851638794, + "learning_rate": 3.4721888755502205e-05, + "loss": 0.5653, + "step": 7646 + }, + { + "epoch": 9.78816, + "grad_norm": 0.8525474667549133, + "learning_rate": 3.471988795518207e-05, + "loss": 0.5715, + "step": 7647 + }, + { + "epoch": 9.78944, + "grad_norm": 0.8522567749023438, + "learning_rate": 3.471788715486194e-05, + "loss": 0.5699, + "step": 7648 + }, + { + "epoch": 9.79072, + "grad_norm": 0.8890289068222046, + "learning_rate": 3.471588635454182e-05, + "loss": 0.6542, + "step": 7649 + }, + { + "epoch": 9.792, + "grad_norm": 0.8257919549942017, + "learning_rate": 3.471388555422169e-05, + "loss": 0.5972, + "step": 7650 + }, + { + "epoch": 9.79328, + "grad_norm": 0.8120242357254028, + "learning_rate": 3.4711884753901565e-05, + "loss": 0.5841, + "step": 7651 + }, + { + "epoch": 9.79456, + "grad_norm": 0.8151296973228455, + "learning_rate": 3.4709883953581436e-05, + "loss": 0.5583, + "step": 7652 + }, + { + "epoch": 9.79584, + "grad_norm": 0.8686169981956482, + "learning_rate": 3.470788315326131e-05, + "loss": 0.5816, + "step": 7653 + }, + { + "epoch": 9.79712, + "grad_norm": 0.871562123298645, + "learning_rate": 3.470588235294118e-05, + "loss": 0.5792, + "step": 7654 + }, + { + "epoch": 9.7984, + "grad_norm": 0.8428971767425537, + "learning_rate": 3.4703881552621045e-05, + "loss": 0.517, + "step": 7655 + }, + { + "epoch": 9.79968, + "grad_norm": 0.8845090866088867, + "learning_rate": 3.470188075230092e-05, + "loss": 0.6303, + "step": 7656 + }, + { + "epoch": 9.80096, + "grad_norm": 0.8464072942733765, + "learning_rate": 3.4699879951980796e-05, + "loss": 0.5476, + "step": 7657 + }, + { + "epoch": 9.80224, + "grad_norm": 0.871966540813446, + "learning_rate": 3.469787915166067e-05, + "loss": 0.5654, + "step": 7658 
+ }, + { + "epoch": 9.80352, + "grad_norm": 0.811346173286438, + "learning_rate": 3.469587835134054e-05, + "loss": 0.5738, + "step": 7659 + }, + { + "epoch": 9.8048, + "grad_norm": 0.8650268316268921, + "learning_rate": 3.469387755102041e-05, + "loss": 0.6266, + "step": 7660 + }, + { + "epoch": 9.80608, + "grad_norm": 0.7988438010215759, + "learning_rate": 3.469187675070028e-05, + "loss": 0.5255, + "step": 7661 + }, + { + "epoch": 9.80736, + "grad_norm": 0.8700106739997864, + "learning_rate": 3.4689875950380155e-05, + "loss": 0.6087, + "step": 7662 + }, + { + "epoch": 9.80864, + "grad_norm": 0.8239459991455078, + "learning_rate": 3.468787515006002e-05, + "loss": 0.5602, + "step": 7663 + }, + { + "epoch": 9.80992, + "grad_norm": 0.9134540557861328, + "learning_rate": 3.46858743497399e-05, + "loss": 0.5968, + "step": 7664 + }, + { + "epoch": 9.8112, + "grad_norm": 0.9253342747688293, + "learning_rate": 3.468387354941977e-05, + "loss": 0.6128, + "step": 7665 + }, + { + "epoch": 9.81248, + "grad_norm": 0.7687677145004272, + "learning_rate": 3.468187274909964e-05, + "loss": 0.5191, + "step": 7666 + }, + { + "epoch": 9.81376, + "grad_norm": 0.8132002949714661, + "learning_rate": 3.4679871948779514e-05, + "loss": 0.5736, + "step": 7667 + }, + { + "epoch": 9.81504, + "grad_norm": 0.8032669425010681, + "learning_rate": 3.4677871148459386e-05, + "loss": 0.5311, + "step": 7668 + }, + { + "epoch": 9.81632, + "grad_norm": 0.8530933260917664, + "learning_rate": 3.467587034813926e-05, + "loss": 0.5699, + "step": 7669 + }, + { + "epoch": 9.8176, + "grad_norm": 0.8100473880767822, + "learning_rate": 3.467386954781913e-05, + "loss": 0.5431, + "step": 7670 + }, + { + "epoch": 9.81888, + "grad_norm": 0.8088340759277344, + "learning_rate": 3.4671868747499e-05, + "loss": 0.5454, + "step": 7671 + }, + { + "epoch": 9.82016, + "grad_norm": 0.8359904289245605, + "learning_rate": 3.4669867947178874e-05, + "loss": 0.567, + "step": 7672 + }, + { + "epoch": 9.821439999999999, + "grad_norm": 
0.8769454956054688, + "learning_rate": 3.4667867146858745e-05, + "loss": 0.5966, + "step": 7673 + }, + { + "epoch": 9.82272, + "grad_norm": 0.8174176812171936, + "learning_rate": 3.466586634653862e-05, + "loss": 0.5452, + "step": 7674 + }, + { + "epoch": 9.824, + "grad_norm": 0.8501536846160889, + "learning_rate": 3.466386554621849e-05, + "loss": 0.5929, + "step": 7675 + }, + { + "epoch": 9.82528, + "grad_norm": 0.8151215314865112, + "learning_rate": 3.466186474589836e-05, + "loss": 0.552, + "step": 7676 + }, + { + "epoch": 9.82656, + "grad_norm": 0.8890595436096191, + "learning_rate": 3.465986394557823e-05, + "loss": 0.6142, + "step": 7677 + }, + { + "epoch": 9.82784, + "grad_norm": 0.8509846329689026, + "learning_rate": 3.4657863145258105e-05, + "loss": 0.6358, + "step": 7678 + }, + { + "epoch": 9.82912, + "grad_norm": 0.8409616947174072, + "learning_rate": 3.465586234493798e-05, + "loss": 0.5594, + "step": 7679 + }, + { + "epoch": 9.830400000000001, + "grad_norm": 0.8827511072158813, + "learning_rate": 3.465386154461785e-05, + "loss": 0.6054, + "step": 7680 + }, + { + "epoch": 9.83168, + "grad_norm": 0.8123102784156799, + "learning_rate": 3.465186074429772e-05, + "loss": 0.5833, + "step": 7681 + }, + { + "epoch": 9.83296, + "grad_norm": 0.8210862874984741, + "learning_rate": 3.464985994397759e-05, + "loss": 0.6082, + "step": 7682 + }, + { + "epoch": 9.83424, + "grad_norm": 0.8209624886512756, + "learning_rate": 3.4647859143657464e-05, + "loss": 0.615, + "step": 7683 + }, + { + "epoch": 9.83552, + "grad_norm": 0.8852274417877197, + "learning_rate": 3.4645858343337336e-05, + "loss": 0.6337, + "step": 7684 + }, + { + "epoch": 9.8368, + "grad_norm": 0.8162413239479065, + "learning_rate": 3.464385754301721e-05, + "loss": 0.5424, + "step": 7685 + }, + { + "epoch": 9.83808, + "grad_norm": 0.8327953219413757, + "learning_rate": 3.464185674269708e-05, + "loss": 0.5759, + "step": 7686 + }, + { + "epoch": 9.83936, + "grad_norm": 0.8537192940711975, + "learning_rate": 
3.463985594237695e-05, + "loss": 0.5714, + "step": 7687 + }, + { + "epoch": 9.84064, + "grad_norm": 0.8469899892807007, + "learning_rate": 3.463785514205682e-05, + "loss": 0.6088, + "step": 7688 + }, + { + "epoch": 9.84192, + "grad_norm": 0.8121939301490784, + "learning_rate": 3.4635854341736695e-05, + "loss": 0.5807, + "step": 7689 + }, + { + "epoch": 9.8432, + "grad_norm": 0.8225154280662537, + "learning_rate": 3.463385354141657e-05, + "loss": 0.5833, + "step": 7690 + }, + { + "epoch": 9.84448, + "grad_norm": 0.8836968541145325, + "learning_rate": 3.463185274109644e-05, + "loss": 0.5574, + "step": 7691 + }, + { + "epoch": 9.84576, + "grad_norm": 0.8945218920707703, + "learning_rate": 3.462985194077632e-05, + "loss": 0.6273, + "step": 7692 + }, + { + "epoch": 9.84704, + "grad_norm": 0.8279548287391663, + "learning_rate": 3.462785114045618e-05, + "loss": 0.5263, + "step": 7693 + }, + { + "epoch": 9.84832, + "grad_norm": 0.8554354906082153, + "learning_rate": 3.4625850340136054e-05, + "loss": 0.5709, + "step": 7694 + }, + { + "epoch": 9.8496, + "grad_norm": 0.8173097372055054, + "learning_rate": 3.4623849539815926e-05, + "loss": 0.5876, + "step": 7695 + }, + { + "epoch": 9.85088, + "grad_norm": 0.8793294429779053, + "learning_rate": 3.46218487394958e-05, + "loss": 0.627, + "step": 7696 + }, + { + "epoch": 9.85216, + "grad_norm": 0.7973728775978088, + "learning_rate": 3.461984793917567e-05, + "loss": 0.5243, + "step": 7697 + }, + { + "epoch": 9.853439999999999, + "grad_norm": 0.8718621730804443, + "learning_rate": 3.461784713885554e-05, + "loss": 0.5843, + "step": 7698 + }, + { + "epoch": 9.85472, + "grad_norm": 0.8487060070037842, + "learning_rate": 3.461584633853542e-05, + "loss": 0.5407, + "step": 7699 + }, + { + "epoch": 9.856, + "grad_norm": 0.8882485032081604, + "learning_rate": 3.461384553821529e-05, + "loss": 0.6204, + "step": 7700 + }, + { + "epoch": 9.85728, + "grad_norm": 0.8720253705978394, + "learning_rate": 3.461184473789516e-05, + "loss": 0.5962, + 
"step": 7701 + }, + { + "epoch": 9.85856, + "grad_norm": 0.8400206565856934, + "learning_rate": 3.460984393757503e-05, + "loss": 0.5695, + "step": 7702 + }, + { + "epoch": 9.85984, + "grad_norm": 0.9012857675552368, + "learning_rate": 3.46078431372549e-05, + "loss": 0.5909, + "step": 7703 + }, + { + "epoch": 9.86112, + "grad_norm": 0.8660620450973511, + "learning_rate": 3.460584233693477e-05, + "loss": 0.5646, + "step": 7704 + }, + { + "epoch": 9.862400000000001, + "grad_norm": 0.8514848947525024, + "learning_rate": 3.4603841536614645e-05, + "loss": 0.5869, + "step": 7705 + }, + { + "epoch": 9.86368, + "grad_norm": 0.8337460160255432, + "learning_rate": 3.4601840736294524e-05, + "loss": 0.5438, + "step": 7706 + }, + { + "epoch": 9.86496, + "grad_norm": 0.8334992527961731, + "learning_rate": 3.4599839935974395e-05, + "loss": 0.5825, + "step": 7707 + }, + { + "epoch": 9.86624, + "grad_norm": 0.8814105987548828, + "learning_rate": 3.459783913565427e-05, + "loss": 0.6023, + "step": 7708 + }, + { + "epoch": 9.86752, + "grad_norm": 0.9168762564659119, + "learning_rate": 3.459583833533413e-05, + "loss": 0.6323, + "step": 7709 + }, + { + "epoch": 9.8688, + "grad_norm": 0.8501449227333069, + "learning_rate": 3.4593837535014004e-05, + "loss": 0.5978, + "step": 7710 + }, + { + "epoch": 9.87008, + "grad_norm": 0.8778765201568604, + "learning_rate": 3.4591836734693876e-05, + "loss": 0.5654, + "step": 7711 + }, + { + "epoch": 9.87136, + "grad_norm": 0.8818456530570984, + "learning_rate": 3.458983593437375e-05, + "loss": 0.6219, + "step": 7712 + }, + { + "epoch": 9.87264, + "grad_norm": 0.8822683095932007, + "learning_rate": 3.4587835134053627e-05, + "loss": 0.6086, + "step": 7713 + }, + { + "epoch": 9.87392, + "grad_norm": 0.8153172731399536, + "learning_rate": 3.45858343337335e-05, + "loss": 0.568, + "step": 7714 + }, + { + "epoch": 9.8752, + "grad_norm": 0.8170477151870728, + "learning_rate": 3.458383353341337e-05, + "loss": 0.5734, + "step": 7715 + }, + { + "epoch": 9.87648, 
+ "grad_norm": 0.8518624901771545, + "learning_rate": 3.458183273309324e-05, + "loss": 0.5697, + "step": 7716 + }, + { + "epoch": 9.87776, + "grad_norm": 0.8774121999740601, + "learning_rate": 3.457983193277311e-05, + "loss": 0.5983, + "step": 7717 + }, + { + "epoch": 9.87904, + "grad_norm": 0.851138710975647, + "learning_rate": 3.457783113245298e-05, + "loss": 0.6161, + "step": 7718 + }, + { + "epoch": 9.88032, + "grad_norm": 0.8621139526367188, + "learning_rate": 3.457583033213285e-05, + "loss": 0.6147, + "step": 7719 + }, + { + "epoch": 9.8816, + "grad_norm": 0.8349597454071045, + "learning_rate": 3.457382953181273e-05, + "loss": 0.5587, + "step": 7720 + }, + { + "epoch": 9.88288, + "grad_norm": 0.8482023477554321, + "learning_rate": 3.45718287314926e-05, + "loss": 0.5783, + "step": 7721 + }, + { + "epoch": 9.88416, + "grad_norm": 0.8426772952079773, + "learning_rate": 3.456982793117247e-05, + "loss": 0.5994, + "step": 7722 + }, + { + "epoch": 9.88544, + "grad_norm": 0.8138001561164856, + "learning_rate": 3.4567827130852345e-05, + "loss": 0.5165, + "step": 7723 + }, + { + "epoch": 9.88672, + "grad_norm": 0.838672399520874, + "learning_rate": 3.456582633053222e-05, + "loss": 0.5827, + "step": 7724 + }, + { + "epoch": 9.888, + "grad_norm": 0.8754329085350037, + "learning_rate": 3.456382553021208e-05, + "loss": 0.5673, + "step": 7725 + }, + { + "epoch": 9.88928, + "grad_norm": 0.856153130531311, + "learning_rate": 3.4561824729891954e-05, + "loss": 0.5857, + "step": 7726 + }, + { + "epoch": 9.89056, + "grad_norm": 0.8428956866264343, + "learning_rate": 3.455982392957183e-05, + "loss": 0.5538, + "step": 7727 + }, + { + "epoch": 9.89184, + "grad_norm": 0.8644785284996033, + "learning_rate": 3.4557823129251704e-05, + "loss": 0.5754, + "step": 7728 + }, + { + "epoch": 9.89312, + "grad_norm": 0.8557632565498352, + "learning_rate": 3.4555822328931576e-05, + "loss": 0.5598, + "step": 7729 + }, + { + "epoch": 9.8944, + "grad_norm": 0.84462970495224, + "learning_rate": 
3.455382152861145e-05, + "loss": 0.59, + "step": 7730 + }, + { + "epoch": 9.89568, + "grad_norm": 0.8369764089584351, + "learning_rate": 3.455182072829132e-05, + "loss": 0.6021, + "step": 7731 + }, + { + "epoch": 9.89696, + "grad_norm": 0.8819150328636169, + "learning_rate": 3.454981992797119e-05, + "loss": 0.6122, + "step": 7732 + }, + { + "epoch": 9.89824, + "grad_norm": 0.8815730214118958, + "learning_rate": 3.454781912765106e-05, + "loss": 0.6513, + "step": 7733 + }, + { + "epoch": 9.89952, + "grad_norm": 0.8620604276657104, + "learning_rate": 3.4545818327330936e-05, + "loss": 0.5683, + "step": 7734 + }, + { + "epoch": 9.9008, + "grad_norm": 0.8745002746582031, + "learning_rate": 3.454381752701081e-05, + "loss": 0.5996, + "step": 7735 + }, + { + "epoch": 9.90208, + "grad_norm": 0.8810462355613708, + "learning_rate": 3.454181672669068e-05, + "loss": 0.6145, + "step": 7736 + }, + { + "epoch": 9.90336, + "grad_norm": 0.8663910031318665, + "learning_rate": 3.453981592637055e-05, + "loss": 0.5592, + "step": 7737 + }, + { + "epoch": 9.90464, + "grad_norm": 0.8494516611099243, + "learning_rate": 3.453781512605042e-05, + "loss": 0.5281, + "step": 7738 + }, + { + "epoch": 9.90592, + "grad_norm": 0.8416398763656616, + "learning_rate": 3.4535814325730295e-05, + "loss": 0.5736, + "step": 7739 + }, + { + "epoch": 9.9072, + "grad_norm": 0.7921009659767151, + "learning_rate": 3.453381352541017e-05, + "loss": 0.5327, + "step": 7740 + }, + { + "epoch": 9.90848, + "grad_norm": 0.8950327038764954, + "learning_rate": 3.453181272509004e-05, + "loss": 0.6223, + "step": 7741 + }, + { + "epoch": 9.90976, + "grad_norm": 0.8658446669578552, + "learning_rate": 3.452981192476991e-05, + "loss": 0.6088, + "step": 7742 + }, + { + "epoch": 9.91104, + "grad_norm": 0.8409397006034851, + "learning_rate": 3.452781112444978e-05, + "loss": 0.6228, + "step": 7743 + }, + { + "epoch": 9.91232, + "grad_norm": 0.873226523399353, + "learning_rate": 3.4525810324129654e-05, + "loss": 0.6051, + "step": 7744 
+ }, + { + "epoch": 9.9136, + "grad_norm": 0.8407812714576721, + "learning_rate": 3.4523809523809526e-05, + "loss": 0.5404, + "step": 7745 + }, + { + "epoch": 9.91488, + "grad_norm": 0.8529120087623596, + "learning_rate": 3.45218087234894e-05, + "loss": 0.6056, + "step": 7746 + }, + { + "epoch": 9.91616, + "grad_norm": 0.88858562707901, + "learning_rate": 3.451980792316927e-05, + "loss": 0.5821, + "step": 7747 + }, + { + "epoch": 9.91744, + "grad_norm": 0.8706713318824768, + "learning_rate": 3.451780712284914e-05, + "loss": 0.6058, + "step": 7748 + }, + { + "epoch": 9.91872, + "grad_norm": 0.8645205497741699, + "learning_rate": 3.451580632252901e-05, + "loss": 0.634, + "step": 7749 + }, + { + "epoch": 9.92, + "grad_norm": 0.9036076068878174, + "learning_rate": 3.4513805522208885e-05, + "loss": 0.6029, + "step": 7750 + }, + { + "epoch": 9.92128, + "grad_norm": 0.8720131516456604, + "learning_rate": 3.451180472188876e-05, + "loss": 0.587, + "step": 7751 + }, + { + "epoch": 9.92256, + "grad_norm": 0.8317989110946655, + "learning_rate": 3.450980392156863e-05, + "loss": 0.5854, + "step": 7752 + }, + { + "epoch": 9.92384, + "grad_norm": 0.8542910814285278, + "learning_rate": 3.45078031212485e-05, + "loss": 0.6074, + "step": 7753 + }, + { + "epoch": 9.92512, + "grad_norm": 0.858959972858429, + "learning_rate": 3.450580232092837e-05, + "loss": 0.5892, + "step": 7754 + }, + { + "epoch": 9.9264, + "grad_norm": 0.8159505128860474, + "learning_rate": 3.4503801520608245e-05, + "loss": 0.5781, + "step": 7755 + }, + { + "epoch": 9.92768, + "grad_norm": 0.810766339302063, + "learning_rate": 3.4501800720288116e-05, + "loss": 0.5439, + "step": 7756 + }, + { + "epoch": 9.92896, + "grad_norm": 0.8826687335968018, + "learning_rate": 3.449979991996799e-05, + "loss": 0.6347, + "step": 7757 + }, + { + "epoch": 9.93024, + "grad_norm": 0.8329678773880005, + "learning_rate": 3.449779911964786e-05, + "loss": 0.5471, + "step": 7758 + }, + { + "epoch": 9.93152, + "grad_norm": 
0.8527499437332153, + "learning_rate": 3.449579831932773e-05, + "loss": 0.5741, + "step": 7759 + }, + { + "epoch": 9.9328, + "grad_norm": 0.8892127871513367, + "learning_rate": 3.4493797519007604e-05, + "loss": 0.6161, + "step": 7760 + }, + { + "epoch": 9.93408, + "grad_norm": 0.8425252437591553, + "learning_rate": 3.4491796718687476e-05, + "loss": 0.6091, + "step": 7761 + }, + { + "epoch": 9.93536, + "grad_norm": 0.8560076355934143, + "learning_rate": 3.4489795918367354e-05, + "loss": 0.6142, + "step": 7762 + }, + { + "epoch": 9.93664, + "grad_norm": 0.8586561679840088, + "learning_rate": 3.448779511804722e-05, + "loss": 0.6231, + "step": 7763 + }, + { + "epoch": 9.93792, + "grad_norm": 0.8342392444610596, + "learning_rate": 3.448579431772709e-05, + "loss": 0.546, + "step": 7764 + }, + { + "epoch": 9.9392, + "grad_norm": 0.788269579410553, + "learning_rate": 3.448379351740696e-05, + "loss": 0.5268, + "step": 7765 + }, + { + "epoch": 9.94048, + "grad_norm": 0.8121354579925537, + "learning_rate": 3.4481792717086835e-05, + "loss": 0.5221, + "step": 7766 + }, + { + "epoch": 9.94176, + "grad_norm": 0.8493075966835022, + "learning_rate": 3.447979191676671e-05, + "loss": 0.6281, + "step": 7767 + }, + { + "epoch": 9.94304, + "grad_norm": 0.8820042014122009, + "learning_rate": 3.447779111644658e-05, + "loss": 0.5822, + "step": 7768 + }, + { + "epoch": 9.94432, + "grad_norm": 0.8810234665870667, + "learning_rate": 3.447579031612645e-05, + "loss": 0.5933, + "step": 7769 + }, + { + "epoch": 9.9456, + "grad_norm": 0.8822959065437317, + "learning_rate": 3.447378951580633e-05, + "loss": 0.6068, + "step": 7770 + }, + { + "epoch": 9.94688, + "grad_norm": 0.8374966979026794, + "learning_rate": 3.4471788715486194e-05, + "loss": 0.5737, + "step": 7771 + }, + { + "epoch": 9.94816, + "grad_norm": 0.8867483139038086, + "learning_rate": 3.4469787915166066e-05, + "loss": 0.5899, + "step": 7772 + }, + { + "epoch": 9.94944, + "grad_norm": 0.8893982172012329, + "learning_rate": 
3.446778711484594e-05, + "loss": 0.5771, + "step": 7773 + }, + { + "epoch": 9.95072, + "grad_norm": 0.8742989897727966, + "learning_rate": 3.446578631452581e-05, + "loss": 0.5844, + "step": 7774 + }, + { + "epoch": 9.952, + "grad_norm": 0.8614414930343628, + "learning_rate": 3.446378551420568e-05, + "loss": 0.5596, + "step": 7775 + }, + { + "epoch": 9.95328, + "grad_norm": 0.8370686173439026, + "learning_rate": 3.4461784713885554e-05, + "loss": 0.5797, + "step": 7776 + }, + { + "epoch": 9.95456, + "grad_norm": 0.8885643482208252, + "learning_rate": 3.445978391356543e-05, + "loss": 0.5866, + "step": 7777 + }, + { + "epoch": 9.95584, + "grad_norm": 0.8820523619651794, + "learning_rate": 3.4457783113245304e-05, + "loss": 0.5999, + "step": 7778 + }, + { + "epoch": 9.95712, + "grad_norm": 0.8967301249504089, + "learning_rate": 3.445578231292517e-05, + "loss": 0.6284, + "step": 7779 + }, + { + "epoch": 9.9584, + "grad_norm": 0.8450223207473755, + "learning_rate": 3.445378151260504e-05, + "loss": 0.5917, + "step": 7780 + }, + { + "epoch": 9.95968, + "grad_norm": 0.8768996596336365, + "learning_rate": 3.445178071228491e-05, + "loss": 0.5879, + "step": 7781 + }, + { + "epoch": 9.96096, + "grad_norm": 0.876092255115509, + "learning_rate": 3.4449779911964785e-05, + "loss": 0.609, + "step": 7782 + }, + { + "epoch": 9.96224, + "grad_norm": 0.799235463142395, + "learning_rate": 3.4447779111644657e-05, + "loss": 0.5266, + "step": 7783 + }, + { + "epoch": 9.96352, + "grad_norm": 0.8335480690002441, + "learning_rate": 3.4445778311324535e-05, + "loss": 0.579, + "step": 7784 + }, + { + "epoch": 9.9648, + "grad_norm": 0.8067023158073425, + "learning_rate": 3.444377751100441e-05, + "loss": 0.5569, + "step": 7785 + }, + { + "epoch": 9.96608, + "grad_norm": 0.8609577417373657, + "learning_rate": 3.444177671068428e-05, + "loss": 0.614, + "step": 7786 + }, + { + "epoch": 9.96736, + "grad_norm": 0.8286531567573547, + "learning_rate": 3.4439775910364144e-05, + "loss": 0.582, + "step": 7787 + 
}, + { + "epoch": 9.96864, + "grad_norm": 0.8231421113014221, + "learning_rate": 3.4437775110044016e-05, + "loss": 0.5541, + "step": 7788 + }, + { + "epoch": 9.96992, + "grad_norm": 0.8715162873268127, + "learning_rate": 3.443577430972389e-05, + "loss": 0.6195, + "step": 7789 + }, + { + "epoch": 9.9712, + "grad_norm": 0.9087401032447815, + "learning_rate": 3.443377350940376e-05, + "loss": 0.593, + "step": 7790 + }, + { + "epoch": 9.972480000000001, + "grad_norm": 0.8596914410591125, + "learning_rate": 3.443177270908364e-05, + "loss": 0.5851, + "step": 7791 + }, + { + "epoch": 9.97376, + "grad_norm": 0.8268222808837891, + "learning_rate": 3.442977190876351e-05, + "loss": 0.5725, + "step": 7792 + }, + { + "epoch": 9.97504, + "grad_norm": 0.8293636441230774, + "learning_rate": 3.442777110844338e-05, + "loss": 0.5897, + "step": 7793 + }, + { + "epoch": 9.97632, + "grad_norm": 0.8125051259994507, + "learning_rate": 3.4425770308123254e-05, + "loss": 0.5753, + "step": 7794 + }, + { + "epoch": 9.9776, + "grad_norm": 0.8295629024505615, + "learning_rate": 3.442376950780312e-05, + "loss": 0.5983, + "step": 7795 + }, + { + "epoch": 9.97888, + "grad_norm": 0.828607439994812, + "learning_rate": 3.442176870748299e-05, + "loss": 0.6585, + "step": 7796 + }, + { + "epoch": 9.98016, + "grad_norm": 0.8404040932655334, + "learning_rate": 3.441976790716286e-05, + "loss": 0.5979, + "step": 7797 + }, + { + "epoch": 9.98144, + "grad_norm": 0.8014954924583435, + "learning_rate": 3.441776710684274e-05, + "loss": 0.5512, + "step": 7798 + }, + { + "epoch": 9.98272, + "grad_norm": 0.8161661028862, + "learning_rate": 3.441576630652261e-05, + "loss": 0.5654, + "step": 7799 + }, + { + "epoch": 9.984, + "grad_norm": 0.8457411527633667, + "learning_rate": 3.4413765506202485e-05, + "loss": 0.6289, + "step": 7800 + }, + { + "epoch": 9.98528, + "grad_norm": 0.8675429224967957, + "learning_rate": 3.441176470588236e-05, + "loss": 0.5657, + "step": 7801 + }, + { + "epoch": 9.98656, + "grad_norm": 
0.8058546185493469, + "learning_rate": 3.440976390556223e-05, + "loss": 0.5645, + "step": 7802 + }, + { + "epoch": 9.98784, + "grad_norm": 0.7946271896362305, + "learning_rate": 3.4407763105242094e-05, + "loss": 0.5822, + "step": 7803 + }, + { + "epoch": 9.98912, + "grad_norm": 0.7849445939064026, + "learning_rate": 3.4405762304921965e-05, + "loss": 0.5638, + "step": 7804 + }, + { + "epoch": 9.9904, + "grad_norm": 0.8515406847000122, + "learning_rate": 3.4403761504601844e-05, + "loss": 0.5734, + "step": 7805 + }, + { + "epoch": 9.99168, + "grad_norm": 0.8528926968574524, + "learning_rate": 3.4401760704281716e-05, + "loss": 0.5722, + "step": 7806 + }, + { + "epoch": 9.99296, + "grad_norm": 0.851125180721283, + "learning_rate": 3.439975990396159e-05, + "loss": 0.5603, + "step": 7807 + }, + { + "epoch": 9.99424, + "grad_norm": 0.8529646396636963, + "learning_rate": 3.439775910364146e-05, + "loss": 0.6152, + "step": 7808 + }, + { + "epoch": 9.995519999999999, + "grad_norm": 0.8701699376106262, + "learning_rate": 3.439575830332133e-05, + "loss": 0.6082, + "step": 7809 + }, + { + "epoch": 9.9968, + "grad_norm": 0.875981330871582, + "learning_rate": 3.4393757503001203e-05, + "loss": 0.5978, + "step": 7810 + }, + { + "epoch": 9.99808, + "grad_norm": 0.8291494250297546, + "learning_rate": 3.439175670268107e-05, + "loss": 0.5422, + "step": 7811 + }, + { + "epoch": 9.99936, + "grad_norm": 0.9472531080245972, + "learning_rate": 3.438975590236095e-05, + "loss": 0.6558, + "step": 7812 + }, + { + "epoch": 10.00064, + "grad_norm": 1.5498021841049194, + "learning_rate": 3.438775510204082e-05, + "loss": 0.8758, + "step": 7813 + }, + { + "epoch": 10.00192, + "grad_norm": 0.8499518632888794, + "learning_rate": 3.438575430172069e-05, + "loss": 0.5858, + "step": 7814 + }, + { + "epoch": 10.0032, + "grad_norm": 0.7815194129943848, + "learning_rate": 3.438375350140056e-05, + "loss": 0.5487, + "step": 7815 + }, + { + "epoch": 10.00448, + "grad_norm": 0.8824237585067749, + "learning_rate": 
3.4381752701080435e-05, + "loss": 0.65, + "step": 7816 + }, + { + "epoch": 10.00576, + "grad_norm": 0.7906972169876099, + "learning_rate": 3.4379751900760306e-05, + "loss": 0.5486, + "step": 7817 + }, + { + "epoch": 10.00704, + "grad_norm": 0.8036819100379944, + "learning_rate": 3.437775110044018e-05, + "loss": 0.5068, + "step": 7818 + }, + { + "epoch": 10.00832, + "grad_norm": 0.7495970129966736, + "learning_rate": 3.437575030012005e-05, + "loss": 0.5221, + "step": 7819 + }, + { + "epoch": 10.0096, + "grad_norm": 0.8430529832839966, + "learning_rate": 3.437374949979992e-05, + "loss": 0.5377, + "step": 7820 + }, + { + "epoch": 10.01088, + "grad_norm": 0.8672014474868774, + "learning_rate": 3.4371748699479794e-05, + "loss": 0.5683, + "step": 7821 + }, + { + "epoch": 10.01216, + "grad_norm": 0.904216468334198, + "learning_rate": 3.4369747899159666e-05, + "loss": 0.6345, + "step": 7822 + }, + { + "epoch": 10.01344, + "grad_norm": 0.9090774655342102, + "learning_rate": 3.436774709883954e-05, + "loss": 0.6341, + "step": 7823 + }, + { + "epoch": 10.01472, + "grad_norm": 0.857877790927887, + "learning_rate": 3.436574629851941e-05, + "loss": 0.5991, + "step": 7824 + }, + { + "epoch": 10.016, + "grad_norm": 0.8711366057395935, + "learning_rate": 3.436374549819928e-05, + "loss": 0.5882, + "step": 7825 + }, + { + "epoch": 10.01728, + "grad_norm": 0.8916945457458496, + "learning_rate": 3.436174469787915e-05, + "loss": 0.5846, + "step": 7826 + }, + { + "epoch": 10.01856, + "grad_norm": 0.8220417499542236, + "learning_rate": 3.4359743897559025e-05, + "loss": 0.5259, + "step": 7827 + }, + { + "epoch": 10.01984, + "grad_norm": 0.8267784118652344, + "learning_rate": 3.43577430972389e-05, + "loss": 0.5125, + "step": 7828 + }, + { + "epoch": 10.02112, + "grad_norm": 0.8478044271469116, + "learning_rate": 3.435574229691877e-05, + "loss": 0.564, + "step": 7829 + }, + { + "epoch": 10.0224, + "grad_norm": 0.8476915955543518, + "learning_rate": 3.435374149659864e-05, + "loss": 0.514, + 
"step": 7830 + }, + { + "epoch": 10.02368, + "grad_norm": 0.8940284252166748, + "learning_rate": 3.435174069627851e-05, + "loss": 0.6042, + "step": 7831 + }, + { + "epoch": 10.02496, + "grad_norm": 0.8573498129844666, + "learning_rate": 3.4349739895958384e-05, + "loss": 0.5649, + "step": 7832 + }, + { + "epoch": 10.02624, + "grad_norm": 0.8762027621269226, + "learning_rate": 3.4347739095638256e-05, + "loss": 0.5969, + "step": 7833 + }, + { + "epoch": 10.02752, + "grad_norm": 0.9091726541519165, + "learning_rate": 3.434573829531813e-05, + "loss": 0.5614, + "step": 7834 + }, + { + "epoch": 10.0288, + "grad_norm": 0.8687475919723511, + "learning_rate": 3.4343737494998e-05, + "loss": 0.5659, + "step": 7835 + }, + { + "epoch": 10.03008, + "grad_norm": 0.8800902366638184, + "learning_rate": 3.434173669467787e-05, + "loss": 0.5798, + "step": 7836 + }, + { + "epoch": 10.03136, + "grad_norm": 0.8568121194839478, + "learning_rate": 3.4339735894357744e-05, + "loss": 0.5399, + "step": 7837 + }, + { + "epoch": 10.03264, + "grad_norm": 0.8370882272720337, + "learning_rate": 3.4337735094037615e-05, + "loss": 0.5406, + "step": 7838 + }, + { + "epoch": 10.03392, + "grad_norm": 0.8754110932350159, + "learning_rate": 3.433573429371749e-05, + "loss": 0.5842, + "step": 7839 + }, + { + "epoch": 10.0352, + "grad_norm": 0.8230734467506409, + "learning_rate": 3.4333733493397366e-05, + "loss": 0.529, + "step": 7840 + }, + { + "epoch": 10.03648, + "grad_norm": 0.8830897212028503, + "learning_rate": 3.433173269307723e-05, + "loss": 0.5796, + "step": 7841 + }, + { + "epoch": 10.03776, + "grad_norm": 0.8082362413406372, + "learning_rate": 3.43297318927571e-05, + "loss": 0.5195, + "step": 7842 + }, + { + "epoch": 10.03904, + "grad_norm": 0.8439376354217529, + "learning_rate": 3.4327731092436975e-05, + "loss": 0.5555, + "step": 7843 + }, + { + "epoch": 10.04032, + "grad_norm": 0.8426663279533386, + "learning_rate": 3.4325730292116847e-05, + "loss": 0.5987, + "step": 7844 + }, + { + "epoch": 
10.0416, + "grad_norm": 0.8852523565292358, + "learning_rate": 3.432372949179672e-05, + "loss": 0.5595, + "step": 7845 + }, + { + "epoch": 10.04288, + "grad_norm": 0.83525151014328, + "learning_rate": 3.432172869147659e-05, + "loss": 0.5076, + "step": 7846 + }, + { + "epoch": 10.04416, + "grad_norm": 0.879931628704071, + "learning_rate": 3.431972789115647e-05, + "loss": 0.56, + "step": 7847 + }, + { + "epoch": 10.04544, + "grad_norm": 0.8788473606109619, + "learning_rate": 3.431772709083634e-05, + "loss": 0.6005, + "step": 7848 + }, + { + "epoch": 10.04672, + "grad_norm": 0.8380979895591736, + "learning_rate": 3.4315726290516206e-05, + "loss": 0.5227, + "step": 7849 + }, + { + "epoch": 10.048, + "grad_norm": 0.8248594999313354, + "learning_rate": 3.431372549019608e-05, + "loss": 0.5403, + "step": 7850 + }, + { + "epoch": 10.04928, + "grad_norm": 0.8815642595291138, + "learning_rate": 3.431172468987595e-05, + "loss": 0.5672, + "step": 7851 + }, + { + "epoch": 10.05056, + "grad_norm": 0.8581164479255676, + "learning_rate": 3.430972388955582e-05, + "loss": 0.5726, + "step": 7852 + }, + { + "epoch": 10.05184, + "grad_norm": 0.9075201749801636, + "learning_rate": 3.430772308923569e-05, + "loss": 0.6214, + "step": 7853 + }, + { + "epoch": 10.05312, + "grad_norm": 0.918493390083313, + "learning_rate": 3.430572228891557e-05, + "loss": 0.591, + "step": 7854 + }, + { + "epoch": 10.0544, + "grad_norm": 0.9461652040481567, + "learning_rate": 3.4303721488595444e-05, + "loss": 0.641, + "step": 7855 + }, + { + "epoch": 10.05568, + "grad_norm": 0.9013059735298157, + "learning_rate": 3.4301720688275316e-05, + "loss": 0.548, + "step": 7856 + }, + { + "epoch": 10.05696, + "grad_norm": 0.8635231256484985, + "learning_rate": 3.429971988795518e-05, + "loss": 0.5381, + "step": 7857 + }, + { + "epoch": 10.05824, + "grad_norm": 0.8652742505073547, + "learning_rate": 3.429771908763505e-05, + "loss": 0.526, + "step": 7858 + }, + { + "epoch": 10.05952, + "grad_norm": 0.8748012781143188, + 
"learning_rate": 3.4295718287314924e-05, + "loss": 0.5851, + "step": 7859 + }, + { + "epoch": 10.0608, + "grad_norm": 0.8530634641647339, + "learning_rate": 3.4293717486994796e-05, + "loss": 0.5469, + "step": 7860 + }, + { + "epoch": 10.06208, + "grad_norm": 0.8816656470298767, + "learning_rate": 3.4291716686674675e-05, + "loss": 0.5817, + "step": 7861 + }, + { + "epoch": 10.06336, + "grad_norm": 0.8042013049125671, + "learning_rate": 3.428971588635455e-05, + "loss": 0.5568, + "step": 7862 + }, + { + "epoch": 10.06464, + "grad_norm": 0.8870890736579895, + "learning_rate": 3.428771508603442e-05, + "loss": 0.5667, + "step": 7863 + }, + { + "epoch": 10.06592, + "grad_norm": 0.9021799564361572, + "learning_rate": 3.428571428571429e-05, + "loss": 0.5946, + "step": 7864 + }, + { + "epoch": 10.0672, + "grad_norm": 0.8616315126419067, + "learning_rate": 3.4283713485394156e-05, + "loss": 0.5375, + "step": 7865 + }, + { + "epoch": 10.06848, + "grad_norm": 0.8459718823432922, + "learning_rate": 3.428171268507403e-05, + "loss": 0.5569, + "step": 7866 + }, + { + "epoch": 10.06976, + "grad_norm": 0.8223601579666138, + "learning_rate": 3.42797118847539e-05, + "loss": 0.5202, + "step": 7867 + }, + { + "epoch": 10.07104, + "grad_norm": 0.8322831392288208, + "learning_rate": 3.427771108443378e-05, + "loss": 0.5596, + "step": 7868 + }, + { + "epoch": 10.07232, + "grad_norm": 0.8532194495201111, + "learning_rate": 3.427571028411365e-05, + "loss": 0.562, + "step": 7869 + }, + { + "epoch": 10.0736, + "grad_norm": 0.8365527391433716, + "learning_rate": 3.427370948379352e-05, + "loss": 0.5869, + "step": 7870 + }, + { + "epoch": 10.07488, + "grad_norm": 0.8544728755950928, + "learning_rate": 3.4271708683473393e-05, + "loss": 0.562, + "step": 7871 + }, + { + "epoch": 10.07616, + "grad_norm": 0.8916680216789246, + "learning_rate": 3.4269707883153265e-05, + "loss": 0.6106, + "step": 7872 + }, + { + "epoch": 10.07744, + "grad_norm": 0.904686450958252, + "learning_rate": 3.426770708283313e-05, 
+ "loss": 0.5728, + "step": 7873 + }, + { + "epoch": 10.07872, + "grad_norm": 0.9149865508079529, + "learning_rate": 3.4265706282513e-05, + "loss": 0.5738, + "step": 7874 + }, + { + "epoch": 10.08, + "grad_norm": 0.905720591545105, + "learning_rate": 3.426370548219288e-05, + "loss": 0.5414, + "step": 7875 + }, + { + "epoch": 10.08128, + "grad_norm": 0.9203237295150757, + "learning_rate": 3.426170468187275e-05, + "loss": 0.5853, + "step": 7876 + }, + { + "epoch": 10.08256, + "grad_norm": 0.8600616455078125, + "learning_rate": 3.4259703881552625e-05, + "loss": 0.5506, + "step": 7877 + }, + { + "epoch": 10.08384, + "grad_norm": 0.8730953335762024, + "learning_rate": 3.4257703081232496e-05, + "loss": 0.5553, + "step": 7878 + }, + { + "epoch": 10.08512, + "grad_norm": 0.8567367792129517, + "learning_rate": 3.425570228091237e-05, + "loss": 0.5875, + "step": 7879 + }, + { + "epoch": 10.0864, + "grad_norm": 0.8998702168464661, + "learning_rate": 3.425370148059224e-05, + "loss": 0.6049, + "step": 7880 + }, + { + "epoch": 10.08768, + "grad_norm": 0.88094162940979, + "learning_rate": 3.4251700680272105e-05, + "loss": 0.5715, + "step": 7881 + }, + { + "epoch": 10.08896, + "grad_norm": 0.8907934427261353, + "learning_rate": 3.424969987995198e-05, + "loss": 0.5553, + "step": 7882 + }, + { + "epoch": 10.09024, + "grad_norm": 0.809238076210022, + "learning_rate": 3.4247699079631856e-05, + "loss": 0.5389, + "step": 7883 + }, + { + "epoch": 10.09152, + "grad_norm": 0.8654890060424805, + "learning_rate": 3.424569827931173e-05, + "loss": 0.5753, + "step": 7884 + }, + { + "epoch": 10.0928, + "grad_norm": 0.8068164587020874, + "learning_rate": 3.42436974789916e-05, + "loss": 0.5082, + "step": 7885 + }, + { + "epoch": 10.09408, + "grad_norm": 0.8308097124099731, + "learning_rate": 3.424169667867147e-05, + "loss": 0.5334, + "step": 7886 + }, + { + "epoch": 10.09536, + "grad_norm": 0.889793336391449, + "learning_rate": 3.423969587835134e-05, + "loss": 0.5774, + "step": 7887 + }, + { + 
"epoch": 10.09664, + "grad_norm": 0.9080507159233093, + "learning_rate": 3.4237695078031215e-05, + "loss": 0.5919, + "step": 7888 + }, + { + "epoch": 10.09792, + "grad_norm": 0.8671941757202148, + "learning_rate": 3.423569427771108e-05, + "loss": 0.5619, + "step": 7889 + }, + { + "epoch": 10.0992, + "grad_norm": 0.8323997259140015, + "learning_rate": 3.423369347739096e-05, + "loss": 0.4927, + "step": 7890 + }, + { + "epoch": 10.10048, + "grad_norm": 0.8521079421043396, + "learning_rate": 3.423169267707083e-05, + "loss": 0.5689, + "step": 7891 + }, + { + "epoch": 10.10176, + "grad_norm": 0.8631630539894104, + "learning_rate": 3.42296918767507e-05, + "loss": 0.5655, + "step": 7892 + }, + { + "epoch": 10.10304, + "grad_norm": 0.8886538147926331, + "learning_rate": 3.4227691076430574e-05, + "loss": 0.5851, + "step": 7893 + }, + { + "epoch": 10.10432, + "grad_norm": 0.8930172920227051, + "learning_rate": 3.4225690276110446e-05, + "loss": 0.6251, + "step": 7894 + }, + { + "epoch": 10.1056, + "grad_norm": 0.8463165760040283, + "learning_rate": 3.422368947579032e-05, + "loss": 0.4898, + "step": 7895 + }, + { + "epoch": 10.10688, + "grad_norm": 0.893409252166748, + "learning_rate": 3.422168867547019e-05, + "loss": 0.5761, + "step": 7896 + }, + { + "epoch": 10.10816, + "grad_norm": 0.8559973239898682, + "learning_rate": 3.421968787515006e-05, + "loss": 0.5351, + "step": 7897 + }, + { + "epoch": 10.10944, + "grad_norm": 0.8258970379829407, + "learning_rate": 3.4217687074829934e-05, + "loss": 0.5773, + "step": 7898 + }, + { + "epoch": 10.11072, + "grad_norm": 0.8773029446601868, + "learning_rate": 3.4215686274509805e-05, + "loss": 0.547, + "step": 7899 + }, + { + "epoch": 10.112, + "grad_norm": 0.8930992484092712, + "learning_rate": 3.421368547418968e-05, + "loss": 0.6053, + "step": 7900 + }, + { + "epoch": 10.11328, + "grad_norm": 0.872535228729248, + "learning_rate": 3.421168467386955e-05, + "loss": 0.5813, + "step": 7901 + }, + { + "epoch": 10.11456, + "grad_norm": 
0.8744470477104187, + "learning_rate": 3.420968387354942e-05, + "loss": 0.5333, + "step": 7902 + }, + { + "epoch": 10.11584, + "grad_norm": 0.8636131882667542, + "learning_rate": 3.420768307322929e-05, + "loss": 0.5746, + "step": 7903 + }, + { + "epoch": 10.11712, + "grad_norm": 0.865611732006073, + "learning_rate": 3.4205682272909165e-05, + "loss": 0.561, + "step": 7904 + }, + { + "epoch": 10.1184, + "grad_norm": 0.8732993006706238, + "learning_rate": 3.4203681472589037e-05, + "loss": 0.6067, + "step": 7905 + }, + { + "epoch": 10.11968, + "grad_norm": 0.8363077044487, + "learning_rate": 3.420168067226891e-05, + "loss": 0.5354, + "step": 7906 + }, + { + "epoch": 10.12096, + "grad_norm": 0.9122660756111145, + "learning_rate": 3.419967987194878e-05, + "loss": 0.6047, + "step": 7907 + }, + { + "epoch": 10.12224, + "grad_norm": 0.8874605298042297, + "learning_rate": 3.419767907162865e-05, + "loss": 0.6021, + "step": 7908 + }, + { + "epoch": 10.12352, + "grad_norm": 0.9380055665969849, + "learning_rate": 3.4195678271308524e-05, + "loss": 0.6033, + "step": 7909 + }, + { + "epoch": 10.1248, + "grad_norm": 0.8816724419593811, + "learning_rate": 3.4193677470988396e-05, + "loss": 0.5775, + "step": 7910 + }, + { + "epoch": 10.12608, + "grad_norm": 0.8694825172424316, + "learning_rate": 3.4191676670668274e-05, + "loss": 0.5716, + "step": 7911 + }, + { + "epoch": 10.12736, + "grad_norm": 0.9232937693595886, + "learning_rate": 3.418967587034814e-05, + "loss": 0.6145, + "step": 7912 + }, + { + "epoch": 10.12864, + "grad_norm": 0.8252090215682983, + "learning_rate": 3.418767507002801e-05, + "loss": 0.5711, + "step": 7913 + }, + { + "epoch": 10.12992, + "grad_norm": 0.8287693858146667, + "learning_rate": 3.418567426970788e-05, + "loss": 0.5349, + "step": 7914 + }, + { + "epoch": 10.1312, + "grad_norm": 0.8445566892623901, + "learning_rate": 3.4183673469387755e-05, + "loss": 0.5614, + "step": 7915 + }, + { + "epoch": 10.13248, + "grad_norm": 0.886237382888794, + "learning_rate": 
3.418167266906763e-05, + "loss": 0.5865, + "step": 7916 + }, + { + "epoch": 10.13376, + "grad_norm": 0.8767721652984619, + "learning_rate": 3.41796718687475e-05, + "loss": 0.5316, + "step": 7917 + }, + { + "epoch": 10.13504, + "grad_norm": 0.907900333404541, + "learning_rate": 3.417767106842738e-05, + "loss": 0.5782, + "step": 7918 + }, + { + "epoch": 10.13632, + "grad_norm": 0.8434520363807678, + "learning_rate": 3.417567026810725e-05, + "loss": 0.5359, + "step": 7919 + }, + { + "epoch": 10.1376, + "grad_norm": 0.8087551593780518, + "learning_rate": 3.4173669467787114e-05, + "loss": 0.5508, + "step": 7920 + }, + { + "epoch": 10.13888, + "grad_norm": 0.9005576968193054, + "learning_rate": 3.4171668667466986e-05, + "loss": 0.5558, + "step": 7921 + }, + { + "epoch": 10.14016, + "grad_norm": 0.8877005577087402, + "learning_rate": 3.416966786714686e-05, + "loss": 0.615, + "step": 7922 + }, + { + "epoch": 10.14144, + "grad_norm": 0.8587065935134888, + "learning_rate": 3.416766706682673e-05, + "loss": 0.5714, + "step": 7923 + }, + { + "epoch": 10.14272, + "grad_norm": 0.8844937086105347, + "learning_rate": 3.41656662665066e-05, + "loss": 0.6407, + "step": 7924 + }, + { + "epoch": 10.144, + "grad_norm": 0.8781236410140991, + "learning_rate": 3.416366546618648e-05, + "loss": 0.5428, + "step": 7925 + }, + { + "epoch": 10.14528, + "grad_norm": 0.8121967315673828, + "learning_rate": 3.416166466586635e-05, + "loss": 0.5061, + "step": 7926 + }, + { + "epoch": 10.14656, + "grad_norm": 0.8259449005126953, + "learning_rate": 3.4159663865546224e-05, + "loss": 0.5297, + "step": 7927 + }, + { + "epoch": 10.14784, + "grad_norm": 0.9252973794937134, + "learning_rate": 3.415766306522609e-05, + "loss": 0.6159, + "step": 7928 + }, + { + "epoch": 10.14912, + "grad_norm": 0.8705994486808777, + "learning_rate": 3.415566226490596e-05, + "loss": 0.5131, + "step": 7929 + }, + { + "epoch": 10.1504, + "grad_norm": 0.892468273639679, + "learning_rate": 3.415366146458583e-05, + "loss": 0.551, + 
"step": 7930 + }, + { + "epoch": 10.15168, + "grad_norm": 0.8754770755767822, + "learning_rate": 3.4151660664265705e-05, + "loss": 0.5457, + "step": 7931 + }, + { + "epoch": 10.15296, + "grad_norm": 0.8621395826339722, + "learning_rate": 3.4149659863945583e-05, + "loss": 0.5292, + "step": 7932 + }, + { + "epoch": 10.15424, + "grad_norm": 0.9692301750183105, + "learning_rate": 3.4147659063625455e-05, + "loss": 0.5985, + "step": 7933 + }, + { + "epoch": 10.15552, + "grad_norm": 0.9365867376327515, + "learning_rate": 3.414565826330533e-05, + "loss": 0.5769, + "step": 7934 + }, + { + "epoch": 10.1568, + "grad_norm": 0.9413188695907593, + "learning_rate": 3.41436574629852e-05, + "loss": 0.6406, + "step": 7935 + }, + { + "epoch": 10.15808, + "grad_norm": 0.8697437047958374, + "learning_rate": 3.4141656662665064e-05, + "loss": 0.5793, + "step": 7936 + }, + { + "epoch": 10.15936, + "grad_norm": 0.8574464917182922, + "learning_rate": 3.4139655862344936e-05, + "loss": 0.5234, + "step": 7937 + }, + { + "epoch": 10.16064, + "grad_norm": 0.8575087785720825, + "learning_rate": 3.413765506202481e-05, + "loss": 0.5734, + "step": 7938 + }, + { + "epoch": 10.16192, + "grad_norm": 0.8977473974227905, + "learning_rate": 3.4135654261704686e-05, + "loss": 0.5894, + "step": 7939 + }, + { + "epoch": 10.1632, + "grad_norm": 0.8591358661651611, + "learning_rate": 3.413365346138456e-05, + "loss": 0.527, + "step": 7940 + }, + { + "epoch": 10.16448, + "grad_norm": 0.8251833915710449, + "learning_rate": 3.413165266106443e-05, + "loss": 0.5393, + "step": 7941 + }, + { + "epoch": 10.16576, + "grad_norm": 0.8660216331481934, + "learning_rate": 3.41296518607443e-05, + "loss": 0.5797, + "step": 7942 + }, + { + "epoch": 10.16704, + "grad_norm": 0.9057691693305969, + "learning_rate": 3.4127651060424174e-05, + "loss": 0.5394, + "step": 7943 + }, + { + "epoch": 10.16832, + "grad_norm": 0.914899468421936, + "learning_rate": 3.412565026010404e-05, + "loss": 0.5747, + "step": 7944 + }, + { + "epoch": 
10.1696, + "grad_norm": 0.838821291923523, + "learning_rate": 3.412364945978391e-05, + "loss": 0.5453, + "step": 7945 + }, + { + "epoch": 10.17088, + "grad_norm": 0.8535134792327881, + "learning_rate": 3.412164865946379e-05, + "loss": 0.5618, + "step": 7946 + }, + { + "epoch": 10.17216, + "grad_norm": 0.8193386793136597, + "learning_rate": 3.411964785914366e-05, + "loss": 0.5303, + "step": 7947 + }, + { + "epoch": 10.17344, + "grad_norm": 0.8792878985404968, + "learning_rate": 3.411764705882353e-05, + "loss": 0.6145, + "step": 7948 + }, + { + "epoch": 10.17472, + "grad_norm": 0.9004140496253967, + "learning_rate": 3.4115646258503405e-05, + "loss": 0.6126, + "step": 7949 + }, + { + "epoch": 10.176, + "grad_norm": 0.883179247379303, + "learning_rate": 3.411364545818328e-05, + "loss": 0.5655, + "step": 7950 + }, + { + "epoch": 10.17728, + "grad_norm": 0.8625328540802002, + "learning_rate": 3.411164465786315e-05, + "loss": 0.5993, + "step": 7951 + }, + { + "epoch": 10.17856, + "grad_norm": 0.8304584622383118, + "learning_rate": 3.4109643857543014e-05, + "loss": 0.5177, + "step": 7952 + }, + { + "epoch": 10.17984, + "grad_norm": 0.8241145610809326, + "learning_rate": 3.410764305722289e-05, + "loss": 0.5112, + "step": 7953 + }, + { + "epoch": 10.18112, + "grad_norm": 0.8981744647026062, + "learning_rate": 3.4105642256902764e-05, + "loss": 0.5897, + "step": 7954 + }, + { + "epoch": 10.1824, + "grad_norm": 0.8619903326034546, + "learning_rate": 3.4103641456582636e-05, + "loss": 0.5543, + "step": 7955 + }, + { + "epoch": 10.18368, + "grad_norm": 0.8140372037887573, + "learning_rate": 3.410164065626251e-05, + "loss": 0.5214, + "step": 7956 + }, + { + "epoch": 10.18496, + "grad_norm": 0.8518925309181213, + "learning_rate": 3.409963985594238e-05, + "loss": 0.5465, + "step": 7957 + }, + { + "epoch": 10.18624, + "grad_norm": 0.8702125549316406, + "learning_rate": 3.409763905562225e-05, + "loss": 0.5621, + "step": 7958 + }, + { + "epoch": 10.18752, + "grad_norm": 
0.8971076011657715, + "learning_rate": 3.4095638255302124e-05, + "loss": 0.5631, + "step": 7959 + }, + { + "epoch": 10.1888, + "grad_norm": 0.853391706943512, + "learning_rate": 3.4093637454981995e-05, + "loss": 0.5682, + "step": 7960 + }, + { + "epoch": 10.19008, + "grad_norm": 0.8766993284225464, + "learning_rate": 3.409163665466187e-05, + "loss": 0.6276, + "step": 7961 + }, + { + "epoch": 10.19136, + "grad_norm": 0.8604820370674133, + "learning_rate": 3.408963585434174e-05, + "loss": 0.5688, + "step": 7962 + }, + { + "epoch": 10.19264, + "grad_norm": 0.8074341416358948, + "learning_rate": 3.408763505402161e-05, + "loss": 0.5106, + "step": 7963 + }, + { + "epoch": 10.19392, + "grad_norm": 0.871636152267456, + "learning_rate": 3.408563425370148e-05, + "loss": 0.6026, + "step": 7964 + }, + { + "epoch": 10.1952, + "grad_norm": 0.8580559492111206, + "learning_rate": 3.4083633453381355e-05, + "loss": 0.5621, + "step": 7965 + }, + { + "epoch": 10.19648, + "grad_norm": 0.8855502605438232, + "learning_rate": 3.408163265306123e-05, + "loss": 0.5359, + "step": 7966 + }, + { + "epoch": 10.19776, + "grad_norm": 0.8627590537071228, + "learning_rate": 3.40796318527411e-05, + "loss": 0.521, + "step": 7967 + }, + { + "epoch": 10.19904, + "grad_norm": 0.7877764105796814, + "learning_rate": 3.407763105242097e-05, + "loss": 0.5293, + "step": 7968 + }, + { + "epoch": 10.20032, + "grad_norm": 0.8240257501602173, + "learning_rate": 3.407563025210084e-05, + "loss": 0.5453, + "step": 7969 + }, + { + "epoch": 10.2016, + "grad_norm": 0.8509385585784912, + "learning_rate": 3.4073629451780714e-05, + "loss": 0.5745, + "step": 7970 + }, + { + "epoch": 10.20288, + "grad_norm": 0.8994958400726318, + "learning_rate": 3.4071628651460586e-05, + "loss": 0.5561, + "step": 7971 + }, + { + "epoch": 10.20416, + "grad_norm": 0.8885303139686584, + "learning_rate": 3.406962785114046e-05, + "loss": 0.5537, + "step": 7972 + }, + { + "epoch": 10.20544, + "grad_norm": 0.9128597378730774, + "learning_rate": 
3.406762705082033e-05, + "loss": 0.5795, + "step": 7973 + }, + { + "epoch": 10.20672, + "grad_norm": 0.83772212266922, + "learning_rate": 3.40656262505002e-05, + "loss": 0.5156, + "step": 7974 + }, + { + "epoch": 10.208, + "grad_norm": 0.822971522808075, + "learning_rate": 3.406362545018007e-05, + "loss": 0.5237, + "step": 7975 + }, + { + "epoch": 10.20928, + "grad_norm": 0.898842453956604, + "learning_rate": 3.4061624649859945e-05, + "loss": 0.5979, + "step": 7976 + }, + { + "epoch": 10.21056, + "grad_norm": 0.9250447154045105, + "learning_rate": 3.405962384953982e-05, + "loss": 0.6017, + "step": 7977 + }, + { + "epoch": 10.21184, + "grad_norm": 0.8869206309318542, + "learning_rate": 3.405762304921969e-05, + "loss": 0.5366, + "step": 7978 + }, + { + "epoch": 10.21312, + "grad_norm": 0.863950788974762, + "learning_rate": 3.405562224889956e-05, + "loss": 0.5374, + "step": 7979 + }, + { + "epoch": 10.2144, + "grad_norm": 0.81676185131073, + "learning_rate": 3.405362144857943e-05, + "loss": 0.5335, + "step": 7980 + }, + { + "epoch": 10.21568, + "grad_norm": 0.8401233553886414, + "learning_rate": 3.405162064825931e-05, + "loss": 0.5484, + "step": 7981 + }, + { + "epoch": 10.21696, + "grad_norm": 0.8678064346313477, + "learning_rate": 3.4049619847939176e-05, + "loss": 0.5231, + "step": 7982 + }, + { + "epoch": 10.21824, + "grad_norm": 0.8489272594451904, + "learning_rate": 3.404761904761905e-05, + "loss": 0.4638, + "step": 7983 + }, + { + "epoch": 10.21952, + "grad_norm": 0.8029497265815735, + "learning_rate": 3.404561824729892e-05, + "loss": 0.5081, + "step": 7984 + }, + { + "epoch": 10.2208, + "grad_norm": 0.8994140625, + "learning_rate": 3.404361744697879e-05, + "loss": 0.5386, + "step": 7985 + }, + { + "epoch": 10.22208, + "grad_norm": 0.8933800458908081, + "learning_rate": 3.4041616646658664e-05, + "loss": 0.5312, + "step": 7986 + }, + { + "epoch": 10.22336, + "grad_norm": 0.8817566633224487, + "learning_rate": 3.4039615846338536e-05, + "loss": 0.5578, + "step": 
7987 + }, + { + "epoch": 10.22464, + "grad_norm": 0.8852986693382263, + "learning_rate": 3.403761504601841e-05, + "loss": 0.5669, + "step": 7988 + }, + { + "epoch": 10.22592, + "grad_norm": 0.9889851212501526, + "learning_rate": 3.4035614245698286e-05, + "loss": 0.5101, + "step": 7989 + }, + { + "epoch": 10.2272, + "grad_norm": 0.8555750846862793, + "learning_rate": 3.403361344537815e-05, + "loss": 0.551, + "step": 7990 + }, + { + "epoch": 10.22848, + "grad_norm": 0.838453471660614, + "learning_rate": 3.403161264505802e-05, + "loss": 0.565, + "step": 7991 + }, + { + "epoch": 10.22976, + "grad_norm": 0.8447315096855164, + "learning_rate": 3.4029611844737895e-05, + "loss": 0.5671, + "step": 7992 + }, + { + "epoch": 10.23104, + "grad_norm": 0.8619872331619263, + "learning_rate": 3.402761104441777e-05, + "loss": 0.5877, + "step": 7993 + }, + { + "epoch": 10.23232, + "grad_norm": 0.8690614104270935, + "learning_rate": 3.402561024409764e-05, + "loss": 0.5583, + "step": 7994 + }, + { + "epoch": 10.2336, + "grad_norm": 0.9469643831253052, + "learning_rate": 3.402360944377751e-05, + "loss": 0.5224, + "step": 7995 + }, + { + "epoch": 10.23488, + "grad_norm": 0.8962279558181763, + "learning_rate": 3.402160864345739e-05, + "loss": 0.5714, + "step": 7996 + }, + { + "epoch": 10.23616, + "grad_norm": 0.9197521805763245, + "learning_rate": 3.401960784313726e-05, + "loss": 0.5755, + "step": 7997 + }, + { + "epoch": 10.23744, + "grad_norm": 0.8115956783294678, + "learning_rate": 3.4017607042817126e-05, + "loss": 0.5475, + "step": 7998 + }, + { + "epoch": 10.23872, + "grad_norm": 0.7984319925308228, + "learning_rate": 3.4015606242497e-05, + "loss": 0.5479, + "step": 7999 + }, + { + "epoch": 10.24, + "grad_norm": 0.8857104778289795, + "learning_rate": 3.401360544217687e-05, + "loss": 0.5641, + "step": 8000 + }, + { + "epoch": 10.24128, + "grad_norm": 0.9119358062744141, + "learning_rate": 3.401160464185674e-05, + "loss": 0.5903, + "step": 8001 + }, + { + "epoch": 10.24256, + 
"grad_norm": 0.8577554821968079, + "learning_rate": 3.4009603841536613e-05, + "loss": 0.5551, + "step": 8002 + }, + { + "epoch": 10.24384, + "grad_norm": 0.9047631025314331, + "learning_rate": 3.400760304121649e-05, + "loss": 0.551, + "step": 8003 + }, + { + "epoch": 10.24512, + "grad_norm": 0.8967153429985046, + "learning_rate": 3.4005602240896364e-05, + "loss": 0.5531, + "step": 8004 + }, + { + "epoch": 10.2464, + "grad_norm": 0.8773406744003296, + "learning_rate": 3.4003601440576236e-05, + "loss": 0.5648, + "step": 8005 + }, + { + "epoch": 10.24768, + "grad_norm": 0.903235137462616, + "learning_rate": 3.40016006402561e-05, + "loss": 0.5552, + "step": 8006 + }, + { + "epoch": 10.24896, + "grad_norm": 0.8304867148399353, + "learning_rate": 3.399959983993597e-05, + "loss": 0.5472, + "step": 8007 + }, + { + "epoch": 10.25024, + "grad_norm": 0.844079852104187, + "learning_rate": 3.3997599039615845e-05, + "loss": 0.5607, + "step": 8008 + }, + { + "epoch": 10.25152, + "grad_norm": 0.8768244981765747, + "learning_rate": 3.3995598239295716e-05, + "loss": 0.5559, + "step": 8009 + }, + { + "epoch": 10.2528, + "grad_norm": 0.8504285216331482, + "learning_rate": 3.3993597438975595e-05, + "loss": 0.5555, + "step": 8010 + }, + { + "epoch": 10.25408, + "grad_norm": 0.8665524125099182, + "learning_rate": 3.399159663865547e-05, + "loss": 0.5473, + "step": 8011 + }, + { + "epoch": 10.25536, + "grad_norm": 0.8851114511489868, + "learning_rate": 3.398959583833534e-05, + "loss": 0.6117, + "step": 8012 + }, + { + "epoch": 10.25664, + "grad_norm": 0.8923035860061646, + "learning_rate": 3.398759503801521e-05, + "loss": 0.5575, + "step": 8013 + }, + { + "epoch": 10.25792, + "grad_norm": 0.8378768563270569, + "learning_rate": 3.3985594237695076e-05, + "loss": 0.5484, + "step": 8014 + }, + { + "epoch": 10.2592, + "grad_norm": 0.8756462931632996, + "learning_rate": 3.398359343737495e-05, + "loss": 0.562, + "step": 8015 + }, + { + "epoch": 10.26048, + "grad_norm": 0.8708872199058533, + 
"learning_rate": 3.398159263705482e-05, + "loss": 0.5702, + "step": 8016 + }, + { + "epoch": 10.26176, + "grad_norm": 0.865534245967865, + "learning_rate": 3.39795918367347e-05, + "loss": 0.5521, + "step": 8017 + }, + { + "epoch": 10.26304, + "grad_norm": 0.8553348779678345, + "learning_rate": 3.397759103641457e-05, + "loss": 0.5661, + "step": 8018 + }, + { + "epoch": 10.26432, + "grad_norm": 0.9413052797317505, + "learning_rate": 3.397559023609444e-05, + "loss": 0.5919, + "step": 8019 + }, + { + "epoch": 10.2656, + "grad_norm": 0.8924047350883484, + "learning_rate": 3.3973589435774314e-05, + "loss": 0.5855, + "step": 8020 + }, + { + "epoch": 10.26688, + "grad_norm": 0.8545451164245605, + "learning_rate": 3.3971588635454186e-05, + "loss": 0.5515, + "step": 8021 + }, + { + "epoch": 10.26816, + "grad_norm": 0.8800926208496094, + "learning_rate": 3.396958783513405e-05, + "loss": 0.537, + "step": 8022 + }, + { + "epoch": 10.26944, + "grad_norm": 0.9025660753250122, + "learning_rate": 3.396758703481392e-05, + "loss": 0.6178, + "step": 8023 + }, + { + "epoch": 10.27072, + "grad_norm": 0.8830463290214539, + "learning_rate": 3.39655862344938e-05, + "loss": 0.555, + "step": 8024 + }, + { + "epoch": 10.272, + "grad_norm": 0.9167059659957886, + "learning_rate": 3.396358543417367e-05, + "loss": 0.5678, + "step": 8025 + }, + { + "epoch": 10.27328, + "grad_norm": 0.8567641377449036, + "learning_rate": 3.3961584633853545e-05, + "loss": 0.5473, + "step": 8026 + }, + { + "epoch": 10.27456, + "grad_norm": 0.8600786328315735, + "learning_rate": 3.395958383353342e-05, + "loss": 0.5562, + "step": 8027 + }, + { + "epoch": 10.27584, + "grad_norm": 0.8937509655952454, + "learning_rate": 3.395758303321329e-05, + "loss": 0.5721, + "step": 8028 + }, + { + "epoch": 10.27712, + "grad_norm": 0.8415324091911316, + "learning_rate": 3.395558223289316e-05, + "loss": 0.5219, + "step": 8029 + }, + { + "epoch": 10.2784, + "grad_norm": 0.888176679611206, + "learning_rate": 3.3953581432573025e-05, + 
"loss": 0.5618, + "step": 8030 + }, + { + "epoch": 10.27968, + "grad_norm": 0.8729084730148315, + "learning_rate": 3.3951580632252904e-05, + "loss": 0.581, + "step": 8031 + }, + { + "epoch": 10.28096, + "grad_norm": 0.8753156661987305, + "learning_rate": 3.3949579831932776e-05, + "loss": 0.5667, + "step": 8032 + }, + { + "epoch": 10.28224, + "grad_norm": 0.8683957457542419, + "learning_rate": 3.394757903161265e-05, + "loss": 0.5274, + "step": 8033 + }, + { + "epoch": 10.28352, + "grad_norm": 0.8913008570671082, + "learning_rate": 3.394557823129252e-05, + "loss": 0.5799, + "step": 8034 + }, + { + "epoch": 10.2848, + "grad_norm": 0.8697010278701782, + "learning_rate": 3.394357743097239e-05, + "loss": 0.56, + "step": 8035 + }, + { + "epoch": 10.28608, + "grad_norm": 0.8997796177864075, + "learning_rate": 3.394157663065226e-05, + "loss": 0.6066, + "step": 8036 + }, + { + "epoch": 10.28736, + "grad_norm": 0.8587310314178467, + "learning_rate": 3.3939575830332135e-05, + "loss": 0.5758, + "step": 8037 + }, + { + "epoch": 10.288640000000001, + "grad_norm": 0.8550379276275635, + "learning_rate": 3.393757503001201e-05, + "loss": 0.5858, + "step": 8038 + }, + { + "epoch": 10.28992, + "grad_norm": 0.8495590686798096, + "learning_rate": 3.393557422969188e-05, + "loss": 0.5774, + "step": 8039 + }, + { + "epoch": 10.2912, + "grad_norm": 0.8640943765640259, + "learning_rate": 3.393357342937175e-05, + "loss": 0.5839, + "step": 8040 + }, + { + "epoch": 10.29248, + "grad_norm": 0.8572250604629517, + "learning_rate": 3.393157262905162e-05, + "loss": 0.5521, + "step": 8041 + }, + { + "epoch": 10.29376, + "grad_norm": 0.8335790634155273, + "learning_rate": 3.3929571828731495e-05, + "loss": 0.5296, + "step": 8042 + }, + { + "epoch": 10.29504, + "grad_norm": 0.861184298992157, + "learning_rate": 3.3927571028411366e-05, + "loss": 0.5687, + "step": 8043 + }, + { + "epoch": 10.29632, + "grad_norm": 0.8524270057678223, + "learning_rate": 3.392557022809124e-05, + "loss": 0.5577, + "step": 8044 
+ }, + { + "epoch": 10.2976, + "grad_norm": 0.9047152996063232, + "learning_rate": 3.392356942777111e-05, + "loss": 0.5752, + "step": 8045 + }, + { + "epoch": 10.29888, + "grad_norm": 0.8878473043441772, + "learning_rate": 3.392156862745098e-05, + "loss": 0.5627, + "step": 8046 + }, + { + "epoch": 10.30016, + "grad_norm": 0.8869286775588989, + "learning_rate": 3.3919567827130854e-05, + "loss": 0.587, + "step": 8047 + }, + { + "epoch": 10.30144, + "grad_norm": 0.8654095530509949, + "learning_rate": 3.3917567026810726e-05, + "loss": 0.5427, + "step": 8048 + }, + { + "epoch": 10.30272, + "grad_norm": 0.9054012298583984, + "learning_rate": 3.39155662264906e-05, + "loss": 0.5907, + "step": 8049 + }, + { + "epoch": 10.304, + "grad_norm": 0.852177083492279, + "learning_rate": 3.391356542617047e-05, + "loss": 0.5445, + "step": 8050 + }, + { + "epoch": 10.30528, + "grad_norm": 0.9118196964263916, + "learning_rate": 3.391156462585034e-05, + "loss": 0.5557, + "step": 8051 + }, + { + "epoch": 10.30656, + "grad_norm": 0.9239182472229004, + "learning_rate": 3.390956382553021e-05, + "loss": 0.5647, + "step": 8052 + }, + { + "epoch": 10.30784, + "grad_norm": 0.875339925289154, + "learning_rate": 3.3907563025210085e-05, + "loss": 0.5532, + "step": 8053 + }, + { + "epoch": 10.30912, + "grad_norm": 0.9014837145805359, + "learning_rate": 3.390556222488996e-05, + "loss": 0.5569, + "step": 8054 + }, + { + "epoch": 10.3104, + "grad_norm": 0.9332119226455688, + "learning_rate": 3.390356142456983e-05, + "loss": 0.5645, + "step": 8055 + }, + { + "epoch": 10.31168, + "grad_norm": 0.8813993334770203, + "learning_rate": 3.39015606242497e-05, + "loss": 0.5751, + "step": 8056 + }, + { + "epoch": 10.31296, + "grad_norm": 0.8453924655914307, + "learning_rate": 3.389955982392957e-05, + "loss": 0.5304, + "step": 8057 + }, + { + "epoch": 10.31424, + "grad_norm": 0.8781421780586243, + "learning_rate": 3.3897559023609444e-05, + "loss": 0.5555, + "step": 8058 + }, + { + "epoch": 10.31552, + "grad_norm": 
0.819387674331665, + "learning_rate": 3.389555822328932e-05, + "loss": 0.5362, + "step": 8059 + }, + { + "epoch": 10.3168, + "grad_norm": 0.838315486907959, + "learning_rate": 3.389355742296919e-05, + "loss": 0.5609, + "step": 8060 + }, + { + "epoch": 10.31808, + "grad_norm": 0.8653327226638794, + "learning_rate": 3.389155662264906e-05, + "loss": 0.5313, + "step": 8061 + }, + { + "epoch": 10.31936, + "grad_norm": 0.8249607682228088, + "learning_rate": 3.388955582232893e-05, + "loss": 0.5308, + "step": 8062 + }, + { + "epoch": 10.32064, + "grad_norm": 0.8277181386947632, + "learning_rate": 3.3887555022008804e-05, + "loss": 0.5428, + "step": 8063 + }, + { + "epoch": 10.32192, + "grad_norm": 0.8770848512649536, + "learning_rate": 3.3885554221688675e-05, + "loss": 0.536, + "step": 8064 + }, + { + "epoch": 10.3232, + "grad_norm": 0.8789455890655518, + "learning_rate": 3.388355342136855e-05, + "loss": 0.5509, + "step": 8065 + }, + { + "epoch": 10.32448, + "grad_norm": 0.8777965307235718, + "learning_rate": 3.3881552621048426e-05, + "loss": 0.5601, + "step": 8066 + }, + { + "epoch": 10.32576, + "grad_norm": 0.846284031867981, + "learning_rate": 3.38795518207283e-05, + "loss": 0.5322, + "step": 8067 + }, + { + "epoch": 10.32704, + "grad_norm": 0.820878267288208, + "learning_rate": 3.387755102040816e-05, + "loss": 0.4972, + "step": 8068 + }, + { + "epoch": 10.32832, + "grad_norm": 0.8530818819999695, + "learning_rate": 3.3875550220088035e-05, + "loss": 0.5465, + "step": 8069 + }, + { + "epoch": 10.3296, + "grad_norm": 0.8610104322433472, + "learning_rate": 3.3873549419767907e-05, + "loss": 0.5503, + "step": 8070 + }, + { + "epoch": 10.33088, + "grad_norm": 0.883931040763855, + "learning_rate": 3.387154861944778e-05, + "loss": 0.5977, + "step": 8071 + }, + { + "epoch": 10.33216, + "grad_norm": 0.8754240870475769, + "learning_rate": 3.386954781912765e-05, + "loss": 0.5968, + "step": 8072 + }, + { + "epoch": 10.33344, + "grad_norm": 0.8416239023208618, + "learning_rate": 
3.386754701880753e-05, + "loss": 0.5346, + "step": 8073 + }, + { + "epoch": 10.33472, + "grad_norm": 0.8367804884910583, + "learning_rate": 3.38655462184874e-05, + "loss": 0.5026, + "step": 8074 + }, + { + "epoch": 10.336, + "grad_norm": 0.8462991118431091, + "learning_rate": 3.386354541816727e-05, + "loss": 0.563, + "step": 8075 + }, + { + "epoch": 10.33728, + "grad_norm": 0.8666642308235168, + "learning_rate": 3.386154461784714e-05, + "loss": 0.5549, + "step": 8076 + }, + { + "epoch": 10.33856, + "grad_norm": 0.8881992101669312, + "learning_rate": 3.385954381752701e-05, + "loss": 0.5675, + "step": 8077 + }, + { + "epoch": 10.33984, + "grad_norm": 0.8590701222419739, + "learning_rate": 3.385754301720688e-05, + "loss": 0.5806, + "step": 8078 + }, + { + "epoch": 10.34112, + "grad_norm": 0.8979669809341431, + "learning_rate": 3.385554221688675e-05, + "loss": 0.5928, + "step": 8079 + }, + { + "epoch": 10.3424, + "grad_norm": 0.8830470442771912, + "learning_rate": 3.385354141656663e-05, + "loss": 0.5631, + "step": 8080 + }, + { + "epoch": 10.343679999999999, + "grad_norm": 0.8694333434104919, + "learning_rate": 3.3851540616246504e-05, + "loss": 0.5683, + "step": 8081 + }, + { + "epoch": 10.34496, + "grad_norm": 0.8495916128158569, + "learning_rate": 3.3849539815926376e-05, + "loss": 0.5719, + "step": 8082 + }, + { + "epoch": 10.34624, + "grad_norm": 0.8509037494659424, + "learning_rate": 3.384753901560625e-05, + "loss": 0.5438, + "step": 8083 + }, + { + "epoch": 10.34752, + "grad_norm": 0.8636916875839233, + "learning_rate": 3.384553821528611e-05, + "loss": 0.5433, + "step": 8084 + }, + { + "epoch": 10.3488, + "grad_norm": 0.8579983711242676, + "learning_rate": 3.3843537414965984e-05, + "loss": 0.5244, + "step": 8085 + }, + { + "epoch": 10.35008, + "grad_norm": 0.8326935172080994, + "learning_rate": 3.3841536614645856e-05, + "loss": 0.4873, + "step": 8086 + }, + { + "epoch": 10.35136, + "grad_norm": 0.8900673389434814, + "learning_rate": 3.3839535814325735e-05, + 
"loss": 0.6081, + "step": 8087 + }, + { + "epoch": 10.35264, + "grad_norm": 0.8790429830551147, + "learning_rate": 3.383753501400561e-05, + "loss": 0.5785, + "step": 8088 + }, + { + "epoch": 10.35392, + "grad_norm": 0.8508909344673157, + "learning_rate": 3.383553421368548e-05, + "loss": 0.5545, + "step": 8089 + }, + { + "epoch": 10.3552, + "grad_norm": 0.9315157532691956, + "learning_rate": 3.383353341336535e-05, + "loss": 0.5911, + "step": 8090 + }, + { + "epoch": 10.35648, + "grad_norm": 0.8326758146286011, + "learning_rate": 3.383153261304522e-05, + "loss": 0.5483, + "step": 8091 + }, + { + "epoch": 10.35776, + "grad_norm": 0.8776389956474304, + "learning_rate": 3.382953181272509e-05, + "loss": 0.5853, + "step": 8092 + }, + { + "epoch": 10.35904, + "grad_norm": 0.8722195625305176, + "learning_rate": 3.382753101240496e-05, + "loss": 0.5785, + "step": 8093 + }, + { + "epoch": 10.36032, + "grad_norm": 0.9000146389007568, + "learning_rate": 3.382553021208484e-05, + "loss": 0.5833, + "step": 8094 + }, + { + "epoch": 10.3616, + "grad_norm": 0.8690416812896729, + "learning_rate": 3.382352941176471e-05, + "loss": 0.5583, + "step": 8095 + }, + { + "epoch": 10.36288, + "grad_norm": 0.8674702048301697, + "learning_rate": 3.382152861144458e-05, + "loss": 0.5792, + "step": 8096 + }, + { + "epoch": 10.36416, + "grad_norm": 0.9020072221755981, + "learning_rate": 3.3819527811124453e-05, + "loss": 0.5737, + "step": 8097 + }, + { + "epoch": 10.36544, + "grad_norm": 0.9401588439941406, + "learning_rate": 3.3817527010804325e-05, + "loss": 0.6055, + "step": 8098 + }, + { + "epoch": 10.36672, + "grad_norm": 0.8760440349578857, + "learning_rate": 3.38155262104842e-05, + "loss": 0.5584, + "step": 8099 + }, + { + "epoch": 10.368, + "grad_norm": 0.8204819560050964, + "learning_rate": 3.381352541016406e-05, + "loss": 0.5278, + "step": 8100 + }, + { + "epoch": 10.36928, + "grad_norm": 0.9087020754814148, + "learning_rate": 3.3811524609843934e-05, + "loss": 0.5957, + "step": 8101 + }, + { + 
"epoch": 10.37056, + "grad_norm": 0.9149252772331238, + "learning_rate": 3.380952380952381e-05, + "loss": 0.5722, + "step": 8102 + }, + { + "epoch": 10.37184, + "grad_norm": 0.8594633936882019, + "learning_rate": 3.3807523009203685e-05, + "loss": 0.5866, + "step": 8103 + }, + { + "epoch": 10.37312, + "grad_norm": 0.8941963315010071, + "learning_rate": 3.3805522208883556e-05, + "loss": 0.566, + "step": 8104 + }, + { + "epoch": 10.3744, + "grad_norm": 0.9061945676803589, + "learning_rate": 3.380352140856343e-05, + "loss": 0.5989, + "step": 8105 + }, + { + "epoch": 10.37568, + "grad_norm": 0.9072801470756531, + "learning_rate": 3.38015206082433e-05, + "loss": 0.5341, + "step": 8106 + }, + { + "epoch": 10.37696, + "grad_norm": 0.8797597289085388, + "learning_rate": 3.379951980792317e-05, + "loss": 0.5658, + "step": 8107 + }, + { + "epoch": 10.37824, + "grad_norm": 0.8986260294914246, + "learning_rate": 3.379751900760304e-05, + "loss": 0.5663, + "step": 8108 + }, + { + "epoch": 10.37952, + "grad_norm": 0.870966374874115, + "learning_rate": 3.3795518207282916e-05, + "loss": 0.5759, + "step": 8109 + }, + { + "epoch": 10.3808, + "grad_norm": 0.8527510166168213, + "learning_rate": 3.379351740696279e-05, + "loss": 0.5566, + "step": 8110 + }, + { + "epoch": 10.38208, + "grad_norm": 0.8812639117240906, + "learning_rate": 3.379151660664266e-05, + "loss": 0.5787, + "step": 8111 + }, + { + "epoch": 10.38336, + "grad_norm": 0.9007368683815002, + "learning_rate": 3.378951580632253e-05, + "loss": 0.6065, + "step": 8112 + }, + { + "epoch": 10.38464, + "grad_norm": 0.908555269241333, + "learning_rate": 3.37875150060024e-05, + "loss": 0.5707, + "step": 8113 + }, + { + "epoch": 10.38592, + "grad_norm": 0.8893713355064392, + "learning_rate": 3.3785514205682275e-05, + "loss": 0.6067, + "step": 8114 + }, + { + "epoch": 10.3872, + "grad_norm": 0.8574310541152954, + "learning_rate": 3.378351340536215e-05, + "loss": 0.5643, + "step": 8115 + }, + { + "epoch": 10.38848, + "grad_norm": 
0.823165774345398, + "learning_rate": 3.378151260504202e-05, + "loss": 0.5808, + "step": 8116 + }, + { + "epoch": 10.38976, + "grad_norm": 0.818540096282959, + "learning_rate": 3.377951180472189e-05, + "loss": 0.5709, + "step": 8117 + }, + { + "epoch": 10.39104, + "grad_norm": 0.8618648052215576, + "learning_rate": 3.377751100440176e-05, + "loss": 0.5565, + "step": 8118 + }, + { + "epoch": 10.39232, + "grad_norm": 0.869215726852417, + "learning_rate": 3.3775510204081634e-05, + "loss": 0.5499, + "step": 8119 + }, + { + "epoch": 10.3936, + "grad_norm": 0.9559123516082764, + "learning_rate": 3.3773509403761506e-05, + "loss": 0.6168, + "step": 8120 + }, + { + "epoch": 10.39488, + "grad_norm": 0.924443244934082, + "learning_rate": 3.377150860344138e-05, + "loss": 0.5954, + "step": 8121 + }, + { + "epoch": 10.39616, + "grad_norm": 0.8861650824546814, + "learning_rate": 3.376950780312125e-05, + "loss": 0.5793, + "step": 8122 + }, + { + "epoch": 10.39744, + "grad_norm": 0.9248014092445374, + "learning_rate": 3.376750700280112e-05, + "loss": 0.5581, + "step": 8123 + }, + { + "epoch": 10.39872, + "grad_norm": 0.8819634914398193, + "learning_rate": 3.3765506202480994e-05, + "loss": 0.5263, + "step": 8124 + }, + { + "epoch": 10.4, + "grad_norm": 0.8656063079833984, + "learning_rate": 3.3763505402160865e-05, + "loss": 0.5241, + "step": 8125 + }, + { + "epoch": 10.40128, + "grad_norm": 0.8893786668777466, + "learning_rate": 3.376150460184074e-05, + "loss": 0.5451, + "step": 8126 + }, + { + "epoch": 10.40256, + "grad_norm": 0.9260717034339905, + "learning_rate": 3.375950380152061e-05, + "loss": 0.5505, + "step": 8127 + }, + { + "epoch": 10.40384, + "grad_norm": 0.912977397441864, + "learning_rate": 3.375750300120048e-05, + "loss": 0.6363, + "step": 8128 + }, + { + "epoch": 10.40512, + "grad_norm": 0.8732292652130127, + "learning_rate": 3.375550220088035e-05, + "loss": 0.5778, + "step": 8129 + }, + { + "epoch": 10.4064, + "grad_norm": 0.8404425978660583, + "learning_rate": 
3.3753501400560225e-05, + "loss": 0.5693, + "step": 8130 + }, + { + "epoch": 10.40768, + "grad_norm": 0.9023398160934448, + "learning_rate": 3.3751500600240097e-05, + "loss": 0.6112, + "step": 8131 + }, + { + "epoch": 10.40896, + "grad_norm": 0.9250540137290955, + "learning_rate": 3.374949979991997e-05, + "loss": 0.6067, + "step": 8132 + }, + { + "epoch": 10.41024, + "grad_norm": 0.9005985856056213, + "learning_rate": 3.374749899959984e-05, + "loss": 0.5746, + "step": 8133 + }, + { + "epoch": 10.41152, + "grad_norm": 0.9189587235450745, + "learning_rate": 3.374549819927971e-05, + "loss": 0.5847, + "step": 8134 + }, + { + "epoch": 10.4128, + "grad_norm": 0.8882416486740112, + "learning_rate": 3.3743497398959584e-05, + "loss": 0.6156, + "step": 8135 + }, + { + "epoch": 10.41408, + "grad_norm": 0.845780074596405, + "learning_rate": 3.3741496598639456e-05, + "loss": 0.549, + "step": 8136 + }, + { + "epoch": 10.41536, + "grad_norm": 0.9068146347999573, + "learning_rate": 3.3739495798319334e-05, + "loss": 0.5735, + "step": 8137 + }, + { + "epoch": 10.41664, + "grad_norm": 0.8547993898391724, + "learning_rate": 3.37374949979992e-05, + "loss": 0.5472, + "step": 8138 + }, + { + "epoch": 10.41792, + "grad_norm": 0.8667715191841125, + "learning_rate": 3.373549419767907e-05, + "loss": 0.5533, + "step": 8139 + }, + { + "epoch": 10.4192, + "grad_norm": 0.8902264833450317, + "learning_rate": 3.373349339735894e-05, + "loss": 0.5751, + "step": 8140 + }, + { + "epoch": 10.42048, + "grad_norm": 0.8540170788764954, + "learning_rate": 3.3731492597038815e-05, + "loss": 0.5179, + "step": 8141 + }, + { + "epoch": 10.42176, + "grad_norm": 0.8607949614524841, + "learning_rate": 3.372949179671869e-05, + "loss": 0.5825, + "step": 8142 + }, + { + "epoch": 10.42304, + "grad_norm": 0.8747577667236328, + "learning_rate": 3.372749099639856e-05, + "loss": 0.6228, + "step": 8143 + }, + { + "epoch": 10.42432, + "grad_norm": 0.8700942993164062, + "learning_rate": 3.372549019607844e-05, + "loss": 
0.5888, + "step": 8144 + }, + { + "epoch": 10.4256, + "grad_norm": 0.9046246409416199, + "learning_rate": 3.372348939575831e-05, + "loss": 0.5769, + "step": 8145 + }, + { + "epoch": 10.42688, + "grad_norm": 0.8095414042472839, + "learning_rate": 3.3721488595438174e-05, + "loss": 0.5288, + "step": 8146 + }, + { + "epoch": 10.42816, + "grad_norm": 0.8345249891281128, + "learning_rate": 3.3719487795118046e-05, + "loss": 0.5943, + "step": 8147 + }, + { + "epoch": 10.42944, + "grad_norm": 0.9272936582565308, + "learning_rate": 3.371748699479792e-05, + "loss": 0.5317, + "step": 8148 + }, + { + "epoch": 10.43072, + "grad_norm": 0.9539486169815063, + "learning_rate": 3.371548619447779e-05, + "loss": 0.6475, + "step": 8149 + }, + { + "epoch": 10.432, + "grad_norm": Infinity, + "learning_rate": 3.371548619447779e-05, + "loss": 0.6468, + "step": 8150 + }, + { + "epoch": 10.43328, + "grad_norm": 0.8678048253059387, + "learning_rate": 3.371348539415766e-05, + "loss": 0.5463, + "step": 8151 + }, + { + "epoch": 10.43456, + "grad_norm": 0.8489457964897156, + "learning_rate": 3.371148459383754e-05, + "loss": 0.5703, + "step": 8152 + }, + { + "epoch": 10.43584, + "grad_norm": 0.8788950443267822, + "learning_rate": 3.370948379351741e-05, + "loss": 0.5346, + "step": 8153 + }, + { + "epoch": 10.43712, + "grad_norm": 0.853437066078186, + "learning_rate": 3.3707482993197284e-05, + "loss": 0.5659, + "step": 8154 + }, + { + "epoch": 10.4384, + "grad_norm": 0.8777585625648499, + "learning_rate": 3.370548219287715e-05, + "loss": 0.5578, + "step": 8155 + }, + { + "epoch": 10.43968, + "grad_norm": 0.8524881601333618, + "learning_rate": 3.370348139255702e-05, + "loss": 0.5656, + "step": 8156 + }, + { + "epoch": 10.44096, + "grad_norm": 0.8162695169448853, + "learning_rate": 3.370148059223689e-05, + "loss": 0.5092, + "step": 8157 + }, + { + "epoch": 10.44224, + "grad_norm": 0.8364541530609131, + "learning_rate": 3.3699479791916765e-05, + "loss": 0.5346, + "step": 8158 + }, + { + "epoch": 
10.44352, + "grad_norm": 0.9183416962623596, + "learning_rate": 3.3697478991596643e-05, + "loss": 0.6277, + "step": 8159 + }, + { + "epoch": 10.4448, + "grad_norm": 0.8532844185829163, + "learning_rate": 3.3695478191276515e-05, + "loss": 0.5318, + "step": 8160 + }, + { + "epoch": 10.44608, + "grad_norm": 0.8698902726173401, + "learning_rate": 3.369347739095639e-05, + "loss": 0.5872, + "step": 8161 + }, + { + "epoch": 10.44736, + "grad_norm": 0.8348177671432495, + "learning_rate": 3.369147659063626e-05, + "loss": 0.5381, + "step": 8162 + }, + { + "epoch": 10.44864, + "grad_norm": 0.8752634525299072, + "learning_rate": 3.3689475790316124e-05, + "loss": 0.5587, + "step": 8163 + }, + { + "epoch": 10.44992, + "grad_norm": 0.8772228360176086, + "learning_rate": 3.3687474989995996e-05, + "loss": 0.5763, + "step": 8164 + }, + { + "epoch": 10.4512, + "grad_norm": 0.8985121250152588, + "learning_rate": 3.368547418967587e-05, + "loss": 0.5443, + "step": 8165 + }, + { + "epoch": 10.45248, + "grad_norm": 0.8824835419654846, + "learning_rate": 3.3683473389355746e-05, + "loss": 0.5583, + "step": 8166 + }, + { + "epoch": 10.45376, + "grad_norm": 0.8776131272315979, + "learning_rate": 3.368147258903562e-05, + "loss": 0.5331, + "step": 8167 + }, + { + "epoch": 10.45504, + "grad_norm": 0.9394381046295166, + "learning_rate": 3.367947178871549e-05, + "loss": 0.6115, + "step": 8168 + }, + { + "epoch": 10.45632, + "grad_norm": 0.8873891234397888, + "learning_rate": 3.367747098839536e-05, + "loss": 0.5574, + "step": 8169 + }, + { + "epoch": 10.4576, + "grad_norm": 0.8328064680099487, + "learning_rate": 3.3675470188075234e-05, + "loss": 0.5499, + "step": 8170 + }, + { + "epoch": 10.45888, + "grad_norm": 0.8790708184242249, + "learning_rate": 3.36734693877551e-05, + "loss": 0.5231, + "step": 8171 + }, + { + "epoch": 10.46016, + "grad_norm": 0.8684185743331909, + "learning_rate": 3.367146858743497e-05, + "loss": 0.5494, + "step": 8172 + }, + { + "epoch": 10.46144, + "grad_norm": 
0.9176812767982483, + "learning_rate": 3.366946778711485e-05, + "loss": 0.5929, + "step": 8173 + }, + { + "epoch": 10.462720000000001, + "grad_norm": 0.9031935930252075, + "learning_rate": 3.366746698679472e-05, + "loss": 0.5999, + "step": 8174 + }, + { + "epoch": 10.464, + "grad_norm": 0.8639822006225586, + "learning_rate": 3.366546618647459e-05, + "loss": 0.6266, + "step": 8175 + }, + { + "epoch": 10.46528, + "grad_norm": 0.8309516310691833, + "learning_rate": 3.3663465386154465e-05, + "loss": 0.5368, + "step": 8176 + }, + { + "epoch": 10.46656, + "grad_norm": 0.8735124468803406, + "learning_rate": 3.366146458583434e-05, + "loss": 0.5759, + "step": 8177 + }, + { + "epoch": 10.46784, + "grad_norm": 0.850908100605011, + "learning_rate": 3.365946378551421e-05, + "loss": 0.5377, + "step": 8178 + }, + { + "epoch": 10.46912, + "grad_norm": 0.8890109062194824, + "learning_rate": 3.3657462985194074e-05, + "loss": 0.5821, + "step": 8179 + }, + { + "epoch": 10.4704, + "grad_norm": 0.8895400762557983, + "learning_rate": 3.365546218487395e-05, + "loss": 0.5535, + "step": 8180 + }, + { + "epoch": 10.47168, + "grad_norm": 0.8601114153862, + "learning_rate": 3.3653461384553824e-05, + "loss": 0.5324, + "step": 8181 + }, + { + "epoch": 10.47296, + "grad_norm": 0.8928337693214417, + "learning_rate": 3.3651460584233696e-05, + "loss": 0.5868, + "step": 8182 + }, + { + "epoch": 10.47424, + "grad_norm": 0.8805640935897827, + "learning_rate": 3.364945978391357e-05, + "loss": 0.5735, + "step": 8183 + }, + { + "epoch": 10.47552, + "grad_norm": 0.8599642515182495, + "learning_rate": 3.364745898359344e-05, + "loss": 0.5431, + "step": 8184 + }, + { + "epoch": 10.4768, + "grad_norm": 0.9143441319465637, + "learning_rate": 3.364545818327331e-05, + "loss": 0.5903, + "step": 8185 + }, + { + "epoch": 10.47808, + "grad_norm": 0.8206641674041748, + "learning_rate": 3.3643457382953184e-05, + "loss": 0.5216, + "step": 8186 + }, + { + "epoch": 10.47936, + "grad_norm": 0.7796454429626465, + 
"learning_rate": 3.3641456582633055e-05, + "loss": 0.4986, + "step": 8187 + }, + { + "epoch": 10.48064, + "grad_norm": 0.8821196556091309, + "learning_rate": 3.363945578231293e-05, + "loss": 0.5665, + "step": 8188 + }, + { + "epoch": 10.48192, + "grad_norm": 0.86763596534729, + "learning_rate": 3.36374549819928e-05, + "loss": 0.5393, + "step": 8189 + }, + { + "epoch": 10.4832, + "grad_norm": 0.8729016184806824, + "learning_rate": 3.363545418167267e-05, + "loss": 0.5357, + "step": 8190 + }, + { + "epoch": 10.48448, + "grad_norm": 0.9105653166770935, + "learning_rate": 3.363345338135254e-05, + "loss": 0.6309, + "step": 8191 + }, + { + "epoch": 10.48576, + "grad_norm": 0.8702036142349243, + "learning_rate": 3.3631452581032415e-05, + "loss": 0.5913, + "step": 8192 + }, + { + "epoch": 10.48704, + "grad_norm": 0.8215315937995911, + "learning_rate": 3.3629451780712287e-05, + "loss": 0.585, + "step": 8193 + }, + { + "epoch": 10.48832, + "grad_norm": 0.8497202396392822, + "learning_rate": 3.362745098039216e-05, + "loss": 0.5519, + "step": 8194 + }, + { + "epoch": 10.4896, + "grad_norm": 0.8402096629142761, + "learning_rate": 3.362545018007203e-05, + "loss": 0.5624, + "step": 8195 + }, + { + "epoch": 10.49088, + "grad_norm": 0.8736252784729004, + "learning_rate": 3.36234493797519e-05, + "loss": 0.5404, + "step": 8196 + }, + { + "epoch": 10.49216, + "grad_norm": 0.8728154301643372, + "learning_rate": 3.3621448579431774e-05, + "loss": 0.5412, + "step": 8197 + }, + { + "epoch": 10.49344, + "grad_norm": 0.8873513340950012, + "learning_rate": 3.3619447779111646e-05, + "loss": 0.5671, + "step": 8198 + }, + { + "epoch": 10.49472, + "grad_norm": 0.873191237449646, + "learning_rate": 3.361744697879152e-05, + "loss": 0.5645, + "step": 8199 + }, + { + "epoch": 10.496, + "grad_norm": 0.8937346339225769, + "learning_rate": 3.361544617847139e-05, + "loss": 0.604, + "step": 8200 + }, + { + "epoch": 10.49728, + "grad_norm": 0.8788394927978516, + "learning_rate": 3.361344537815127e-05, + 
"loss": 0.6088, + "step": 8201 + }, + { + "epoch": 10.49856, + "grad_norm": 0.8717678189277649, + "learning_rate": 3.361144457783113e-05, + "loss": 0.5475, + "step": 8202 + }, + { + "epoch": 10.49984, + "grad_norm": 0.8855169415473938, + "learning_rate": 3.3609443777511005e-05, + "loss": 0.6181, + "step": 8203 + }, + { + "epoch": 10.50112, + "grad_norm": 0.9005426168441772, + "learning_rate": 3.360744297719088e-05, + "loss": 0.5924, + "step": 8204 + }, + { + "epoch": 10.5024, + "grad_norm": 0.9078197479248047, + "learning_rate": 3.360544217687075e-05, + "loss": 0.6005, + "step": 8205 + }, + { + "epoch": 10.50368, + "grad_norm": 0.9592050909996033, + "learning_rate": 3.360344137655062e-05, + "loss": 0.6359, + "step": 8206 + }, + { + "epoch": 10.50496, + "grad_norm": 0.8957341313362122, + "learning_rate": 3.360144057623049e-05, + "loss": 0.5806, + "step": 8207 + }, + { + "epoch": 10.50624, + "grad_norm": 0.8797503709793091, + "learning_rate": 3.359943977591037e-05, + "loss": 0.5798, + "step": 8208 + }, + { + "epoch": 10.50752, + "grad_norm": 0.8420235514640808, + "learning_rate": 3.359743897559024e-05, + "loss": 0.5894, + "step": 8209 + }, + { + "epoch": 10.5088, + "grad_norm": 0.8644899725914001, + "learning_rate": 3.359543817527011e-05, + "loss": 0.553, + "step": 8210 + }, + { + "epoch": 10.51008, + "grad_norm": 0.8995040059089661, + "learning_rate": 3.359343737494998e-05, + "loss": 0.6117, + "step": 8211 + }, + { + "epoch": 10.51136, + "grad_norm": 0.8395155668258667, + "learning_rate": 3.359143657462985e-05, + "loss": 0.5678, + "step": 8212 + }, + { + "epoch": 10.51264, + "grad_norm": 0.8515924215316772, + "learning_rate": 3.3589435774309724e-05, + "loss": 0.5409, + "step": 8213 + }, + { + "epoch": 10.51392, + "grad_norm": 0.9288252592086792, + "learning_rate": 3.3587434973989596e-05, + "loss": 0.6328, + "step": 8214 + }, + { + "epoch": 10.5152, + "grad_norm": 0.9206863045692444, + "learning_rate": 3.358543417366947e-05, + "loss": 0.6071, + "step": 8215 + }, + { 
+ "epoch": 10.51648, + "grad_norm": 0.875196099281311, + "learning_rate": 3.3583433373349346e-05, + "loss": 0.5762, + "step": 8216 + }, + { + "epoch": 10.517759999999999, + "grad_norm": 0.8598700165748596, + "learning_rate": 3.358143257302921e-05, + "loss": 0.5428, + "step": 8217 + }, + { + "epoch": 10.51904, + "grad_norm": 0.8348386287689209, + "learning_rate": 3.357943177270908e-05, + "loss": 0.5135, + "step": 8218 + }, + { + "epoch": 10.52032, + "grad_norm": 0.8748103380203247, + "learning_rate": 3.3577430972388955e-05, + "loss": 0.5582, + "step": 8219 + }, + { + "epoch": 10.5216, + "grad_norm": 0.8388493657112122, + "learning_rate": 3.357543017206883e-05, + "loss": 0.5594, + "step": 8220 + }, + { + "epoch": 10.52288, + "grad_norm": 0.8430709838867188, + "learning_rate": 3.35734293717487e-05, + "loss": 0.5304, + "step": 8221 + }, + { + "epoch": 10.52416, + "grad_norm": 0.8487758040428162, + "learning_rate": 3.357142857142857e-05, + "loss": 0.5249, + "step": 8222 + }, + { + "epoch": 10.52544, + "grad_norm": 0.8581976890563965, + "learning_rate": 3.356942777110845e-05, + "loss": 0.5209, + "step": 8223 + }, + { + "epoch": 10.52672, + "grad_norm": 0.9251878261566162, + "learning_rate": 3.356742697078832e-05, + "loss": 0.5878, + "step": 8224 + }, + { + "epoch": 10.528, + "grad_norm": 0.8627904057502747, + "learning_rate": 3.3565426170468186e-05, + "loss": 0.5766, + "step": 8225 + }, + { + "epoch": 10.52928, + "grad_norm": 0.8664613962173462, + "learning_rate": 3.356342537014806e-05, + "loss": 0.562, + "step": 8226 + }, + { + "epoch": 10.53056, + "grad_norm": 0.8714502453804016, + "learning_rate": 3.356142456982793e-05, + "loss": 0.578, + "step": 8227 + }, + { + "epoch": 10.53184, + "grad_norm": 0.8915185332298279, + "learning_rate": 3.35594237695078e-05, + "loss": 0.5542, + "step": 8228 + }, + { + "epoch": 10.53312, + "grad_norm": 0.9236972332000732, + "learning_rate": 3.3557422969187673e-05, + "loss": 0.5776, + "step": 8229 + }, + { + "epoch": 10.5344, + 
"grad_norm": 0.8826823234558105, + "learning_rate": 3.355542216886755e-05, + "loss": 0.5879, + "step": 8230 + }, + { + "epoch": 10.53568, + "grad_norm": 0.8796812295913696, + "learning_rate": 3.3553421368547424e-05, + "loss": 0.5992, + "step": 8231 + }, + { + "epoch": 10.53696, + "grad_norm": 0.8763803243637085, + "learning_rate": 3.3551420568227296e-05, + "loss": 0.5546, + "step": 8232 + }, + { + "epoch": 10.53824, + "grad_norm": 0.8898423910140991, + "learning_rate": 3.354941976790716e-05, + "loss": 0.5755, + "step": 8233 + }, + { + "epoch": 10.53952, + "grad_norm": 0.8547391295433044, + "learning_rate": 3.354741896758703e-05, + "loss": 0.5844, + "step": 8234 + }, + { + "epoch": 10.5408, + "grad_norm": 0.8550713658332825, + "learning_rate": 3.3545418167266905e-05, + "loss": 0.5609, + "step": 8235 + }, + { + "epoch": 10.54208, + "grad_norm": 0.8670467138290405, + "learning_rate": 3.3543417366946776e-05, + "loss": 0.5439, + "step": 8236 + }, + { + "epoch": 10.54336, + "grad_norm": 0.8298435807228088, + "learning_rate": 3.3541416566626655e-05, + "loss": 0.5424, + "step": 8237 + }, + { + "epoch": 10.54464, + "grad_norm": 0.8586260676383972, + "learning_rate": 3.353941576630653e-05, + "loss": 0.5316, + "step": 8238 + }, + { + "epoch": 10.54592, + "grad_norm": 0.8656241297721863, + "learning_rate": 3.35374149659864e-05, + "loss": 0.5515, + "step": 8239 + }, + { + "epoch": 10.5472, + "grad_norm": 0.8878941535949707, + "learning_rate": 3.353541416566627e-05, + "loss": 0.5874, + "step": 8240 + }, + { + "epoch": 10.54848, + "grad_norm": 0.900540828704834, + "learning_rate": 3.3533413365346136e-05, + "loss": 0.6261, + "step": 8241 + }, + { + "epoch": 10.54976, + "grad_norm": 0.8868071436882019, + "learning_rate": 3.353141256502601e-05, + "loss": 0.5749, + "step": 8242 + }, + { + "epoch": 10.55104, + "grad_norm": 0.9406156539916992, + "learning_rate": 3.352941176470588e-05, + "loss": 0.6071, + "step": 8243 + }, + { + "epoch": 10.55232, + "grad_norm": 0.9124932289123535, + 
"learning_rate": 3.352741096438576e-05, + "loss": 0.587, + "step": 8244 + }, + { + "epoch": 10.5536, + "grad_norm": 0.8528494834899902, + "learning_rate": 3.352541016406563e-05, + "loss": 0.5266, + "step": 8245 + }, + { + "epoch": 10.55488, + "grad_norm": 0.8507541418075562, + "learning_rate": 3.35234093637455e-05, + "loss": 0.5906, + "step": 8246 + }, + { + "epoch": 10.55616, + "grad_norm": 0.8238681554794312, + "learning_rate": 3.3521408563425374e-05, + "loss": 0.5421, + "step": 8247 + }, + { + "epoch": 10.55744, + "grad_norm": 0.8529794812202454, + "learning_rate": 3.3519407763105245e-05, + "loss": 0.5699, + "step": 8248 + }, + { + "epoch": 10.55872, + "grad_norm": 0.8948842883110046, + "learning_rate": 3.351740696278511e-05, + "loss": 0.5987, + "step": 8249 + }, + { + "epoch": 10.56, + "grad_norm": 0.8481034636497498, + "learning_rate": 3.351540616246498e-05, + "loss": 0.5355, + "step": 8250 + }, + { + "epoch": 10.56128, + "grad_norm": 0.900606632232666, + "learning_rate": 3.351340536214486e-05, + "loss": 0.6198, + "step": 8251 + }, + { + "epoch": 10.56256, + "grad_norm": 0.8651003241539001, + "learning_rate": 3.351140456182473e-05, + "loss": 0.5792, + "step": 8252 + }, + { + "epoch": 10.56384, + "grad_norm": 0.8270814418792725, + "learning_rate": 3.3509403761504605e-05, + "loss": 0.5138, + "step": 8253 + }, + { + "epoch": 10.56512, + "grad_norm": 0.8760477900505066, + "learning_rate": 3.350740296118448e-05, + "loss": 0.5928, + "step": 8254 + }, + { + "epoch": 10.5664, + "grad_norm": 0.889542818069458, + "learning_rate": 3.350540216086435e-05, + "loss": 0.6012, + "step": 8255 + }, + { + "epoch": 10.56768, + "grad_norm": 0.8778030872344971, + "learning_rate": 3.350340136054422e-05, + "loss": 0.5747, + "step": 8256 + }, + { + "epoch": 10.56896, + "grad_norm": 0.889215350151062, + "learning_rate": 3.3501400560224085e-05, + "loss": 0.5577, + "step": 8257 + }, + { + "epoch": 10.57024, + "grad_norm": 0.8863917589187622, + "learning_rate": 3.3499399759903964e-05, + 
"loss": 0.5355, + "step": 8258 + }, + { + "epoch": 10.57152, + "grad_norm": 0.8980902433395386, + "learning_rate": 3.3497398959583836e-05, + "loss": 0.5563, + "step": 8259 + }, + { + "epoch": 10.5728, + "grad_norm": 0.8849428296089172, + "learning_rate": 3.349539815926371e-05, + "loss": 0.578, + "step": 8260 + }, + { + "epoch": 10.57408, + "grad_norm": 0.8660392761230469, + "learning_rate": 3.349339735894358e-05, + "loss": 0.5187, + "step": 8261 + }, + { + "epoch": 10.57536, + "grad_norm": 0.891211986541748, + "learning_rate": 3.349139655862345e-05, + "loss": 0.5778, + "step": 8262 + }, + { + "epoch": 10.57664, + "grad_norm": 0.8924466967582703, + "learning_rate": 3.348939575830332e-05, + "loss": 0.5561, + "step": 8263 + }, + { + "epoch": 10.57792, + "grad_norm": 0.9162275791168213, + "learning_rate": 3.3487394957983195e-05, + "loss": 0.6179, + "step": 8264 + }, + { + "epoch": 10.5792, + "grad_norm": 0.9216055870056152, + "learning_rate": 3.348539415766307e-05, + "loss": 0.5975, + "step": 8265 + }, + { + "epoch": 10.58048, + "grad_norm": 0.8706624507904053, + "learning_rate": 3.348339335734294e-05, + "loss": 0.5412, + "step": 8266 + }, + { + "epoch": 10.58176, + "grad_norm": 0.8571588397026062, + "learning_rate": 3.348139255702281e-05, + "loss": 0.5807, + "step": 8267 + }, + { + "epoch": 10.58304, + "grad_norm": 0.7969219088554382, + "learning_rate": 3.347939175670268e-05, + "loss": 0.5632, + "step": 8268 + }, + { + "epoch": 10.58432, + "grad_norm": 0.8372544050216675, + "learning_rate": 3.3477390956382554e-05, + "loss": 0.5497, + "step": 8269 + }, + { + "epoch": 10.5856, + "grad_norm": 0.8220162987709045, + "learning_rate": 3.3475390156062426e-05, + "loss": 0.5106, + "step": 8270 + }, + { + "epoch": 10.58688, + "grad_norm": 0.8827033042907715, + "learning_rate": 3.34733893557423e-05, + "loss": 0.5885, + "step": 8271 + }, + { + "epoch": 10.58816, + "grad_norm": 0.8495452404022217, + "learning_rate": 3.347138855542217e-05, + "loss": 0.565, + "step": 8272 + }, + { + 
"epoch": 10.58944, + "grad_norm": 0.8994190692901611, + "learning_rate": 3.346938775510204e-05, + "loss": 0.5612, + "step": 8273 + }, + { + "epoch": 10.59072, + "grad_norm": 0.8603847622871399, + "learning_rate": 3.3467386954781914e-05, + "loss": 0.5775, + "step": 8274 + }, + { + "epoch": 10.592, + "grad_norm": 0.884907066822052, + "learning_rate": 3.3465386154461786e-05, + "loss": 0.5231, + "step": 8275 + }, + { + "epoch": 10.59328, + "grad_norm": 0.9084495306015015, + "learning_rate": 3.346338535414166e-05, + "loss": 0.5845, + "step": 8276 + }, + { + "epoch": 10.59456, + "grad_norm": 0.8667944669723511, + "learning_rate": 3.346138455382153e-05, + "loss": 0.5664, + "step": 8277 + }, + { + "epoch": 10.59584, + "grad_norm": 0.8516685366630554, + "learning_rate": 3.34593837535014e-05, + "loss": 0.5483, + "step": 8278 + }, + { + "epoch": 10.59712, + "grad_norm": 0.8369115591049194, + "learning_rate": 3.345738295318128e-05, + "loss": 0.505, + "step": 8279 + }, + { + "epoch": 10.5984, + "grad_norm": 0.8858165740966797, + "learning_rate": 3.3455382152861145e-05, + "loss": 0.5714, + "step": 8280 + }, + { + "epoch": 10.59968, + "grad_norm": 0.858709990978241, + "learning_rate": 3.345338135254102e-05, + "loss": 0.5921, + "step": 8281 + }, + { + "epoch": 10.60096, + "grad_norm": 0.8412848114967346, + "learning_rate": 3.345138055222089e-05, + "loss": 0.5848, + "step": 8282 + }, + { + "epoch": 10.60224, + "grad_norm": 0.8578768968582153, + "learning_rate": 3.344937975190076e-05, + "loss": 0.564, + "step": 8283 + }, + { + "epoch": 10.60352, + "grad_norm": 0.8731163144111633, + "learning_rate": 3.344737895158063e-05, + "loss": 0.5947, + "step": 8284 + }, + { + "epoch": 10.604800000000001, + "grad_norm": 0.8939322829246521, + "learning_rate": 3.3445378151260504e-05, + "loss": 0.6458, + "step": 8285 + }, + { + "epoch": 10.60608, + "grad_norm": 0.8193719983100891, + "learning_rate": 3.344337735094038e-05, + "loss": 0.5601, + "step": 8286 + }, + { + "epoch": 10.60736, + "grad_norm": 
0.9018811583518982, + "learning_rate": 3.3441376550620255e-05, + "loss": 0.5914, + "step": 8287 + }, + { + "epoch": 10.60864, + "grad_norm": 0.8852116465568542, + "learning_rate": 3.343937575030012e-05, + "loss": 0.5545, + "step": 8288 + }, + { + "epoch": 10.60992, + "grad_norm": 0.880398690700531, + "learning_rate": 3.343737494997999e-05, + "loss": 0.5642, + "step": 8289 + }, + { + "epoch": 10.6112, + "grad_norm": 0.8978055715560913, + "learning_rate": 3.3435374149659863e-05, + "loss": 0.5649, + "step": 8290 + }, + { + "epoch": 10.61248, + "grad_norm": 0.8889503479003906, + "learning_rate": 3.3433373349339735e-05, + "loss": 0.6023, + "step": 8291 + }, + { + "epoch": 10.61376, + "grad_norm": 0.8328986167907715, + "learning_rate": 3.343137254901961e-05, + "loss": 0.5532, + "step": 8292 + }, + { + "epoch": 10.61504, + "grad_norm": 0.8667716979980469, + "learning_rate": 3.3429371748699486e-05, + "loss": 0.5735, + "step": 8293 + }, + { + "epoch": 10.61632, + "grad_norm": 0.8680749535560608, + "learning_rate": 3.342737094837936e-05, + "loss": 0.5629, + "step": 8294 + }, + { + "epoch": 10.6176, + "grad_norm": 0.8346037268638611, + "learning_rate": 3.342537014805923e-05, + "loss": 0.5411, + "step": 8295 + }, + { + "epoch": 10.61888, + "grad_norm": 0.8703474402427673, + "learning_rate": 3.3423369347739095e-05, + "loss": 0.5365, + "step": 8296 + }, + { + "epoch": 10.62016, + "grad_norm": 0.8694249391555786, + "learning_rate": 3.3421368547418966e-05, + "loss": 0.5453, + "step": 8297 + }, + { + "epoch": 10.62144, + "grad_norm": 0.8796177506446838, + "learning_rate": 3.341936774709884e-05, + "loss": 0.5339, + "step": 8298 + }, + { + "epoch": 10.62272, + "grad_norm": 0.8828015923500061, + "learning_rate": 3.341736694677871e-05, + "loss": 0.5838, + "step": 8299 + }, + { + "epoch": 10.624, + "grad_norm": 0.9480679035186768, + "learning_rate": 3.341536614645859e-05, + "loss": 0.5491, + "step": 8300 + }, + { + "epoch": 10.62528, + "grad_norm": 0.8716679811477661, + "learning_rate": 
3.341336534613846e-05, + "loss": 0.5377, + "step": 8301 + }, + { + "epoch": 10.62656, + "grad_norm": 0.886888861656189, + "learning_rate": 3.341136454581833e-05, + "loss": 0.5493, + "step": 8302 + }, + { + "epoch": 10.627839999999999, + "grad_norm": 0.8986707925796509, + "learning_rate": 3.3409363745498204e-05, + "loss": 0.5805, + "step": 8303 + }, + { + "epoch": 10.62912, + "grad_norm": 0.8717333078384399, + "learning_rate": 3.340736294517807e-05, + "loss": 0.5042, + "step": 8304 + }, + { + "epoch": 10.6304, + "grad_norm": 0.8657734394073486, + "learning_rate": 3.340536214485794e-05, + "loss": 0.5976, + "step": 8305 + }, + { + "epoch": 10.63168, + "grad_norm": 0.8697702288627625, + "learning_rate": 3.340336134453781e-05, + "loss": 0.5653, + "step": 8306 + }, + { + "epoch": 10.63296, + "grad_norm": 0.9345014691352844, + "learning_rate": 3.340136054421769e-05, + "loss": 0.6546, + "step": 8307 + }, + { + "epoch": 10.63424, + "grad_norm": 0.8431648015975952, + "learning_rate": 3.3399359743897564e-05, + "loss": 0.5652, + "step": 8308 + }, + { + "epoch": 10.63552, + "grad_norm": 0.8228232264518738, + "learning_rate": 3.3397358943577436e-05, + "loss": 0.5742, + "step": 8309 + }, + { + "epoch": 10.636800000000001, + "grad_norm": 0.8282391428947449, + "learning_rate": 3.339535814325731e-05, + "loss": 0.5692, + "step": 8310 + }, + { + "epoch": 10.63808, + "grad_norm": 0.8612817525863647, + "learning_rate": 3.339335734293718e-05, + "loss": 0.5897, + "step": 8311 + }, + { + "epoch": 10.63936, + "grad_norm": 0.8568882942199707, + "learning_rate": 3.3391356542617044e-05, + "loss": 0.5675, + "step": 8312 + }, + { + "epoch": 10.64064, + "grad_norm": 0.8818540573120117, + "learning_rate": 3.3389355742296916e-05, + "loss": 0.5862, + "step": 8313 + }, + { + "epoch": 10.64192, + "grad_norm": 0.859569787979126, + "learning_rate": 3.3387354941976795e-05, + "loss": 0.5456, + "step": 8314 + }, + { + "epoch": 10.6432, + "grad_norm": 0.9021491408348083, + "learning_rate": 
3.338535414165667e-05, + "loss": 0.6043, + "step": 8315 + }, + { + "epoch": 10.64448, + "grad_norm": 0.8722798228263855, + "learning_rate": 3.338335334133654e-05, + "loss": 0.5441, + "step": 8316 + }, + { + "epoch": 10.64576, + "grad_norm": 0.8792587518692017, + "learning_rate": 3.338135254101641e-05, + "loss": 0.5534, + "step": 8317 + }, + { + "epoch": 10.64704, + "grad_norm": 0.8752608895301819, + "learning_rate": 3.337935174069628e-05, + "loss": 0.537, + "step": 8318 + }, + { + "epoch": 10.64832, + "grad_norm": 0.8951905369758606, + "learning_rate": 3.3377350940376154e-05, + "loss": 0.6054, + "step": 8319 + }, + { + "epoch": 10.6496, + "grad_norm": 0.8960387110710144, + "learning_rate": 3.337535014005602e-05, + "loss": 0.6164, + "step": 8320 + }, + { + "epoch": 10.65088, + "grad_norm": 0.9029424786567688, + "learning_rate": 3.33733493397359e-05, + "loss": 0.5642, + "step": 8321 + }, + { + "epoch": 10.65216, + "grad_norm": 0.8857460618019104, + "learning_rate": 3.337134853941577e-05, + "loss": 0.5833, + "step": 8322 + }, + { + "epoch": 10.65344, + "grad_norm": 0.8373080492019653, + "learning_rate": 3.336934773909564e-05, + "loss": 0.5349, + "step": 8323 + }, + { + "epoch": 10.65472, + "grad_norm": 0.8566232919692993, + "learning_rate": 3.336734693877551e-05, + "loss": 0.5648, + "step": 8324 + }, + { + "epoch": 10.656, + "grad_norm": 0.894591212272644, + "learning_rate": 3.3365346138455385e-05, + "loss": 0.5989, + "step": 8325 + }, + { + "epoch": 10.65728, + "grad_norm": 0.8619808554649353, + "learning_rate": 3.336334533813526e-05, + "loss": 0.535, + "step": 8326 + }, + { + "epoch": 10.65856, + "grad_norm": 0.8474307060241699, + "learning_rate": 3.336134453781513e-05, + "loss": 0.5814, + "step": 8327 + }, + { + "epoch": 10.659839999999999, + "grad_norm": 0.8520254492759705, + "learning_rate": 3.3359343737494994e-05, + "loss": 0.522, + "step": 8328 + }, + { + "epoch": 10.66112, + "grad_norm": 0.8767806887626648, + "learning_rate": 3.335734293717487e-05, + "loss": 
0.5766, + "step": 8329 + }, + { + "epoch": 10.6624, + "grad_norm": 0.8880060911178589, + "learning_rate": 3.3355342136854745e-05, + "loss": 0.6238, + "step": 8330 + }, + { + "epoch": 10.66368, + "grad_norm": 0.8470461964607239, + "learning_rate": 3.3353341336534616e-05, + "loss": 0.5878, + "step": 8331 + }, + { + "epoch": 10.66496, + "grad_norm": 0.8690903782844543, + "learning_rate": 3.335134053621449e-05, + "loss": 0.6082, + "step": 8332 + }, + { + "epoch": 10.66624, + "grad_norm": 0.8474588990211487, + "learning_rate": 3.334933973589436e-05, + "loss": 0.5873, + "step": 8333 + }, + { + "epoch": 10.66752, + "grad_norm": 0.9330595135688782, + "learning_rate": 3.334733893557423e-05, + "loss": 0.6094, + "step": 8334 + }, + { + "epoch": 10.6688, + "grad_norm": 0.8823012709617615, + "learning_rate": 3.3345338135254104e-05, + "loss": 0.5635, + "step": 8335 + }, + { + "epoch": 10.67008, + "grad_norm": 0.8572724461555481, + "learning_rate": 3.3343337334933976e-05, + "loss": 0.5932, + "step": 8336 + }, + { + "epoch": 10.67136, + "grad_norm": 0.8401781916618347, + "learning_rate": 3.334133653461385e-05, + "loss": 0.595, + "step": 8337 + }, + { + "epoch": 10.67264, + "grad_norm": 0.841372549533844, + "learning_rate": 3.333933573429372e-05, + "loss": 0.5606, + "step": 8338 + }, + { + "epoch": 10.67392, + "grad_norm": 0.8536092042922974, + "learning_rate": 3.333733493397359e-05, + "loss": 0.5635, + "step": 8339 + }, + { + "epoch": 10.6752, + "grad_norm": 0.8724260330200195, + "learning_rate": 3.333533413365346e-05, + "loss": 0.5582, + "step": 8340 + }, + { + "epoch": 10.67648, + "grad_norm": 0.8848631978034973, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5671, + "step": 8341 + }, + { + "epoch": 10.67776, + "grad_norm": 0.8863515257835388, + "learning_rate": 3.333133253301321e-05, + "loss": 0.6179, + "step": 8342 + }, + { + "epoch": 10.67904, + "grad_norm": 0.8705168962478638, + "learning_rate": 3.332933173269308e-05, + "loss": 0.5777, + "step": 8343 + }, + { + 
"epoch": 10.68032, + "grad_norm": 0.845525324344635, + "learning_rate": 3.332733093237295e-05, + "loss": 0.5936, + "step": 8344 + }, + { + "epoch": 10.6816, + "grad_norm": 0.9055159687995911, + "learning_rate": 3.332533013205282e-05, + "loss": 0.5915, + "step": 8345 + }, + { + "epoch": 10.68288, + "grad_norm": 0.8923260569572449, + "learning_rate": 3.3323329331732694e-05, + "loss": 0.5929, + "step": 8346 + }, + { + "epoch": 10.68416, + "grad_norm": 0.8504287600517273, + "learning_rate": 3.3321328531412566e-05, + "loss": 0.5632, + "step": 8347 + }, + { + "epoch": 10.68544, + "grad_norm": 0.8821974992752075, + "learning_rate": 3.331932773109244e-05, + "loss": 0.5745, + "step": 8348 + }, + { + "epoch": 10.68672, + "grad_norm": 0.8966748118400574, + "learning_rate": 3.331732693077231e-05, + "loss": 0.5867, + "step": 8349 + }, + { + "epoch": 10.688, + "grad_norm": 0.8681886196136475, + "learning_rate": 3.331532613045218e-05, + "loss": 0.5741, + "step": 8350 + }, + { + "epoch": 10.68928, + "grad_norm": 0.8703950643539429, + "learning_rate": 3.3313325330132053e-05, + "loss": 0.5322, + "step": 8351 + }, + { + "epoch": 10.69056, + "grad_norm": 0.8685113787651062, + "learning_rate": 3.3311324529811925e-05, + "loss": 0.5438, + "step": 8352 + }, + { + "epoch": 10.69184, + "grad_norm": 0.8648349642753601, + "learning_rate": 3.33093237294918e-05, + "loss": 0.5632, + "step": 8353 + }, + { + "epoch": 10.69312, + "grad_norm": 0.906882107257843, + "learning_rate": 3.330732292917167e-05, + "loss": 0.5535, + "step": 8354 + }, + { + "epoch": 10.6944, + "grad_norm": 0.8382284045219421, + "learning_rate": 3.330532212885154e-05, + "loss": 0.542, + "step": 8355 + }, + { + "epoch": 10.69568, + "grad_norm": 0.90635746717453, + "learning_rate": 3.330332132853141e-05, + "loss": 0.588, + "step": 8356 + }, + { + "epoch": 10.69696, + "grad_norm": 0.8943015336990356, + "learning_rate": 3.330132052821129e-05, + "loss": 0.5763, + "step": 8357 + }, + { + "epoch": 10.69824, + "grad_norm": 
0.8443808555603027, + "learning_rate": 3.3299319727891156e-05, + "loss": 0.4964, + "step": 8358 + }, + { + "epoch": 10.69952, + "grad_norm": 0.8682435750961304, + "learning_rate": 3.329731892757103e-05, + "loss": 0.5687, + "step": 8359 + }, + { + "epoch": 10.7008, + "grad_norm": 0.856095552444458, + "learning_rate": 3.32953181272509e-05, + "loss": 0.5713, + "step": 8360 + }, + { + "epoch": 10.70208, + "grad_norm": 0.8573371171951294, + "learning_rate": 3.329331732693077e-05, + "loss": 0.5471, + "step": 8361 + }, + { + "epoch": 10.70336, + "grad_norm": 0.8584128618240356, + "learning_rate": 3.3291316526610644e-05, + "loss": 0.545, + "step": 8362 + }, + { + "epoch": 10.70464, + "grad_norm": 0.827085554599762, + "learning_rate": 3.3289315726290516e-05, + "loss": 0.5662, + "step": 8363 + }, + { + "epoch": 10.70592, + "grad_norm": 0.852120578289032, + "learning_rate": 3.3287314925970394e-05, + "loss": 0.5583, + "step": 8364 + }, + { + "epoch": 10.7072, + "grad_norm": 0.8772450685501099, + "learning_rate": 3.3285314125650266e-05, + "loss": 0.5958, + "step": 8365 + }, + { + "epoch": 10.70848, + "grad_norm": 0.7987073063850403, + "learning_rate": 3.328331332533013e-05, + "loss": 0.5179, + "step": 8366 + }, + { + "epoch": 10.70976, + "grad_norm": 0.8500874638557434, + "learning_rate": 3.328131252501e-05, + "loss": 0.5829, + "step": 8367 + }, + { + "epoch": 10.71104, + "grad_norm": 0.8709688782691956, + "learning_rate": 3.3279311724689875e-05, + "loss": 0.5892, + "step": 8368 + }, + { + "epoch": 10.71232, + "grad_norm": 0.8327182531356812, + "learning_rate": 3.327731092436975e-05, + "loss": 0.5589, + "step": 8369 + }, + { + "epoch": 10.7136, + "grad_norm": 0.8227571845054626, + "learning_rate": 3.327531012404962e-05, + "loss": 0.5639, + "step": 8370 + }, + { + "epoch": 10.71488, + "grad_norm": 0.8822687268257141, + "learning_rate": 3.32733093237295e-05, + "loss": 0.5801, + "step": 8371 + }, + { + "epoch": 10.71616, + "grad_norm": 0.8199798464775085, + "learning_rate": 
3.327130852340937e-05, + "loss": 0.5418, + "step": 8372 + }, + { + "epoch": 10.71744, + "grad_norm": 0.877302885055542, + "learning_rate": 3.326930772308924e-05, + "loss": 0.5717, + "step": 8373 + }, + { + "epoch": 10.71872, + "grad_norm": 0.874665379524231, + "learning_rate": 3.3267306922769106e-05, + "loss": 0.558, + "step": 8374 + }, + { + "epoch": 10.72, + "grad_norm": 0.8584824800491333, + "learning_rate": 3.326530612244898e-05, + "loss": 0.5602, + "step": 8375 + }, + { + "epoch": 10.72128, + "grad_norm": 0.8538774251937866, + "learning_rate": 3.326330532212885e-05, + "loss": 0.5614, + "step": 8376 + }, + { + "epoch": 10.72256, + "grad_norm": 0.8546411991119385, + "learning_rate": 3.326130452180872e-05, + "loss": 0.6141, + "step": 8377 + }, + { + "epoch": 10.72384, + "grad_norm": 0.8563506007194519, + "learning_rate": 3.32593037214886e-05, + "loss": 0.5317, + "step": 8378 + }, + { + "epoch": 10.72512, + "grad_norm": 0.88761305809021, + "learning_rate": 3.325730292116847e-05, + "loss": 0.5783, + "step": 8379 + }, + { + "epoch": 10.7264, + "grad_norm": 0.8833502531051636, + "learning_rate": 3.3255302120848344e-05, + "loss": 0.5822, + "step": 8380 + }, + { + "epoch": 10.72768, + "grad_norm": 0.8859793543815613, + "learning_rate": 3.3253301320528216e-05, + "loss": 0.5652, + "step": 8381 + }, + { + "epoch": 10.72896, + "grad_norm": 0.855240523815155, + "learning_rate": 3.325130052020808e-05, + "loss": 0.5449, + "step": 8382 + }, + { + "epoch": 10.73024, + "grad_norm": 0.8389564156532288, + "learning_rate": 3.324929971988795e-05, + "loss": 0.5591, + "step": 8383 + }, + { + "epoch": 10.73152, + "grad_norm": 0.8849216103553772, + "learning_rate": 3.3247298919567825e-05, + "loss": 0.5663, + "step": 8384 + }, + { + "epoch": 10.7328, + "grad_norm": 0.8661803007125854, + "learning_rate": 3.3245298119247703e-05, + "loss": 0.59, + "step": 8385 + }, + { + "epoch": 10.73408, + "grad_norm": 0.8539361953735352, + "learning_rate": 3.3243297318927575e-05, + "loss": 0.5407, + 
"step": 8386 + }, + { + "epoch": 10.73536, + "grad_norm": 0.9314575791358948, + "learning_rate": 3.324129651860745e-05, + "loss": 0.6113, + "step": 8387 + }, + { + "epoch": 10.73664, + "grad_norm": 0.8329246044158936, + "learning_rate": 3.323929571828732e-05, + "loss": 0.5317, + "step": 8388 + }, + { + "epoch": 10.73792, + "grad_norm": 0.8746315240859985, + "learning_rate": 3.323729491796719e-05, + "loss": 0.6104, + "step": 8389 + }, + { + "epoch": 10.7392, + "grad_norm": 0.8568903803825378, + "learning_rate": 3.3235294117647056e-05, + "loss": 0.6033, + "step": 8390 + }, + { + "epoch": 10.74048, + "grad_norm": 0.8712742924690247, + "learning_rate": 3.323329331732693e-05, + "loss": 0.5644, + "step": 8391 + }, + { + "epoch": 10.74176, + "grad_norm": 0.8397703170776367, + "learning_rate": 3.3231292517006806e-05, + "loss": 0.5512, + "step": 8392 + }, + { + "epoch": 10.74304, + "grad_norm": 0.9338214993476868, + "learning_rate": 3.322929171668668e-05, + "loss": 0.6489, + "step": 8393 + }, + { + "epoch": 10.74432, + "grad_norm": 0.8752598762512207, + "learning_rate": 3.322729091636655e-05, + "loss": 0.5417, + "step": 8394 + }, + { + "epoch": 10.7456, + "grad_norm": 0.8597055077552795, + "learning_rate": 3.322529011604642e-05, + "loss": 0.5591, + "step": 8395 + }, + { + "epoch": 10.74688, + "grad_norm": 0.9333181381225586, + "learning_rate": 3.3223289315726294e-05, + "loss": 0.5923, + "step": 8396 + }, + { + "epoch": 10.74816, + "grad_norm": 0.8479388356208801, + "learning_rate": 3.3221288515406166e-05, + "loss": 0.5427, + "step": 8397 + }, + { + "epoch": 10.74944, + "grad_norm": 0.8617178201675415, + "learning_rate": 3.321928771508603e-05, + "loss": 0.5473, + "step": 8398 + }, + { + "epoch": 10.75072, + "grad_norm": 0.9343951344490051, + "learning_rate": 3.321728691476591e-05, + "loss": 0.6115, + "step": 8399 + }, + { + "epoch": 10.752, + "grad_norm": 0.9055424928665161, + "learning_rate": 3.321528611444578e-05, + "loss": 0.5647, + "step": 8400 + }, + { + "epoch": 
10.75328, + "grad_norm": 0.9039073586463928, + "learning_rate": 3.321328531412565e-05, + "loss": 0.5382, + "step": 8401 + }, + { + "epoch": 10.75456, + "grad_norm": 0.8732048869132996, + "learning_rate": 3.3211284513805525e-05, + "loss": 0.5488, + "step": 8402 + }, + { + "epoch": 10.75584, + "grad_norm": 0.859381914138794, + "learning_rate": 3.32092837134854e-05, + "loss": 0.6105, + "step": 8403 + }, + { + "epoch": 10.75712, + "grad_norm": 0.8635976910591125, + "learning_rate": 3.320728291316527e-05, + "loss": 0.5932, + "step": 8404 + }, + { + "epoch": 10.7584, + "grad_norm": 0.8763898611068726, + "learning_rate": 3.320528211284514e-05, + "loss": 0.5703, + "step": 8405 + }, + { + "epoch": 10.75968, + "grad_norm": 0.828618049621582, + "learning_rate": 3.320328131252501e-05, + "loss": 0.5675, + "step": 8406 + }, + { + "epoch": 10.76096, + "grad_norm": 0.894950807094574, + "learning_rate": 3.3201280512204884e-05, + "loss": 0.5832, + "step": 8407 + }, + { + "epoch": 10.76224, + "grad_norm": 0.8377487659454346, + "learning_rate": 3.3199279711884756e-05, + "loss": 0.5568, + "step": 8408 + }, + { + "epoch": 10.76352, + "grad_norm": 0.8179535269737244, + "learning_rate": 3.319727891156463e-05, + "loss": 0.4754, + "step": 8409 + }, + { + "epoch": 10.7648, + "grad_norm": 0.9088795185089111, + "learning_rate": 3.31952781112445e-05, + "loss": 0.6044, + "step": 8410 + }, + { + "epoch": 10.76608, + "grad_norm": 0.9149488806724548, + "learning_rate": 3.319327731092437e-05, + "loss": 0.577, + "step": 8411 + }, + { + "epoch": 10.76736, + "grad_norm": 0.9027832746505737, + "learning_rate": 3.3191276510604244e-05, + "loss": 0.5866, + "step": 8412 + }, + { + "epoch": 10.76864, + "grad_norm": 0.86416095495224, + "learning_rate": 3.3189275710284115e-05, + "loss": 0.6003, + "step": 8413 + }, + { + "epoch": 10.76992, + "grad_norm": 0.8570460677146912, + "learning_rate": 3.318727490996399e-05, + "loss": 0.562, + "step": 8414 + }, + { + "epoch": 10.7712, + "grad_norm": 0.9128470420837402, + 
"learning_rate": 3.318527410964386e-05, + "loss": 0.5915, + "step": 8415 + }, + { + "epoch": 10.77248, + "grad_norm": 0.8856536746025085, + "learning_rate": 3.318327330932373e-05, + "loss": 0.5406, + "step": 8416 + }, + { + "epoch": 10.77376, + "grad_norm": 0.8812784552574158, + "learning_rate": 3.31812725090036e-05, + "loss": 0.5233, + "step": 8417 + }, + { + "epoch": 10.77504, + "grad_norm": 0.8757975697517395, + "learning_rate": 3.3179271708683475e-05, + "loss": 0.5947, + "step": 8418 + }, + { + "epoch": 10.77632, + "grad_norm": 0.8653748035430908, + "learning_rate": 3.3177270908363347e-05, + "loss": 0.5823, + "step": 8419 + }, + { + "epoch": 10.7776, + "grad_norm": 0.9042747020721436, + "learning_rate": 3.317527010804322e-05, + "loss": 0.5583, + "step": 8420 + }, + { + "epoch": 10.778880000000001, + "grad_norm": 0.8836710453033447, + "learning_rate": 3.317326930772309e-05, + "loss": 0.563, + "step": 8421 + }, + { + "epoch": 10.78016, + "grad_norm": 0.8876301050186157, + "learning_rate": 3.317126850740296e-05, + "loss": 0.5814, + "step": 8422 + }, + { + "epoch": 10.78144, + "grad_norm": 0.8692103028297424, + "learning_rate": 3.3169267707082834e-05, + "loss": 0.6207, + "step": 8423 + }, + { + "epoch": 10.78272, + "grad_norm": 0.8895595669746399, + "learning_rate": 3.3167266906762706e-05, + "loss": 0.566, + "step": 8424 + }, + { + "epoch": 10.784, + "grad_norm": 0.8768266439437866, + "learning_rate": 3.316526610644258e-05, + "loss": 0.6218, + "step": 8425 + }, + { + "epoch": 10.78528, + "grad_norm": 0.8848718404769897, + "learning_rate": 3.316326530612245e-05, + "loss": 0.5943, + "step": 8426 + }, + { + "epoch": 10.78656, + "grad_norm": 0.9084532260894775, + "learning_rate": 3.316126450580233e-05, + "loss": 0.6096, + "step": 8427 + }, + { + "epoch": 10.78784, + "grad_norm": 0.9083433747291565, + "learning_rate": 3.315926370548219e-05, + "loss": 0.5836, + "step": 8428 + }, + { + "epoch": 10.78912, + "grad_norm": 0.8622574806213379, + "learning_rate": 
3.3157262905162065e-05, + "loss": 0.5541, + "step": 8429 + }, + { + "epoch": 10.7904, + "grad_norm": 0.8447574973106384, + "learning_rate": 3.315526210484194e-05, + "loss": 0.5544, + "step": 8430 + }, + { + "epoch": 10.79168, + "grad_norm": 0.8831068873405457, + "learning_rate": 3.315326130452181e-05, + "loss": 0.6255, + "step": 8431 + }, + { + "epoch": 10.79296, + "grad_norm": 0.8802228569984436, + "learning_rate": 3.315126050420168e-05, + "loss": 0.6056, + "step": 8432 + }, + { + "epoch": 10.79424, + "grad_norm": 0.8990511298179626, + "learning_rate": 3.314925970388155e-05, + "loss": 0.6257, + "step": 8433 + }, + { + "epoch": 10.79552, + "grad_norm": 0.8636667728424072, + "learning_rate": 3.3147258903561424e-05, + "loss": 0.5596, + "step": 8434 + }, + { + "epoch": 10.7968, + "grad_norm": 0.8775346279144287, + "learning_rate": 3.31452581032413e-05, + "loss": 0.5596, + "step": 8435 + }, + { + "epoch": 10.79808, + "grad_norm": 0.8895041942596436, + "learning_rate": 3.314325730292117e-05, + "loss": 0.5812, + "step": 8436 + }, + { + "epoch": 10.79936, + "grad_norm": 0.8645225167274475, + "learning_rate": 3.314125650260104e-05, + "loss": 0.566, + "step": 8437 + }, + { + "epoch": 10.80064, + "grad_norm": 0.9078739285469055, + "learning_rate": 3.313925570228091e-05, + "loss": 0.5976, + "step": 8438 + }, + { + "epoch": 10.801919999999999, + "grad_norm": 0.8740204572677612, + "learning_rate": 3.3137254901960784e-05, + "loss": 0.5386, + "step": 8439 + }, + { + "epoch": 10.8032, + "grad_norm": 0.8575959205627441, + "learning_rate": 3.3135254101640656e-05, + "loss": 0.5371, + "step": 8440 + }, + { + "epoch": 10.80448, + "grad_norm": 0.8617302179336548, + "learning_rate": 3.313325330132053e-05, + "loss": 0.5515, + "step": 8441 + }, + { + "epoch": 10.80576, + "grad_norm": 0.8753884434700012, + "learning_rate": 3.3131252501000406e-05, + "loss": 0.5821, + "step": 8442 + }, + { + "epoch": 10.80704, + "grad_norm": 0.8543848991394043, + "learning_rate": 3.312925170068028e-05, + 
"loss": 0.5948, + "step": 8443 + }, + { + "epoch": 10.80832, + "grad_norm": 0.8967148661613464, + "learning_rate": 3.312725090036014e-05, + "loss": 0.6083, + "step": 8444 + }, + { + "epoch": 10.8096, + "grad_norm": 0.8336367011070251, + "learning_rate": 3.3125250100040015e-05, + "loss": 0.5127, + "step": 8445 + }, + { + "epoch": 10.810880000000001, + "grad_norm": 0.8468740582466125, + "learning_rate": 3.312324929971989e-05, + "loss": 0.5522, + "step": 8446 + }, + { + "epoch": 10.81216, + "grad_norm": 0.8677701354026794, + "learning_rate": 3.312124849939976e-05, + "loss": 0.5554, + "step": 8447 + }, + { + "epoch": 10.81344, + "grad_norm": 0.8767905235290527, + "learning_rate": 3.311924769907963e-05, + "loss": 0.5572, + "step": 8448 + }, + { + "epoch": 10.81472, + "grad_norm": 0.8894631266593933, + "learning_rate": 3.311724689875951e-05, + "loss": 0.6002, + "step": 8449 + }, + { + "epoch": 10.816, + "grad_norm": 0.8528721332550049, + "learning_rate": 3.311524609843938e-05, + "loss": 0.5644, + "step": 8450 + }, + { + "epoch": 10.81728, + "grad_norm": 0.8421396017074585, + "learning_rate": 3.311324529811925e-05, + "loss": 0.5392, + "step": 8451 + }, + { + "epoch": 10.81856, + "grad_norm": 0.8507756590843201, + "learning_rate": 3.311124449779912e-05, + "loss": 0.5513, + "step": 8452 + }, + { + "epoch": 10.81984, + "grad_norm": 0.8500514626502991, + "learning_rate": 3.310924369747899e-05, + "loss": 0.6198, + "step": 8453 + }, + { + "epoch": 10.82112, + "grad_norm": 0.8615267872810364, + "learning_rate": 3.310724289715886e-05, + "loss": 0.5162, + "step": 8454 + }, + { + "epoch": 10.8224, + "grad_norm": 0.8663539290428162, + "learning_rate": 3.310524209683873e-05, + "loss": 0.5811, + "step": 8455 + }, + { + "epoch": 10.82368, + "grad_norm": 0.8774354457855225, + "learning_rate": 3.310324129651861e-05, + "loss": 0.6085, + "step": 8456 + }, + { + "epoch": 10.82496, + "grad_norm": 0.9009751081466675, + "learning_rate": 3.3101240496198484e-05, + "loss": 0.5154, + "step": 8457 
+ }, + { + "epoch": 10.82624, + "grad_norm": 0.9249484539031982, + "learning_rate": 3.3099239695878356e-05, + "loss": 0.6029, + "step": 8458 + }, + { + "epoch": 10.82752, + "grad_norm": 0.8451379537582397, + "learning_rate": 3.309723889555823e-05, + "loss": 0.5601, + "step": 8459 + }, + { + "epoch": 10.8288, + "grad_norm": 0.905768871307373, + "learning_rate": 3.309523809523809e-05, + "loss": 0.5652, + "step": 8460 + }, + { + "epoch": 10.83008, + "grad_norm": 0.9199037551879883, + "learning_rate": 3.3093237294917965e-05, + "loss": 0.6089, + "step": 8461 + }, + { + "epoch": 10.83136, + "grad_norm": 0.8506740927696228, + "learning_rate": 3.3091236494597836e-05, + "loss": 0.5242, + "step": 8462 + }, + { + "epoch": 10.83264, + "grad_norm": 0.9114420413970947, + "learning_rate": 3.3089235694277715e-05, + "loss": 0.6026, + "step": 8463 + }, + { + "epoch": 10.833919999999999, + "grad_norm": 0.8378769755363464, + "learning_rate": 3.308723489395759e-05, + "loss": 0.5428, + "step": 8464 + }, + { + "epoch": 10.8352, + "grad_norm": 0.863136887550354, + "learning_rate": 3.308523409363746e-05, + "loss": 0.5569, + "step": 8465 + }, + { + "epoch": 10.83648, + "grad_norm": 0.9558273553848267, + "learning_rate": 3.308323329331733e-05, + "loss": 0.6102, + "step": 8466 + }, + { + "epoch": 10.83776, + "grad_norm": 0.860105037689209, + "learning_rate": 3.30812324929972e-05, + "loss": 0.5872, + "step": 8467 + }, + { + "epoch": 10.83904, + "grad_norm": 0.8709381222724915, + "learning_rate": 3.307923169267707e-05, + "loss": 0.5548, + "step": 8468 + }, + { + "epoch": 10.84032, + "grad_norm": 0.8705414533615112, + "learning_rate": 3.307723089235694e-05, + "loss": 0.5317, + "step": 8469 + }, + { + "epoch": 10.8416, + "grad_norm": 0.8116998672485352, + "learning_rate": 3.307523009203682e-05, + "loss": 0.5055, + "step": 8470 + }, + { + "epoch": 10.84288, + "grad_norm": 0.9043382406234741, + "learning_rate": 3.307322929171669e-05, + "loss": 0.5995, + "step": 8471 + }, + { + "epoch": 10.84416, + 
"grad_norm": 0.9392409920692444, + "learning_rate": 3.307122849139656e-05, + "loss": 0.6222, + "step": 8472 + }, + { + "epoch": 10.84544, + "grad_norm": 0.8821365833282471, + "learning_rate": 3.3069227691076434e-05, + "loss": 0.5192, + "step": 8473 + }, + { + "epoch": 10.84672, + "grad_norm": 0.885803759098053, + "learning_rate": 3.3067226890756305e-05, + "loss": 0.6162, + "step": 8474 + }, + { + "epoch": 10.848, + "grad_norm": 0.8577411770820618, + "learning_rate": 3.306522609043618e-05, + "loss": 0.5609, + "step": 8475 + }, + { + "epoch": 10.84928, + "grad_norm": 0.8510453104972839, + "learning_rate": 3.306322529011604e-05, + "loss": 0.5378, + "step": 8476 + }, + { + "epoch": 10.85056, + "grad_norm": 0.8527485728263855, + "learning_rate": 3.306122448979592e-05, + "loss": 0.558, + "step": 8477 + }, + { + "epoch": 10.85184, + "grad_norm": 0.8647722601890564, + "learning_rate": 3.305922368947579e-05, + "loss": 0.5802, + "step": 8478 + }, + { + "epoch": 10.85312, + "grad_norm": 0.8825690746307373, + "learning_rate": 3.3057222889155665e-05, + "loss": 0.5959, + "step": 8479 + }, + { + "epoch": 10.8544, + "grad_norm": 0.8674288392066956, + "learning_rate": 3.3055222088835537e-05, + "loss": 0.5621, + "step": 8480 + }, + { + "epoch": 10.85568, + "grad_norm": 0.8390064239501953, + "learning_rate": 3.305322128851541e-05, + "loss": 0.5452, + "step": 8481 + }, + { + "epoch": 10.85696, + "grad_norm": 0.8398364186286926, + "learning_rate": 3.305122048819528e-05, + "loss": 0.5649, + "step": 8482 + }, + { + "epoch": 10.85824, + "grad_norm": 0.7934215664863586, + "learning_rate": 3.304921968787515e-05, + "loss": 0.5222, + "step": 8483 + }, + { + "epoch": 10.85952, + "grad_norm": 0.8218225240707397, + "learning_rate": 3.3047218887555024e-05, + "loss": 0.5509, + "step": 8484 + }, + { + "epoch": 10.8608, + "grad_norm": 0.8352612257003784, + "learning_rate": 3.3045218087234896e-05, + "loss": 0.564, + "step": 8485 + }, + { + "epoch": 10.86208, + "grad_norm": 0.8846644759178162, + 
"learning_rate": 3.304321728691477e-05, + "loss": 0.5925, + "step": 8486 + }, + { + "epoch": 10.86336, + "grad_norm": 0.8664666414260864, + "learning_rate": 3.304121648659464e-05, + "loss": 0.573, + "step": 8487 + }, + { + "epoch": 10.86464, + "grad_norm": 0.8052683472633362, + "learning_rate": 3.303921568627451e-05, + "loss": 0.5172, + "step": 8488 + }, + { + "epoch": 10.86592, + "grad_norm": 0.8650311827659607, + "learning_rate": 3.303721488595438e-05, + "loss": 0.5449, + "step": 8489 + }, + { + "epoch": 10.8672, + "grad_norm": 0.8478010296821594, + "learning_rate": 3.3035214085634255e-05, + "loss": 0.5509, + "step": 8490 + }, + { + "epoch": 10.86848, + "grad_norm": 0.8610655665397644, + "learning_rate": 3.303321328531413e-05, + "loss": 0.5669, + "step": 8491 + }, + { + "epoch": 10.86976, + "grad_norm": 0.8204941153526306, + "learning_rate": 3.3031212484994e-05, + "loss": 0.5607, + "step": 8492 + }, + { + "epoch": 10.87104, + "grad_norm": 0.8423011302947998, + "learning_rate": 3.302921168467387e-05, + "loss": 0.5431, + "step": 8493 + }, + { + "epoch": 10.87232, + "grad_norm": 0.8661743998527527, + "learning_rate": 3.302721088435374e-05, + "loss": 0.5609, + "step": 8494 + }, + { + "epoch": 10.8736, + "grad_norm": 0.8518742918968201, + "learning_rate": 3.3025210084033614e-05, + "loss": 0.5223, + "step": 8495 + }, + { + "epoch": 10.87488, + "grad_norm": 0.9045655131340027, + "learning_rate": 3.3023209283713486e-05, + "loss": 0.6088, + "step": 8496 + }, + { + "epoch": 10.87616, + "grad_norm": 0.8613651990890503, + "learning_rate": 3.302120848339336e-05, + "loss": 0.5155, + "step": 8497 + }, + { + "epoch": 10.87744, + "grad_norm": 0.8599672317504883, + "learning_rate": 3.301920768307323e-05, + "loss": 0.5743, + "step": 8498 + }, + { + "epoch": 10.87872, + "grad_norm": 0.8626627922058105, + "learning_rate": 3.30172068827531e-05, + "loss": 0.5878, + "step": 8499 + }, + { + "epoch": 10.88, + "grad_norm": 0.879513144493103, + "learning_rate": 3.3015206082432974e-05, + 
"loss": 0.5954, + "step": 8500 + }, + { + "epoch": 10.88128, + "grad_norm": 0.8912549614906311, + "learning_rate": 3.3013205282112846e-05, + "loss": 0.585, + "step": 8501 + }, + { + "epoch": 10.88256, + "grad_norm": 0.8111445903778076, + "learning_rate": 3.301120448179272e-05, + "loss": 0.5391, + "step": 8502 + }, + { + "epoch": 10.88384, + "grad_norm": 0.8916809558868408, + "learning_rate": 3.300920368147259e-05, + "loss": 0.6075, + "step": 8503 + }, + { + "epoch": 10.88512, + "grad_norm": 0.8808960318565369, + "learning_rate": 3.300720288115246e-05, + "loss": 0.5037, + "step": 8504 + }, + { + "epoch": 10.8864, + "grad_norm": 0.8821715116500854, + "learning_rate": 3.300520208083234e-05, + "loss": 0.566, + "step": 8505 + }, + { + "epoch": 10.88768, + "grad_norm": 0.8724703192710876, + "learning_rate": 3.3003201280512205e-05, + "loss": 0.5715, + "step": 8506 + }, + { + "epoch": 10.88896, + "grad_norm": 0.8815937638282776, + "learning_rate": 3.300120048019208e-05, + "loss": 0.5345, + "step": 8507 + }, + { + "epoch": 10.89024, + "grad_norm": 0.8287563920021057, + "learning_rate": 3.299919967987195e-05, + "loss": 0.551, + "step": 8508 + }, + { + "epoch": 10.89152, + "grad_norm": 0.9234015941619873, + "learning_rate": 3.299719887955182e-05, + "loss": 0.6083, + "step": 8509 + }, + { + "epoch": 10.8928, + "grad_norm": 0.8626712560653687, + "learning_rate": 3.299519807923169e-05, + "loss": 0.6007, + "step": 8510 + }, + { + "epoch": 10.89408, + "grad_norm": 0.8290971517562866, + "learning_rate": 3.2993197278911564e-05, + "loss": 0.5234, + "step": 8511 + }, + { + "epoch": 10.89536, + "grad_norm": 0.8788360357284546, + "learning_rate": 3.299119647859144e-05, + "loss": 0.6131, + "step": 8512 + }, + { + "epoch": 10.89664, + "grad_norm": 0.8960014581680298, + "learning_rate": 3.2989195678271315e-05, + "loss": 0.6024, + "step": 8513 + }, + { + "epoch": 10.89792, + "grad_norm": 0.8566673398017883, + "learning_rate": 3.298719487795118e-05, + "loss": 0.5515, + "step": 8514 + }, + { 
+ "epoch": 10.8992, + "grad_norm": 0.8075211048126221, + "learning_rate": 3.298519407763105e-05, + "loss": 0.516, + "step": 8515 + }, + { + "epoch": 10.90048, + "grad_norm": 0.9277984499931335, + "learning_rate": 3.2983193277310923e-05, + "loss": 0.6206, + "step": 8516 + }, + { + "epoch": 10.90176, + "grad_norm": 0.8750211596488953, + "learning_rate": 3.2981192476990795e-05, + "loss": 0.5697, + "step": 8517 + }, + { + "epoch": 10.90304, + "grad_norm": 0.8238059282302856, + "learning_rate": 3.297919167667067e-05, + "loss": 0.541, + "step": 8518 + }, + { + "epoch": 10.90432, + "grad_norm": 0.8247233629226685, + "learning_rate": 3.2977190876350546e-05, + "loss": 0.5711, + "step": 8519 + }, + { + "epoch": 10.9056, + "grad_norm": 0.880142092704773, + "learning_rate": 3.297519007603042e-05, + "loss": 0.581, + "step": 8520 + }, + { + "epoch": 10.90688, + "grad_norm": 0.9421783089637756, + "learning_rate": 3.297318927571029e-05, + "loss": 0.6042, + "step": 8521 + }, + { + "epoch": 10.90816, + "grad_norm": 0.8348096609115601, + "learning_rate": 3.2971188475390155e-05, + "loss": 0.5075, + "step": 8522 + }, + { + "epoch": 10.90944, + "grad_norm": 0.8904414772987366, + "learning_rate": 3.2969187675070026e-05, + "loss": 0.6436, + "step": 8523 + }, + { + "epoch": 10.91072, + "grad_norm": 0.927683413028717, + "learning_rate": 3.29671868747499e-05, + "loss": 0.6266, + "step": 8524 + }, + { + "epoch": 10.912, + "grad_norm": 0.8911395072937012, + "learning_rate": 3.296518607442977e-05, + "loss": 0.6085, + "step": 8525 + }, + { + "epoch": 10.91328, + "grad_norm": 0.8726085424423218, + "learning_rate": 3.296318527410965e-05, + "loss": 0.5798, + "step": 8526 + }, + { + "epoch": 10.91456, + "grad_norm": 0.8867003321647644, + "learning_rate": 3.296118447378952e-05, + "loss": 0.5685, + "step": 8527 + }, + { + "epoch": 10.91584, + "grad_norm": 0.877424418926239, + "learning_rate": 3.295918367346939e-05, + "loss": 0.5986, + "step": 8528 + }, + { + "epoch": 10.91712, + "grad_norm": 
0.8679222464561462, + "learning_rate": 3.2957182873149264e-05, + "loss": 0.5735, + "step": 8529 + }, + { + "epoch": 10.9184, + "grad_norm": 0.8904789090156555, + "learning_rate": 3.295518207282913e-05, + "loss": 0.567, + "step": 8530 + }, + { + "epoch": 10.91968, + "grad_norm": 0.8913958668708801, + "learning_rate": 3.2953181272509e-05, + "loss": 0.57, + "step": 8531 + }, + { + "epoch": 10.920960000000001, + "grad_norm": 0.8957118391990662, + "learning_rate": 3.295118047218887e-05, + "loss": 0.5692, + "step": 8532 + }, + { + "epoch": 10.92224, + "grad_norm": 0.8906075358390808, + "learning_rate": 3.294917967186875e-05, + "loss": 0.5645, + "step": 8533 + }, + { + "epoch": 10.92352, + "grad_norm": 0.8821467757225037, + "learning_rate": 3.2947178871548624e-05, + "loss": 0.5799, + "step": 8534 + }, + { + "epoch": 10.9248, + "grad_norm": 0.8894703984260559, + "learning_rate": 3.2945178071228495e-05, + "loss": 0.6076, + "step": 8535 + }, + { + "epoch": 10.92608, + "grad_norm": 0.8252606987953186, + "learning_rate": 3.294317727090837e-05, + "loss": 0.5762, + "step": 8536 + }, + { + "epoch": 10.92736, + "grad_norm": 0.8460403680801392, + "learning_rate": 3.294117647058824e-05, + "loss": 0.5853, + "step": 8537 + }, + { + "epoch": 10.92864, + "grad_norm": 0.8825834393501282, + "learning_rate": 3.2939175670268104e-05, + "loss": 0.5628, + "step": 8538 + }, + { + "epoch": 10.92992, + "grad_norm": 0.8889744877815247, + "learning_rate": 3.2937174869947976e-05, + "loss": 0.549, + "step": 8539 + }, + { + "epoch": 10.9312, + "grad_norm": 0.9132182002067566, + "learning_rate": 3.2935174069627855e-05, + "loss": 0.5821, + "step": 8540 + }, + { + "epoch": 10.93248, + "grad_norm": 0.8521642684936523, + "learning_rate": 3.2933173269307727e-05, + "loss": 0.5378, + "step": 8541 + }, + { + "epoch": 10.93376, + "grad_norm": 0.8864872455596924, + "learning_rate": 3.29311724689876e-05, + "loss": 0.5589, + "step": 8542 + }, + { + "epoch": 10.93504, + "grad_norm": 0.8883858919143677, + 
"learning_rate": 3.292917166866747e-05, + "loss": 0.5716, + "step": 8543 + }, + { + "epoch": 10.93632, + "grad_norm": 0.862106442451477, + "learning_rate": 3.292717086834734e-05, + "loss": 0.5487, + "step": 8544 + }, + { + "epoch": 10.9376, + "grad_norm": 0.9219768643379211, + "learning_rate": 3.2925170068027214e-05, + "loss": 0.5911, + "step": 8545 + }, + { + "epoch": 10.93888, + "grad_norm": 0.9102250337600708, + "learning_rate": 3.292316926770708e-05, + "loss": 0.5712, + "step": 8546 + }, + { + "epoch": 10.94016, + "grad_norm": 0.8525840640068054, + "learning_rate": 3.292116846738695e-05, + "loss": 0.5885, + "step": 8547 + }, + { + "epoch": 10.94144, + "grad_norm": 0.907727837562561, + "learning_rate": 3.291916766706683e-05, + "loss": 0.6011, + "step": 8548 + }, + { + "epoch": 10.94272, + "grad_norm": 0.8702722191810608, + "learning_rate": 3.29171668667467e-05, + "loss": 0.564, + "step": 8549 + }, + { + "epoch": 10.943999999999999, + "grad_norm": 0.8578301668167114, + "learning_rate": 3.291516606642657e-05, + "loss": 0.5763, + "step": 8550 + }, + { + "epoch": 10.94528, + "grad_norm": 0.9135550260543823, + "learning_rate": 3.2913165266106445e-05, + "loss": 0.6035, + "step": 8551 + }, + { + "epoch": 10.94656, + "grad_norm": 0.8668347001075745, + "learning_rate": 3.291116446578632e-05, + "loss": 0.5407, + "step": 8552 + }, + { + "epoch": 10.94784, + "grad_norm": 0.8684234619140625, + "learning_rate": 3.290916366546619e-05, + "loss": 0.5913, + "step": 8553 + }, + { + "epoch": 10.94912, + "grad_norm": 0.8825991749763489, + "learning_rate": 3.2907162865146054e-05, + "loss": 0.6135, + "step": 8554 + }, + { + "epoch": 10.9504, + "grad_norm": 0.9004281163215637, + "learning_rate": 3.290516206482593e-05, + "loss": 0.6063, + "step": 8555 + }, + { + "epoch": 10.95168, + "grad_norm": 0.8196146488189697, + "learning_rate": 3.2903161264505804e-05, + "loss": 0.5095, + "step": 8556 + }, + { + "epoch": 10.952960000000001, + "grad_norm": 0.8968302011489868, + "learning_rate": 
3.2901160464185676e-05, + "loss": 0.6126, + "step": 8557 + }, + { + "epoch": 10.95424, + "grad_norm": 0.8384824991226196, + "learning_rate": 3.289915966386555e-05, + "loss": 0.5662, + "step": 8558 + }, + { + "epoch": 10.95552, + "grad_norm": 0.9103649258613586, + "learning_rate": 3.289715886354542e-05, + "loss": 0.5788, + "step": 8559 + }, + { + "epoch": 10.9568, + "grad_norm": 0.8952497243881226, + "learning_rate": 3.289515806322529e-05, + "loss": 0.5809, + "step": 8560 + }, + { + "epoch": 10.95808, + "grad_norm": 0.8801275491714478, + "learning_rate": 3.2893157262905164e-05, + "loss": 0.5753, + "step": 8561 + }, + { + "epoch": 10.95936, + "grad_norm": 0.9225260615348816, + "learning_rate": 3.2891156462585036e-05, + "loss": 0.601, + "step": 8562 + }, + { + "epoch": 10.96064, + "grad_norm": 0.8567471504211426, + "learning_rate": 3.288915566226491e-05, + "loss": 0.554, + "step": 8563 + }, + { + "epoch": 10.96192, + "grad_norm": 0.8874462842941284, + "learning_rate": 3.288715486194478e-05, + "loss": 0.5497, + "step": 8564 + }, + { + "epoch": 10.9632, + "grad_norm": 0.916522741317749, + "learning_rate": 3.288515406162465e-05, + "loss": 0.6097, + "step": 8565 + }, + { + "epoch": 10.96448, + "grad_norm": 0.890692412853241, + "learning_rate": 3.288315326130452e-05, + "loss": 0.5895, + "step": 8566 + }, + { + "epoch": 10.96576, + "grad_norm": 0.8528810739517212, + "learning_rate": 3.2881152460984395e-05, + "loss": 0.5664, + "step": 8567 + }, + { + "epoch": 10.96704, + "grad_norm": 0.8634024858474731, + "learning_rate": 3.287915166066427e-05, + "loss": 0.598, + "step": 8568 + }, + { + "epoch": 10.96832, + "grad_norm": 0.8372824788093567, + "learning_rate": 3.287715086034414e-05, + "loss": 0.5313, + "step": 8569 + }, + { + "epoch": 10.9696, + "grad_norm": 0.8880471587181091, + "learning_rate": 3.287515006002401e-05, + "loss": 0.5818, + "step": 8570 + }, + { + "epoch": 10.97088, + "grad_norm": 0.8914214372634888, + "learning_rate": 3.287314925970388e-05, + "loss": 0.5708, + 
"step": 8571 + }, + { + "epoch": 10.97216, + "grad_norm": 0.8861146569252014, + "learning_rate": 3.2871148459383754e-05, + "loss": 0.5029, + "step": 8572 + }, + { + "epoch": 10.97344, + "grad_norm": 0.9242550730705261, + "learning_rate": 3.2869147659063626e-05, + "loss": 0.5788, + "step": 8573 + }, + { + "epoch": 10.97472, + "grad_norm": 0.8695968389511108, + "learning_rate": 3.28671468587435e-05, + "loss": 0.5828, + "step": 8574 + }, + { + "epoch": 10.975999999999999, + "grad_norm": 0.8502150177955627, + "learning_rate": 3.286514605842337e-05, + "loss": 0.5377, + "step": 8575 + }, + { + "epoch": 10.97728, + "grad_norm": 0.8788812160491943, + "learning_rate": 3.286314525810325e-05, + "loss": 0.5599, + "step": 8576 + }, + { + "epoch": 10.97856, + "grad_norm": 0.9098305106163025, + "learning_rate": 3.2861144457783113e-05, + "loss": 0.5665, + "step": 8577 + }, + { + "epoch": 10.97984, + "grad_norm": 0.9447456002235413, + "learning_rate": 3.2859143657462985e-05, + "loss": 0.5948, + "step": 8578 + }, + { + "epoch": 10.98112, + "grad_norm": 0.9016488194465637, + "learning_rate": 3.285714285714286e-05, + "loss": 0.5681, + "step": 8579 + }, + { + "epoch": 10.9824, + "grad_norm": 0.8744751811027527, + "learning_rate": 3.285514205682273e-05, + "loss": 0.5673, + "step": 8580 + }, + { + "epoch": 10.98368, + "grad_norm": 0.8944303393363953, + "learning_rate": 3.28531412565026e-05, + "loss": 0.5808, + "step": 8581 + }, + { + "epoch": 10.98496, + "grad_norm": 0.8822252750396729, + "learning_rate": 3.285114045618247e-05, + "loss": 0.5607, + "step": 8582 + }, + { + "epoch": 10.98624, + "grad_norm": 0.8522640466690063, + "learning_rate": 3.284913965586235e-05, + "loss": 0.5971, + "step": 8583 + }, + { + "epoch": 10.98752, + "grad_norm": 0.874326765537262, + "learning_rate": 3.284713885554222e-05, + "loss": 0.6094, + "step": 8584 + }, + { + "epoch": 10.9888, + "grad_norm": 0.9082483053207397, + "learning_rate": 3.284513805522209e-05, + "loss": 0.5866, + "step": 8585 + }, + { + 
"epoch": 10.99008, + "grad_norm": 0.8725292086601257, + "learning_rate": 3.284313725490196e-05, + "loss": 0.6204, + "step": 8586 + }, + { + "epoch": 10.99136, + "grad_norm": 0.8516504168510437, + "learning_rate": 3.284113645458183e-05, + "loss": 0.5942, + "step": 8587 + }, + { + "epoch": 10.99264, + "grad_norm": 0.8405760526657104, + "learning_rate": 3.2839135654261704e-05, + "loss": 0.6035, + "step": 8588 + }, + { + "epoch": 10.99392, + "grad_norm": 0.8437188863754272, + "learning_rate": 3.2837134853941576e-05, + "loss": 0.5071, + "step": 8589 + }, + { + "epoch": 10.9952, + "grad_norm": 0.877285897731781, + "learning_rate": 3.2835134053621454e-05, + "loss": 0.5761, + "step": 8590 + }, + { + "epoch": 10.99648, + "grad_norm": 0.8691837191581726, + "learning_rate": 3.2833133253301326e-05, + "loss": 0.6239, + "step": 8591 + }, + { + "epoch": 10.99776, + "grad_norm": 0.8213376402854919, + "learning_rate": 3.28311324529812e-05, + "loss": 0.5429, + "step": 8592 + }, + { + "epoch": 10.99904, + "grad_norm": 0.8809442520141602, + "learning_rate": 3.282913165266106e-05, + "loss": 0.5888, + "step": 8593 + }, + { + "epoch": 11.00032, + "grad_norm": 1.9187315702438354, + "learning_rate": 3.2827130852340935e-05, + "loss": 1.0473, + "step": 8594 + }, + { + "epoch": 11.0016, + "grad_norm": 0.9014028310775757, + "learning_rate": 3.282513005202081e-05, + "loss": 0.587, + "step": 8595 + }, + { + "epoch": 11.00288, + "grad_norm": 0.8248893022537231, + "learning_rate": 3.282312925170068e-05, + "loss": 0.5534, + "step": 8596 + }, + { + "epoch": 11.00416, + "grad_norm": 0.8243193626403809, + "learning_rate": 3.282112845138056e-05, + "loss": 0.5334, + "step": 8597 + }, + { + "epoch": 11.00544, + "grad_norm": 0.8348228931427002, + "learning_rate": 3.281912765106043e-05, + "loss": 0.5808, + "step": 8598 + }, + { + "epoch": 11.00672, + "grad_norm": 0.8530436754226685, + "learning_rate": 3.28171268507403e-05, + "loss": 0.5644, + "step": 8599 + }, + { + "epoch": 11.008, + "grad_norm": 
0.8829576969146729, + "learning_rate": 3.281512605042017e-05, + "loss": 0.5946, + "step": 8600 + }, + { + "epoch": 11.00928, + "grad_norm": 0.8608317971229553, + "learning_rate": 3.281312525010004e-05, + "loss": 0.5479, + "step": 8601 + }, + { + "epoch": 11.01056, + "grad_norm": 0.8292250037193298, + "learning_rate": 3.281112444977991e-05, + "loss": 0.5278, + "step": 8602 + }, + { + "epoch": 11.01184, + "grad_norm": 0.8977212905883789, + "learning_rate": 3.280912364945978e-05, + "loss": 0.6198, + "step": 8603 + }, + { + "epoch": 11.01312, + "grad_norm": 0.866463840007782, + "learning_rate": 3.280712284913966e-05, + "loss": 0.5493, + "step": 8604 + }, + { + "epoch": 11.0144, + "grad_norm": 0.9034777879714966, + "learning_rate": 3.280512204881953e-05, + "loss": 0.5457, + "step": 8605 + }, + { + "epoch": 11.01568, + "grad_norm": 0.9087190628051758, + "learning_rate": 3.2803121248499404e-05, + "loss": 0.5557, + "step": 8606 + }, + { + "epoch": 11.01696, + "grad_norm": 0.8566074967384338, + "learning_rate": 3.2801120448179276e-05, + "loss": 0.5722, + "step": 8607 + }, + { + "epoch": 11.01824, + "grad_norm": 0.8810505270957947, + "learning_rate": 3.279911964785915e-05, + "loss": 0.5842, + "step": 8608 + }, + { + "epoch": 11.01952, + "grad_norm": 0.8587162494659424, + "learning_rate": 3.279711884753901e-05, + "loss": 0.4996, + "step": 8609 + }, + { + "epoch": 11.0208, + "grad_norm": 0.8678077459335327, + "learning_rate": 3.2795118047218885e-05, + "loss": 0.5706, + "step": 8610 + }, + { + "epoch": 11.02208, + "grad_norm": 0.8607105612754822, + "learning_rate": 3.279311724689876e-05, + "loss": 0.548, + "step": 8611 + }, + { + "epoch": 11.02336, + "grad_norm": 0.8053077459335327, + "learning_rate": 3.2791116446578635e-05, + "loss": 0.5258, + "step": 8612 + }, + { + "epoch": 11.02464, + "grad_norm": 0.8794430494308472, + "learning_rate": 3.278911564625851e-05, + "loss": 0.5362, + "step": 8613 + }, + { + "epoch": 11.02592, + "grad_norm": 0.8815926313400269, + "learning_rate": 
3.278711484593838e-05, + "loss": 0.5306, + "step": 8614 + }, + { + "epoch": 11.0272, + "grad_norm": 0.8498107194900513, + "learning_rate": 3.278511404561825e-05, + "loss": 0.5102, + "step": 8615 + }, + { + "epoch": 11.02848, + "grad_norm": 0.907062292098999, + "learning_rate": 3.278311324529812e-05, + "loss": 0.5752, + "step": 8616 + }, + { + "epoch": 11.02976, + "grad_norm": 0.8796223998069763, + "learning_rate": 3.278111244497799e-05, + "loss": 0.5416, + "step": 8617 + }, + { + "epoch": 11.03104, + "grad_norm": 0.9161748290061951, + "learning_rate": 3.2779111644657866e-05, + "loss": 0.5676, + "step": 8618 + }, + { + "epoch": 11.03232, + "grad_norm": 0.8488770127296448, + "learning_rate": 3.277711084433774e-05, + "loss": 0.5681, + "step": 8619 + }, + { + "epoch": 11.0336, + "grad_norm": 0.9294846057891846, + "learning_rate": 3.277511004401761e-05, + "loss": 0.5358, + "step": 8620 + }, + { + "epoch": 11.03488, + "grad_norm": 0.8868327736854553, + "learning_rate": 3.277310924369748e-05, + "loss": 0.4881, + "step": 8621 + }, + { + "epoch": 11.03616, + "grad_norm": 0.8899376392364502, + "learning_rate": 3.2771108443377354e-05, + "loss": 0.533, + "step": 8622 + }, + { + "epoch": 11.03744, + "grad_norm": 0.8953622579574585, + "learning_rate": 3.2769107643057226e-05, + "loss": 0.5389, + "step": 8623 + }, + { + "epoch": 11.03872, + "grad_norm": 0.9292336106300354, + "learning_rate": 3.27671068427371e-05, + "loss": 0.606, + "step": 8624 + }, + { + "epoch": 11.04, + "grad_norm": 0.9286232590675354, + "learning_rate": 3.276510604241697e-05, + "loss": 0.5897, + "step": 8625 + }, + { + "epoch": 11.04128, + "grad_norm": 0.9098997712135315, + "learning_rate": 3.276310524209684e-05, + "loss": 0.5316, + "step": 8626 + }, + { + "epoch": 11.04256, + "grad_norm": 0.9626008868217468, + "learning_rate": 3.276110444177671e-05, + "loss": 0.6035, + "step": 8627 + }, + { + "epoch": 11.04384, + "grad_norm": 0.8677993416786194, + "learning_rate": 3.2759103641456585e-05, + "loss": 0.4975, + 
"step": 8628 + }, + { + "epoch": 11.04512, + "grad_norm": 0.8999881148338318, + "learning_rate": 3.275710284113646e-05, + "loss": 0.5494, + "step": 8629 + }, + { + "epoch": 11.0464, + "grad_norm": 0.8779304623603821, + "learning_rate": 3.275510204081633e-05, + "loss": 0.5525, + "step": 8630 + }, + { + "epoch": 11.04768, + "grad_norm": 0.9179341793060303, + "learning_rate": 3.27531012404962e-05, + "loss": 0.5784, + "step": 8631 + }, + { + "epoch": 11.04896, + "grad_norm": 0.9072324633598328, + "learning_rate": 3.275110044017607e-05, + "loss": 0.548, + "step": 8632 + }, + { + "epoch": 11.05024, + "grad_norm": 0.9047489166259766, + "learning_rate": 3.2749099639855944e-05, + "loss": 0.5374, + "step": 8633 + }, + { + "epoch": 11.05152, + "grad_norm": 0.9042777419090271, + "learning_rate": 3.2747098839535816e-05, + "loss": 0.6008, + "step": 8634 + }, + { + "epoch": 11.0528, + "grad_norm": 0.913502037525177, + "learning_rate": 3.274509803921569e-05, + "loss": 0.6139, + "step": 8635 + }, + { + "epoch": 11.05408, + "grad_norm": 0.8988363146781921, + "learning_rate": 3.274309723889556e-05, + "loss": 0.5735, + "step": 8636 + }, + { + "epoch": 11.05536, + "grad_norm": 0.9248592257499695, + "learning_rate": 3.274109643857543e-05, + "loss": 0.6024, + "step": 8637 + }, + { + "epoch": 11.05664, + "grad_norm": 0.8595258593559265, + "learning_rate": 3.2739095638255303e-05, + "loss": 0.5686, + "step": 8638 + }, + { + "epoch": 11.05792, + "grad_norm": 0.917407751083374, + "learning_rate": 3.2737094837935175e-05, + "loss": 0.5686, + "step": 8639 + }, + { + "epoch": 11.0592, + "grad_norm": 0.8839895725250244, + "learning_rate": 3.273509403761505e-05, + "loss": 0.5334, + "step": 8640 + }, + { + "epoch": 11.06048, + "grad_norm": 0.8676347136497498, + "learning_rate": 3.273309323729492e-05, + "loss": 0.5396, + "step": 8641 + }, + { + "epoch": 11.06176, + "grad_norm": 0.880967915058136, + "learning_rate": 3.273109243697479e-05, + "loss": 0.514, + "step": 8642 + }, + { + "epoch": 11.06304, + 
"grad_norm": 0.9367587566375732, + "learning_rate": 3.272909163665466e-05, + "loss": 0.5731, + "step": 8643 + }, + { + "epoch": 11.06432, + "grad_norm": 0.8796616196632385, + "learning_rate": 3.2727090836334535e-05, + "loss": 0.5887, + "step": 8644 + }, + { + "epoch": 11.0656, + "grad_norm": 0.8505174517631531, + "learning_rate": 3.2725090036014406e-05, + "loss": 0.5534, + "step": 8645 + }, + { + "epoch": 11.06688, + "grad_norm": 0.8879563808441162, + "learning_rate": 3.2723089235694285e-05, + "loss": 0.6019, + "step": 8646 + }, + { + "epoch": 11.06816, + "grad_norm": 0.876809298992157, + "learning_rate": 3.272108843537415e-05, + "loss": 0.5354, + "step": 8647 + }, + { + "epoch": 11.06944, + "grad_norm": 0.8911082148551941, + "learning_rate": 3.271908763505402e-05, + "loss": 0.5154, + "step": 8648 + }, + { + "epoch": 11.07072, + "grad_norm": 0.9009726643562317, + "learning_rate": 3.2717086834733894e-05, + "loss": 0.5128, + "step": 8649 + }, + { + "epoch": 11.072, + "grad_norm": 0.8957163691520691, + "learning_rate": 3.2715086034413766e-05, + "loss": 0.5322, + "step": 8650 + }, + { + "epoch": 11.07328, + "grad_norm": 0.866563618183136, + "learning_rate": 3.271308523409364e-05, + "loss": 0.5464, + "step": 8651 + }, + { + "epoch": 11.07456, + "grad_norm": 0.9009678959846497, + "learning_rate": 3.271108443377351e-05, + "loss": 0.536, + "step": 8652 + }, + { + "epoch": 11.07584, + "grad_norm": 0.9172965288162231, + "learning_rate": 3.270908363345339e-05, + "loss": 0.5667, + "step": 8653 + }, + { + "epoch": 11.07712, + "grad_norm": 0.9287314414978027, + "learning_rate": 3.270708283313326e-05, + "loss": 0.5507, + "step": 8654 + }, + { + "epoch": 11.0784, + "grad_norm": 0.9205261468887329, + "learning_rate": 3.2705082032813125e-05, + "loss": 0.5491, + "step": 8655 + }, + { + "epoch": 11.07968, + "grad_norm": 0.8569768667221069, + "learning_rate": 3.2703081232493e-05, + "loss": 0.5158, + "step": 8656 + }, + { + "epoch": 11.08096, + "grad_norm": 0.9120553731918335, + 
"learning_rate": 3.270108043217287e-05, + "loss": 0.5894, + "step": 8657 + }, + { + "epoch": 11.08224, + "grad_norm": 0.9037409424781799, + "learning_rate": 3.269907963185274e-05, + "loss": 0.5456, + "step": 8658 + }, + { + "epoch": 11.08352, + "grad_norm": 0.9417985081672668, + "learning_rate": 3.269707883153261e-05, + "loss": 0.5491, + "step": 8659 + }, + { + "epoch": 11.0848, + "grad_norm": 0.8706283569335938, + "learning_rate": 3.2695078031212484e-05, + "loss": 0.5391, + "step": 8660 + }, + { + "epoch": 11.08608, + "grad_norm": 0.8442224860191345, + "learning_rate": 3.269307723089236e-05, + "loss": 0.5301, + "step": 8661 + }, + { + "epoch": 11.08736, + "grad_norm": 0.8892581462860107, + "learning_rate": 3.2691076430572235e-05, + "loss": 0.5239, + "step": 8662 + }, + { + "epoch": 11.08864, + "grad_norm": 0.8483731746673584, + "learning_rate": 3.26890756302521e-05, + "loss": 0.5079, + "step": 8663 + }, + { + "epoch": 11.08992, + "grad_norm": 0.8920766711235046, + "learning_rate": 3.268707482993197e-05, + "loss": 0.5682, + "step": 8664 + }, + { + "epoch": 11.0912, + "grad_norm": 0.8351317048072815, + "learning_rate": 3.2685074029611844e-05, + "loss": 0.5189, + "step": 8665 + }, + { + "epoch": 11.09248, + "grad_norm": 0.9149410724639893, + "learning_rate": 3.2683073229291715e-05, + "loss": 0.5959, + "step": 8666 + }, + { + "epoch": 11.09376, + "grad_norm": 0.8564448356628418, + "learning_rate": 3.268107242897159e-05, + "loss": 0.5124, + "step": 8667 + }, + { + "epoch": 11.09504, + "grad_norm": 0.8573926687240601, + "learning_rate": 3.2679071628651466e-05, + "loss": 0.5366, + "step": 8668 + }, + { + "epoch": 11.09632, + "grad_norm": 0.8379302620887756, + "learning_rate": 3.267707082833134e-05, + "loss": 0.5218, + "step": 8669 + }, + { + "epoch": 11.0976, + "grad_norm": 0.8685084581375122, + "learning_rate": 3.267507002801121e-05, + "loss": 0.5322, + "step": 8670 + }, + { + "epoch": 11.09888, + "grad_norm": 0.8932461738586426, + "learning_rate": 
3.2673069227691075e-05, + "loss": 0.59, + "step": 8671 + }, + { + "epoch": 11.10016, + "grad_norm": 0.8610022664070129, + "learning_rate": 3.267106842737095e-05, + "loss": 0.5244, + "step": 8672 + }, + { + "epoch": 11.10144, + "grad_norm": 0.8831630349159241, + "learning_rate": 3.266906762705082e-05, + "loss": 0.609, + "step": 8673 + }, + { + "epoch": 11.10272, + "grad_norm": 0.899043619632721, + "learning_rate": 3.266706682673069e-05, + "loss": 0.552, + "step": 8674 + }, + { + "epoch": 11.104, + "grad_norm": 0.9248834848403931, + "learning_rate": 3.266506602641057e-05, + "loss": 0.5851, + "step": 8675 + }, + { + "epoch": 11.10528, + "grad_norm": 0.8615071177482605, + "learning_rate": 3.266306522609044e-05, + "loss": 0.5664, + "step": 8676 + }, + { + "epoch": 11.10656, + "grad_norm": 0.882163405418396, + "learning_rate": 3.266106442577031e-05, + "loss": 0.5823, + "step": 8677 + }, + { + "epoch": 11.10784, + "grad_norm": 0.8114548921585083, + "learning_rate": 3.2659063625450185e-05, + "loss": 0.5001, + "step": 8678 + }, + { + "epoch": 11.10912, + "grad_norm": 0.9176467657089233, + "learning_rate": 3.265706282513005e-05, + "loss": 0.566, + "step": 8679 + }, + { + "epoch": 11.1104, + "grad_norm": 0.867563009262085, + "learning_rate": 3.265506202480992e-05, + "loss": 0.5233, + "step": 8680 + }, + { + "epoch": 11.11168, + "grad_norm": 0.8672628402709961, + "learning_rate": 3.265306122448979e-05, + "loss": 0.5412, + "step": 8681 + }, + { + "epoch": 11.11296, + "grad_norm": 0.9063531160354614, + "learning_rate": 3.265106042416967e-05, + "loss": 0.5753, + "step": 8682 + }, + { + "epoch": 11.11424, + "grad_norm": 0.86628258228302, + "learning_rate": 3.2649059623849544e-05, + "loss": 0.5739, + "step": 8683 + }, + { + "epoch": 11.11552, + "grad_norm": 0.8533140420913696, + "learning_rate": 3.2647058823529416e-05, + "loss": 0.555, + "step": 8684 + }, + { + "epoch": 11.1168, + "grad_norm": 0.8930099606513977, + "learning_rate": 3.264505802320929e-05, + "loss": 0.5193, + "step": 
8685 + }, + { + "epoch": 11.11808, + "grad_norm": 0.8562926650047302, + "learning_rate": 3.264305722288916e-05, + "loss": 0.5209, + "step": 8686 + }, + { + "epoch": 11.11936, + "grad_norm": 0.8850371241569519, + "learning_rate": 3.2641056422569024e-05, + "loss": 0.5525, + "step": 8687 + }, + { + "epoch": 11.12064, + "grad_norm": 0.932697594165802, + "learning_rate": 3.2639055622248896e-05, + "loss": 0.5613, + "step": 8688 + }, + { + "epoch": 11.12192, + "grad_norm": 0.8724458813667297, + "learning_rate": 3.2637054821928775e-05, + "loss": 0.5146, + "step": 8689 + }, + { + "epoch": 11.1232, + "grad_norm": 0.8727782368659973, + "learning_rate": 3.263505402160865e-05, + "loss": 0.5339, + "step": 8690 + }, + { + "epoch": 11.12448, + "grad_norm": 0.916933536529541, + "learning_rate": 3.263305322128852e-05, + "loss": 0.5998, + "step": 8691 + }, + { + "epoch": 11.12576, + "grad_norm": 0.875407338142395, + "learning_rate": 3.263105242096839e-05, + "loss": 0.5418, + "step": 8692 + }, + { + "epoch": 11.12704, + "grad_norm": 0.8921369910240173, + "learning_rate": 3.262905162064826e-05, + "loss": 0.5805, + "step": 8693 + }, + { + "epoch": 11.12832, + "grad_norm": 0.9058687090873718, + "learning_rate": 3.2627050820328134e-05, + "loss": 0.5751, + "step": 8694 + }, + { + "epoch": 11.1296, + "grad_norm": 0.9492460489273071, + "learning_rate": 3.2625050020008e-05, + "loss": 0.5572, + "step": 8695 + }, + { + "epoch": 11.13088, + "grad_norm": 0.9078418016433716, + "learning_rate": 3.262304921968788e-05, + "loss": 0.5816, + "step": 8696 + }, + { + "epoch": 11.13216, + "grad_norm": 0.8583430051803589, + "learning_rate": 3.262104841936775e-05, + "loss": 0.5219, + "step": 8697 + }, + { + "epoch": 11.13344, + "grad_norm": 0.9013339281082153, + "learning_rate": 3.261904761904762e-05, + "loss": 0.5521, + "step": 8698 + }, + { + "epoch": 11.13472, + "grad_norm": 0.9228495955467224, + "learning_rate": 3.2617046818727494e-05, + "loss": 0.6093, + "step": 8699 + }, + { + "epoch": 11.136, + 
"grad_norm": 0.895261287689209, + "learning_rate": 3.2615046018407365e-05, + "loss": 0.5856, + "step": 8700 + }, + { + "epoch": 11.13728, + "grad_norm": 0.9044747948646545, + "learning_rate": 3.261304521808724e-05, + "loss": 0.5621, + "step": 8701 + }, + { + "epoch": 11.13856, + "grad_norm": 0.9378688335418701, + "learning_rate": 3.261104441776711e-05, + "loss": 0.6047, + "step": 8702 + }, + { + "epoch": 11.13984, + "grad_norm": 0.941558301448822, + "learning_rate": 3.260904361744698e-05, + "loss": 0.5467, + "step": 8703 + }, + { + "epoch": 11.14112, + "grad_norm": 0.9449127912521362, + "learning_rate": 3.260704281712685e-05, + "loss": 0.5336, + "step": 8704 + }, + { + "epoch": 11.1424, + "grad_norm": 0.8693781495094299, + "learning_rate": 3.2605042016806725e-05, + "loss": 0.522, + "step": 8705 + }, + { + "epoch": 11.14368, + "grad_norm": 0.8828169107437134, + "learning_rate": 3.2603041216486597e-05, + "loss": 0.557, + "step": 8706 + }, + { + "epoch": 11.14496, + "grad_norm": 0.9226519465446472, + "learning_rate": 3.260104041616647e-05, + "loss": 0.5246, + "step": 8707 + }, + { + "epoch": 11.14624, + "grad_norm": 0.9169709086418152, + "learning_rate": 3.259903961584634e-05, + "loss": 0.58, + "step": 8708 + }, + { + "epoch": 11.14752, + "grad_norm": 0.92818683385849, + "learning_rate": 3.259703881552621e-05, + "loss": 0.5937, + "step": 8709 + }, + { + "epoch": 11.1488, + "grad_norm": 0.9431915879249573, + "learning_rate": 3.2595038015206084e-05, + "loss": 0.5702, + "step": 8710 + }, + { + "epoch": 11.150079999999999, + "grad_norm": 0.8843738436698914, + "learning_rate": 3.2593037214885956e-05, + "loss": 0.5305, + "step": 8711 + }, + { + "epoch": 11.15136, + "grad_norm": 0.8925315737724304, + "learning_rate": 3.259103641456583e-05, + "loss": 0.5114, + "step": 8712 + }, + { + "epoch": 11.15264, + "grad_norm": 0.8767265677452087, + "learning_rate": 3.25890356142457e-05, + "loss": 0.5199, + "step": 8713 + }, + { + "epoch": 11.15392, + "grad_norm": 0.8850144743919373, + 
"learning_rate": 3.258703481392557e-05, + "loss": 0.5388, + "step": 8714 + }, + { + "epoch": 11.1552, + "grad_norm": 0.8759915232658386, + "learning_rate": 3.258503401360544e-05, + "loss": 0.5628, + "step": 8715 + }, + { + "epoch": 11.15648, + "grad_norm": 0.8618879318237305, + "learning_rate": 3.2583033213285315e-05, + "loss": 0.5094, + "step": 8716 + }, + { + "epoch": 11.15776, + "grad_norm": 0.9156239628791809, + "learning_rate": 3.258103241296519e-05, + "loss": 0.562, + "step": 8717 + }, + { + "epoch": 11.15904, + "grad_norm": 0.8669613599777222, + "learning_rate": 3.257903161264506e-05, + "loss": 0.5478, + "step": 8718 + }, + { + "epoch": 11.16032, + "grad_norm": 0.9046533107757568, + "learning_rate": 3.257703081232493e-05, + "loss": 0.6049, + "step": 8719 + }, + { + "epoch": 11.1616, + "grad_norm": 0.9057307243347168, + "learning_rate": 3.25750300120048e-05, + "loss": 0.5175, + "step": 8720 + }, + { + "epoch": 11.16288, + "grad_norm": 0.8702899813652039, + "learning_rate": 3.2573029211684674e-05, + "loss": 0.5061, + "step": 8721 + }, + { + "epoch": 11.16416, + "grad_norm": 0.8357275724411011, + "learning_rate": 3.2571028411364546e-05, + "loss": 0.5112, + "step": 8722 + }, + { + "epoch": 11.16544, + "grad_norm": 0.9093285799026489, + "learning_rate": 3.256902761104442e-05, + "loss": 0.5853, + "step": 8723 + }, + { + "epoch": 11.16672, + "grad_norm": 0.8935590386390686, + "learning_rate": 3.25670268107243e-05, + "loss": 0.5726, + "step": 8724 + }, + { + "epoch": 11.168, + "grad_norm": 0.8492559194564819, + "learning_rate": 3.256502601040416e-05, + "loss": 0.522, + "step": 8725 + }, + { + "epoch": 11.16928, + "grad_norm": 0.8798608779907227, + "learning_rate": 3.2563025210084034e-05, + "loss": 0.5661, + "step": 8726 + }, + { + "epoch": 11.17056, + "grad_norm": 0.8920543193817139, + "learning_rate": 3.2561024409763906e-05, + "loss": 0.5535, + "step": 8727 + }, + { + "epoch": 11.17184, + "grad_norm": 0.8616754412651062, + "learning_rate": 3.255902360944378e-05, + 
"loss": 0.5294, + "step": 8728 + }, + { + "epoch": 11.17312, + "grad_norm": 0.8341376185417175, + "learning_rate": 3.255702280912365e-05, + "loss": 0.4899, + "step": 8729 + }, + { + "epoch": 11.1744, + "grad_norm": 0.8843587636947632, + "learning_rate": 3.255502200880352e-05, + "loss": 0.5537, + "step": 8730 + }, + { + "epoch": 11.17568, + "grad_norm": 0.8609508872032166, + "learning_rate": 3.25530212084834e-05, + "loss": 0.5028, + "step": 8731 + }, + { + "epoch": 11.17696, + "grad_norm": 0.8849518299102783, + "learning_rate": 3.255102040816327e-05, + "loss": 0.5173, + "step": 8732 + }, + { + "epoch": 11.17824, + "grad_norm": 0.874070942401886, + "learning_rate": 3.254901960784314e-05, + "loss": 0.5143, + "step": 8733 + }, + { + "epoch": 11.17952, + "grad_norm": 0.9157163500785828, + "learning_rate": 3.254701880752301e-05, + "loss": 0.5429, + "step": 8734 + }, + { + "epoch": 11.1808, + "grad_norm": 0.90493243932724, + "learning_rate": 3.254501800720288e-05, + "loss": 0.5794, + "step": 8735 + }, + { + "epoch": 11.18208, + "grad_norm": 0.9423990249633789, + "learning_rate": 3.254301720688275e-05, + "loss": 0.5889, + "step": 8736 + }, + { + "epoch": 11.18336, + "grad_norm": 0.950400710105896, + "learning_rate": 3.2541016406562624e-05, + "loss": 0.5909, + "step": 8737 + }, + { + "epoch": 11.18464, + "grad_norm": 0.946723997592926, + "learning_rate": 3.25390156062425e-05, + "loss": 0.5764, + "step": 8738 + }, + { + "epoch": 11.18592, + "grad_norm": 0.9119811058044434, + "learning_rate": 3.2537014805922375e-05, + "loss": 0.5859, + "step": 8739 + }, + { + "epoch": 11.1872, + "grad_norm": 0.8999208807945251, + "learning_rate": 3.2535014005602246e-05, + "loss": 0.5725, + "step": 8740 + }, + { + "epoch": 11.18848, + "grad_norm": 0.8947361707687378, + "learning_rate": 3.253301320528211e-05, + "loss": 0.5432, + "step": 8741 + }, + { + "epoch": 11.18976, + "grad_norm": 0.9105396270751953, + "learning_rate": 3.253101240496198e-05, + "loss": 0.5896, + "step": 8742 + }, + { + 
"epoch": 11.19104, + "grad_norm": 0.9032384753227234, + "learning_rate": 3.2529011604641855e-05, + "loss": 0.5516, + "step": 8743 + }, + { + "epoch": 11.19232, + "grad_norm": 0.966087281703949, + "learning_rate": 3.252701080432173e-05, + "loss": 0.5906, + "step": 8744 + }, + { + "epoch": 11.1936, + "grad_norm": 0.9482411742210388, + "learning_rate": 3.2525010004001606e-05, + "loss": 0.597, + "step": 8745 + }, + { + "epoch": 11.19488, + "grad_norm": 0.8811086416244507, + "learning_rate": 3.252300920368148e-05, + "loss": 0.54, + "step": 8746 + }, + { + "epoch": 11.19616, + "grad_norm": 0.8933411240577698, + "learning_rate": 3.252100840336135e-05, + "loss": 0.5465, + "step": 8747 + }, + { + "epoch": 11.19744, + "grad_norm": 0.8633106350898743, + "learning_rate": 3.251900760304122e-05, + "loss": 0.5178, + "step": 8748 + }, + { + "epoch": 11.19872, + "grad_norm": 0.8998020887374878, + "learning_rate": 3.2517006802721086e-05, + "loss": 0.5437, + "step": 8749 + }, + { + "epoch": 11.2, + "grad_norm": 0.9244217276573181, + "learning_rate": 3.251500600240096e-05, + "loss": 0.6415, + "step": 8750 + }, + { + "epoch": 11.20128, + "grad_norm": 0.9325177073478699, + "learning_rate": 3.251300520208083e-05, + "loss": 0.6122, + "step": 8751 + }, + { + "epoch": 11.20256, + "grad_norm": 0.9503816962242126, + "learning_rate": 3.251100440176071e-05, + "loss": 0.5848, + "step": 8752 + }, + { + "epoch": 11.20384, + "grad_norm": 0.9142863154411316, + "learning_rate": 3.250900360144058e-05, + "loss": 0.5554, + "step": 8753 + }, + { + "epoch": 11.20512, + "grad_norm": 0.8959869742393494, + "learning_rate": 3.250700280112045e-05, + "loss": 0.6131, + "step": 8754 + }, + { + "epoch": 11.2064, + "grad_norm": 0.9044204354286194, + "learning_rate": 3.2505002000800324e-05, + "loss": 0.5885, + "step": 8755 + }, + { + "epoch": 11.20768, + "grad_norm": 0.9072441458702087, + "learning_rate": 3.2503001200480196e-05, + "loss": 0.5396, + "step": 8756 + }, + { + "epoch": 11.20896, + "grad_norm": 
0.8490614295005798, + "learning_rate": 3.250100040016006e-05, + "loss": 0.5353, + "step": 8757 + }, + { + "epoch": 11.21024, + "grad_norm": 0.9115065336227417, + "learning_rate": 3.249899959983993e-05, + "loss": 0.5137, + "step": 8758 + }, + { + "epoch": 11.21152, + "grad_norm": 0.8661530017852783, + "learning_rate": 3.249699879951981e-05, + "loss": 0.4879, + "step": 8759 + }, + { + "epoch": 11.2128, + "grad_norm": 0.8535701036453247, + "learning_rate": 3.2494997999199684e-05, + "loss": 0.5335, + "step": 8760 + }, + { + "epoch": 11.21408, + "grad_norm": 0.9392260313034058, + "learning_rate": 3.2492997198879555e-05, + "loss": 0.5892, + "step": 8761 + }, + { + "epoch": 11.21536, + "grad_norm": 0.861599862575531, + "learning_rate": 3.249099639855943e-05, + "loss": 0.5172, + "step": 8762 + }, + { + "epoch": 11.21664, + "grad_norm": 0.8953681588172913, + "learning_rate": 3.24889955982393e-05, + "loss": 0.5388, + "step": 8763 + }, + { + "epoch": 11.21792, + "grad_norm": 0.9016774892807007, + "learning_rate": 3.248699479791917e-05, + "loss": 0.5661, + "step": 8764 + }, + { + "epoch": 11.2192, + "grad_norm": 0.8995947241783142, + "learning_rate": 3.2484993997599036e-05, + "loss": 0.5247, + "step": 8765 + }, + { + "epoch": 11.22048, + "grad_norm": 0.9032994508743286, + "learning_rate": 3.2482993197278915e-05, + "loss": 0.5577, + "step": 8766 + }, + { + "epoch": 11.22176, + "grad_norm": 0.934908390045166, + "learning_rate": 3.2480992396958787e-05, + "loss": 0.6034, + "step": 8767 + }, + { + "epoch": 11.22304, + "grad_norm": 0.8706194758415222, + "learning_rate": 3.247899159663866e-05, + "loss": 0.5512, + "step": 8768 + }, + { + "epoch": 11.22432, + "grad_norm": 0.8422589302062988, + "learning_rate": 3.247699079631853e-05, + "loss": 0.5348, + "step": 8769 + }, + { + "epoch": 11.2256, + "grad_norm": 0.8409394025802612, + "learning_rate": 3.24749899959984e-05, + "loss": 0.545, + "step": 8770 + }, + { + "epoch": 11.22688, + "grad_norm": 0.9218395948410034, + "learning_rate": 
3.2472989195678274e-05, + "loss": 0.6042, + "step": 8771 + }, + { + "epoch": 11.22816, + "grad_norm": 0.8771111965179443, + "learning_rate": 3.2470988395358146e-05, + "loss": 0.5602, + "step": 8772 + }, + { + "epoch": 11.22944, + "grad_norm": 0.8485012054443359, + "learning_rate": 3.246898759503801e-05, + "loss": 0.5159, + "step": 8773 + }, + { + "epoch": 11.23072, + "grad_norm": 0.9108744263648987, + "learning_rate": 3.246698679471789e-05, + "loss": 0.5771, + "step": 8774 + }, + { + "epoch": 11.232, + "grad_norm": 0.8793061971664429, + "learning_rate": 3.246498599439776e-05, + "loss": 0.5697, + "step": 8775 + }, + { + "epoch": 11.23328, + "grad_norm": 0.8831557631492615, + "learning_rate": 3.246298519407763e-05, + "loss": 0.5513, + "step": 8776 + }, + { + "epoch": 11.23456, + "grad_norm": 0.9184591770172119, + "learning_rate": 3.2460984393757505e-05, + "loss": 0.5967, + "step": 8777 + }, + { + "epoch": 11.23584, + "grad_norm": 0.9362067580223083, + "learning_rate": 3.245898359343738e-05, + "loss": 0.5762, + "step": 8778 + }, + { + "epoch": 11.23712, + "grad_norm": 0.9164170026779175, + "learning_rate": 3.245698279311725e-05, + "loss": 0.5353, + "step": 8779 + }, + { + "epoch": 11.2384, + "grad_norm": 0.878761351108551, + "learning_rate": 3.245498199279712e-05, + "loss": 0.5491, + "step": 8780 + }, + { + "epoch": 11.23968, + "grad_norm": 0.828332245349884, + "learning_rate": 3.245298119247699e-05, + "loss": 0.505, + "step": 8781 + }, + { + "epoch": 11.24096, + "grad_norm": 0.9329909682273865, + "learning_rate": 3.2450980392156864e-05, + "loss": 0.5247, + "step": 8782 + }, + { + "epoch": 11.24224, + "grad_norm": 0.9171594977378845, + "learning_rate": 3.2448979591836736e-05, + "loss": 0.5647, + "step": 8783 + }, + { + "epoch": 11.24352, + "grad_norm": 0.829479455947876, + "learning_rate": 3.244697879151661e-05, + "loss": 0.4752, + "step": 8784 + }, + { + "epoch": 11.2448, + "grad_norm": 0.8708236813545227, + "learning_rate": 3.244497799119648e-05, + "loss": 0.5795, + 
"step": 8785 + }, + { + "epoch": 11.24608, + "grad_norm": 0.8896415829658508, + "learning_rate": 3.244297719087635e-05, + "loss": 0.5318, + "step": 8786 + }, + { + "epoch": 11.24736, + "grad_norm": 0.9375750422477722, + "learning_rate": 3.2440976390556224e-05, + "loss": 0.612, + "step": 8787 + }, + { + "epoch": 11.24864, + "grad_norm": 0.9271597862243652, + "learning_rate": 3.2438975590236096e-05, + "loss": 0.5192, + "step": 8788 + }, + { + "epoch": 11.24992, + "grad_norm": 0.894335925579071, + "learning_rate": 3.243697478991597e-05, + "loss": 0.6151, + "step": 8789 + }, + { + "epoch": 11.2512, + "grad_norm": 0.9000656008720398, + "learning_rate": 3.243497398959584e-05, + "loss": 0.5774, + "step": 8790 + }, + { + "epoch": 11.25248, + "grad_norm": 0.8798941969871521, + "learning_rate": 3.243297318927571e-05, + "loss": 0.5594, + "step": 8791 + }, + { + "epoch": 11.25376, + "grad_norm": 0.8531872034072876, + "learning_rate": 3.243097238895558e-05, + "loss": 0.5439, + "step": 8792 + }, + { + "epoch": 11.25504, + "grad_norm": 0.9348339438438416, + "learning_rate": 3.2428971588635455e-05, + "loss": 0.5349, + "step": 8793 + }, + { + "epoch": 11.25632, + "grad_norm": 0.8829602003097534, + "learning_rate": 3.242697078831533e-05, + "loss": 0.5285, + "step": 8794 + }, + { + "epoch": 11.2576, + "grad_norm": 0.9161067605018616, + "learning_rate": 3.24249699879952e-05, + "loss": 0.5868, + "step": 8795 + }, + { + "epoch": 11.25888, + "grad_norm": 0.8983628153800964, + "learning_rate": 3.242296918767507e-05, + "loss": 0.5213, + "step": 8796 + }, + { + "epoch": 11.26016, + "grad_norm": 0.9068379998207092, + "learning_rate": 3.242096838735494e-05, + "loss": 0.5583, + "step": 8797 + }, + { + "epoch": 11.26144, + "grad_norm": 0.9481262564659119, + "learning_rate": 3.2418967587034814e-05, + "loss": 0.5554, + "step": 8798 + }, + { + "epoch": 11.26272, + "grad_norm": 0.8567753434181213, + "learning_rate": 3.2416966786714686e-05, + "loss": 0.4905, + "step": 8799 + }, + { + "epoch": 
11.264, + "grad_norm": 0.9208746552467346, + "learning_rate": 3.241496598639456e-05, + "loss": 0.5791, + "step": 8800 + }, + { + "epoch": 11.26528, + "grad_norm": 0.9474934935569763, + "learning_rate": 3.241296518607443e-05, + "loss": 0.5849, + "step": 8801 + }, + { + "epoch": 11.26656, + "grad_norm": 0.9702956676483154, + "learning_rate": 3.241096438575431e-05, + "loss": 0.62, + "step": 8802 + }, + { + "epoch": 11.26784, + "grad_norm": 0.8868454694747925, + "learning_rate": 3.2408963585434173e-05, + "loss": 0.5779, + "step": 8803 + }, + { + "epoch": 11.269120000000001, + "grad_norm": 0.9131936430931091, + "learning_rate": 3.2406962785114045e-05, + "loss": 0.5787, + "step": 8804 + }, + { + "epoch": 11.2704, + "grad_norm": 0.8842905163764954, + "learning_rate": 3.240496198479392e-05, + "loss": 0.5514, + "step": 8805 + }, + { + "epoch": 11.27168, + "grad_norm": 0.8582538366317749, + "learning_rate": 3.240296118447379e-05, + "loss": 0.5355, + "step": 8806 + }, + { + "epoch": 11.27296, + "grad_norm": 0.8297027945518494, + "learning_rate": 3.240096038415366e-05, + "loss": 0.5481, + "step": 8807 + }, + { + "epoch": 11.27424, + "grad_norm": 0.9267979264259338, + "learning_rate": 3.239895958383353e-05, + "loss": 0.5676, + "step": 8808 + }, + { + "epoch": 11.27552, + "grad_norm": 0.9307857155799866, + "learning_rate": 3.239695878351341e-05, + "loss": 0.5526, + "step": 8809 + }, + { + "epoch": 11.2768, + "grad_norm": 0.9614000916481018, + "learning_rate": 3.239495798319328e-05, + "loss": 0.583, + "step": 8810 + }, + { + "epoch": 11.27808, + "grad_norm": 0.8482030630111694, + "learning_rate": 3.239295718287315e-05, + "loss": 0.5473, + "step": 8811 + }, + { + "epoch": 11.27936, + "grad_norm": 0.8771170377731323, + "learning_rate": 3.239095638255302e-05, + "loss": 0.5808, + "step": 8812 + }, + { + "epoch": 11.28064, + "grad_norm": 0.8574709296226501, + "learning_rate": 3.238895558223289e-05, + "loss": 0.55, + "step": 8813 + }, + { + "epoch": 11.28192, + "grad_norm": 
0.8876118659973145, + "learning_rate": 3.2386954781912764e-05, + "loss": 0.5451, + "step": 8814 + }, + { + "epoch": 11.2832, + "grad_norm": 0.874653697013855, + "learning_rate": 3.2384953981592636e-05, + "loss": 0.5076, + "step": 8815 + }, + { + "epoch": 11.28448, + "grad_norm": 0.9255512356758118, + "learning_rate": 3.2382953181272514e-05, + "loss": 0.5806, + "step": 8816 + }, + { + "epoch": 11.28576, + "grad_norm": 0.9429183602333069, + "learning_rate": 3.2380952380952386e-05, + "loss": 0.5884, + "step": 8817 + }, + { + "epoch": 11.28704, + "grad_norm": 0.9244290590286255, + "learning_rate": 3.237895158063226e-05, + "loss": 0.5938, + "step": 8818 + }, + { + "epoch": 11.28832, + "grad_norm": 0.8580917716026306, + "learning_rate": 3.237695078031212e-05, + "loss": 0.55, + "step": 8819 + }, + { + "epoch": 11.2896, + "grad_norm": 0.9350975155830383, + "learning_rate": 3.2374949979991995e-05, + "loss": 0.5704, + "step": 8820 + }, + { + "epoch": 11.29088, + "grad_norm": 0.913230836391449, + "learning_rate": 3.237294917967187e-05, + "loss": 0.5657, + "step": 8821 + }, + { + "epoch": 11.292159999999999, + "grad_norm": 0.8917736411094666, + "learning_rate": 3.237094837935174e-05, + "loss": 0.5393, + "step": 8822 + }, + { + "epoch": 11.29344, + "grad_norm": 0.8485799431800842, + "learning_rate": 3.236894757903162e-05, + "loss": 0.5264, + "step": 8823 + }, + { + "epoch": 11.29472, + "grad_norm": 0.8522817492485046, + "learning_rate": 3.236694677871149e-05, + "loss": 0.4902, + "step": 8824 + }, + { + "epoch": 11.296, + "grad_norm": 0.8695195317268372, + "learning_rate": 3.236494597839136e-05, + "loss": 0.5453, + "step": 8825 + }, + { + "epoch": 11.29728, + "grad_norm": 0.9237691164016724, + "learning_rate": 3.236294517807123e-05, + "loss": 0.5709, + "step": 8826 + }, + { + "epoch": 11.29856, + "grad_norm": 0.9328168034553528, + "learning_rate": 3.23609443777511e-05, + "loss": 0.5593, + "step": 8827 + }, + { + "epoch": 11.29984, + "grad_norm": 0.8827159404754639, + 
"learning_rate": 3.235894357743097e-05, + "loss": 0.528, + "step": 8828 + }, + { + "epoch": 11.30112, + "grad_norm": 0.9112002849578857, + "learning_rate": 3.235694277711084e-05, + "loss": 0.557, + "step": 8829 + }, + { + "epoch": 11.3024, + "grad_norm": 0.8474133610725403, + "learning_rate": 3.235494197679072e-05, + "loss": 0.5448, + "step": 8830 + }, + { + "epoch": 11.30368, + "grad_norm": 0.9159875512123108, + "learning_rate": 3.235294117647059e-05, + "loss": 0.5638, + "step": 8831 + }, + { + "epoch": 11.30496, + "grad_norm": 0.8455161452293396, + "learning_rate": 3.2350940376150464e-05, + "loss": 0.5317, + "step": 8832 + }, + { + "epoch": 11.30624, + "grad_norm": 0.8944229483604431, + "learning_rate": 3.2348939575830336e-05, + "loss": 0.5455, + "step": 8833 + }, + { + "epoch": 11.30752, + "grad_norm": 0.9140881896018982, + "learning_rate": 3.234693877551021e-05, + "loss": 0.565, + "step": 8834 + }, + { + "epoch": 11.3088, + "grad_norm": 0.9187982082366943, + "learning_rate": 3.234493797519007e-05, + "loss": 0.5593, + "step": 8835 + }, + { + "epoch": 11.31008, + "grad_norm": 0.9055234789848328, + "learning_rate": 3.2342937174869945e-05, + "loss": 0.5166, + "step": 8836 + }, + { + "epoch": 11.31136, + "grad_norm": 0.9232755303382874, + "learning_rate": 3.234093637454982e-05, + "loss": 0.5993, + "step": 8837 + }, + { + "epoch": 11.31264, + "grad_norm": 0.8765431046485901, + "learning_rate": 3.2338935574229695e-05, + "loss": 0.5717, + "step": 8838 + }, + { + "epoch": 11.31392, + "grad_norm": 0.848947286605835, + "learning_rate": 3.233693477390957e-05, + "loss": 0.528, + "step": 8839 + }, + { + "epoch": 11.3152, + "grad_norm": 0.9066827297210693, + "learning_rate": 3.233493397358944e-05, + "loss": 0.5882, + "step": 8840 + }, + { + "epoch": 11.31648, + "grad_norm": 0.9069925546646118, + "learning_rate": 3.233293317326931e-05, + "loss": 0.5502, + "step": 8841 + }, + { + "epoch": 11.31776, + "grad_norm": 0.9095439314842224, + "learning_rate": 3.233093237294918e-05, + 
"loss": 0.5638, + "step": 8842 + }, + { + "epoch": 11.31904, + "grad_norm": 0.9278164505958557, + "learning_rate": 3.232893157262905e-05, + "loss": 0.5834, + "step": 8843 + }, + { + "epoch": 11.32032, + "grad_norm": 0.8815664052963257, + "learning_rate": 3.2326930772308926e-05, + "loss": 0.5516, + "step": 8844 + }, + { + "epoch": 11.3216, + "grad_norm": 0.8748176097869873, + "learning_rate": 3.23249299719888e-05, + "loss": 0.5715, + "step": 8845 + }, + { + "epoch": 11.32288, + "grad_norm": 0.8980875015258789, + "learning_rate": 3.232292917166867e-05, + "loss": 0.5218, + "step": 8846 + }, + { + "epoch": 11.32416, + "grad_norm": 0.9095767736434937, + "learning_rate": 3.232092837134854e-05, + "loss": 0.5353, + "step": 8847 + }, + { + "epoch": 11.32544, + "grad_norm": 0.9545304775238037, + "learning_rate": 3.2318927571028414e-05, + "loss": 0.5816, + "step": 8848 + }, + { + "epoch": 11.32672, + "grad_norm": 0.8944869637489319, + "learning_rate": 3.2316926770708286e-05, + "loss": 0.5624, + "step": 8849 + }, + { + "epoch": 11.328, + "grad_norm": 0.891927182674408, + "learning_rate": 3.231492597038816e-05, + "loss": 0.523, + "step": 8850 + }, + { + "epoch": 11.32928, + "grad_norm": 0.9130933284759521, + "learning_rate": 3.231292517006803e-05, + "loss": 0.6285, + "step": 8851 + }, + { + "epoch": 11.33056, + "grad_norm": 0.8824868202209473, + "learning_rate": 3.23109243697479e-05, + "loss": 0.5385, + "step": 8852 + }, + { + "epoch": 11.33184, + "grad_norm": 0.8892396688461304, + "learning_rate": 3.230892356942777e-05, + "loss": 0.5261, + "step": 8853 + }, + { + "epoch": 11.33312, + "grad_norm": 0.9578300714492798, + "learning_rate": 3.2306922769107645e-05, + "loss": 0.635, + "step": 8854 + }, + { + "epoch": 11.3344, + "grad_norm": 0.9021438956260681, + "learning_rate": 3.230492196878752e-05, + "loss": 0.5346, + "step": 8855 + }, + { + "epoch": 11.33568, + "grad_norm": 0.8745192289352417, + "learning_rate": 3.230292116846739e-05, + "loss": 0.5481, + "step": 8856 + }, + { + 
"epoch": 11.33696, + "grad_norm": 0.8894162178039551, + "learning_rate": 3.230092036814726e-05, + "loss": 0.558, + "step": 8857 + }, + { + "epoch": 11.33824, + "grad_norm": 0.8988181352615356, + "learning_rate": 3.229891956782713e-05, + "loss": 0.5633, + "step": 8858 + }, + { + "epoch": 11.33952, + "grad_norm": 0.8719711899757385, + "learning_rate": 3.2296918767507004e-05, + "loss": 0.5066, + "step": 8859 + }, + { + "epoch": 11.3408, + "grad_norm": 0.8456899523735046, + "learning_rate": 3.2294917967186876e-05, + "loss": 0.5139, + "step": 8860 + }, + { + "epoch": 11.34208, + "grad_norm": 0.8919529914855957, + "learning_rate": 3.229291716686675e-05, + "loss": 0.5203, + "step": 8861 + }, + { + "epoch": 11.34336, + "grad_norm": 0.8657916188240051, + "learning_rate": 3.229091636654662e-05, + "loss": 0.5298, + "step": 8862 + }, + { + "epoch": 11.34464, + "grad_norm": 0.8953869938850403, + "learning_rate": 3.228891556622649e-05, + "loss": 0.5217, + "step": 8863 + }, + { + "epoch": 11.34592, + "grad_norm": 0.9137585759162903, + "learning_rate": 3.2286914765906363e-05, + "loss": 0.5738, + "step": 8864 + }, + { + "epoch": 11.3472, + "grad_norm": 0.8777027726173401, + "learning_rate": 3.228491396558624e-05, + "loss": 0.5342, + "step": 8865 + }, + { + "epoch": 11.34848, + "grad_norm": 0.9374188780784607, + "learning_rate": 3.228291316526611e-05, + "loss": 0.6148, + "step": 8866 + }, + { + "epoch": 11.34976, + "grad_norm": 0.8601318001747131, + "learning_rate": 3.228091236494598e-05, + "loss": 0.5121, + "step": 8867 + }, + { + "epoch": 11.35104, + "grad_norm": 0.904672384262085, + "learning_rate": 3.227891156462585e-05, + "loss": 0.5718, + "step": 8868 + }, + { + "epoch": 11.35232, + "grad_norm": 0.9223089218139648, + "learning_rate": 3.227691076430572e-05, + "loss": 0.5815, + "step": 8869 + }, + { + "epoch": 11.3536, + "grad_norm": 0.8737932443618774, + "learning_rate": 3.2274909963985595e-05, + "loss": 0.5285, + "step": 8870 + }, + { + "epoch": 11.35488, + "grad_norm": 
0.9372346997261047, + "learning_rate": 3.2272909163665466e-05, + "loss": 0.5748, + "step": 8871 + }, + { + "epoch": 11.35616, + "grad_norm": 0.9131039977073669, + "learning_rate": 3.2270908363345345e-05, + "loss": 0.5606, + "step": 8872 + }, + { + "epoch": 11.35744, + "grad_norm": 0.9296916127204895, + "learning_rate": 3.226890756302522e-05, + "loss": 0.576, + "step": 8873 + }, + { + "epoch": 11.35872, + "grad_norm": 0.9245988130569458, + "learning_rate": 3.226690676270508e-05, + "loss": 0.5462, + "step": 8874 + }, + { + "epoch": 11.36, + "grad_norm": 0.934669017791748, + "learning_rate": 3.2264905962384954e-05, + "loss": 0.555, + "step": 8875 + }, + { + "epoch": 11.36128, + "grad_norm": 0.8804442882537842, + "learning_rate": 3.2262905162064826e-05, + "loss": 0.5157, + "step": 8876 + }, + { + "epoch": 11.36256, + "grad_norm": 0.8725820779800415, + "learning_rate": 3.22609043617447e-05, + "loss": 0.5829, + "step": 8877 + }, + { + "epoch": 11.36384, + "grad_norm": 0.9546533823013306, + "learning_rate": 3.225890356142457e-05, + "loss": 0.5668, + "step": 8878 + }, + { + "epoch": 11.36512, + "grad_norm": 0.9215765595436096, + "learning_rate": 3.225690276110445e-05, + "loss": 0.546, + "step": 8879 + }, + { + "epoch": 11.3664, + "grad_norm": 0.9249255061149597, + "learning_rate": 3.225490196078432e-05, + "loss": 0.5562, + "step": 8880 + }, + { + "epoch": 11.36768, + "grad_norm": 0.9244142174720764, + "learning_rate": 3.225290116046419e-05, + "loss": 0.54, + "step": 8881 + }, + { + "epoch": 11.36896, + "grad_norm": 0.9065011143684387, + "learning_rate": 3.225090036014406e-05, + "loss": 0.5555, + "step": 8882 + }, + { + "epoch": 11.37024, + "grad_norm": 0.8759673237800598, + "learning_rate": 3.224889955982393e-05, + "loss": 0.5506, + "step": 8883 + }, + { + "epoch": 11.37152, + "grad_norm": 0.9149131774902344, + "learning_rate": 3.22468987595038e-05, + "loss": 0.5635, + "step": 8884 + }, + { + "epoch": 11.3728, + "grad_norm": 0.9005359411239624, + "learning_rate": 
3.224489795918367e-05, + "loss": 0.5289, + "step": 8885 + }, + { + "epoch": 11.37408, + "grad_norm": 0.9132285714149475, + "learning_rate": 3.2242897158863544e-05, + "loss": 0.5287, + "step": 8886 + }, + { + "epoch": 11.37536, + "grad_norm": 0.9003539681434631, + "learning_rate": 3.224089635854342e-05, + "loss": 0.521, + "step": 8887 + }, + { + "epoch": 11.37664, + "grad_norm": 0.8908035755157471, + "learning_rate": 3.2238895558223295e-05, + "loss": 0.5668, + "step": 8888 + }, + { + "epoch": 11.37792, + "grad_norm": 0.9106180667877197, + "learning_rate": 3.223689475790317e-05, + "loss": 0.5502, + "step": 8889 + }, + { + "epoch": 11.3792, + "grad_norm": 0.9311418533325195, + "learning_rate": 3.223489395758303e-05, + "loss": 0.5703, + "step": 8890 + }, + { + "epoch": 11.38048, + "grad_norm": 0.8639375567436218, + "learning_rate": 3.2232893157262904e-05, + "loss": 0.5366, + "step": 8891 + }, + { + "epoch": 11.38176, + "grad_norm": 0.903183102607727, + "learning_rate": 3.2230892356942775e-05, + "loss": 0.5783, + "step": 8892 + }, + { + "epoch": 11.38304, + "grad_norm": 0.8958691954612732, + "learning_rate": 3.222889155662265e-05, + "loss": 0.5197, + "step": 8893 + }, + { + "epoch": 11.38432, + "grad_norm": 0.95194411277771, + "learning_rate": 3.2226890756302526e-05, + "loss": 0.6212, + "step": 8894 + }, + { + "epoch": 11.3856, + "grad_norm": 0.8696224689483643, + "learning_rate": 3.22248899559824e-05, + "loss": 0.5376, + "step": 8895 + }, + { + "epoch": 11.38688, + "grad_norm": 0.925094485282898, + "learning_rate": 3.222288915566227e-05, + "loss": 0.5839, + "step": 8896 + }, + { + "epoch": 11.38816, + "grad_norm": 0.8680374026298523, + "learning_rate": 3.222088835534214e-05, + "loss": 0.4927, + "step": 8897 + }, + { + "epoch": 11.38944, + "grad_norm": 0.9025076031684875, + "learning_rate": 3.2218887555022007e-05, + "loss": 0.5683, + "step": 8898 + }, + { + "epoch": 11.39072, + "grad_norm": 0.8952671885490417, + "learning_rate": 3.221688675470188e-05, + "loss": 0.5648, 
+ "step": 8899 + }, + { + "epoch": 11.392, + "grad_norm": 0.916085422039032, + "learning_rate": 3.221488595438175e-05, + "loss": 0.5825, + "step": 8900 + }, + { + "epoch": 11.39328, + "grad_norm": 0.8768932223320007, + "learning_rate": 3.221288515406163e-05, + "loss": 0.5205, + "step": 8901 + }, + { + "epoch": 11.39456, + "grad_norm": 0.912806510925293, + "learning_rate": 3.22108843537415e-05, + "loss": 0.5078, + "step": 8902 + }, + { + "epoch": 11.39584, + "grad_norm": 0.9842191338539124, + "learning_rate": 3.220888355342137e-05, + "loss": 0.6436, + "step": 8903 + }, + { + "epoch": 11.39712, + "grad_norm": 0.9304628372192383, + "learning_rate": 3.2206882753101244e-05, + "loss": 0.5789, + "step": 8904 + }, + { + "epoch": 11.3984, + "grad_norm": 0.9298278093338013, + "learning_rate": 3.2204881952781116e-05, + "loss": 0.575, + "step": 8905 + }, + { + "epoch": 11.39968, + "grad_norm": 0.9112979173660278, + "learning_rate": 3.220288115246098e-05, + "loss": 0.5536, + "step": 8906 + }, + { + "epoch": 11.40096, + "grad_norm": 0.9044650793075562, + "learning_rate": 3.220088035214085e-05, + "loss": 0.5697, + "step": 8907 + }, + { + "epoch": 11.40224, + "grad_norm": 0.90861576795578, + "learning_rate": 3.219887955182073e-05, + "loss": 0.5748, + "step": 8908 + }, + { + "epoch": 11.40352, + "grad_norm": 0.8984891176223755, + "learning_rate": 3.2196878751500604e-05, + "loss": 0.5191, + "step": 8909 + }, + { + "epoch": 11.4048, + "grad_norm": 0.9326877593994141, + "learning_rate": 3.2194877951180476e-05, + "loss": 0.5591, + "step": 8910 + }, + { + "epoch": 11.40608, + "grad_norm": 0.9329106211662292, + "learning_rate": 3.219287715086035e-05, + "loss": 0.5585, + "step": 8911 + }, + { + "epoch": 11.40736, + "grad_norm": 0.917129397392273, + "learning_rate": 3.219087635054022e-05, + "loss": 0.5843, + "step": 8912 + }, + { + "epoch": 11.40864, + "grad_norm": 0.8364509344100952, + "learning_rate": 3.218887555022009e-05, + "loss": 0.5127, + "step": 8913 + }, + { + "epoch": 11.40992, + 
"grad_norm": 0.8630757927894592, + "learning_rate": 3.2186874749899956e-05, + "loss": 0.5589, + "step": 8914 + }, + { + "epoch": 11.411200000000001, + "grad_norm": 0.9007800221443176, + "learning_rate": 3.2184873949579835e-05, + "loss": 0.6069, + "step": 8915 + }, + { + "epoch": 11.41248, + "grad_norm": 0.9004875421524048, + "learning_rate": 3.218287314925971e-05, + "loss": 0.5776, + "step": 8916 + }, + { + "epoch": 11.41376, + "grad_norm": 0.8987116813659668, + "learning_rate": 3.218087234893958e-05, + "loss": 0.5216, + "step": 8917 + }, + { + "epoch": 11.41504, + "grad_norm": 0.8875414133071899, + "learning_rate": 3.217887154861945e-05, + "loss": 0.5412, + "step": 8918 + }, + { + "epoch": 11.41632, + "grad_norm": 0.8848508596420288, + "learning_rate": 3.217687074829932e-05, + "loss": 0.5271, + "step": 8919 + }, + { + "epoch": 11.4176, + "grad_norm": 0.9203560948371887, + "learning_rate": 3.2174869947979194e-05, + "loss": 0.5973, + "step": 8920 + }, + { + "epoch": 11.41888, + "grad_norm": 0.9155365228652954, + "learning_rate": 3.2172869147659066e-05, + "loss": 0.536, + "step": 8921 + }, + { + "epoch": 11.42016, + "grad_norm": 0.9187197685241699, + "learning_rate": 3.217086834733894e-05, + "loss": 0.5452, + "step": 8922 + }, + { + "epoch": 11.42144, + "grad_norm": 0.927579939365387, + "learning_rate": 3.216886754701881e-05, + "loss": 0.5534, + "step": 8923 + }, + { + "epoch": 11.42272, + "grad_norm": 0.9490095376968384, + "learning_rate": 3.216686674669868e-05, + "loss": 0.5418, + "step": 8924 + }, + { + "epoch": 11.424, + "grad_norm": 0.9742210507392883, + "learning_rate": 3.2164865946378553e-05, + "loss": 0.5727, + "step": 8925 + }, + { + "epoch": 11.42528, + "grad_norm": 0.9028719067573547, + "learning_rate": 3.2162865146058425e-05, + "loss": 0.5383, + "step": 8926 + }, + { + "epoch": 11.42656, + "grad_norm": 0.9693518280982971, + "learning_rate": 3.21608643457383e-05, + "loss": 0.6059, + "step": 8927 + }, + { + "epoch": 11.42784, + "grad_norm": 
0.866688072681427, + "learning_rate": 3.215886354541817e-05, + "loss": 0.5364, + "step": 8928 + }, + { + "epoch": 11.42912, + "grad_norm": 0.9084386229515076, + "learning_rate": 3.215686274509804e-05, + "loss": 0.5616, + "step": 8929 + }, + { + "epoch": 11.4304, + "grad_norm": 0.9357685446739197, + "learning_rate": 3.215486194477791e-05, + "loss": 0.5707, + "step": 8930 + }, + { + "epoch": 11.43168, + "grad_norm": 0.8925808072090149, + "learning_rate": 3.2152861144457785e-05, + "loss": 0.5594, + "step": 8931 + }, + { + "epoch": 11.43296, + "grad_norm": 0.9576624035835266, + "learning_rate": 3.2150860344137656e-05, + "loss": 0.5892, + "step": 8932 + }, + { + "epoch": 11.43424, + "grad_norm": 0.8737156391143799, + "learning_rate": 3.214885954381753e-05, + "loss": 0.54, + "step": 8933 + }, + { + "epoch": 11.43552, + "grad_norm": 0.9288681745529175, + "learning_rate": 3.21468587434974e-05, + "loss": 0.5832, + "step": 8934 + }, + { + "epoch": 11.4368, + "grad_norm": 0.9747262597084045, + "learning_rate": 3.214485794317727e-05, + "loss": 0.5935, + "step": 8935 + }, + { + "epoch": 11.43808, + "grad_norm": 0.8848868012428284, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.5971, + "step": 8936 + }, + { + "epoch": 11.43936, + "grad_norm": 0.9349198341369629, + "learning_rate": 3.2140856342537016e-05, + "loss": 0.5941, + "step": 8937 + }, + { + "epoch": 11.44064, + "grad_norm": 0.8732218146324158, + "learning_rate": 3.213885554221689e-05, + "loss": 0.5509, + "step": 8938 + }, + { + "epoch": 11.44192, + "grad_norm": 0.8781768083572388, + "learning_rate": 3.213685474189676e-05, + "loss": 0.569, + "step": 8939 + }, + { + "epoch": 11.4432, + "grad_norm": 0.8788825869560242, + "learning_rate": 3.213485394157663e-05, + "loss": 0.5946, + "step": 8940 + }, + { + "epoch": 11.44448, + "grad_norm": 0.8523260354995728, + "learning_rate": 3.21328531412565e-05, + "loss": 0.5232, + "step": 8941 + }, + { + "epoch": 11.44576, + "grad_norm": 0.9410714507102966, + "learning_rate": 
3.2130852340936375e-05, + "loss": 0.5787, + "step": 8942 + }, + { + "epoch": 11.44704, + "grad_norm": 0.9099283814430237, + "learning_rate": 3.2128851540616254e-05, + "loss": 0.5687, + "step": 8943 + }, + { + "epoch": 11.44832, + "grad_norm": 0.9799555540084839, + "learning_rate": 3.212685074029612e-05, + "loss": 0.586, + "step": 8944 + }, + { + "epoch": 11.4496, + "grad_norm": 0.983367383480072, + "learning_rate": 3.212484993997599e-05, + "loss": 0.5625, + "step": 8945 + }, + { + "epoch": 11.45088, + "grad_norm": 0.9114019870758057, + "learning_rate": 3.212284913965586e-05, + "loss": 0.5501, + "step": 8946 + }, + { + "epoch": 11.45216, + "grad_norm": 0.964597761631012, + "learning_rate": 3.2120848339335734e-05, + "loss": 0.584, + "step": 8947 + }, + { + "epoch": 11.45344, + "grad_norm": 0.8593063950538635, + "learning_rate": 3.2118847539015606e-05, + "loss": 0.5178, + "step": 8948 + }, + { + "epoch": 11.45472, + "grad_norm": 0.8868183493614197, + "learning_rate": 3.211684673869548e-05, + "loss": 0.5591, + "step": 8949 + }, + { + "epoch": 11.456, + "grad_norm": 0.8840005397796631, + "learning_rate": 3.211484593837536e-05, + "loss": 0.5101, + "step": 8950 + }, + { + "epoch": 11.45728, + "grad_norm": 0.9012945890426636, + "learning_rate": 3.211284513805523e-05, + "loss": 0.5854, + "step": 8951 + }, + { + "epoch": 11.45856, + "grad_norm": 0.8864527940750122, + "learning_rate": 3.2110844337735094e-05, + "loss": 0.5846, + "step": 8952 + }, + { + "epoch": 11.45984, + "grad_norm": 0.9185726642608643, + "learning_rate": 3.2108843537414965e-05, + "loss": 0.5771, + "step": 8953 + }, + { + "epoch": 11.46112, + "grad_norm": 0.8853892683982849, + "learning_rate": 3.210684273709484e-05, + "loss": 0.526, + "step": 8954 + }, + { + "epoch": 11.4624, + "grad_norm": 0.9288807511329651, + "learning_rate": 3.210484193677471e-05, + "loss": 0.5306, + "step": 8955 + }, + { + "epoch": 11.46368, + "grad_norm": 0.9466400742530823, + "learning_rate": 3.210284113645458e-05, + "loss": 0.606, + 
"step": 8956 + }, + { + "epoch": 11.46496, + "grad_norm": 0.9334494471549988, + "learning_rate": 3.210084033613446e-05, + "loss": 0.5746, + "step": 8957 + }, + { + "epoch": 11.466239999999999, + "grad_norm": 0.9179563522338867, + "learning_rate": 3.209883953581433e-05, + "loss": 0.5453, + "step": 8958 + }, + { + "epoch": 11.46752, + "grad_norm": 0.9171554446220398, + "learning_rate": 3.2096838735494203e-05, + "loss": 0.5683, + "step": 8959 + }, + { + "epoch": 11.4688, + "grad_norm": 0.9348716735839844, + "learning_rate": 3.209483793517407e-05, + "loss": 0.5502, + "step": 8960 + }, + { + "epoch": 11.47008, + "grad_norm": 0.9068107604980469, + "learning_rate": 3.209283713485394e-05, + "loss": 0.5924, + "step": 8961 + }, + { + "epoch": 11.47136, + "grad_norm": 0.9149397611618042, + "learning_rate": 3.209083633453381e-05, + "loss": 0.5681, + "step": 8962 + }, + { + "epoch": 11.47264, + "grad_norm": 0.876530110836029, + "learning_rate": 3.2088835534213684e-05, + "loss": 0.55, + "step": 8963 + }, + { + "epoch": 11.47392, + "grad_norm": 0.8814985752105713, + "learning_rate": 3.208683473389356e-05, + "loss": 0.5513, + "step": 8964 + }, + { + "epoch": 11.4752, + "grad_norm": 0.8350082039833069, + "learning_rate": 3.2084833933573435e-05, + "loss": 0.5217, + "step": 8965 + }, + { + "epoch": 11.47648, + "grad_norm": 0.8814594149589539, + "learning_rate": 3.2082833133253306e-05, + "loss": 0.5626, + "step": 8966 + }, + { + "epoch": 11.47776, + "grad_norm": 0.8578638434410095, + "learning_rate": 3.208083233293318e-05, + "loss": 0.5671, + "step": 8967 + }, + { + "epoch": 11.47904, + "grad_norm": 0.8845701217651367, + "learning_rate": 3.207883153261304e-05, + "loss": 0.5927, + "step": 8968 + }, + { + "epoch": 11.48032, + "grad_norm": 0.8991924524307251, + "learning_rate": 3.2076830732292915e-05, + "loss": 0.5528, + "step": 8969 + }, + { + "epoch": 11.4816, + "grad_norm": 0.898190975189209, + "learning_rate": 3.207482993197279e-05, + "loss": 0.5988, + "step": 8970 + }, + { + 
"epoch": 11.48288, + "grad_norm": 0.9360696077346802, + "learning_rate": 3.2072829131652666e-05, + "loss": 0.569, + "step": 8971 + }, + { + "epoch": 11.48416, + "grad_norm": 0.845306932926178, + "learning_rate": 3.207082833133254e-05, + "loss": 0.5489, + "step": 8972 + }, + { + "epoch": 11.48544, + "grad_norm": 0.9122028946876526, + "learning_rate": 3.206882753101241e-05, + "loss": 0.5735, + "step": 8973 + }, + { + "epoch": 11.48672, + "grad_norm": 0.9009816646575928, + "learning_rate": 3.206682673069228e-05, + "loss": 0.5192, + "step": 8974 + }, + { + "epoch": 11.488, + "grad_norm": 0.9005442261695862, + "learning_rate": 3.206482593037215e-05, + "loss": 0.6311, + "step": 8975 + }, + { + "epoch": 11.48928, + "grad_norm": 0.931171715259552, + "learning_rate": 3.206282513005202e-05, + "loss": 0.5831, + "step": 8976 + }, + { + "epoch": 11.49056, + "grad_norm": 0.902475893497467, + "learning_rate": 3.206082432973189e-05, + "loss": 0.5382, + "step": 8977 + }, + { + "epoch": 11.49184, + "grad_norm": 0.9351593852043152, + "learning_rate": 3.205882352941177e-05, + "loss": 0.5586, + "step": 8978 + }, + { + "epoch": 11.49312, + "grad_norm": 0.9076836705207825, + "learning_rate": 3.205682272909164e-05, + "loss": 0.5568, + "step": 8979 + }, + { + "epoch": 11.4944, + "grad_norm": 0.8612614274024963, + "learning_rate": 3.205482192877151e-05, + "loss": 0.531, + "step": 8980 + }, + { + "epoch": 11.49568, + "grad_norm": 0.8604639768600464, + "learning_rate": 3.2052821128451384e-05, + "loss": 0.5539, + "step": 8981 + }, + { + "epoch": 11.49696, + "grad_norm": 0.8722749352455139, + "learning_rate": 3.2050820328131256e-05, + "loss": 0.5436, + "step": 8982 + }, + { + "epoch": 11.49824, + "grad_norm": 0.8463775515556335, + "learning_rate": 3.204881952781113e-05, + "loss": 0.5086, + "step": 8983 + }, + { + "epoch": 11.49952, + "grad_norm": 0.9018126130104065, + "learning_rate": 3.204681872749099e-05, + "loss": 0.6194, + "step": 8984 + }, + { + "epoch": 11.5008, + "grad_norm": 
0.9355940222740173, + "learning_rate": 3.204481792717087e-05, + "loss": 0.5751, + "step": 8985 + }, + { + "epoch": 11.50208, + "grad_norm": 0.8555790185928345, + "learning_rate": 3.2042817126850744e-05, + "loss": 0.5685, + "step": 8986 + }, + { + "epoch": 11.50336, + "grad_norm": 0.8574039936065674, + "learning_rate": 3.2040816326530615e-05, + "loss": 0.5265, + "step": 8987 + }, + { + "epoch": 11.50464, + "grad_norm": 0.898993730545044, + "learning_rate": 3.203881552621049e-05, + "loss": 0.5406, + "step": 8988 + }, + { + "epoch": 11.50592, + "grad_norm": 0.9504684805870056, + "learning_rate": 3.203681472589036e-05, + "loss": 0.6053, + "step": 8989 + }, + { + "epoch": 11.5072, + "grad_norm": 0.9039958119392395, + "learning_rate": 3.203481392557023e-05, + "loss": 0.577, + "step": 8990 + }, + { + "epoch": 11.50848, + "grad_norm": 0.9396102428436279, + "learning_rate": 3.20328131252501e-05, + "loss": 0.5763, + "step": 8991 + }, + { + "epoch": 11.50976, + "grad_norm": 0.8567662835121155, + "learning_rate": 3.203081232492997e-05, + "loss": 0.5162, + "step": 8992 + }, + { + "epoch": 11.51104, + "grad_norm": 0.9210956692695618, + "learning_rate": 3.2028811524609847e-05, + "loss": 0.579, + "step": 8993 + }, + { + "epoch": 11.51232, + "grad_norm": 0.8853841423988342, + "learning_rate": 3.202681072428972e-05, + "loss": 0.5236, + "step": 8994 + }, + { + "epoch": 11.5136, + "grad_norm": 0.9339121580123901, + "learning_rate": 3.202480992396959e-05, + "loss": 0.619, + "step": 8995 + }, + { + "epoch": 11.51488, + "grad_norm": 0.8573284149169922, + "learning_rate": 3.202280912364946e-05, + "loss": 0.5583, + "step": 8996 + }, + { + "epoch": 11.51616, + "grad_norm": 0.9069136381149292, + "learning_rate": 3.2020808323329334e-05, + "loss": 0.5256, + "step": 8997 + }, + { + "epoch": 11.51744, + "grad_norm": 0.9280446767807007, + "learning_rate": 3.2018807523009206e-05, + "loss": 0.6084, + "step": 8998 + }, + { + "epoch": 11.51872, + "grad_norm": 0.8712737560272217, + "learning_rate": 
3.201680672268908e-05, + "loss": 0.5816, + "step": 8999 + }, + { + "epoch": 11.52, + "grad_norm": 0.8745893836021423, + "learning_rate": 3.201480592236895e-05, + "loss": 0.5167, + "step": 9000 + }, + { + "epoch": 11.52128, + "grad_norm": 0.8918792605400085, + "learning_rate": 3.201280512204882e-05, + "loss": 0.587, + "step": 9001 + }, + { + "epoch": 11.52256, + "grad_norm": 0.9053367972373962, + "learning_rate": 3.201080432172869e-05, + "loss": 0.5899, + "step": 9002 + }, + { + "epoch": 11.52384, + "grad_norm": 0.8964258432388306, + "learning_rate": 3.2008803521408565e-05, + "loss": 0.5589, + "step": 9003 + }, + { + "epoch": 11.52512, + "grad_norm": 0.852827250957489, + "learning_rate": 3.200680272108844e-05, + "loss": 0.5024, + "step": 9004 + }, + { + "epoch": 11.5264, + "grad_norm": 0.8889139890670776, + "learning_rate": 3.200480192076831e-05, + "loss": 0.5795, + "step": 9005 + }, + { + "epoch": 11.52768, + "grad_norm": 0.8449302315711975, + "learning_rate": 3.200280112044818e-05, + "loss": 0.5439, + "step": 9006 + }, + { + "epoch": 11.52896, + "grad_norm": 0.9062175154685974, + "learning_rate": 3.200080032012805e-05, + "loss": 0.543, + "step": 9007 + }, + { + "epoch": 11.53024, + "grad_norm": 0.9764111042022705, + "learning_rate": 3.1998799519807924e-05, + "loss": 0.5982, + "step": 9008 + }, + { + "epoch": 11.53152, + "grad_norm": 0.8889600038528442, + "learning_rate": 3.1996798719487796e-05, + "loss": 0.5308, + "step": 9009 + }, + { + "epoch": 11.5328, + "grad_norm": 0.9224411845207214, + "learning_rate": 3.199479791916767e-05, + "loss": 0.5809, + "step": 9010 + }, + { + "epoch": 11.53408, + "grad_norm": 0.8506020307540894, + "learning_rate": 3.199279711884754e-05, + "loss": 0.5375, + "step": 9011 + }, + { + "epoch": 11.53536, + "grad_norm": 0.8242222666740417, + "learning_rate": 3.199079631852741e-05, + "loss": 0.5254, + "step": 9012 + }, + { + "epoch": 11.53664, + "grad_norm": 0.8847529888153076, + "learning_rate": 3.1988795518207284e-05, + "loss": 0.5867, + 
"step": 9013 + }, + { + "epoch": 11.53792, + "grad_norm": 0.8491979837417603, + "learning_rate": 3.1986794717887156e-05, + "loss": 0.5463, + "step": 9014 + }, + { + "epoch": 11.5392, + "grad_norm": 0.8366856575012207, + "learning_rate": 3.198479391756703e-05, + "loss": 0.5193, + "step": 9015 + }, + { + "epoch": 11.54048, + "grad_norm": 0.8921111822128296, + "learning_rate": 3.19827931172469e-05, + "loss": 0.5723, + "step": 9016 + }, + { + "epoch": 11.54176, + "grad_norm": 0.8784570097923279, + "learning_rate": 3.198079231692677e-05, + "loss": 0.5305, + "step": 9017 + }, + { + "epoch": 11.54304, + "grad_norm": 0.9135284423828125, + "learning_rate": 3.197879151660664e-05, + "loss": 0.5463, + "step": 9018 + }, + { + "epoch": 11.54432, + "grad_norm": 0.8987352848052979, + "learning_rate": 3.1976790716286515e-05, + "loss": 0.6218, + "step": 9019 + }, + { + "epoch": 11.5456, + "grad_norm": 0.857602059841156, + "learning_rate": 3.197478991596639e-05, + "loss": 0.5085, + "step": 9020 + }, + { + "epoch": 11.54688, + "grad_norm": 0.8649136424064636, + "learning_rate": 3.1972789115646265e-05, + "loss": 0.5577, + "step": 9021 + }, + { + "epoch": 11.54816, + "grad_norm": 0.8804261684417725, + "learning_rate": 3.197078831532613e-05, + "loss": 0.5755, + "step": 9022 + }, + { + "epoch": 11.54944, + "grad_norm": 0.8786762356758118, + "learning_rate": 3.1968787515006e-05, + "loss": 0.5368, + "step": 9023 + }, + { + "epoch": 11.55072, + "grad_norm": 0.8894196152687073, + "learning_rate": 3.1966786714685874e-05, + "loss": 0.5745, + "step": 9024 + }, + { + "epoch": 11.552, + "grad_norm": 0.9118727445602417, + "learning_rate": 3.1964785914365746e-05, + "loss": 0.5589, + "step": 9025 + }, + { + "epoch": 11.55328, + "grad_norm": 0.9220285415649414, + "learning_rate": 3.196278511404562e-05, + "loss": 0.5938, + "step": 9026 + }, + { + "epoch": 11.55456, + "grad_norm": 0.9039615392684937, + "learning_rate": 3.196078431372549e-05, + "loss": 0.5626, + "step": 9027 + }, + { + "epoch": 11.55584, 
+ "grad_norm": 0.8908433318138123, + "learning_rate": 3.195878351340537e-05, + "loss": 0.6023, + "step": 9028 + }, + { + "epoch": 11.55712, + "grad_norm": 0.8978443741798401, + "learning_rate": 3.195678271308524e-05, + "loss": 0.5491, + "step": 9029 + }, + { + "epoch": 11.5584, + "grad_norm": 0.8988606929779053, + "learning_rate": 3.1954781912765105e-05, + "loss": 0.5366, + "step": 9030 + }, + { + "epoch": 11.55968, + "grad_norm": 0.9168279767036438, + "learning_rate": 3.195278111244498e-05, + "loss": 0.5961, + "step": 9031 + }, + { + "epoch": 11.56096, + "grad_norm": 0.8574641942977905, + "learning_rate": 3.195078031212485e-05, + "loss": 0.5461, + "step": 9032 + }, + { + "epoch": 11.56224, + "grad_norm": 0.9339374303817749, + "learning_rate": 3.194877951180472e-05, + "loss": 0.5732, + "step": 9033 + }, + { + "epoch": 11.56352, + "grad_norm": 0.9759158492088318, + "learning_rate": 3.194677871148459e-05, + "loss": 0.6018, + "step": 9034 + }, + { + "epoch": 11.5648, + "grad_norm": 0.9553015232086182, + "learning_rate": 3.194477791116447e-05, + "loss": 0.5936, + "step": 9035 + }, + { + "epoch": 11.56608, + "grad_norm": 0.9375491142272949, + "learning_rate": 3.194277711084434e-05, + "loss": 0.5606, + "step": 9036 + }, + { + "epoch": 11.56736, + "grad_norm": 0.9261005520820618, + "learning_rate": 3.1940776310524215e-05, + "loss": 0.61, + "step": 9037 + }, + { + "epoch": 11.56864, + "grad_norm": 0.9291247129440308, + "learning_rate": 3.193877551020408e-05, + "loss": 0.6361, + "step": 9038 + }, + { + "epoch": 11.56992, + "grad_norm": 0.9398210644721985, + "learning_rate": 3.193677470988395e-05, + "loss": 0.554, + "step": 9039 + }, + { + "epoch": 11.5712, + "grad_norm": 0.897376298904419, + "learning_rate": 3.1934773909563824e-05, + "loss": 0.5887, + "step": 9040 + }, + { + "epoch": 11.57248, + "grad_norm": 0.8442445397377014, + "learning_rate": 3.1932773109243696e-05, + "loss": 0.5555, + "step": 9041 + }, + { + "epoch": 11.57376, + "grad_norm": 0.8896632194519043, + 
"learning_rate": 3.1930772308923574e-05, + "loss": 0.5612, + "step": 9042 + }, + { + "epoch": 11.57504, + "grad_norm": 0.8563700914382935, + "learning_rate": 3.1928771508603446e-05, + "loss": 0.5588, + "step": 9043 + }, + { + "epoch": 11.57632, + "grad_norm": 0.8944402933120728, + "learning_rate": 3.192677070828332e-05, + "loss": 0.5691, + "step": 9044 + }, + { + "epoch": 11.5776, + "grad_norm": 0.9250562191009521, + "learning_rate": 3.192476990796319e-05, + "loss": 0.582, + "step": 9045 + }, + { + "epoch": 11.57888, + "grad_norm": 0.9460017085075378, + "learning_rate": 3.1922769107643055e-05, + "loss": 0.6296, + "step": 9046 + }, + { + "epoch": 11.58016, + "grad_norm": 0.9160764217376709, + "learning_rate": 3.192076830732293e-05, + "loss": 0.5749, + "step": 9047 + }, + { + "epoch": 11.58144, + "grad_norm": 0.8978300094604492, + "learning_rate": 3.19187675070028e-05, + "loss": 0.522, + "step": 9048 + }, + { + "epoch": 11.58272, + "grad_norm": 0.9328131675720215, + "learning_rate": 3.191676670668268e-05, + "loss": 0.6059, + "step": 9049 + }, + { + "epoch": 11.584, + "grad_norm": 0.9057248830795288, + "learning_rate": 3.191476590636255e-05, + "loss": 0.5399, + "step": 9050 + }, + { + "epoch": 11.585280000000001, + "grad_norm": 0.9383625984191895, + "learning_rate": 3.191276510604242e-05, + "loss": 0.6006, + "step": 9051 + }, + { + "epoch": 11.58656, + "grad_norm": 0.9605696797370911, + "learning_rate": 3.191076430572229e-05, + "loss": 0.6013, + "step": 9052 + }, + { + "epoch": 11.58784, + "grad_norm": 0.9236608743667603, + "learning_rate": 3.1908763505402165e-05, + "loss": 0.54, + "step": 9053 + }, + { + "epoch": 11.58912, + "grad_norm": 0.9085046052932739, + "learning_rate": 3.190676270508203e-05, + "loss": 0.5383, + "step": 9054 + }, + { + "epoch": 11.5904, + "grad_norm": 0.9364131689071655, + "learning_rate": 3.19047619047619e-05, + "loss": 0.5888, + "step": 9055 + }, + { + "epoch": 11.59168, + "grad_norm": 0.9124594330787659, + "learning_rate": 
3.190276110444178e-05, + "loss": 0.5496, + "step": 9056 + }, + { + "epoch": 11.59296, + "grad_norm": 0.900393009185791, + "learning_rate": 3.190076030412165e-05, + "loss": 0.5176, + "step": 9057 + }, + { + "epoch": 11.59424, + "grad_norm": 0.8931260704994202, + "learning_rate": 3.1898759503801524e-05, + "loss": 0.5582, + "step": 9058 + }, + { + "epoch": 11.59552, + "grad_norm": 0.8596901297569275, + "learning_rate": 3.1896758703481396e-05, + "loss": 0.5446, + "step": 9059 + }, + { + "epoch": 11.5968, + "grad_norm": 0.8517556190490723, + "learning_rate": 3.189475790316127e-05, + "loss": 0.5244, + "step": 9060 + }, + { + "epoch": 11.59808, + "grad_norm": 0.9195119142532349, + "learning_rate": 3.189275710284114e-05, + "loss": 0.6166, + "step": 9061 + }, + { + "epoch": 11.59936, + "grad_norm": 0.8437806367874146, + "learning_rate": 3.1890756302521005e-05, + "loss": 0.5334, + "step": 9062 + }, + { + "epoch": 11.60064, + "grad_norm": 0.8890377283096313, + "learning_rate": 3.188875550220088e-05, + "loss": 0.5651, + "step": 9063 + }, + { + "epoch": 11.60192, + "grad_norm": 0.9175596833229065, + "learning_rate": 3.1886754701880755e-05, + "loss": 0.587, + "step": 9064 + }, + { + "epoch": 11.6032, + "grad_norm": 0.8746216893196106, + "learning_rate": 3.188475390156063e-05, + "loss": 0.562, + "step": 9065 + }, + { + "epoch": 11.60448, + "grad_norm": 0.9140630960464478, + "learning_rate": 3.18827531012405e-05, + "loss": 0.5781, + "step": 9066 + }, + { + "epoch": 11.60576, + "grad_norm": 0.8744653463363647, + "learning_rate": 3.188075230092037e-05, + "loss": 0.5093, + "step": 9067 + }, + { + "epoch": 11.60704, + "grad_norm": 0.8821300268173218, + "learning_rate": 3.187875150060024e-05, + "loss": 0.5314, + "step": 9068 + }, + { + "epoch": 11.608319999999999, + "grad_norm": 0.9396583437919617, + "learning_rate": 3.1876750700280114e-05, + "loss": 0.5425, + "step": 9069 + }, + { + "epoch": 11.6096, + "grad_norm": 0.9217917323112488, + "learning_rate": 3.1874749899959986e-05, + 
"loss": 0.5373, + "step": 9070 + }, + { + "epoch": 11.61088, + "grad_norm": 0.9106178283691406, + "learning_rate": 3.187274909963986e-05, + "loss": 0.5847, + "step": 9071 + }, + { + "epoch": 11.61216, + "grad_norm": 0.852333128452301, + "learning_rate": 3.187074829931973e-05, + "loss": 0.4806, + "step": 9072 + }, + { + "epoch": 11.61344, + "grad_norm": 0.8371844291687012, + "learning_rate": 3.18687474989996e-05, + "loss": 0.5455, + "step": 9073 + }, + { + "epoch": 11.61472, + "grad_norm": 0.8995749950408936, + "learning_rate": 3.1866746698679474e-05, + "loss": 0.5826, + "step": 9074 + }, + { + "epoch": 11.616, + "grad_norm": 0.8314163088798523, + "learning_rate": 3.1864745898359346e-05, + "loss": 0.5006, + "step": 9075 + }, + { + "epoch": 11.617280000000001, + "grad_norm": 0.920068085193634, + "learning_rate": 3.186274509803922e-05, + "loss": 0.5947, + "step": 9076 + }, + { + "epoch": 11.61856, + "grad_norm": 0.9251027703285217, + "learning_rate": 3.186074429771909e-05, + "loss": 0.5639, + "step": 9077 + }, + { + "epoch": 11.61984, + "grad_norm": 0.8840105533599854, + "learning_rate": 3.185874349739896e-05, + "loss": 0.543, + "step": 9078 + }, + { + "epoch": 11.62112, + "grad_norm": 0.8957288265228271, + "learning_rate": 3.185674269707883e-05, + "loss": 0.5646, + "step": 9079 + }, + { + "epoch": 11.6224, + "grad_norm": 0.8777086734771729, + "learning_rate": 3.1854741896758705e-05, + "loss": 0.554, + "step": 9080 + }, + { + "epoch": 11.62368, + "grad_norm": 0.8875241875648499, + "learning_rate": 3.185274109643858e-05, + "loss": 0.5173, + "step": 9081 + }, + { + "epoch": 11.62496, + "grad_norm": 0.9366897344589233, + "learning_rate": 3.185074029611845e-05, + "loss": 0.6264, + "step": 9082 + }, + { + "epoch": 11.62624, + "grad_norm": 0.9162141680717468, + "learning_rate": 3.184873949579832e-05, + "loss": 0.5632, + "step": 9083 + }, + { + "epoch": 11.62752, + "grad_norm": 0.9224411249160767, + "learning_rate": 3.184673869547819e-05, + "loss": 0.5485, + "step": 9084 + 
}, + { + "epoch": 11.6288, + "grad_norm": 0.8495880365371704, + "learning_rate": 3.1844737895158064e-05, + "loss": 0.4792, + "step": 9085 + }, + { + "epoch": 11.63008, + "grad_norm": 0.9102917313575745, + "learning_rate": 3.1842737094837936e-05, + "loss": 0.563, + "step": 9086 + }, + { + "epoch": 11.63136, + "grad_norm": 0.9147833585739136, + "learning_rate": 3.184073629451781e-05, + "loss": 0.5744, + "step": 9087 + }, + { + "epoch": 11.63264, + "grad_norm": 0.897863507270813, + "learning_rate": 3.183873549419768e-05, + "loss": 0.544, + "step": 9088 + }, + { + "epoch": 11.63392, + "grad_norm": 0.8683566451072693, + "learning_rate": 3.183673469387755e-05, + "loss": 0.5532, + "step": 9089 + }, + { + "epoch": 11.6352, + "grad_norm": 0.8494045734405518, + "learning_rate": 3.1834733893557423e-05, + "loss": 0.5413, + "step": 9090 + }, + { + "epoch": 11.63648, + "grad_norm": 0.8431652784347534, + "learning_rate": 3.18327330932373e-05, + "loss": 0.5293, + "step": 9091 + }, + { + "epoch": 11.63776, + "grad_norm": 0.8813535571098328, + "learning_rate": 3.183073229291717e-05, + "loss": 0.5671, + "step": 9092 + }, + { + "epoch": 11.63904, + "grad_norm": 0.9110351204872131, + "learning_rate": 3.182873149259704e-05, + "loss": 0.5919, + "step": 9093 + }, + { + "epoch": 11.64032, + "grad_norm": 0.9060587286949158, + "learning_rate": 3.182673069227691e-05, + "loss": 0.5494, + "step": 9094 + }, + { + "epoch": 11.6416, + "grad_norm": 0.8764411211013794, + "learning_rate": 3.182472989195678e-05, + "loss": 0.5471, + "step": 9095 + }, + { + "epoch": 11.64288, + "grad_norm": 0.8792155981063843, + "learning_rate": 3.1822729091636655e-05, + "loss": 0.581, + "step": 9096 + }, + { + "epoch": 11.64416, + "grad_norm": 0.8817364573478699, + "learning_rate": 3.1820728291316526e-05, + "loss": 0.5777, + "step": 9097 + }, + { + "epoch": 11.64544, + "grad_norm": 0.8980505466461182, + "learning_rate": 3.1818727490996405e-05, + "loss": 0.5925, + "step": 9098 + }, + { + "epoch": 11.64672, + 
"grad_norm": 0.9368979930877686, + "learning_rate": 3.181672669067628e-05, + "loss": 0.5798, + "step": 9099 + }, + { + "epoch": 11.648, + "grad_norm": 0.9280688762664795, + "learning_rate": 3.181472589035614e-05, + "loss": 0.559, + "step": 9100 + }, + { + "epoch": 11.64928, + "grad_norm": 0.8853485584259033, + "learning_rate": 3.1812725090036014e-05, + "loss": 0.5102, + "step": 9101 + }, + { + "epoch": 11.65056, + "grad_norm": 0.9357476234436035, + "learning_rate": 3.1810724289715886e-05, + "loss": 0.5616, + "step": 9102 + }, + { + "epoch": 11.65184, + "grad_norm": 0.8820586800575256, + "learning_rate": 3.180872348939576e-05, + "loss": 0.5462, + "step": 9103 + }, + { + "epoch": 11.65312, + "grad_norm": 0.9340028166770935, + "learning_rate": 3.180672268907563e-05, + "loss": 0.5758, + "step": 9104 + }, + { + "epoch": 11.6544, + "grad_norm": 0.9392242431640625, + "learning_rate": 3.18047218887555e-05, + "loss": 0.546, + "step": 9105 + }, + { + "epoch": 11.65568, + "grad_norm": 0.874309241771698, + "learning_rate": 3.180272108843538e-05, + "loss": 0.5407, + "step": 9106 + }, + { + "epoch": 11.65696, + "grad_norm": 0.9422518610954285, + "learning_rate": 3.180072028811525e-05, + "loss": 0.5634, + "step": 9107 + }, + { + "epoch": 11.65824, + "grad_norm": 0.8917849659919739, + "learning_rate": 3.179871948779512e-05, + "loss": 0.5674, + "step": 9108 + }, + { + "epoch": 11.65952, + "grad_norm": 0.9151961207389832, + "learning_rate": 3.179671868747499e-05, + "loss": 0.5681, + "step": 9109 + }, + { + "epoch": 11.6608, + "grad_norm": 0.9490141868591309, + "learning_rate": 3.179471788715486e-05, + "loss": 0.5851, + "step": 9110 + }, + { + "epoch": 11.66208, + "grad_norm": 0.9519876837730408, + "learning_rate": 3.179271708683473e-05, + "loss": 0.5503, + "step": 9111 + }, + { + "epoch": 11.66336, + "grad_norm": 0.9187465906143188, + "learning_rate": 3.1790716286514604e-05, + "loss": 0.5978, + "step": 9112 + }, + { + "epoch": 11.66464, + "grad_norm": 0.8703121542930603, + 
"learning_rate": 3.178871548619448e-05, + "loss": 0.5282, + "step": 9113 + }, + { + "epoch": 11.66592, + "grad_norm": 0.9381796717643738, + "learning_rate": 3.1786714685874355e-05, + "loss": 0.5909, + "step": 9114 + }, + { + "epoch": 11.6672, + "grad_norm": 0.8817717432975769, + "learning_rate": 3.1784713885554227e-05, + "loss": 0.5652, + "step": 9115 + }, + { + "epoch": 11.66848, + "grad_norm": 0.8601658940315247, + "learning_rate": 3.178271308523409e-05, + "loss": 0.5071, + "step": 9116 + }, + { + "epoch": 11.66976, + "grad_norm": 0.9270244836807251, + "learning_rate": 3.1780712284913964e-05, + "loss": 0.5354, + "step": 9117 + }, + { + "epoch": 11.67104, + "grad_norm": 0.885297954082489, + "learning_rate": 3.1778711484593835e-05, + "loss": 0.577, + "step": 9118 + }, + { + "epoch": 11.67232, + "grad_norm": 0.929111659526825, + "learning_rate": 3.177671068427371e-05, + "loss": 0.5876, + "step": 9119 + }, + { + "epoch": 11.6736, + "grad_norm": 0.8553233742713928, + "learning_rate": 3.1774709883953586e-05, + "loss": 0.5356, + "step": 9120 + }, + { + "epoch": 11.67488, + "grad_norm": 0.9026567339897156, + "learning_rate": 3.177270908363346e-05, + "loss": 0.5431, + "step": 9121 + }, + { + "epoch": 11.67616, + "grad_norm": 0.9046716690063477, + "learning_rate": 3.177070828331333e-05, + "loss": 0.5823, + "step": 9122 + }, + { + "epoch": 11.67744, + "grad_norm": 0.8545054197311401, + "learning_rate": 3.17687074829932e-05, + "loss": 0.5063, + "step": 9123 + }, + { + "epoch": 11.67872, + "grad_norm": 0.9412639737129211, + "learning_rate": 3.1766706682673067e-05, + "loss": 0.5906, + "step": 9124 + }, + { + "epoch": 11.68, + "grad_norm": 0.9309195280075073, + "learning_rate": 3.176470588235294e-05, + "loss": 0.5749, + "step": 9125 + }, + { + "epoch": 11.68128, + "grad_norm": 0.9051558971405029, + "learning_rate": 3.176270508203281e-05, + "loss": 0.5678, + "step": 9126 + }, + { + "epoch": 11.68256, + "grad_norm": 0.927655816078186, + "learning_rate": 3.176070428171269e-05, + 
"loss": 0.6117, + "step": 9127 + }, + { + "epoch": 11.68384, + "grad_norm": 0.9260802865028381, + "learning_rate": 3.175870348139256e-05, + "loss": 0.5492, + "step": 9128 + }, + { + "epoch": 11.68512, + "grad_norm": 0.8989552855491638, + "learning_rate": 3.175670268107243e-05, + "loss": 0.5712, + "step": 9129 + }, + { + "epoch": 11.6864, + "grad_norm": 0.9030582308769226, + "learning_rate": 3.1754701880752304e-05, + "loss": 0.5752, + "step": 9130 + }, + { + "epoch": 11.68768, + "grad_norm": 0.9276950359344482, + "learning_rate": 3.1752701080432176e-05, + "loss": 0.6009, + "step": 9131 + }, + { + "epoch": 11.68896, + "grad_norm": 0.8523882031440735, + "learning_rate": 3.175070028011204e-05, + "loss": 0.5174, + "step": 9132 + }, + { + "epoch": 11.69024, + "grad_norm": 0.9310316443443298, + "learning_rate": 3.174869947979191e-05, + "loss": 0.6016, + "step": 9133 + }, + { + "epoch": 11.69152, + "grad_norm": 0.9005877375602722, + "learning_rate": 3.174669867947179e-05, + "loss": 0.5351, + "step": 9134 + }, + { + "epoch": 11.6928, + "grad_norm": 0.9501352906227112, + "learning_rate": 3.1744697879151664e-05, + "loss": 0.5938, + "step": 9135 + }, + { + "epoch": 11.69408, + "grad_norm": 0.9486740827560425, + "learning_rate": 3.1742697078831536e-05, + "loss": 0.5906, + "step": 9136 + }, + { + "epoch": 11.69536, + "grad_norm": 0.9294406771659851, + "learning_rate": 3.174069627851141e-05, + "loss": 0.592, + "step": 9137 + }, + { + "epoch": 11.69664, + "grad_norm": 0.8708564043045044, + "learning_rate": 3.173869547819128e-05, + "loss": 0.5135, + "step": 9138 + }, + { + "epoch": 11.69792, + "grad_norm": 1.0165308713912964, + "learning_rate": 3.173669467787115e-05, + "loss": 0.6272, + "step": 9139 + }, + { + "epoch": 11.6992, + "grad_norm": 0.8839307427406311, + "learning_rate": 3.1734693877551016e-05, + "loss": 0.5633, + "step": 9140 + }, + { + "epoch": 11.70048, + "grad_norm": 0.8960999846458435, + "learning_rate": 3.1732693077230895e-05, + "loss": 0.5565, + "step": 9141 + }, + 
{ + "epoch": 11.70176, + "grad_norm": 0.9335712790489197, + "learning_rate": 3.173069227691077e-05, + "loss": 0.5516, + "step": 9142 + }, + { + "epoch": 11.70304, + "grad_norm": 0.8799536228179932, + "learning_rate": 3.172869147659064e-05, + "loss": 0.585, + "step": 9143 + }, + { + "epoch": 11.70432, + "grad_norm": 0.9027646780014038, + "learning_rate": 3.172669067627051e-05, + "loss": 0.5533, + "step": 9144 + }, + { + "epoch": 11.7056, + "grad_norm": 0.9302271008491516, + "learning_rate": 3.172468987595038e-05, + "loss": 0.6, + "step": 9145 + }, + { + "epoch": 11.70688, + "grad_norm": 0.8945030570030212, + "learning_rate": 3.1722689075630254e-05, + "loss": 0.5827, + "step": 9146 + }, + { + "epoch": 11.70816, + "grad_norm": 0.9455578327178955, + "learning_rate": 3.1720688275310126e-05, + "loss": 0.546, + "step": 9147 + }, + { + "epoch": 11.70944, + "grad_norm": 0.8383356332778931, + "learning_rate": 3.171868747499e-05, + "loss": 0.5304, + "step": 9148 + }, + { + "epoch": 11.71072, + "grad_norm": 0.8386917114257812, + "learning_rate": 3.171668667466987e-05, + "loss": 0.5498, + "step": 9149 + }, + { + "epoch": 11.712, + "grad_norm": 0.8906264901161194, + "learning_rate": 3.171468587434974e-05, + "loss": 0.5319, + "step": 9150 + }, + { + "epoch": 11.71328, + "grad_norm": 0.9018164873123169, + "learning_rate": 3.1712685074029613e-05, + "loss": 0.5535, + "step": 9151 + }, + { + "epoch": 11.71456, + "grad_norm": 0.9292755126953125, + "learning_rate": 3.1710684273709485e-05, + "loss": 0.5615, + "step": 9152 + }, + { + "epoch": 11.71584, + "grad_norm": 0.947083592414856, + "learning_rate": 3.170868347338936e-05, + "loss": 0.5326, + "step": 9153 + }, + { + "epoch": 11.71712, + "grad_norm": 0.872265100479126, + "learning_rate": 3.170668267306923e-05, + "loss": 0.5514, + "step": 9154 + }, + { + "epoch": 11.7184, + "grad_norm": 0.8972936272621155, + "learning_rate": 3.17046818727491e-05, + "loss": 0.555, + "step": 9155 + }, + { + "epoch": 11.71968, + "grad_norm": 
0.8738512992858887, + "learning_rate": 3.170268107242897e-05, + "loss": 0.5585, + "step": 9156 + }, + { + "epoch": 11.72096, + "grad_norm": 0.8479512333869934, + "learning_rate": 3.1700680272108845e-05, + "loss": 0.5217, + "step": 9157 + }, + { + "epoch": 11.72224, + "grad_norm": 0.853814959526062, + "learning_rate": 3.1698679471788716e-05, + "loss": 0.5418, + "step": 9158 + }, + { + "epoch": 11.72352, + "grad_norm": 0.9030658006668091, + "learning_rate": 3.169667867146859e-05, + "loss": 0.5371, + "step": 9159 + }, + { + "epoch": 11.7248, + "grad_norm": 0.9065144062042236, + "learning_rate": 3.169467787114846e-05, + "loss": 0.555, + "step": 9160 + }, + { + "epoch": 11.72608, + "grad_norm": 0.8621678948402405, + "learning_rate": 3.169267707082833e-05, + "loss": 0.552, + "step": 9161 + }, + { + "epoch": 11.727360000000001, + "grad_norm": 0.8453906178474426, + "learning_rate": 3.1690676270508204e-05, + "loss": 0.5394, + "step": 9162 + }, + { + "epoch": 11.72864, + "grad_norm": 0.9425848722457886, + "learning_rate": 3.1688675470188076e-05, + "loss": 0.6404, + "step": 9163 + }, + { + "epoch": 11.72992, + "grad_norm": 0.9024143815040588, + "learning_rate": 3.168667466986795e-05, + "loss": 0.5704, + "step": 9164 + }, + { + "epoch": 11.7312, + "grad_norm": 0.89797443151474, + "learning_rate": 3.168467386954782e-05, + "loss": 0.5647, + "step": 9165 + }, + { + "epoch": 11.73248, + "grad_norm": 0.9100660085678101, + "learning_rate": 3.168267306922769e-05, + "loss": 0.553, + "step": 9166 + }, + { + "epoch": 11.73376, + "grad_norm": 0.8714706301689148, + "learning_rate": 3.168067226890756e-05, + "loss": 0.5353, + "step": 9167 + }, + { + "epoch": 11.73504, + "grad_norm": 0.8743889331817627, + "learning_rate": 3.1678671468587435e-05, + "loss": 0.5513, + "step": 9168 + }, + { + "epoch": 11.73632, + "grad_norm": 0.8862664699554443, + "learning_rate": 3.1676670668267314e-05, + "loss": 0.522, + "step": 9169 + }, + { + "epoch": 11.7376, + "grad_norm": 0.8375951647758484, + 
"learning_rate": 3.167466986794718e-05, + "loss": 0.5308, + "step": 9170 + }, + { + "epoch": 11.73888, + "grad_norm": 0.9067046642303467, + "learning_rate": 3.167266906762705e-05, + "loss": 0.5673, + "step": 9171 + }, + { + "epoch": 11.74016, + "grad_norm": 0.8670250177383423, + "learning_rate": 3.167066826730692e-05, + "loss": 0.519, + "step": 9172 + }, + { + "epoch": 11.74144, + "grad_norm": 0.8583195805549622, + "learning_rate": 3.1668667466986794e-05, + "loss": 0.532, + "step": 9173 + }, + { + "epoch": 11.74272, + "grad_norm": 0.9073027968406677, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.5339, + "step": 9174 + }, + { + "epoch": 11.744, + "grad_norm": 0.8694731593132019, + "learning_rate": 3.166466586634654e-05, + "loss": 0.5488, + "step": 9175 + }, + { + "epoch": 11.74528, + "grad_norm": 0.9093069434165955, + "learning_rate": 3.166266506602642e-05, + "loss": 0.5305, + "step": 9176 + }, + { + "epoch": 11.74656, + "grad_norm": 0.8719558119773865, + "learning_rate": 3.166066426570629e-05, + "loss": 0.5112, + "step": 9177 + }, + { + "epoch": 11.74784, + "grad_norm": 0.8447220325469971, + "learning_rate": 3.1658663465386154e-05, + "loss": 0.4831, + "step": 9178 + }, + { + "epoch": 11.74912, + "grad_norm": 0.911290168762207, + "learning_rate": 3.1656662665066025e-05, + "loss": 0.5559, + "step": 9179 + }, + { + "epoch": 11.750399999999999, + "grad_norm": 0.8428891897201538, + "learning_rate": 3.16546618647459e-05, + "loss": 0.5264, + "step": 9180 + }, + { + "epoch": 11.75168, + "grad_norm": 0.9175834059715271, + "learning_rate": 3.165266106442577e-05, + "loss": 0.5798, + "step": 9181 + }, + { + "epoch": 11.75296, + "grad_norm": 0.8715541958808899, + "learning_rate": 3.165066026410564e-05, + "loss": 0.5017, + "step": 9182 + }, + { + "epoch": 11.75424, + "grad_norm": 0.8270224928855896, + "learning_rate": 3.164865946378552e-05, + "loss": 0.5368, + "step": 9183 + }, + { + "epoch": 11.75552, + "grad_norm": 0.9201467037200928, + "learning_rate": 
3.164665866346539e-05, + "loss": 0.5696, + "step": 9184 + }, + { + "epoch": 11.7568, + "grad_norm": 0.8654879331588745, + "learning_rate": 3.164465786314526e-05, + "loss": 0.5424, + "step": 9185 + }, + { + "epoch": 11.75808, + "grad_norm": 0.897590696811676, + "learning_rate": 3.164265706282513e-05, + "loss": 0.5809, + "step": 9186 + }, + { + "epoch": 11.759360000000001, + "grad_norm": 0.895999014377594, + "learning_rate": 3.1640656262505e-05, + "loss": 0.5432, + "step": 9187 + }, + { + "epoch": 11.76064, + "grad_norm": 0.9101575613021851, + "learning_rate": 3.163865546218487e-05, + "loss": 0.6348, + "step": 9188 + }, + { + "epoch": 11.76192, + "grad_norm": 0.8356999754905701, + "learning_rate": 3.1636654661864744e-05, + "loss": 0.4978, + "step": 9189 + }, + { + "epoch": 11.7632, + "grad_norm": 0.886934757232666, + "learning_rate": 3.163465386154462e-05, + "loss": 0.5392, + "step": 9190 + }, + { + "epoch": 11.76448, + "grad_norm": 0.8999463319778442, + "learning_rate": 3.1632653061224494e-05, + "loss": 0.5413, + "step": 9191 + }, + { + "epoch": 11.76576, + "grad_norm": 0.9087318778038025, + "learning_rate": 3.1630652260904366e-05, + "loss": 0.5776, + "step": 9192 + }, + { + "epoch": 11.76704, + "grad_norm": 0.8902879357337952, + "learning_rate": 3.162865146058424e-05, + "loss": 0.5987, + "step": 9193 + }, + { + "epoch": 11.76832, + "grad_norm": 0.8536181449890137, + "learning_rate": 3.16266506602641e-05, + "loss": 0.5449, + "step": 9194 + }, + { + "epoch": 11.7696, + "grad_norm": 0.8617259860038757, + "learning_rate": 3.1624649859943975e-05, + "loss": 0.5814, + "step": 9195 + }, + { + "epoch": 11.77088, + "grad_norm": 0.9288644790649414, + "learning_rate": 3.162264905962385e-05, + "loss": 0.5716, + "step": 9196 + }, + { + "epoch": 11.77216, + "grad_norm": 0.9424710869789124, + "learning_rate": 3.1620648259303726e-05, + "loss": 0.5987, + "step": 9197 + }, + { + "epoch": 11.77344, + "grad_norm": 0.8956670761108398, + "learning_rate": 3.16186474589836e-05, + "loss": 
0.5165, + "step": 9198 + }, + { + "epoch": 11.77472, + "grad_norm": 0.9281831383705139, + "learning_rate": 3.161664665866347e-05, + "loss": 0.572, + "step": 9199 + }, + { + "epoch": 11.776, + "grad_norm": 0.9560943841934204, + "learning_rate": 3.161464585834334e-05, + "loss": 0.5877, + "step": 9200 + }, + { + "epoch": 11.77728, + "grad_norm": 0.8967903852462769, + "learning_rate": 3.161264505802321e-05, + "loss": 0.5637, + "step": 9201 + }, + { + "epoch": 11.77856, + "grad_norm": 0.8792325258255005, + "learning_rate": 3.161064425770308e-05, + "loss": 0.5224, + "step": 9202 + }, + { + "epoch": 11.77984, + "grad_norm": 0.910875678062439, + "learning_rate": 3.160864345738295e-05, + "loss": 0.5554, + "step": 9203 + }, + { + "epoch": 11.78112, + "grad_norm": 0.8686668872833252, + "learning_rate": 3.160664265706283e-05, + "loss": 0.5578, + "step": 9204 + }, + { + "epoch": 11.782399999999999, + "grad_norm": 0.8657653331756592, + "learning_rate": 3.16046418567427e-05, + "loss": 0.5172, + "step": 9205 + }, + { + "epoch": 11.78368, + "grad_norm": 0.8818771243095398, + "learning_rate": 3.160264105642257e-05, + "loss": 0.5662, + "step": 9206 + }, + { + "epoch": 11.78496, + "grad_norm": 0.9204904437065125, + "learning_rate": 3.1600640256102444e-05, + "loss": 0.5711, + "step": 9207 + }, + { + "epoch": 11.78624, + "grad_norm": 0.9225419759750366, + "learning_rate": 3.1598639455782316e-05, + "loss": 0.5677, + "step": 9208 + }, + { + "epoch": 11.78752, + "grad_norm": 0.8883199095726013, + "learning_rate": 3.159663865546219e-05, + "loss": 0.5218, + "step": 9209 + }, + { + "epoch": 11.7888, + "grad_norm": 0.9116802215576172, + "learning_rate": 3.159463785514205e-05, + "loss": 0.6062, + "step": 9210 + }, + { + "epoch": 11.79008, + "grad_norm": 0.8611482977867126, + "learning_rate": 3.159263705482193e-05, + "loss": 0.5445, + "step": 9211 + }, + { + "epoch": 11.79136, + "grad_norm": 0.8924483060836792, + "learning_rate": 3.1590636254501803e-05, + "loss": 0.602, + "step": 9212 + }, + { + 
"epoch": 11.79264, + "grad_norm": 0.8485545516014099, + "learning_rate": 3.1588635454181675e-05, + "loss": 0.5326, + "step": 9213 + }, + { + "epoch": 11.79392, + "grad_norm": 0.8666700124740601, + "learning_rate": 3.158663465386155e-05, + "loss": 0.5149, + "step": 9214 + }, + { + "epoch": 11.7952, + "grad_norm": 0.9414648413658142, + "learning_rate": 3.158463385354142e-05, + "loss": 0.5767, + "step": 9215 + }, + { + "epoch": 11.79648, + "grad_norm": 0.885181188583374, + "learning_rate": 3.158263305322129e-05, + "loss": 0.5112, + "step": 9216 + }, + { + "epoch": 11.79776, + "grad_norm": 0.8811646103858948, + "learning_rate": 3.158063225290116e-05, + "loss": 0.5482, + "step": 9217 + }, + { + "epoch": 11.79904, + "grad_norm": 0.905148983001709, + "learning_rate": 3.157863145258103e-05, + "loss": 0.6129, + "step": 9218 + }, + { + "epoch": 11.80032, + "grad_norm": 0.9198310971260071, + "learning_rate": 3.1576630652260906e-05, + "loss": 0.5667, + "step": 9219 + }, + { + "epoch": 11.8016, + "grad_norm": 0.9002284407615662, + "learning_rate": 3.157462985194078e-05, + "loss": 0.5478, + "step": 9220 + }, + { + "epoch": 11.80288, + "grad_norm": 0.8892956972122192, + "learning_rate": 3.157262905162065e-05, + "loss": 0.5446, + "step": 9221 + }, + { + "epoch": 11.80416, + "grad_norm": 0.8949340581893921, + "learning_rate": 3.157062825130052e-05, + "loss": 0.5443, + "step": 9222 + }, + { + "epoch": 11.80544, + "grad_norm": 0.869086742401123, + "learning_rate": 3.1568627450980394e-05, + "loss": 0.546, + "step": 9223 + }, + { + "epoch": 11.80672, + "grad_norm": 0.920300304889679, + "learning_rate": 3.1566626650660266e-05, + "loss": 0.5784, + "step": 9224 + }, + { + "epoch": 11.808, + "grad_norm": 0.8822394013404846, + "learning_rate": 3.156462585034014e-05, + "loss": 0.5174, + "step": 9225 + }, + { + "epoch": 11.80928, + "grad_norm": 0.9120069146156311, + "learning_rate": 3.156262505002001e-05, + "loss": 0.5333, + "step": 9226 + }, + { + "epoch": 11.81056, + "grad_norm": 
0.8972925543785095, + "learning_rate": 3.156062424969988e-05, + "loss": 0.6007, + "step": 9227 + }, + { + "epoch": 11.81184, + "grad_norm": 0.8929551839828491, + "learning_rate": 3.155862344937975e-05, + "loss": 0.5792, + "step": 9228 + }, + { + "epoch": 11.81312, + "grad_norm": 0.8880111575126648, + "learning_rate": 3.1556622649059625e-05, + "loss": 0.5315, + "step": 9229 + }, + { + "epoch": 11.8144, + "grad_norm": 0.9577479362487793, + "learning_rate": 3.15546218487395e-05, + "loss": 0.6392, + "step": 9230 + }, + { + "epoch": 11.81568, + "grad_norm": 0.9124792814254761, + "learning_rate": 3.155262104841937e-05, + "loss": 0.5666, + "step": 9231 + }, + { + "epoch": 11.81696, + "grad_norm": 0.9073526263237, + "learning_rate": 3.155062024809924e-05, + "loss": 0.5656, + "step": 9232 + }, + { + "epoch": 11.81824, + "grad_norm": 0.8945934176445007, + "learning_rate": 3.154861944777911e-05, + "loss": 0.5644, + "step": 9233 + }, + { + "epoch": 11.81952, + "grad_norm": 0.8661502599716187, + "learning_rate": 3.1546618647458984e-05, + "loss": 0.5162, + "step": 9234 + }, + { + "epoch": 11.8208, + "grad_norm": 0.9258007407188416, + "learning_rate": 3.1544617847138856e-05, + "loss": 0.542, + "step": 9235 + }, + { + "epoch": 11.82208, + "grad_norm": 0.859990119934082, + "learning_rate": 3.154261704681873e-05, + "loss": 0.5686, + "step": 9236 + }, + { + "epoch": 11.82336, + "grad_norm": 0.8406195044517517, + "learning_rate": 3.15406162464986e-05, + "loss": 0.5378, + "step": 9237 + }, + { + "epoch": 11.82464, + "grad_norm": 0.8850319385528564, + "learning_rate": 3.153861544617847e-05, + "loss": 0.5726, + "step": 9238 + }, + { + "epoch": 11.82592, + "grad_norm": 0.9233469367027283, + "learning_rate": 3.1536614645858344e-05, + "loss": 0.5878, + "step": 9239 + }, + { + "epoch": 11.8272, + "grad_norm": 0.9111922979354858, + "learning_rate": 3.153461384553822e-05, + "loss": 0.5668, + "step": 9240 + }, + { + "epoch": 11.82848, + "grad_norm": 0.903139054775238, + "learning_rate": 
3.153261304521809e-05, + "loss": 0.5535, + "step": 9241 + }, + { + "epoch": 11.82976, + "grad_norm": 0.8692033886909485, + "learning_rate": 3.153061224489796e-05, + "loss": 0.4958, + "step": 9242 + }, + { + "epoch": 11.83104, + "grad_norm": 0.8684501051902771, + "learning_rate": 3.152861144457783e-05, + "loss": 0.5101, + "step": 9243 + }, + { + "epoch": 11.83232, + "grad_norm": 0.8597356081008911, + "learning_rate": 3.15266106442577e-05, + "loss": 0.5575, + "step": 9244 + }, + { + "epoch": 11.8336, + "grad_norm": 0.8873345851898193, + "learning_rate": 3.1524609843937575e-05, + "loss": 0.5863, + "step": 9245 + }, + { + "epoch": 11.83488, + "grad_norm": 0.8959015011787415, + "learning_rate": 3.152260904361745e-05, + "loss": 0.5667, + "step": 9246 + }, + { + "epoch": 11.83616, + "grad_norm": 0.9511164426803589, + "learning_rate": 3.1520608243297325e-05, + "loss": 0.5755, + "step": 9247 + }, + { + "epoch": 11.83744, + "grad_norm": 0.9274942278862, + "learning_rate": 3.15186074429772e-05, + "loss": 0.5661, + "step": 9248 + }, + { + "epoch": 11.83872, + "grad_norm": 0.893008291721344, + "learning_rate": 3.151660664265706e-05, + "loss": 0.5641, + "step": 9249 + }, + { + "epoch": 11.84, + "grad_norm": 0.9084400534629822, + "learning_rate": 3.1514605842336934e-05, + "loss": 0.533, + "step": 9250 + }, + { + "epoch": 11.84128, + "grad_norm": 0.9191239476203918, + "learning_rate": 3.1512605042016806e-05, + "loss": 0.557, + "step": 9251 + }, + { + "epoch": 11.84256, + "grad_norm": 0.933152973651886, + "learning_rate": 3.151060424169668e-05, + "loss": 0.5457, + "step": 9252 + }, + { + "epoch": 11.84384, + "grad_norm": 0.9432767629623413, + "learning_rate": 3.150860344137655e-05, + "loss": 0.6098, + "step": 9253 + }, + { + "epoch": 11.84512, + "grad_norm": 0.8910885453224182, + "learning_rate": 3.150660264105643e-05, + "loss": 0.5155, + "step": 9254 + }, + { + "epoch": 11.8464, + "grad_norm": 0.8926212787628174, + "learning_rate": 3.15046018407363e-05, + "loss": 0.5236, + "step": 
9255 + }, + { + "epoch": 11.84768, + "grad_norm": 0.8529669046401978, + "learning_rate": 3.150260104041617e-05, + "loss": 0.5063, + "step": 9256 + }, + { + "epoch": 11.84896, + "grad_norm": 0.8807107210159302, + "learning_rate": 3.150060024009604e-05, + "loss": 0.5411, + "step": 9257 + }, + { + "epoch": 11.85024, + "grad_norm": 0.8540694117546082, + "learning_rate": 3.149859943977591e-05, + "loss": 0.5482, + "step": 9258 + }, + { + "epoch": 11.85152, + "grad_norm": 0.8822780847549438, + "learning_rate": 3.149659863945578e-05, + "loss": 0.5256, + "step": 9259 + }, + { + "epoch": 11.8528, + "grad_norm": 0.8807215094566345, + "learning_rate": 3.149459783913565e-05, + "loss": 0.5734, + "step": 9260 + }, + { + "epoch": 11.85408, + "grad_norm": 0.8632503151893616, + "learning_rate": 3.149259703881553e-05, + "loss": 0.5134, + "step": 9261 + }, + { + "epoch": 11.85536, + "grad_norm": 0.9131238460540771, + "learning_rate": 3.14905962384954e-05, + "loss": 0.5878, + "step": 9262 + }, + { + "epoch": 11.85664, + "grad_norm": 0.862756609916687, + "learning_rate": 3.1488595438175275e-05, + "loss": 0.5613, + "step": 9263 + }, + { + "epoch": 11.85792, + "grad_norm": 0.8660969734191895, + "learning_rate": 3.148659463785515e-05, + "loss": 0.5352, + "step": 9264 + }, + { + "epoch": 11.8592, + "grad_norm": 0.9220228791236877, + "learning_rate": 3.148459383753501e-05, + "loss": 0.5738, + "step": 9265 + }, + { + "epoch": 11.86048, + "grad_norm": 0.872033953666687, + "learning_rate": 3.1482593037214884e-05, + "loss": 0.5792, + "step": 9266 + }, + { + "epoch": 11.86176, + "grad_norm": 0.8677129745483398, + "learning_rate": 3.1480592236894756e-05, + "loss": 0.5334, + "step": 9267 + }, + { + "epoch": 11.86304, + "grad_norm": 0.8851068615913391, + "learning_rate": 3.1478591436574634e-05, + "loss": 0.5471, + "step": 9268 + }, + { + "epoch": 11.86432, + "grad_norm": 0.921118438243866, + "learning_rate": 3.1476590636254506e-05, + "loss": 0.562, + "step": 9269 + }, + { + "epoch": 11.8656, + 
"grad_norm": 0.868963897228241, + "learning_rate": 3.147458983593438e-05, + "loss": 0.5733, + "step": 9270 + }, + { + "epoch": 11.86688, + "grad_norm": 0.9390560388565063, + "learning_rate": 3.147258903561425e-05, + "loss": 0.5904, + "step": 9271 + }, + { + "epoch": 11.86816, + "grad_norm": 0.9005419611930847, + "learning_rate": 3.147058823529412e-05, + "loss": 0.5529, + "step": 9272 + }, + { + "epoch": 11.86944, + "grad_norm": 0.8554105758666992, + "learning_rate": 3.146858743497399e-05, + "loss": 0.6002, + "step": 9273 + }, + { + "epoch": 11.87072, + "grad_norm": 0.9244778752326965, + "learning_rate": 3.146658663465386e-05, + "loss": 0.5658, + "step": 9274 + }, + { + "epoch": 11.872, + "grad_norm": 0.8940372467041016, + "learning_rate": 3.146458583433374e-05, + "loss": 0.5647, + "step": 9275 + }, + { + "epoch": 11.87328, + "grad_norm": 0.998906672000885, + "learning_rate": 3.146258503401361e-05, + "loss": 0.6089, + "step": 9276 + }, + { + "epoch": 11.87456, + "grad_norm": 0.915664792060852, + "learning_rate": 3.146058423369348e-05, + "loss": 0.5579, + "step": 9277 + }, + { + "epoch": 11.87584, + "grad_norm": 0.8526196479797363, + "learning_rate": 3.145858343337335e-05, + "loss": 0.5319, + "step": 9278 + }, + { + "epoch": 11.87712, + "grad_norm": 0.8531317710876465, + "learning_rate": 3.1456582633053225e-05, + "loss": 0.5384, + "step": 9279 + }, + { + "epoch": 11.8784, + "grad_norm": 0.876233696937561, + "learning_rate": 3.1454581832733097e-05, + "loss": 0.5413, + "step": 9280 + }, + { + "epoch": 11.87968, + "grad_norm": 0.8565805554389954, + "learning_rate": 3.145258103241296e-05, + "loss": 0.5441, + "step": 9281 + }, + { + "epoch": 11.88096, + "grad_norm": 0.8764281272888184, + "learning_rate": 3.145058023209284e-05, + "loss": 0.5208, + "step": 9282 + }, + { + "epoch": 11.88224, + "grad_norm": 0.837112545967102, + "learning_rate": 3.144857943177271e-05, + "loss": 0.5515, + "step": 9283 + }, + { + "epoch": 11.88352, + "grad_norm": 0.8274785280227661, + 
"learning_rate": 3.1446578631452584e-05, + "loss": 0.5302, + "step": 9284 + }, + { + "epoch": 11.8848, + "grad_norm": 0.8528085350990295, + "learning_rate": 3.1444577831132456e-05, + "loss": 0.5414, + "step": 9285 + }, + { + "epoch": 11.88608, + "grad_norm": 0.8616606593132019, + "learning_rate": 3.144257703081233e-05, + "loss": 0.5315, + "step": 9286 + }, + { + "epoch": 11.88736, + "grad_norm": 0.9000251293182373, + "learning_rate": 3.14405762304922e-05, + "loss": 0.6074, + "step": 9287 + }, + { + "epoch": 11.88864, + "grad_norm": 0.9343587756156921, + "learning_rate": 3.143857543017207e-05, + "loss": 0.5863, + "step": 9288 + }, + { + "epoch": 11.88992, + "grad_norm": 0.9544128179550171, + "learning_rate": 3.143657462985194e-05, + "loss": 0.5909, + "step": 9289 + }, + { + "epoch": 11.8912, + "grad_norm": 0.8408989310264587, + "learning_rate": 3.1434573829531815e-05, + "loss": 0.5349, + "step": 9290 + }, + { + "epoch": 11.89248, + "grad_norm": 0.8732945919036865, + "learning_rate": 3.143257302921169e-05, + "loss": 0.5135, + "step": 9291 + }, + { + "epoch": 11.89376, + "grad_norm": 0.8831563591957092, + "learning_rate": 3.143057222889156e-05, + "loss": 0.5941, + "step": 9292 + }, + { + "epoch": 11.89504, + "grad_norm": 0.7958306074142456, + "learning_rate": 3.142857142857143e-05, + "loss": 0.5004, + "step": 9293 + }, + { + "epoch": 11.89632, + "grad_norm": 0.9264546632766724, + "learning_rate": 3.14265706282513e-05, + "loss": 0.5965, + "step": 9294 + }, + { + "epoch": 11.8976, + "grad_norm": 0.9189742803573608, + "learning_rate": 3.1424569827931174e-05, + "loss": 0.5785, + "step": 9295 + }, + { + "epoch": 11.89888, + "grad_norm": 0.9090092778205872, + "learning_rate": 3.1422569027611046e-05, + "loss": 0.5715, + "step": 9296 + }, + { + "epoch": 11.90016, + "grad_norm": 0.9040752053260803, + "learning_rate": 3.142056822729092e-05, + "loss": 0.5748, + "step": 9297 + }, + { + "epoch": 11.901440000000001, + "grad_norm": 0.8540357947349548, + "learning_rate": 
3.141856742697079e-05, + "loss": 0.5637, + "step": 9298 + }, + { + "epoch": 11.90272, + "grad_norm": 0.8528608083724976, + "learning_rate": 3.141656662665066e-05, + "loss": 0.5238, + "step": 9299 + }, + { + "epoch": 11.904, + "grad_norm": 0.8939346075057983, + "learning_rate": 3.1414565826330534e-05, + "loss": 0.5637, + "step": 9300 + }, + { + "epoch": 11.90528, + "grad_norm": 0.8866257071495056, + "learning_rate": 3.1412565026010406e-05, + "loss": 0.6088, + "step": 9301 + }, + { + "epoch": 11.90656, + "grad_norm": 0.9136728644371033, + "learning_rate": 3.141056422569028e-05, + "loss": 0.5708, + "step": 9302 + }, + { + "epoch": 11.90784, + "grad_norm": 0.908406138420105, + "learning_rate": 3.140856342537015e-05, + "loss": 0.5593, + "step": 9303 + }, + { + "epoch": 11.90912, + "grad_norm": 0.9199392199516296, + "learning_rate": 3.140656262505002e-05, + "loss": 0.5653, + "step": 9304 + }, + { + "epoch": 11.9104, + "grad_norm": 0.9563983082771301, + "learning_rate": 3.140456182472989e-05, + "loss": 0.56, + "step": 9305 + }, + { + "epoch": 11.91168, + "grad_norm": 0.8945441842079163, + "learning_rate": 3.1402561024409765e-05, + "loss": 0.5475, + "step": 9306 + }, + { + "epoch": 11.91296, + "grad_norm": 0.9007996320724487, + "learning_rate": 3.140056022408964e-05, + "loss": 0.6062, + "step": 9307 + }, + { + "epoch": 11.91424, + "grad_norm": 0.9043374061584473, + "learning_rate": 3.139855942376951e-05, + "loss": 0.5396, + "step": 9308 + }, + { + "epoch": 11.91552, + "grad_norm": 0.8611972332000732, + "learning_rate": 3.139655862344938e-05, + "loss": 0.5255, + "step": 9309 + }, + { + "epoch": 11.9168, + "grad_norm": 0.9283512234687805, + "learning_rate": 3.139455782312926e-05, + "loss": 0.5369, + "step": 9310 + }, + { + "epoch": 11.91808, + "grad_norm": 0.9080418944358826, + "learning_rate": 3.1392557022809124e-05, + "loss": 0.5843, + "step": 9311 + }, + { + "epoch": 11.91936, + "grad_norm": 0.9004126191139221, + "learning_rate": 3.1390556222488996e-05, + "loss": 0.5956, 
+ "step": 9312 + }, + { + "epoch": 11.92064, + "grad_norm": 0.9140704274177551, + "learning_rate": 3.138855542216887e-05, + "loss": 0.5571, + "step": 9313 + }, + { + "epoch": 11.92192, + "grad_norm": 0.8815629482269287, + "learning_rate": 3.138655462184874e-05, + "loss": 0.5591, + "step": 9314 + }, + { + "epoch": 11.9232, + "grad_norm": 0.9093433618545532, + "learning_rate": 3.138455382152861e-05, + "loss": 0.5775, + "step": 9315 + }, + { + "epoch": 11.924479999999999, + "grad_norm": 0.8649457693099976, + "learning_rate": 3.138255302120848e-05, + "loss": 0.547, + "step": 9316 + }, + { + "epoch": 11.92576, + "grad_norm": 0.8733769059181213, + "learning_rate": 3.138055222088836e-05, + "loss": 0.5395, + "step": 9317 + }, + { + "epoch": 11.92704, + "grad_norm": 0.8359034061431885, + "learning_rate": 3.1378551420568234e-05, + "loss": 0.5197, + "step": 9318 + }, + { + "epoch": 11.92832, + "grad_norm": 0.9323531985282898, + "learning_rate": 3.13765506202481e-05, + "loss": 0.5518, + "step": 9319 + }, + { + "epoch": 11.9296, + "grad_norm": 0.8930750489234924, + "learning_rate": 3.137454981992797e-05, + "loss": 0.576, + "step": 9320 + }, + { + "epoch": 11.93088, + "grad_norm": 0.9025473594665527, + "learning_rate": 3.137254901960784e-05, + "loss": 0.6194, + "step": 9321 + }, + { + "epoch": 11.93216, + "grad_norm": 0.8879018425941467, + "learning_rate": 3.1370548219287715e-05, + "loss": 0.5253, + "step": 9322 + }, + { + "epoch": 11.933440000000001, + "grad_norm": 0.9026443958282471, + "learning_rate": 3.1368547418967586e-05, + "loss": 0.61, + "step": 9323 + }, + { + "epoch": 11.93472, + "grad_norm": 0.9492444396018982, + "learning_rate": 3.1366546618647465e-05, + "loss": 0.5656, + "step": 9324 + }, + { + "epoch": 11.936, + "grad_norm": 0.913387656211853, + "learning_rate": 3.136454581832734e-05, + "loss": 0.533, + "step": 9325 + }, + { + "epoch": 11.93728, + "grad_norm": 0.8534862995147705, + "learning_rate": 3.136254501800721e-05, + "loss": 0.5176, + "step": 9326 + }, + { + 
"epoch": 11.93856, + "grad_norm": 0.8602653741836548, + "learning_rate": 3.1360544217687074e-05, + "loss": 0.5341, + "step": 9327 + }, + { + "epoch": 11.93984, + "grad_norm": 0.9074930548667908, + "learning_rate": 3.1358543417366946e-05, + "loss": 0.539, + "step": 9328 + }, + { + "epoch": 11.94112, + "grad_norm": 0.9066956639289856, + "learning_rate": 3.135654261704682e-05, + "loss": 0.5496, + "step": 9329 + }, + { + "epoch": 11.9424, + "grad_norm": 0.9023146033287048, + "learning_rate": 3.135454181672669e-05, + "loss": 0.5679, + "step": 9330 + }, + { + "epoch": 11.94368, + "grad_norm": 0.9154323935508728, + "learning_rate": 3.135254101640656e-05, + "loss": 0.539, + "step": 9331 + }, + { + "epoch": 11.94496, + "grad_norm": 0.9193128347396851, + "learning_rate": 3.135054021608644e-05, + "loss": 0.5444, + "step": 9332 + }, + { + "epoch": 11.94624, + "grad_norm": 0.9258593916893005, + "learning_rate": 3.134853941576631e-05, + "loss": 0.5432, + "step": 9333 + }, + { + "epoch": 11.94752, + "grad_norm": 0.9019486904144287, + "learning_rate": 3.1346538615446184e-05, + "loss": 0.5888, + "step": 9334 + }, + { + "epoch": 11.9488, + "grad_norm": 0.9071494340896606, + "learning_rate": 3.134453781512605e-05, + "loss": 0.5334, + "step": 9335 + }, + { + "epoch": 11.95008, + "grad_norm": 0.8721327185630798, + "learning_rate": 3.134253701480592e-05, + "loss": 0.5885, + "step": 9336 + }, + { + "epoch": 11.95136, + "grad_norm": 0.9365826845169067, + "learning_rate": 3.134053621448579e-05, + "loss": 0.5799, + "step": 9337 + }, + { + "epoch": 11.95264, + "grad_norm": 0.9165831804275513, + "learning_rate": 3.1338535414165664e-05, + "loss": 0.5104, + "step": 9338 + }, + { + "epoch": 11.95392, + "grad_norm": 0.8874984979629517, + "learning_rate": 3.133653461384554e-05, + "loss": 0.5204, + "step": 9339 + }, + { + "epoch": 11.9552, + "grad_norm": 0.8862015604972839, + "learning_rate": 3.1334533813525415e-05, + "loss": 0.5502, + "step": 9340 + }, + { + "epoch": 11.956479999999999, + 
"grad_norm": 0.9116657376289368, + "learning_rate": 3.1332533013205287e-05, + "loss": 0.5619, + "step": 9341 + }, + { + "epoch": 11.95776, + "grad_norm": 0.8775522112846375, + "learning_rate": 3.133053221288516e-05, + "loss": 0.526, + "step": 9342 + }, + { + "epoch": 11.95904, + "grad_norm": 0.8832023739814758, + "learning_rate": 3.1328531412565023e-05, + "loss": 0.5192, + "step": 9343 + }, + { + "epoch": 11.96032, + "grad_norm": 0.8574889302253723, + "learning_rate": 3.1326530612244895e-05, + "loss": 0.5594, + "step": 9344 + }, + { + "epoch": 11.9616, + "grad_norm": 0.8446320295333862, + "learning_rate": 3.132452981192477e-05, + "loss": 0.5316, + "step": 9345 + }, + { + "epoch": 11.96288, + "grad_norm": 0.8501649498939514, + "learning_rate": 3.1322529011604646e-05, + "loss": 0.5666, + "step": 9346 + }, + { + "epoch": 11.96416, + "grad_norm": 0.9033554196357727, + "learning_rate": 3.132052821128452e-05, + "loss": 0.5631, + "step": 9347 + }, + { + "epoch": 11.96544, + "grad_norm": 0.9225219488143921, + "learning_rate": 3.131852741096439e-05, + "loss": 0.573, + "step": 9348 + }, + { + "epoch": 11.96672, + "grad_norm": 0.9399309754371643, + "learning_rate": 3.131652661064426e-05, + "loss": 0.6026, + "step": 9349 + }, + { + "epoch": 11.968, + "grad_norm": 0.8722439408302307, + "learning_rate": 3.131452581032413e-05, + "loss": 0.5392, + "step": 9350 + }, + { + "epoch": 11.96928, + "grad_norm": 0.8984782099723816, + "learning_rate": 3.1312525010004e-05, + "loss": 0.5778, + "step": 9351 + }, + { + "epoch": 11.97056, + "grad_norm": 0.9469655156135559, + "learning_rate": 3.131052420968387e-05, + "loss": 0.6142, + "step": 9352 + }, + { + "epoch": 11.97184, + "grad_norm": 0.8549695014953613, + "learning_rate": 3.130852340936375e-05, + "loss": 0.5452, + "step": 9353 + }, + { + "epoch": 11.97312, + "grad_norm": 0.913131058216095, + "learning_rate": 3.130652260904362e-05, + "loss": 0.5255, + "step": 9354 + }, + { + "epoch": 11.9744, + "grad_norm": 0.8826735615730286, + 
"learning_rate": 3.130452180872349e-05, + "loss": 0.5603, + "step": 9355 + }, + { + "epoch": 11.97568, + "grad_norm": 0.8853660225868225, + "learning_rate": 3.1302521008403364e-05, + "loss": 0.5223, + "step": 9356 + }, + { + "epoch": 11.97696, + "grad_norm": 0.9496334791183472, + "learning_rate": 3.1300520208083236e-05, + "loss": 0.6112, + "step": 9357 + }, + { + "epoch": 11.97824, + "grad_norm": 0.9710426926612854, + "learning_rate": 3.129851940776311e-05, + "loss": 0.5996, + "step": 9358 + }, + { + "epoch": 11.97952, + "grad_norm": 0.9180809855461121, + "learning_rate": 3.129651860744297e-05, + "loss": 0.5501, + "step": 9359 + }, + { + "epoch": 11.9808, + "grad_norm": 0.875312089920044, + "learning_rate": 3.129451780712285e-05, + "loss": 0.5474, + "step": 9360 + }, + { + "epoch": 11.98208, + "grad_norm": 0.9497184753417969, + "learning_rate": 3.1292517006802724e-05, + "loss": 0.6125, + "step": 9361 + }, + { + "epoch": 11.98336, + "grad_norm": 0.9112341403961182, + "learning_rate": 3.1290516206482596e-05, + "loss": 0.5461, + "step": 9362 + }, + { + "epoch": 11.98464, + "grad_norm": 0.8672278523445129, + "learning_rate": 3.128851540616247e-05, + "loss": 0.5453, + "step": 9363 + }, + { + "epoch": 11.98592, + "grad_norm": 0.853042721748352, + "learning_rate": 3.128651460584234e-05, + "loss": 0.5445, + "step": 9364 + }, + { + "epoch": 11.9872, + "grad_norm": 0.936596155166626, + "learning_rate": 3.128451380552221e-05, + "loss": 0.5467, + "step": 9365 + }, + { + "epoch": 11.98848, + "grad_norm": 0.901436984539032, + "learning_rate": 3.128251300520208e-05, + "loss": 0.5904, + "step": 9366 + }, + { + "epoch": 11.98976, + "grad_norm": 0.9036868214607239, + "learning_rate": 3.1280512204881955e-05, + "loss": 0.5724, + "step": 9367 + }, + { + "epoch": 11.99104, + "grad_norm": 0.8857167959213257, + "learning_rate": 3.127851140456183e-05, + "loss": 0.5707, + "step": 9368 + }, + { + "epoch": 11.99232, + "grad_norm": 0.8939414024353027, + "learning_rate": 3.12765106042417e-05, + 
"loss": 0.6032, + "step": 9369 + }, + { + "epoch": 11.9936, + "grad_norm": 0.9171572327613831, + "learning_rate": 3.127450980392157e-05, + "loss": 0.5319, + "step": 9370 + }, + { + "epoch": 11.99488, + "grad_norm": 0.92975914478302, + "learning_rate": 3.127250900360144e-05, + "loss": 0.6055, + "step": 9371 + }, + { + "epoch": 11.99616, + "grad_norm": 0.9182210564613342, + "learning_rate": 3.1270508203281314e-05, + "loss": 0.6125, + "step": 9372 + }, + { + "epoch": 11.99744, + "grad_norm": 0.8801096081733704, + "learning_rate": 3.1268507402961186e-05, + "loss": 0.5197, + "step": 9373 + }, + { + "epoch": 11.99872, + "grad_norm": 0.9054288268089294, + "learning_rate": 3.126650660264106e-05, + "loss": 0.5574, + "step": 9374 + }, + { + "epoch": 12.0, + "grad_norm": null, + "learning_rate": 3.126650660264106e-05, + "loss": 0.9918, + "step": 9375 + }, + { + "epoch": 12.00128, + "grad_norm": 0.8894946575164795, + "learning_rate": 3.126450580232093e-05, + "loss": 0.5222, + "step": 9376 + }, + { + "epoch": 12.00256, + "grad_norm": 0.8508769273757935, + "learning_rate": 3.12625050020008e-05, + "loss": 0.521, + "step": 9377 + }, + { + "epoch": 12.00384, + "grad_norm": 0.8415517807006836, + "learning_rate": 3.1260504201680673e-05, + "loss": 0.5538, + "step": 9378 + }, + { + "epoch": 12.00512, + "grad_norm": 0.8784507513046265, + "learning_rate": 3.1258503401360545e-05, + "loss": 0.5281, + "step": 9379 + }, + { + "epoch": 12.0064, + "grad_norm": 0.8953071236610413, + "learning_rate": 3.125650260104042e-05, + "loss": 0.5587, + "step": 9380 + }, + { + "epoch": 12.00768, + "grad_norm": 0.9295496344566345, + "learning_rate": 3.125450180072029e-05, + "loss": 0.5007, + "step": 9381 + }, + { + "epoch": 12.00896, + "grad_norm": 0.8452316522598267, + "learning_rate": 3.125250100040016e-05, + "loss": 0.5127, + "step": 9382 + }, + { + "epoch": 12.01024, + "grad_norm": 0.9824538826942444, + "learning_rate": 3.125050020008003e-05, + "loss": 0.6222, + "step": 9383 + }, + { + "epoch": 
12.01152, + "grad_norm": 0.8931326270103455, + "learning_rate": 3.1248499399759905e-05, + "loss": 0.5504, + "step": 9384 + }, + { + "epoch": 12.0128, + "grad_norm": 0.8844808340072632, + "learning_rate": 3.1246498599439776e-05, + "loss": 0.5371, + "step": 9385 + }, + { + "epoch": 12.01408, + "grad_norm": 0.8983806371688843, + "learning_rate": 3.124449779911965e-05, + "loss": 0.5548, + "step": 9386 + }, + { + "epoch": 12.01536, + "grad_norm": 0.9335402250289917, + "learning_rate": 3.124249699879952e-05, + "loss": 0.5677, + "step": 9387 + }, + { + "epoch": 12.01664, + "grad_norm": 0.928325355052948, + "learning_rate": 3.124049619847939e-05, + "loss": 0.5343, + "step": 9388 + }, + { + "epoch": 12.01792, + "grad_norm": 0.9786158800125122, + "learning_rate": 3.123849539815927e-05, + "loss": 0.5708, + "step": 9389 + }, + { + "epoch": 12.0192, + "grad_norm": 0.8691074252128601, + "learning_rate": 3.1236494597839136e-05, + "loss": 0.5301, + "step": 9390 + }, + { + "epoch": 12.02048, + "grad_norm": 0.9225201606750488, + "learning_rate": 3.123449379751901e-05, + "loss": 0.5683, + "step": 9391 + }, + { + "epoch": 12.02176, + "grad_norm": 0.9116083383560181, + "learning_rate": 3.123249299719888e-05, + "loss": 0.5318, + "step": 9392 + }, + { + "epoch": 12.02304, + "grad_norm": 0.9338945150375366, + "learning_rate": 3.123049219687875e-05, + "loss": 0.5751, + "step": 9393 + }, + { + "epoch": 12.02432, + "grad_norm": 0.8985954523086548, + "learning_rate": 3.122849139655862e-05, + "loss": 0.5471, + "step": 9394 + }, + { + "epoch": 12.0256, + "grad_norm": 0.9236059188842773, + "learning_rate": 3.1226490596238495e-05, + "loss": 0.54, + "step": 9395 + }, + { + "epoch": 12.02688, + "grad_norm": 0.902243971824646, + "learning_rate": 3.1224489795918374e-05, + "loss": 0.5632, + "step": 9396 + }, + { + "epoch": 12.02816, + "grad_norm": 0.9559075832366943, + "learning_rate": 3.1222488995598245e-05, + "loss": 0.5201, + "step": 9397 + }, + { + "epoch": 12.02944, + "grad_norm": 
0.8943519592285156, + "learning_rate": 3.122048819527811e-05, + "loss": 0.5407, + "step": 9398 + }, + { + "epoch": 12.03072, + "grad_norm": 0.8215572834014893, + "learning_rate": 3.121848739495798e-05, + "loss": 0.4806, + "step": 9399 + }, + { + "epoch": 12.032, + "grad_norm": 0.9161409139633179, + "learning_rate": 3.1216486594637854e-05, + "loss": 0.5971, + "step": 9400 + }, + { + "epoch": 12.03328, + "grad_norm": 0.9485968947410583, + "learning_rate": 3.1214485794317726e-05, + "loss": 0.5852, + "step": 9401 + }, + { + "epoch": 12.03456, + "grad_norm": 0.8830835223197937, + "learning_rate": 3.12124849939976e-05, + "loss": 0.475, + "step": 9402 + }, + { + "epoch": 12.03584, + "grad_norm": 0.9634863138198853, + "learning_rate": 3.1210484193677477e-05, + "loss": 0.5458, + "step": 9403 + }, + { + "epoch": 12.03712, + "grad_norm": 0.9121254086494446, + "learning_rate": 3.120848339335735e-05, + "loss": 0.5025, + "step": 9404 + }, + { + "epoch": 12.0384, + "grad_norm": 1.0343906879425049, + "learning_rate": 3.120648259303722e-05, + "loss": 0.5407, + "step": 9405 + }, + { + "epoch": 12.03968, + "grad_norm": 0.892846941947937, + "learning_rate": 3.1204481792717085e-05, + "loss": 0.5607, + "step": 9406 + }, + { + "epoch": 12.04096, + "grad_norm": 0.8945642113685608, + "learning_rate": 3.120248099239696e-05, + "loss": 0.5562, + "step": 9407 + }, + { + "epoch": 12.04224, + "grad_norm": 0.8921273350715637, + "learning_rate": 3.120048019207683e-05, + "loss": 0.5033, + "step": 9408 + }, + { + "epoch": 12.043520000000001, + "grad_norm": 0.8849148154258728, + "learning_rate": 3.11984793917567e-05, + "loss": 0.558, + "step": 9409 + }, + { + "epoch": 12.0448, + "grad_norm": 0.8761171698570251, + "learning_rate": 3.119647859143658e-05, + "loss": 0.5437, + "step": 9410 + }, + { + "epoch": 12.04608, + "grad_norm": 0.9475454092025757, + "learning_rate": 3.119447779111645e-05, + "loss": 0.5873, + "step": 9411 + }, + { + "epoch": 12.04736, + "grad_norm": 0.9096229076385498, + 
"learning_rate": 3.119247699079632e-05, + "loss": 0.5356, + "step": 9412 + }, + { + "epoch": 12.04864, + "grad_norm": 0.8909059166908264, + "learning_rate": 3.1190476190476195e-05, + "loss": 0.5258, + "step": 9413 + }, + { + "epoch": 12.04992, + "grad_norm": 0.8790292739868164, + "learning_rate": 3.118847539015606e-05, + "loss": 0.5287, + "step": 9414 + }, + { + "epoch": 12.0512, + "grad_norm": 0.9126578569412231, + "learning_rate": 3.118647458983593e-05, + "loss": 0.5081, + "step": 9415 + }, + { + "epoch": 12.05248, + "grad_norm": 0.9787217378616333, + "learning_rate": 3.1184473789515804e-05, + "loss": 0.5798, + "step": 9416 + }, + { + "epoch": 12.05376, + "grad_norm": 0.8833532333374023, + "learning_rate": 3.118247298919568e-05, + "loss": 0.5202, + "step": 9417 + }, + { + "epoch": 12.05504, + "grad_norm": 0.8925086855888367, + "learning_rate": 3.1180472188875554e-05, + "loss": 0.5235, + "step": 9418 + }, + { + "epoch": 12.05632, + "grad_norm": 0.9011042714118958, + "learning_rate": 3.1178471388555426e-05, + "loss": 0.54, + "step": 9419 + }, + { + "epoch": 12.0576, + "grad_norm": 0.8985966444015503, + "learning_rate": 3.11764705882353e-05, + "loss": 0.5078, + "step": 9420 + }, + { + "epoch": 12.05888, + "grad_norm": 0.9557191729545593, + "learning_rate": 3.117446978791517e-05, + "loss": 0.5882, + "step": 9421 + }, + { + "epoch": 12.06016, + "grad_norm": 0.9084435701370239, + "learning_rate": 3.1172468987595035e-05, + "loss": 0.5736, + "step": 9422 + }, + { + "epoch": 12.06144, + "grad_norm": 0.8469879627227783, + "learning_rate": 3.117046818727491e-05, + "loss": 0.5194, + "step": 9423 + }, + { + "epoch": 12.06272, + "grad_norm": 0.9563634991645813, + "learning_rate": 3.1168467386954786e-05, + "loss": 0.5507, + "step": 9424 + }, + { + "epoch": 12.064, + "grad_norm": 0.9139657020568848, + "learning_rate": 3.116646658663466e-05, + "loss": 0.5972, + "step": 9425 + }, + { + "epoch": 12.06528, + "grad_norm": 0.8986368775367737, + "learning_rate": 3.116446578631453e-05, 
+ "loss": 0.5257, + "step": 9426 + }, + { + "epoch": 12.06656, + "grad_norm": 0.9286632537841797, + "learning_rate": 3.11624649859944e-05, + "loss": 0.5412, + "step": 9427 + }, + { + "epoch": 12.06784, + "grad_norm": 0.9259238243103027, + "learning_rate": 3.116046418567427e-05, + "loss": 0.5729, + "step": 9428 + }, + { + "epoch": 12.06912, + "grad_norm": 0.8716070652008057, + "learning_rate": 3.1158463385354145e-05, + "loss": 0.5373, + "step": 9429 + }, + { + "epoch": 12.0704, + "grad_norm": 0.937944233417511, + "learning_rate": 3.115646258503401e-05, + "loss": 0.5433, + "step": 9430 + }, + { + "epoch": 12.07168, + "grad_norm": 0.8506153225898743, + "learning_rate": 3.115446178471389e-05, + "loss": 0.4806, + "step": 9431 + }, + { + "epoch": 12.07296, + "grad_norm": 0.9485256671905518, + "learning_rate": 3.115246098439376e-05, + "loss": 0.5236, + "step": 9432 + }, + { + "epoch": 12.07424, + "grad_norm": 0.9169779419898987, + "learning_rate": 3.115046018407363e-05, + "loss": 0.5357, + "step": 9433 + }, + { + "epoch": 12.07552, + "grad_norm": 0.9298108816146851, + "learning_rate": 3.1148459383753504e-05, + "loss": 0.5527, + "step": 9434 + }, + { + "epoch": 12.0768, + "grad_norm": 0.8851594924926758, + "learning_rate": 3.1146458583433376e-05, + "loss": 0.4717, + "step": 9435 + }, + { + "epoch": 12.07808, + "grad_norm": 0.9402860999107361, + "learning_rate": 3.114445778311325e-05, + "loss": 0.5484, + "step": 9436 + }, + { + "epoch": 12.07936, + "grad_norm": 0.9027056097984314, + "learning_rate": 3.114245698279312e-05, + "loss": 0.5506, + "step": 9437 + }, + { + "epoch": 12.08064, + "grad_norm": 0.9592556953430176, + "learning_rate": 3.114045618247299e-05, + "loss": 0.5557, + "step": 9438 + }, + { + "epoch": 12.08192, + "grad_norm": 0.9053078293800354, + "learning_rate": 3.1138455382152863e-05, + "loss": 0.5468, + "step": 9439 + }, + { + "epoch": 12.0832, + "grad_norm": 0.941765546798706, + "learning_rate": 3.1136454581832735e-05, + "loss": 0.5678, + "step": 9440 + }, + 
{ + "epoch": 12.08448, + "grad_norm": 0.9676002264022827, + "learning_rate": 3.113445378151261e-05, + "loss": 0.5451, + "step": 9441 + }, + { + "epoch": 12.08576, + "grad_norm": 0.9788287878036499, + "learning_rate": 3.113245298119248e-05, + "loss": 0.6131, + "step": 9442 + }, + { + "epoch": 12.08704, + "grad_norm": 0.9841639399528503, + "learning_rate": 3.113045218087235e-05, + "loss": 0.5438, + "step": 9443 + }, + { + "epoch": 12.08832, + "grad_norm": 0.955268144607544, + "learning_rate": 3.112845138055222e-05, + "loss": 0.5622, + "step": 9444 + }, + { + "epoch": 12.0896, + "grad_norm": 0.905160129070282, + "learning_rate": 3.1126450580232095e-05, + "loss": 0.5138, + "step": 9445 + }, + { + "epoch": 12.09088, + "grad_norm": 0.9529837369918823, + "learning_rate": 3.1124449779911966e-05, + "loss": 0.5891, + "step": 9446 + }, + { + "epoch": 12.09216, + "grad_norm": 1.0116820335388184, + "learning_rate": 3.112244897959184e-05, + "loss": 0.5347, + "step": 9447 + }, + { + "epoch": 12.09344, + "grad_norm": 0.9079636335372925, + "learning_rate": 3.112044817927171e-05, + "loss": 0.5178, + "step": 9448 + }, + { + "epoch": 12.09472, + "grad_norm": 0.9194412231445312, + "learning_rate": 3.111844737895158e-05, + "loss": 0.5488, + "step": 9449 + }, + { + "epoch": 12.096, + "grad_norm": 0.9513563513755798, + "learning_rate": 3.1116446578631454e-05, + "loss": 0.5472, + "step": 9450 + }, + { + "epoch": 12.09728, + "grad_norm": 0.9122597575187683, + "learning_rate": 3.1114445778311326e-05, + "loss": 0.5887, + "step": 9451 + }, + { + "epoch": 12.09856, + "grad_norm": 0.9185177683830261, + "learning_rate": 3.11124449779912e-05, + "loss": 0.5636, + "step": 9452 + }, + { + "epoch": 12.09984, + "grad_norm": 0.923226535320282, + "learning_rate": 3.111044417767107e-05, + "loss": 0.5302, + "step": 9453 + }, + { + "epoch": 12.10112, + "grad_norm": 0.8814284801483154, + "learning_rate": 3.110844337735094e-05, + "loss": 0.522, + "step": 9454 + }, + { + "epoch": 12.1024, + "grad_norm": 
0.9242017865180969, + "learning_rate": 3.110644257703081e-05, + "loss": 0.5458, + "step": 9455 + }, + { + "epoch": 12.10368, + "grad_norm": 0.9658417701721191, + "learning_rate": 3.1104441776710685e-05, + "loss": 0.5912, + "step": 9456 + }, + { + "epoch": 12.10496, + "grad_norm": 0.90215003490448, + "learning_rate": 3.110244097639056e-05, + "loss": 0.5586, + "step": 9457 + }, + { + "epoch": 12.10624, + "grad_norm": 0.9708035588264465, + "learning_rate": 3.110044017607043e-05, + "loss": 0.5718, + "step": 9458 + }, + { + "epoch": 12.10752, + "grad_norm": 0.8893763422966003, + "learning_rate": 3.10984393757503e-05, + "loss": 0.532, + "step": 9459 + }, + { + "epoch": 12.1088, + "grad_norm": 0.9912137389183044, + "learning_rate": 3.109643857543017e-05, + "loss": 0.5976, + "step": 9460 + }, + { + "epoch": 12.11008, + "grad_norm": 0.89457768201828, + "learning_rate": 3.1094437775110044e-05, + "loss": 0.5394, + "step": 9461 + }, + { + "epoch": 12.11136, + "grad_norm": 0.9052734971046448, + "learning_rate": 3.1092436974789916e-05, + "loss": 0.5318, + "step": 9462 + }, + { + "epoch": 12.11264, + "grad_norm": 0.9517974257469177, + "learning_rate": 3.109043617446979e-05, + "loss": 0.5494, + "step": 9463 + }, + { + "epoch": 12.11392, + "grad_norm": 0.921795666217804, + "learning_rate": 3.108843537414966e-05, + "loss": 0.4871, + "step": 9464 + }, + { + "epoch": 12.1152, + "grad_norm": 0.8893502950668335, + "learning_rate": 3.108643457382953e-05, + "loss": 0.5022, + "step": 9465 + }, + { + "epoch": 12.11648, + "grad_norm": 1.020261287689209, + "learning_rate": 3.1084433773509404e-05, + "loss": 0.6282, + "step": 9466 + }, + { + "epoch": 12.11776, + "grad_norm": 0.9455602169036865, + "learning_rate": 3.108243297318928e-05, + "loss": 0.5625, + "step": 9467 + }, + { + "epoch": 12.11904, + "grad_norm": 0.9229424595832825, + "learning_rate": 3.108043217286915e-05, + "loss": 0.5478, + "step": 9468 + }, + { + "epoch": 12.12032, + "grad_norm": 0.9324727654457092, + "learning_rate": 
3.107843137254902e-05, + "loss": 0.5842, + "step": 9469 + }, + { + "epoch": 12.1216, + "grad_norm": 0.8508306741714478, + "learning_rate": 3.107643057222889e-05, + "loss": 0.4829, + "step": 9470 + }, + { + "epoch": 12.12288, + "grad_norm": 0.9504947662353516, + "learning_rate": 3.107442977190876e-05, + "loss": 0.5494, + "step": 9471 + }, + { + "epoch": 12.12416, + "grad_norm": 0.8809190988540649, + "learning_rate": 3.1072428971588635e-05, + "loss": 0.518, + "step": 9472 + }, + { + "epoch": 12.12544, + "grad_norm": 0.9718751311302185, + "learning_rate": 3.1070428171268507e-05, + "loss": 0.5083, + "step": 9473 + }, + { + "epoch": 12.12672, + "grad_norm": 0.9734033346176147, + "learning_rate": 3.1068427370948385e-05, + "loss": 0.5977, + "step": 9474 + }, + { + "epoch": 12.128, + "grad_norm": 0.892812967300415, + "learning_rate": 3.106642657062826e-05, + "loss": 0.5214, + "step": 9475 + }, + { + "epoch": 12.12928, + "grad_norm": 0.9172582626342773, + "learning_rate": 3.106442577030812e-05, + "loss": 0.5311, + "step": 9476 + }, + { + "epoch": 12.13056, + "grad_norm": 0.8705976009368896, + "learning_rate": 3.1062424969987994e-05, + "loss": 0.4711, + "step": 9477 + }, + { + "epoch": 12.13184, + "grad_norm": 0.9229205250740051, + "learning_rate": 3.1060424169667866e-05, + "loss": 0.5198, + "step": 9478 + }, + { + "epoch": 12.13312, + "grad_norm": 0.9089353680610657, + "learning_rate": 3.105842336934774e-05, + "loss": 0.5156, + "step": 9479 + }, + { + "epoch": 12.1344, + "grad_norm": 0.9263928532600403, + "learning_rate": 3.105642256902761e-05, + "loss": 0.4731, + "step": 9480 + }, + { + "epoch": 12.13568, + "grad_norm": 0.8989574313163757, + "learning_rate": 3.105442176870749e-05, + "loss": 0.5337, + "step": 9481 + }, + { + "epoch": 12.13696, + "grad_norm": 0.9506709575653076, + "learning_rate": 3.105242096838736e-05, + "loss": 0.5357, + "step": 9482 + }, + { + "epoch": 12.13824, + "grad_norm": 0.9234217405319214, + "learning_rate": 3.105042016806723e-05, + "loss": 0.5471, 
+ "step": 9483 + }, + { + "epoch": 12.13952, + "grad_norm": 0.8666446805000305, + "learning_rate": 3.10484193677471e-05, + "loss": 0.5298, + "step": 9484 + }, + { + "epoch": 12.1408, + "grad_norm": 0.9469799399375916, + "learning_rate": 3.104641856742697e-05, + "loss": 0.546, + "step": 9485 + }, + { + "epoch": 12.14208, + "grad_norm": 0.8840956091880798, + "learning_rate": 3.104441776710684e-05, + "loss": 0.5417, + "step": 9486 + }, + { + "epoch": 12.14336, + "grad_norm": 0.8798984885215759, + "learning_rate": 3.104241696678671e-05, + "loss": 0.5403, + "step": 9487 + }, + { + "epoch": 12.14464, + "grad_norm": 0.9367090463638306, + "learning_rate": 3.104041616646659e-05, + "loss": 0.562, + "step": 9488 + }, + { + "epoch": 12.14592, + "grad_norm": 0.8626176118850708, + "learning_rate": 3.103841536614646e-05, + "loss": 0.5044, + "step": 9489 + }, + { + "epoch": 12.1472, + "grad_norm": 0.953429102897644, + "learning_rate": 3.1036414565826335e-05, + "loss": 0.5363, + "step": 9490 + }, + { + "epoch": 12.14848, + "grad_norm": 0.847478449344635, + "learning_rate": 3.103441376550621e-05, + "loss": 0.4921, + "step": 9491 + }, + { + "epoch": 12.14976, + "grad_norm": 0.8839147090911865, + "learning_rate": 3.103241296518607e-05, + "loss": 0.5174, + "step": 9492 + }, + { + "epoch": 12.15104, + "grad_norm": 0.952755868434906, + "learning_rate": 3.1030412164865944e-05, + "loss": 0.5461, + "step": 9493 + }, + { + "epoch": 12.15232, + "grad_norm": 0.8509531021118164, + "learning_rate": 3.1028411364545816e-05, + "loss": 0.5218, + "step": 9494 + }, + { + "epoch": 12.1536, + "grad_norm": 0.9300630688667297, + "learning_rate": 3.1026410564225694e-05, + "loss": 0.5625, + "step": 9495 + }, + { + "epoch": 12.15488, + "grad_norm": 0.9531006217002869, + "learning_rate": 3.1024409763905566e-05, + "loss": 0.5831, + "step": 9496 + }, + { + "epoch": 12.15616, + "grad_norm": 0.9329797625541687, + "learning_rate": 3.102240896358544e-05, + "loss": 0.539, + "step": 9497 + }, + { + "epoch": 12.15744, 
+ "grad_norm": 0.855829656124115, + "learning_rate": 3.102040816326531e-05, + "loss": 0.4812, + "step": 9498 + }, + { + "epoch": 12.15872, + "grad_norm": 0.8939371705055237, + "learning_rate": 3.101840736294518e-05, + "loss": 0.5208, + "step": 9499 + }, + { + "epoch": 12.16, + "grad_norm": 0.9188044667243958, + "learning_rate": 3.101640656262505e-05, + "loss": 0.545, + "step": 9500 + }, + { + "epoch": 12.16128, + "grad_norm": 0.8524420261383057, + "learning_rate": 3.101440576230492e-05, + "loss": 0.5028, + "step": 9501 + }, + { + "epoch": 12.16256, + "grad_norm": 0.906829297542572, + "learning_rate": 3.10124049619848e-05, + "loss": 0.5557, + "step": 9502 + }, + { + "epoch": 12.16384, + "grad_norm": 0.904629647731781, + "learning_rate": 3.101040416166467e-05, + "loss": 0.5265, + "step": 9503 + }, + { + "epoch": 12.16512, + "grad_norm": 0.930142343044281, + "learning_rate": 3.100840336134454e-05, + "loss": 0.5353, + "step": 9504 + }, + { + "epoch": 12.1664, + "grad_norm": 0.9185201525688171, + "learning_rate": 3.100640256102441e-05, + "loss": 0.5448, + "step": 9505 + }, + { + "epoch": 12.16768, + "grad_norm": 0.9382092952728271, + "learning_rate": 3.1004401760704285e-05, + "loss": 0.5498, + "step": 9506 + }, + { + "epoch": 12.16896, + "grad_norm": 0.9041383862495422, + "learning_rate": 3.1002400960384156e-05, + "loss": 0.4895, + "step": 9507 + }, + { + "epoch": 12.17024, + "grad_norm": 0.921959638595581, + "learning_rate": 3.100040016006402e-05, + "loss": 0.5513, + "step": 9508 + }, + { + "epoch": 12.17152, + "grad_norm": 0.9161378145217896, + "learning_rate": 3.09983993597439e-05, + "loss": 0.5745, + "step": 9509 + }, + { + "epoch": 12.1728, + "grad_norm": 0.9752117991447449, + "learning_rate": 3.099639855942377e-05, + "loss": 0.5436, + "step": 9510 + }, + { + "epoch": 12.17408, + "grad_norm": 0.8904070854187012, + "learning_rate": 3.0994397759103644e-05, + "loss": 0.5351, + "step": 9511 + }, + { + "epoch": 12.17536, + "grad_norm": 0.8877695202827454, + 
"learning_rate": 3.0992396958783516e-05, + "loss": 0.5604, + "step": 9512 + }, + { + "epoch": 12.17664, + "grad_norm": 0.9066382050514221, + "learning_rate": 3.099039615846339e-05, + "loss": 0.5335, + "step": 9513 + }, + { + "epoch": 12.17792, + "grad_norm": 0.9197863936424255, + "learning_rate": 3.098839535814326e-05, + "loss": 0.5328, + "step": 9514 + }, + { + "epoch": 12.1792, + "grad_norm": 0.9085410237312317, + "learning_rate": 3.098639455782313e-05, + "loss": 0.4907, + "step": 9515 + }, + { + "epoch": 12.18048, + "grad_norm": 0.9633046984672546, + "learning_rate": 3.0984393757503e-05, + "loss": 0.5437, + "step": 9516 + }, + { + "epoch": 12.18176, + "grad_norm": 0.9514236450195312, + "learning_rate": 3.0982392957182875e-05, + "loss": 0.5653, + "step": 9517 + }, + { + "epoch": 12.18304, + "grad_norm": 0.9399169087409973, + "learning_rate": 3.098039215686275e-05, + "loss": 0.521, + "step": 9518 + }, + { + "epoch": 12.18432, + "grad_norm": 0.9197844862937927, + "learning_rate": 3.097839135654262e-05, + "loss": 0.5644, + "step": 9519 + }, + { + "epoch": 12.1856, + "grad_norm": 0.8995923399925232, + "learning_rate": 3.097639055622249e-05, + "loss": 0.5357, + "step": 9520 + }, + { + "epoch": 12.18688, + "grad_norm": 0.9238200783729553, + "learning_rate": 3.097438975590236e-05, + "loss": 0.5414, + "step": 9521 + }, + { + "epoch": 12.18816, + "grad_norm": 0.9110772609710693, + "learning_rate": 3.0972388955582234e-05, + "loss": 0.5164, + "step": 9522 + }, + { + "epoch": 12.18944, + "grad_norm": 0.9395493865013123, + "learning_rate": 3.0970388155262106e-05, + "loss": 0.5542, + "step": 9523 + }, + { + "epoch": 12.19072, + "grad_norm": 0.8886218667030334, + "learning_rate": 3.096838735494198e-05, + "loss": 0.4889, + "step": 9524 + }, + { + "epoch": 12.192, + "grad_norm": 0.9301427006721497, + "learning_rate": 3.096638655462185e-05, + "loss": 0.5434, + "step": 9525 + }, + { + "epoch": 12.19328, + "grad_norm": 0.956852912902832, + "learning_rate": 3.096438575430172e-05, + 
"loss": 0.5286, + "step": 9526 + }, + { + "epoch": 12.19456, + "grad_norm": 0.9380202293395996, + "learning_rate": 3.0962384953981594e-05, + "loss": 0.5866, + "step": 9527 + }, + { + "epoch": 12.19584, + "grad_norm": 0.9663145542144775, + "learning_rate": 3.0960384153661465e-05, + "loss": 0.5784, + "step": 9528 + }, + { + "epoch": 12.19712, + "grad_norm": 0.9319117069244385, + "learning_rate": 3.095838335334134e-05, + "loss": 0.5328, + "step": 9529 + }, + { + "epoch": 12.1984, + "grad_norm": 0.9537164568901062, + "learning_rate": 3.0956382553021216e-05, + "loss": 0.5598, + "step": 9530 + }, + { + "epoch": 12.19968, + "grad_norm": 0.9373032450675964, + "learning_rate": 3.095438175270108e-05, + "loss": 0.5457, + "step": 9531 + }, + { + "epoch": 12.20096, + "grad_norm": 0.9559725522994995, + "learning_rate": 3.095238095238095e-05, + "loss": 0.5518, + "step": 9532 + }, + { + "epoch": 12.20224, + "grad_norm": 0.9501890540122986, + "learning_rate": 3.0950380152060825e-05, + "loss": 0.5554, + "step": 9533 + }, + { + "epoch": 12.20352, + "grad_norm": 0.9486271142959595, + "learning_rate": 3.0948379351740697e-05, + "loss": 0.5774, + "step": 9534 + }, + { + "epoch": 12.2048, + "grad_norm": 0.8942713737487793, + "learning_rate": 3.094637855142057e-05, + "loss": 0.539, + "step": 9535 + }, + { + "epoch": 12.20608, + "grad_norm": 0.9453210234642029, + "learning_rate": 3.094437775110044e-05, + "loss": 0.5477, + "step": 9536 + }, + { + "epoch": 12.20736, + "grad_norm": 0.9913842678070068, + "learning_rate": 3.094237695078032e-05, + "loss": 0.5536, + "step": 9537 + }, + { + "epoch": 12.20864, + "grad_norm": 0.9225572347640991, + "learning_rate": 3.094037615046019e-05, + "loss": 0.5175, + "step": 9538 + }, + { + "epoch": 12.20992, + "grad_norm": 0.8077876567840576, + "learning_rate": 3.0938375350140056e-05, + "loss": 0.5227, + "step": 9539 + }, + { + "epoch": 12.2112, + "grad_norm": 0.9560815691947937, + "learning_rate": 3.093637454981993e-05, + "loss": 0.5408, + "step": 9540 + }, + 
{ + "epoch": 12.21248, + "grad_norm": 0.9433364868164062, + "learning_rate": 3.09343737494998e-05, + "loss": 0.5622, + "step": 9541 + }, + { + "epoch": 12.21376, + "grad_norm": 0.934070885181427, + "learning_rate": 3.093237294917967e-05, + "loss": 0.5873, + "step": 9542 + }, + { + "epoch": 12.21504, + "grad_norm": 0.8797191977500916, + "learning_rate": 3.093037214885954e-05, + "loss": 0.5258, + "step": 9543 + }, + { + "epoch": 12.21632, + "grad_norm": 0.9075624346733093, + "learning_rate": 3.092837134853942e-05, + "loss": 0.5438, + "step": 9544 + }, + { + "epoch": 12.2176, + "grad_norm": 0.9165863990783691, + "learning_rate": 3.0926370548219294e-05, + "loss": 0.5279, + "step": 9545 + }, + { + "epoch": 12.21888, + "grad_norm": 0.9102526307106018, + "learning_rate": 3.0924369747899166e-05, + "loss": 0.5665, + "step": 9546 + }, + { + "epoch": 12.22016, + "grad_norm": 0.9222195744514465, + "learning_rate": 3.092236894757903e-05, + "loss": 0.5457, + "step": 9547 + }, + { + "epoch": 12.22144, + "grad_norm": 0.9091429710388184, + "learning_rate": 3.09203681472589e-05, + "loss": 0.5565, + "step": 9548 + }, + { + "epoch": 12.22272, + "grad_norm": 0.8707777857780457, + "learning_rate": 3.0918367346938774e-05, + "loss": 0.5086, + "step": 9549 + }, + { + "epoch": 12.224, + "grad_norm": 0.8751879930496216, + "learning_rate": 3.0916366546618646e-05, + "loss": 0.4871, + "step": 9550 + }, + { + "epoch": 12.22528, + "grad_norm": 0.9578078389167786, + "learning_rate": 3.091436574629852e-05, + "loss": 0.5734, + "step": 9551 + }, + { + "epoch": 12.22656, + "grad_norm": 0.9274276494979858, + "learning_rate": 3.09123649459784e-05, + "loss": 0.5629, + "step": 9552 + }, + { + "epoch": 12.22784, + "grad_norm": 0.9546663761138916, + "learning_rate": 3.091036414565827e-05, + "loss": 0.5466, + "step": 9553 + }, + { + "epoch": 12.22912, + "grad_norm": 0.9720085859298706, + "learning_rate": 3.090836334533814e-05, + "loss": 0.5884, + "step": 9554 + }, + { + "epoch": 12.2304, + "grad_norm": 
0.9668819308280945, + "learning_rate": 3.0906362545018006e-05, + "loss": 0.5534, + "step": 9555 + }, + { + "epoch": 12.23168, + "grad_norm": 0.9304522275924683, + "learning_rate": 3.090436174469788e-05, + "loss": 0.5697, + "step": 9556 + }, + { + "epoch": 12.23296, + "grad_norm": 0.915131688117981, + "learning_rate": 3.090236094437775e-05, + "loss": 0.5135, + "step": 9557 + }, + { + "epoch": 12.23424, + "grad_norm": 0.8442711234092712, + "learning_rate": 3.090036014405762e-05, + "loss": 0.5244, + "step": 9558 + }, + { + "epoch": 12.23552, + "grad_norm": 0.8993996381759644, + "learning_rate": 3.08983593437375e-05, + "loss": 0.5362, + "step": 9559 + }, + { + "epoch": 12.2368, + "grad_norm": 0.9662063121795654, + "learning_rate": 3.089635854341737e-05, + "loss": 0.5618, + "step": 9560 + }, + { + "epoch": 12.23808, + "grad_norm": 0.9213639497756958, + "learning_rate": 3.0894357743097244e-05, + "loss": 0.5064, + "step": 9561 + }, + { + "epoch": 12.23936, + "grad_norm": 0.9037312269210815, + "learning_rate": 3.0892356942777115e-05, + "loss": 0.5408, + "step": 9562 + }, + { + "epoch": 12.24064, + "grad_norm": 0.9527158141136169, + "learning_rate": 3.089035614245698e-05, + "loss": 0.5756, + "step": 9563 + }, + { + "epoch": 12.24192, + "grad_norm": 0.886060357093811, + "learning_rate": 3.088835534213685e-05, + "loss": 0.5351, + "step": 9564 + }, + { + "epoch": 12.2432, + "grad_norm": 0.9264008402824402, + "learning_rate": 3.0886354541816724e-05, + "loss": 0.5426, + "step": 9565 + }, + { + "epoch": 12.24448, + "grad_norm": 0.9923083782196045, + "learning_rate": 3.08843537414966e-05, + "loss": 0.556, + "step": 9566 + }, + { + "epoch": 12.24576, + "grad_norm": 0.9164190888404846, + "learning_rate": 3.0882352941176475e-05, + "loss": 0.5639, + "step": 9567 + }, + { + "epoch": 12.24704, + "grad_norm": 0.9647570252418518, + "learning_rate": 3.0880352140856347e-05, + "loss": 0.5686, + "step": 9568 + }, + { + "epoch": 12.24832, + "grad_norm": 0.9134481549263, + "learning_rate": 
3.087835134053622e-05, + "loss": 0.5295, + "step": 9569 + }, + { + "epoch": 12.2496, + "grad_norm": 0.8650757670402527, + "learning_rate": 3.087635054021609e-05, + "loss": 0.5215, + "step": 9570 + }, + { + "epoch": 12.25088, + "grad_norm": 0.9101705551147461, + "learning_rate": 3.0874349739895955e-05, + "loss": 0.5681, + "step": 9571 + }, + { + "epoch": 12.25216, + "grad_norm": 0.9110287427902222, + "learning_rate": 3.087234893957583e-05, + "loss": 0.5318, + "step": 9572 + }, + { + "epoch": 12.25344, + "grad_norm": 0.8674918413162231, + "learning_rate": 3.0870348139255706e-05, + "loss": 0.5185, + "step": 9573 + }, + { + "epoch": 12.25472, + "grad_norm": 0.9074335694313049, + "learning_rate": 3.086834733893558e-05, + "loss": 0.5352, + "step": 9574 + }, + { + "epoch": 12.256, + "grad_norm": 0.9423937797546387, + "learning_rate": 3.086634653861545e-05, + "loss": 0.4859, + "step": 9575 + }, + { + "epoch": 12.25728, + "grad_norm": 0.8732250928878784, + "learning_rate": 3.086434573829532e-05, + "loss": 0.5237, + "step": 9576 + }, + { + "epoch": 12.25856, + "grad_norm": 0.9658347964286804, + "learning_rate": 3.086234493797519e-05, + "loss": 0.5853, + "step": 9577 + }, + { + "epoch": 12.25984, + "grad_norm": 0.9143103957176208, + "learning_rate": 3.0860344137655065e-05, + "loss": 0.5277, + "step": 9578 + }, + { + "epoch": 12.26112, + "grad_norm": 0.8892953395843506, + "learning_rate": 3.085834333733493e-05, + "loss": 0.5178, + "step": 9579 + }, + { + "epoch": 12.2624, + "grad_norm": 0.8988597393035889, + "learning_rate": 3.085634253701481e-05, + "loss": 0.5443, + "step": 9580 + }, + { + "epoch": 12.26368, + "grad_norm": 0.9245293736457825, + "learning_rate": 3.085434173669468e-05, + "loss": 0.5107, + "step": 9581 + }, + { + "epoch": 12.26496, + "grad_norm": 0.9400307536125183, + "learning_rate": 3.085234093637455e-05, + "loss": 0.5741, + "step": 9582 + }, + { + "epoch": 12.26624, + "grad_norm": 0.9172420501708984, + "learning_rate": 3.0850340136054424e-05, + "loss": 
0.5751, + "step": 9583 + }, + { + "epoch": 12.26752, + "grad_norm": 0.8972305059432983, + "learning_rate": 3.0848339335734296e-05, + "loss": 0.4933, + "step": 9584 + }, + { + "epoch": 12.2688, + "grad_norm": 0.8898636698722839, + "learning_rate": 3.084633853541417e-05, + "loss": 0.5589, + "step": 9585 + }, + { + "epoch": 12.27008, + "grad_norm": 0.9300894141197205, + "learning_rate": 3.084433773509404e-05, + "loss": 0.5789, + "step": 9586 + }, + { + "epoch": 12.27136, + "grad_norm": 0.8965767621994019, + "learning_rate": 3.084233693477391e-05, + "loss": 0.5071, + "step": 9587 + }, + { + "epoch": 12.272639999999999, + "grad_norm": 0.9093717932701111, + "learning_rate": 3.0840336134453784e-05, + "loss": 0.5453, + "step": 9588 + }, + { + "epoch": 12.27392, + "grad_norm": 0.9376620650291443, + "learning_rate": 3.0838335334133656e-05, + "loss": 0.5392, + "step": 9589 + }, + { + "epoch": 12.2752, + "grad_norm": 0.9461066126823425, + "learning_rate": 3.083633453381353e-05, + "loss": 0.5636, + "step": 9590 + }, + { + "epoch": 12.27648, + "grad_norm": 0.8881168365478516, + "learning_rate": 3.08343337334934e-05, + "loss": 0.5378, + "step": 9591 + }, + { + "epoch": 12.27776, + "grad_norm": 0.9719573855400085, + "learning_rate": 3.083233293317327e-05, + "loss": 0.6019, + "step": 9592 + }, + { + "epoch": 12.27904, + "grad_norm": 0.9166926741600037, + "learning_rate": 3.083033213285314e-05, + "loss": 0.579, + "step": 9593 + }, + { + "epoch": 12.28032, + "grad_norm": 0.9417465925216675, + "learning_rate": 3.0828331332533015e-05, + "loss": 0.5209, + "step": 9594 + }, + { + "epoch": 12.2816, + "grad_norm": 0.9969757199287415, + "learning_rate": 3.082633053221289e-05, + "loss": 0.5814, + "step": 9595 + }, + { + "epoch": 12.28288, + "grad_norm": 0.9610475897789001, + "learning_rate": 3.082432973189276e-05, + "loss": 0.5997, + "step": 9596 + }, + { + "epoch": 12.28416, + "grad_norm": 0.8945482969284058, + "learning_rate": 3.082232893157263e-05, + "loss": 0.5166, + "step": 9597 + }, + 
{ + "epoch": 12.28544, + "grad_norm": 0.9552536606788635, + "learning_rate": 3.08203281312525e-05, + "loss": 0.5762, + "step": 9598 + }, + { + "epoch": 12.28672, + "grad_norm": 0.9500235915184021, + "learning_rate": 3.0818327330932374e-05, + "loss": 0.5129, + "step": 9599 + }, + { + "epoch": 12.288, + "grad_norm": 0.8789950609207153, + "learning_rate": 3.0816326530612246e-05, + "loss": 0.5308, + "step": 9600 + }, + { + "epoch": 12.28928, + "grad_norm": 0.8765109181404114, + "learning_rate": 3.081432573029212e-05, + "loss": 0.4918, + "step": 9601 + }, + { + "epoch": 12.29056, + "grad_norm": 0.9502473473548889, + "learning_rate": 3.081232492997199e-05, + "loss": 0.5485, + "step": 9602 + }, + { + "epoch": 12.29184, + "grad_norm": 0.9585152864456177, + "learning_rate": 3.081032412965186e-05, + "loss": 0.5748, + "step": 9603 + }, + { + "epoch": 12.29312, + "grad_norm": 0.9246171116828918, + "learning_rate": 3.080832332933173e-05, + "loss": 0.5394, + "step": 9604 + }, + { + "epoch": 12.2944, + "grad_norm": 0.9247891902923584, + "learning_rate": 3.0806322529011605e-05, + "loss": 0.5278, + "step": 9605 + }, + { + "epoch": 12.29568, + "grad_norm": 0.9395011067390442, + "learning_rate": 3.080432172869148e-05, + "loss": 0.555, + "step": 9606 + }, + { + "epoch": 12.29696, + "grad_norm": 0.9174982905387878, + "learning_rate": 3.080232092837135e-05, + "loss": 0.51, + "step": 9607 + }, + { + "epoch": 12.29824, + "grad_norm": 0.8428568840026855, + "learning_rate": 3.080032012805123e-05, + "loss": 0.5107, + "step": 9608 + }, + { + "epoch": 12.29952, + "grad_norm": 0.9542026519775391, + "learning_rate": 3.079831932773109e-05, + "loss": 0.5491, + "step": 9609 + }, + { + "epoch": 12.3008, + "grad_norm": 0.9842756390571594, + "learning_rate": 3.0796318527410964e-05, + "loss": 0.5392, + "step": 9610 + }, + { + "epoch": 12.30208, + "grad_norm": 0.9100461006164551, + "learning_rate": 3.0794317727090836e-05, + "loss": 0.5535, + "step": 9611 + }, + { + "epoch": 12.30336, + "grad_norm": 
0.8975451588630676, + "learning_rate": 3.079231692677071e-05, + "loss": 0.5449, + "step": 9612 + }, + { + "epoch": 12.30464, + "grad_norm": 0.9020168781280518, + "learning_rate": 3.079031612645058e-05, + "loss": 0.5142, + "step": 9613 + }, + { + "epoch": 12.30592, + "grad_norm": 0.9032678604125977, + "learning_rate": 3.078831532613045e-05, + "loss": 0.5045, + "step": 9614 + }, + { + "epoch": 12.3072, + "grad_norm": 0.9697889089584351, + "learning_rate": 3.078631452581033e-05, + "loss": 0.5418, + "step": 9615 + }, + { + "epoch": 12.30848, + "grad_norm": 0.9151008725166321, + "learning_rate": 3.07843137254902e-05, + "loss": 0.5524, + "step": 9616 + }, + { + "epoch": 12.30976, + "grad_norm": 0.9026890993118286, + "learning_rate": 3.078231292517007e-05, + "loss": 0.5102, + "step": 9617 + }, + { + "epoch": 12.31104, + "grad_norm": 0.9061116576194763, + "learning_rate": 3.078031212484994e-05, + "loss": 0.5607, + "step": 9618 + }, + { + "epoch": 12.31232, + "grad_norm": 0.893916666507721, + "learning_rate": 3.077831132452981e-05, + "loss": 0.4953, + "step": 9619 + }, + { + "epoch": 12.3136, + "grad_norm": 0.9334031343460083, + "learning_rate": 3.077631052420968e-05, + "loss": 0.5265, + "step": 9620 + }, + { + "epoch": 12.31488, + "grad_norm": 0.8903616070747375, + "learning_rate": 3.0774309723889555e-05, + "loss": 0.5208, + "step": 9621 + }, + { + "epoch": 12.31616, + "grad_norm": 0.9477413296699524, + "learning_rate": 3.0772308923569434e-05, + "loss": 0.5759, + "step": 9622 + }, + { + "epoch": 12.31744, + "grad_norm": 0.881305456161499, + "learning_rate": 3.0770308123249305e-05, + "loss": 0.531, + "step": 9623 + }, + { + "epoch": 12.31872, + "grad_norm": 0.9640594124794006, + "learning_rate": 3.076830732292918e-05, + "loss": 0.5694, + "step": 9624 + }, + { + "epoch": 12.32, + "grad_norm": 0.888465404510498, + "learning_rate": 3.076630652260904e-05, + "loss": 0.4961, + "step": 9625 + }, + { + "epoch": 12.32128, + "grad_norm": 0.8741172552108765, + "learning_rate": 
3.0764305722288914e-05, + "loss": 0.5258, + "step": 9626 + }, + { + "epoch": 12.32256, + "grad_norm": 0.9086135029792786, + "learning_rate": 3.0762304921968786e-05, + "loss": 0.545, + "step": 9627 + }, + { + "epoch": 12.32384, + "grad_norm": 0.8762311935424805, + "learning_rate": 3.076030412164866e-05, + "loss": 0.538, + "step": 9628 + }, + { + "epoch": 12.32512, + "grad_norm": 0.9073178172111511, + "learning_rate": 3.0758303321328537e-05, + "loss": 0.5466, + "step": 9629 + }, + { + "epoch": 12.3264, + "grad_norm": 0.866665780544281, + "learning_rate": 3.075630252100841e-05, + "loss": 0.545, + "step": 9630 + }, + { + "epoch": 12.32768, + "grad_norm": 0.9432623386383057, + "learning_rate": 3.075430172068828e-05, + "loss": 0.5641, + "step": 9631 + }, + { + "epoch": 12.32896, + "grad_norm": 0.9183183312416077, + "learning_rate": 3.075230092036815e-05, + "loss": 0.5232, + "step": 9632 + }, + { + "epoch": 12.33024, + "grad_norm": 0.9821698069572449, + "learning_rate": 3.075030012004802e-05, + "loss": 0.5631, + "step": 9633 + }, + { + "epoch": 12.33152, + "grad_norm": 0.9003911018371582, + "learning_rate": 3.074829931972789e-05, + "loss": 0.5493, + "step": 9634 + }, + { + "epoch": 12.3328, + "grad_norm": 0.9370520114898682, + "learning_rate": 3.074629851940776e-05, + "loss": 0.556, + "step": 9635 + }, + { + "epoch": 12.33408, + "grad_norm": 0.8950033187866211, + "learning_rate": 3.074429771908764e-05, + "loss": 0.5258, + "step": 9636 + }, + { + "epoch": 12.33536, + "grad_norm": 0.9155859351158142, + "learning_rate": 3.074229691876751e-05, + "loss": 0.5623, + "step": 9637 + }, + { + "epoch": 12.33664, + "grad_norm": 0.929165244102478, + "learning_rate": 3.074029611844738e-05, + "loss": 0.5535, + "step": 9638 + }, + { + "epoch": 12.33792, + "grad_norm": 0.9462549686431885, + "learning_rate": 3.0738295318127255e-05, + "loss": 0.5687, + "step": 9639 + }, + { + "epoch": 12.3392, + "grad_norm": 0.9272847175598145, + "learning_rate": 3.073629451780713e-05, + "loss": 0.5393, + 
"step": 9640 + }, + { + "epoch": 12.34048, + "grad_norm": 0.8985689282417297, + "learning_rate": 3.073429371748699e-05, + "loss": 0.5385, + "step": 9641 + }, + { + "epoch": 12.34176, + "grad_norm": 0.9289854764938354, + "learning_rate": 3.0732292917166864e-05, + "loss": 0.5499, + "step": 9642 + }, + { + "epoch": 12.34304, + "grad_norm": 0.9333150386810303, + "learning_rate": 3.073029211684674e-05, + "loss": 0.5588, + "step": 9643 + }, + { + "epoch": 12.34432, + "grad_norm": 0.9284766316413879, + "learning_rate": 3.0728291316526614e-05, + "loss": 0.5602, + "step": 9644 + }, + { + "epoch": 12.3456, + "grad_norm": 0.9135120511054993, + "learning_rate": 3.0726290516206486e-05, + "loss": 0.5444, + "step": 9645 + }, + { + "epoch": 12.34688, + "grad_norm": 0.8909054398536682, + "learning_rate": 3.072428971588636e-05, + "loss": 0.553, + "step": 9646 + }, + { + "epoch": 12.34816, + "grad_norm": 0.9332317113876343, + "learning_rate": 3.072228891556623e-05, + "loss": 0.558, + "step": 9647 + }, + { + "epoch": 12.34944, + "grad_norm": 0.8675878643989563, + "learning_rate": 3.07202881152461e-05, + "loss": 0.5093, + "step": 9648 + }, + { + "epoch": 12.35072, + "grad_norm": 0.9375959634780884, + "learning_rate": 3.071828731492597e-05, + "loss": 0.5277, + "step": 9649 + }, + { + "epoch": 12.352, + "grad_norm": 0.9183427691459656, + "learning_rate": 3.0716286514605846e-05, + "loss": 0.5171, + "step": 9650 + }, + { + "epoch": 12.35328, + "grad_norm": 0.9373577237129211, + "learning_rate": 3.071428571428572e-05, + "loss": 0.5495, + "step": 9651 + }, + { + "epoch": 12.35456, + "grad_norm": 0.9106853604316711, + "learning_rate": 3.071228491396559e-05, + "loss": 0.512, + "step": 9652 + }, + { + "epoch": 12.35584, + "grad_norm": 0.9207229614257812, + "learning_rate": 3.071028411364546e-05, + "loss": 0.538, + "step": 9653 + }, + { + "epoch": 12.35712, + "grad_norm": 0.9344127178192139, + "learning_rate": 3.070828331332533e-05, + "loss": 0.5688, + "step": 9654 + }, + { + "epoch": 12.3584, + 
"grad_norm": 0.9381983280181885, + "learning_rate": 3.0706282513005205e-05, + "loss": 0.5687, + "step": 9655 + }, + { + "epoch": 12.35968, + "grad_norm": 0.9426577687263489, + "learning_rate": 3.070428171268508e-05, + "loss": 0.5498, + "step": 9656 + }, + { + "epoch": 12.36096, + "grad_norm": 0.9524323344230652, + "learning_rate": 3.070228091236495e-05, + "loss": 0.5394, + "step": 9657 + }, + { + "epoch": 12.36224, + "grad_norm": 0.9800800681114197, + "learning_rate": 3.070028011204482e-05, + "loss": 0.5799, + "step": 9658 + }, + { + "epoch": 12.36352, + "grad_norm": 0.9301595687866211, + "learning_rate": 3.069827931172469e-05, + "loss": 0.551, + "step": 9659 + }, + { + "epoch": 12.3648, + "grad_norm": 0.9066020250320435, + "learning_rate": 3.0696278511404564e-05, + "loss": 0.5535, + "step": 9660 + }, + { + "epoch": 12.36608, + "grad_norm": 0.9361892938613892, + "learning_rate": 3.0694277711084436e-05, + "loss": 0.5224, + "step": 9661 + }, + { + "epoch": 12.36736, + "grad_norm": 0.9013199806213379, + "learning_rate": 3.069227691076431e-05, + "loss": 0.5531, + "step": 9662 + }, + { + "epoch": 12.36864, + "grad_norm": 0.9619473814964294, + "learning_rate": 3.069027611044418e-05, + "loss": 0.5865, + "step": 9663 + }, + { + "epoch": 12.36992, + "grad_norm": 0.9332448244094849, + "learning_rate": 3.068827531012405e-05, + "loss": 0.528, + "step": 9664 + }, + { + "epoch": 12.3712, + "grad_norm": 0.9028695821762085, + "learning_rate": 3.0686274509803923e-05, + "loss": 0.5707, + "step": 9665 + }, + { + "epoch": 12.37248, + "grad_norm": 0.9249699115753174, + "learning_rate": 3.0684273709483795e-05, + "loss": 0.5385, + "step": 9666 + }, + { + "epoch": 12.37376, + "grad_norm": 0.8583320379257202, + "learning_rate": 3.068227290916367e-05, + "loss": 0.5264, + "step": 9667 + }, + { + "epoch": 12.37504, + "grad_norm": 0.8901596069335938, + "learning_rate": 3.068027210884354e-05, + "loss": 0.5339, + "step": 9668 + }, + { + "epoch": 12.37632, + "grad_norm": 0.9462403059005737, + 
"learning_rate": 3.067827130852341e-05, + "loss": 0.5604, + "step": 9669 + }, + { + "epoch": 12.3776, + "grad_norm": 0.9234113693237305, + "learning_rate": 3.067627050820328e-05, + "loss": 0.5289, + "step": 9670 + }, + { + "epoch": 12.37888, + "grad_norm": 0.9540828466415405, + "learning_rate": 3.0674269707883155e-05, + "loss": 0.5867, + "step": 9671 + }, + { + "epoch": 12.38016, + "grad_norm": 0.936007559299469, + "learning_rate": 3.0672268907563026e-05, + "loss": 0.5348, + "step": 9672 + }, + { + "epoch": 12.38144, + "grad_norm": 0.9366984367370605, + "learning_rate": 3.06702681072429e-05, + "loss": 0.52, + "step": 9673 + }, + { + "epoch": 12.38272, + "grad_norm": 0.9363959431648254, + "learning_rate": 3.066826730692277e-05, + "loss": 0.5781, + "step": 9674 + }, + { + "epoch": 12.384, + "grad_norm": 0.947193443775177, + "learning_rate": 3.066626650660264e-05, + "loss": 0.5687, + "step": 9675 + }, + { + "epoch": 12.38528, + "grad_norm": 0.8775532841682434, + "learning_rate": 3.0664265706282514e-05, + "loss": 0.5269, + "step": 9676 + }, + { + "epoch": 12.38656, + "grad_norm": 0.8915206789970398, + "learning_rate": 3.0662264905962386e-05, + "loss": 0.5509, + "step": 9677 + }, + { + "epoch": 12.38784, + "grad_norm": 0.9614311456680298, + "learning_rate": 3.066026410564226e-05, + "loss": 0.6082, + "step": 9678 + }, + { + "epoch": 12.38912, + "grad_norm": 0.9689681529998779, + "learning_rate": 3.065826330532213e-05, + "loss": 0.5464, + "step": 9679 + }, + { + "epoch": 12.3904, + "grad_norm": 0.9640049338340759, + "learning_rate": 3.0656262505002e-05, + "loss": 0.5523, + "step": 9680 + }, + { + "epoch": 12.39168, + "grad_norm": 0.8928719758987427, + "learning_rate": 3.065426170468187e-05, + "loss": 0.568, + "step": 9681 + }, + { + "epoch": 12.39296, + "grad_norm": 0.9739533066749573, + "learning_rate": 3.0652260904361745e-05, + "loss": 0.5606, + "step": 9682 + }, + { + "epoch": 12.39424, + "grad_norm": 0.894177258014679, + "learning_rate": 3.065026010404162e-05, + 
"loss": 0.5587, + "step": 9683 + }, + { + "epoch": 12.39552, + "grad_norm": 0.9096843600273132, + "learning_rate": 3.064825930372149e-05, + "loss": 0.5484, + "step": 9684 + }, + { + "epoch": 12.3968, + "grad_norm": 0.862076997756958, + "learning_rate": 3.064625850340136e-05, + "loss": 0.4924, + "step": 9685 + }, + { + "epoch": 12.39808, + "grad_norm": 0.8842014670372009, + "learning_rate": 3.064425770308124e-05, + "loss": 0.5408, + "step": 9686 + }, + { + "epoch": 12.39936, + "grad_norm": 0.9338904023170471, + "learning_rate": 3.0642256902761104e-05, + "loss": 0.556, + "step": 9687 + }, + { + "epoch": 12.40064, + "grad_norm": 0.9398663640022278, + "learning_rate": 3.0640256102440976e-05, + "loss": 0.5535, + "step": 9688 + }, + { + "epoch": 12.40192, + "grad_norm": 0.944598913192749, + "learning_rate": 3.063825530212085e-05, + "loss": 0.5534, + "step": 9689 + }, + { + "epoch": 12.4032, + "grad_norm": 0.9525898098945618, + "learning_rate": 3.063625450180072e-05, + "loss": 0.5571, + "step": 9690 + }, + { + "epoch": 12.40448, + "grad_norm": 0.9750270247459412, + "learning_rate": 3.063425370148059e-05, + "loss": 0.6101, + "step": 9691 + }, + { + "epoch": 12.40576, + "grad_norm": 0.9584131240844727, + "learning_rate": 3.0632252901160464e-05, + "loss": 0.5429, + "step": 9692 + }, + { + "epoch": 12.40704, + "grad_norm": 0.8970118165016174, + "learning_rate": 3.063025210084034e-05, + "loss": 0.537, + "step": 9693 + }, + { + "epoch": 12.40832, + "grad_norm": 0.9430438876152039, + "learning_rate": 3.0628251300520214e-05, + "loss": 0.5364, + "step": 9694 + }, + { + "epoch": 12.4096, + "grad_norm": 0.9645063281059265, + "learning_rate": 3.062625050020008e-05, + "loss": 0.5354, + "step": 9695 + }, + { + "epoch": 12.41088, + "grad_norm": 0.9782135486602783, + "learning_rate": 3.062424969987995e-05, + "loss": 0.551, + "step": 9696 + }, + { + "epoch": 12.41216, + "grad_norm": 0.9144942760467529, + "learning_rate": 3.062224889955982e-05, + "loss": 0.5719, + "step": 9697 + }, + { + 
"epoch": 12.41344, + "grad_norm": 0.9149807691574097, + "learning_rate": 3.0620248099239695e-05, + "loss": 0.4937, + "step": 9698 + }, + { + "epoch": 12.414719999999999, + "grad_norm": 0.9331946969032288, + "learning_rate": 3.0618247298919567e-05, + "loss": 0.5472, + "step": 9699 + }, + { + "epoch": 12.416, + "grad_norm": 0.9157038927078247, + "learning_rate": 3.0616246498599445e-05, + "loss": 0.5839, + "step": 9700 + }, + { + "epoch": 12.41728, + "grad_norm": 0.955386757850647, + "learning_rate": 3.061424569827932e-05, + "loss": 0.5573, + "step": 9701 + }, + { + "epoch": 12.41856, + "grad_norm": 0.9963140487670898, + "learning_rate": 3.061224489795919e-05, + "loss": 0.5524, + "step": 9702 + }, + { + "epoch": 12.41984, + "grad_norm": 0.9035356044769287, + "learning_rate": 3.0610244097639054e-05, + "loss": 0.5279, + "step": 9703 + }, + { + "epoch": 12.42112, + "grad_norm": 0.885988712310791, + "learning_rate": 3.0608243297318926e-05, + "loss": 0.5387, + "step": 9704 + }, + { + "epoch": 12.4224, + "grad_norm": 0.9754456281661987, + "learning_rate": 3.06062424969988e-05, + "loss": 0.5724, + "step": 9705 + }, + { + "epoch": 12.42368, + "grad_norm": 0.8800594806671143, + "learning_rate": 3.060424169667867e-05, + "loss": 0.503, + "step": 9706 + }, + { + "epoch": 12.42496, + "grad_norm": 0.9732749462127686, + "learning_rate": 3.060224089635855e-05, + "loss": 0.5478, + "step": 9707 + }, + { + "epoch": 12.42624, + "grad_norm": 0.8836948871612549, + "learning_rate": 3.060024009603842e-05, + "loss": 0.5481, + "step": 9708 + }, + { + "epoch": 12.42752, + "grad_norm": 0.8984959721565247, + "learning_rate": 3.059823929571829e-05, + "loss": 0.57, + "step": 9709 + }, + { + "epoch": 12.4288, + "grad_norm": 0.9155954122543335, + "learning_rate": 3.0596238495398164e-05, + "loss": 0.5659, + "step": 9710 + }, + { + "epoch": 12.43008, + "grad_norm": 0.897638201713562, + "learning_rate": 3.059423769507803e-05, + "loss": 0.5256, + "step": 9711 + }, + { + "epoch": 12.43136, + "grad_norm": 
0.9188939929008484, + "learning_rate": 3.05922368947579e-05, + "loss": 0.575, + "step": 9712 + }, + { + "epoch": 12.43264, + "grad_norm": 0.9399385452270508, + "learning_rate": 3.059023609443777e-05, + "loss": 0.569, + "step": 9713 + }, + { + "epoch": 12.43392, + "grad_norm": 0.9348775148391724, + "learning_rate": 3.058823529411765e-05, + "loss": 0.5243, + "step": 9714 + }, + { + "epoch": 12.4352, + "grad_norm": 0.9413948655128479, + "learning_rate": 3.058623449379752e-05, + "loss": 0.5641, + "step": 9715 + }, + { + "epoch": 12.43648, + "grad_norm": 0.9737404584884644, + "learning_rate": 3.0584233693477395e-05, + "loss": 0.5496, + "step": 9716 + }, + { + "epoch": 12.43776, + "grad_norm": 0.8885311484336853, + "learning_rate": 3.058223289315727e-05, + "loss": 0.5279, + "step": 9717 + }, + { + "epoch": 12.43904, + "grad_norm": 0.9236472249031067, + "learning_rate": 3.058023209283714e-05, + "loss": 0.5602, + "step": 9718 + }, + { + "epoch": 12.44032, + "grad_norm": 0.9534575343132019, + "learning_rate": 3.0578231292517004e-05, + "loss": 0.5359, + "step": 9719 + }, + { + "epoch": 12.4416, + "grad_norm": 0.9509499073028564, + "learning_rate": 3.0576230492196876e-05, + "loss": 0.5428, + "step": 9720 + }, + { + "epoch": 12.44288, + "grad_norm": 0.9832822680473328, + "learning_rate": 3.0574229691876754e-05, + "loss": 0.5792, + "step": 9721 + }, + { + "epoch": 12.44416, + "grad_norm": 0.8691754937171936, + "learning_rate": 3.0572228891556626e-05, + "loss": 0.4652, + "step": 9722 + }, + { + "epoch": 12.44544, + "grad_norm": 0.9270925521850586, + "learning_rate": 3.05702280912365e-05, + "loss": 0.5903, + "step": 9723 + }, + { + "epoch": 12.44672, + "grad_norm": 0.9018378853797913, + "learning_rate": 3.056822729091637e-05, + "loss": 0.5811, + "step": 9724 + }, + { + "epoch": 12.448, + "grad_norm": 0.8946050405502319, + "learning_rate": 3.056622649059624e-05, + "loss": 0.5234, + "step": 9725 + }, + { + "epoch": 12.44928, + "grad_norm": 0.9452985525131226, + "learning_rate": 
3.0564225690276113e-05, + "loss": 0.5267, + "step": 9726 + }, + { + "epoch": 12.45056, + "grad_norm": 0.9652531147003174, + "learning_rate": 3.056222488995598e-05, + "loss": 0.582, + "step": 9727 + }, + { + "epoch": 12.45184, + "grad_norm": 0.9452758431434631, + "learning_rate": 3.056022408963586e-05, + "loss": 0.6246, + "step": 9728 + }, + { + "epoch": 12.45312, + "grad_norm": 0.9329007863998413, + "learning_rate": 3.055822328931573e-05, + "loss": 0.535, + "step": 9729 + }, + { + "epoch": 12.4544, + "grad_norm": 0.9430539011955261, + "learning_rate": 3.05562224889956e-05, + "loss": 0.5769, + "step": 9730 + }, + { + "epoch": 12.45568, + "grad_norm": 0.9002211689949036, + "learning_rate": 3.055422168867547e-05, + "loss": 0.5709, + "step": 9731 + }, + { + "epoch": 12.45696, + "grad_norm": 0.8887854814529419, + "learning_rate": 3.0552220888355345e-05, + "loss": 0.5366, + "step": 9732 + }, + { + "epoch": 12.45824, + "grad_norm": 0.8703662753105164, + "learning_rate": 3.0550220088035216e-05, + "loss": 0.4981, + "step": 9733 + }, + { + "epoch": 12.45952, + "grad_norm": 0.9427342414855957, + "learning_rate": 3.054821928771509e-05, + "loss": 0.5584, + "step": 9734 + }, + { + "epoch": 12.4608, + "grad_norm": 0.9092150926589966, + "learning_rate": 3.054621848739496e-05, + "loss": 0.5098, + "step": 9735 + }, + { + "epoch": 12.46208, + "grad_norm": 0.9247660636901855, + "learning_rate": 3.054421768707483e-05, + "loss": 0.5255, + "step": 9736 + }, + { + "epoch": 12.46336, + "grad_norm": 0.8831589221954346, + "learning_rate": 3.0542216886754704e-05, + "loss": 0.5409, + "step": 9737 + }, + { + "epoch": 12.46464, + "grad_norm": 0.9176967740058899, + "learning_rate": 3.0540216086434576e-05, + "loss": 0.5467, + "step": 9738 + }, + { + "epoch": 12.46592, + "grad_norm": 0.9632626175880432, + "learning_rate": 3.053821528611445e-05, + "loss": 0.5785, + "step": 9739 + }, + { + "epoch": 12.4672, + "grad_norm": 0.9407864212989807, + "learning_rate": 3.053621448579432e-05, + "loss": 0.5921, 
+ "step": 9740 + }, + { + "epoch": 12.46848, + "grad_norm": 0.9391652941703796, + "learning_rate": 3.053421368547419e-05, + "loss": 0.5707, + "step": 9741 + }, + { + "epoch": 12.46976, + "grad_norm": 0.9201903939247131, + "learning_rate": 3.053221288515406e-05, + "loss": 0.5635, + "step": 9742 + }, + { + "epoch": 12.47104, + "grad_norm": 0.9038977026939392, + "learning_rate": 3.0530212084833935e-05, + "loss": 0.5281, + "step": 9743 + }, + { + "epoch": 12.47232, + "grad_norm": 0.8993562459945679, + "learning_rate": 3.052821128451381e-05, + "loss": 0.5565, + "step": 9744 + }, + { + "epoch": 12.4736, + "grad_norm": 0.9167428016662598, + "learning_rate": 3.052621048419368e-05, + "loss": 0.6111, + "step": 9745 + }, + { + "epoch": 12.47488, + "grad_norm": 0.9848437905311584, + "learning_rate": 3.052420968387355e-05, + "loss": 0.54, + "step": 9746 + }, + { + "epoch": 12.47616, + "grad_norm": 0.937060534954071, + "learning_rate": 3.052220888355342e-05, + "loss": 0.5115, + "step": 9747 + }, + { + "epoch": 12.47744, + "grad_norm": 0.9132764935493469, + "learning_rate": 3.0520208083233294e-05, + "loss": 0.5782, + "step": 9748 + }, + { + "epoch": 12.47872, + "grad_norm": 0.89909428358078, + "learning_rate": 3.0518207282913166e-05, + "loss": 0.5177, + "step": 9749 + }, + { + "epoch": 12.48, + "grad_norm": 1.0129623413085938, + "learning_rate": 3.051620648259304e-05, + "loss": 0.6329, + "step": 9750 + }, + { + "epoch": 12.48128, + "grad_norm": 0.9289687871932983, + "learning_rate": 3.0514205682272913e-05, + "loss": 0.4931, + "step": 9751 + }, + { + "epoch": 12.48256, + "grad_norm": 0.8912503123283386, + "learning_rate": 3.0512204881952782e-05, + "loss": 0.4795, + "step": 9752 + }, + { + "epoch": 12.48384, + "grad_norm": 0.9053838849067688, + "learning_rate": 3.0510204081632654e-05, + "loss": 0.5524, + "step": 9753 + }, + { + "epoch": 12.48512, + "grad_norm": 0.9058827757835388, + "learning_rate": 3.0508203281312525e-05, + "loss": 0.5323, + "step": 9754 + }, + { + "epoch": 
12.4864, + "grad_norm": 0.9562934041023254, + "learning_rate": 3.0506202480992397e-05, + "loss": 0.5378, + "step": 9755 + }, + { + "epoch": 12.48768, + "grad_norm": 0.8920333981513977, + "learning_rate": 3.0504201680672273e-05, + "loss": 0.516, + "step": 9756 + }, + { + "epoch": 12.48896, + "grad_norm": 0.909192681312561, + "learning_rate": 3.0502200880352144e-05, + "loss": 0.5491, + "step": 9757 + }, + { + "epoch": 12.49024, + "grad_norm": 0.8839040398597717, + "learning_rate": 3.0500200080032016e-05, + "loss": 0.5202, + "step": 9758 + }, + { + "epoch": 12.49152, + "grad_norm": 0.9545120000839233, + "learning_rate": 3.0498199279711888e-05, + "loss": 0.5876, + "step": 9759 + }, + { + "epoch": 12.4928, + "grad_norm": 0.885185182094574, + "learning_rate": 3.0496198479391757e-05, + "loss": 0.5223, + "step": 9760 + }, + { + "epoch": 12.49408, + "grad_norm": 0.9367235898971558, + "learning_rate": 3.049419767907163e-05, + "loss": 0.5421, + "step": 9761 + }, + { + "epoch": 12.49536, + "grad_norm": 0.9383306503295898, + "learning_rate": 3.04921968787515e-05, + "loss": 0.5907, + "step": 9762 + }, + { + "epoch": 12.49664, + "grad_norm": 0.869220495223999, + "learning_rate": 3.0490196078431376e-05, + "loss": 0.5031, + "step": 9763 + }, + { + "epoch": 12.49792, + "grad_norm": 0.9352115392684937, + "learning_rate": 3.0488195278111247e-05, + "loss": 0.559, + "step": 9764 + }, + { + "epoch": 12.4992, + "grad_norm": 0.8659606575965881, + "learning_rate": 3.048619447779112e-05, + "loss": 0.5303, + "step": 9765 + }, + { + "epoch": 12.50048, + "grad_norm": 0.921378493309021, + "learning_rate": 3.048419367747099e-05, + "loss": 0.5447, + "step": 9766 + }, + { + "epoch": 12.50176, + "grad_norm": 0.8634151816368103, + "learning_rate": 3.0482192877150863e-05, + "loss": 0.5637, + "step": 9767 + }, + { + "epoch": 12.50304, + "grad_norm": 0.9479573965072632, + "learning_rate": 3.048019207683073e-05, + "loss": 0.5185, + "step": 9768 + }, + { + "epoch": 12.50432, + "grad_norm": 
0.9313710927963257, + "learning_rate": 3.0478191276510603e-05, + "loss": 0.5721, + "step": 9769 + }, + { + "epoch": 12.5056, + "grad_norm": 0.9208516478538513, + "learning_rate": 3.0476190476190482e-05, + "loss": 0.5565, + "step": 9770 + }, + { + "epoch": 12.50688, + "grad_norm": 0.9547169804573059, + "learning_rate": 3.047418967587035e-05, + "loss": 0.5426, + "step": 9771 + }, + { + "epoch": 12.50816, + "grad_norm": 0.8872602581977844, + "learning_rate": 3.0472188875550222e-05, + "loss": 0.5087, + "step": 9772 + }, + { + "epoch": 12.50944, + "grad_norm": 0.8986743092536926, + "learning_rate": 3.0470188075230094e-05, + "loss": 0.5241, + "step": 9773 + }, + { + "epoch": 12.51072, + "grad_norm": 0.9405331015586853, + "learning_rate": 3.0468187274909966e-05, + "loss": 0.5672, + "step": 9774 + }, + { + "epoch": 12.512, + "grad_norm": 0.9390310645103455, + "learning_rate": 3.0466186474589838e-05, + "loss": 0.5732, + "step": 9775 + }, + { + "epoch": 12.51328, + "grad_norm": 0.994657039642334, + "learning_rate": 3.0464185674269706e-05, + "loss": 0.5688, + "step": 9776 + }, + { + "epoch": 12.51456, + "grad_norm": 0.9204804301261902, + "learning_rate": 3.0462184873949578e-05, + "loss": 0.6019, + "step": 9777 + }, + { + "epoch": 12.51584, + "grad_norm": 0.9733226299285889, + "learning_rate": 3.0460184073629457e-05, + "loss": 0.5185, + "step": 9778 + }, + { + "epoch": 12.51712, + "grad_norm": 0.9087737202644348, + "learning_rate": 3.0458183273309325e-05, + "loss": 0.5242, + "step": 9779 + }, + { + "epoch": 12.5184, + "grad_norm": 0.9519286751747131, + "learning_rate": 3.0456182472989197e-05, + "loss": 0.5675, + "step": 9780 + }, + { + "epoch": 12.51968, + "grad_norm": 0.899489164352417, + "learning_rate": 3.045418167266907e-05, + "loss": 0.5307, + "step": 9781 + }, + { + "epoch": 12.52096, + "grad_norm": 0.926542341709137, + "learning_rate": 3.045218087234894e-05, + "loss": 0.5442, + "step": 9782 + }, + { + "epoch": 12.52224, + "grad_norm": 0.8883079290390015, + 
"learning_rate": 3.0450180072028813e-05, + "loss": 0.5385, + "step": 9783 + }, + { + "epoch": 12.52352, + "grad_norm": 0.9173272252082825, + "learning_rate": 3.044817927170868e-05, + "loss": 0.5661, + "step": 9784 + }, + { + "epoch": 12.5248, + "grad_norm": 1.0070183277130127, + "learning_rate": 3.044617847138856e-05, + "loss": 0.5957, + "step": 9785 + }, + { + "epoch": 12.52608, + "grad_norm": 0.8930588364601135, + "learning_rate": 3.044417767106843e-05, + "loss": 0.5212, + "step": 9786 + }, + { + "epoch": 12.52736, + "grad_norm": 0.883655846118927, + "learning_rate": 3.04421768707483e-05, + "loss": 0.547, + "step": 9787 + }, + { + "epoch": 12.52864, + "grad_norm": 0.917133092880249, + "learning_rate": 3.0440176070428172e-05, + "loss": 0.4803, + "step": 9788 + }, + { + "epoch": 12.52992, + "grad_norm": 0.9094095826148987, + "learning_rate": 3.0438175270108044e-05, + "loss": 0.544, + "step": 9789 + }, + { + "epoch": 12.5312, + "grad_norm": 0.977846622467041, + "learning_rate": 3.0436174469787916e-05, + "loss": 0.6179, + "step": 9790 + }, + { + "epoch": 12.53248, + "grad_norm": 0.9368249177932739, + "learning_rate": 3.0434173669467788e-05, + "loss": 0.5934, + "step": 9791 + }, + { + "epoch": 12.533760000000001, + "grad_norm": 0.8966599106788635, + "learning_rate": 3.0432172869147663e-05, + "loss": 0.55, + "step": 9792 + }, + { + "epoch": 12.53504, + "grad_norm": 0.9707033038139343, + "learning_rate": 3.0430172068827535e-05, + "loss": 0.5788, + "step": 9793 + }, + { + "epoch": 12.53632, + "grad_norm": 0.9984316825866699, + "learning_rate": 3.0428171268507406e-05, + "loss": 0.6074, + "step": 9794 + }, + { + "epoch": 12.5376, + "grad_norm": 0.9540078043937683, + "learning_rate": 3.0426170468187275e-05, + "loss": 0.5769, + "step": 9795 + }, + { + "epoch": 12.53888, + "grad_norm": 0.9404662847518921, + "learning_rate": 3.0424169667867147e-05, + "loss": 0.5381, + "step": 9796 + }, + { + "epoch": 12.54016, + "grad_norm": 0.8832229375839233, + "learning_rate": 
3.042216886754702e-05, + "loss": 0.5524, + "step": 9797 + }, + { + "epoch": 12.54144, + "grad_norm": 0.9047516584396362, + "learning_rate": 3.042016806722689e-05, + "loss": 0.5836, + "step": 9798 + }, + { + "epoch": 12.54272, + "grad_norm": 0.9186232089996338, + "learning_rate": 3.0418167266906766e-05, + "loss": 0.5923, + "step": 9799 + }, + { + "epoch": 12.544, + "grad_norm": 1.0132439136505127, + "learning_rate": 3.0416166466586638e-05, + "loss": 0.5755, + "step": 9800 + }, + { + "epoch": 12.54528, + "grad_norm": 0.9297081232070923, + "learning_rate": 3.041416566626651e-05, + "loss": 0.5604, + "step": 9801 + }, + { + "epoch": 12.54656, + "grad_norm": 0.9263479113578796, + "learning_rate": 3.041216486594638e-05, + "loss": 0.5238, + "step": 9802 + }, + { + "epoch": 12.54784, + "grad_norm": 0.964608371257782, + "learning_rate": 3.041016406562625e-05, + "loss": 0.5523, + "step": 9803 + }, + { + "epoch": 12.54912, + "grad_norm": 0.8875142931938171, + "learning_rate": 3.040816326530612e-05, + "loss": 0.4931, + "step": 9804 + }, + { + "epoch": 12.5504, + "grad_norm": 0.9739357829093933, + "learning_rate": 3.0406162464985994e-05, + "loss": 0.5274, + "step": 9805 + }, + { + "epoch": 12.55168, + "grad_norm": 0.8982617855072021, + "learning_rate": 3.040416166466587e-05, + "loss": 0.5359, + "step": 9806 + }, + { + "epoch": 12.55296, + "grad_norm": 0.9386458396911621, + "learning_rate": 3.040216086434574e-05, + "loss": 0.5505, + "step": 9807 + }, + { + "epoch": 12.55424, + "grad_norm": 0.9533564448356628, + "learning_rate": 3.0400160064025612e-05, + "loss": 0.6093, + "step": 9808 + }, + { + "epoch": 12.55552, + "grad_norm": 0.9058338403701782, + "learning_rate": 3.0398159263705484e-05, + "loss": 0.5394, + "step": 9809 + }, + { + "epoch": 12.556799999999999, + "grad_norm": 1.01373291015625, + "learning_rate": 3.0396158463385356e-05, + "loss": 0.5627, + "step": 9810 + }, + { + "epoch": 12.55808, + "grad_norm": 1.0229073762893677, + "learning_rate": 3.0394157663065225e-05, + 
"loss": 0.5781, + "step": 9811 + }, + { + "epoch": 12.55936, + "grad_norm": 0.9454696178436279, + "learning_rate": 3.0392156862745097e-05, + "loss": 0.5293, + "step": 9812 + }, + { + "epoch": 12.56064, + "grad_norm": 0.9132083058357239, + "learning_rate": 3.0390156062424975e-05, + "loss": 0.546, + "step": 9813 + }, + { + "epoch": 12.56192, + "grad_norm": 0.90849369764328, + "learning_rate": 3.0388155262104844e-05, + "loss": 0.5221, + "step": 9814 + }, + { + "epoch": 12.5632, + "grad_norm": 0.9330136179924011, + "learning_rate": 3.0386154461784715e-05, + "loss": 0.5212, + "step": 9815 + }, + { + "epoch": 12.56448, + "grad_norm": 0.9285899996757507, + "learning_rate": 3.0384153661464587e-05, + "loss": 0.6005, + "step": 9816 + }, + { + "epoch": 12.565760000000001, + "grad_norm": 0.8947268724441528, + "learning_rate": 3.038215286114446e-05, + "loss": 0.5294, + "step": 9817 + }, + { + "epoch": 12.56704, + "grad_norm": 0.9373891949653625, + "learning_rate": 3.038015206082433e-05, + "loss": 0.5463, + "step": 9818 + }, + { + "epoch": 12.56832, + "grad_norm": 0.9649326205253601, + "learning_rate": 3.03781512605042e-05, + "loss": 0.5701, + "step": 9819 + }, + { + "epoch": 12.5696, + "grad_norm": 0.9598551392555237, + "learning_rate": 3.0376150460184078e-05, + "loss": 0.6032, + "step": 9820 + }, + { + "epoch": 12.57088, + "grad_norm": 1.0046522617340088, + "learning_rate": 3.037414965986395e-05, + "loss": 0.6048, + "step": 9821 + }, + { + "epoch": 12.57216, + "grad_norm": 0.9179285168647766, + "learning_rate": 3.037214885954382e-05, + "loss": 0.5717, + "step": 9822 + }, + { + "epoch": 12.57344, + "grad_norm": 0.9317951798439026, + "learning_rate": 3.037014805922369e-05, + "loss": 0.5137, + "step": 9823 + }, + { + "epoch": 12.57472, + "grad_norm": 0.9623469114303589, + "learning_rate": 3.0368147258903562e-05, + "loss": 0.564, + "step": 9824 + }, + { + "epoch": 12.576, + "grad_norm": 0.9758321046829224, + "learning_rate": 3.0366146458583434e-05, + "loss": 0.586, + "step": 9825 
+ }, + { + "epoch": 12.57728, + "grad_norm": 0.9277058839797974, + "learning_rate": 3.0364145658263306e-05, + "loss": 0.5431, + "step": 9826 + }, + { + "epoch": 12.57856, + "grad_norm": 0.9288407564163208, + "learning_rate": 3.036214485794318e-05, + "loss": 0.5268, + "step": 9827 + }, + { + "epoch": 12.57984, + "grad_norm": 0.9227582812309265, + "learning_rate": 3.0360144057623053e-05, + "loss": 0.5604, + "step": 9828 + }, + { + "epoch": 12.58112, + "grad_norm": 0.8422914743423462, + "learning_rate": 3.0358143257302925e-05, + "loss": 0.5244, + "step": 9829 + }, + { + "epoch": 12.5824, + "grad_norm": 0.9041396975517273, + "learning_rate": 3.0356142456982793e-05, + "loss": 0.558, + "step": 9830 + }, + { + "epoch": 12.58368, + "grad_norm": 0.9546091556549072, + "learning_rate": 3.0354141656662665e-05, + "loss": 0.5733, + "step": 9831 + }, + { + "epoch": 12.58496, + "grad_norm": 0.9388670325279236, + "learning_rate": 3.0352140856342537e-05, + "loss": 0.542, + "step": 9832 + }, + { + "epoch": 12.58624, + "grad_norm": 0.9128877520561218, + "learning_rate": 3.035014005602241e-05, + "loss": 0.5452, + "step": 9833 + }, + { + "epoch": 12.58752, + "grad_norm": 0.9351105690002441, + "learning_rate": 3.0348139255702284e-05, + "loss": 0.5283, + "step": 9834 + }, + { + "epoch": 12.588799999999999, + "grad_norm": 0.9028121829032898, + "learning_rate": 3.0346138455382156e-05, + "loss": 0.5479, + "step": 9835 + }, + { + "epoch": 12.59008, + "grad_norm": 0.9574125409126282, + "learning_rate": 3.0344137655062028e-05, + "loss": 0.5786, + "step": 9836 + }, + { + "epoch": 12.59136, + "grad_norm": 0.9389735460281372, + "learning_rate": 3.03421368547419e-05, + "loss": 0.5688, + "step": 9837 + }, + { + "epoch": 12.59264, + "grad_norm": 0.889968752861023, + "learning_rate": 3.0340136054421768e-05, + "loss": 0.5263, + "step": 9838 + }, + { + "epoch": 12.59392, + "grad_norm": 0.8959866762161255, + "learning_rate": 3.033813525410164e-05, + "loss": 0.5766, + "step": 9839 + }, + { + "epoch": 
12.5952, + "grad_norm": 0.9044232964515686, + "learning_rate": 3.0336134453781512e-05, + "loss": 0.536, + "step": 9840 + }, + { + "epoch": 12.59648, + "grad_norm": 0.896634042263031, + "learning_rate": 3.0334133653461387e-05, + "loss": 0.5389, + "step": 9841 + }, + { + "epoch": 12.59776, + "grad_norm": 0.9344484806060791, + "learning_rate": 3.033213285314126e-05, + "loss": 0.552, + "step": 9842 + }, + { + "epoch": 12.59904, + "grad_norm": 1.0194035768508911, + "learning_rate": 3.033013205282113e-05, + "loss": 0.5967, + "step": 9843 + }, + { + "epoch": 12.60032, + "grad_norm": 0.9549538493156433, + "learning_rate": 3.0328131252501003e-05, + "loss": 0.5707, + "step": 9844 + }, + { + "epoch": 12.6016, + "grad_norm": 0.9823207259178162, + "learning_rate": 3.0326130452180875e-05, + "loss": 0.5702, + "step": 9845 + }, + { + "epoch": 12.60288, + "grad_norm": 0.9882277846336365, + "learning_rate": 3.0324129651860743e-05, + "loss": 0.6005, + "step": 9846 + }, + { + "epoch": 12.60416, + "grad_norm": 0.8940028548240662, + "learning_rate": 3.0322128851540615e-05, + "loss": 0.5271, + "step": 9847 + }, + { + "epoch": 12.60544, + "grad_norm": 0.9345920085906982, + "learning_rate": 3.0320128051220494e-05, + "loss": 0.5547, + "step": 9848 + }, + { + "epoch": 12.60672, + "grad_norm": 0.8393700122833252, + "learning_rate": 3.0318127250900362e-05, + "loss": 0.5294, + "step": 9849 + }, + { + "epoch": 12.608, + "grad_norm": 0.9185911417007446, + "learning_rate": 3.0316126450580234e-05, + "loss": 0.5218, + "step": 9850 + }, + { + "epoch": 12.60928, + "grad_norm": 0.9599105715751648, + "learning_rate": 3.0314125650260106e-05, + "loss": 0.5365, + "step": 9851 + }, + { + "epoch": 12.61056, + "grad_norm": 0.9423932433128357, + "learning_rate": 3.0312124849939978e-05, + "loss": 0.5799, + "step": 9852 + }, + { + "epoch": 12.61184, + "grad_norm": 0.9011184573173523, + "learning_rate": 3.031012404961985e-05, + "loss": 0.5154, + "step": 9853 + }, + { + "epoch": 12.61312, + "grad_norm": 
0.9672764539718628, + "learning_rate": 3.0308123249299718e-05, + "loss": 0.5381, + "step": 9854 + }, + { + "epoch": 12.6144, + "grad_norm": 0.9262174963951111, + "learning_rate": 3.0306122448979597e-05, + "loss": 0.5639, + "step": 9855 + }, + { + "epoch": 12.61568, + "grad_norm": 0.9052280783653259, + "learning_rate": 3.030412164865947e-05, + "loss": 0.5398, + "step": 9856 + }, + { + "epoch": 12.61696, + "grad_norm": 0.9824522733688354, + "learning_rate": 3.0302120848339337e-05, + "loss": 0.5554, + "step": 9857 + }, + { + "epoch": 12.61824, + "grad_norm": 0.9158837795257568, + "learning_rate": 3.030012004801921e-05, + "loss": 0.5455, + "step": 9858 + }, + { + "epoch": 12.61952, + "grad_norm": 0.8802503943443298, + "learning_rate": 3.029811924769908e-05, + "loss": 0.5605, + "step": 9859 + }, + { + "epoch": 12.6208, + "grad_norm": 0.9751753807067871, + "learning_rate": 3.0296118447378952e-05, + "loss": 0.6144, + "step": 9860 + }, + { + "epoch": 12.62208, + "grad_norm": 0.9232403039932251, + "learning_rate": 3.0294117647058824e-05, + "loss": 0.5928, + "step": 9861 + }, + { + "epoch": 12.62336, + "grad_norm": 0.9599583148956299, + "learning_rate": 3.02921168467387e-05, + "loss": 0.5603, + "step": 9862 + }, + { + "epoch": 12.62464, + "grad_norm": 0.8870642185211182, + "learning_rate": 3.029011604641857e-05, + "loss": 0.5307, + "step": 9863 + }, + { + "epoch": 12.62592, + "grad_norm": 0.9634974598884583, + "learning_rate": 3.0288115246098443e-05, + "loss": 0.5891, + "step": 9864 + }, + { + "epoch": 12.6272, + "grad_norm": 0.8991132974624634, + "learning_rate": 3.028611444577831e-05, + "loss": 0.5648, + "step": 9865 + }, + { + "epoch": 12.62848, + "grad_norm": 0.8831954002380371, + "learning_rate": 3.0284113645458184e-05, + "loss": 0.5817, + "step": 9866 + }, + { + "epoch": 12.62976, + "grad_norm": 0.9551595449447632, + "learning_rate": 3.0282112845138055e-05, + "loss": 0.5596, + "step": 9867 + }, + { + "epoch": 12.63104, + "grad_norm": 0.9024450182914734, + 
"learning_rate": 3.0280112044817927e-05, + "loss": 0.5449, + "step": 9868 + }, + { + "epoch": 12.63232, + "grad_norm": 0.8698076009750366, + "learning_rate": 3.0278111244497803e-05, + "loss": 0.5622, + "step": 9869 + }, + { + "epoch": 12.6336, + "grad_norm": 0.9642755389213562, + "learning_rate": 3.0276110444177674e-05, + "loss": 0.5489, + "step": 9870 + }, + { + "epoch": 12.63488, + "grad_norm": 0.8824167251586914, + "learning_rate": 3.0274109643857546e-05, + "loss": 0.5393, + "step": 9871 + }, + { + "epoch": 12.63616, + "grad_norm": 0.9382432103157043, + "learning_rate": 3.0272108843537418e-05, + "loss": 0.5089, + "step": 9872 + }, + { + "epoch": 12.63744, + "grad_norm": 0.9504085779190063, + "learning_rate": 3.0270108043217287e-05, + "loss": 0.5866, + "step": 9873 + }, + { + "epoch": 12.63872, + "grad_norm": 0.9067440032958984, + "learning_rate": 3.026810724289716e-05, + "loss": 0.5128, + "step": 9874 + }, + { + "epoch": 12.64, + "grad_norm": 0.8995640277862549, + "learning_rate": 3.026610644257703e-05, + "loss": 0.5669, + "step": 9875 + }, + { + "epoch": 12.64128, + "grad_norm": 0.919801652431488, + "learning_rate": 3.0264105642256906e-05, + "loss": 0.5359, + "step": 9876 + }, + { + "epoch": 12.64256, + "grad_norm": 0.8864008188247681, + "learning_rate": 3.0262104841936777e-05, + "loss": 0.5566, + "step": 9877 + }, + { + "epoch": 12.64384, + "grad_norm": 0.940040111541748, + "learning_rate": 3.026010404161665e-05, + "loss": 0.4943, + "step": 9878 + }, + { + "epoch": 12.64512, + "grad_norm": 0.9585533738136292, + "learning_rate": 3.025810324129652e-05, + "loss": 0.6033, + "step": 9879 + }, + { + "epoch": 12.6464, + "grad_norm": 0.9533222317695618, + "learning_rate": 3.0256102440976393e-05, + "loss": 0.5542, + "step": 9880 + }, + { + "epoch": 12.64768, + "grad_norm": 0.9033933877944946, + "learning_rate": 3.025410164065626e-05, + "loss": 0.5347, + "step": 9881 + }, + { + "epoch": 12.64896, + "grad_norm": 0.8955773115158081, + "learning_rate": 
3.0252100840336133e-05, + "loss": 0.5131, + "step": 9882 + }, + { + "epoch": 12.65024, + "grad_norm": 0.9527932405471802, + "learning_rate": 3.0250100040016012e-05, + "loss": 0.5438, + "step": 9883 + }, + { + "epoch": 12.65152, + "grad_norm": 0.9614503979682922, + "learning_rate": 3.024809923969588e-05, + "loss": 0.5858, + "step": 9884 + }, + { + "epoch": 12.6528, + "grad_norm": 0.9274621605873108, + "learning_rate": 3.0246098439375752e-05, + "loss": 0.4716, + "step": 9885 + }, + { + "epoch": 12.65408, + "grad_norm": 0.8693538904190063, + "learning_rate": 3.0244097639055624e-05, + "loss": 0.5251, + "step": 9886 + }, + { + "epoch": 12.65536, + "grad_norm": 0.9951110482215881, + "learning_rate": 3.0242096838735496e-05, + "loss": 0.5828, + "step": 9887 + }, + { + "epoch": 12.65664, + "grad_norm": 0.9591448903083801, + "learning_rate": 3.0240096038415368e-05, + "loss": 0.5891, + "step": 9888 + }, + { + "epoch": 12.65792, + "grad_norm": 0.9329800009727478, + "learning_rate": 3.0238095238095236e-05, + "loss": 0.5501, + "step": 9889 + }, + { + "epoch": 12.6592, + "grad_norm": 0.911115288734436, + "learning_rate": 3.0236094437775108e-05, + "loss": 0.5655, + "step": 9890 + }, + { + "epoch": 12.66048, + "grad_norm": 0.9046706557273865, + "learning_rate": 3.0234093637454987e-05, + "loss": 0.5582, + "step": 9891 + }, + { + "epoch": 12.66176, + "grad_norm": 0.9165447950363159, + "learning_rate": 3.0232092837134855e-05, + "loss": 0.5773, + "step": 9892 + }, + { + "epoch": 12.66304, + "grad_norm": 0.9891864657402039, + "learning_rate": 3.0230092036814727e-05, + "loss": 0.6351, + "step": 9893 + }, + { + "epoch": 12.66432, + "grad_norm": 0.8324431777000427, + "learning_rate": 3.02280912364946e-05, + "loss": 0.5198, + "step": 9894 + }, + { + "epoch": 12.6656, + "grad_norm": 0.8809410929679871, + "learning_rate": 3.022609043617447e-05, + "loss": 0.5294, + "step": 9895 + }, + { + "epoch": 12.66688, + "grad_norm": 0.8862389922142029, + "learning_rate": 3.0224089635854343e-05, + "loss": 
0.5305, + "step": 9896 + }, + { + "epoch": 12.66816, + "grad_norm": 0.8923200368881226, + "learning_rate": 3.022208883553421e-05, + "loss": 0.5304, + "step": 9897 + }, + { + "epoch": 12.66944, + "grad_norm": 0.9289109110832214, + "learning_rate": 3.022008803521409e-05, + "loss": 0.5732, + "step": 9898 + }, + { + "epoch": 12.67072, + "grad_norm": 0.9402267336845398, + "learning_rate": 3.021808723489396e-05, + "loss": 0.5936, + "step": 9899 + }, + { + "epoch": 12.672, + "grad_norm": 0.9303426146507263, + "learning_rate": 3.021608643457383e-05, + "loss": 0.555, + "step": 9900 + }, + { + "epoch": 12.67328, + "grad_norm": 0.8610079288482666, + "learning_rate": 3.0214085634253702e-05, + "loss": 0.5454, + "step": 9901 + }, + { + "epoch": 12.67456, + "grad_norm": 0.9670707583427429, + "learning_rate": 3.0212084833933574e-05, + "loss": 0.5369, + "step": 9902 + }, + { + "epoch": 12.67584, + "grad_norm": 0.9356277585029602, + "learning_rate": 3.0210084033613446e-05, + "loss": 0.556, + "step": 9903 + }, + { + "epoch": 12.67712, + "grad_norm": 0.8627066016197205, + "learning_rate": 3.0208083233293317e-05, + "loss": 0.5081, + "step": 9904 + }, + { + "epoch": 12.6784, + "grad_norm": 0.918406069278717, + "learning_rate": 3.0206082432973193e-05, + "loss": 0.5187, + "step": 9905 + }, + { + "epoch": 12.67968, + "grad_norm": 0.9237809777259827, + "learning_rate": 3.0204081632653065e-05, + "loss": 0.5165, + "step": 9906 + }, + { + "epoch": 12.68096, + "grad_norm": 0.8767214417457581, + "learning_rate": 3.0202080832332936e-05, + "loss": 0.4903, + "step": 9907 + }, + { + "epoch": 12.68224, + "grad_norm": 1.0054161548614502, + "learning_rate": 3.0200080032012805e-05, + "loss": 0.5947, + "step": 9908 + }, + { + "epoch": 12.68352, + "grad_norm": 0.9165403246879578, + "learning_rate": 3.0198079231692677e-05, + "loss": 0.5614, + "step": 9909 + }, + { + "epoch": 12.6848, + "grad_norm": 0.9228652715682983, + "learning_rate": 3.019607843137255e-05, + "loss": 0.5589, + "step": 9910 + }, + { + 
"epoch": 12.68608, + "grad_norm": 0.8921694755554199, + "learning_rate": 3.019407763105242e-05, + "loss": 0.5131, + "step": 9911 + }, + { + "epoch": 12.68736, + "grad_norm": 0.9007654786109924, + "learning_rate": 3.0192076830732296e-05, + "loss": 0.5555, + "step": 9912 + }, + { + "epoch": 12.68864, + "grad_norm": 0.8586267828941345, + "learning_rate": 3.0190076030412168e-05, + "loss": 0.5562, + "step": 9913 + }, + { + "epoch": 12.68992, + "grad_norm": 0.9020005464553833, + "learning_rate": 3.018807523009204e-05, + "loss": 0.5793, + "step": 9914 + }, + { + "epoch": 12.6912, + "grad_norm": 0.887444019317627, + "learning_rate": 3.018607442977191e-05, + "loss": 0.572, + "step": 9915 + }, + { + "epoch": 12.69248, + "grad_norm": 0.9526282548904419, + "learning_rate": 3.018407362945178e-05, + "loss": 0.574, + "step": 9916 + }, + { + "epoch": 12.69376, + "grad_norm": 0.9392738938331604, + "learning_rate": 3.018207282913165e-05, + "loss": 0.5512, + "step": 9917 + }, + { + "epoch": 12.69504, + "grad_norm": 0.938337504863739, + "learning_rate": 3.0180072028811523e-05, + "loss": 0.5903, + "step": 9918 + }, + { + "epoch": 12.69632, + "grad_norm": 0.9258535504341125, + "learning_rate": 3.01780712284914e-05, + "loss": 0.5464, + "step": 9919 + }, + { + "epoch": 12.6976, + "grad_norm": 0.9325132369995117, + "learning_rate": 3.017607042817127e-05, + "loss": 0.5417, + "step": 9920 + }, + { + "epoch": 12.698879999999999, + "grad_norm": 0.8271166682243347, + "learning_rate": 3.0174069627851142e-05, + "loss": 0.4818, + "step": 9921 + }, + { + "epoch": 12.70016, + "grad_norm": 0.9156856536865234, + "learning_rate": 3.0172068827531014e-05, + "loss": 0.508, + "step": 9922 + }, + { + "epoch": 12.70144, + "grad_norm": 0.9150842428207397, + "learning_rate": 3.0170068027210886e-05, + "loss": 0.5311, + "step": 9923 + }, + { + "epoch": 12.70272, + "grad_norm": 0.9068151116371155, + "learning_rate": 3.0168067226890755e-05, + "loss": 0.5557, + "step": 9924 + }, + { + "epoch": 12.704, + 
"grad_norm": 0.9375526309013367, + "learning_rate": 3.0166066426570626e-05, + "loss": 0.5309, + "step": 9925 + }, + { + "epoch": 12.70528, + "grad_norm": 0.9023036360740662, + "learning_rate": 3.0164065626250505e-05, + "loss": 0.5057, + "step": 9926 + }, + { + "epoch": 12.70656, + "grad_norm": 0.9717710614204407, + "learning_rate": 3.0162064825930374e-05, + "loss": 0.5915, + "step": 9927 + }, + { + "epoch": 12.707840000000001, + "grad_norm": 0.8739220499992371, + "learning_rate": 3.0160064025610245e-05, + "loss": 0.5367, + "step": 9928 + }, + { + "epoch": 12.70912, + "grad_norm": 0.903511643409729, + "learning_rate": 3.0158063225290117e-05, + "loss": 0.515, + "step": 9929 + }, + { + "epoch": 12.7104, + "grad_norm": 0.9045807719230652, + "learning_rate": 3.015606242496999e-05, + "loss": 0.5255, + "step": 9930 + }, + { + "epoch": 12.71168, + "grad_norm": 0.8613312244415283, + "learning_rate": 3.015406162464986e-05, + "loss": 0.5145, + "step": 9931 + }, + { + "epoch": 12.71296, + "grad_norm": 0.9359320998191833, + "learning_rate": 3.015206082432973e-05, + "loss": 0.5632, + "step": 9932 + }, + { + "epoch": 12.71424, + "grad_norm": 0.8847656846046448, + "learning_rate": 3.0150060024009608e-05, + "loss": 0.5334, + "step": 9933 + }, + { + "epoch": 12.71552, + "grad_norm": 0.9487404823303223, + "learning_rate": 3.014805922368948e-05, + "loss": 0.5566, + "step": 9934 + }, + { + "epoch": 12.7168, + "grad_norm": 1.0114420652389526, + "learning_rate": 3.014605842336935e-05, + "loss": 0.5976, + "step": 9935 + }, + { + "epoch": 12.71808, + "grad_norm": 0.9208101034164429, + "learning_rate": 3.014405762304922e-05, + "loss": 0.5365, + "step": 9936 + }, + { + "epoch": 12.71936, + "grad_norm": 0.8874728083610535, + "learning_rate": 3.0142056822729092e-05, + "loss": 0.5601, + "step": 9937 + }, + { + "epoch": 12.72064, + "grad_norm": 0.8904001116752625, + "learning_rate": 3.0140056022408964e-05, + "loss": 0.534, + "step": 9938 + }, + { + "epoch": 12.72192, + "grad_norm": 
0.9263580441474915, + "learning_rate": 3.0138055222088836e-05, + "loss": 0.5514, + "step": 9939 + }, + { + "epoch": 12.7232, + "grad_norm": 0.9768319725990295, + "learning_rate": 3.013605442176871e-05, + "loss": 0.556, + "step": 9940 + }, + { + "epoch": 12.72448, + "grad_norm": 0.8745740652084351, + "learning_rate": 3.0134053621448583e-05, + "loss": 0.559, + "step": 9941 + }, + { + "epoch": 12.72576, + "grad_norm": 0.963864266872406, + "learning_rate": 3.0132052821128455e-05, + "loss": 0.5594, + "step": 9942 + }, + { + "epoch": 12.72704, + "grad_norm": 0.9123207330703735, + "learning_rate": 3.0130052020808323e-05, + "loss": 0.5439, + "step": 9943 + }, + { + "epoch": 12.72832, + "grad_norm": 0.8876965045928955, + "learning_rate": 3.0128051220488195e-05, + "loss": 0.5076, + "step": 9944 + }, + { + "epoch": 12.7296, + "grad_norm": 0.8987967371940613, + "learning_rate": 3.0126050420168067e-05, + "loss": 0.5557, + "step": 9945 + }, + { + "epoch": 12.730879999999999, + "grad_norm": 0.9241291880607605, + "learning_rate": 3.012404961984794e-05, + "loss": 0.5262, + "step": 9946 + }, + { + "epoch": 12.73216, + "grad_norm": 0.9232166409492493, + "learning_rate": 3.0122048819527814e-05, + "loss": 0.5576, + "step": 9947 + }, + { + "epoch": 12.73344, + "grad_norm": 0.8862061500549316, + "learning_rate": 3.0120048019207686e-05, + "loss": 0.5504, + "step": 9948 + }, + { + "epoch": 12.73472, + "grad_norm": 0.9697844982147217, + "learning_rate": 3.0118047218887558e-05, + "loss": 0.578, + "step": 9949 + }, + { + "epoch": 12.736, + "grad_norm": 0.9346733093261719, + "learning_rate": 3.011604641856743e-05, + "loss": 0.5529, + "step": 9950 + }, + { + "epoch": 12.73728, + "grad_norm": 0.9218923449516296, + "learning_rate": 3.0114045618247298e-05, + "loss": 0.5634, + "step": 9951 + }, + { + "epoch": 12.73856, + "grad_norm": 1.0053513050079346, + "learning_rate": 3.011204481792717e-05, + "loss": 0.5313, + "step": 9952 + }, + { + "epoch": 12.739840000000001, + "grad_norm": 
0.9116829633712769, + "learning_rate": 3.0110044017607042e-05, + "loss": 0.565, + "step": 9953 + }, + { + "epoch": 12.74112, + "grad_norm": 0.8650526404380798, + "learning_rate": 3.0108043217286917e-05, + "loss": 0.4948, + "step": 9954 + }, + { + "epoch": 12.7424, + "grad_norm": 0.9802688360214233, + "learning_rate": 3.010604241696679e-05, + "loss": 0.5878, + "step": 9955 + }, + { + "epoch": 12.74368, + "grad_norm": 0.9233496785163879, + "learning_rate": 3.010404161664666e-05, + "loss": 0.5235, + "step": 9956 + }, + { + "epoch": 12.74496, + "grad_norm": 0.9107963442802429, + "learning_rate": 3.0102040816326533e-05, + "loss": 0.5203, + "step": 9957 + }, + { + "epoch": 12.74624, + "grad_norm": 0.9463147521018982, + "learning_rate": 3.0100040016006405e-05, + "loss": 0.5231, + "step": 9958 + }, + { + "epoch": 12.74752, + "grad_norm": 0.9093472957611084, + "learning_rate": 3.0098039215686273e-05, + "loss": 0.5534, + "step": 9959 + }, + { + "epoch": 12.7488, + "grad_norm": 1.0168708562850952, + "learning_rate": 3.0096038415366145e-05, + "loss": 0.6179, + "step": 9960 + }, + { + "epoch": 12.75008, + "grad_norm": 0.8969560265541077, + "learning_rate": 3.0094037615046023e-05, + "loss": 0.5302, + "step": 9961 + }, + { + "epoch": 12.75136, + "grad_norm": 0.9327144026756287, + "learning_rate": 3.0092036814725892e-05, + "loss": 0.5877, + "step": 9962 + }, + { + "epoch": 12.75264, + "grad_norm": 0.9616484642028809, + "learning_rate": 3.0090036014405764e-05, + "loss": 0.5779, + "step": 9963 + }, + { + "epoch": 12.75392, + "grad_norm": 0.9223829507827759, + "learning_rate": 3.0088035214085636e-05, + "loss": 0.5649, + "step": 9964 + }, + { + "epoch": 12.7552, + "grad_norm": 0.8914867639541626, + "learning_rate": 3.0086034413765508e-05, + "loss": 0.5435, + "step": 9965 + }, + { + "epoch": 12.75648, + "grad_norm": 0.8898464441299438, + "learning_rate": 3.008403361344538e-05, + "loss": 0.5918, + "step": 9966 + }, + { + "epoch": 12.75776, + "grad_norm": 0.8924893140792847, + 
"learning_rate": 3.0082032813125248e-05, + "loss": 0.5312, + "step": 9967 + }, + { + "epoch": 12.75904, + "grad_norm": 0.9874442219734192, + "learning_rate": 3.0080032012805126e-05, + "loss": 0.5591, + "step": 9968 + }, + { + "epoch": 12.76032, + "grad_norm": 0.8837193846702576, + "learning_rate": 3.0078031212485e-05, + "loss": 0.5174, + "step": 9969 + }, + { + "epoch": 12.7616, + "grad_norm": 0.9593360424041748, + "learning_rate": 3.0076030412164867e-05, + "loss": 0.5871, + "step": 9970 + }, + { + "epoch": 12.76288, + "grad_norm": 0.9044248461723328, + "learning_rate": 3.007402961184474e-05, + "loss": 0.5099, + "step": 9971 + }, + { + "epoch": 12.76416, + "grad_norm": 0.9392541646957397, + "learning_rate": 3.007202881152461e-05, + "loss": 0.5553, + "step": 9972 + }, + { + "epoch": 12.76544, + "grad_norm": 0.9666686058044434, + "learning_rate": 3.0070028011204482e-05, + "loss": 0.54, + "step": 9973 + }, + { + "epoch": 12.76672, + "grad_norm": 0.9441251754760742, + "learning_rate": 3.0068027210884354e-05, + "loss": 0.6014, + "step": 9974 + }, + { + "epoch": 12.768, + "grad_norm": 0.8636193871498108, + "learning_rate": 3.006602641056423e-05, + "loss": 0.5608, + "step": 9975 + }, + { + "epoch": 12.76928, + "grad_norm": 0.8688570261001587, + "learning_rate": 3.00640256102441e-05, + "loss": 0.5199, + "step": 9976 + }, + { + "epoch": 12.77056, + "grad_norm": 0.9947240352630615, + "learning_rate": 3.0062024809923973e-05, + "loss": 0.5931, + "step": 9977 + }, + { + "epoch": 12.77184, + "grad_norm": 0.9212228059768677, + "learning_rate": 3.006002400960384e-05, + "loss": 0.4972, + "step": 9978 + }, + { + "epoch": 12.77312, + "grad_norm": 0.9512316584587097, + "learning_rate": 3.0058023209283714e-05, + "loss": 0.5568, + "step": 9979 + }, + { + "epoch": 12.7744, + "grad_norm": 0.9175422191619873, + "learning_rate": 3.0056022408963585e-05, + "loss": 0.5375, + "step": 9980 + }, + { + "epoch": 12.77568, + "grad_norm": 0.918135404586792, + "learning_rate": 3.0054021608643457e-05, 
+ "loss": 0.5353, + "step": 9981 + }, + { + "epoch": 12.77696, + "grad_norm": 0.937879204750061, + "learning_rate": 3.0052020808323332e-05, + "loss": 0.5707, + "step": 9982 + }, + { + "epoch": 12.77824, + "grad_norm": 0.8635044693946838, + "learning_rate": 3.0050020008003204e-05, + "loss": 0.5118, + "step": 9983 + }, + { + "epoch": 12.77952, + "grad_norm": 0.8819299936294556, + "learning_rate": 3.0048019207683076e-05, + "loss": 0.4941, + "step": 9984 + }, + { + "epoch": 12.7808, + "grad_norm": 0.9287380576133728, + "learning_rate": 3.0046018407362948e-05, + "loss": 0.5739, + "step": 9985 + }, + { + "epoch": 12.78208, + "grad_norm": 0.8525633811950684, + "learning_rate": 3.0044017607042817e-05, + "loss": 0.5316, + "step": 9986 + }, + { + "epoch": 12.78336, + "grad_norm": 0.8659927845001221, + "learning_rate": 3.004201680672269e-05, + "loss": 0.5096, + "step": 9987 + }, + { + "epoch": 12.78464, + "grad_norm": 0.876082181930542, + "learning_rate": 3.004001600640256e-05, + "loss": 0.5481, + "step": 9988 + }, + { + "epoch": 12.78592, + "grad_norm": 0.8915519118309021, + "learning_rate": 3.0038015206082435e-05, + "loss": 0.5542, + "step": 9989 + }, + { + "epoch": 12.7872, + "grad_norm": 0.961881697177887, + "learning_rate": 3.0036014405762307e-05, + "loss": 0.541, + "step": 9990 + }, + { + "epoch": 12.78848, + "grad_norm": 0.9022373557090759, + "learning_rate": 3.003401360544218e-05, + "loss": 0.5454, + "step": 9991 + }, + { + "epoch": 12.78976, + "grad_norm": 0.9297406077384949, + "learning_rate": 3.003201280512205e-05, + "loss": 0.5537, + "step": 9992 + }, + { + "epoch": 12.79104, + "grad_norm": 0.9228201508522034, + "learning_rate": 3.0030012004801923e-05, + "loss": 0.5281, + "step": 9993 + }, + { + "epoch": 12.79232, + "grad_norm": 0.8959901928901672, + "learning_rate": 3.002801120448179e-05, + "loss": 0.5428, + "step": 9994 + }, + { + "epoch": 12.7936, + "grad_norm": 0.903972864151001, + "learning_rate": 3.0026010404161663e-05, + "loss": 0.5513, + "step": 9995 + }, 
+ { + "epoch": 12.79488, + "grad_norm": 0.9172789454460144, + "learning_rate": 3.0024009603841542e-05, + "loss": 0.5723, + "step": 9996 + }, + { + "epoch": 12.79616, + "grad_norm": 0.872391402721405, + "learning_rate": 3.002200880352141e-05, + "loss": 0.5302, + "step": 9997 + }, + { + "epoch": 12.79744, + "grad_norm": 0.9103299379348755, + "learning_rate": 3.0020008003201282e-05, + "loss": 0.5548, + "step": 9998 + }, + { + "epoch": 12.79872, + "grad_norm": 0.9463931322097778, + "learning_rate": 3.0018007202881154e-05, + "loss": 0.5299, + "step": 9999 + }, + { + "epoch": 12.8, + "grad_norm": 0.9099661707878113, + "learning_rate": 3.0016006402561026e-05, + "loss": 0.5733, + "step": 10000 + }, + { + "epoch": 12.80128, + "grad_norm": 0.8782345652580261, + "learning_rate": 3.0014005602240898e-05, + "loss": 0.5495, + "step": 10001 + }, + { + "epoch": 12.80256, + "grad_norm": 0.9504924416542053, + "learning_rate": 3.0012004801920766e-05, + "loss": 0.5548, + "step": 10002 + }, + { + "epoch": 12.80384, + "grad_norm": 0.9535840749740601, + "learning_rate": 3.0010004001600638e-05, + "loss": 0.6183, + "step": 10003 + }, + { + "epoch": 12.80512, + "grad_norm": 0.9448462724685669, + "learning_rate": 3.0008003201280517e-05, + "loss": 0.5871, + "step": 10004 + }, + { + "epoch": 12.8064, + "grad_norm": 0.856593906879425, + "learning_rate": 3.0006002400960385e-05, + "loss": 0.5501, + "step": 10005 + }, + { + "epoch": 12.80768, + "grad_norm": 0.9613591432571411, + "learning_rate": 3.0004001600640257e-05, + "loss": 0.5669, + "step": 10006 + }, + { + "epoch": 12.80896, + "grad_norm": 0.9325043559074402, + "learning_rate": 3.000200080032013e-05, + "loss": 0.5625, + "step": 10007 + }, + { + "epoch": 12.81024, + "grad_norm": 0.8977808356285095, + "learning_rate": 3e-05, + "loss": 0.5378, + "step": 10008 + }, + { + "epoch": 12.81152, + "grad_norm": 0.8371548652648926, + "learning_rate": 2.9997999199679873e-05, + "loss": 0.5012, + "step": 10009 + }, + { + "epoch": 12.8128, + "grad_norm": 
0.9407792091369629, + "learning_rate": 2.999599839935974e-05, + "loss": 0.6057, + "step": 10010 + }, + { + "epoch": 12.81408, + "grad_norm": 0.911292552947998, + "learning_rate": 2.999399759903962e-05, + "loss": 0.5342, + "step": 10011 + }, + { + "epoch": 12.81536, + "grad_norm": 0.947562038898468, + "learning_rate": 2.999199679871949e-05, + "loss": 0.597, + "step": 10012 + }, + { + "epoch": 12.81664, + "grad_norm": 0.8605309724807739, + "learning_rate": 2.998999599839936e-05, + "loss": 0.5011, + "step": 10013 + }, + { + "epoch": 12.81792, + "grad_norm": 0.9538720846176147, + "learning_rate": 2.9987995198079232e-05, + "loss": 0.5383, + "step": 10014 + }, + { + "epoch": 12.8192, + "grad_norm": 0.8674808144569397, + "learning_rate": 2.9985994397759104e-05, + "loss": 0.5635, + "step": 10015 + }, + { + "epoch": 12.82048, + "grad_norm": 0.8953600525856018, + "learning_rate": 2.9983993597438976e-05, + "loss": 0.5338, + "step": 10016 + }, + { + "epoch": 12.82176, + "grad_norm": 0.9249882102012634, + "learning_rate": 2.9981992797118847e-05, + "loss": 0.5529, + "step": 10017 + }, + { + "epoch": 12.82304, + "grad_norm": 0.9073203802108765, + "learning_rate": 2.9979991996798723e-05, + "loss": 0.5515, + "step": 10018 + }, + { + "epoch": 12.82432, + "grad_norm": 0.8862814903259277, + "learning_rate": 2.9977991196478595e-05, + "loss": 0.5639, + "step": 10019 + }, + { + "epoch": 12.8256, + "grad_norm": 0.9115248918533325, + "learning_rate": 2.9975990396158466e-05, + "loss": 0.5338, + "step": 10020 + }, + { + "epoch": 12.82688, + "grad_norm": 0.9155441522598267, + "learning_rate": 2.9973989595838335e-05, + "loss": 0.5904, + "step": 10021 + }, + { + "epoch": 12.82816, + "grad_norm": 0.936619222164154, + "learning_rate": 2.9971988795518207e-05, + "loss": 0.5606, + "step": 10022 + }, + { + "epoch": 12.82944, + "grad_norm": 0.8957839012145996, + "learning_rate": 2.996998799519808e-05, + "loss": 0.5432, + "step": 10023 + }, + { + "epoch": 12.83072, + "grad_norm": 0.8455110192298889, + 
"learning_rate": 2.996798719487795e-05, + "loss": 0.5095, + "step": 10024 + }, + { + "epoch": 12.832, + "grad_norm": 0.8867446184158325, + "learning_rate": 2.9965986394557826e-05, + "loss": 0.5319, + "step": 10025 + }, + { + "epoch": 12.83328, + "grad_norm": 0.8974578976631165, + "learning_rate": 2.9963985594237698e-05, + "loss": 0.5514, + "step": 10026 + }, + { + "epoch": 12.83456, + "grad_norm": 0.8834570050239563, + "learning_rate": 2.996198479391757e-05, + "loss": 0.5682, + "step": 10027 + }, + { + "epoch": 12.83584, + "grad_norm": 0.9071859121322632, + "learning_rate": 2.995998399359744e-05, + "loss": 0.5334, + "step": 10028 + }, + { + "epoch": 12.83712, + "grad_norm": 0.9252317547798157, + "learning_rate": 2.995798319327731e-05, + "loss": 0.5062, + "step": 10029 + }, + { + "epoch": 12.8384, + "grad_norm": 0.9470645189285278, + "learning_rate": 2.995598239295718e-05, + "loss": 0.5328, + "step": 10030 + }, + { + "epoch": 12.83968, + "grad_norm": 0.9187636375427246, + "learning_rate": 2.9953981592637053e-05, + "loss": 0.6012, + "step": 10031 + }, + { + "epoch": 12.84096, + "grad_norm": 0.9187378883361816, + "learning_rate": 2.9951980792316932e-05, + "loss": 0.582, + "step": 10032 + }, + { + "epoch": 12.84224, + "grad_norm": 0.9218907952308655, + "learning_rate": 2.99499799919968e-05, + "loss": 0.5591, + "step": 10033 + }, + { + "epoch": 12.84352, + "grad_norm": 0.8669567704200745, + "learning_rate": 2.9947979191676672e-05, + "loss": 0.5021, + "step": 10034 + }, + { + "epoch": 12.8448, + "grad_norm": 0.9718922972679138, + "learning_rate": 2.9945978391356544e-05, + "loss": 0.5636, + "step": 10035 + }, + { + "epoch": 12.84608, + "grad_norm": 0.9400469064712524, + "learning_rate": 2.9943977591036416e-05, + "loss": 0.5565, + "step": 10036 + }, + { + "epoch": 12.84736, + "grad_norm": 0.9250262975692749, + "learning_rate": 2.9941976790716285e-05, + "loss": 0.5336, + "step": 10037 + }, + { + "epoch": 12.84864, + "grad_norm": 0.9361352324485779, + "learning_rate": 
2.9939975990396156e-05, + "loss": 0.5438, + "step": 10038 + }, + { + "epoch": 12.849920000000001, + "grad_norm": 0.9436172246932983, + "learning_rate": 2.9937975190076035e-05, + "loss": 0.5784, + "step": 10039 + }, + { + "epoch": 12.8512, + "grad_norm": 0.9591764807701111, + "learning_rate": 2.9935974389755907e-05, + "loss": 0.5426, + "step": 10040 + }, + { + "epoch": 12.85248, + "grad_norm": 0.9026290774345398, + "learning_rate": 2.9933973589435775e-05, + "loss": 0.5524, + "step": 10041 + }, + { + "epoch": 12.85376, + "grad_norm": 0.8631820678710938, + "learning_rate": 2.9931972789115647e-05, + "loss": 0.5365, + "step": 10042 + }, + { + "epoch": 12.85504, + "grad_norm": 0.8909380435943604, + "learning_rate": 2.992997198879552e-05, + "loss": 0.4973, + "step": 10043 + }, + { + "epoch": 12.85632, + "grad_norm": 0.9117045998573303, + "learning_rate": 2.992797118847539e-05, + "loss": 0.5777, + "step": 10044 + }, + { + "epoch": 12.8576, + "grad_norm": 0.9356653094291687, + "learning_rate": 2.992597038815526e-05, + "loss": 0.5703, + "step": 10045 + }, + { + "epoch": 12.85888, + "grad_norm": 0.9161821007728577, + "learning_rate": 2.9923969587835138e-05, + "loss": 0.5378, + "step": 10046 + }, + { + "epoch": 12.86016, + "grad_norm": 0.9785245060920715, + "learning_rate": 2.992196878751501e-05, + "loss": 0.6163, + "step": 10047 + }, + { + "epoch": 12.86144, + "grad_norm": 0.896397590637207, + "learning_rate": 2.9919967987194882e-05, + "loss": 0.5743, + "step": 10048 + }, + { + "epoch": 12.86272, + "grad_norm": 0.8948002457618713, + "learning_rate": 2.991796718687475e-05, + "loss": 0.5688, + "step": 10049 + }, + { + "epoch": 12.864, + "grad_norm": 0.9673354625701904, + "learning_rate": 2.9915966386554622e-05, + "loss": 0.572, + "step": 10050 + }, + { + "epoch": 12.86528, + "grad_norm": 0.9555987119674683, + "learning_rate": 2.9913965586234494e-05, + "loss": 0.5516, + "step": 10051 + }, + { + "epoch": 12.86656, + "grad_norm": 0.908279299736023, + "learning_rate": 
2.9911964785914366e-05, + "loss": 0.5513, + "step": 10052 + }, + { + "epoch": 12.86784, + "grad_norm": 0.8543174862861633, + "learning_rate": 2.990996398559424e-05, + "loss": 0.5194, + "step": 10053 + }, + { + "epoch": 12.86912, + "grad_norm": 0.9750853180885315, + "learning_rate": 2.9907963185274113e-05, + "loss": 0.5985, + "step": 10054 + }, + { + "epoch": 12.8704, + "grad_norm": 0.8315509557723999, + "learning_rate": 2.9905962384953985e-05, + "loss": 0.5259, + "step": 10055 + }, + { + "epoch": 12.87168, + "grad_norm": 0.8759975433349609, + "learning_rate": 2.9903961584633857e-05, + "loss": 0.5003, + "step": 10056 + }, + { + "epoch": 12.872959999999999, + "grad_norm": 0.8982505798339844, + "learning_rate": 2.9901960784313725e-05, + "loss": 0.5745, + "step": 10057 + }, + { + "epoch": 12.87424, + "grad_norm": 0.9328362941741943, + "learning_rate": 2.9899959983993597e-05, + "loss": 0.5945, + "step": 10058 + }, + { + "epoch": 12.87552, + "grad_norm": 0.9085741639137268, + "learning_rate": 2.989795918367347e-05, + "loss": 0.5479, + "step": 10059 + }, + { + "epoch": 12.8768, + "grad_norm": 0.879937469959259, + "learning_rate": 2.9895958383353344e-05, + "loss": 0.5385, + "step": 10060 + }, + { + "epoch": 12.87808, + "grad_norm": 0.932284951210022, + "learning_rate": 2.9893957583033216e-05, + "loss": 0.5911, + "step": 10061 + }, + { + "epoch": 12.87936, + "grad_norm": 0.9298887252807617, + "learning_rate": 2.9891956782713088e-05, + "loss": 0.5586, + "step": 10062 + }, + { + "epoch": 12.88064, + "grad_norm": 0.9269364476203918, + "learning_rate": 2.988995598239296e-05, + "loss": 0.5432, + "step": 10063 + }, + { + "epoch": 12.881920000000001, + "grad_norm": 0.9070879817008972, + "learning_rate": 2.988795518207283e-05, + "loss": 0.5722, + "step": 10064 + }, + { + "epoch": 12.8832, + "grad_norm": 0.9287840723991394, + "learning_rate": 2.98859543817527e-05, + "loss": 0.5642, + "step": 10065 + }, + { + "epoch": 12.88448, + "grad_norm": 0.9351973533630371, + "learning_rate": 
2.9883953581432572e-05, + "loss": 0.5648, + "step": 10066 + }, + { + "epoch": 12.88576, + "grad_norm": 0.9211728572845459, + "learning_rate": 2.988195278111245e-05, + "loss": 0.5144, + "step": 10067 + }, + { + "epoch": 12.88704, + "grad_norm": 0.9010429382324219, + "learning_rate": 2.987995198079232e-05, + "loss": 0.536, + "step": 10068 + }, + { + "epoch": 12.88832, + "grad_norm": 0.9658145904541016, + "learning_rate": 2.987795118047219e-05, + "loss": 0.5977, + "step": 10069 + }, + { + "epoch": 12.8896, + "grad_norm": 1.0316338539123535, + "learning_rate": 2.9875950380152063e-05, + "loss": 0.5963, + "step": 10070 + }, + { + "epoch": 12.89088, + "grad_norm": 0.8775344491004944, + "learning_rate": 2.9873949579831935e-05, + "loss": 0.525, + "step": 10071 + }, + { + "epoch": 12.89216, + "grad_norm": 0.9124401807785034, + "learning_rate": 2.9871948779511806e-05, + "loss": 0.5544, + "step": 10072 + }, + { + "epoch": 12.89344, + "grad_norm": 0.9111108183860779, + "learning_rate": 2.9869947979191675e-05, + "loss": 0.574, + "step": 10073 + }, + { + "epoch": 12.89472, + "grad_norm": 0.9600886702537537, + "learning_rate": 2.9867947178871553e-05, + "loss": 0.5347, + "step": 10074 + }, + { + "epoch": 12.896, + "grad_norm": 0.967065691947937, + "learning_rate": 2.9865946378551425e-05, + "loss": 0.5945, + "step": 10075 + }, + { + "epoch": 12.89728, + "grad_norm": 0.8715311288833618, + "learning_rate": 2.9863945578231294e-05, + "loss": 0.5239, + "step": 10076 + }, + { + "epoch": 12.89856, + "grad_norm": 0.8990671634674072, + "learning_rate": 2.9861944777911166e-05, + "loss": 0.5526, + "step": 10077 + }, + { + "epoch": 12.89984, + "grad_norm": 0.8786954283714294, + "learning_rate": 2.9859943977591038e-05, + "loss": 0.5653, + "step": 10078 + }, + { + "epoch": 12.90112, + "grad_norm": 0.9412379860877991, + "learning_rate": 2.985794317727091e-05, + "loss": 0.5592, + "step": 10079 + }, + { + "epoch": 12.9024, + "grad_norm": 0.9109273552894592, + "learning_rate": 2.985594237695078e-05, 
+ "loss": 0.5307, + "step": 10080 + }, + { + "epoch": 12.90368, + "grad_norm": 0.9346706867218018, + "learning_rate": 2.9853941576630656e-05, + "loss": 0.5661, + "step": 10081 + }, + { + "epoch": 12.904959999999999, + "grad_norm": 0.9366570711135864, + "learning_rate": 2.985194077631053e-05, + "loss": 0.5806, + "step": 10082 + }, + { + "epoch": 12.90624, + "grad_norm": 0.906199038028717, + "learning_rate": 2.98499399759904e-05, + "loss": 0.5577, + "step": 10083 + }, + { + "epoch": 12.90752, + "grad_norm": 0.9052384495735168, + "learning_rate": 2.984793917567027e-05, + "loss": 0.5144, + "step": 10084 + }, + { + "epoch": 12.9088, + "grad_norm": 0.8487529754638672, + "learning_rate": 2.984593837535014e-05, + "loss": 0.5046, + "step": 10085 + }, + { + "epoch": 12.91008, + "grad_norm": 0.9538546204566956, + "learning_rate": 2.9843937575030012e-05, + "loss": 0.5881, + "step": 10086 + }, + { + "epoch": 12.91136, + "grad_norm": 0.9074375033378601, + "learning_rate": 2.9841936774709884e-05, + "loss": 0.5077, + "step": 10087 + }, + { + "epoch": 12.91264, + "grad_norm": 0.8918171525001526, + "learning_rate": 2.983993597438976e-05, + "loss": 0.547, + "step": 10088 + }, + { + "epoch": 12.91392, + "grad_norm": 0.877153217792511, + "learning_rate": 2.983793517406963e-05, + "loss": 0.5116, + "step": 10089 + }, + { + "epoch": 12.9152, + "grad_norm": 0.9327219724655151, + "learning_rate": 2.9835934373749503e-05, + "loss": 0.5949, + "step": 10090 + }, + { + "epoch": 12.91648, + "grad_norm": 0.8527021408081055, + "learning_rate": 2.9833933573429375e-05, + "loss": 0.5414, + "step": 10091 + }, + { + "epoch": 12.91776, + "grad_norm": 0.9309681057929993, + "learning_rate": 2.9831932773109244e-05, + "loss": 0.5714, + "step": 10092 + }, + { + "epoch": 12.91904, + "grad_norm": 1.0311362743377686, + "learning_rate": 2.9829931972789115e-05, + "loss": 0.6181, + "step": 10093 + }, + { + "epoch": 12.92032, + "grad_norm": 0.9770155549049377, + "learning_rate": 2.9827931172468987e-05, + "loss": 
0.6033, + "step": 10094 + }, + { + "epoch": 12.9216, + "grad_norm": 0.9168463945388794, + "learning_rate": 2.9825930372148862e-05, + "loss": 0.5045, + "step": 10095 + }, + { + "epoch": 12.92288, + "grad_norm": 0.9091753959655762, + "learning_rate": 2.9823929571828734e-05, + "loss": 0.5097, + "step": 10096 + }, + { + "epoch": 12.92416, + "grad_norm": 0.9451500177383423, + "learning_rate": 2.9821928771508606e-05, + "loss": 0.5358, + "step": 10097 + }, + { + "epoch": 12.92544, + "grad_norm": 0.8995253443717957, + "learning_rate": 2.9819927971188478e-05, + "loss": 0.5671, + "step": 10098 + }, + { + "epoch": 12.92672, + "grad_norm": 0.9600387215614319, + "learning_rate": 2.981792717086835e-05, + "loss": 0.6202, + "step": 10099 + }, + { + "epoch": 12.928, + "grad_norm": 0.9363338351249695, + "learning_rate": 2.981592637054822e-05, + "loss": 0.5377, + "step": 10100 + }, + { + "epoch": 12.92928, + "grad_norm": 0.9418403506278992, + "learning_rate": 2.981392557022809e-05, + "loss": 0.5899, + "step": 10101 + }, + { + "epoch": 12.93056, + "grad_norm": 0.9346731305122375, + "learning_rate": 2.981192476990797e-05, + "loss": 0.5907, + "step": 10102 + }, + { + "epoch": 12.93184, + "grad_norm": 0.8982060551643372, + "learning_rate": 2.9809923969587837e-05, + "loss": 0.5422, + "step": 10103 + }, + { + "epoch": 12.93312, + "grad_norm": 0.980532705783844, + "learning_rate": 2.980792316926771e-05, + "loss": 0.6078, + "step": 10104 + }, + { + "epoch": 12.9344, + "grad_norm": 0.9058598875999451, + "learning_rate": 2.980592236894758e-05, + "loss": 0.5936, + "step": 10105 + }, + { + "epoch": 12.93568, + "grad_norm": 0.81413733959198, + "learning_rate": 2.9803921568627453e-05, + "loss": 0.4857, + "step": 10106 + }, + { + "epoch": 12.93696, + "grad_norm": 0.9065173864364624, + "learning_rate": 2.9801920768307325e-05, + "loss": 0.5294, + "step": 10107 + }, + { + "epoch": 12.93824, + "grad_norm": 0.9533013701438904, + "learning_rate": 2.9799919967987193e-05, + "loss": 0.5727, + "step": 10108 
+ }, + { + "epoch": 12.93952, + "grad_norm": 0.9087998867034912, + "learning_rate": 2.9797919167667065e-05, + "loss": 0.5334, + "step": 10109 + }, + { + "epoch": 12.9408, + "grad_norm": 0.905795156955719, + "learning_rate": 2.9795918367346944e-05, + "loss": 0.5665, + "step": 10110 + }, + { + "epoch": 12.94208, + "grad_norm": 0.9425820708274841, + "learning_rate": 2.9793917567026812e-05, + "loss": 0.5665, + "step": 10111 + }, + { + "epoch": 12.94336, + "grad_norm": 0.8737591505050659, + "learning_rate": 2.9791916766706684e-05, + "loss": 0.5013, + "step": 10112 + }, + { + "epoch": 12.94464, + "grad_norm": 0.9114744067192078, + "learning_rate": 2.9789915966386556e-05, + "loss": 0.5359, + "step": 10113 + }, + { + "epoch": 12.94592, + "grad_norm": 0.9109938144683838, + "learning_rate": 2.9787915166066428e-05, + "loss": 0.525, + "step": 10114 + }, + { + "epoch": 12.9472, + "grad_norm": 1.0331107378005981, + "learning_rate": 2.97859143657463e-05, + "loss": 0.6145, + "step": 10115 + }, + { + "epoch": 12.94848, + "grad_norm": 0.9183046221733093, + "learning_rate": 2.9783913565426168e-05, + "loss": 0.5607, + "step": 10116 + }, + { + "epoch": 12.94976, + "grad_norm": 0.8793911337852478, + "learning_rate": 2.9781912765106047e-05, + "loss": 0.5091, + "step": 10117 + }, + { + "epoch": 12.95104, + "grad_norm": 0.8715007305145264, + "learning_rate": 2.977991196478592e-05, + "loss": 0.5342, + "step": 10118 + }, + { + "epoch": 12.95232, + "grad_norm": 0.9130722284317017, + "learning_rate": 2.9777911164465787e-05, + "loss": 0.569, + "step": 10119 + }, + { + "epoch": 12.9536, + "grad_norm": 0.8778414130210876, + "learning_rate": 2.977591036414566e-05, + "loss": 0.5715, + "step": 10120 + }, + { + "epoch": 12.95488, + "grad_norm": 0.8723252415657043, + "learning_rate": 2.977390956382553e-05, + "loss": 0.5556, + "step": 10121 + }, + { + "epoch": 12.95616, + "grad_norm": 0.9245275259017944, + "learning_rate": 2.9771908763505403e-05, + "loss": 0.5706, + "step": 10122 + }, + { + "epoch": 
12.95744, + "grad_norm": 0.9028117656707764, + "learning_rate": 2.9769907963185274e-05, + "loss": 0.5435, + "step": 10123 + }, + { + "epoch": 12.95872, + "grad_norm": 0.8952034115791321, + "learning_rate": 2.976790716286515e-05, + "loss": 0.5545, + "step": 10124 + }, + { + "epoch": 12.96, + "grad_norm": 0.889771580696106, + "learning_rate": 2.976590636254502e-05, + "loss": 0.5124, + "step": 10125 + }, + { + "epoch": 12.96128, + "grad_norm": 0.9668723940849304, + "learning_rate": 2.9763905562224893e-05, + "loss": 0.537, + "step": 10126 + }, + { + "epoch": 12.96256, + "grad_norm": 0.8818986415863037, + "learning_rate": 2.9761904761904762e-05, + "loss": 0.5641, + "step": 10127 + }, + { + "epoch": 12.96384, + "grad_norm": 0.8750602006912231, + "learning_rate": 2.9759903961584634e-05, + "loss": 0.5355, + "step": 10128 + }, + { + "epoch": 12.96512, + "grad_norm": 0.9368957877159119, + "learning_rate": 2.9757903161264506e-05, + "loss": 0.5498, + "step": 10129 + }, + { + "epoch": 12.9664, + "grad_norm": 0.9112243056297302, + "learning_rate": 2.9755902360944377e-05, + "loss": 0.5243, + "step": 10130 + }, + { + "epoch": 12.96768, + "grad_norm": 0.9702836871147156, + "learning_rate": 2.9753901560624253e-05, + "loss": 0.5832, + "step": 10131 + }, + { + "epoch": 12.96896, + "grad_norm": 0.9459221363067627, + "learning_rate": 2.9751900760304125e-05, + "loss": 0.5452, + "step": 10132 + }, + { + "epoch": 12.97024, + "grad_norm": 0.8926064968109131, + "learning_rate": 2.9749899959983996e-05, + "loss": 0.5324, + "step": 10133 + }, + { + "epoch": 12.97152, + "grad_norm": 0.9436557292938232, + "learning_rate": 2.9747899159663868e-05, + "loss": 0.5705, + "step": 10134 + }, + { + "epoch": 12.9728, + "grad_norm": 0.8792611956596375, + "learning_rate": 2.9745898359343737e-05, + "loss": 0.5387, + "step": 10135 + }, + { + "epoch": 12.97408, + "grad_norm": 0.8789375424385071, + "learning_rate": 2.974389755902361e-05, + "loss": 0.4897, + "step": 10136 + }, + { + "epoch": 12.97536, + 
"grad_norm": 0.9235910773277283, + "learning_rate": 2.974189675870348e-05, + "loss": 0.5667, + "step": 10137 + }, + { + "epoch": 12.97664, + "grad_norm": 0.9125930070877075, + "learning_rate": 2.9739895958383356e-05, + "loss": 0.5704, + "step": 10138 + }, + { + "epoch": 12.97792, + "grad_norm": 0.9023631811141968, + "learning_rate": 2.9737895158063228e-05, + "loss": 0.5521, + "step": 10139 + }, + { + "epoch": 12.9792, + "grad_norm": 0.9152993559837341, + "learning_rate": 2.97358943577431e-05, + "loss": 0.4928, + "step": 10140 + }, + { + "epoch": 12.98048, + "grad_norm": 0.9630506038665771, + "learning_rate": 2.973389355742297e-05, + "loss": 0.5829, + "step": 10141 + }, + { + "epoch": 12.98176, + "grad_norm": 0.9099013209342957, + "learning_rate": 2.9731892757102843e-05, + "loss": 0.5566, + "step": 10142 + }, + { + "epoch": 12.98304, + "grad_norm": 0.8991549015045166, + "learning_rate": 2.972989195678271e-05, + "loss": 0.5459, + "step": 10143 + }, + { + "epoch": 12.98432, + "grad_norm": 0.9547131657600403, + "learning_rate": 2.9727891156462583e-05, + "loss": 0.5874, + "step": 10144 + }, + { + "epoch": 12.9856, + "grad_norm": 0.9026567339897156, + "learning_rate": 2.9725890356142462e-05, + "loss": 0.547, + "step": 10145 + }, + { + "epoch": 12.98688, + "grad_norm": 0.9058331847190857, + "learning_rate": 2.972388955582233e-05, + "loss": 0.52, + "step": 10146 + }, + { + "epoch": 12.98816, + "grad_norm": 0.9603937864303589, + "learning_rate": 2.9721888755502202e-05, + "loss": 0.615, + "step": 10147 + }, + { + "epoch": 12.98944, + "grad_norm": 0.9125460982322693, + "learning_rate": 2.9719887955182074e-05, + "loss": 0.5463, + "step": 10148 + }, + { + "epoch": 12.99072, + "grad_norm": 0.9278149008750916, + "learning_rate": 2.9717887154861946e-05, + "loss": 0.5386, + "step": 10149 + }, + { + "epoch": 12.992, + "grad_norm": 0.88197922706604, + "learning_rate": 2.9715886354541818e-05, + "loss": 0.5443, + "step": 10150 + }, + { + "epoch": 12.99328, + "grad_norm": 
0.9040613174438477, + "learning_rate": 2.9713885554221686e-05, + "loss": 0.5515, + "step": 10151 + }, + { + "epoch": 12.99456, + "grad_norm": 0.9090338945388794, + "learning_rate": 2.9711884753901565e-05, + "loss": 0.5483, + "step": 10152 + }, + { + "epoch": 12.99584, + "grad_norm": 0.9126954674720764, + "learning_rate": 2.9709883953581437e-05, + "loss": 0.6022, + "step": 10153 + }, + { + "epoch": 12.99712, + "grad_norm": 0.9582985043525696, + "learning_rate": 2.9707883153261305e-05, + "loss": 0.5414, + "step": 10154 + }, + { + "epoch": 12.9984, + "grad_norm": 0.9858940839767456, + "learning_rate": 2.9705882352941177e-05, + "loss": 0.5773, + "step": 10155 + }, + { + "epoch": 12.99968, + "grad_norm": 0.9113115072250366, + "learning_rate": 2.970388155262105e-05, + "loss": 0.5455, + "step": 10156 + }, + { + "epoch": 13.00096, + "grad_norm": 1.8516324758529663, + "learning_rate": 2.970188075230092e-05, + "loss": 0.9295, + "step": 10157 + }, + { + "epoch": 13.00224, + "grad_norm": 0.8609241843223572, + "learning_rate": 2.9699879951980793e-05, + "loss": 0.4725, + "step": 10158 + }, + { + "epoch": 13.00352, + "grad_norm": 0.8947498202323914, + "learning_rate": 2.9697879151660668e-05, + "loss": 0.5303, + "step": 10159 + }, + { + "epoch": 13.0048, + "grad_norm": 0.9149647951126099, + "learning_rate": 2.969587835134054e-05, + "loss": 0.5236, + "step": 10160 + }, + { + "epoch": 13.00608, + "grad_norm": 0.9216925501823425, + "learning_rate": 2.9693877551020412e-05, + "loss": 0.5461, + "step": 10161 + }, + { + "epoch": 13.00736, + "grad_norm": 0.9156137108802795, + "learning_rate": 2.969187675070028e-05, + "loss": 0.5692, + "step": 10162 + }, + { + "epoch": 13.00864, + "grad_norm": 0.9287101030349731, + "learning_rate": 2.9689875950380152e-05, + "loss": 0.5375, + "step": 10163 + }, + { + "epoch": 13.00992, + "grad_norm": 0.9227731227874756, + "learning_rate": 2.9687875150060024e-05, + "loss": 0.5584, + "step": 10164 + }, + { + "epoch": 13.0112, + "grad_norm": 
0.9401906132698059, + "learning_rate": 2.9685874349739896e-05, + "loss": 0.5354, + "step": 10165 + }, + { + "epoch": 13.01248, + "grad_norm": 0.9380677938461304, + "learning_rate": 2.968387354941977e-05, + "loss": 0.5367, + "step": 10166 + }, + { + "epoch": 13.01376, + "grad_norm": 0.9107190370559692, + "learning_rate": 2.9681872749099643e-05, + "loss": 0.515, + "step": 10167 + }, + { + "epoch": 13.01504, + "grad_norm": 0.9699810147285461, + "learning_rate": 2.9679871948779515e-05, + "loss": 0.5413, + "step": 10168 + }, + { + "epoch": 13.01632, + "grad_norm": 0.9228163361549377, + "learning_rate": 2.9677871148459387e-05, + "loss": 0.5589, + "step": 10169 + }, + { + "epoch": 13.0176, + "grad_norm": 0.8890709280967712, + "learning_rate": 2.9675870348139255e-05, + "loss": 0.5434, + "step": 10170 + }, + { + "epoch": 13.01888, + "grad_norm": 0.8726478815078735, + "learning_rate": 2.9673869547819127e-05, + "loss": 0.5035, + "step": 10171 + }, + { + "epoch": 13.02016, + "grad_norm": 0.9581865072250366, + "learning_rate": 2.9671868747499e-05, + "loss": 0.5897, + "step": 10172 + }, + { + "epoch": 13.02144, + "grad_norm": 0.9103356599807739, + "learning_rate": 2.9669867947178874e-05, + "loss": 0.5305, + "step": 10173 + }, + { + "epoch": 13.02272, + "grad_norm": 0.9056451916694641, + "learning_rate": 2.9667867146858746e-05, + "loss": 0.5249, + "step": 10174 + }, + { + "epoch": 13.024, + "grad_norm": 0.9359303712844849, + "learning_rate": 2.9665866346538618e-05, + "loss": 0.552, + "step": 10175 + }, + { + "epoch": 13.02528, + "grad_norm": 0.9274691343307495, + "learning_rate": 2.966386554621849e-05, + "loss": 0.5194, + "step": 10176 + }, + { + "epoch": 13.02656, + "grad_norm": 0.8577048182487488, + "learning_rate": 2.966186474589836e-05, + "loss": 0.488, + "step": 10177 + }, + { + "epoch": 13.02784, + "grad_norm": 0.8899839520454407, + "learning_rate": 2.965986394557823e-05, + "loss": 0.5371, + "step": 10178 + }, + { + "epoch": 13.02912, + "grad_norm": 0.8708844184875488, + 
"learning_rate": 2.9657863145258102e-05, + "loss": 0.5118, + "step": 10179 + }, + { + "epoch": 13.0304, + "grad_norm": 0.9025720357894897, + "learning_rate": 2.965586234493798e-05, + "loss": 0.5332, + "step": 10180 + }, + { + "epoch": 13.03168, + "grad_norm": 0.9257388710975647, + "learning_rate": 2.965386154461785e-05, + "loss": 0.6321, + "step": 10181 + }, + { + "epoch": 13.03296, + "grad_norm": 0.8940935730934143, + "learning_rate": 2.965186074429772e-05, + "loss": 0.5406, + "step": 10182 + }, + { + "epoch": 13.03424, + "grad_norm": 0.9493997693061829, + "learning_rate": 2.9649859943977593e-05, + "loss": 0.5364, + "step": 10183 + }, + { + "epoch": 13.03552, + "grad_norm": 0.8869070410728455, + "learning_rate": 2.9647859143657464e-05, + "loss": 0.5076, + "step": 10184 + }, + { + "epoch": 13.0368, + "grad_norm": 0.9130921363830566, + "learning_rate": 2.9645858343337336e-05, + "loss": 0.553, + "step": 10185 + }, + { + "epoch": 13.03808, + "grad_norm": 0.8858230113983154, + "learning_rate": 2.9643857543017205e-05, + "loss": 0.471, + "step": 10186 + }, + { + "epoch": 13.03936, + "grad_norm": 0.8924762010574341, + "learning_rate": 2.9641856742697083e-05, + "loss": 0.5224, + "step": 10187 + }, + { + "epoch": 13.04064, + "grad_norm": 0.9254968762397766, + "learning_rate": 2.9639855942376955e-05, + "loss": 0.5203, + "step": 10188 + }, + { + "epoch": 13.04192, + "grad_norm": 0.9559713006019592, + "learning_rate": 2.9637855142056824e-05, + "loss": 0.5552, + "step": 10189 + }, + { + "epoch": 13.0432, + "grad_norm": 0.9039125442504883, + "learning_rate": 2.9635854341736696e-05, + "loss": 0.5119, + "step": 10190 + }, + { + "epoch": 13.04448, + "grad_norm": 0.9470018744468689, + "learning_rate": 2.9633853541416567e-05, + "loss": 0.5233, + "step": 10191 + }, + { + "epoch": 13.04576, + "grad_norm": 0.991151750087738, + "learning_rate": 2.963185274109644e-05, + "loss": 0.5534, + "step": 10192 + }, + { + "epoch": 13.04704, + "grad_norm": 0.9508792161941528, + "learning_rate": 
2.962985194077631e-05, + "loss": 0.4863, + "step": 10193 + }, + { + "epoch": 13.04832, + "grad_norm": 0.9612112045288086, + "learning_rate": 2.9627851140456186e-05, + "loss": 0.5613, + "step": 10194 + }, + { + "epoch": 13.0496, + "grad_norm": 0.9475077986717224, + "learning_rate": 2.9625850340136058e-05, + "loss": 0.5695, + "step": 10195 + }, + { + "epoch": 13.05088, + "grad_norm": 0.9162499308586121, + "learning_rate": 2.962384953981593e-05, + "loss": 0.5628, + "step": 10196 + }, + { + "epoch": 13.05216, + "grad_norm": 0.9611510038375854, + "learning_rate": 2.96218487394958e-05, + "loss": 0.5155, + "step": 10197 + }, + { + "epoch": 13.05344, + "grad_norm": 0.9086456298828125, + "learning_rate": 2.961984793917567e-05, + "loss": 0.5197, + "step": 10198 + }, + { + "epoch": 13.05472, + "grad_norm": 0.9529975652694702, + "learning_rate": 2.9617847138855542e-05, + "loss": 0.5468, + "step": 10199 + }, + { + "epoch": 13.056, + "grad_norm": 0.9286540746688843, + "learning_rate": 2.9615846338535414e-05, + "loss": 0.5342, + "step": 10200 + }, + { + "epoch": 13.05728, + "grad_norm": 0.9417791366577148, + "learning_rate": 2.961384553821529e-05, + "loss": 0.5515, + "step": 10201 + }, + { + "epoch": 13.05856, + "grad_norm": 0.9122342467308044, + "learning_rate": 2.961184473789516e-05, + "loss": 0.4782, + "step": 10202 + }, + { + "epoch": 13.05984, + "grad_norm": 0.9562501311302185, + "learning_rate": 2.9609843937575033e-05, + "loss": 0.5577, + "step": 10203 + }, + { + "epoch": 13.06112, + "grad_norm": 0.9384812116622925, + "learning_rate": 2.9607843137254905e-05, + "loss": 0.5087, + "step": 10204 + }, + { + "epoch": 13.0624, + "grad_norm": 0.9272902607917786, + "learning_rate": 2.9605842336934773e-05, + "loss": 0.4929, + "step": 10205 + }, + { + "epoch": 13.06368, + "grad_norm": 0.9776669144630432, + "learning_rate": 2.9603841536614645e-05, + "loss": 0.5292, + "step": 10206 + }, + { + "epoch": 13.06496, + "grad_norm": 0.9316274523735046, + "learning_rate": 
2.9601840736294517e-05, + "loss": 0.5501, + "step": 10207 + }, + { + "epoch": 13.06624, + "grad_norm": 0.9690887928009033, + "learning_rate": 2.9599839935974392e-05, + "loss": 0.5342, + "step": 10208 + }, + { + "epoch": 13.06752, + "grad_norm": 0.8935588002204895, + "learning_rate": 2.9597839135654264e-05, + "loss": 0.5046, + "step": 10209 + }, + { + "epoch": 13.0688, + "grad_norm": 0.9328128099441528, + "learning_rate": 2.9595838335334136e-05, + "loss": 0.5601, + "step": 10210 + }, + { + "epoch": 13.07008, + "grad_norm": 0.9481015801429749, + "learning_rate": 2.9593837535014008e-05, + "loss": 0.5336, + "step": 10211 + }, + { + "epoch": 13.07136, + "grad_norm": 0.9740045070648193, + "learning_rate": 2.959183673469388e-05, + "loss": 0.5383, + "step": 10212 + }, + { + "epoch": 13.07264, + "grad_norm": 0.9591369032859802, + "learning_rate": 2.958983593437375e-05, + "loss": 0.5688, + "step": 10213 + }, + { + "epoch": 13.07392, + "grad_norm": 0.8902487754821777, + "learning_rate": 2.958783513405362e-05, + "loss": 0.5243, + "step": 10214 + }, + { + "epoch": 13.0752, + "grad_norm": 0.8981714844703674, + "learning_rate": 2.95858343337335e-05, + "loss": 0.504, + "step": 10215 + }, + { + "epoch": 13.07648, + "grad_norm": 0.857538640499115, + "learning_rate": 2.9583833533413367e-05, + "loss": 0.4907, + "step": 10216 + }, + { + "epoch": 13.07776, + "grad_norm": 0.9131780862808228, + "learning_rate": 2.958183273309324e-05, + "loss": 0.5018, + "step": 10217 + }, + { + "epoch": 13.079039999999999, + "grad_norm": 0.9661425352096558, + "learning_rate": 2.957983193277311e-05, + "loss": 0.5639, + "step": 10218 + }, + { + "epoch": 13.08032, + "grad_norm": 0.9604724645614624, + "learning_rate": 2.9577831132452983e-05, + "loss": 0.5433, + "step": 10219 + }, + { + "epoch": 13.0816, + "grad_norm": 0.9069200158119202, + "learning_rate": 2.9575830332132855e-05, + "loss": 0.5409, + "step": 10220 + }, + { + "epoch": 13.08288, + "grad_norm": 0.9007838368415833, + "learning_rate": 
2.9573829531812723e-05, + "loss": 0.5377, + "step": 10221 + }, + { + "epoch": 13.08416, + "grad_norm": 0.9173440337181091, + "learning_rate": 2.9571828731492595e-05, + "loss": 0.5275, + "step": 10222 + }, + { + "epoch": 13.08544, + "grad_norm": 0.936603844165802, + "learning_rate": 2.9569827931172474e-05, + "loss": 0.5257, + "step": 10223 + }, + { + "epoch": 13.08672, + "grad_norm": 0.9958267211914062, + "learning_rate": 2.9567827130852342e-05, + "loss": 0.5632, + "step": 10224 + }, + { + "epoch": 13.088, + "grad_norm": 0.9502796530723572, + "learning_rate": 2.9565826330532214e-05, + "loss": 0.5396, + "step": 10225 + }, + { + "epoch": 13.08928, + "grad_norm": 0.8708899617195129, + "learning_rate": 2.9563825530212086e-05, + "loss": 0.4855, + "step": 10226 + }, + { + "epoch": 13.09056, + "grad_norm": 0.9281712770462036, + "learning_rate": 2.9561824729891958e-05, + "loss": 0.4983, + "step": 10227 + }, + { + "epoch": 13.09184, + "grad_norm": 0.943922221660614, + "learning_rate": 2.955982392957183e-05, + "loss": 0.5545, + "step": 10228 + }, + { + "epoch": 13.09312, + "grad_norm": 0.9381184577941895, + "learning_rate": 2.9557823129251698e-05, + "loss": 0.5406, + "step": 10229 + }, + { + "epoch": 13.0944, + "grad_norm": 0.9126680493354797, + "learning_rate": 2.9555822328931577e-05, + "loss": 0.5364, + "step": 10230 + }, + { + "epoch": 13.09568, + "grad_norm": 0.9556214809417725, + "learning_rate": 2.955382152861145e-05, + "loss": 0.5298, + "step": 10231 + }, + { + "epoch": 13.09696, + "grad_norm": 0.9941635727882385, + "learning_rate": 2.9551820728291317e-05, + "loss": 0.5511, + "step": 10232 + }, + { + "epoch": 13.09824, + "grad_norm": 0.9520924687385559, + "learning_rate": 2.954981992797119e-05, + "loss": 0.5176, + "step": 10233 + }, + { + "epoch": 13.09952, + "grad_norm": 1.0025166273117065, + "learning_rate": 2.954781912765106e-05, + "loss": 0.5434, + "step": 10234 + }, + { + "epoch": 13.1008, + "grad_norm": 0.9303215146064758, + "learning_rate": 
2.9545818327330933e-05, + "loss": 0.5081, + "step": 10235 + }, + { + "epoch": 13.10208, + "grad_norm": 0.9125444293022156, + "learning_rate": 2.9543817527010804e-05, + "loss": 0.5134, + "step": 10236 + }, + { + "epoch": 13.10336, + "grad_norm": 0.9002273678779602, + "learning_rate": 2.954181672669068e-05, + "loss": 0.4885, + "step": 10237 + }, + { + "epoch": 13.10464, + "grad_norm": 0.9505068063735962, + "learning_rate": 2.953981592637055e-05, + "loss": 0.5645, + "step": 10238 + }, + { + "epoch": 13.10592, + "grad_norm": 0.9050679206848145, + "learning_rate": 2.9537815126050423e-05, + "loss": 0.5322, + "step": 10239 + }, + { + "epoch": 13.1072, + "grad_norm": 0.9206121563911438, + "learning_rate": 2.9535814325730292e-05, + "loss": 0.5077, + "step": 10240 + }, + { + "epoch": 13.10848, + "grad_norm": 0.9037232398986816, + "learning_rate": 2.9533813525410164e-05, + "loss": 0.5202, + "step": 10241 + }, + { + "epoch": 13.10976, + "grad_norm": 0.9261408448219299, + "learning_rate": 2.9531812725090036e-05, + "loss": 0.5514, + "step": 10242 + }, + { + "epoch": 13.11104, + "grad_norm": 0.9304765462875366, + "learning_rate": 2.9529811924769907e-05, + "loss": 0.5671, + "step": 10243 + }, + { + "epoch": 13.11232, + "grad_norm": 0.8699920773506165, + "learning_rate": 2.9527811124449783e-05, + "loss": 0.5076, + "step": 10244 + }, + { + "epoch": 13.1136, + "grad_norm": 0.9810715913772583, + "learning_rate": 2.9525810324129655e-05, + "loss": 0.6187, + "step": 10245 + }, + { + "epoch": 13.11488, + "grad_norm": 0.9661498665809631, + "learning_rate": 2.9523809523809526e-05, + "loss": 0.575, + "step": 10246 + }, + { + "epoch": 13.11616, + "grad_norm": 0.9530900120735168, + "learning_rate": 2.9521808723489398e-05, + "loss": 0.5207, + "step": 10247 + }, + { + "epoch": 13.11744, + "grad_norm": 0.8980867266654968, + "learning_rate": 2.9519807923169267e-05, + "loss": 0.48, + "step": 10248 + }, + { + "epoch": 13.11872, + "grad_norm": 1.0098669528961182, + "learning_rate": 
2.951780712284914e-05, + "loss": 0.5863, + "step": 10249 + }, + { + "epoch": 13.12, + "grad_norm": 0.8966968655586243, + "learning_rate": 2.951580632252901e-05, + "loss": 0.5161, + "step": 10250 + }, + { + "epoch": 13.12128, + "grad_norm": 0.8702352046966553, + "learning_rate": 2.9513805522208886e-05, + "loss": 0.4706, + "step": 10251 + }, + { + "epoch": 13.12256, + "grad_norm": 0.923116147518158, + "learning_rate": 2.9511804721888758e-05, + "loss": 0.5097, + "step": 10252 + }, + { + "epoch": 13.12384, + "grad_norm": 0.9864121079444885, + "learning_rate": 2.950980392156863e-05, + "loss": 0.5453, + "step": 10253 + }, + { + "epoch": 13.12512, + "grad_norm": 0.9209245443344116, + "learning_rate": 2.95078031212485e-05, + "loss": 0.5178, + "step": 10254 + }, + { + "epoch": 13.1264, + "grad_norm": 0.9390770792961121, + "learning_rate": 2.9505802320928373e-05, + "loss": 0.5423, + "step": 10255 + }, + { + "epoch": 13.12768, + "grad_norm": 0.9811953902244568, + "learning_rate": 2.950380152060824e-05, + "loss": 0.5938, + "step": 10256 + }, + { + "epoch": 13.12896, + "grad_norm": 0.9948422908782959, + "learning_rate": 2.9501800720288113e-05, + "loss": 0.6015, + "step": 10257 + }, + { + "epoch": 13.13024, + "grad_norm": 0.9334530830383301, + "learning_rate": 2.9499799919967992e-05, + "loss": 0.5042, + "step": 10258 + }, + { + "epoch": 13.13152, + "grad_norm": 0.887524425983429, + "learning_rate": 2.949779911964786e-05, + "loss": 0.5115, + "step": 10259 + }, + { + "epoch": 13.1328, + "grad_norm": 0.9282869696617126, + "learning_rate": 2.9495798319327732e-05, + "loss": 0.5348, + "step": 10260 + }, + { + "epoch": 13.13408, + "grad_norm": 0.8602845072746277, + "learning_rate": 2.9493797519007604e-05, + "loss": 0.477, + "step": 10261 + }, + { + "epoch": 13.13536, + "grad_norm": 0.9811587333679199, + "learning_rate": 2.9491796718687476e-05, + "loss": 0.6148, + "step": 10262 + }, + { + "epoch": 13.13664, + "grad_norm": 0.938101589679718, + "learning_rate": 2.9489795918367348e-05, + 
"loss": 0.5469, + "step": 10263 + }, + { + "epoch": 13.13792, + "grad_norm": 0.938423216342926, + "learning_rate": 2.9487795118047216e-05, + "loss": 0.5415, + "step": 10264 + }, + { + "epoch": 13.1392, + "grad_norm": 0.9609931111335754, + "learning_rate": 2.9485794317727095e-05, + "loss": 0.5779, + "step": 10265 + }, + { + "epoch": 13.14048, + "grad_norm": 0.9402124881744385, + "learning_rate": 2.9483793517406967e-05, + "loss": 0.5688, + "step": 10266 + }, + { + "epoch": 13.14176, + "grad_norm": 0.9588215351104736, + "learning_rate": 2.9481792717086835e-05, + "loss": 0.5559, + "step": 10267 + }, + { + "epoch": 13.14304, + "grad_norm": 0.9343993663787842, + "learning_rate": 2.9479791916766707e-05, + "loss": 0.5233, + "step": 10268 + }, + { + "epoch": 13.14432, + "grad_norm": 0.8951725363731384, + "learning_rate": 2.947779111644658e-05, + "loss": 0.4691, + "step": 10269 + }, + { + "epoch": 13.1456, + "grad_norm": 0.9678958654403687, + "learning_rate": 2.947579031612645e-05, + "loss": 0.5648, + "step": 10270 + }, + { + "epoch": 13.14688, + "grad_norm": 0.973098874092102, + "learning_rate": 2.9473789515806323e-05, + "loss": 0.527, + "step": 10271 + }, + { + "epoch": 13.14816, + "grad_norm": 0.9279984831809998, + "learning_rate": 2.9471788715486198e-05, + "loss": 0.4758, + "step": 10272 + }, + { + "epoch": 13.14944, + "grad_norm": 0.9784411787986755, + "learning_rate": 2.946978791516607e-05, + "loss": 0.5491, + "step": 10273 + }, + { + "epoch": 13.15072, + "grad_norm": 0.9150949120521545, + "learning_rate": 2.9467787114845942e-05, + "loss": 0.5418, + "step": 10274 + }, + { + "epoch": 13.152, + "grad_norm": 0.8972795605659485, + "learning_rate": 2.946578631452581e-05, + "loss": 0.5065, + "step": 10275 + }, + { + "epoch": 13.15328, + "grad_norm": 0.8859349489212036, + "learning_rate": 2.9463785514205682e-05, + "loss": 0.5211, + "step": 10276 + }, + { + "epoch": 13.15456, + "grad_norm": 0.9786564707756042, + "learning_rate": 2.9461784713885554e-05, + "loss": 0.5468, + 
"step": 10277 + }, + { + "epoch": 13.15584, + "grad_norm": 0.9281904101371765, + "learning_rate": 2.9459783913565426e-05, + "loss": 0.5303, + "step": 10278 + }, + { + "epoch": 13.15712, + "grad_norm": 0.8598402142524719, + "learning_rate": 2.94577831132453e-05, + "loss": 0.4675, + "step": 10279 + }, + { + "epoch": 13.1584, + "grad_norm": 0.881607174873352, + "learning_rate": 2.9455782312925173e-05, + "loss": 0.4969, + "step": 10280 + }, + { + "epoch": 13.15968, + "grad_norm": 0.8797462582588196, + "learning_rate": 2.9453781512605045e-05, + "loss": 0.5216, + "step": 10281 + }, + { + "epoch": 13.16096, + "grad_norm": 0.9314226508140564, + "learning_rate": 2.9451780712284917e-05, + "loss": 0.5117, + "step": 10282 + }, + { + "epoch": 13.16224, + "grad_norm": 0.9078558683395386, + "learning_rate": 2.9449779911964785e-05, + "loss": 0.5344, + "step": 10283 + }, + { + "epoch": 13.16352, + "grad_norm": 0.972720742225647, + "learning_rate": 2.9447779111644657e-05, + "loss": 0.5651, + "step": 10284 + }, + { + "epoch": 13.1648, + "grad_norm": 0.8826903104782104, + "learning_rate": 2.944577831132453e-05, + "loss": 0.4932, + "step": 10285 + }, + { + "epoch": 13.166080000000001, + "grad_norm": 0.9253196120262146, + "learning_rate": 2.9443777511004404e-05, + "loss": 0.5725, + "step": 10286 + }, + { + "epoch": 13.16736, + "grad_norm": 0.9346468448638916, + "learning_rate": 2.9441776710684276e-05, + "loss": 0.5665, + "step": 10287 + }, + { + "epoch": 13.16864, + "grad_norm": 0.86590176820755, + "learning_rate": 2.9439775910364148e-05, + "loss": 0.4922, + "step": 10288 + }, + { + "epoch": 13.16992, + "grad_norm": 0.8929559588432312, + "learning_rate": 2.943777511004402e-05, + "loss": 0.5051, + "step": 10289 + }, + { + "epoch": 13.1712, + "grad_norm": 0.9785338044166565, + "learning_rate": 2.943577430972389e-05, + "loss": 0.565, + "step": 10290 + }, + { + "epoch": 13.17248, + "grad_norm": 0.9786810278892517, + "learning_rate": 2.943377350940376e-05, + "loss": 0.5285, + "step": 10291 + 
}, + { + "epoch": 13.17376, + "grad_norm": 1.0224378108978271, + "learning_rate": 2.9431772709083632e-05, + "loss": 0.6223, + "step": 10292 + }, + { + "epoch": 13.17504, + "grad_norm": 0.8732693195343018, + "learning_rate": 2.942977190876351e-05, + "loss": 0.4957, + "step": 10293 + }, + { + "epoch": 13.17632, + "grad_norm": 0.8983555436134338, + "learning_rate": 2.942777110844338e-05, + "loss": 0.5085, + "step": 10294 + }, + { + "epoch": 13.1776, + "grad_norm": 0.9694886803627014, + "learning_rate": 2.942577030812325e-05, + "loss": 0.534, + "step": 10295 + }, + { + "epoch": 13.17888, + "grad_norm": 0.9326276183128357, + "learning_rate": 2.9423769507803123e-05, + "loss": 0.5244, + "step": 10296 + }, + { + "epoch": 13.18016, + "grad_norm": 0.9629067778587341, + "learning_rate": 2.9421768707482994e-05, + "loss": 0.5082, + "step": 10297 + }, + { + "epoch": 13.18144, + "grad_norm": 0.8940865397453308, + "learning_rate": 2.9419767907162866e-05, + "loss": 0.4947, + "step": 10298 + }, + { + "epoch": 13.18272, + "grad_norm": 0.9360054135322571, + "learning_rate": 2.9417767106842735e-05, + "loss": 0.5556, + "step": 10299 + }, + { + "epoch": 13.184, + "grad_norm": 0.8930971026420593, + "learning_rate": 2.9415766306522613e-05, + "loss": 0.4866, + "step": 10300 + }, + { + "epoch": 13.18528, + "grad_norm": 0.9753515720367432, + "learning_rate": 2.9413765506202485e-05, + "loss": 0.4929, + "step": 10301 + }, + { + "epoch": 13.18656, + "grad_norm": 0.8833822011947632, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.5081, + "step": 10302 + }, + { + "epoch": 13.18784, + "grad_norm": 0.9951672554016113, + "learning_rate": 2.9409763905562226e-05, + "loss": 0.5509, + "step": 10303 + }, + { + "epoch": 13.18912, + "grad_norm": 0.9447710514068604, + "learning_rate": 2.9407763105242097e-05, + "loss": 0.5132, + "step": 10304 + }, + { + "epoch": 13.1904, + "grad_norm": 0.960518479347229, + "learning_rate": 2.940576230492197e-05, + "loss": 0.5609, + "step": 10305 + }, + { + "epoch": 
13.19168, + "grad_norm": 0.9514713883399963, + "learning_rate": 2.940376150460184e-05, + "loss": 0.5378, + "step": 10306 + }, + { + "epoch": 13.19296, + "grad_norm": 0.9678971767425537, + "learning_rate": 2.9401760704281716e-05, + "loss": 0.5464, + "step": 10307 + }, + { + "epoch": 13.19424, + "grad_norm": 0.9750454425811768, + "learning_rate": 2.9399759903961588e-05, + "loss": 0.5489, + "step": 10308 + }, + { + "epoch": 13.19552, + "grad_norm": 0.9416602849960327, + "learning_rate": 2.939775910364146e-05, + "loss": 0.5295, + "step": 10309 + }, + { + "epoch": 13.1968, + "grad_norm": 0.9217522144317627, + "learning_rate": 2.939575830332133e-05, + "loss": 0.562, + "step": 10310 + }, + { + "epoch": 13.19808, + "grad_norm": 0.8989900946617126, + "learning_rate": 2.93937575030012e-05, + "loss": 0.5067, + "step": 10311 + }, + { + "epoch": 13.19936, + "grad_norm": 0.9681139588356018, + "learning_rate": 2.9391756702681072e-05, + "loss": 0.5817, + "step": 10312 + }, + { + "epoch": 13.20064, + "grad_norm": 0.9325006604194641, + "learning_rate": 2.9389755902360944e-05, + "loss": 0.5379, + "step": 10313 + }, + { + "epoch": 13.20192, + "grad_norm": 0.9002286195755005, + "learning_rate": 2.938775510204082e-05, + "loss": 0.5274, + "step": 10314 + }, + { + "epoch": 13.2032, + "grad_norm": 0.9191522598266602, + "learning_rate": 2.938575430172069e-05, + "loss": 0.5265, + "step": 10315 + }, + { + "epoch": 13.20448, + "grad_norm": 0.9868195056915283, + "learning_rate": 2.9383753501400563e-05, + "loss": 0.5672, + "step": 10316 + }, + { + "epoch": 13.20576, + "grad_norm": 0.935142457485199, + "learning_rate": 2.9381752701080435e-05, + "loss": 0.559, + "step": 10317 + }, + { + "epoch": 13.20704, + "grad_norm": 0.9310742616653442, + "learning_rate": 2.9379751900760303e-05, + "loss": 0.5183, + "step": 10318 + }, + { + "epoch": 13.20832, + "grad_norm": 0.893144428730011, + "learning_rate": 2.9377751100440175e-05, + "loss": 0.5092, + "step": 10319 + }, + { + "epoch": 13.2096, + "grad_norm": 
0.9234650135040283, + "learning_rate": 2.9375750300120047e-05, + "loss": 0.5185, + "step": 10320 + }, + { + "epoch": 13.21088, + "grad_norm": 0.868157684803009, + "learning_rate": 2.9373749499799922e-05, + "loss": 0.4812, + "step": 10321 + }, + { + "epoch": 13.21216, + "grad_norm": 0.900688886642456, + "learning_rate": 2.9371748699479794e-05, + "loss": 0.5181, + "step": 10322 + }, + { + "epoch": 13.21344, + "grad_norm": 0.9493388533592224, + "learning_rate": 2.9369747899159666e-05, + "loss": 0.5613, + "step": 10323 + }, + { + "epoch": 13.21472, + "grad_norm": 0.9593266844749451, + "learning_rate": 2.9367747098839538e-05, + "loss": 0.5462, + "step": 10324 + }, + { + "epoch": 13.216, + "grad_norm": 1.0145825147628784, + "learning_rate": 2.936574629851941e-05, + "loss": 0.5891, + "step": 10325 + }, + { + "epoch": 13.21728, + "grad_norm": 0.9658340811729431, + "learning_rate": 2.936374549819928e-05, + "loss": 0.533, + "step": 10326 + }, + { + "epoch": 13.21856, + "grad_norm": 0.8957772850990295, + "learning_rate": 2.936174469787915e-05, + "loss": 0.5246, + "step": 10327 + }, + { + "epoch": 13.21984, + "grad_norm": 0.9555619955062866, + "learning_rate": 2.935974389755903e-05, + "loss": 0.5181, + "step": 10328 + }, + { + "epoch": 13.22112, + "grad_norm": 0.9531823396682739, + "learning_rate": 2.9357743097238897e-05, + "loss": 0.5459, + "step": 10329 + }, + { + "epoch": 13.2224, + "grad_norm": 0.8920832276344299, + "learning_rate": 2.935574229691877e-05, + "loss": 0.5796, + "step": 10330 + }, + { + "epoch": 13.22368, + "grad_norm": 0.9434327483177185, + "learning_rate": 2.935374149659864e-05, + "loss": 0.5694, + "step": 10331 + }, + { + "epoch": 13.22496, + "grad_norm": 0.9526799321174622, + "learning_rate": 2.9351740696278513e-05, + "loss": 0.5646, + "step": 10332 + }, + { + "epoch": 13.22624, + "grad_norm": 1.0042227506637573, + "learning_rate": 2.9349739895958385e-05, + "loss": 0.5437, + "step": 10333 + }, + { + "epoch": 13.22752, + "grad_norm": 0.974702775478363, + 
"learning_rate": 2.9347739095638253e-05, + "loss": 0.5569, + "step": 10334 + }, + { + "epoch": 13.2288, + "grad_norm": 0.9160645604133606, + "learning_rate": 2.9345738295318125e-05, + "loss": 0.5174, + "step": 10335 + }, + { + "epoch": 13.23008, + "grad_norm": 0.9802337884902954, + "learning_rate": 2.9343737494998004e-05, + "loss": 0.578, + "step": 10336 + }, + { + "epoch": 13.23136, + "grad_norm": 0.9759039878845215, + "learning_rate": 2.9341736694677872e-05, + "loss": 0.5744, + "step": 10337 + }, + { + "epoch": 13.23264, + "grad_norm": 0.9348410367965698, + "learning_rate": 2.9339735894357744e-05, + "loss": 0.5304, + "step": 10338 + }, + { + "epoch": 13.23392, + "grad_norm": 0.9038177728652954, + "learning_rate": 2.9337735094037616e-05, + "loss": 0.5301, + "step": 10339 + }, + { + "epoch": 13.2352, + "grad_norm": 0.880155086517334, + "learning_rate": 2.9335734293717488e-05, + "loss": 0.496, + "step": 10340 + }, + { + "epoch": 13.23648, + "grad_norm": 0.9181184768676758, + "learning_rate": 2.933373349339736e-05, + "loss": 0.4825, + "step": 10341 + }, + { + "epoch": 13.23776, + "grad_norm": 0.9265780448913574, + "learning_rate": 2.9331732693077228e-05, + "loss": 0.5313, + "step": 10342 + }, + { + "epoch": 13.23904, + "grad_norm": 0.9894605875015259, + "learning_rate": 2.9329731892757107e-05, + "loss": 0.5617, + "step": 10343 + }, + { + "epoch": 13.24032, + "grad_norm": 0.9149695634841919, + "learning_rate": 2.932773109243698e-05, + "loss": 0.5262, + "step": 10344 + }, + { + "epoch": 13.2416, + "grad_norm": 0.9303181171417236, + "learning_rate": 2.9325730292116847e-05, + "loss": 0.542, + "step": 10345 + }, + { + "epoch": 13.24288, + "grad_norm": 0.958263099193573, + "learning_rate": 2.932372949179672e-05, + "loss": 0.5283, + "step": 10346 + }, + { + "epoch": 13.24416, + "grad_norm": 0.9637152552604675, + "learning_rate": 2.932172869147659e-05, + "loss": 0.5236, + "step": 10347 + }, + { + "epoch": 13.24544, + "grad_norm": 0.9949811100959778, + "learning_rate": 
2.9319727891156463e-05, + "loss": 0.5897, + "step": 10348 + }, + { + "epoch": 13.24672, + "grad_norm": 0.988006055355072, + "learning_rate": 2.9317727090836334e-05, + "loss": 0.4802, + "step": 10349 + }, + { + "epoch": 13.248, + "grad_norm": 0.9795376062393188, + "learning_rate": 2.931572629051621e-05, + "loss": 0.5514, + "step": 10350 + }, + { + "epoch": 13.24928, + "grad_norm": 0.9923257827758789, + "learning_rate": 2.931372549019608e-05, + "loss": 0.5244, + "step": 10351 + }, + { + "epoch": 13.25056, + "grad_norm": 0.9084767699241638, + "learning_rate": 2.9311724689875953e-05, + "loss": 0.5395, + "step": 10352 + }, + { + "epoch": 13.25184, + "grad_norm": 0.9532386660575867, + "learning_rate": 2.9309723889555822e-05, + "loss": 0.5635, + "step": 10353 + }, + { + "epoch": 13.25312, + "grad_norm": 0.9513797163963318, + "learning_rate": 2.9307723089235694e-05, + "loss": 0.5299, + "step": 10354 + }, + { + "epoch": 13.2544, + "grad_norm": 0.9385073184967041, + "learning_rate": 2.9305722288915566e-05, + "loss": 0.5027, + "step": 10355 + }, + { + "epoch": 13.25568, + "grad_norm": 0.9745035767555237, + "learning_rate": 2.9303721488595437e-05, + "loss": 0.4923, + "step": 10356 + }, + { + "epoch": 13.25696, + "grad_norm": 0.9193017482757568, + "learning_rate": 2.9301720688275313e-05, + "loss": 0.5418, + "step": 10357 + }, + { + "epoch": 13.25824, + "grad_norm": 0.9343931674957275, + "learning_rate": 2.9299719887955185e-05, + "loss": 0.5197, + "step": 10358 + }, + { + "epoch": 13.25952, + "grad_norm": 0.954754114151001, + "learning_rate": 2.9297719087635056e-05, + "loss": 0.5203, + "step": 10359 + }, + { + "epoch": 13.2608, + "grad_norm": 0.9113470911979675, + "learning_rate": 2.9295718287314928e-05, + "loss": 0.525, + "step": 10360 + }, + { + "epoch": 13.26208, + "grad_norm": 0.94349205493927, + "learning_rate": 2.9293717486994797e-05, + "loss": 0.5263, + "step": 10361 + }, + { + "epoch": 13.26336, + "grad_norm": 0.980129063129425, + "learning_rate": 2.929171668667467e-05, 
+ "loss": 0.5881, + "step": 10362 + }, + { + "epoch": 13.26464, + "grad_norm": 0.9003955125808716, + "learning_rate": 2.928971588635454e-05, + "loss": 0.5014, + "step": 10363 + }, + { + "epoch": 13.26592, + "grad_norm": 0.9492081999778748, + "learning_rate": 2.928771508603442e-05, + "loss": 0.5945, + "step": 10364 + }, + { + "epoch": 13.2672, + "grad_norm": 0.9139607548713684, + "learning_rate": 2.9285714285714288e-05, + "loss": 0.4997, + "step": 10365 + }, + { + "epoch": 13.26848, + "grad_norm": 0.9153253436088562, + "learning_rate": 2.928371348539416e-05, + "loss": 0.5104, + "step": 10366 + }, + { + "epoch": 13.26976, + "grad_norm": 0.944437563419342, + "learning_rate": 2.928171268507403e-05, + "loss": 0.5496, + "step": 10367 + }, + { + "epoch": 13.27104, + "grad_norm": 0.9628861546516418, + "learning_rate": 2.9279711884753903e-05, + "loss": 0.5243, + "step": 10368 + }, + { + "epoch": 13.27232, + "grad_norm": 0.9358019232749939, + "learning_rate": 2.927771108443377e-05, + "loss": 0.5218, + "step": 10369 + }, + { + "epoch": 13.2736, + "grad_norm": 0.9231796860694885, + "learning_rate": 2.9275710284113643e-05, + "loss": 0.5187, + "step": 10370 + }, + { + "epoch": 13.27488, + "grad_norm": 0.9316679239273071, + "learning_rate": 2.9273709483793522e-05, + "loss": 0.5469, + "step": 10371 + }, + { + "epoch": 13.27616, + "grad_norm": 0.8943392634391785, + "learning_rate": 2.9271708683473394e-05, + "loss": 0.4963, + "step": 10372 + }, + { + "epoch": 13.27744, + "grad_norm": 0.9293007850646973, + "learning_rate": 2.9269707883153262e-05, + "loss": 0.5534, + "step": 10373 + }, + { + "epoch": 13.27872, + "grad_norm": 0.9566161632537842, + "learning_rate": 2.9267707082833134e-05, + "loss": 0.6032, + "step": 10374 + }, + { + "epoch": 13.28, + "grad_norm": 0.945538341999054, + "learning_rate": 2.9265706282513006e-05, + "loss": 0.525, + "step": 10375 + }, + { + "epoch": 13.28128, + "grad_norm": 0.9262788891792297, + "learning_rate": 2.9263705482192878e-05, + "loss": 0.5087, + 
"step": 10376 + }, + { + "epoch": 13.28256, + "grad_norm": 0.9590131044387817, + "learning_rate": 2.9261704681872746e-05, + "loss": 0.4981, + "step": 10377 + }, + { + "epoch": 13.28384, + "grad_norm": 0.9247324466705322, + "learning_rate": 2.9259703881552625e-05, + "loss": 0.5045, + "step": 10378 + }, + { + "epoch": 13.28512, + "grad_norm": 0.9148200154304504, + "learning_rate": 2.9257703081232497e-05, + "loss": 0.4692, + "step": 10379 + }, + { + "epoch": 13.2864, + "grad_norm": 0.9700202941894531, + "learning_rate": 2.925570228091237e-05, + "loss": 0.5181, + "step": 10380 + }, + { + "epoch": 13.28768, + "grad_norm": 0.9987017512321472, + "learning_rate": 2.9253701480592237e-05, + "loss": 0.5395, + "step": 10381 + }, + { + "epoch": 13.28896, + "grad_norm": 1.037541151046753, + "learning_rate": 2.925170068027211e-05, + "loss": 0.5464, + "step": 10382 + }, + { + "epoch": 13.29024, + "grad_norm": 0.979844868183136, + "learning_rate": 2.924969987995198e-05, + "loss": 0.5191, + "step": 10383 + }, + { + "epoch": 13.29152, + "grad_norm": 0.9726676344871521, + "learning_rate": 2.9247699079631853e-05, + "loss": 0.5464, + "step": 10384 + }, + { + "epoch": 13.2928, + "grad_norm": 0.9922785758972168, + "learning_rate": 2.9245698279311728e-05, + "loss": 0.5719, + "step": 10385 + }, + { + "epoch": 13.29408, + "grad_norm": 0.9066249132156372, + "learning_rate": 2.92436974789916e-05, + "loss": 0.5037, + "step": 10386 + }, + { + "epoch": 13.29536, + "grad_norm": 0.9383981823921204, + "learning_rate": 2.9241696678671472e-05, + "loss": 0.5398, + "step": 10387 + }, + { + "epoch": 13.29664, + "grad_norm": 0.9474811553955078, + "learning_rate": 2.9239695878351344e-05, + "loss": 0.5275, + "step": 10388 + }, + { + "epoch": 13.29792, + "grad_norm": 0.9555049538612366, + "learning_rate": 2.9237695078031212e-05, + "loss": 0.5737, + "step": 10389 + }, + { + "epoch": 13.2992, + "grad_norm": 0.9900108575820923, + "learning_rate": 2.9235694277711084e-05, + "loss": 0.5522, + "step": 10390 + }, + 
{ + "epoch": 13.30048, + "grad_norm": 0.9271486401557922, + "learning_rate": 2.9233693477390956e-05, + "loss": 0.5284, + "step": 10391 + }, + { + "epoch": 13.30176, + "grad_norm": 0.9608096480369568, + "learning_rate": 2.923169267707083e-05, + "loss": 0.5538, + "step": 10392 + }, + { + "epoch": 13.30304, + "grad_norm": 0.9278596639633179, + "learning_rate": 2.9229691876750703e-05, + "loss": 0.5174, + "step": 10393 + }, + { + "epoch": 13.30432, + "grad_norm": 0.9708346128463745, + "learning_rate": 2.9227691076430575e-05, + "loss": 0.5499, + "step": 10394 + }, + { + "epoch": 13.3056, + "grad_norm": 0.9422946572303772, + "learning_rate": 2.9225690276110447e-05, + "loss": 0.5444, + "step": 10395 + }, + { + "epoch": 13.30688, + "grad_norm": 0.9658584594726562, + "learning_rate": 2.922368947579032e-05, + "loss": 0.4903, + "step": 10396 + }, + { + "epoch": 13.30816, + "grad_norm": 0.978703498840332, + "learning_rate": 2.9221688675470187e-05, + "loss": 0.5569, + "step": 10397 + }, + { + "epoch": 13.30944, + "grad_norm": 0.9500980973243713, + "learning_rate": 2.921968787515006e-05, + "loss": 0.5384, + "step": 10398 + }, + { + "epoch": 13.31072, + "grad_norm": 0.9386541247367859, + "learning_rate": 2.9217687074829937e-05, + "loss": 0.5474, + "step": 10399 + }, + { + "epoch": 13.312, + "grad_norm": 0.9124325513839722, + "learning_rate": 2.9215686274509806e-05, + "loss": 0.5232, + "step": 10400 + }, + { + "epoch": 13.31328, + "grad_norm": 0.9420004487037659, + "learning_rate": 2.9213685474189678e-05, + "loss": 0.5154, + "step": 10401 + }, + { + "epoch": 13.31456, + "grad_norm": 1.029594898223877, + "learning_rate": 2.921168467386955e-05, + "loss": 0.6302, + "step": 10402 + }, + { + "epoch": 13.31584, + "grad_norm": 0.9675827622413635, + "learning_rate": 2.920968387354942e-05, + "loss": 0.5515, + "step": 10403 + }, + { + "epoch": 13.31712, + "grad_norm": 0.9070100784301758, + "learning_rate": 2.9207683073229293e-05, + "loss": 0.5061, + "step": 10404 + }, + { + "epoch": 13.3184, 
+ "grad_norm": 0.9273656606674194, + "learning_rate": 2.9205682272909162e-05, + "loss": 0.5412, + "step": 10405 + }, + { + "epoch": 13.31968, + "grad_norm": 0.9532672762870789, + "learning_rate": 2.920368147258904e-05, + "loss": 0.5259, + "step": 10406 + }, + { + "epoch": 13.32096, + "grad_norm": 0.9627009630203247, + "learning_rate": 2.9201680672268912e-05, + "loss": 0.5173, + "step": 10407 + }, + { + "epoch": 13.32224, + "grad_norm": 0.9319729208946228, + "learning_rate": 2.919967987194878e-05, + "loss": 0.5869, + "step": 10408 + }, + { + "epoch": 13.32352, + "grad_norm": 0.873471200466156, + "learning_rate": 2.9197679071628653e-05, + "loss": 0.4851, + "step": 10409 + }, + { + "epoch": 13.3248, + "grad_norm": 0.9522954821586609, + "learning_rate": 2.9195678271308524e-05, + "loss": 0.5164, + "step": 10410 + }, + { + "epoch": 13.32608, + "grad_norm": 0.961445689201355, + "learning_rate": 2.9193677470988396e-05, + "loss": 0.5083, + "step": 10411 + }, + { + "epoch": 13.32736, + "grad_norm": 0.9519370794296265, + "learning_rate": 2.9191676670668268e-05, + "loss": 0.5352, + "step": 10412 + }, + { + "epoch": 13.32864, + "grad_norm": 0.9867494702339172, + "learning_rate": 2.9189675870348143e-05, + "loss": 0.5495, + "step": 10413 + }, + { + "epoch": 13.32992, + "grad_norm": 0.8867868185043335, + "learning_rate": 2.9187675070028015e-05, + "loss": 0.5229, + "step": 10414 + }, + { + "epoch": 13.3312, + "grad_norm": 0.9686993360519409, + "learning_rate": 2.9185674269707887e-05, + "loss": 0.5375, + "step": 10415 + }, + { + "epoch": 13.33248, + "grad_norm": 0.9888986945152283, + "learning_rate": 2.9183673469387756e-05, + "loss": 0.6055, + "step": 10416 + }, + { + "epoch": 13.33376, + "grad_norm": 0.9201542735099792, + "learning_rate": 2.9181672669067627e-05, + "loss": 0.5122, + "step": 10417 + }, + { + "epoch": 13.33504, + "grad_norm": 0.9173356294631958, + "learning_rate": 2.91796718687475e-05, + "loss": 0.5429, + "step": 10418 + }, + { + "epoch": 13.33632, + "grad_norm": 
0.9418560862541199, + "learning_rate": 2.917767106842737e-05, + "loss": 0.5358, + "step": 10419 + }, + { + "epoch": 13.3376, + "grad_norm": 0.9006747603416443, + "learning_rate": 2.9175670268107246e-05, + "loss": 0.4745, + "step": 10420 + }, + { + "epoch": 13.33888, + "grad_norm": 0.9309629201889038, + "learning_rate": 2.9173669467787118e-05, + "loss": 0.5002, + "step": 10421 + }, + { + "epoch": 13.340160000000001, + "grad_norm": 0.9739012718200684, + "learning_rate": 2.917166866746699e-05, + "loss": 0.5861, + "step": 10422 + }, + { + "epoch": 13.34144, + "grad_norm": 0.9974890351295471, + "learning_rate": 2.9169667867146862e-05, + "loss": 0.5475, + "step": 10423 + }, + { + "epoch": 13.34272, + "grad_norm": 0.9011924862861633, + "learning_rate": 2.916766706682673e-05, + "loss": 0.5111, + "step": 10424 + }, + { + "epoch": 13.344, + "grad_norm": 1.0001959800720215, + "learning_rate": 2.9165666266506602e-05, + "loss": 0.5563, + "step": 10425 + }, + { + "epoch": 13.34528, + "grad_norm": 1.0525249242782593, + "learning_rate": 2.9163665466186474e-05, + "loss": 0.5812, + "step": 10426 + }, + { + "epoch": 13.34656, + "grad_norm": 0.9634531140327454, + "learning_rate": 2.916166466586635e-05, + "loss": 0.5414, + "step": 10427 + }, + { + "epoch": 13.34784, + "grad_norm": 0.9581708908081055, + "learning_rate": 2.915966386554622e-05, + "loss": 0.5498, + "step": 10428 + }, + { + "epoch": 13.34912, + "grad_norm": 1.0304726362228394, + "learning_rate": 2.9157663065226093e-05, + "loss": 0.543, + "step": 10429 + }, + { + "epoch": 13.3504, + "grad_norm": 0.9573806524276733, + "learning_rate": 2.9155662264905965e-05, + "loss": 0.5478, + "step": 10430 + }, + { + "epoch": 13.35168, + "grad_norm": 0.8998541831970215, + "learning_rate": 2.9153661464585837e-05, + "loss": 0.5521, + "step": 10431 + }, + { + "epoch": 13.35296, + "grad_norm": 0.9216572046279907, + "learning_rate": 2.9151660664265705e-05, + "loss": 0.5198, + "step": 10432 + }, + { + "epoch": 13.35424, + "grad_norm": 
0.9832442402839661, + "learning_rate": 2.9149659863945577e-05, + "loss": 0.5542, + "step": 10433 + }, + { + "epoch": 13.35552, + "grad_norm": 0.9601088166236877, + "learning_rate": 2.9147659063625456e-05, + "loss": 0.5416, + "step": 10434 + }, + { + "epoch": 13.3568, + "grad_norm": 0.8933698534965515, + "learning_rate": 2.9145658263305324e-05, + "loss": 0.5418, + "step": 10435 + }, + { + "epoch": 13.35808, + "grad_norm": 0.9842942357063293, + "learning_rate": 2.9143657462985196e-05, + "loss": 0.5526, + "step": 10436 + }, + { + "epoch": 13.35936, + "grad_norm": 0.9776619076728821, + "learning_rate": 2.9141656662665068e-05, + "loss": 0.5864, + "step": 10437 + }, + { + "epoch": 13.36064, + "grad_norm": 0.9885453581809998, + "learning_rate": 2.913965586234494e-05, + "loss": 0.584, + "step": 10438 + }, + { + "epoch": 13.36192, + "grad_norm": 0.9770693182945251, + "learning_rate": 2.913765506202481e-05, + "loss": 0.5273, + "step": 10439 + }, + { + "epoch": 13.3632, + "grad_norm": 0.9777031540870667, + "learning_rate": 2.913565426170468e-05, + "loss": 0.5872, + "step": 10440 + }, + { + "epoch": 13.36448, + "grad_norm": 0.9541405439376831, + "learning_rate": 2.913365346138456e-05, + "loss": 0.5395, + "step": 10441 + }, + { + "epoch": 13.36576, + "grad_norm": 0.9622824192047119, + "learning_rate": 2.913165266106443e-05, + "loss": 0.5505, + "step": 10442 + }, + { + "epoch": 13.36704, + "grad_norm": 0.9015635848045349, + "learning_rate": 2.91296518607443e-05, + "loss": 0.5042, + "step": 10443 + }, + { + "epoch": 13.36832, + "grad_norm": 0.9411506056785583, + "learning_rate": 2.912765106042417e-05, + "loss": 0.5998, + "step": 10444 + }, + { + "epoch": 13.3696, + "grad_norm": 0.9454710483551025, + "learning_rate": 2.9125650260104043e-05, + "loss": 0.5047, + "step": 10445 + }, + { + "epoch": 13.37088, + "grad_norm": 0.9664983153343201, + "learning_rate": 2.9123649459783915e-05, + "loss": 0.5501, + "step": 10446 + }, + { + "epoch": 13.37216, + "grad_norm": 0.9950606226921082, + 
"learning_rate": 2.9121648659463787e-05, + "loss": 0.548, + "step": 10447 + }, + { + "epoch": 13.37344, + "grad_norm": 0.9428080916404724, + "learning_rate": 2.9119647859143655e-05, + "loss": 0.515, + "step": 10448 + }, + { + "epoch": 13.37472, + "grad_norm": 1.0005260705947876, + "learning_rate": 2.9117647058823534e-05, + "loss": 0.6053, + "step": 10449 + }, + { + "epoch": 13.376, + "grad_norm": 0.9256664514541626, + "learning_rate": 2.9115646258503405e-05, + "loss": 0.5313, + "step": 10450 + }, + { + "epoch": 13.37728, + "grad_norm": 0.9487593173980713, + "learning_rate": 2.9113645458183274e-05, + "loss": 0.5372, + "step": 10451 + }, + { + "epoch": 13.37856, + "grad_norm": 0.9541116952896118, + "learning_rate": 2.9111644657863146e-05, + "loss": 0.565, + "step": 10452 + }, + { + "epoch": 13.37984, + "grad_norm": 0.9423818588256836, + "learning_rate": 2.9109643857543018e-05, + "loss": 0.5485, + "step": 10453 + }, + { + "epoch": 13.38112, + "grad_norm": 0.9149635434150696, + "learning_rate": 2.910764305722289e-05, + "loss": 0.4844, + "step": 10454 + }, + { + "epoch": 13.3824, + "grad_norm": 1.0056577920913696, + "learning_rate": 2.910564225690276e-05, + "loss": 0.5753, + "step": 10455 + }, + { + "epoch": 13.38368, + "grad_norm": 1.0535948276519775, + "learning_rate": 2.9103641456582637e-05, + "loss": 0.5831, + "step": 10456 + }, + { + "epoch": 13.38496, + "grad_norm": 0.9600538015365601, + "learning_rate": 2.910164065626251e-05, + "loss": 0.5554, + "step": 10457 + }, + { + "epoch": 13.38624, + "grad_norm": 0.9667772054672241, + "learning_rate": 2.909963985594238e-05, + "loss": 0.5197, + "step": 10458 + }, + { + "epoch": 13.38752, + "grad_norm": 0.949582040309906, + "learning_rate": 2.909763905562225e-05, + "loss": 0.5152, + "step": 10459 + }, + { + "epoch": 13.3888, + "grad_norm": 0.9107711315155029, + "learning_rate": 2.909563825530212e-05, + "loss": 0.4774, + "step": 10460 + }, + { + "epoch": 13.39008, + "grad_norm": 1.0431016683578491, + "learning_rate": 
2.9093637454981993e-05, + "loss": 0.5594, + "step": 10461 + }, + { + "epoch": 13.39136, + "grad_norm": 0.9607868194580078, + "learning_rate": 2.9091636654661864e-05, + "loss": 0.5385, + "step": 10462 + }, + { + "epoch": 13.39264, + "grad_norm": 0.9767603278160095, + "learning_rate": 2.908963585434174e-05, + "loss": 0.5555, + "step": 10463 + }, + { + "epoch": 13.39392, + "grad_norm": 0.920050859451294, + "learning_rate": 2.908763505402161e-05, + "loss": 0.5062, + "step": 10464 + }, + { + "epoch": 13.395199999999999, + "grad_norm": 0.9358279705047607, + "learning_rate": 2.9085634253701483e-05, + "loss": 0.5381, + "step": 10465 + }, + { + "epoch": 13.39648, + "grad_norm": 0.9233344793319702, + "learning_rate": 2.9083633453381355e-05, + "loss": 0.5331, + "step": 10466 + }, + { + "epoch": 13.39776, + "grad_norm": 0.9128267168998718, + "learning_rate": 2.9081632653061224e-05, + "loss": 0.5363, + "step": 10467 + }, + { + "epoch": 13.39904, + "grad_norm": 1.0040749311447144, + "learning_rate": 2.9079631852741096e-05, + "loss": 0.5948, + "step": 10468 + }, + { + "epoch": 13.40032, + "grad_norm": 0.9266423583030701, + "learning_rate": 2.9077631052420967e-05, + "loss": 0.5511, + "step": 10469 + }, + { + "epoch": 13.4016, + "grad_norm": 0.8982942700386047, + "learning_rate": 2.9075630252100843e-05, + "loss": 0.5446, + "step": 10470 + }, + { + "epoch": 13.40288, + "grad_norm": 0.9734070301055908, + "learning_rate": 2.9073629451780714e-05, + "loss": 0.5567, + "step": 10471 + }, + { + "epoch": 13.40416, + "grad_norm": 0.9396576285362244, + "learning_rate": 2.9071628651460586e-05, + "loss": 0.5146, + "step": 10472 + }, + { + "epoch": 13.40544, + "grad_norm": 0.9016363620758057, + "learning_rate": 2.9069627851140458e-05, + "loss": 0.5465, + "step": 10473 + }, + { + "epoch": 13.40672, + "grad_norm": 0.950202465057373, + "learning_rate": 2.906762705082033e-05, + "loss": 0.5659, + "step": 10474 + }, + { + "epoch": 13.408, + "grad_norm": 0.9889881610870361, + "learning_rate": 
2.90656262505002e-05, + "loss": 0.5675, + "step": 10475 + }, + { + "epoch": 13.40928, + "grad_norm": 0.9744821190834045, + "learning_rate": 2.906362545018007e-05, + "loss": 0.5625, + "step": 10476 + }, + { + "epoch": 13.41056, + "grad_norm": 0.9143860340118408, + "learning_rate": 2.906162464985995e-05, + "loss": 0.522, + "step": 10477 + }, + { + "epoch": 13.41184, + "grad_norm": 0.9224212169647217, + "learning_rate": 2.9059623849539817e-05, + "loss": 0.4886, + "step": 10478 + }, + { + "epoch": 13.41312, + "grad_norm": 0.9844509959220886, + "learning_rate": 2.905762304921969e-05, + "loss": 0.559, + "step": 10479 + }, + { + "epoch": 13.4144, + "grad_norm": 0.919024646282196, + "learning_rate": 2.905562224889956e-05, + "loss": 0.5372, + "step": 10480 + }, + { + "epoch": 13.41568, + "grad_norm": 0.9639186859130859, + "learning_rate": 2.9053621448579433e-05, + "loss": 0.5797, + "step": 10481 + }, + { + "epoch": 13.41696, + "grad_norm": 0.926937460899353, + "learning_rate": 2.9051620648259305e-05, + "loss": 0.5645, + "step": 10482 + }, + { + "epoch": 13.41824, + "grad_norm": 0.9183882474899292, + "learning_rate": 2.9049619847939173e-05, + "loss": 0.5077, + "step": 10483 + }, + { + "epoch": 13.41952, + "grad_norm": 0.9052950739860535, + "learning_rate": 2.9047619047619052e-05, + "loss": 0.5042, + "step": 10484 + }, + { + "epoch": 13.4208, + "grad_norm": 0.9210243225097656, + "learning_rate": 2.9045618247298924e-05, + "loss": 0.5045, + "step": 10485 + }, + { + "epoch": 13.42208, + "grad_norm": 0.9332923293113708, + "learning_rate": 2.9043617446978792e-05, + "loss": 0.556, + "step": 10486 + }, + { + "epoch": 13.42336, + "grad_norm": 0.8918114304542542, + "learning_rate": 2.9041616646658664e-05, + "loss": 0.5455, + "step": 10487 + }, + { + "epoch": 13.42464, + "grad_norm": 1.0036391019821167, + "learning_rate": 2.9039615846338536e-05, + "loss": 0.6048, + "step": 10488 + }, + { + "epoch": 13.42592, + "grad_norm": 0.9452251195907593, + "learning_rate": 2.9037615046018408e-05, 
+ "loss": 0.5311, + "step": 10489 + }, + { + "epoch": 13.4272, + "grad_norm": 0.9231464266777039, + "learning_rate": 2.903561424569828e-05, + "loss": 0.5153, + "step": 10490 + }, + { + "epoch": 13.42848, + "grad_norm": 0.9702645540237427, + "learning_rate": 2.9033613445378155e-05, + "loss": 0.549, + "step": 10491 + }, + { + "epoch": 13.42976, + "grad_norm": 0.9461646676063538, + "learning_rate": 2.9031612645058027e-05, + "loss": 0.5309, + "step": 10492 + }, + { + "epoch": 13.43104, + "grad_norm": 0.9646087288856506, + "learning_rate": 2.90296118447379e-05, + "loss": 0.5819, + "step": 10493 + }, + { + "epoch": 13.43232, + "grad_norm": 0.9191693067550659, + "learning_rate": 2.9027611044417767e-05, + "loss": 0.5552, + "step": 10494 + }, + { + "epoch": 13.4336, + "grad_norm": 0.9861147999763489, + "learning_rate": 2.902561024409764e-05, + "loss": 0.5568, + "step": 10495 + }, + { + "epoch": 13.43488, + "grad_norm": 0.9543395042419434, + "learning_rate": 2.902360944377751e-05, + "loss": 0.5337, + "step": 10496 + }, + { + "epoch": 13.43616, + "grad_norm": 0.9656491875648499, + "learning_rate": 2.9021608643457383e-05, + "loss": 0.5232, + "step": 10497 + }, + { + "epoch": 13.43744, + "grad_norm": 0.9861603379249573, + "learning_rate": 2.9019607843137258e-05, + "loss": 0.5745, + "step": 10498 + }, + { + "epoch": 13.43872, + "grad_norm": 0.9877046942710876, + "learning_rate": 2.901760704281713e-05, + "loss": 0.584, + "step": 10499 + }, + { + "epoch": 13.44, + "grad_norm": 0.9454039931297302, + "learning_rate": 2.9015606242497002e-05, + "loss": 0.5021, + "step": 10500 + }, + { + "epoch": 13.44128, + "grad_norm": 0.9773223996162415, + "learning_rate": 2.9013605442176874e-05, + "loss": 0.5659, + "step": 10501 + }, + { + "epoch": 13.44256, + "grad_norm": 0.9586243033409119, + "learning_rate": 2.9011604641856742e-05, + "loss": 0.5496, + "step": 10502 + }, + { + "epoch": 13.44384, + "grad_norm": 0.9668572545051575, + "learning_rate": 2.9009603841536614e-05, + "loss": 0.5088, + 
"step": 10503 + }, + { + "epoch": 13.44512, + "grad_norm": 0.9630528092384338, + "learning_rate": 2.9007603041216486e-05, + "loss": 0.5007, + "step": 10504 + }, + { + "epoch": 13.4464, + "grad_norm": 0.984404444694519, + "learning_rate": 2.900560224089636e-05, + "loss": 0.5157, + "step": 10505 + }, + { + "epoch": 13.44768, + "grad_norm": 0.9502925276756287, + "learning_rate": 2.9003601440576233e-05, + "loss": 0.4632, + "step": 10506 + }, + { + "epoch": 13.44896, + "grad_norm": 0.9846747517585754, + "learning_rate": 2.9001600640256105e-05, + "loss": 0.5392, + "step": 10507 + }, + { + "epoch": 13.45024, + "grad_norm": 0.9732792377471924, + "learning_rate": 2.8999599839935977e-05, + "loss": 0.5066, + "step": 10508 + }, + { + "epoch": 13.45152, + "grad_norm": 0.9376108050346375, + "learning_rate": 2.899759903961585e-05, + "loss": 0.568, + "step": 10509 + }, + { + "epoch": 13.4528, + "grad_norm": 0.9324885010719299, + "learning_rate": 2.8995598239295717e-05, + "loss": 0.5635, + "step": 10510 + }, + { + "epoch": 13.45408, + "grad_norm": 0.9335364699363708, + "learning_rate": 2.899359743897559e-05, + "loss": 0.5284, + "step": 10511 + }, + { + "epoch": 13.45536, + "grad_norm": 0.9017517566680908, + "learning_rate": 2.8991596638655467e-05, + "loss": 0.4938, + "step": 10512 + }, + { + "epoch": 13.45664, + "grad_norm": 0.9306479096412659, + "learning_rate": 2.8989595838335336e-05, + "loss": 0.5521, + "step": 10513 + }, + { + "epoch": 13.45792, + "grad_norm": 0.9630993008613586, + "learning_rate": 2.8987595038015208e-05, + "loss": 0.5371, + "step": 10514 + }, + { + "epoch": 13.4592, + "grad_norm": 0.9604276418685913, + "learning_rate": 2.898559423769508e-05, + "loss": 0.5543, + "step": 10515 + }, + { + "epoch": 13.46048, + "grad_norm": 0.9642881155014038, + "learning_rate": 2.898359343737495e-05, + "loss": 0.525, + "step": 10516 + }, + { + "epoch": 13.46176, + "grad_norm": 0.9426066279411316, + "learning_rate": 2.8981592637054823e-05, + "loss": 0.5535, + "step": 10517 + }, + { 
+ "epoch": 13.46304, + "grad_norm": 0.8915838003158569, + "learning_rate": 2.8979591836734692e-05, + "loss": 0.5155, + "step": 10518 + }, + { + "epoch": 13.46432, + "grad_norm": 0.9411042332649231, + "learning_rate": 2.897759103641457e-05, + "loss": 0.5197, + "step": 10519 + }, + { + "epoch": 13.4656, + "grad_norm": 0.9480369091033936, + "learning_rate": 2.8975590236094442e-05, + "loss": 0.5425, + "step": 10520 + }, + { + "epoch": 13.46688, + "grad_norm": 0.9183311462402344, + "learning_rate": 2.897358943577431e-05, + "loss": 0.5394, + "step": 10521 + }, + { + "epoch": 13.46816, + "grad_norm": 0.8983965516090393, + "learning_rate": 2.8971588635454183e-05, + "loss": 0.5216, + "step": 10522 + }, + { + "epoch": 13.46944, + "grad_norm": 0.943598210811615, + "learning_rate": 2.8969587835134054e-05, + "loss": 0.5223, + "step": 10523 + }, + { + "epoch": 13.47072, + "grad_norm": 0.9010968804359436, + "learning_rate": 2.8967587034813926e-05, + "loss": 0.5572, + "step": 10524 + }, + { + "epoch": 13.472, + "grad_norm": 0.9356712698936462, + "learning_rate": 2.8965586234493798e-05, + "loss": 0.5327, + "step": 10525 + }, + { + "epoch": 13.47328, + "grad_norm": 0.9779828786849976, + "learning_rate": 2.8963585434173673e-05, + "loss": 0.5572, + "step": 10526 + }, + { + "epoch": 13.47456, + "grad_norm": 0.9834756851196289, + "learning_rate": 2.8961584633853545e-05, + "loss": 0.5512, + "step": 10527 + }, + { + "epoch": 13.47584, + "grad_norm": 1.0116183757781982, + "learning_rate": 2.8959583833533417e-05, + "loss": 0.5726, + "step": 10528 + }, + { + "epoch": 13.47712, + "grad_norm": 0.9891726970672607, + "learning_rate": 2.8957583033213286e-05, + "loss": 0.5587, + "step": 10529 + }, + { + "epoch": 13.4784, + "grad_norm": 0.9156029224395752, + "learning_rate": 2.8955582232893157e-05, + "loss": 0.5175, + "step": 10530 + }, + { + "epoch": 13.47968, + "grad_norm": 0.9811277389526367, + "learning_rate": 2.895358143257303e-05, + "loss": 0.5572, + "step": 10531 + }, + { + "epoch": 
13.48096, + "grad_norm": 0.9236703515052795, + "learning_rate": 2.89515806322529e-05, + "loss": 0.5119, + "step": 10532 + }, + { + "epoch": 13.482240000000001, + "grad_norm": 0.9552645087242126, + "learning_rate": 2.8949579831932776e-05, + "loss": 0.544, + "step": 10533 + }, + { + "epoch": 13.48352, + "grad_norm": 0.9037032127380371, + "learning_rate": 2.8947579031612648e-05, + "loss": 0.5298, + "step": 10534 + }, + { + "epoch": 13.4848, + "grad_norm": 0.947192370891571, + "learning_rate": 2.894557823129252e-05, + "loss": 0.5685, + "step": 10535 + }, + { + "epoch": 13.48608, + "grad_norm": 0.9519645571708679, + "learning_rate": 2.8943577430972392e-05, + "loss": 0.5339, + "step": 10536 + }, + { + "epoch": 13.48736, + "grad_norm": 0.9448265433311462, + "learning_rate": 2.894157663065226e-05, + "loss": 0.5473, + "step": 10537 + }, + { + "epoch": 13.48864, + "grad_norm": 0.9873289465904236, + "learning_rate": 2.8939575830332132e-05, + "loss": 0.5253, + "step": 10538 + }, + { + "epoch": 13.48992, + "grad_norm": 0.9552075862884521, + "learning_rate": 2.8937575030012004e-05, + "loss": 0.5561, + "step": 10539 + }, + { + "epoch": 13.4912, + "grad_norm": 0.9306079745292664, + "learning_rate": 2.893557422969188e-05, + "loss": 0.5058, + "step": 10540 + }, + { + "epoch": 13.49248, + "grad_norm": 0.9096477627754211, + "learning_rate": 2.893357342937175e-05, + "loss": 0.4799, + "step": 10541 + }, + { + "epoch": 13.49376, + "grad_norm": 1.00719153881073, + "learning_rate": 2.8931572629051623e-05, + "loss": 0.5475, + "step": 10542 + }, + { + "epoch": 13.49504, + "grad_norm": 1.0264086723327637, + "learning_rate": 2.8929571828731495e-05, + "loss": 0.5664, + "step": 10543 + }, + { + "epoch": 13.49632, + "grad_norm": 0.9574489593505859, + "learning_rate": 2.8927571028411367e-05, + "loss": 0.5239, + "step": 10544 + }, + { + "epoch": 13.4976, + "grad_norm": 0.9818712472915649, + "learning_rate": 2.8925570228091235e-05, + "loss": 0.5373, + "step": 10545 + }, + { + "epoch": 13.49888, + 
"grad_norm": 0.9273495674133301, + "learning_rate": 2.8923569427771107e-05, + "loss": 0.5535, + "step": 10546 + }, + { + "epoch": 13.50016, + "grad_norm": 0.9357905983924866, + "learning_rate": 2.8921568627450986e-05, + "loss": 0.5536, + "step": 10547 + }, + { + "epoch": 13.50144, + "grad_norm": 0.9313623309135437, + "learning_rate": 2.8919567827130854e-05, + "loss": 0.5436, + "step": 10548 + }, + { + "epoch": 13.50272, + "grad_norm": 0.9302566051483154, + "learning_rate": 2.8917567026810726e-05, + "loss": 0.503, + "step": 10549 + }, + { + "epoch": 13.504, + "grad_norm": 0.9522665143013, + "learning_rate": 2.8915566226490598e-05, + "loss": 0.5565, + "step": 10550 + }, + { + "epoch": 13.505279999999999, + "grad_norm": 0.9212354421615601, + "learning_rate": 2.891356542617047e-05, + "loss": 0.5448, + "step": 10551 + }, + { + "epoch": 13.50656, + "grad_norm": 0.9201790690422058, + "learning_rate": 2.891156462585034e-05, + "loss": 0.509, + "step": 10552 + }, + { + "epoch": 13.50784, + "grad_norm": 0.91869056224823, + "learning_rate": 2.890956382553021e-05, + "loss": 0.5319, + "step": 10553 + }, + { + "epoch": 13.50912, + "grad_norm": 0.9275939464569092, + "learning_rate": 2.890756302521009e-05, + "loss": 0.5429, + "step": 10554 + }, + { + "epoch": 13.5104, + "grad_norm": 0.9292113780975342, + "learning_rate": 2.890556222488996e-05, + "loss": 0.5673, + "step": 10555 + }, + { + "epoch": 13.51168, + "grad_norm": 0.9870272278785706, + "learning_rate": 2.890356142456983e-05, + "loss": 0.5295, + "step": 10556 + }, + { + "epoch": 13.51296, + "grad_norm": 0.9538768529891968, + "learning_rate": 2.89015606242497e-05, + "loss": 0.5421, + "step": 10557 + }, + { + "epoch": 13.514240000000001, + "grad_norm": 0.9605304598808289, + "learning_rate": 2.8899559823929573e-05, + "loss": 0.5666, + "step": 10558 + }, + { + "epoch": 13.51552, + "grad_norm": 0.9257391691207886, + "learning_rate": 2.8897559023609445e-05, + "loss": 0.5048, + "step": 10559 + }, + { + "epoch": 13.5168, + 
"grad_norm": 0.924060046672821, + "learning_rate": 2.8895558223289317e-05, + "loss": 0.5625, + "step": 10560 + }, + { + "epoch": 13.51808, + "grad_norm": 0.9286275506019592, + "learning_rate": 2.8893557422969185e-05, + "loss": 0.4978, + "step": 10561 + }, + { + "epoch": 13.51936, + "grad_norm": 1.009763240814209, + "learning_rate": 2.8891556622649064e-05, + "loss": 0.5748, + "step": 10562 + }, + { + "epoch": 13.52064, + "grad_norm": 0.9351606369018555, + "learning_rate": 2.8889555822328935e-05, + "loss": 0.4949, + "step": 10563 + }, + { + "epoch": 13.52192, + "grad_norm": 0.9865330457687378, + "learning_rate": 2.8887555022008804e-05, + "loss": 0.5523, + "step": 10564 + }, + { + "epoch": 13.5232, + "grad_norm": 0.9141054749488831, + "learning_rate": 2.8885554221688676e-05, + "loss": 0.535, + "step": 10565 + }, + { + "epoch": 13.52448, + "grad_norm": 0.8817773461341858, + "learning_rate": 2.8883553421368548e-05, + "loss": 0.5218, + "step": 10566 + }, + { + "epoch": 13.52576, + "grad_norm": 0.918411910533905, + "learning_rate": 2.888155262104842e-05, + "loss": 0.5497, + "step": 10567 + }, + { + "epoch": 13.52704, + "grad_norm": 0.9279268383979797, + "learning_rate": 2.887955182072829e-05, + "loss": 0.5519, + "step": 10568 + }, + { + "epoch": 13.52832, + "grad_norm": 0.9723789095878601, + "learning_rate": 2.8877551020408167e-05, + "loss": 0.5554, + "step": 10569 + }, + { + "epoch": 13.5296, + "grad_norm": 0.9536760449409485, + "learning_rate": 2.887555022008804e-05, + "loss": 0.5561, + "step": 10570 + }, + { + "epoch": 13.53088, + "grad_norm": 0.9966936111450195, + "learning_rate": 2.887354941976791e-05, + "loss": 0.5453, + "step": 10571 + }, + { + "epoch": 13.53216, + "grad_norm": 0.9413495659828186, + "learning_rate": 2.887154861944778e-05, + "loss": 0.5242, + "step": 10572 + }, + { + "epoch": 13.53344, + "grad_norm": 0.9562821388244629, + "learning_rate": 2.886954781912765e-05, + "loss": 0.5672, + "step": 10573 + }, + { + "epoch": 13.53472, + "grad_norm": 
0.9556413292884827, + "learning_rate": 2.8867547018807523e-05, + "loss": 0.5895, + "step": 10574 + }, + { + "epoch": 13.536, + "grad_norm": 0.9961545467376709, + "learning_rate": 2.8865546218487394e-05, + "loss": 0.5486, + "step": 10575 + }, + { + "epoch": 13.537279999999999, + "grad_norm": 0.9472994208335876, + "learning_rate": 2.886354541816727e-05, + "loss": 0.5914, + "step": 10576 + }, + { + "epoch": 13.53856, + "grad_norm": 0.937106728553772, + "learning_rate": 2.886154461784714e-05, + "loss": 0.528, + "step": 10577 + }, + { + "epoch": 13.53984, + "grad_norm": 0.9636639952659607, + "learning_rate": 2.8859543817527013e-05, + "loss": 0.5109, + "step": 10578 + }, + { + "epoch": 13.54112, + "grad_norm": 0.9437137246131897, + "learning_rate": 2.8857543017206885e-05, + "loss": 0.5464, + "step": 10579 + }, + { + "epoch": 13.5424, + "grad_norm": 1.0153517723083496, + "learning_rate": 2.8855542216886754e-05, + "loss": 0.5756, + "step": 10580 + }, + { + "epoch": 13.54368, + "grad_norm": 0.9016063213348389, + "learning_rate": 2.8853541416566626e-05, + "loss": 0.4751, + "step": 10581 + }, + { + "epoch": 13.54496, + "grad_norm": 0.9339427947998047, + "learning_rate": 2.8851540616246497e-05, + "loss": 0.5462, + "step": 10582 + }, + { + "epoch": 13.54624, + "grad_norm": 0.9382839202880859, + "learning_rate": 2.8849539815926373e-05, + "loss": 0.5564, + "step": 10583 + }, + { + "epoch": 13.54752, + "grad_norm": 0.9853460192680359, + "learning_rate": 2.8847539015606244e-05, + "loss": 0.5785, + "step": 10584 + }, + { + "epoch": 13.5488, + "grad_norm": 0.8800140023231506, + "learning_rate": 2.8845538215286116e-05, + "loss": 0.5303, + "step": 10585 + }, + { + "epoch": 13.55008, + "grad_norm": 0.9256978631019592, + "learning_rate": 2.8843537414965988e-05, + "loss": 0.5192, + "step": 10586 + }, + { + "epoch": 13.55136, + "grad_norm": 0.9170429706573486, + "learning_rate": 2.884153661464586e-05, + "loss": 0.5438, + "step": 10587 + }, + { + "epoch": 13.55264, + "grad_norm": 
0.9927389025688171, + "learning_rate": 2.883953581432573e-05, + "loss": 0.5558, + "step": 10588 + }, + { + "epoch": 13.55392, + "grad_norm": 0.9742659330368042, + "learning_rate": 2.88375350140056e-05, + "loss": 0.5768, + "step": 10589 + }, + { + "epoch": 13.5552, + "grad_norm": 1.032018780708313, + "learning_rate": 2.883553421368548e-05, + "loss": 0.5579, + "step": 10590 + }, + { + "epoch": 13.55648, + "grad_norm": 0.972882091999054, + "learning_rate": 2.8833533413365347e-05, + "loss": 0.5939, + "step": 10591 + }, + { + "epoch": 13.55776, + "grad_norm": 0.8545750975608826, + "learning_rate": 2.883153261304522e-05, + "loss": 0.5027, + "step": 10592 + }, + { + "epoch": 13.55904, + "grad_norm": 0.896020233631134, + "learning_rate": 2.882953181272509e-05, + "loss": 0.472, + "step": 10593 + }, + { + "epoch": 13.56032, + "grad_norm": 0.8964064717292786, + "learning_rate": 2.8827531012404963e-05, + "loss": 0.4956, + "step": 10594 + }, + { + "epoch": 13.5616, + "grad_norm": 0.939621090888977, + "learning_rate": 2.8825530212084835e-05, + "loss": 0.5365, + "step": 10595 + }, + { + "epoch": 13.56288, + "grad_norm": 0.9430614709854126, + "learning_rate": 2.8823529411764703e-05, + "loss": 0.5228, + "step": 10596 + }, + { + "epoch": 13.56416, + "grad_norm": 0.91892409324646, + "learning_rate": 2.8821528611444582e-05, + "loss": 0.5294, + "step": 10597 + }, + { + "epoch": 13.56544, + "grad_norm": 0.9037036895751953, + "learning_rate": 2.8819527811124454e-05, + "loss": 0.5209, + "step": 10598 + }, + { + "epoch": 13.56672, + "grad_norm": 0.9548211097717285, + "learning_rate": 2.8817527010804322e-05, + "loss": 0.5449, + "step": 10599 + }, + { + "epoch": 13.568, + "grad_norm": 0.9372276067733765, + "learning_rate": 2.8815526210484194e-05, + "loss": 0.5343, + "step": 10600 + }, + { + "epoch": 13.56928, + "grad_norm": 0.9246383905410767, + "learning_rate": 2.8813525410164066e-05, + "loss": 0.5319, + "step": 10601 + }, + { + "epoch": 13.57056, + "grad_norm": 0.947321355342865, + 
"learning_rate": 2.8811524609843938e-05, + "loss": 0.5573, + "step": 10602 + }, + { + "epoch": 13.57184, + "grad_norm": 0.9556154012680054, + "learning_rate": 2.880952380952381e-05, + "loss": 0.5551, + "step": 10603 + }, + { + "epoch": 13.57312, + "grad_norm": 0.9768837094306946, + "learning_rate": 2.8807523009203685e-05, + "loss": 0.5631, + "step": 10604 + }, + { + "epoch": 13.5744, + "grad_norm": 0.9723296761512756, + "learning_rate": 2.8805522208883557e-05, + "loss": 0.5453, + "step": 10605 + }, + { + "epoch": 13.57568, + "grad_norm": 0.9709224104881287, + "learning_rate": 2.880352140856343e-05, + "loss": 0.5324, + "step": 10606 + }, + { + "epoch": 13.57696, + "grad_norm": 0.9383196830749512, + "learning_rate": 2.8801520608243297e-05, + "loss": 0.5561, + "step": 10607 + }, + { + "epoch": 13.57824, + "grad_norm": 0.9480939507484436, + "learning_rate": 2.879951980792317e-05, + "loss": 0.51, + "step": 10608 + }, + { + "epoch": 13.57952, + "grad_norm": 0.9381304383277893, + "learning_rate": 2.879751900760304e-05, + "loss": 0.5704, + "step": 10609 + }, + { + "epoch": 13.5808, + "grad_norm": 0.9381782412528992, + "learning_rate": 2.8795518207282913e-05, + "loss": 0.5011, + "step": 10610 + }, + { + "epoch": 13.58208, + "grad_norm": 1.024210810661316, + "learning_rate": 2.8793517406962788e-05, + "loss": 0.5855, + "step": 10611 + }, + { + "epoch": 13.58336, + "grad_norm": 0.9958494305610657, + "learning_rate": 2.879151660664266e-05, + "loss": 0.5279, + "step": 10612 + }, + { + "epoch": 13.58464, + "grad_norm": 0.9755289554595947, + "learning_rate": 2.8789515806322532e-05, + "loss": 0.5981, + "step": 10613 + }, + { + "epoch": 13.58592, + "grad_norm": 0.9563091397285461, + "learning_rate": 2.8787515006002404e-05, + "loss": 0.5707, + "step": 10614 + }, + { + "epoch": 13.5872, + "grad_norm": 0.9441842436790466, + "learning_rate": 2.8785514205682272e-05, + "loss": 0.5695, + "step": 10615 + }, + { + "epoch": 13.58848, + "grad_norm": 0.9176508784294128, + "learning_rate": 
2.8783513405362144e-05, + "loss": 0.4954, + "step": 10616 + }, + { + "epoch": 13.58976, + "grad_norm": 0.9186686277389526, + "learning_rate": 2.8781512605042016e-05, + "loss": 0.5372, + "step": 10617 + }, + { + "epoch": 13.59104, + "grad_norm": 0.9438495635986328, + "learning_rate": 2.877951180472189e-05, + "loss": 0.5552, + "step": 10618 + }, + { + "epoch": 13.59232, + "grad_norm": 0.9654157161712646, + "learning_rate": 2.8777511004401763e-05, + "loss": 0.5388, + "step": 10619 + }, + { + "epoch": 13.5936, + "grad_norm": 0.9715597629547119, + "learning_rate": 2.8775510204081635e-05, + "loss": 0.5435, + "step": 10620 + }, + { + "epoch": 13.59488, + "grad_norm": 0.9751953482627869, + "learning_rate": 2.8773509403761507e-05, + "loss": 0.5689, + "step": 10621 + }, + { + "epoch": 13.59616, + "grad_norm": 0.9504429697990417, + "learning_rate": 2.877150860344138e-05, + "loss": 0.5404, + "step": 10622 + }, + { + "epoch": 13.59744, + "grad_norm": 0.9462687373161316, + "learning_rate": 2.8769507803121247e-05, + "loss": 0.527, + "step": 10623 + }, + { + "epoch": 13.59872, + "grad_norm": 0.9122000336647034, + "learning_rate": 2.876750700280112e-05, + "loss": 0.5466, + "step": 10624 + }, + { + "epoch": 13.6, + "grad_norm": 0.9636383056640625, + "learning_rate": 2.8765506202480997e-05, + "loss": 0.5357, + "step": 10625 + }, + { + "epoch": 13.60128, + "grad_norm": 0.9176141023635864, + "learning_rate": 2.8763505402160866e-05, + "loss": 0.5333, + "step": 10626 + }, + { + "epoch": 13.60256, + "grad_norm": 0.935096025466919, + "learning_rate": 2.8761504601840738e-05, + "loss": 0.5373, + "step": 10627 + }, + { + "epoch": 13.60384, + "grad_norm": 0.9804729223251343, + "learning_rate": 2.875950380152061e-05, + "loss": 0.5202, + "step": 10628 + }, + { + "epoch": 13.60512, + "grad_norm": 0.8921434879302979, + "learning_rate": 2.875750300120048e-05, + "loss": 0.5015, + "step": 10629 + }, + { + "epoch": 13.6064, + "grad_norm": 0.9438501000404358, + "learning_rate": 2.8755502200880353e-05, 
+ "loss": 0.5353, + "step": 10630 + }, + { + "epoch": 13.60768, + "grad_norm": 1.0135152339935303, + "learning_rate": 2.8753501400560222e-05, + "loss": 0.5581, + "step": 10631 + }, + { + "epoch": 13.60896, + "grad_norm": 0.9754303693771362, + "learning_rate": 2.87515006002401e-05, + "loss": 0.5702, + "step": 10632 + }, + { + "epoch": 13.61024, + "grad_norm": 0.9811170101165771, + "learning_rate": 2.8749499799919972e-05, + "loss": 0.5316, + "step": 10633 + }, + { + "epoch": 13.61152, + "grad_norm": 0.9455510377883911, + "learning_rate": 2.874749899959984e-05, + "loss": 0.5505, + "step": 10634 + }, + { + "epoch": 13.6128, + "grad_norm": 0.9648592472076416, + "learning_rate": 2.8745498199279713e-05, + "loss": 0.5324, + "step": 10635 + }, + { + "epoch": 13.61408, + "grad_norm": 0.9389111995697021, + "learning_rate": 2.8743497398959584e-05, + "loss": 0.5199, + "step": 10636 + }, + { + "epoch": 13.61536, + "grad_norm": 0.8730208873748779, + "learning_rate": 2.8741496598639456e-05, + "loss": 0.4906, + "step": 10637 + }, + { + "epoch": 13.61664, + "grad_norm": 0.903998613357544, + "learning_rate": 2.8739495798319328e-05, + "loss": 0.5274, + "step": 10638 + }, + { + "epoch": 13.61792, + "grad_norm": 0.9008133411407471, + "learning_rate": 2.8737494997999203e-05, + "loss": 0.4929, + "step": 10639 + }, + { + "epoch": 13.6192, + "grad_norm": 0.9376186728477478, + "learning_rate": 2.8735494197679075e-05, + "loss": 0.5746, + "step": 10640 + }, + { + "epoch": 13.62048, + "grad_norm": 0.8832375407218933, + "learning_rate": 2.8733493397358947e-05, + "loss": 0.4694, + "step": 10641 + }, + { + "epoch": 13.62176, + "grad_norm": 1.0239812135696411, + "learning_rate": 2.8731492597038816e-05, + "loss": 0.5459, + "step": 10642 + }, + { + "epoch": 13.62304, + "grad_norm": 1.0099740028381348, + "learning_rate": 2.8729491796718687e-05, + "loss": 0.5721, + "step": 10643 + }, + { + "epoch": 13.62432, + "grad_norm": 0.9773842096328735, + "learning_rate": 2.872749099639856e-05, + "loss": 0.5401, 
+ "step": 10644 + }, + { + "epoch": 13.6256, + "grad_norm": 0.9006150364875793, + "learning_rate": 2.872549019607843e-05, + "loss": 0.5157, + "step": 10645 + }, + { + "epoch": 13.62688, + "grad_norm": 0.9231019616127014, + "learning_rate": 2.8723489395758306e-05, + "loss": 0.5334, + "step": 10646 + }, + { + "epoch": 13.62816, + "grad_norm": 0.9919240474700928, + "learning_rate": 2.8721488595438178e-05, + "loss": 0.5759, + "step": 10647 + }, + { + "epoch": 13.62944, + "grad_norm": 0.9904776215553284, + "learning_rate": 2.871948779511805e-05, + "loss": 0.5377, + "step": 10648 + }, + { + "epoch": 13.63072, + "grad_norm": 0.9417052268981934, + "learning_rate": 2.8717486994797922e-05, + "loss": 0.5053, + "step": 10649 + }, + { + "epoch": 13.632, + "grad_norm": 0.9234094023704529, + "learning_rate": 2.871548619447779e-05, + "loss": 0.5466, + "step": 10650 + }, + { + "epoch": 13.63328, + "grad_norm": 0.9561535716056824, + "learning_rate": 2.8713485394157662e-05, + "loss": 0.5591, + "step": 10651 + }, + { + "epoch": 13.63456, + "grad_norm": 0.9605762362480164, + "learning_rate": 2.8711484593837534e-05, + "loss": 0.5438, + "step": 10652 + }, + { + "epoch": 13.63584, + "grad_norm": 1.0227789878845215, + "learning_rate": 2.870948379351741e-05, + "loss": 0.5768, + "step": 10653 + }, + { + "epoch": 13.63712, + "grad_norm": 0.9260333776473999, + "learning_rate": 2.870748299319728e-05, + "loss": 0.4965, + "step": 10654 + }, + { + "epoch": 13.6384, + "grad_norm": 1.0059727430343628, + "learning_rate": 2.8705482192877153e-05, + "loss": 0.5619, + "step": 10655 + }, + { + "epoch": 13.63968, + "grad_norm": 0.9554175138473511, + "learning_rate": 2.8703481392557025e-05, + "loss": 0.5721, + "step": 10656 + }, + { + "epoch": 13.64096, + "grad_norm": 0.9822500944137573, + "learning_rate": 2.8701480592236897e-05, + "loss": 0.5116, + "step": 10657 + }, + { + "epoch": 13.64224, + "grad_norm": 0.9823669195175171, + "learning_rate": 2.8699479791916765e-05, + "loss": 0.5623, + "step": 10658 + }, 
+ { + "epoch": 13.64352, + "grad_norm": 0.962337076663971, + "learning_rate": 2.8697478991596637e-05, + "loss": 0.5153, + "step": 10659 + }, + { + "epoch": 13.6448, + "grad_norm": 0.9577946066856384, + "learning_rate": 2.8695478191276516e-05, + "loss": 0.546, + "step": 10660 + }, + { + "epoch": 13.64608, + "grad_norm": 0.9783328771591187, + "learning_rate": 2.8693477390956384e-05, + "loss": 0.5556, + "step": 10661 + }, + { + "epoch": 13.64736, + "grad_norm": 0.9406751990318298, + "learning_rate": 2.8691476590636256e-05, + "loss": 0.5234, + "step": 10662 + }, + { + "epoch": 13.64864, + "grad_norm": 0.9680593013763428, + "learning_rate": 2.8689475790316128e-05, + "loss": 0.5553, + "step": 10663 + }, + { + "epoch": 13.64992, + "grad_norm": 0.9520314335823059, + "learning_rate": 2.8687474989996e-05, + "loss": 0.5647, + "step": 10664 + }, + { + "epoch": 13.6512, + "grad_norm": 0.8977915048599243, + "learning_rate": 2.868547418967587e-05, + "loss": 0.5178, + "step": 10665 + }, + { + "epoch": 13.65248, + "grad_norm": 0.8997889757156372, + "learning_rate": 2.868347338935574e-05, + "loss": 0.5383, + "step": 10666 + }, + { + "epoch": 13.65376, + "grad_norm": 0.9149733781814575, + "learning_rate": 2.8681472589035612e-05, + "loss": 0.5046, + "step": 10667 + }, + { + "epoch": 13.65504, + "grad_norm": 1.018157720565796, + "learning_rate": 2.867947178871549e-05, + "loss": 0.6097, + "step": 10668 + }, + { + "epoch": 13.656320000000001, + "grad_norm": 0.9306768774986267, + "learning_rate": 2.867747098839536e-05, + "loss": 0.5142, + "step": 10669 + }, + { + "epoch": 13.6576, + "grad_norm": 0.9335023760795593, + "learning_rate": 2.867547018807523e-05, + "loss": 0.5604, + "step": 10670 + }, + { + "epoch": 13.65888, + "grad_norm": 0.9208192825317383, + "learning_rate": 2.8673469387755103e-05, + "loss": 0.5307, + "step": 10671 + }, + { + "epoch": 13.66016, + "grad_norm": 0.9682392477989197, + "learning_rate": 2.8671468587434975e-05, + "loss": 0.5418, + "step": 10672 + }, + { + "epoch": 
13.66144, + "grad_norm": 0.9635206460952759, + "learning_rate": 2.8669467787114846e-05, + "loss": 0.5639, + "step": 10673 + }, + { + "epoch": 13.66272, + "grad_norm": 0.9996086359024048, + "learning_rate": 2.8667466986794715e-05, + "loss": 0.5511, + "step": 10674 + }, + { + "epoch": 13.664, + "grad_norm": 0.9767386317253113, + "learning_rate": 2.8665466186474594e-05, + "loss": 0.5375, + "step": 10675 + }, + { + "epoch": 13.66528, + "grad_norm": 0.9239612221717834, + "learning_rate": 2.8663465386154465e-05, + "loss": 0.528, + "step": 10676 + }, + { + "epoch": 13.66656, + "grad_norm": 0.9513071179389954, + "learning_rate": 2.8661464585834334e-05, + "loss": 0.5234, + "step": 10677 + }, + { + "epoch": 13.66784, + "grad_norm": 1.0107206106185913, + "learning_rate": 2.8659463785514206e-05, + "loss": 0.564, + "step": 10678 + }, + { + "epoch": 13.66912, + "grad_norm": 0.9832781553268433, + "learning_rate": 2.8657462985194078e-05, + "loss": 0.5724, + "step": 10679 + }, + { + "epoch": 13.6704, + "grad_norm": 0.9765233993530273, + "learning_rate": 2.865546218487395e-05, + "loss": 0.5414, + "step": 10680 + }, + { + "epoch": 13.67168, + "grad_norm": 0.9487534761428833, + "learning_rate": 2.865346138455382e-05, + "loss": 0.4998, + "step": 10681 + }, + { + "epoch": 13.67296, + "grad_norm": 0.9580942988395691, + "learning_rate": 2.8651460584233697e-05, + "loss": 0.5587, + "step": 10682 + }, + { + "epoch": 13.67424, + "grad_norm": 0.9719009399414062, + "learning_rate": 2.864945978391357e-05, + "loss": 0.5611, + "step": 10683 + }, + { + "epoch": 13.67552, + "grad_norm": 0.9432852864265442, + "learning_rate": 2.864745898359344e-05, + "loss": 0.5628, + "step": 10684 + }, + { + "epoch": 13.6768, + "grad_norm": 0.9318643808364868, + "learning_rate": 2.864545818327331e-05, + "loss": 0.5335, + "step": 10685 + }, + { + "epoch": 13.67808, + "grad_norm": 0.9718388915061951, + "learning_rate": 2.864345738295318e-05, + "loss": 0.5531, + "step": 10686 + }, + { + "epoch": 13.679359999999999, + 
"grad_norm": 0.9416543245315552, + "learning_rate": 2.8641456582633052e-05, + "loss": 0.5295, + "step": 10687 + }, + { + "epoch": 13.68064, + "grad_norm": 0.9783507585525513, + "learning_rate": 2.8639455782312924e-05, + "loss": 0.5539, + "step": 10688 + }, + { + "epoch": 13.68192, + "grad_norm": 0.9382901787757874, + "learning_rate": 2.86374549819928e-05, + "loss": 0.5215, + "step": 10689 + }, + { + "epoch": 13.6832, + "grad_norm": 0.9459569454193115, + "learning_rate": 2.863545418167267e-05, + "loss": 0.5271, + "step": 10690 + }, + { + "epoch": 13.68448, + "grad_norm": 0.9002374410629272, + "learning_rate": 2.8633453381352543e-05, + "loss": 0.5393, + "step": 10691 + }, + { + "epoch": 13.68576, + "grad_norm": 0.9636310338973999, + "learning_rate": 2.8631452581032415e-05, + "loss": 0.5206, + "step": 10692 + }, + { + "epoch": 13.68704, + "grad_norm": 0.9752256274223328, + "learning_rate": 2.8629451780712284e-05, + "loss": 0.578, + "step": 10693 + }, + { + "epoch": 13.688320000000001, + "grad_norm": 0.9666173458099365, + "learning_rate": 2.8627450980392155e-05, + "loss": 0.5931, + "step": 10694 + }, + { + "epoch": 13.6896, + "grad_norm": 0.9382612109184265, + "learning_rate": 2.8625450180072027e-05, + "loss": 0.5502, + "step": 10695 + }, + { + "epoch": 13.69088, + "grad_norm": 0.9369370341300964, + "learning_rate": 2.8623449379751906e-05, + "loss": 0.5421, + "step": 10696 + }, + { + "epoch": 13.69216, + "grad_norm": 0.8949071764945984, + "learning_rate": 2.8621448579431774e-05, + "loss": 0.5083, + "step": 10697 + }, + { + "epoch": 13.69344, + "grad_norm": 0.9892618656158447, + "learning_rate": 2.8619447779111646e-05, + "loss": 0.5519, + "step": 10698 + }, + { + "epoch": 13.69472, + "grad_norm": 0.8985315561294556, + "learning_rate": 2.8617446978791518e-05, + "loss": 0.4751, + "step": 10699 + }, + { + "epoch": 13.696, + "grad_norm": 0.9138407707214355, + "learning_rate": 2.861544617847139e-05, + "loss": 0.5034, + "step": 10700 + }, + { + "epoch": 13.69728, + 
"grad_norm": 0.8925737142562866, + "learning_rate": 2.861344537815126e-05, + "loss": 0.5013, + "step": 10701 + }, + { + "epoch": 13.69856, + "grad_norm": 0.9540265798568726, + "learning_rate": 2.861144457783113e-05, + "loss": 0.5579, + "step": 10702 + }, + { + "epoch": 13.69984, + "grad_norm": 0.9697530269622803, + "learning_rate": 2.860944377751101e-05, + "loss": 0.527, + "step": 10703 + }, + { + "epoch": 13.70112, + "grad_norm": 0.999180257320404, + "learning_rate": 2.860744297719088e-05, + "loss": 0.5523, + "step": 10704 + }, + { + "epoch": 13.7024, + "grad_norm": 0.9945990443229675, + "learning_rate": 2.860544217687075e-05, + "loss": 0.5717, + "step": 10705 + }, + { + "epoch": 13.70368, + "grad_norm": 0.9230750203132629, + "learning_rate": 2.860344137655062e-05, + "loss": 0.5279, + "step": 10706 + }, + { + "epoch": 13.70496, + "grad_norm": 0.9194453954696655, + "learning_rate": 2.8601440576230493e-05, + "loss": 0.5262, + "step": 10707 + }, + { + "epoch": 13.70624, + "grad_norm": 0.9214555621147156, + "learning_rate": 2.8599439775910365e-05, + "loss": 0.5166, + "step": 10708 + }, + { + "epoch": 13.70752, + "grad_norm": 1.0233759880065918, + "learning_rate": 2.8597438975590233e-05, + "loss": 0.6114, + "step": 10709 + }, + { + "epoch": 13.7088, + "grad_norm": 0.8889565467834473, + "learning_rate": 2.8595438175270112e-05, + "loss": 0.4844, + "step": 10710 + }, + { + "epoch": 13.71008, + "grad_norm": 0.9614165425300598, + "learning_rate": 2.8593437374949984e-05, + "loss": 0.5454, + "step": 10711 + }, + { + "epoch": 13.711359999999999, + "grad_norm": 0.9475215673446655, + "learning_rate": 2.8591436574629856e-05, + "loss": 0.4936, + "step": 10712 + }, + { + "epoch": 13.71264, + "grad_norm": 1.000278353691101, + "learning_rate": 2.8589435774309724e-05, + "loss": 0.5549, + "step": 10713 + }, + { + "epoch": 13.71392, + "grad_norm": 0.918251097202301, + "learning_rate": 2.8587434973989596e-05, + "loss": 0.5023, + "step": 10714 + }, + { + "epoch": 13.7152, + "grad_norm": 
0.9441922903060913, + "learning_rate": 2.8585434173669468e-05, + "loss": 0.5331, + "step": 10715 + }, + { + "epoch": 13.71648, + "grad_norm": 0.9622296690940857, + "learning_rate": 2.858343337334934e-05, + "loss": 0.5316, + "step": 10716 + }, + { + "epoch": 13.71776, + "grad_norm": 0.9810208678245544, + "learning_rate": 2.8581432573029215e-05, + "loss": 0.5379, + "step": 10717 + }, + { + "epoch": 13.71904, + "grad_norm": 0.9779439568519592, + "learning_rate": 2.8579431772709087e-05, + "loss": 0.5698, + "step": 10718 + }, + { + "epoch": 13.72032, + "grad_norm": 0.9007058143615723, + "learning_rate": 2.857743097238896e-05, + "loss": 0.484, + "step": 10719 + }, + { + "epoch": 13.7216, + "grad_norm": 0.964832067489624, + "learning_rate": 2.857543017206883e-05, + "loss": 0.5773, + "step": 10720 + }, + { + "epoch": 13.72288, + "grad_norm": 0.9448238015174866, + "learning_rate": 2.85734293717487e-05, + "loss": 0.5122, + "step": 10721 + }, + { + "epoch": 13.72416, + "grad_norm": 0.9428820610046387, + "learning_rate": 2.857142857142857e-05, + "loss": 0.5472, + "step": 10722 + }, + { + "epoch": 13.72544, + "grad_norm": 0.9774311184883118, + "learning_rate": 2.8569427771108443e-05, + "loss": 0.5224, + "step": 10723 + }, + { + "epoch": 13.72672, + "grad_norm": 0.9833608269691467, + "learning_rate": 2.8567426970788318e-05, + "loss": 0.5713, + "step": 10724 + }, + { + "epoch": 13.728, + "grad_norm": 0.906108558177948, + "learning_rate": 2.856542617046819e-05, + "loss": 0.5289, + "step": 10725 + }, + { + "epoch": 13.72928, + "grad_norm": 0.9727580547332764, + "learning_rate": 2.856342537014806e-05, + "loss": 0.5354, + "step": 10726 + }, + { + "epoch": 13.73056, + "grad_norm": 0.9792339205741882, + "learning_rate": 2.8561424569827934e-05, + "loss": 0.5648, + "step": 10727 + }, + { + "epoch": 13.73184, + "grad_norm": 0.962765634059906, + "learning_rate": 2.8559423769507805e-05, + "loss": 0.5599, + "step": 10728 + }, + { + "epoch": 13.73312, + "grad_norm": 0.9317491054534912, + 
"learning_rate": 2.8557422969187674e-05, + "loss": 0.5203, + "step": 10729 + }, + { + "epoch": 13.7344, + "grad_norm": 0.9817516207695007, + "learning_rate": 2.8555422168867546e-05, + "loss": 0.5766, + "step": 10730 + }, + { + "epoch": 13.73568, + "grad_norm": 0.9059588313102722, + "learning_rate": 2.8553421368547424e-05, + "loss": 0.5134, + "step": 10731 + }, + { + "epoch": 13.73696, + "grad_norm": 0.9380925893783569, + "learning_rate": 2.8551420568227293e-05, + "loss": 0.5188, + "step": 10732 + }, + { + "epoch": 13.73824, + "grad_norm": 0.919386088848114, + "learning_rate": 2.8549419767907165e-05, + "loss": 0.5315, + "step": 10733 + }, + { + "epoch": 13.73952, + "grad_norm": 0.991089403629303, + "learning_rate": 2.8547418967587037e-05, + "loss": 0.6093, + "step": 10734 + }, + { + "epoch": 13.7408, + "grad_norm": 0.9908062219619751, + "learning_rate": 2.854541816726691e-05, + "loss": 0.5627, + "step": 10735 + }, + { + "epoch": 13.74208, + "grad_norm": 0.9567424654960632, + "learning_rate": 2.854341736694678e-05, + "loss": 0.5442, + "step": 10736 + }, + { + "epoch": 13.74336, + "grad_norm": 0.9408556818962097, + "learning_rate": 2.854141656662665e-05, + "loss": 0.5286, + "step": 10737 + }, + { + "epoch": 13.74464, + "grad_norm": 0.9387040734291077, + "learning_rate": 2.8539415766306527e-05, + "loss": 0.548, + "step": 10738 + }, + { + "epoch": 13.74592, + "grad_norm": 0.921125054359436, + "learning_rate": 2.85374149659864e-05, + "loss": 0.5613, + "step": 10739 + }, + { + "epoch": 13.7472, + "grad_norm": 0.9063670635223389, + "learning_rate": 2.8535414165666268e-05, + "loss": 0.5457, + "step": 10740 + }, + { + "epoch": 13.74848, + "grad_norm": 1.002338171005249, + "learning_rate": 2.853341336534614e-05, + "loss": 0.5935, + "step": 10741 + }, + { + "epoch": 13.74976, + "grad_norm": 0.8982512950897217, + "learning_rate": 2.853141256502601e-05, + "loss": 0.5346, + "step": 10742 + }, + { + "epoch": 13.75104, + "grad_norm": 0.8786603808403015, + "learning_rate": 
2.8529411764705883e-05, + "loss": 0.5381, + "step": 10743 + }, + { + "epoch": 13.75232, + "grad_norm": 0.9155345559120178, + "learning_rate": 2.8527410964385755e-05, + "loss": 0.5217, + "step": 10744 + }, + { + "epoch": 13.7536, + "grad_norm": 0.9769365787506104, + "learning_rate": 2.852541016406563e-05, + "loss": 0.5838, + "step": 10745 + }, + { + "epoch": 13.75488, + "grad_norm": 0.9437276124954224, + "learning_rate": 2.8523409363745502e-05, + "loss": 0.5727, + "step": 10746 + }, + { + "epoch": 13.75616, + "grad_norm": 0.9436963200569153, + "learning_rate": 2.8521408563425374e-05, + "loss": 0.5224, + "step": 10747 + }, + { + "epoch": 13.75744, + "grad_norm": 0.9029295444488525, + "learning_rate": 2.8519407763105243e-05, + "loss": 0.5258, + "step": 10748 + }, + { + "epoch": 13.75872, + "grad_norm": 0.9773640036582947, + "learning_rate": 2.8517406962785114e-05, + "loss": 0.5447, + "step": 10749 + }, + { + "epoch": 13.76, + "grad_norm": 0.9137765169143677, + "learning_rate": 2.8515406162464986e-05, + "loss": 0.543, + "step": 10750 + }, + { + "epoch": 13.76128, + "grad_norm": 0.947670042514801, + "learning_rate": 2.8513405362144858e-05, + "loss": 0.5234, + "step": 10751 + }, + { + "epoch": 13.76256, + "grad_norm": 0.9233437776565552, + "learning_rate": 2.8511404561824733e-05, + "loss": 0.5104, + "step": 10752 + }, + { + "epoch": 13.76384, + "grad_norm": 0.9210999011993408, + "learning_rate": 2.8509403761504605e-05, + "loss": 0.5731, + "step": 10753 + }, + { + "epoch": 13.76512, + "grad_norm": 0.9368586540222168, + "learning_rate": 2.8507402961184477e-05, + "loss": 0.5448, + "step": 10754 + }, + { + "epoch": 13.7664, + "grad_norm": 0.933786153793335, + "learning_rate": 2.850540216086435e-05, + "loss": 0.5324, + "step": 10755 + }, + { + "epoch": 13.76768, + "grad_norm": 0.8887897729873657, + "learning_rate": 2.8503401360544217e-05, + "loss": 0.5083, + "step": 10756 + }, + { + "epoch": 13.76896, + "grad_norm": 0.9025307893753052, + "learning_rate": 
2.850140056022409e-05, + "loss": 0.5212, + "step": 10757 + }, + { + "epoch": 13.77024, + "grad_norm": 0.936424195766449, + "learning_rate": 2.849939975990396e-05, + "loss": 0.5719, + "step": 10758 + }, + { + "epoch": 13.77152, + "grad_norm": 0.936571478843689, + "learning_rate": 2.8497398959583836e-05, + "loss": 0.5558, + "step": 10759 + }, + { + "epoch": 13.7728, + "grad_norm": 0.9100072979927063, + "learning_rate": 2.8495398159263708e-05, + "loss": 0.5169, + "step": 10760 + }, + { + "epoch": 13.77408, + "grad_norm": 0.8884329199790955, + "learning_rate": 2.849339735894358e-05, + "loss": 0.5174, + "step": 10761 + }, + { + "epoch": 13.77536, + "grad_norm": 0.907352864742279, + "learning_rate": 2.8491396558623452e-05, + "loss": 0.5166, + "step": 10762 + }, + { + "epoch": 13.77664, + "grad_norm": 0.929191529750824, + "learning_rate": 2.8489395758303324e-05, + "loss": 0.5554, + "step": 10763 + }, + { + "epoch": 13.77792, + "grad_norm": 0.8757056593894958, + "learning_rate": 2.8487394957983192e-05, + "loss": 0.4899, + "step": 10764 + }, + { + "epoch": 13.7792, + "grad_norm": 0.9026574492454529, + "learning_rate": 2.8485394157663064e-05, + "loss": 0.4836, + "step": 10765 + }, + { + "epoch": 13.78048, + "grad_norm": 0.9860047101974487, + "learning_rate": 2.8483393357342943e-05, + "loss": 0.5633, + "step": 10766 + }, + { + "epoch": 13.78176, + "grad_norm": 0.8981946110725403, + "learning_rate": 2.848139255702281e-05, + "loss": 0.5248, + "step": 10767 + }, + { + "epoch": 13.78304, + "grad_norm": 0.8830896019935608, + "learning_rate": 2.8479391756702683e-05, + "loss": 0.5086, + "step": 10768 + }, + { + "epoch": 13.78432, + "grad_norm": 0.9160583019256592, + "learning_rate": 2.8477390956382555e-05, + "loss": 0.5561, + "step": 10769 + }, + { + "epoch": 13.7856, + "grad_norm": 0.9723164439201355, + "learning_rate": 2.8475390156062427e-05, + "loss": 0.6023, + "step": 10770 + }, + { + "epoch": 13.78688, + "grad_norm": 0.9493298530578613, + "learning_rate": 2.84733893557423e-05, 
+ "loss": 0.5237, + "step": 10771 + }, + { + "epoch": 13.78816, + "grad_norm": 0.9569862484931946, + "learning_rate": 2.8471388555422167e-05, + "loss": 0.5436, + "step": 10772 + }, + { + "epoch": 13.78944, + "grad_norm": 0.9514901638031006, + "learning_rate": 2.8469387755102046e-05, + "loss": 0.5559, + "step": 10773 + }, + { + "epoch": 13.79072, + "grad_norm": 0.914356529712677, + "learning_rate": 2.8467386954781918e-05, + "loss": 0.5284, + "step": 10774 + }, + { + "epoch": 13.792, + "grad_norm": 0.8957856297492981, + "learning_rate": 2.8465386154461786e-05, + "loss": 0.51, + "step": 10775 + }, + { + "epoch": 13.79328, + "grad_norm": 0.9669928550720215, + "learning_rate": 2.8463385354141658e-05, + "loss": 0.5284, + "step": 10776 + }, + { + "epoch": 13.79456, + "grad_norm": 0.9488115310668945, + "learning_rate": 2.846138455382153e-05, + "loss": 0.5306, + "step": 10777 + }, + { + "epoch": 13.79584, + "grad_norm": 0.9263489246368408, + "learning_rate": 2.84593837535014e-05, + "loss": 0.4877, + "step": 10778 + }, + { + "epoch": 13.79712, + "grad_norm": 0.9073372483253479, + "learning_rate": 2.8457382953181273e-05, + "loss": 0.5139, + "step": 10779 + }, + { + "epoch": 13.7984, + "grad_norm": 0.9395866990089417, + "learning_rate": 2.8455382152861142e-05, + "loss": 0.5548, + "step": 10780 + }, + { + "epoch": 13.79968, + "grad_norm": 0.9865780472755432, + "learning_rate": 2.845338135254102e-05, + "loss": 0.5339, + "step": 10781 + }, + { + "epoch": 13.80096, + "grad_norm": 1.0401817560195923, + "learning_rate": 2.8451380552220892e-05, + "loss": 0.5783, + "step": 10782 + }, + { + "epoch": 13.80224, + "grad_norm": 0.9715459942817688, + "learning_rate": 2.844937975190076e-05, + "loss": 0.5637, + "step": 10783 + }, + { + "epoch": 13.80352, + "grad_norm": 0.9215720891952515, + "learning_rate": 2.8447378951580633e-05, + "loss": 0.5476, + "step": 10784 + }, + { + "epoch": 13.8048, + "grad_norm": 0.9105191826820374, + "learning_rate": 2.8445378151260505e-05, + "loss": 0.5545, + 
"step": 10785 + }, + { + "epoch": 13.80608, + "grad_norm": 0.9285398125648499, + "learning_rate": 2.8443377350940376e-05, + "loss": 0.5758, + "step": 10786 + }, + { + "epoch": 13.80736, + "grad_norm": 0.9510117173194885, + "learning_rate": 2.844137655062025e-05, + "loss": 0.5287, + "step": 10787 + }, + { + "epoch": 13.80864, + "grad_norm": 0.9962491989135742, + "learning_rate": 2.8439375750300124e-05, + "loss": 0.6091, + "step": 10788 + }, + { + "epoch": 13.80992, + "grad_norm": 0.9605426788330078, + "learning_rate": 2.8437374949979995e-05, + "loss": 0.5504, + "step": 10789 + }, + { + "epoch": 13.8112, + "grad_norm": 0.9639581441879272, + "learning_rate": 2.8435374149659867e-05, + "loss": 0.5177, + "step": 10790 + }, + { + "epoch": 13.81248, + "grad_norm": 0.9726257920265198, + "learning_rate": 2.8433373349339736e-05, + "loss": 0.5312, + "step": 10791 + }, + { + "epoch": 13.81376, + "grad_norm": 0.9021669030189514, + "learning_rate": 2.8431372549019608e-05, + "loss": 0.4934, + "step": 10792 + }, + { + "epoch": 13.81504, + "grad_norm": 0.9048178195953369, + "learning_rate": 2.842937174869948e-05, + "loss": 0.543, + "step": 10793 + }, + { + "epoch": 13.81632, + "grad_norm": 0.9584506750106812, + "learning_rate": 2.842737094837935e-05, + "loss": 0.5404, + "step": 10794 + }, + { + "epoch": 13.8176, + "grad_norm": 0.9696805477142334, + "learning_rate": 2.8425370148059227e-05, + "loss": 0.5502, + "step": 10795 + }, + { + "epoch": 13.81888, + "grad_norm": 0.9749921560287476, + "learning_rate": 2.84233693477391e-05, + "loss": 0.5678, + "step": 10796 + }, + { + "epoch": 13.82016, + "grad_norm": 0.959991991519928, + "learning_rate": 2.842136854741897e-05, + "loss": 0.4897, + "step": 10797 + }, + { + "epoch": 13.821439999999999, + "grad_norm": 0.9816640019416809, + "learning_rate": 2.8419367747098842e-05, + "loss": 0.5713, + "step": 10798 + }, + { + "epoch": 13.82272, + "grad_norm": 0.92801833152771, + "learning_rate": 2.841736694677871e-05, + "loss": 0.4937, + "step": 10799 
+ }, + { + "epoch": 13.824, + "grad_norm": 0.9041526317596436, + "learning_rate": 2.8415366146458582e-05, + "loss": 0.4989, + "step": 10800 + }, + { + "epoch": 13.82528, + "grad_norm": 0.9021725654602051, + "learning_rate": 2.8413365346138454e-05, + "loss": 0.5101, + "step": 10801 + }, + { + "epoch": 13.82656, + "grad_norm": 0.9045724272727966, + "learning_rate": 2.841136454581833e-05, + "loss": 0.5168, + "step": 10802 + }, + { + "epoch": 13.82784, + "grad_norm": 0.9894302487373352, + "learning_rate": 2.84093637454982e-05, + "loss": 0.5861, + "step": 10803 + }, + { + "epoch": 13.82912, + "grad_norm": 0.9526990652084351, + "learning_rate": 2.8407362945178073e-05, + "loss": 0.5543, + "step": 10804 + }, + { + "epoch": 13.830400000000001, + "grad_norm": 0.9203627109527588, + "learning_rate": 2.8405362144857945e-05, + "loss": 0.5118, + "step": 10805 + }, + { + "epoch": 13.83168, + "grad_norm": 0.9406778812408447, + "learning_rate": 2.8403361344537817e-05, + "loss": 0.5656, + "step": 10806 + }, + { + "epoch": 13.83296, + "grad_norm": 0.959984540939331, + "learning_rate": 2.8401360544217685e-05, + "loss": 0.5362, + "step": 10807 + }, + { + "epoch": 13.83424, + "grad_norm": 0.9703862071037292, + "learning_rate": 2.8399359743897557e-05, + "loss": 0.5502, + "step": 10808 + }, + { + "epoch": 13.83552, + "grad_norm": 0.9299468994140625, + "learning_rate": 2.8397358943577436e-05, + "loss": 0.5166, + "step": 10809 + }, + { + "epoch": 13.8368, + "grad_norm": 0.9478683471679688, + "learning_rate": 2.8395358143257304e-05, + "loss": 0.5688, + "step": 10810 + }, + { + "epoch": 13.83808, + "grad_norm": 0.9263501763343811, + "learning_rate": 2.8393357342937176e-05, + "loss": 0.5294, + "step": 10811 + }, + { + "epoch": 13.83936, + "grad_norm": 0.9850642085075378, + "learning_rate": 2.8391356542617048e-05, + "loss": 0.5819, + "step": 10812 + }, + { + "epoch": 13.84064, + "grad_norm": 0.9924013614654541, + "learning_rate": 2.838935574229692e-05, + "loss": 0.5516, + "step": 10813 + }, + { 
+ "epoch": 13.84192, + "grad_norm": 0.9259412288665771, + "learning_rate": 2.8387354941976792e-05, + "loss": 0.5034, + "step": 10814 + }, + { + "epoch": 13.8432, + "grad_norm": 0.9189561605453491, + "learning_rate": 2.838535414165666e-05, + "loss": 0.5612, + "step": 10815 + }, + { + "epoch": 13.84448, + "grad_norm": 0.9896423816680908, + "learning_rate": 2.838335334133654e-05, + "loss": 0.5521, + "step": 10816 + }, + { + "epoch": 13.84576, + "grad_norm": 0.9858946204185486, + "learning_rate": 2.838135254101641e-05, + "loss": 0.5713, + "step": 10817 + }, + { + "epoch": 13.84704, + "grad_norm": 0.9236769080162048, + "learning_rate": 2.837935174069628e-05, + "loss": 0.5443, + "step": 10818 + }, + { + "epoch": 13.84832, + "grad_norm": 0.9041772484779358, + "learning_rate": 2.837735094037615e-05, + "loss": 0.5474, + "step": 10819 + }, + { + "epoch": 13.8496, + "grad_norm": 0.9847750663757324, + "learning_rate": 2.8375350140056023e-05, + "loss": 0.5703, + "step": 10820 + }, + { + "epoch": 13.85088, + "grad_norm": 0.9936345815658569, + "learning_rate": 2.8373349339735895e-05, + "loss": 0.5808, + "step": 10821 + }, + { + "epoch": 13.85216, + "grad_norm": 0.9457690715789795, + "learning_rate": 2.8371348539415767e-05, + "loss": 0.542, + "step": 10822 + }, + { + "epoch": 13.853439999999999, + "grad_norm": 0.9375315308570862, + "learning_rate": 2.8369347739095642e-05, + "loss": 0.5443, + "step": 10823 + }, + { + "epoch": 13.85472, + "grad_norm": 0.9101528525352478, + "learning_rate": 2.8367346938775514e-05, + "loss": 0.5034, + "step": 10824 + }, + { + "epoch": 13.856, + "grad_norm": 0.938879668712616, + "learning_rate": 2.8365346138455386e-05, + "loss": 0.5565, + "step": 10825 + }, + { + "epoch": 13.85728, + "grad_norm": 0.8987931609153748, + "learning_rate": 2.8363345338135254e-05, + "loss": 0.5148, + "step": 10826 + }, + { + "epoch": 13.85856, + "grad_norm": 0.9683275818824768, + "learning_rate": 2.8361344537815126e-05, + "loss": 0.5337, + "step": 10827 + }, + { + "epoch": 
13.85984, + "grad_norm": 0.9831743240356445, + "learning_rate": 2.8359343737494998e-05, + "loss": 0.5295, + "step": 10828 + }, + { + "epoch": 13.86112, + "grad_norm": 0.9588016867637634, + "learning_rate": 2.835734293717487e-05, + "loss": 0.5394, + "step": 10829 + }, + { + "epoch": 13.862400000000001, + "grad_norm": 0.9667535424232483, + "learning_rate": 2.8355342136854745e-05, + "loss": 0.5661, + "step": 10830 + }, + { + "epoch": 13.86368, + "grad_norm": 0.9933251738548279, + "learning_rate": 2.8353341336534617e-05, + "loss": 0.5859, + "step": 10831 + }, + { + "epoch": 13.86496, + "grad_norm": 1.0498590469360352, + "learning_rate": 2.835134053621449e-05, + "loss": 0.6109, + "step": 10832 + }, + { + "epoch": 13.86624, + "grad_norm": 0.929767370223999, + "learning_rate": 2.834933973589436e-05, + "loss": 0.5512, + "step": 10833 + }, + { + "epoch": 13.86752, + "grad_norm": 0.9516801238059998, + "learning_rate": 2.834733893557423e-05, + "loss": 0.5363, + "step": 10834 + }, + { + "epoch": 13.8688, + "grad_norm": 0.9916854500770569, + "learning_rate": 2.83453381352541e-05, + "loss": 0.5854, + "step": 10835 + }, + { + "epoch": 13.87008, + "grad_norm": 0.9755607843399048, + "learning_rate": 2.8343337334933973e-05, + "loss": 0.5362, + "step": 10836 + }, + { + "epoch": 13.87136, + "grad_norm": 0.9905216097831726, + "learning_rate": 2.8341336534613848e-05, + "loss": 0.5287, + "step": 10837 + }, + { + "epoch": 13.87264, + "grad_norm": 0.9148777723312378, + "learning_rate": 2.833933573429372e-05, + "loss": 0.5432, + "step": 10838 + }, + { + "epoch": 13.87392, + "grad_norm": 0.978702962398529, + "learning_rate": 2.833733493397359e-05, + "loss": 0.5745, + "step": 10839 + }, + { + "epoch": 13.8752, + "grad_norm": 0.9320308566093445, + "learning_rate": 2.8335334133653464e-05, + "loss": 0.5186, + "step": 10840 + }, + { + "epoch": 13.87648, + "grad_norm": 0.9999722838401794, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.5527, + "step": 10841 + }, + { + "epoch": 13.87776, + 
"grad_norm": 0.9607200026512146, + "learning_rate": 2.8331332533013204e-05, + "loss": 0.5737, + "step": 10842 + }, + { + "epoch": 13.87904, + "grad_norm": 0.9080913662910461, + "learning_rate": 2.8329331732693076e-05, + "loss": 0.4989, + "step": 10843 + }, + { + "epoch": 13.88032, + "grad_norm": 0.9405791759490967, + "learning_rate": 2.8327330932372954e-05, + "loss": 0.5243, + "step": 10844 + }, + { + "epoch": 13.8816, + "grad_norm": 0.9514108896255493, + "learning_rate": 2.8325330132052823e-05, + "loss": 0.5607, + "step": 10845 + }, + { + "epoch": 13.88288, + "grad_norm": 0.9832378029823303, + "learning_rate": 2.8323329331732695e-05, + "loss": 0.5726, + "step": 10846 + }, + { + "epoch": 13.88416, + "grad_norm": 0.9148914813995361, + "learning_rate": 2.8321328531412567e-05, + "loss": 0.4917, + "step": 10847 + }, + { + "epoch": 13.88544, + "grad_norm": 0.9236845970153809, + "learning_rate": 2.831932773109244e-05, + "loss": 0.5422, + "step": 10848 + }, + { + "epoch": 13.88672, + "grad_norm": 0.8827064037322998, + "learning_rate": 2.831732693077231e-05, + "loss": 0.563, + "step": 10849 + }, + { + "epoch": 13.888, + "grad_norm": 0.9374257922172546, + "learning_rate": 2.831532613045218e-05, + "loss": 0.5358, + "step": 10850 + }, + { + "epoch": 13.88928, + "grad_norm": 1.0206034183502197, + "learning_rate": 2.8313325330132057e-05, + "loss": 0.5629, + "step": 10851 + }, + { + "epoch": 13.89056, + "grad_norm": 0.9112451672554016, + "learning_rate": 2.831132452981193e-05, + "loss": 0.4899, + "step": 10852 + }, + { + "epoch": 13.89184, + "grad_norm": 1.0080876350402832, + "learning_rate": 2.8309323729491798e-05, + "loss": 0.5934, + "step": 10853 + }, + { + "epoch": 13.89312, + "grad_norm": 0.9685933589935303, + "learning_rate": 2.830732292917167e-05, + "loss": 0.5597, + "step": 10854 + }, + { + "epoch": 13.8944, + "grad_norm": 0.9162622094154358, + "learning_rate": 2.830532212885154e-05, + "loss": 0.5136, + "step": 10855 + }, + { + "epoch": 13.89568, + "grad_norm": 
0.9283410310745239, + "learning_rate": 2.8303321328531413e-05, + "loss": 0.5515, + "step": 10856 + }, + { + "epoch": 13.89696, + "grad_norm": 0.9511690139770508, + "learning_rate": 2.8301320528211285e-05, + "loss": 0.5062, + "step": 10857 + }, + { + "epoch": 13.89824, + "grad_norm": 0.9601842761039734, + "learning_rate": 2.829931972789116e-05, + "loss": 0.5394, + "step": 10858 + }, + { + "epoch": 13.89952, + "grad_norm": 0.946259617805481, + "learning_rate": 2.8297318927571032e-05, + "loss": 0.5835, + "step": 10859 + }, + { + "epoch": 13.9008, + "grad_norm": 0.9118945598602295, + "learning_rate": 2.8295318127250904e-05, + "loss": 0.5668, + "step": 10860 + }, + { + "epoch": 13.90208, + "grad_norm": 0.982496976852417, + "learning_rate": 2.8293317326930773e-05, + "loss": 0.5393, + "step": 10861 + }, + { + "epoch": 13.90336, + "grad_norm": 0.9529083371162415, + "learning_rate": 2.8291316526610644e-05, + "loss": 0.5617, + "step": 10862 + }, + { + "epoch": 13.90464, + "grad_norm": 0.9181329011917114, + "learning_rate": 2.8289315726290516e-05, + "loss": 0.5187, + "step": 10863 + }, + { + "epoch": 13.90592, + "grad_norm": 0.9448521137237549, + "learning_rate": 2.8287314925970388e-05, + "loss": 0.5783, + "step": 10864 + }, + { + "epoch": 13.9072, + "grad_norm": 0.9216022491455078, + "learning_rate": 2.8285314125650263e-05, + "loss": 0.5452, + "step": 10865 + }, + { + "epoch": 13.90848, + "grad_norm": 0.9557605981826782, + "learning_rate": 2.8283313325330135e-05, + "loss": 0.5593, + "step": 10866 + }, + { + "epoch": 13.90976, + "grad_norm": 0.9449323415756226, + "learning_rate": 2.8281312525010007e-05, + "loss": 0.5575, + "step": 10867 + }, + { + "epoch": 13.91104, + "grad_norm": 0.945347249507904, + "learning_rate": 2.827931172468988e-05, + "loss": 0.5354, + "step": 10868 + }, + { + "epoch": 13.91232, + "grad_norm": 0.9745570421218872, + "learning_rate": 2.8277310924369747e-05, + "loss": 0.5818, + "step": 10869 + }, + { + "epoch": 13.9136, + "grad_norm": 0.9462365508079529, 
+ "learning_rate": 2.827531012404962e-05, + "loss": 0.5278, + "step": 10870 + }, + { + "epoch": 13.91488, + "grad_norm": 0.985939621925354, + "learning_rate": 2.827330932372949e-05, + "loss": 0.5727, + "step": 10871 + }, + { + "epoch": 13.91616, + "grad_norm": 0.8899412751197815, + "learning_rate": 2.8271308523409366e-05, + "loss": 0.5279, + "step": 10872 + }, + { + "epoch": 13.91744, + "grad_norm": 0.9762179851531982, + "learning_rate": 2.8269307723089238e-05, + "loss": 0.5406, + "step": 10873 + }, + { + "epoch": 13.91872, + "grad_norm": 0.9601300358772278, + "learning_rate": 2.826730692276911e-05, + "loss": 0.5518, + "step": 10874 + }, + { + "epoch": 13.92, + "grad_norm": 1.0480626821517944, + "learning_rate": 2.8265306122448982e-05, + "loss": 0.5649, + "step": 10875 + }, + { + "epoch": 13.92128, + "grad_norm": 0.8842355012893677, + "learning_rate": 2.8263305322128854e-05, + "loss": 0.4951, + "step": 10876 + }, + { + "epoch": 13.92256, + "grad_norm": 0.9062721729278564, + "learning_rate": 2.8261304521808722e-05, + "loss": 0.5746, + "step": 10877 + }, + { + "epoch": 13.92384, + "grad_norm": 0.9198760986328125, + "learning_rate": 2.8259303721488594e-05, + "loss": 0.5183, + "step": 10878 + }, + { + "epoch": 13.92512, + "grad_norm": 0.9441488981246948, + "learning_rate": 2.8257302921168473e-05, + "loss": 0.537, + "step": 10879 + }, + { + "epoch": 13.9264, + "grad_norm": 0.9522190093994141, + "learning_rate": 2.825530212084834e-05, + "loss": 0.514, + "step": 10880 + }, + { + "epoch": 13.92768, + "grad_norm": 0.9240538477897644, + "learning_rate": 2.8253301320528213e-05, + "loss": 0.5422, + "step": 10881 + }, + { + "epoch": 13.92896, + "grad_norm": 0.9243524670600891, + "learning_rate": 2.8251300520208085e-05, + "loss": 0.5353, + "step": 10882 + }, + { + "epoch": 13.93024, + "grad_norm": 0.9364755153656006, + "learning_rate": 2.8249299719887957e-05, + "loss": 0.5339, + "step": 10883 + }, + { + "epoch": 13.93152, + "grad_norm": 0.9147524833679199, + "learning_rate": 
2.824729891956783e-05, + "loss": 0.4999, + "step": 10884 + }, + { + "epoch": 13.9328, + "grad_norm": 0.956943690776825, + "learning_rate": 2.8245298119247697e-05, + "loss": 0.531, + "step": 10885 + }, + { + "epoch": 13.93408, + "grad_norm": 0.9214387536048889, + "learning_rate": 2.8243297318927576e-05, + "loss": 0.5403, + "step": 10886 + }, + { + "epoch": 13.93536, + "grad_norm": 0.9463467001914978, + "learning_rate": 2.8241296518607448e-05, + "loss": 0.5471, + "step": 10887 + }, + { + "epoch": 13.93664, + "grad_norm": 0.9077194929122925, + "learning_rate": 2.8239295718287316e-05, + "loss": 0.5406, + "step": 10888 + }, + { + "epoch": 13.93792, + "grad_norm": 0.943578839302063, + "learning_rate": 2.8237294917967188e-05, + "loss": 0.5271, + "step": 10889 + }, + { + "epoch": 13.9392, + "grad_norm": 0.9538133144378662, + "learning_rate": 2.823529411764706e-05, + "loss": 0.5552, + "step": 10890 + }, + { + "epoch": 13.94048, + "grad_norm": 0.9875748157501221, + "learning_rate": 2.823329331732693e-05, + "loss": 0.562, + "step": 10891 + }, + { + "epoch": 13.94176, + "grad_norm": 0.8694930076599121, + "learning_rate": 2.8231292517006803e-05, + "loss": 0.4823, + "step": 10892 + }, + { + "epoch": 13.94304, + "grad_norm": 0.9158459305763245, + "learning_rate": 2.8229291716686672e-05, + "loss": 0.5413, + "step": 10893 + }, + { + "epoch": 13.94432, + "grad_norm": 0.9385559558868408, + "learning_rate": 2.822729091636655e-05, + "loss": 0.5901, + "step": 10894 + }, + { + "epoch": 13.9456, + "grad_norm": 0.9433490633964539, + "learning_rate": 2.8225290116046422e-05, + "loss": 0.5487, + "step": 10895 + }, + { + "epoch": 13.94688, + "grad_norm": 0.9610795378684998, + "learning_rate": 2.822328931572629e-05, + "loss": 0.5375, + "step": 10896 + }, + { + "epoch": 13.94816, + "grad_norm": 0.9676675796508789, + "learning_rate": 2.8221288515406163e-05, + "loss": 0.5581, + "step": 10897 + }, + { + "epoch": 13.94944, + "grad_norm": 0.9557973742485046, + "learning_rate": 2.8219287715086035e-05, 
+ "loss": 0.5599, + "step": 10898 + }, + { + "epoch": 13.95072, + "grad_norm": 0.9611872434616089, + "learning_rate": 2.8217286914765906e-05, + "loss": 0.5688, + "step": 10899 + }, + { + "epoch": 13.952, + "grad_norm": 0.9648937582969666, + "learning_rate": 2.8215286114445778e-05, + "loss": 0.5356, + "step": 10900 + }, + { + "epoch": 13.95328, + "grad_norm": 0.9507505893707275, + "learning_rate": 2.8213285314125654e-05, + "loss": 0.524, + "step": 10901 + }, + { + "epoch": 13.95456, + "grad_norm": 0.9989834427833557, + "learning_rate": 2.8211284513805525e-05, + "loss": 0.5785, + "step": 10902 + }, + { + "epoch": 13.95584, + "grad_norm": 0.9809247851371765, + "learning_rate": 2.8209283713485397e-05, + "loss": 0.5722, + "step": 10903 + }, + { + "epoch": 13.95712, + "grad_norm": 0.9577847123146057, + "learning_rate": 2.8207282913165266e-05, + "loss": 0.5891, + "step": 10904 + }, + { + "epoch": 13.9584, + "grad_norm": 1.0070008039474487, + "learning_rate": 2.8205282112845138e-05, + "loss": 0.6114, + "step": 10905 + }, + { + "epoch": 13.95968, + "grad_norm": 0.9523963928222656, + "learning_rate": 2.820328131252501e-05, + "loss": 0.545, + "step": 10906 + }, + { + "epoch": 13.96096, + "grad_norm": 0.9290968179702759, + "learning_rate": 2.820128051220488e-05, + "loss": 0.5581, + "step": 10907 + }, + { + "epoch": 13.96224, + "grad_norm": 0.8966550827026367, + "learning_rate": 2.8199279711884757e-05, + "loss": 0.5651, + "step": 10908 + }, + { + "epoch": 13.96352, + "grad_norm": 0.9531732797622681, + "learning_rate": 2.819727891156463e-05, + "loss": 0.534, + "step": 10909 + }, + { + "epoch": 13.9648, + "grad_norm": 0.9635023474693298, + "learning_rate": 2.81952781112445e-05, + "loss": 0.5338, + "step": 10910 + }, + { + "epoch": 13.96608, + "grad_norm": 0.950612485408783, + "learning_rate": 2.8193277310924372e-05, + "loss": 0.5763, + "step": 10911 + }, + { + "epoch": 13.96736, + "grad_norm": 0.9613419771194458, + "learning_rate": 2.819127651060424e-05, + "loss": 0.6066, + 
"step": 10912 + }, + { + "epoch": 13.96864, + "grad_norm": 0.9048581123352051, + "learning_rate": 2.8189275710284112e-05, + "loss": 0.5309, + "step": 10913 + }, + { + "epoch": 13.96992, + "grad_norm": 0.931669294834137, + "learning_rate": 2.8187274909963984e-05, + "loss": 0.5653, + "step": 10914 + }, + { + "epoch": 13.9712, + "grad_norm": 0.9555082321166992, + "learning_rate": 2.818527410964386e-05, + "loss": 0.5659, + "step": 10915 + }, + { + "epoch": 13.972480000000001, + "grad_norm": 0.9597335457801819, + "learning_rate": 2.818327330932373e-05, + "loss": 0.5385, + "step": 10916 + }, + { + "epoch": 13.97376, + "grad_norm": 1.0083085298538208, + "learning_rate": 2.8181272509003603e-05, + "loss": 0.5929, + "step": 10917 + }, + { + "epoch": 13.97504, + "grad_norm": 1.0068410634994507, + "learning_rate": 2.8179271708683475e-05, + "loss": 0.5864, + "step": 10918 + }, + { + "epoch": 13.97632, + "grad_norm": 0.9447670578956604, + "learning_rate": 2.8177270908363347e-05, + "loss": 0.5239, + "step": 10919 + }, + { + "epoch": 13.9776, + "grad_norm": 0.9446542263031006, + "learning_rate": 2.8175270108043215e-05, + "loss": 0.5369, + "step": 10920 + }, + { + "epoch": 13.97888, + "grad_norm": 0.9933118224143982, + "learning_rate": 2.8173269307723087e-05, + "loss": 0.5385, + "step": 10921 + }, + { + "epoch": 13.98016, + "grad_norm": 0.9696182012557983, + "learning_rate": 2.8171268507402966e-05, + "loss": 0.5226, + "step": 10922 + }, + { + "epoch": 13.98144, + "grad_norm": 0.9453620314598083, + "learning_rate": 2.8169267707082834e-05, + "loss": 0.5276, + "step": 10923 + }, + { + "epoch": 13.98272, + "grad_norm": 0.9572956562042236, + "learning_rate": 2.8167266906762706e-05, + "loss": 0.5629, + "step": 10924 + }, + { + "epoch": 13.984, + "grad_norm": 0.9128978252410889, + "learning_rate": 2.8165266106442578e-05, + "loss": 0.5435, + "step": 10925 + }, + { + "epoch": 13.98528, + "grad_norm": 0.926874577999115, + "learning_rate": 2.816326530612245e-05, + "loss": 0.5317, + "step": 
10926 + }, + { + "epoch": 13.98656, + "grad_norm": 1.0055192708969116, + "learning_rate": 2.8161264505802322e-05, + "loss": 0.6023, + "step": 10927 + }, + { + "epoch": 13.98784, + "grad_norm": 0.9463496804237366, + "learning_rate": 2.815926370548219e-05, + "loss": 0.5565, + "step": 10928 + }, + { + "epoch": 13.98912, + "grad_norm": 0.9550392031669617, + "learning_rate": 2.815726290516207e-05, + "loss": 0.5601, + "step": 10929 + }, + { + "epoch": 13.9904, + "grad_norm": 0.9578831791877747, + "learning_rate": 2.815526210484194e-05, + "loss": 0.5703, + "step": 10930 + }, + { + "epoch": 13.99168, + "grad_norm": 0.9312989711761475, + "learning_rate": 2.815326130452181e-05, + "loss": 0.5328, + "step": 10931 + }, + { + "epoch": 13.99296, + "grad_norm": 1.000938057899475, + "learning_rate": 2.815126050420168e-05, + "loss": 0.5901, + "step": 10932 + }, + { + "epoch": 13.99424, + "grad_norm": 0.9439278244972229, + "learning_rate": 2.8149259703881553e-05, + "loss": 0.5381, + "step": 10933 + }, + { + "epoch": 13.995519999999999, + "grad_norm": 0.9305405616760254, + "learning_rate": 2.8147258903561425e-05, + "loss": 0.5736, + "step": 10934 + }, + { + "epoch": 13.9968, + "grad_norm": 0.8652977347373962, + "learning_rate": 2.8145258103241297e-05, + "loss": 0.5142, + "step": 10935 + }, + { + "epoch": 13.99808, + "grad_norm": 0.9089574217796326, + "learning_rate": 2.8143257302921172e-05, + "loss": 0.506, + "step": 10936 + }, + { + "epoch": 13.99936, + "grad_norm": 0.9164632558822632, + "learning_rate": 2.8141256502601044e-05, + "loss": 0.5394, + "step": 10937 + }, + { + "epoch": 14.00064, + "grad_norm": 1.9779025316238403, + "learning_rate": 2.8139255702280916e-05, + "loss": 0.969, + "step": 10938 + }, + { + "epoch": 14.00192, + "grad_norm": 0.9149560928344727, + "learning_rate": 2.8137254901960784e-05, + "loss": 0.5095, + "step": 10939 + }, + { + "epoch": 14.0032, + "grad_norm": 0.9277960658073425, + "learning_rate": 2.8135254101640656e-05, + "loss": 0.5741, + "step": 10940 + }, + 
{ + "epoch": 14.00448, + "grad_norm": 0.9176803827285767, + "learning_rate": 2.8133253301320528e-05, + "loss": 0.4634, + "step": 10941 + }, + { + "epoch": 14.00576, + "grad_norm": 0.926750123500824, + "learning_rate": 2.81312525010004e-05, + "loss": 0.5669, + "step": 10942 + }, + { + "epoch": 14.00704, + "grad_norm": 0.8823717832565308, + "learning_rate": 2.8129251700680275e-05, + "loss": 0.4973, + "step": 10943 + }, + { + "epoch": 14.00832, + "grad_norm": 0.9568257927894592, + "learning_rate": 2.8127250900360147e-05, + "loss": 0.5307, + "step": 10944 + }, + { + "epoch": 14.0096, + "grad_norm": 0.9589383006095886, + "learning_rate": 2.812525010004002e-05, + "loss": 0.5054, + "step": 10945 + }, + { + "epoch": 14.01088, + "grad_norm": 0.9462265968322754, + "learning_rate": 2.812324929971989e-05, + "loss": 0.5156, + "step": 10946 + }, + { + "epoch": 14.01216, + "grad_norm": 0.9722265601158142, + "learning_rate": 2.812124849939976e-05, + "loss": 0.504, + "step": 10947 + }, + { + "epoch": 14.01344, + "grad_norm": 0.9647181034088135, + "learning_rate": 2.811924769907963e-05, + "loss": 0.5248, + "step": 10948 + }, + { + "epoch": 14.01472, + "grad_norm": 0.9778644442558289, + "learning_rate": 2.8117246898759503e-05, + "loss": 0.505, + "step": 10949 + }, + { + "epoch": 14.016, + "grad_norm": 0.9715518355369568, + "learning_rate": 2.8115246098439378e-05, + "loss": 0.5486, + "step": 10950 + }, + { + "epoch": 14.01728, + "grad_norm": 0.9561265707015991, + "learning_rate": 2.811324529811925e-05, + "loss": 0.5333, + "step": 10951 + }, + { + "epoch": 14.01856, + "grad_norm": 0.9769052863121033, + "learning_rate": 2.811124449779912e-05, + "loss": 0.5561, + "step": 10952 + }, + { + "epoch": 14.01984, + "grad_norm": 0.9301590323448181, + "learning_rate": 2.8109243697478993e-05, + "loss": 0.505, + "step": 10953 + }, + { + "epoch": 14.02112, + "grad_norm": 0.9344640374183655, + "learning_rate": 2.8107242897158865e-05, + "loss": 0.5225, + "step": 10954 + }, + { + "epoch": 14.0224, + 
"grad_norm": 0.9300322532653809, + "learning_rate": 2.8105242096838734e-05, + "loss": 0.494, + "step": 10955 + }, + { + "epoch": 14.02368, + "grad_norm": 0.9003793001174927, + "learning_rate": 2.8103241296518606e-05, + "loss": 0.4933, + "step": 10956 + }, + { + "epoch": 14.02496, + "grad_norm": 0.9156431555747986, + "learning_rate": 2.8101240496198484e-05, + "loss": 0.5131, + "step": 10957 + }, + { + "epoch": 14.02624, + "grad_norm": 0.929981529712677, + "learning_rate": 2.8099239695878353e-05, + "loss": 0.5237, + "step": 10958 + }, + { + "epoch": 14.02752, + "grad_norm": 0.9370753169059753, + "learning_rate": 2.8097238895558225e-05, + "loss": 0.4931, + "step": 10959 + }, + { + "epoch": 14.0288, + "grad_norm": 0.9672894477844238, + "learning_rate": 2.8095238095238096e-05, + "loss": 0.4916, + "step": 10960 + }, + { + "epoch": 14.03008, + "grad_norm": 0.9758133888244629, + "learning_rate": 2.809323729491797e-05, + "loss": 0.5458, + "step": 10961 + }, + { + "epoch": 14.03136, + "grad_norm": 1.002359390258789, + "learning_rate": 2.809123649459784e-05, + "loss": 0.5633, + "step": 10962 + }, + { + "epoch": 14.03264, + "grad_norm": 0.9776928424835205, + "learning_rate": 2.808923569427771e-05, + "loss": 0.5331, + "step": 10963 + }, + { + "epoch": 14.03392, + "grad_norm": 0.9960092306137085, + "learning_rate": 2.8087234893957587e-05, + "loss": 0.5458, + "step": 10964 + }, + { + "epoch": 14.0352, + "grad_norm": 0.9920620918273926, + "learning_rate": 2.808523409363746e-05, + "loss": 0.5853, + "step": 10965 + }, + { + "epoch": 14.03648, + "grad_norm": 0.9721470475196838, + "learning_rate": 2.8083233293317328e-05, + "loss": 0.532, + "step": 10966 + }, + { + "epoch": 14.03776, + "grad_norm": 0.8846380710601807, + "learning_rate": 2.80812324929972e-05, + "loss": 0.4606, + "step": 10967 + }, + { + "epoch": 14.03904, + "grad_norm": 0.934118926525116, + "learning_rate": 2.807923169267707e-05, + "loss": 0.4849, + "step": 10968 + }, + { + "epoch": 14.04032, + "grad_norm": 
0.944482147693634, + "learning_rate": 2.8077230892356943e-05, + "loss": 0.521, + "step": 10969 + }, + { + "epoch": 14.0416, + "grad_norm": 0.9114336967468262, + "learning_rate": 2.8075230092036815e-05, + "loss": 0.5248, + "step": 10970 + }, + { + "epoch": 14.04288, + "grad_norm": 0.9594119787216187, + "learning_rate": 2.807322929171669e-05, + "loss": 0.5069, + "step": 10971 + }, + { + "epoch": 14.04416, + "grad_norm": 0.9807631373405457, + "learning_rate": 2.8071228491396562e-05, + "loss": 0.5334, + "step": 10972 + }, + { + "epoch": 14.04544, + "grad_norm": 0.9795427322387695, + "learning_rate": 2.8069227691076434e-05, + "loss": 0.5194, + "step": 10973 + }, + { + "epoch": 14.04672, + "grad_norm": 0.9742780923843384, + "learning_rate": 2.8067226890756302e-05, + "loss": 0.5439, + "step": 10974 + }, + { + "epoch": 14.048, + "grad_norm": 0.9774748682975769, + "learning_rate": 2.8065226090436174e-05, + "loss": 0.5262, + "step": 10975 + }, + { + "epoch": 14.04928, + "grad_norm": 1.0334399938583374, + "learning_rate": 2.8063225290116046e-05, + "loss": 0.56, + "step": 10976 + }, + { + "epoch": 14.05056, + "grad_norm": 1.013529896736145, + "learning_rate": 2.8061224489795918e-05, + "loss": 0.5665, + "step": 10977 + }, + { + "epoch": 14.05184, + "grad_norm": 0.9661319255828857, + "learning_rate": 2.8059223689475793e-05, + "loss": 0.5385, + "step": 10978 + }, + { + "epoch": 14.05312, + "grad_norm": 0.9551675915718079, + "learning_rate": 2.8057222889155665e-05, + "loss": 0.533, + "step": 10979 + }, + { + "epoch": 14.0544, + "grad_norm": 0.9196348786354065, + "learning_rate": 2.8055222088835537e-05, + "loss": 0.5153, + "step": 10980 + }, + { + "epoch": 14.05568, + "grad_norm": 0.9740005135536194, + "learning_rate": 2.805322128851541e-05, + "loss": 0.5095, + "step": 10981 + }, + { + "epoch": 14.05696, + "grad_norm": 0.9844241142272949, + "learning_rate": 2.8051220488195277e-05, + "loss": 0.5497, + "step": 10982 + }, + { + "epoch": 14.05824, + "grad_norm": 0.9591256380081177, + 
"learning_rate": 2.804921968787515e-05, + "loss": 0.5108, + "step": 10983 + }, + { + "epoch": 14.05952, + "grad_norm": 0.9638150334358215, + "learning_rate": 2.804721888755502e-05, + "loss": 0.5081, + "step": 10984 + }, + { + "epoch": 14.0608, + "grad_norm": 0.9753637909889221, + "learning_rate": 2.8045218087234896e-05, + "loss": 0.4962, + "step": 10985 + }, + { + "epoch": 14.06208, + "grad_norm": 0.9755197167396545, + "learning_rate": 2.8043217286914768e-05, + "loss": 0.5239, + "step": 10986 + }, + { + "epoch": 14.06336, + "grad_norm": 1.000517725944519, + "learning_rate": 2.804121648659464e-05, + "loss": 0.5282, + "step": 10987 + }, + { + "epoch": 14.06464, + "grad_norm": 0.9386008381843567, + "learning_rate": 2.8039215686274512e-05, + "loss": 0.5275, + "step": 10988 + }, + { + "epoch": 14.06592, + "grad_norm": 0.9570558667182922, + "learning_rate": 2.8037214885954384e-05, + "loss": 0.5064, + "step": 10989 + }, + { + "epoch": 14.0672, + "grad_norm": 0.955383837223053, + "learning_rate": 2.8035214085634252e-05, + "loss": 0.5152, + "step": 10990 + }, + { + "epoch": 14.06848, + "grad_norm": 0.9390836954116821, + "learning_rate": 2.8033213285314124e-05, + "loss": 0.4959, + "step": 10991 + }, + { + "epoch": 14.06976, + "grad_norm": 0.9478831887245178, + "learning_rate": 2.8031212484994003e-05, + "loss": 0.5058, + "step": 10992 + }, + { + "epoch": 14.07104, + "grad_norm": 0.9145018458366394, + "learning_rate": 2.802921168467387e-05, + "loss": 0.4909, + "step": 10993 + }, + { + "epoch": 14.07232, + "grad_norm": 0.9584047198295593, + "learning_rate": 2.8027210884353743e-05, + "loss": 0.5112, + "step": 10994 + }, + { + "epoch": 14.0736, + "grad_norm": 0.973054826259613, + "learning_rate": 2.8025210084033615e-05, + "loss": 0.5226, + "step": 10995 + }, + { + "epoch": 14.07488, + "grad_norm": 0.9144132733345032, + "learning_rate": 2.8023209283713487e-05, + "loss": 0.4899, + "step": 10996 + }, + { + "epoch": 14.07616, + "grad_norm": 0.9813609719276428, + "learning_rate": 
2.802120848339336e-05, + "loss": 0.5154, + "step": 10997 + }, + { + "epoch": 14.07744, + "grad_norm": 0.9886991381645203, + "learning_rate": 2.8019207683073227e-05, + "loss": 0.5467, + "step": 10998 + }, + { + "epoch": 14.07872, + "grad_norm": 0.9517562985420227, + "learning_rate": 2.8017206882753106e-05, + "loss": 0.5436, + "step": 10999 + }, + { + "epoch": 14.08, + "grad_norm": 0.9583877325057983, + "learning_rate": 2.8015206082432978e-05, + "loss": 0.4949, + "step": 11000 + }, + { + "epoch": 14.08128, + "grad_norm": 0.988237738609314, + "learning_rate": 2.8013205282112846e-05, + "loss": 0.5509, + "step": 11001 + }, + { + "epoch": 14.08256, + "grad_norm": 0.956883430480957, + "learning_rate": 2.8011204481792718e-05, + "loss": 0.5071, + "step": 11002 + }, + { + "epoch": 14.08384, + "grad_norm": 0.9071076512336731, + "learning_rate": 2.800920368147259e-05, + "loss": 0.4785, + "step": 11003 + }, + { + "epoch": 14.08512, + "grad_norm": 0.9783008098602295, + "learning_rate": 2.800720288115246e-05, + "loss": 0.5356, + "step": 11004 + }, + { + "epoch": 14.0864, + "grad_norm": 0.9609231352806091, + "learning_rate": 2.8005202080832333e-05, + "loss": 0.5093, + "step": 11005 + }, + { + "epoch": 14.08768, + "grad_norm": 0.9345542192459106, + "learning_rate": 2.8003201280512202e-05, + "loss": 0.5265, + "step": 11006 + }, + { + "epoch": 14.08896, + "grad_norm": 0.94151371717453, + "learning_rate": 2.800120048019208e-05, + "loss": 0.5561, + "step": 11007 + }, + { + "epoch": 14.09024, + "grad_norm": 0.9789606928825378, + "learning_rate": 2.7999199679871952e-05, + "loss": 0.56, + "step": 11008 + }, + { + "epoch": 14.09152, + "grad_norm": 1.0095890760421753, + "learning_rate": 2.799719887955182e-05, + "loss": 0.5189, + "step": 11009 + }, + { + "epoch": 14.0928, + "grad_norm": 0.9807612895965576, + "learning_rate": 2.7995198079231693e-05, + "loss": 0.5041, + "step": 11010 + }, + { + "epoch": 14.09408, + "grad_norm": 0.9574481844902039, + "learning_rate": 2.7993197278911565e-05, + 
"loss": 0.5085, + "step": 11011 + }, + { + "epoch": 14.09536, + "grad_norm": 0.9162589311599731, + "learning_rate": 2.7991196478591436e-05, + "loss": 0.5223, + "step": 11012 + }, + { + "epoch": 14.09664, + "grad_norm": 0.9016433358192444, + "learning_rate": 2.7989195678271308e-05, + "loss": 0.5148, + "step": 11013 + }, + { + "epoch": 14.09792, + "grad_norm": 0.9376025199890137, + "learning_rate": 2.7987194877951184e-05, + "loss": 0.5246, + "step": 11014 + }, + { + "epoch": 14.0992, + "grad_norm": 0.9752246737480164, + "learning_rate": 2.7985194077631055e-05, + "loss": 0.5176, + "step": 11015 + }, + { + "epoch": 14.10048, + "grad_norm": 0.9751932621002197, + "learning_rate": 2.7983193277310927e-05, + "loss": 0.5399, + "step": 11016 + }, + { + "epoch": 14.10176, + "grad_norm": 0.8926282525062561, + "learning_rate": 2.7981192476990796e-05, + "loss": 0.4618, + "step": 11017 + }, + { + "epoch": 14.10304, + "grad_norm": 0.9403905272483826, + "learning_rate": 2.7979191676670668e-05, + "loss": 0.5345, + "step": 11018 + }, + { + "epoch": 14.10432, + "grad_norm": 0.9546553492546082, + "learning_rate": 2.797719087635054e-05, + "loss": 0.5458, + "step": 11019 + }, + { + "epoch": 14.1056, + "grad_norm": 0.9716793298721313, + "learning_rate": 2.797519007603041e-05, + "loss": 0.5325, + "step": 11020 + }, + { + "epoch": 14.10688, + "grad_norm": 0.9732063412666321, + "learning_rate": 2.7973189275710287e-05, + "loss": 0.5074, + "step": 11021 + }, + { + "epoch": 14.10816, + "grad_norm": 0.9816451072692871, + "learning_rate": 2.797118847539016e-05, + "loss": 0.5202, + "step": 11022 + }, + { + "epoch": 14.10944, + "grad_norm": 0.9631067514419556, + "learning_rate": 2.796918767507003e-05, + "loss": 0.5571, + "step": 11023 + }, + { + "epoch": 14.11072, + "grad_norm": 0.9688462018966675, + "learning_rate": 2.7967186874749902e-05, + "loss": 0.5302, + "step": 11024 + }, + { + "epoch": 14.112, + "grad_norm": 1.0027779340744019, + "learning_rate": 2.796518607442977e-05, + "loss": 0.5303, + 
"step": 11025 + }, + { + "epoch": 14.11328, + "grad_norm": 0.9702111482620239, + "learning_rate": 2.7963185274109642e-05, + "loss": 0.5138, + "step": 11026 + }, + { + "epoch": 14.11456, + "grad_norm": 1.0045280456542969, + "learning_rate": 2.7961184473789514e-05, + "loss": 0.5441, + "step": 11027 + }, + { + "epoch": 14.11584, + "grad_norm": 0.9333431720733643, + "learning_rate": 2.7959183673469393e-05, + "loss": 0.5327, + "step": 11028 + }, + { + "epoch": 14.11712, + "grad_norm": 0.9155805110931396, + "learning_rate": 2.795718287314926e-05, + "loss": 0.4828, + "step": 11029 + }, + { + "epoch": 14.1184, + "grad_norm": 0.9476427435874939, + "learning_rate": 2.7955182072829133e-05, + "loss": 0.4894, + "step": 11030 + }, + { + "epoch": 14.11968, + "grad_norm": 0.9640700817108154, + "learning_rate": 2.7953181272509005e-05, + "loss": 0.5322, + "step": 11031 + }, + { + "epoch": 14.12096, + "grad_norm": 0.9432242512702942, + "learning_rate": 2.7951180472188877e-05, + "loss": 0.5413, + "step": 11032 + }, + { + "epoch": 14.12224, + "grad_norm": 0.9809675216674805, + "learning_rate": 2.7949179671868745e-05, + "loss": 0.5346, + "step": 11033 + }, + { + "epoch": 14.12352, + "grad_norm": 0.9683305621147156, + "learning_rate": 2.7947178871548617e-05, + "loss": 0.5097, + "step": 11034 + }, + { + "epoch": 14.1248, + "grad_norm": 0.9600328803062439, + "learning_rate": 2.7945178071228496e-05, + "loss": 0.5013, + "step": 11035 + }, + { + "epoch": 14.12608, + "grad_norm": 0.9506182670593262, + "learning_rate": 2.7943177270908368e-05, + "loss": 0.491, + "step": 11036 + }, + { + "epoch": 14.12736, + "grad_norm": 0.9323163628578186, + "learning_rate": 2.7941176470588236e-05, + "loss": 0.5299, + "step": 11037 + }, + { + "epoch": 14.12864, + "grad_norm": 0.9958814978599548, + "learning_rate": 2.7939175670268108e-05, + "loss": 0.5498, + "step": 11038 + }, + { + "epoch": 14.12992, + "grad_norm": 0.9677280187606812, + "learning_rate": 2.793717486994798e-05, + "loss": 0.5417, + "step": 11039 + 
}, + { + "epoch": 14.1312, + "grad_norm": 0.9545553922653198, + "learning_rate": 2.7935174069627852e-05, + "loss": 0.4917, + "step": 11040 + }, + { + "epoch": 14.13248, + "grad_norm": 0.9735832214355469, + "learning_rate": 2.793317326930772e-05, + "loss": 0.5313, + "step": 11041 + }, + { + "epoch": 14.13376, + "grad_norm": 0.9970044493675232, + "learning_rate": 2.79311724689876e-05, + "loss": 0.5481, + "step": 11042 + }, + { + "epoch": 14.13504, + "grad_norm": 0.9880550503730774, + "learning_rate": 2.792917166866747e-05, + "loss": 0.5117, + "step": 11043 + }, + { + "epoch": 14.13632, + "grad_norm": 0.9498565196990967, + "learning_rate": 2.7927170868347343e-05, + "loss": 0.5318, + "step": 11044 + }, + { + "epoch": 14.1376, + "grad_norm": 0.9987189173698425, + "learning_rate": 2.792517006802721e-05, + "loss": 0.5665, + "step": 11045 + }, + { + "epoch": 14.13888, + "grad_norm": 0.9754243493080139, + "learning_rate": 2.7923169267707083e-05, + "loss": 0.5245, + "step": 11046 + }, + { + "epoch": 14.14016, + "grad_norm": 0.9672796130180359, + "learning_rate": 2.7921168467386955e-05, + "loss": 0.5555, + "step": 11047 + }, + { + "epoch": 14.14144, + "grad_norm": 0.9213780760765076, + "learning_rate": 2.7919167667066827e-05, + "loss": 0.4929, + "step": 11048 + }, + { + "epoch": 14.14272, + "grad_norm": 0.893765389919281, + "learning_rate": 2.7917166866746702e-05, + "loss": 0.4783, + "step": 11049 + }, + { + "epoch": 14.144, + "grad_norm": 0.9702581763267517, + "learning_rate": 2.7915166066426574e-05, + "loss": 0.5511, + "step": 11050 + }, + { + "epoch": 14.14528, + "grad_norm": 0.9254672527313232, + "learning_rate": 2.7913165266106446e-05, + "loss": 0.5087, + "step": 11051 + }, + { + "epoch": 14.14656, + "grad_norm": 0.9144052267074585, + "learning_rate": 2.7911164465786317e-05, + "loss": 0.5502, + "step": 11052 + }, + { + "epoch": 14.14784, + "grad_norm": 0.937017560005188, + "learning_rate": 2.7909163665466186e-05, + "loss": 0.5214, + "step": 11053 + }, + { + "epoch": 
14.14912, + "grad_norm": 0.9533378481864929, + "learning_rate": 2.7907162865146058e-05, + "loss": 0.5256, + "step": 11054 + }, + { + "epoch": 14.1504, + "grad_norm": 1.019874095916748, + "learning_rate": 2.790516206482593e-05, + "loss": 0.5798, + "step": 11055 + }, + { + "epoch": 14.15168, + "grad_norm": 1.0000838041305542, + "learning_rate": 2.7903161264505805e-05, + "loss": 0.5462, + "step": 11056 + }, + { + "epoch": 14.15296, + "grad_norm": 0.9633542895317078, + "learning_rate": 2.7901160464185677e-05, + "loss": 0.5429, + "step": 11057 + }, + { + "epoch": 14.15424, + "grad_norm": 0.929341733455658, + "learning_rate": 2.789915966386555e-05, + "loss": 0.5231, + "step": 11058 + }, + { + "epoch": 14.15552, + "grad_norm": 0.9698876142501831, + "learning_rate": 2.789715886354542e-05, + "loss": 0.5094, + "step": 11059 + }, + { + "epoch": 14.1568, + "grad_norm": 0.9775946736335754, + "learning_rate": 2.7895158063225292e-05, + "loss": 0.5122, + "step": 11060 + }, + { + "epoch": 14.15808, + "grad_norm": 0.991977870464325, + "learning_rate": 2.789315726290516e-05, + "loss": 0.5952, + "step": 11061 + }, + { + "epoch": 14.15936, + "grad_norm": 0.9593073129653931, + "learning_rate": 2.7891156462585033e-05, + "loss": 0.4648, + "step": 11062 + }, + { + "epoch": 14.16064, + "grad_norm": 0.9783948063850403, + "learning_rate": 2.788915566226491e-05, + "loss": 0.5765, + "step": 11063 + }, + { + "epoch": 14.16192, + "grad_norm": 0.982947826385498, + "learning_rate": 2.788715486194478e-05, + "loss": 0.5264, + "step": 11064 + }, + { + "epoch": 14.1632, + "grad_norm": 0.9915079474449158, + "learning_rate": 2.788515406162465e-05, + "loss": 0.5371, + "step": 11065 + }, + { + "epoch": 14.16448, + "grad_norm": 0.9839328527450562, + "learning_rate": 2.7883153261304523e-05, + "loss": 0.5434, + "step": 11066 + }, + { + "epoch": 14.16576, + "grad_norm": 0.9635565280914307, + "learning_rate": 2.7881152460984395e-05, + "loss": 0.5017, + "step": 11067 + }, + { + "epoch": 14.16704, + "grad_norm": 
0.9151342511177063, + "learning_rate": 2.7879151660664267e-05, + "loss": 0.5008, + "step": 11068 + }, + { + "epoch": 14.16832, + "grad_norm": 0.9379957914352417, + "learning_rate": 2.7877150860344136e-05, + "loss": 0.5265, + "step": 11069 + }, + { + "epoch": 14.1696, + "grad_norm": 0.9489511251449585, + "learning_rate": 2.7875150060024014e-05, + "loss": 0.4828, + "step": 11070 + }, + { + "epoch": 14.17088, + "grad_norm": 0.9574431777000427, + "learning_rate": 2.7873149259703886e-05, + "loss": 0.5325, + "step": 11071 + }, + { + "epoch": 14.17216, + "grad_norm": 0.9298610091209412, + "learning_rate": 2.7871148459383755e-05, + "loss": 0.5055, + "step": 11072 + }, + { + "epoch": 14.17344, + "grad_norm": 1.00288724899292, + "learning_rate": 2.7869147659063626e-05, + "loss": 0.5837, + "step": 11073 + }, + { + "epoch": 14.17472, + "grad_norm": 0.9784044623374939, + "learning_rate": 2.78671468587435e-05, + "loss": 0.5425, + "step": 11074 + }, + { + "epoch": 14.176, + "grad_norm": 0.9960471391677856, + "learning_rate": 2.786514605842337e-05, + "loss": 0.5137, + "step": 11075 + }, + { + "epoch": 14.17728, + "grad_norm": 1.01795494556427, + "learning_rate": 2.7863145258103242e-05, + "loss": 0.5483, + "step": 11076 + }, + { + "epoch": 14.17856, + "grad_norm": 1.0112005472183228, + "learning_rate": 2.7861144457783117e-05, + "loss": 0.5657, + "step": 11077 + }, + { + "epoch": 14.17984, + "grad_norm": 0.9345349073410034, + "learning_rate": 2.785914365746299e-05, + "loss": 0.5126, + "step": 11078 + }, + { + "epoch": 14.18112, + "grad_norm": 0.9684646725654602, + "learning_rate": 2.785714285714286e-05, + "loss": 0.4951, + "step": 11079 + }, + { + "epoch": 14.1824, + "grad_norm": 0.9888444542884827, + "learning_rate": 2.785514205682273e-05, + "loss": 0.5313, + "step": 11080 + }, + { + "epoch": 14.18368, + "grad_norm": 0.9643538594245911, + "learning_rate": 2.78531412565026e-05, + "loss": 0.5066, + "step": 11081 + }, + { + "epoch": 14.18496, + "grad_norm": 1.0346754789352417, + 
"learning_rate": 2.7851140456182473e-05, + "loss": 0.5683, + "step": 11082 + }, + { + "epoch": 14.18624, + "grad_norm": 0.9812901020050049, + "learning_rate": 2.7849139655862345e-05, + "loss": 0.5395, + "step": 11083 + }, + { + "epoch": 14.18752, + "grad_norm": 1.0163651704788208, + "learning_rate": 2.784713885554222e-05, + "loss": 0.5592, + "step": 11084 + }, + { + "epoch": 14.1888, + "grad_norm": 0.9411138892173767, + "learning_rate": 2.7845138055222092e-05, + "loss": 0.5013, + "step": 11085 + }, + { + "epoch": 14.19008, + "grad_norm": 0.9784141182899475, + "learning_rate": 2.7843137254901964e-05, + "loss": 0.5097, + "step": 11086 + }, + { + "epoch": 14.19136, + "grad_norm": 0.9341786503791809, + "learning_rate": 2.7841136454581836e-05, + "loss": 0.5253, + "step": 11087 + }, + { + "epoch": 14.19264, + "grad_norm": 0.9971571564674377, + "learning_rate": 2.7839135654261704e-05, + "loss": 0.5746, + "step": 11088 + }, + { + "epoch": 14.19392, + "grad_norm": 0.9248690009117126, + "learning_rate": 2.7837134853941576e-05, + "loss": 0.5079, + "step": 11089 + }, + { + "epoch": 14.1952, + "grad_norm": 0.9347603917121887, + "learning_rate": 2.7835134053621448e-05, + "loss": 0.5212, + "step": 11090 + }, + { + "epoch": 14.19648, + "grad_norm": 0.9436104893684387, + "learning_rate": 2.7833133253301323e-05, + "loss": 0.5332, + "step": 11091 + }, + { + "epoch": 14.19776, + "grad_norm": 0.9841783046722412, + "learning_rate": 2.7831132452981195e-05, + "loss": 0.524, + "step": 11092 + }, + { + "epoch": 14.19904, + "grad_norm": 0.9630305171012878, + "learning_rate": 2.7829131652661067e-05, + "loss": 0.5511, + "step": 11093 + }, + { + "epoch": 14.20032, + "grad_norm": 0.9855948090553284, + "learning_rate": 2.782713085234094e-05, + "loss": 0.573, + "step": 11094 + }, + { + "epoch": 14.2016, + "grad_norm": 0.9675716161727905, + "learning_rate": 2.782513005202081e-05, + "loss": 0.5066, + "step": 11095 + }, + { + "epoch": 14.20288, + "grad_norm": 1.0122812986373901, + "learning_rate": 
2.782312925170068e-05, + "loss": 0.59, + "step": 11096 + }, + { + "epoch": 14.20416, + "grad_norm": 0.9795871376991272, + "learning_rate": 2.782112845138055e-05, + "loss": 0.5633, + "step": 11097 + }, + { + "epoch": 14.20544, + "grad_norm": 0.9450654983520508, + "learning_rate": 2.781912765106043e-05, + "loss": 0.5224, + "step": 11098 + }, + { + "epoch": 14.20672, + "grad_norm": 0.9646390080451965, + "learning_rate": 2.7817126850740298e-05, + "loss": 0.5386, + "step": 11099 + }, + { + "epoch": 14.208, + "grad_norm": 0.9584577679634094, + "learning_rate": 2.781512605042017e-05, + "loss": 0.493, + "step": 11100 + }, + { + "epoch": 14.20928, + "grad_norm": 1.0012218952178955, + "learning_rate": 2.7813125250100042e-05, + "loss": 0.5086, + "step": 11101 + }, + { + "epoch": 14.21056, + "grad_norm": 0.9794235229492188, + "learning_rate": 2.7811124449779914e-05, + "loss": 0.5184, + "step": 11102 + }, + { + "epoch": 14.21184, + "grad_norm": 0.9602649211883545, + "learning_rate": 2.7809123649459786e-05, + "loss": 0.558, + "step": 11103 + }, + { + "epoch": 14.21312, + "grad_norm": 0.90842205286026, + "learning_rate": 2.7807122849139654e-05, + "loss": 0.4799, + "step": 11104 + }, + { + "epoch": 14.2144, + "grad_norm": 0.9867070913314819, + "learning_rate": 2.7805122048819533e-05, + "loss": 0.5553, + "step": 11105 + }, + { + "epoch": 14.21568, + "grad_norm": 0.9673285484313965, + "learning_rate": 2.7803121248499405e-05, + "loss": 0.5077, + "step": 11106 + }, + { + "epoch": 14.21696, + "grad_norm": 1.000418782234192, + "learning_rate": 2.7801120448179273e-05, + "loss": 0.5389, + "step": 11107 + }, + { + "epoch": 14.21824, + "grad_norm": 0.9656979441642761, + "learning_rate": 2.7799119647859145e-05, + "loss": 0.5318, + "step": 11108 + }, + { + "epoch": 14.21952, + "grad_norm": 0.9900228381156921, + "learning_rate": 2.7797118847539017e-05, + "loss": 0.5487, + "step": 11109 + }, + { + "epoch": 14.2208, + "grad_norm": 0.9100193977355957, + "learning_rate": 2.779511804721889e-05, + 
"loss": 0.4788, + "step": 11110 + }, + { + "epoch": 14.22208, + "grad_norm": 0.9718891978263855, + "learning_rate": 2.779311724689876e-05, + "loss": 0.5599, + "step": 11111 + }, + { + "epoch": 14.22336, + "grad_norm": 1.0352201461791992, + "learning_rate": 2.7791116446578636e-05, + "loss": 0.5693, + "step": 11112 + }, + { + "epoch": 14.22464, + "grad_norm": 0.9154449105262756, + "learning_rate": 2.7789115646258508e-05, + "loss": 0.5197, + "step": 11113 + }, + { + "epoch": 14.22592, + "grad_norm": 0.9313166737556458, + "learning_rate": 2.778711484593838e-05, + "loss": 0.5425, + "step": 11114 + }, + { + "epoch": 14.2272, + "grad_norm": 0.9067119359970093, + "learning_rate": 2.7785114045618248e-05, + "loss": 0.4738, + "step": 11115 + }, + { + "epoch": 14.22848, + "grad_norm": 0.919452428817749, + "learning_rate": 2.778311324529812e-05, + "loss": 0.4808, + "step": 11116 + }, + { + "epoch": 14.22976, + "grad_norm": 0.9764316082000732, + "learning_rate": 2.778111244497799e-05, + "loss": 0.5508, + "step": 11117 + }, + { + "epoch": 14.23104, + "grad_norm": 1.002416968345642, + "learning_rate": 2.7779111644657863e-05, + "loss": 0.5727, + "step": 11118 + }, + { + "epoch": 14.23232, + "grad_norm": 0.9473809599876404, + "learning_rate": 2.7777110844337735e-05, + "loss": 0.515, + "step": 11119 + }, + { + "epoch": 14.2336, + "grad_norm": 0.9346858263015747, + "learning_rate": 2.777511004401761e-05, + "loss": 0.5267, + "step": 11120 + }, + { + "epoch": 14.23488, + "grad_norm": 0.9307919144630432, + "learning_rate": 2.7773109243697482e-05, + "loss": 0.4998, + "step": 11121 + }, + { + "epoch": 14.23616, + "grad_norm": 0.9973614811897278, + "learning_rate": 2.7771108443377354e-05, + "loss": 0.582, + "step": 11122 + }, + { + "epoch": 14.23744, + "grad_norm": 0.994784414768219, + "learning_rate": 2.7769107643057223e-05, + "loss": 0.5207, + "step": 11123 + }, + { + "epoch": 14.23872, + "grad_norm": 0.9563934206962585, + "learning_rate": 2.7767106842737095e-05, + "loss": 0.4937, + 
"step": 11124 + }, + { + "epoch": 14.24, + "grad_norm": 0.9174085855484009, + "learning_rate": 2.7765106042416966e-05, + "loss": 0.5371, + "step": 11125 + }, + { + "epoch": 14.24128, + "grad_norm": 0.9807153940200806, + "learning_rate": 2.7763105242096838e-05, + "loss": 0.5334, + "step": 11126 + }, + { + "epoch": 14.24256, + "grad_norm": 0.9536086916923523, + "learning_rate": 2.7761104441776714e-05, + "loss": 0.5261, + "step": 11127 + }, + { + "epoch": 14.24384, + "grad_norm": 0.9382270574569702, + "learning_rate": 2.7759103641456585e-05, + "loss": 0.5195, + "step": 11128 + }, + { + "epoch": 14.24512, + "grad_norm": 0.98002028465271, + "learning_rate": 2.7757102841136457e-05, + "loss": 0.5247, + "step": 11129 + }, + { + "epoch": 14.2464, + "grad_norm": 0.940831184387207, + "learning_rate": 2.775510204081633e-05, + "loss": 0.4827, + "step": 11130 + }, + { + "epoch": 14.24768, + "grad_norm": 0.8894546031951904, + "learning_rate": 2.7753101240496198e-05, + "loss": 0.5129, + "step": 11131 + }, + { + "epoch": 14.24896, + "grad_norm": 0.8587113618850708, + "learning_rate": 2.775110044017607e-05, + "loss": 0.4721, + "step": 11132 + }, + { + "epoch": 14.25024, + "grad_norm": 0.9889330863952637, + "learning_rate": 2.774909963985594e-05, + "loss": 0.573, + "step": 11133 + }, + { + "epoch": 14.25152, + "grad_norm": 0.9522219300270081, + "learning_rate": 2.7747098839535817e-05, + "loss": 0.4859, + "step": 11134 + }, + { + "epoch": 14.2528, + "grad_norm": 1.0236855745315552, + "learning_rate": 2.774509803921569e-05, + "loss": 0.5632, + "step": 11135 + }, + { + "epoch": 14.25408, + "grad_norm": 0.9715608358383179, + "learning_rate": 2.774309723889556e-05, + "loss": 0.5655, + "step": 11136 + }, + { + "epoch": 14.25536, + "grad_norm": 0.9080358147621155, + "learning_rate": 2.7741096438575432e-05, + "loss": 0.5086, + "step": 11137 + }, + { + "epoch": 14.25664, + "grad_norm": 0.9496445059776306, + "learning_rate": 2.7739095638255304e-05, + "loss": 0.5476, + "step": 11138 + }, + { + 
"epoch": 14.25792, + "grad_norm": 0.9452351331710815, + "learning_rate": 2.7737094837935172e-05, + "loss": 0.4651, + "step": 11139 + }, + { + "epoch": 14.2592, + "grad_norm": 0.9179670810699463, + "learning_rate": 2.7735094037615044e-05, + "loss": 0.523, + "step": 11140 + }, + { + "epoch": 14.26048, + "grad_norm": 1.0410470962524414, + "learning_rate": 2.7733093237294923e-05, + "loss": 0.5671, + "step": 11141 + }, + { + "epoch": 14.26176, + "grad_norm": 1.0138649940490723, + "learning_rate": 2.773109243697479e-05, + "loss": 0.5479, + "step": 11142 + }, + { + "epoch": 14.26304, + "grad_norm": 1.071816325187683, + "learning_rate": 2.7729091636654663e-05, + "loss": 0.5894, + "step": 11143 + }, + { + "epoch": 14.26432, + "grad_norm": 0.9818564653396606, + "learning_rate": 2.7727090836334535e-05, + "loss": 0.5264, + "step": 11144 + }, + { + "epoch": 14.2656, + "grad_norm": 1.001348853111267, + "learning_rate": 2.7725090036014407e-05, + "loss": 0.4765, + "step": 11145 + }, + { + "epoch": 14.26688, + "grad_norm": 1.0460922718048096, + "learning_rate": 2.772308923569428e-05, + "loss": 0.5337, + "step": 11146 + }, + { + "epoch": 14.26816, + "grad_norm": 0.9920286536216736, + "learning_rate": 2.7721088435374147e-05, + "loss": 0.5336, + "step": 11147 + }, + { + "epoch": 14.26944, + "grad_norm": 0.9713577628135681, + "learning_rate": 2.7719087635054026e-05, + "loss": 0.5586, + "step": 11148 + }, + { + "epoch": 14.27072, + "grad_norm": 0.9439414143562317, + "learning_rate": 2.7717086834733898e-05, + "loss": 0.5438, + "step": 11149 + }, + { + "epoch": 14.272, + "grad_norm": 0.956102192401886, + "learning_rate": 2.7715086034413766e-05, + "loss": 0.5383, + "step": 11150 + }, + { + "epoch": 14.27328, + "grad_norm": 0.9565941095352173, + "learning_rate": 2.7713085234093638e-05, + "loss": 0.5267, + "step": 11151 + }, + { + "epoch": 14.27456, + "grad_norm": 0.9341673851013184, + "learning_rate": 2.771108443377351e-05, + "loss": 0.5157, + "step": 11152 + }, + { + "epoch": 14.27584, + 
"grad_norm": 0.948688268661499, + "learning_rate": 2.7709083633453382e-05, + "loss": 0.5132, + "step": 11153 + }, + { + "epoch": 14.27712, + "grad_norm": 1.0164926052093506, + "learning_rate": 2.7707082833133254e-05, + "loss": 0.5641, + "step": 11154 + }, + { + "epoch": 14.2784, + "grad_norm": 0.9440988302230835, + "learning_rate": 2.770508203281313e-05, + "loss": 0.5071, + "step": 11155 + }, + { + "epoch": 14.27968, + "grad_norm": 0.9467429518699646, + "learning_rate": 2.7703081232493e-05, + "loss": 0.4859, + "step": 11156 + }, + { + "epoch": 14.28096, + "grad_norm": 0.923644483089447, + "learning_rate": 2.7701080432172873e-05, + "loss": 0.4908, + "step": 11157 + }, + { + "epoch": 14.28224, + "grad_norm": 0.924755334854126, + "learning_rate": 2.769907963185274e-05, + "loss": 0.5031, + "step": 11158 + }, + { + "epoch": 14.28352, + "grad_norm": 0.971537709236145, + "learning_rate": 2.7697078831532613e-05, + "loss": 0.55, + "step": 11159 + }, + { + "epoch": 14.2848, + "grad_norm": 0.9279425144195557, + "learning_rate": 2.7695078031212485e-05, + "loss": 0.4737, + "step": 11160 + }, + { + "epoch": 14.28608, + "grad_norm": 1.0148504972457886, + "learning_rate": 2.7693077230892357e-05, + "loss": 0.562, + "step": 11161 + }, + { + "epoch": 14.28736, + "grad_norm": 1.0114786624908447, + "learning_rate": 2.7691076430572232e-05, + "loss": 0.5845, + "step": 11162 + }, + { + "epoch": 14.288640000000001, + "grad_norm": 1.0205031633377075, + "learning_rate": 2.7689075630252104e-05, + "loss": 0.5958, + "step": 11163 + }, + { + "epoch": 14.28992, + "grad_norm": 1.0409082174301147, + "learning_rate": 2.7687074829931976e-05, + "loss": 0.5586, + "step": 11164 + }, + { + "epoch": 14.2912, + "grad_norm": 0.9876707196235657, + "learning_rate": 2.7685074029611847e-05, + "loss": 0.5519, + "step": 11165 + }, + { + "epoch": 14.29248, + "grad_norm": 1.0097538232803345, + "learning_rate": 2.7683073229291716e-05, + "loss": 0.5301, + "step": 11166 + }, + { + "epoch": 14.29376, + "grad_norm": 
0.9804957509040833, + "learning_rate": 2.7681072428971588e-05, + "loss": 0.5528, + "step": 11167 + }, + { + "epoch": 14.29504, + "grad_norm": 0.9977876543998718, + "learning_rate": 2.767907162865146e-05, + "loss": 0.5245, + "step": 11168 + }, + { + "epoch": 14.29632, + "grad_norm": 0.9734411239624023, + "learning_rate": 2.7677070828331335e-05, + "loss": 0.5306, + "step": 11169 + }, + { + "epoch": 14.2976, + "grad_norm": 0.9642010927200317, + "learning_rate": 2.7675070028011207e-05, + "loss": 0.5472, + "step": 11170 + }, + { + "epoch": 14.29888, + "grad_norm": 1.0015202760696411, + "learning_rate": 2.767306922769108e-05, + "loss": 0.5343, + "step": 11171 + }, + { + "epoch": 14.30016, + "grad_norm": 0.9820809364318848, + "learning_rate": 2.767106842737095e-05, + "loss": 0.5606, + "step": 11172 + }, + { + "epoch": 14.30144, + "grad_norm": 0.977370023727417, + "learning_rate": 2.7669067627050822e-05, + "loss": 0.5411, + "step": 11173 + }, + { + "epoch": 14.30272, + "grad_norm": 0.9267496466636658, + "learning_rate": 2.766706682673069e-05, + "loss": 0.4905, + "step": 11174 + }, + { + "epoch": 14.304, + "grad_norm": 0.9372310042381287, + "learning_rate": 2.7665066026410563e-05, + "loss": 0.521, + "step": 11175 + }, + { + "epoch": 14.30528, + "grad_norm": 0.9425508975982666, + "learning_rate": 2.766306522609044e-05, + "loss": 0.5014, + "step": 11176 + }, + { + "epoch": 14.30656, + "grad_norm": 0.9979614019393921, + "learning_rate": 2.766106442577031e-05, + "loss": 0.5129, + "step": 11177 + }, + { + "epoch": 14.30784, + "grad_norm": 1.0363802909851074, + "learning_rate": 2.765906362545018e-05, + "loss": 0.5654, + "step": 11178 + }, + { + "epoch": 14.30912, + "grad_norm": 1.0523051023483276, + "learning_rate": 2.7657062825130053e-05, + "loss": 0.5443, + "step": 11179 + }, + { + "epoch": 14.3104, + "grad_norm": 0.9782631993293762, + "learning_rate": 2.7655062024809925e-05, + "loss": 0.5205, + "step": 11180 + }, + { + "epoch": 14.31168, + "grad_norm": 0.9576646685600281, + 
"learning_rate": 2.7653061224489797e-05, + "loss": 0.5151, + "step": 11181 + }, + { + "epoch": 14.31296, + "grad_norm": 0.9194750189781189, + "learning_rate": 2.7651060424169666e-05, + "loss": 0.5067, + "step": 11182 + }, + { + "epoch": 14.31424, + "grad_norm": 0.9091538786888123, + "learning_rate": 2.7649059623849544e-05, + "loss": 0.4496, + "step": 11183 + }, + { + "epoch": 14.31552, + "grad_norm": 1.0529427528381348, + "learning_rate": 2.7647058823529416e-05, + "loss": 0.6212, + "step": 11184 + }, + { + "epoch": 14.3168, + "grad_norm": 1.0431804656982422, + "learning_rate": 2.7645058023209285e-05, + "loss": 0.6295, + "step": 11185 + }, + { + "epoch": 14.31808, + "grad_norm": 0.9597348570823669, + "learning_rate": 2.7643057222889156e-05, + "loss": 0.52, + "step": 11186 + }, + { + "epoch": 14.31936, + "grad_norm": 1.0246033668518066, + "learning_rate": 2.7641056422569028e-05, + "loss": 0.5113, + "step": 11187 + }, + { + "epoch": 14.32064, + "grad_norm": 0.9777224659919739, + "learning_rate": 2.76390556222489e-05, + "loss": 0.4957, + "step": 11188 + }, + { + "epoch": 14.32192, + "grad_norm": 0.9850444793701172, + "learning_rate": 2.7637054821928772e-05, + "loss": 0.5251, + "step": 11189 + }, + { + "epoch": 14.3232, + "grad_norm": 0.9483627080917358, + "learning_rate": 2.7635054021608647e-05, + "loss": 0.5139, + "step": 11190 + }, + { + "epoch": 14.32448, + "grad_norm": 0.9668890833854675, + "learning_rate": 2.763305322128852e-05, + "loss": 0.5483, + "step": 11191 + }, + { + "epoch": 14.32576, + "grad_norm": 0.9546102285385132, + "learning_rate": 2.763105242096839e-05, + "loss": 0.5344, + "step": 11192 + }, + { + "epoch": 14.32704, + "grad_norm": 0.9604453444480896, + "learning_rate": 2.762905162064826e-05, + "loss": 0.5439, + "step": 11193 + }, + { + "epoch": 14.32832, + "grad_norm": 0.984946072101593, + "learning_rate": 2.762705082032813e-05, + "loss": 0.5376, + "step": 11194 + }, + { + "epoch": 14.3296, + "grad_norm": 1.022682547569275, + "learning_rate": 
2.7625050020008003e-05, + "loss": 0.5918, + "step": 11195 + }, + { + "epoch": 14.33088, + "grad_norm": 0.9947453141212463, + "learning_rate": 2.7623049219687875e-05, + "loss": 0.556, + "step": 11196 + }, + { + "epoch": 14.33216, + "grad_norm": 0.9552220106124878, + "learning_rate": 2.762104841936775e-05, + "loss": 0.5282, + "step": 11197 + }, + { + "epoch": 14.33344, + "grad_norm": 0.9819296598434448, + "learning_rate": 2.7619047619047622e-05, + "loss": 0.474, + "step": 11198 + }, + { + "epoch": 14.33472, + "grad_norm": 1.0161857604980469, + "learning_rate": 2.7617046818727494e-05, + "loss": 0.5428, + "step": 11199 + }, + { + "epoch": 14.336, + "grad_norm": 0.950628399848938, + "learning_rate": 2.7615046018407366e-05, + "loss": 0.5011, + "step": 11200 + }, + { + "epoch": 14.33728, + "grad_norm": 0.9224458336830139, + "learning_rate": 2.7613045218087234e-05, + "loss": 0.5053, + "step": 11201 + }, + { + "epoch": 14.33856, + "grad_norm": 0.9512698650360107, + "learning_rate": 2.7611044417767106e-05, + "loss": 0.5124, + "step": 11202 + }, + { + "epoch": 14.33984, + "grad_norm": 0.905447244644165, + "learning_rate": 2.7609043617446978e-05, + "loss": 0.4627, + "step": 11203 + }, + { + "epoch": 14.34112, + "grad_norm": 0.9614179730415344, + "learning_rate": 2.7607042817126853e-05, + "loss": 0.5108, + "step": 11204 + }, + { + "epoch": 14.3424, + "grad_norm": 1.0302612781524658, + "learning_rate": 2.7605042016806725e-05, + "loss": 0.588, + "step": 11205 + }, + { + "epoch": 14.343679999999999, + "grad_norm": 0.9715131521224976, + "learning_rate": 2.7603041216486597e-05, + "loss": 0.5215, + "step": 11206 + }, + { + "epoch": 14.34496, + "grad_norm": 0.9700056314468384, + "learning_rate": 2.760104041616647e-05, + "loss": 0.511, + "step": 11207 + }, + { + "epoch": 14.34624, + "grad_norm": 0.9388474225997925, + "learning_rate": 2.759903961584634e-05, + "loss": 0.5359, + "step": 11208 + }, + { + "epoch": 14.34752, + "grad_norm": 0.9849310517311096, + "learning_rate": 
2.759703881552621e-05, + "loss": 0.5323, + "step": 11209 + }, + { + "epoch": 14.3488, + "grad_norm": 0.990974485874176, + "learning_rate": 2.759503801520608e-05, + "loss": 0.5483, + "step": 11210 + }, + { + "epoch": 14.35008, + "grad_norm": 1.0002540349960327, + "learning_rate": 2.759303721488596e-05, + "loss": 0.5135, + "step": 11211 + }, + { + "epoch": 14.35136, + "grad_norm": 0.9734601378440857, + "learning_rate": 2.7591036414565828e-05, + "loss": 0.5341, + "step": 11212 + }, + { + "epoch": 14.35264, + "grad_norm": 0.9525935649871826, + "learning_rate": 2.75890356142457e-05, + "loss": 0.5107, + "step": 11213 + }, + { + "epoch": 14.35392, + "grad_norm": 0.9455302357673645, + "learning_rate": 2.7587034813925572e-05, + "loss": 0.528, + "step": 11214 + }, + { + "epoch": 14.3552, + "grad_norm": 0.9510558843612671, + "learning_rate": 2.7585034013605444e-05, + "loss": 0.5205, + "step": 11215 + }, + { + "epoch": 14.35648, + "grad_norm": 0.9504905939102173, + "learning_rate": 2.7583033213285316e-05, + "loss": 0.5393, + "step": 11216 + }, + { + "epoch": 14.35776, + "grad_norm": 0.952308714389801, + "learning_rate": 2.7581032412965184e-05, + "loss": 0.5079, + "step": 11217 + }, + { + "epoch": 14.35904, + "grad_norm": 1.0244531631469727, + "learning_rate": 2.7579031612645063e-05, + "loss": 0.5758, + "step": 11218 + }, + { + "epoch": 14.36032, + "grad_norm": 0.9500291347503662, + "learning_rate": 2.7577030812324934e-05, + "loss": 0.5063, + "step": 11219 + }, + { + "epoch": 14.3616, + "grad_norm": 0.9819626808166504, + "learning_rate": 2.7575030012004803e-05, + "loss": 0.5453, + "step": 11220 + }, + { + "epoch": 14.36288, + "grad_norm": 0.9494471549987793, + "learning_rate": 2.7573029211684675e-05, + "loss": 0.525, + "step": 11221 + }, + { + "epoch": 14.36416, + "grad_norm": 0.9260940551757812, + "learning_rate": 2.7571028411364547e-05, + "loss": 0.5371, + "step": 11222 + }, + { + "epoch": 14.36544, + "grad_norm": 0.9827948808670044, + "learning_rate": 2.756902761104442e-05, 
+ "loss": 0.5559, + "step": 11223 + }, + { + "epoch": 14.36672, + "grad_norm": 0.9918002486228943, + "learning_rate": 2.756702681072429e-05, + "loss": 0.5184, + "step": 11224 + }, + { + "epoch": 14.368, + "grad_norm": 0.9910105466842651, + "learning_rate": 2.756502601040416e-05, + "loss": 0.571, + "step": 11225 + }, + { + "epoch": 14.36928, + "grad_norm": 1.0316925048828125, + "learning_rate": 2.7563025210084037e-05, + "loss": 0.5355, + "step": 11226 + }, + { + "epoch": 14.37056, + "grad_norm": 0.9141017198562622, + "learning_rate": 2.756102440976391e-05, + "loss": 0.4943, + "step": 11227 + }, + { + "epoch": 14.37184, + "grad_norm": 0.9571624994277954, + "learning_rate": 2.7559023609443778e-05, + "loss": 0.5288, + "step": 11228 + }, + { + "epoch": 14.37312, + "grad_norm": 0.95545893907547, + "learning_rate": 2.755702280912365e-05, + "loss": 0.5231, + "step": 11229 + }, + { + "epoch": 14.3744, + "grad_norm": 0.8849372863769531, + "learning_rate": 2.755502200880352e-05, + "loss": 0.4974, + "step": 11230 + }, + { + "epoch": 14.37568, + "grad_norm": 0.9513952136039734, + "learning_rate": 2.7553021208483393e-05, + "loss": 0.5517, + "step": 11231 + }, + { + "epoch": 14.37696, + "grad_norm": 0.9136958122253418, + "learning_rate": 2.7551020408163265e-05, + "loss": 0.5258, + "step": 11232 + }, + { + "epoch": 14.37824, + "grad_norm": 0.9673413038253784, + "learning_rate": 2.754901960784314e-05, + "loss": 0.5092, + "step": 11233 + }, + { + "epoch": 14.37952, + "grad_norm": 0.9891186952590942, + "learning_rate": 2.7547018807523012e-05, + "loss": 0.5038, + "step": 11234 + }, + { + "epoch": 14.3808, + "grad_norm": 0.9878220558166504, + "learning_rate": 2.7545018007202884e-05, + "loss": 0.5578, + "step": 11235 + }, + { + "epoch": 14.38208, + "grad_norm": 0.9703464508056641, + "learning_rate": 2.7543017206882753e-05, + "loss": 0.5463, + "step": 11236 + }, + { + "epoch": 14.38336, + "grad_norm": 1.003145456314087, + "learning_rate": 2.7541016406562625e-05, + "loss": 0.5513, + 
"step": 11237 + }, + { + "epoch": 14.38464, + "grad_norm": 0.9494891166687012, + "learning_rate": 2.7539015606242496e-05, + "loss": 0.4822, + "step": 11238 + }, + { + "epoch": 14.38592, + "grad_norm": 0.9667163491249084, + "learning_rate": 2.7537014805922368e-05, + "loss": 0.5371, + "step": 11239 + }, + { + "epoch": 14.3872, + "grad_norm": 0.9678400754928589, + "learning_rate": 2.7535014005602243e-05, + "loss": 0.5323, + "step": 11240 + }, + { + "epoch": 14.38848, + "grad_norm": 1.001518964767456, + "learning_rate": 2.7533013205282115e-05, + "loss": 0.5314, + "step": 11241 + }, + { + "epoch": 14.38976, + "grad_norm": 1.0005130767822266, + "learning_rate": 2.7531012404961987e-05, + "loss": 0.5229, + "step": 11242 + }, + { + "epoch": 14.39104, + "grad_norm": 0.9950253367424011, + "learning_rate": 2.752901160464186e-05, + "loss": 0.5628, + "step": 11243 + }, + { + "epoch": 14.39232, + "grad_norm": 0.8834913969039917, + "learning_rate": 2.7527010804321728e-05, + "loss": 0.4599, + "step": 11244 + }, + { + "epoch": 14.3936, + "grad_norm": 0.9402278661727905, + "learning_rate": 2.75250100040016e-05, + "loss": 0.5007, + "step": 11245 + }, + { + "epoch": 14.39488, + "grad_norm": 1.0537763833999634, + "learning_rate": 2.752300920368147e-05, + "loss": 0.569, + "step": 11246 + }, + { + "epoch": 14.39616, + "grad_norm": 1.0012062788009644, + "learning_rate": 2.7521008403361346e-05, + "loss": 0.5324, + "step": 11247 + }, + { + "epoch": 14.39744, + "grad_norm": 0.9675048589706421, + "learning_rate": 2.751900760304122e-05, + "loss": 0.5517, + "step": 11248 + }, + { + "epoch": 14.39872, + "grad_norm": 0.9507377743721008, + "learning_rate": 2.751700680272109e-05, + "loss": 0.5041, + "step": 11249 + }, + { + "epoch": 14.4, + "grad_norm": 0.9572357535362244, + "learning_rate": 2.7515006002400962e-05, + "loss": 0.5373, + "step": 11250 + }, + { + "epoch": 14.40128, + "grad_norm": 0.9814470410346985, + "learning_rate": 2.7513005202080834e-05, + "loss": 0.5297, + "step": 11251 + }, + { + 
"epoch": 14.40256, + "grad_norm": 0.9352920651435852, + "learning_rate": 2.7511004401760702e-05, + "loss": 0.4801, + "step": 11252 + }, + { + "epoch": 14.40384, + "grad_norm": 0.9089069962501526, + "learning_rate": 2.7509003601440574e-05, + "loss": 0.4697, + "step": 11253 + }, + { + "epoch": 14.40512, + "grad_norm": 0.9568341374397278, + "learning_rate": 2.7507002801120453e-05, + "loss": 0.53, + "step": 11254 + }, + { + "epoch": 14.4064, + "grad_norm": 0.9579848647117615, + "learning_rate": 2.750500200080032e-05, + "loss": 0.4911, + "step": 11255 + }, + { + "epoch": 14.40768, + "grad_norm": 0.9732376337051392, + "learning_rate": 2.7503001200480193e-05, + "loss": 0.5377, + "step": 11256 + }, + { + "epoch": 14.40896, + "grad_norm": 0.9632934331893921, + "learning_rate": 2.7501000400160065e-05, + "loss": 0.5283, + "step": 11257 + }, + { + "epoch": 14.41024, + "grad_norm": 0.9508453011512756, + "learning_rate": 2.7498999599839937e-05, + "loss": 0.5232, + "step": 11258 + }, + { + "epoch": 14.41152, + "grad_norm": 0.9188300371170044, + "learning_rate": 2.749699879951981e-05, + "loss": 0.522, + "step": 11259 + }, + { + "epoch": 14.4128, + "grad_norm": 0.9479809999465942, + "learning_rate": 2.7494997999199677e-05, + "loss": 0.5126, + "step": 11260 + }, + { + "epoch": 14.41408, + "grad_norm": 0.9927715063095093, + "learning_rate": 2.7492997198879556e-05, + "loss": 0.5527, + "step": 11261 + }, + { + "epoch": 14.41536, + "grad_norm": 1.0165045261383057, + "learning_rate": 2.7490996398559428e-05, + "loss": 0.5617, + "step": 11262 + }, + { + "epoch": 14.41664, + "grad_norm": 0.9869691133499146, + "learning_rate": 2.7488995598239296e-05, + "loss": 0.5413, + "step": 11263 + }, + { + "epoch": 14.41792, + "grad_norm": 0.9490245580673218, + "learning_rate": 2.7486994797919168e-05, + "loss": 0.4778, + "step": 11264 + }, + { + "epoch": 14.4192, + "grad_norm": 0.967729389667511, + "learning_rate": 2.748499399759904e-05, + "loss": 0.5177, + "step": 11265 + }, + { + "epoch": 14.42048, + 
"grad_norm": 0.9395437240600586, + "learning_rate": 2.7482993197278912e-05, + "loss": 0.5426, + "step": 11266 + }, + { + "epoch": 14.42176, + "grad_norm": 0.9222921133041382, + "learning_rate": 2.7480992396958784e-05, + "loss": 0.4713, + "step": 11267 + }, + { + "epoch": 14.42304, + "grad_norm": 0.9864060878753662, + "learning_rate": 2.747899159663866e-05, + "loss": 0.5087, + "step": 11268 + }, + { + "epoch": 14.42432, + "grad_norm": 0.9627559781074524, + "learning_rate": 2.747699079631853e-05, + "loss": 0.4965, + "step": 11269 + }, + { + "epoch": 14.4256, + "grad_norm": 0.9115942120552063, + "learning_rate": 2.7474989995998403e-05, + "loss": 0.4682, + "step": 11270 + }, + { + "epoch": 14.42688, + "grad_norm": 0.96644127368927, + "learning_rate": 2.747298919567827e-05, + "loss": 0.532, + "step": 11271 + }, + { + "epoch": 14.42816, + "grad_norm": 1.0033857822418213, + "learning_rate": 2.7470988395358143e-05, + "loss": 0.5577, + "step": 11272 + }, + { + "epoch": 14.42944, + "grad_norm": 1.0521118640899658, + "learning_rate": 2.7468987595038015e-05, + "loss": 0.5645, + "step": 11273 + }, + { + "epoch": 14.43072, + "grad_norm": 0.985821008682251, + "learning_rate": 2.7466986794717887e-05, + "loss": 0.5584, + "step": 11274 + }, + { + "epoch": 14.432, + "grad_norm": 1.0375871658325195, + "learning_rate": 2.7464985994397762e-05, + "loss": 0.5631, + "step": 11275 + }, + { + "epoch": 14.43328, + "grad_norm": 0.935610294342041, + "learning_rate": 2.7462985194077634e-05, + "loss": 0.4976, + "step": 11276 + }, + { + "epoch": 14.43456, + "grad_norm": 1.0138261318206787, + "learning_rate": 2.7460984393757506e-05, + "loss": 0.5645, + "step": 11277 + }, + { + "epoch": 14.43584, + "grad_norm": 1.0506336688995361, + "learning_rate": 2.7458983593437377e-05, + "loss": 0.5702, + "step": 11278 + }, + { + "epoch": 14.43712, + "grad_norm": 0.951737105846405, + "learning_rate": 2.7456982793117246e-05, + "loss": 0.4803, + "step": 11279 + }, + { + "epoch": 14.4384, + "grad_norm": 
0.9894746541976929, + "learning_rate": 2.7454981992797118e-05, + "loss": 0.5372, + "step": 11280 + }, + { + "epoch": 14.43968, + "grad_norm": 0.9843027591705322, + "learning_rate": 2.745298119247699e-05, + "loss": 0.5739, + "step": 11281 + }, + { + "epoch": 14.44096, + "grad_norm": 0.980569064617157, + "learning_rate": 2.7450980392156865e-05, + "loss": 0.4889, + "step": 11282 + }, + { + "epoch": 14.44224, + "grad_norm": 0.9253454208374023, + "learning_rate": 2.7448979591836737e-05, + "loss": 0.5233, + "step": 11283 + }, + { + "epoch": 14.44352, + "grad_norm": 0.9626498818397522, + "learning_rate": 2.744697879151661e-05, + "loss": 0.5467, + "step": 11284 + }, + { + "epoch": 14.4448, + "grad_norm": 0.9599905014038086, + "learning_rate": 2.744497799119648e-05, + "loss": 0.5341, + "step": 11285 + }, + { + "epoch": 14.44608, + "grad_norm": 0.9758439064025879, + "learning_rate": 2.7442977190876352e-05, + "loss": 0.5201, + "step": 11286 + }, + { + "epoch": 14.44736, + "grad_norm": 1.003113865852356, + "learning_rate": 2.744097639055622e-05, + "loss": 0.523, + "step": 11287 + }, + { + "epoch": 14.44864, + "grad_norm": 0.9655167460441589, + "learning_rate": 2.7438975590236093e-05, + "loss": 0.5333, + "step": 11288 + }, + { + "epoch": 14.44992, + "grad_norm": 0.9504974484443665, + "learning_rate": 2.743697478991597e-05, + "loss": 0.5356, + "step": 11289 + }, + { + "epoch": 14.4512, + "grad_norm": 0.978461503982544, + "learning_rate": 2.743497398959584e-05, + "loss": 0.5443, + "step": 11290 + }, + { + "epoch": 14.45248, + "grad_norm": 0.9474901556968689, + "learning_rate": 2.743297318927571e-05, + "loss": 0.489, + "step": 11291 + }, + { + "epoch": 14.45376, + "grad_norm": 0.9023638367652893, + "learning_rate": 2.7430972388955583e-05, + "loss": 0.4626, + "step": 11292 + }, + { + "epoch": 14.45504, + "grad_norm": 0.939704418182373, + "learning_rate": 2.7428971588635455e-05, + "loss": 0.4852, + "step": 11293 + }, + { + "epoch": 14.45632, + "grad_norm": 0.9497430920600891, + 
"learning_rate": 2.7426970788315327e-05, + "loss": 0.496, + "step": 11294 + }, + { + "epoch": 14.4576, + "grad_norm": 0.9100688695907593, + "learning_rate": 2.7424969987995196e-05, + "loss": 0.5156, + "step": 11295 + }, + { + "epoch": 14.45888, + "grad_norm": 0.9579876661300659, + "learning_rate": 2.7422969187675074e-05, + "loss": 0.493, + "step": 11296 + }, + { + "epoch": 14.46016, + "grad_norm": 0.9984821677207947, + "learning_rate": 2.7420968387354946e-05, + "loss": 0.5359, + "step": 11297 + }, + { + "epoch": 14.46144, + "grad_norm": 0.9672602415084839, + "learning_rate": 2.7418967587034815e-05, + "loss": 0.5613, + "step": 11298 + }, + { + "epoch": 14.462720000000001, + "grad_norm": 1.0040628910064697, + "learning_rate": 2.7416966786714686e-05, + "loss": 0.5751, + "step": 11299 + }, + { + "epoch": 14.464, + "grad_norm": 1.0041166543960571, + "learning_rate": 2.7414965986394558e-05, + "loss": 0.5865, + "step": 11300 + }, + { + "epoch": 14.46528, + "grad_norm": 0.950853168964386, + "learning_rate": 2.741296518607443e-05, + "loss": 0.5002, + "step": 11301 + }, + { + "epoch": 14.46656, + "grad_norm": 0.9579245448112488, + "learning_rate": 2.7410964385754302e-05, + "loss": 0.5227, + "step": 11302 + }, + { + "epoch": 14.46784, + "grad_norm": 0.9853748679161072, + "learning_rate": 2.7408963585434177e-05, + "loss": 0.5181, + "step": 11303 + }, + { + "epoch": 14.46912, + "grad_norm": 0.9100056290626526, + "learning_rate": 2.740696278511405e-05, + "loss": 0.4892, + "step": 11304 + }, + { + "epoch": 14.4704, + "grad_norm": 0.94825679063797, + "learning_rate": 2.740496198479392e-05, + "loss": 0.4969, + "step": 11305 + }, + { + "epoch": 14.47168, + "grad_norm": 0.9380550384521484, + "learning_rate": 2.740296118447379e-05, + "loss": 0.5133, + "step": 11306 + }, + { + "epoch": 14.47296, + "grad_norm": 0.9455284476280212, + "learning_rate": 2.740096038415366e-05, + "loss": 0.5307, + "step": 11307 + }, + { + "epoch": 14.47424, + "grad_norm": 0.95405513048172, + "learning_rate": 
2.7398959583833533e-05, + "loss": 0.4977, + "step": 11308 + }, + { + "epoch": 14.47552, + "grad_norm": 0.9854465126991272, + "learning_rate": 2.7396958783513405e-05, + "loss": 0.5289, + "step": 11309 + }, + { + "epoch": 14.4768, + "grad_norm": 1.018682599067688, + "learning_rate": 2.739495798319328e-05, + "loss": 0.5819, + "step": 11310 + }, + { + "epoch": 14.47808, + "grad_norm": 0.9085429310798645, + "learning_rate": 2.7392957182873152e-05, + "loss": 0.4768, + "step": 11311 + }, + { + "epoch": 14.47936, + "grad_norm": 1.025566577911377, + "learning_rate": 2.7390956382553024e-05, + "loss": 0.5631, + "step": 11312 + }, + { + "epoch": 14.48064, + "grad_norm": 0.9369559288024902, + "learning_rate": 2.7388955582232896e-05, + "loss": 0.5121, + "step": 11313 + }, + { + "epoch": 14.48192, + "grad_norm": 0.9811607003211975, + "learning_rate": 2.7386954781912764e-05, + "loss": 0.5434, + "step": 11314 + }, + { + "epoch": 14.4832, + "grad_norm": 0.9261855483055115, + "learning_rate": 2.7384953981592636e-05, + "loss": 0.547, + "step": 11315 + }, + { + "epoch": 14.48448, + "grad_norm": 0.9290302395820618, + "learning_rate": 2.7382953181272508e-05, + "loss": 0.5299, + "step": 11316 + }, + { + "epoch": 14.48576, + "grad_norm": 0.9527462124824524, + "learning_rate": 2.7380952380952383e-05, + "loss": 0.5147, + "step": 11317 + }, + { + "epoch": 14.48704, + "grad_norm": 0.9608799815177917, + "learning_rate": 2.7378951580632255e-05, + "loss": 0.5475, + "step": 11318 + }, + { + "epoch": 14.48832, + "grad_norm": 0.9241880774497986, + "learning_rate": 2.7376950780312127e-05, + "loss": 0.5598, + "step": 11319 + }, + { + "epoch": 14.4896, + "grad_norm": 0.948318362236023, + "learning_rate": 2.7374949979992e-05, + "loss": 0.5819, + "step": 11320 + }, + { + "epoch": 14.49088, + "grad_norm": 0.9118000268936157, + "learning_rate": 2.737294917967187e-05, + "loss": 0.488, + "step": 11321 + }, + { + "epoch": 14.49216, + "grad_norm": 0.9568605422973633, + "learning_rate": 2.737094837935174e-05, + 
"loss": 0.5289, + "step": 11322 + }, + { + "epoch": 14.49344, + "grad_norm": 0.9479710459709167, + "learning_rate": 2.736894757903161e-05, + "loss": 0.5361, + "step": 11323 + }, + { + "epoch": 14.49472, + "grad_norm": 0.9818686246871948, + "learning_rate": 2.736694677871149e-05, + "loss": 0.5237, + "step": 11324 + }, + { + "epoch": 14.496, + "grad_norm": 0.9445642828941345, + "learning_rate": 2.7364945978391358e-05, + "loss": 0.4977, + "step": 11325 + }, + { + "epoch": 14.49728, + "grad_norm": 0.9737011790275574, + "learning_rate": 2.736294517807123e-05, + "loss": 0.5182, + "step": 11326 + }, + { + "epoch": 14.49856, + "grad_norm": 0.9840577244758606, + "learning_rate": 2.7360944377751102e-05, + "loss": 0.5137, + "step": 11327 + }, + { + "epoch": 14.49984, + "grad_norm": 0.9450149536132812, + "learning_rate": 2.7358943577430974e-05, + "loss": 0.4973, + "step": 11328 + }, + { + "epoch": 14.50112, + "grad_norm": 0.9480723738670349, + "learning_rate": 2.7356942777110846e-05, + "loss": 0.5709, + "step": 11329 + }, + { + "epoch": 14.5024, + "grad_norm": 0.9991519451141357, + "learning_rate": 2.7354941976790714e-05, + "loss": 0.5573, + "step": 11330 + }, + { + "epoch": 14.50368, + "grad_norm": 0.9923670291900635, + "learning_rate": 2.7352941176470593e-05, + "loss": 0.6224, + "step": 11331 + }, + { + "epoch": 14.50496, + "grad_norm": 0.9765622615814209, + "learning_rate": 2.7350940376150464e-05, + "loss": 0.5312, + "step": 11332 + }, + { + "epoch": 14.50624, + "grad_norm": 0.978398859500885, + "learning_rate": 2.7348939575830333e-05, + "loss": 0.5265, + "step": 11333 + }, + { + "epoch": 14.50752, + "grad_norm": 1.0118024349212646, + "learning_rate": 2.7346938775510205e-05, + "loss": 0.5901, + "step": 11334 + }, + { + "epoch": 14.5088, + "grad_norm": 0.9619473814964294, + "learning_rate": 2.7344937975190077e-05, + "loss": 0.5405, + "step": 11335 + }, + { + "epoch": 14.51008, + "grad_norm": 0.970476508140564, + "learning_rate": 2.734293717486995e-05, + "loss": 0.5543, + 
"step": 11336 + }, + { + "epoch": 14.51136, + "grad_norm": 0.9238805174827576, + "learning_rate": 2.734093637454982e-05, + "loss": 0.4825, + "step": 11337 + }, + { + "epoch": 14.51264, + "grad_norm": 1.0360538959503174, + "learning_rate": 2.733893557422969e-05, + "loss": 0.5385, + "step": 11338 + }, + { + "epoch": 14.51392, + "grad_norm": 0.9870235323905945, + "learning_rate": 2.7336934773909567e-05, + "loss": 0.5012, + "step": 11339 + }, + { + "epoch": 14.5152, + "grad_norm": 0.9728274345397949, + "learning_rate": 2.733493397358944e-05, + "loss": 0.5382, + "step": 11340 + }, + { + "epoch": 14.51648, + "grad_norm": 0.989501416683197, + "learning_rate": 2.7332933173269308e-05, + "loss": 0.5216, + "step": 11341 + }, + { + "epoch": 14.517759999999999, + "grad_norm": 0.987844705581665, + "learning_rate": 2.733093237294918e-05, + "loss": 0.5553, + "step": 11342 + }, + { + "epoch": 14.51904, + "grad_norm": 0.9757768511772156, + "learning_rate": 2.732893157262905e-05, + "loss": 0.5604, + "step": 11343 + }, + { + "epoch": 14.52032, + "grad_norm": 0.9981015920639038, + "learning_rate": 2.7326930772308923e-05, + "loss": 0.5521, + "step": 11344 + }, + { + "epoch": 14.5216, + "grad_norm": 0.9795306921005249, + "learning_rate": 2.7324929971988795e-05, + "loss": 0.5164, + "step": 11345 + }, + { + "epoch": 14.52288, + "grad_norm": 0.9881823062896729, + "learning_rate": 2.732292917166867e-05, + "loss": 0.519, + "step": 11346 + }, + { + "epoch": 14.52416, + "grad_norm": 1.0085053443908691, + "learning_rate": 2.7320928371348542e-05, + "loss": 0.5578, + "step": 11347 + }, + { + "epoch": 14.52544, + "grad_norm": 1.0021884441375732, + "learning_rate": 2.7318927571028414e-05, + "loss": 0.5362, + "step": 11348 + }, + { + "epoch": 14.52672, + "grad_norm": 0.9783837795257568, + "learning_rate": 2.7316926770708283e-05, + "loss": 0.5256, + "step": 11349 + }, + { + "epoch": 14.528, + "grad_norm": 1.005036473274231, + "learning_rate": 2.7314925970388155e-05, + "loss": 0.5495, + "step": 11350 + 
}, + { + "epoch": 14.52928, + "grad_norm": 0.9625844359397888, + "learning_rate": 2.7312925170068026e-05, + "loss": 0.5355, + "step": 11351 + }, + { + "epoch": 14.53056, + "grad_norm": 0.961077094078064, + "learning_rate": 2.7310924369747898e-05, + "loss": 0.518, + "step": 11352 + }, + { + "epoch": 14.53184, + "grad_norm": 0.9395869970321655, + "learning_rate": 2.7308923569427773e-05, + "loss": 0.5194, + "step": 11353 + }, + { + "epoch": 14.53312, + "grad_norm": 0.9758129715919495, + "learning_rate": 2.7306922769107645e-05, + "loss": 0.5533, + "step": 11354 + }, + { + "epoch": 14.5344, + "grad_norm": 0.9722764492034912, + "learning_rate": 2.7304921968787517e-05, + "loss": 0.536, + "step": 11355 + }, + { + "epoch": 14.53568, + "grad_norm": 0.9164559245109558, + "learning_rate": 2.730292116846739e-05, + "loss": 0.468, + "step": 11356 + }, + { + "epoch": 14.53696, + "grad_norm": 1.0066019296646118, + "learning_rate": 2.7300920368147258e-05, + "loss": 0.5945, + "step": 11357 + }, + { + "epoch": 14.53824, + "grad_norm": 0.9370825886726379, + "learning_rate": 2.729891956782713e-05, + "loss": 0.5005, + "step": 11358 + }, + { + "epoch": 14.53952, + "grad_norm": 0.9669891595840454, + "learning_rate": 2.7296918767507e-05, + "loss": 0.5536, + "step": 11359 + }, + { + "epoch": 14.5408, + "grad_norm": 0.9711414575576782, + "learning_rate": 2.729491796718688e-05, + "loss": 0.5114, + "step": 11360 + }, + { + "epoch": 14.54208, + "grad_norm": 0.9631317257881165, + "learning_rate": 2.729291716686675e-05, + "loss": 0.5221, + "step": 11361 + }, + { + "epoch": 14.54336, + "grad_norm": 0.9969499707221985, + "learning_rate": 2.729091636654662e-05, + "loss": 0.5097, + "step": 11362 + }, + { + "epoch": 14.54464, + "grad_norm": 0.9977988004684448, + "learning_rate": 2.7288915566226492e-05, + "loss": 0.5569, + "step": 11363 + }, + { + "epoch": 14.54592, + "grad_norm": 0.90044105052948, + "learning_rate": 2.7286914765906364e-05, + "loss": 0.4707, + "step": 11364 + }, + { + "epoch": 14.5472, 
+ "grad_norm": 0.9298081398010254, + "learning_rate": 2.7284913965586232e-05, + "loss": 0.5013, + "step": 11365 + }, + { + "epoch": 14.54848, + "grad_norm": 0.987301766872406, + "learning_rate": 2.7282913165266104e-05, + "loss": 0.5666, + "step": 11366 + }, + { + "epoch": 14.54976, + "grad_norm": 0.9727098345756531, + "learning_rate": 2.7280912364945983e-05, + "loss": 0.5379, + "step": 11367 + }, + { + "epoch": 14.55104, + "grad_norm": 0.9952585101127625, + "learning_rate": 2.7278911564625855e-05, + "loss": 0.5741, + "step": 11368 + }, + { + "epoch": 14.55232, + "grad_norm": 0.9384405612945557, + "learning_rate": 2.7276910764305723e-05, + "loss": 0.502, + "step": 11369 + }, + { + "epoch": 14.5536, + "grad_norm": 0.9742892980575562, + "learning_rate": 2.7274909963985595e-05, + "loss": 0.5407, + "step": 11370 + }, + { + "epoch": 14.55488, + "grad_norm": 0.9823377132415771, + "learning_rate": 2.7272909163665467e-05, + "loss": 0.5218, + "step": 11371 + }, + { + "epoch": 14.55616, + "grad_norm": 0.9821147322654724, + "learning_rate": 2.727090836334534e-05, + "loss": 0.5134, + "step": 11372 + }, + { + "epoch": 14.55744, + "grad_norm": 1.0253291130065918, + "learning_rate": 2.7268907563025207e-05, + "loss": 0.5562, + "step": 11373 + }, + { + "epoch": 14.55872, + "grad_norm": 0.9838807582855225, + "learning_rate": 2.7266906762705086e-05, + "loss": 0.5282, + "step": 11374 + }, + { + "epoch": 14.56, + "grad_norm": 0.9515048861503601, + "learning_rate": 2.7264905962384958e-05, + "loss": 0.5034, + "step": 11375 + }, + { + "epoch": 14.56128, + "grad_norm": 0.9546962976455688, + "learning_rate": 2.726290516206483e-05, + "loss": 0.5325, + "step": 11376 + }, + { + "epoch": 14.56256, + "grad_norm": 0.9903567433357239, + "learning_rate": 2.7260904361744698e-05, + "loss": 0.5126, + "step": 11377 + }, + { + "epoch": 14.56384, + "grad_norm": 0.9805188775062561, + "learning_rate": 2.725890356142457e-05, + "loss": 0.513, + "step": 11378 + }, + { + "epoch": 14.56512, + "grad_norm": 
1.0438790321350098, + "learning_rate": 2.7256902761104442e-05, + "loss": 0.5515, + "step": 11379 + }, + { + "epoch": 14.5664, + "grad_norm": 0.9887431263923645, + "learning_rate": 2.7254901960784314e-05, + "loss": 0.526, + "step": 11380 + }, + { + "epoch": 14.56768, + "grad_norm": 0.9955745339393616, + "learning_rate": 2.725290116046419e-05, + "loss": 0.5261, + "step": 11381 + }, + { + "epoch": 14.56896, + "grad_norm": 0.9546988606452942, + "learning_rate": 2.725090036014406e-05, + "loss": 0.5462, + "step": 11382 + }, + { + "epoch": 14.57024, + "grad_norm": 0.966931939125061, + "learning_rate": 2.7248899559823933e-05, + "loss": 0.5326, + "step": 11383 + }, + { + "epoch": 14.57152, + "grad_norm": 0.9506143927574158, + "learning_rate": 2.7246898759503804e-05, + "loss": 0.4998, + "step": 11384 + }, + { + "epoch": 14.5728, + "grad_norm": 0.9455040097236633, + "learning_rate": 2.7244897959183673e-05, + "loss": 0.5466, + "step": 11385 + }, + { + "epoch": 14.57408, + "grad_norm": 0.9447420239448547, + "learning_rate": 2.7242897158863545e-05, + "loss": 0.4939, + "step": 11386 + }, + { + "epoch": 14.57536, + "grad_norm": 0.9344480037689209, + "learning_rate": 2.7240896358543417e-05, + "loss": 0.5223, + "step": 11387 + }, + { + "epoch": 14.57664, + "grad_norm": 0.9362328052520752, + "learning_rate": 2.7238895558223292e-05, + "loss": 0.5015, + "step": 11388 + }, + { + "epoch": 14.57792, + "grad_norm": 0.9401606917381287, + "learning_rate": 2.7236894757903164e-05, + "loss": 0.4988, + "step": 11389 + }, + { + "epoch": 14.5792, + "grad_norm": 0.9665933847427368, + "learning_rate": 2.7234893957583036e-05, + "loss": 0.5126, + "step": 11390 + }, + { + "epoch": 14.58048, + "grad_norm": 0.8998576998710632, + "learning_rate": 2.7232893157262907e-05, + "loss": 0.505, + "step": 11391 + }, + { + "epoch": 14.58176, + "grad_norm": 0.9965265393257141, + "learning_rate": 2.723089235694278e-05, + "loss": 0.569, + "step": 11392 + }, + { + "epoch": 14.58304, + "grad_norm": 0.9181508421897888, + 
"learning_rate": 2.7228891556622648e-05, + "loss": 0.4929, + "step": 11393 + }, + { + "epoch": 14.58432, + "grad_norm": 0.9306789040565491, + "learning_rate": 2.722689075630252e-05, + "loss": 0.5179, + "step": 11394 + }, + { + "epoch": 14.5856, + "grad_norm": 0.998447597026825, + "learning_rate": 2.7224889955982398e-05, + "loss": 0.5673, + "step": 11395 + }, + { + "epoch": 14.58688, + "grad_norm": 0.9387089014053345, + "learning_rate": 2.7222889155662267e-05, + "loss": 0.5328, + "step": 11396 + }, + { + "epoch": 14.58816, + "grad_norm": 0.9925042986869812, + "learning_rate": 2.722088835534214e-05, + "loss": 0.5403, + "step": 11397 + }, + { + "epoch": 14.58944, + "grad_norm": 0.9419589638710022, + "learning_rate": 2.721888755502201e-05, + "loss": 0.5322, + "step": 11398 + }, + { + "epoch": 14.59072, + "grad_norm": 0.9522401094436646, + "learning_rate": 2.7216886754701882e-05, + "loss": 0.5214, + "step": 11399 + }, + { + "epoch": 14.592, + "grad_norm": 0.9598711729049683, + "learning_rate": 2.7214885954381754e-05, + "loss": 0.5013, + "step": 11400 + }, + { + "epoch": 14.59328, + "grad_norm": 0.9690408706665039, + "learning_rate": 2.7212885154061623e-05, + "loss": 0.5262, + "step": 11401 + }, + { + "epoch": 14.59456, + "grad_norm": 0.9637435078620911, + "learning_rate": 2.72108843537415e-05, + "loss": 0.5114, + "step": 11402 + }, + { + "epoch": 14.59584, + "grad_norm": 0.9893304109573364, + "learning_rate": 2.7208883553421373e-05, + "loss": 0.5414, + "step": 11403 + }, + { + "epoch": 14.59712, + "grad_norm": 1.0310964584350586, + "learning_rate": 2.720688275310124e-05, + "loss": 0.5245, + "step": 11404 + }, + { + "epoch": 14.5984, + "grad_norm": 0.9509328007698059, + "learning_rate": 2.7204881952781113e-05, + "loss": 0.5463, + "step": 11405 + }, + { + "epoch": 14.59968, + "grad_norm": 0.9694564938545227, + "learning_rate": 2.7202881152460985e-05, + "loss": 0.5419, + "step": 11406 + }, + { + "epoch": 14.60096, + "grad_norm": 0.970043420791626, + "learning_rate": 
2.7200880352140857e-05, + "loss": 0.5445, + "step": 11407 + }, + { + "epoch": 14.60224, + "grad_norm": 0.9511335492134094, + "learning_rate": 2.719887955182073e-05, + "loss": 0.5206, + "step": 11408 + }, + { + "epoch": 14.60352, + "grad_norm": 1.030502200126648, + "learning_rate": 2.7196878751500604e-05, + "loss": 0.6213, + "step": 11409 + }, + { + "epoch": 14.604800000000001, + "grad_norm": 1.0002635717391968, + "learning_rate": 2.7194877951180476e-05, + "loss": 0.5467, + "step": 11410 + }, + { + "epoch": 14.60608, + "grad_norm": 0.9449097514152527, + "learning_rate": 2.7192877150860348e-05, + "loss": 0.5446, + "step": 11411 + }, + { + "epoch": 14.60736, + "grad_norm": 0.9701420664787292, + "learning_rate": 2.7190876350540216e-05, + "loss": 0.5615, + "step": 11412 + }, + { + "epoch": 14.60864, + "grad_norm": 0.9248257279396057, + "learning_rate": 2.7188875550220088e-05, + "loss": 0.5041, + "step": 11413 + }, + { + "epoch": 14.60992, + "grad_norm": 0.9535519480705261, + "learning_rate": 2.718687474989996e-05, + "loss": 0.5423, + "step": 11414 + }, + { + "epoch": 14.6112, + "grad_norm": 0.9663887619972229, + "learning_rate": 2.7184873949579832e-05, + "loss": 0.5678, + "step": 11415 + }, + { + "epoch": 14.61248, + "grad_norm": 1.0246509313583374, + "learning_rate": 2.7182873149259707e-05, + "loss": 0.6351, + "step": 11416 + }, + { + "epoch": 14.61376, + "grad_norm": 0.9978649616241455, + "learning_rate": 2.718087234893958e-05, + "loss": 0.5342, + "step": 11417 + }, + { + "epoch": 14.61504, + "grad_norm": 0.9514932036399841, + "learning_rate": 2.717887154861945e-05, + "loss": 0.4586, + "step": 11418 + }, + { + "epoch": 14.61632, + "grad_norm": 1.0197707414627075, + "learning_rate": 2.7176870748299323e-05, + "loss": 0.5932, + "step": 11419 + }, + { + "epoch": 14.6176, + "grad_norm": 1.017253041267395, + "learning_rate": 2.717486994797919e-05, + "loss": 0.5063, + "step": 11420 + }, + { + "epoch": 14.61888, + "grad_norm": 0.9654613137245178, + "learning_rate": 
2.7172869147659063e-05, + "loss": 0.5249, + "step": 11421 + }, + { + "epoch": 14.62016, + "grad_norm": 0.9739159345626831, + "learning_rate": 2.7170868347338935e-05, + "loss": 0.5241, + "step": 11422 + }, + { + "epoch": 14.62144, + "grad_norm": 0.9443714022636414, + "learning_rate": 2.716886754701881e-05, + "loss": 0.5465, + "step": 11423 + }, + { + "epoch": 14.62272, + "grad_norm": 1.0117498636245728, + "learning_rate": 2.7166866746698682e-05, + "loss": 0.5488, + "step": 11424 + }, + { + "epoch": 14.624, + "grad_norm": 0.9801613092422485, + "learning_rate": 2.7164865946378554e-05, + "loss": 0.5194, + "step": 11425 + }, + { + "epoch": 14.62528, + "grad_norm": 0.9613388180732727, + "learning_rate": 2.7162865146058426e-05, + "loss": 0.5651, + "step": 11426 + }, + { + "epoch": 14.62656, + "grad_norm": 0.9645081758499146, + "learning_rate": 2.7160864345738298e-05, + "loss": 0.5064, + "step": 11427 + }, + { + "epoch": 14.627839999999999, + "grad_norm": 0.9990333318710327, + "learning_rate": 2.7158863545418166e-05, + "loss": 0.5582, + "step": 11428 + }, + { + "epoch": 14.62912, + "grad_norm": 0.9323675036430359, + "learning_rate": 2.7156862745098038e-05, + "loss": 0.5146, + "step": 11429 + }, + { + "epoch": 14.6304, + "grad_norm": 0.9680500030517578, + "learning_rate": 2.7154861944777917e-05, + "loss": 0.4641, + "step": 11430 + }, + { + "epoch": 14.63168, + "grad_norm": 0.9934640526771545, + "learning_rate": 2.7152861144457785e-05, + "loss": 0.5237, + "step": 11431 + }, + { + "epoch": 14.63296, + "grad_norm": 0.9925525188446045, + "learning_rate": 2.7150860344137657e-05, + "loss": 0.526, + "step": 11432 + }, + { + "epoch": 14.63424, + "grad_norm": 0.9799636602401733, + "learning_rate": 2.714885954381753e-05, + "loss": 0.5483, + "step": 11433 + }, + { + "epoch": 14.63552, + "grad_norm": 0.945448637008667, + "learning_rate": 2.71468587434974e-05, + "loss": 0.5187, + "step": 11434 + }, + { + "epoch": 14.636800000000001, + "grad_norm": 0.9392439126968384, + "learning_rate": 
2.7144857943177272e-05, + "loss": 0.5117, + "step": 11435 + }, + { + "epoch": 14.63808, + "grad_norm": 0.9761615991592407, + "learning_rate": 2.714285714285714e-05, + "loss": 0.5538, + "step": 11436 + }, + { + "epoch": 14.63936, + "grad_norm": 0.9610094428062439, + "learning_rate": 2.714085634253702e-05, + "loss": 0.5368, + "step": 11437 + }, + { + "epoch": 14.64064, + "grad_norm": 0.9939576387405396, + "learning_rate": 2.713885554221689e-05, + "loss": 0.6027, + "step": 11438 + }, + { + "epoch": 14.64192, + "grad_norm": 0.9574117064476013, + "learning_rate": 2.713685474189676e-05, + "loss": 0.5258, + "step": 11439 + }, + { + "epoch": 14.6432, + "grad_norm": 0.9553491473197937, + "learning_rate": 2.7134853941576632e-05, + "loss": 0.5076, + "step": 11440 + }, + { + "epoch": 14.64448, + "grad_norm": 0.9836270809173584, + "learning_rate": 2.7132853141256504e-05, + "loss": 0.5249, + "step": 11441 + }, + { + "epoch": 14.64576, + "grad_norm": 0.9523620009422302, + "learning_rate": 2.7130852340936375e-05, + "loss": 0.5237, + "step": 11442 + }, + { + "epoch": 14.64704, + "grad_norm": 0.9464503526687622, + "learning_rate": 2.7128851540616247e-05, + "loss": 0.5415, + "step": 11443 + }, + { + "epoch": 14.64832, + "grad_norm": 0.9364238381385803, + "learning_rate": 2.7126850740296123e-05, + "loss": 0.5156, + "step": 11444 + }, + { + "epoch": 14.6496, + "grad_norm": 0.943153440952301, + "learning_rate": 2.7124849939975994e-05, + "loss": 0.516, + "step": 11445 + }, + { + "epoch": 14.65088, + "grad_norm": 0.9400473833084106, + "learning_rate": 2.7122849139655866e-05, + "loss": 0.5315, + "step": 11446 + }, + { + "epoch": 14.65216, + "grad_norm": 0.9996716380119324, + "learning_rate": 2.7120848339335735e-05, + "loss": 0.5381, + "step": 11447 + }, + { + "epoch": 14.65344, + "grad_norm": 0.9744866490364075, + "learning_rate": 2.7118847539015607e-05, + "loss": 0.5502, + "step": 11448 + }, + { + "epoch": 14.65472, + "grad_norm": 0.9782332181930542, + "learning_rate": 
2.711684673869548e-05, + "loss": 0.5651, + "step": 11449 + }, + { + "epoch": 14.656, + "grad_norm": 1.014737844467163, + "learning_rate": 2.711484593837535e-05, + "loss": 0.5673, + "step": 11450 + }, + { + "epoch": 14.65728, + "grad_norm": 0.9900893568992615, + "learning_rate": 2.7112845138055222e-05, + "loss": 0.5413, + "step": 11451 + }, + { + "epoch": 14.65856, + "grad_norm": 0.9428865909576416, + "learning_rate": 2.7110844337735097e-05, + "loss": 0.5564, + "step": 11452 + }, + { + "epoch": 14.659839999999999, + "grad_norm": 0.973512589931488, + "learning_rate": 2.710884353741497e-05, + "loss": 0.5656, + "step": 11453 + }, + { + "epoch": 14.66112, + "grad_norm": 0.9764522910118103, + "learning_rate": 2.710684273709484e-05, + "loss": 0.5316, + "step": 11454 + }, + { + "epoch": 14.6624, + "grad_norm": 0.962592363357544, + "learning_rate": 2.710484193677471e-05, + "loss": 0.5184, + "step": 11455 + }, + { + "epoch": 14.66368, + "grad_norm": 0.9462336301803589, + "learning_rate": 2.710284113645458e-05, + "loss": 0.5146, + "step": 11456 + }, + { + "epoch": 14.66496, + "grad_norm": 0.9827730655670166, + "learning_rate": 2.7100840336134453e-05, + "loss": 0.5192, + "step": 11457 + }, + { + "epoch": 14.66624, + "grad_norm": 1.0178593397140503, + "learning_rate": 2.7098839535814325e-05, + "loss": 0.5204, + "step": 11458 + }, + { + "epoch": 14.66752, + "grad_norm": 0.9616353511810303, + "learning_rate": 2.70968387354942e-05, + "loss": 0.5118, + "step": 11459 + }, + { + "epoch": 14.6688, + "grad_norm": 0.9945552349090576, + "learning_rate": 2.7094837935174072e-05, + "loss": 0.5167, + "step": 11460 + }, + { + "epoch": 14.67008, + "grad_norm": 0.967586100101471, + "learning_rate": 2.7092837134853944e-05, + "loss": 0.5346, + "step": 11461 + }, + { + "epoch": 14.67136, + "grad_norm": 0.9664031863212585, + "learning_rate": 2.7090836334533816e-05, + "loss": 0.5105, + "step": 11462 + }, + { + "epoch": 14.67264, + "grad_norm": 0.9893897771835327, + "learning_rate": 
2.7088835534213684e-05, + "loss": 0.5307, + "step": 11463 + }, + { + "epoch": 14.67392, + "grad_norm": 1.0116097927093506, + "learning_rate": 2.7086834733893556e-05, + "loss": 0.4891, + "step": 11464 + }, + { + "epoch": 14.6752, + "grad_norm": 0.9823921322822571, + "learning_rate": 2.7084833933573428e-05, + "loss": 0.542, + "step": 11465 + }, + { + "epoch": 14.67648, + "grad_norm": 1.0313193798065186, + "learning_rate": 2.7082833133253303e-05, + "loss": 0.5909, + "step": 11466 + }, + { + "epoch": 14.67776, + "grad_norm": 0.9933533072471619, + "learning_rate": 2.7080832332933175e-05, + "loss": 0.5606, + "step": 11467 + }, + { + "epoch": 14.67904, + "grad_norm": 0.9638269543647766, + "learning_rate": 2.7078831532613047e-05, + "loss": 0.5142, + "step": 11468 + }, + { + "epoch": 14.68032, + "grad_norm": 1.006666660308838, + "learning_rate": 2.707683073229292e-05, + "loss": 0.5522, + "step": 11469 + }, + { + "epoch": 14.6816, + "grad_norm": 0.9415097832679749, + "learning_rate": 2.707482993197279e-05, + "loss": 0.5229, + "step": 11470 + }, + { + "epoch": 14.68288, + "grad_norm": 0.9513869881629944, + "learning_rate": 2.707282913165266e-05, + "loss": 0.5032, + "step": 11471 + }, + { + "epoch": 14.68416, + "grad_norm": 1.0023972988128662, + "learning_rate": 2.707082833133253e-05, + "loss": 0.5571, + "step": 11472 + }, + { + "epoch": 14.68544, + "grad_norm": 0.9924998879432678, + "learning_rate": 2.706882753101241e-05, + "loss": 0.5413, + "step": 11473 + }, + { + "epoch": 14.68672, + "grad_norm": 0.9861060380935669, + "learning_rate": 2.7066826730692278e-05, + "loss": 0.5158, + "step": 11474 + }, + { + "epoch": 14.688, + "grad_norm": 0.964842677116394, + "learning_rate": 2.706482593037215e-05, + "loss": 0.5428, + "step": 11475 + }, + { + "epoch": 14.68928, + "grad_norm": 0.9527415633201599, + "learning_rate": 2.7062825130052022e-05, + "loss": 0.5302, + "step": 11476 + }, + { + "epoch": 14.69056, + "grad_norm": 0.9425960779190063, + "learning_rate": 2.7060824329731894e-05, 
+ "loss": 0.5503, + "step": 11477 + }, + { + "epoch": 14.69184, + "grad_norm": 0.9891713261604309, + "learning_rate": 2.7058823529411766e-05, + "loss": 0.5782, + "step": 11478 + }, + { + "epoch": 14.69312, + "grad_norm": 0.9919348359107971, + "learning_rate": 2.7056822729091634e-05, + "loss": 0.5411, + "step": 11479 + }, + { + "epoch": 14.6944, + "grad_norm": 0.9964098334312439, + "learning_rate": 2.7054821928771513e-05, + "loss": 0.5517, + "step": 11480 + }, + { + "epoch": 14.69568, + "grad_norm": 0.9663441181182861, + "learning_rate": 2.7052821128451385e-05, + "loss": 0.5354, + "step": 11481 + }, + { + "epoch": 14.69696, + "grad_norm": 1.0156707763671875, + "learning_rate": 2.7050820328131253e-05, + "loss": 0.5209, + "step": 11482 + }, + { + "epoch": 14.69824, + "grad_norm": 0.9387539625167847, + "learning_rate": 2.7048819527811125e-05, + "loss": 0.4976, + "step": 11483 + }, + { + "epoch": 14.69952, + "grad_norm": 0.9886208772659302, + "learning_rate": 2.7046818727490997e-05, + "loss": 0.5635, + "step": 11484 + }, + { + "epoch": 14.7008, + "grad_norm": 0.9804397225379944, + "learning_rate": 2.704481792717087e-05, + "loss": 0.5384, + "step": 11485 + }, + { + "epoch": 14.70208, + "grad_norm": 0.944187581539154, + "learning_rate": 2.704281712685074e-05, + "loss": 0.5429, + "step": 11486 + }, + { + "epoch": 14.70336, + "grad_norm": 0.9738987684249878, + "learning_rate": 2.7040816326530616e-05, + "loss": 0.5619, + "step": 11487 + }, + { + "epoch": 14.70464, + "grad_norm": 0.9703164100646973, + "learning_rate": 2.7038815526210488e-05, + "loss": 0.546, + "step": 11488 + }, + { + "epoch": 14.70592, + "grad_norm": 0.9917868375778198, + "learning_rate": 2.703681472589036e-05, + "loss": 0.558, + "step": 11489 + }, + { + "epoch": 14.7072, + "grad_norm": 0.9812465906143188, + "learning_rate": 2.7034813925570228e-05, + "loss": 0.5587, + "step": 11490 + }, + { + "epoch": 14.70848, + "grad_norm": 0.9691387414932251, + "learning_rate": 2.70328131252501e-05, + "loss": 0.5575, + 
"step": 11491 + }, + { + "epoch": 14.70976, + "grad_norm": 0.912678062915802, + "learning_rate": 2.7030812324929972e-05, + "loss": 0.4773, + "step": 11492 + }, + { + "epoch": 14.71104, + "grad_norm": 0.9488722085952759, + "learning_rate": 2.7028811524609844e-05, + "loss": 0.5633, + "step": 11493 + }, + { + "epoch": 14.71232, + "grad_norm": 0.958460807800293, + "learning_rate": 2.702681072428972e-05, + "loss": 0.509, + "step": 11494 + }, + { + "epoch": 14.7136, + "grad_norm": 0.9496520757675171, + "learning_rate": 2.702480992396959e-05, + "loss": 0.5236, + "step": 11495 + }, + { + "epoch": 14.71488, + "grad_norm": 0.9681755900382996, + "learning_rate": 2.7022809123649463e-05, + "loss": 0.5415, + "step": 11496 + }, + { + "epoch": 14.71616, + "grad_norm": 1.0062313079833984, + "learning_rate": 2.7020808323329334e-05, + "loss": 0.5432, + "step": 11497 + }, + { + "epoch": 14.71744, + "grad_norm": 0.962822675704956, + "learning_rate": 2.7018807523009203e-05, + "loss": 0.5481, + "step": 11498 + }, + { + "epoch": 14.71872, + "grad_norm": 1.0065944194793701, + "learning_rate": 2.7016806722689075e-05, + "loss": 0.5707, + "step": 11499 + }, + { + "epoch": 14.72, + "grad_norm": 0.9401991367340088, + "learning_rate": 2.7014805922368947e-05, + "loss": 0.5246, + "step": 11500 + }, + { + "epoch": 14.72128, + "grad_norm": 0.8997707366943359, + "learning_rate": 2.7012805122048822e-05, + "loss": 0.4671, + "step": 11501 + }, + { + "epoch": 14.72256, + "grad_norm": 0.9574297070503235, + "learning_rate": 2.7010804321728694e-05, + "loss": 0.5616, + "step": 11502 + }, + { + "epoch": 14.72384, + "grad_norm": 0.9319608807563782, + "learning_rate": 2.7008803521408566e-05, + "loss": 0.5094, + "step": 11503 + }, + { + "epoch": 14.72512, + "grad_norm": 0.9995977282524109, + "learning_rate": 2.7006802721088437e-05, + "loss": 0.5743, + "step": 11504 + }, + { + "epoch": 14.7264, + "grad_norm": 0.9879163503646851, + "learning_rate": 2.700480192076831e-05, + "loss": 0.5305, + "step": 11505 + }, + { 
+ "epoch": 14.72768, + "grad_norm": 0.9732846021652222, + "learning_rate": 2.7002801120448178e-05, + "loss": 0.5276, + "step": 11506 + }, + { + "epoch": 14.72896, + "grad_norm": 0.9981286525726318, + "learning_rate": 2.700080032012805e-05, + "loss": 0.5467, + "step": 11507 + }, + { + "epoch": 14.73024, + "grad_norm": 1.0308117866516113, + "learning_rate": 2.6998799519807928e-05, + "loss": 0.5996, + "step": 11508 + }, + { + "epoch": 14.73152, + "grad_norm": 0.9829885363578796, + "learning_rate": 2.6996798719487797e-05, + "loss": 0.529, + "step": 11509 + }, + { + "epoch": 14.7328, + "grad_norm": 0.9994106292724609, + "learning_rate": 2.699479791916767e-05, + "loss": 0.5486, + "step": 11510 + }, + { + "epoch": 14.73408, + "grad_norm": 0.9333558082580566, + "learning_rate": 2.699279711884754e-05, + "loss": 0.4884, + "step": 11511 + }, + { + "epoch": 14.73536, + "grad_norm": 0.9170252680778503, + "learning_rate": 2.6990796318527412e-05, + "loss": 0.5259, + "step": 11512 + }, + { + "epoch": 14.73664, + "grad_norm": 0.9396823644638062, + "learning_rate": 2.6988795518207284e-05, + "loss": 0.5302, + "step": 11513 + }, + { + "epoch": 14.73792, + "grad_norm": 0.9388831853866577, + "learning_rate": 2.6986794717887153e-05, + "loss": 0.4803, + "step": 11514 + }, + { + "epoch": 14.7392, + "grad_norm": 1.0120586156845093, + "learning_rate": 2.698479391756703e-05, + "loss": 0.581, + "step": 11515 + }, + { + "epoch": 14.74048, + "grad_norm": 1.0425275564193726, + "learning_rate": 2.6982793117246903e-05, + "loss": 0.5611, + "step": 11516 + }, + { + "epoch": 14.74176, + "grad_norm": 1.0189977884292603, + "learning_rate": 2.698079231692677e-05, + "loss": 0.566, + "step": 11517 + }, + { + "epoch": 14.74304, + "grad_norm": 0.9831307530403137, + "learning_rate": 2.6978791516606643e-05, + "loss": 0.5258, + "step": 11518 + }, + { + "epoch": 14.74432, + "grad_norm": 0.9544398188591003, + "learning_rate": 2.6976790716286515e-05, + "loss": 0.5207, + "step": 11519 + }, + { + "epoch": 14.7456, + 
"grad_norm": 0.9739946722984314, + "learning_rate": 2.6974789915966387e-05, + "loss": 0.5362, + "step": 11520 + }, + { + "epoch": 14.74688, + "grad_norm": 1.0266259908676147, + "learning_rate": 2.697278911564626e-05, + "loss": 0.5669, + "step": 11521 + }, + { + "epoch": 14.74816, + "grad_norm": 0.9911479949951172, + "learning_rate": 2.6970788315326134e-05, + "loss": 0.529, + "step": 11522 + }, + { + "epoch": 14.74944, + "grad_norm": 0.978696882724762, + "learning_rate": 2.6968787515006006e-05, + "loss": 0.5497, + "step": 11523 + }, + { + "epoch": 14.75072, + "grad_norm": 0.9742518663406372, + "learning_rate": 2.6966786714685878e-05, + "loss": 0.4836, + "step": 11524 + }, + { + "epoch": 14.752, + "grad_norm": 0.9718377590179443, + "learning_rate": 2.6964785914365746e-05, + "loss": 0.513, + "step": 11525 + }, + { + "epoch": 14.75328, + "grad_norm": 1.0205025672912598, + "learning_rate": 2.6962785114045618e-05, + "loss": 0.582, + "step": 11526 + }, + { + "epoch": 14.75456, + "grad_norm": 0.9338187575340271, + "learning_rate": 2.696078431372549e-05, + "loss": 0.4745, + "step": 11527 + }, + { + "epoch": 14.75584, + "grad_norm": 0.9623693823814392, + "learning_rate": 2.6958783513405362e-05, + "loss": 0.5462, + "step": 11528 + }, + { + "epoch": 14.75712, + "grad_norm": 0.9271833896636963, + "learning_rate": 2.6956782713085237e-05, + "loss": 0.4699, + "step": 11529 + }, + { + "epoch": 14.7584, + "grad_norm": 1.0047674179077148, + "learning_rate": 2.695478191276511e-05, + "loss": 0.5474, + "step": 11530 + }, + { + "epoch": 14.75968, + "grad_norm": 0.9866589903831482, + "learning_rate": 2.695278111244498e-05, + "loss": 0.5667, + "step": 11531 + }, + { + "epoch": 14.76096, + "grad_norm": 0.9769319891929626, + "learning_rate": 2.6950780312124853e-05, + "loss": 0.5532, + "step": 11532 + }, + { + "epoch": 14.76224, + "grad_norm": 0.9511915445327759, + "learning_rate": 2.694877951180472e-05, + "loss": 0.522, + "step": 11533 + }, + { + "epoch": 14.76352, + "grad_norm": 
0.9802474975585938, + "learning_rate": 2.6946778711484593e-05, + "loss": 0.5575, + "step": 11534 + }, + { + "epoch": 14.7648, + "grad_norm": 0.8974894285202026, + "learning_rate": 2.6944777911164465e-05, + "loss": 0.4573, + "step": 11535 + }, + { + "epoch": 14.76608, + "grad_norm": 0.9369537830352783, + "learning_rate": 2.694277711084434e-05, + "loss": 0.5447, + "step": 11536 + }, + { + "epoch": 14.76736, + "grad_norm": 0.9481881260871887, + "learning_rate": 2.6940776310524212e-05, + "loss": 0.5191, + "step": 11537 + }, + { + "epoch": 14.76864, + "grad_norm": 0.9813864231109619, + "learning_rate": 2.6938775510204084e-05, + "loss": 0.5228, + "step": 11538 + }, + { + "epoch": 14.76992, + "grad_norm": 0.9654099345207214, + "learning_rate": 2.6936774709883956e-05, + "loss": 0.4354, + "step": 11539 + }, + { + "epoch": 14.7712, + "grad_norm": 1.015555739402771, + "learning_rate": 2.6934773909563828e-05, + "loss": 0.5356, + "step": 11540 + }, + { + "epoch": 14.77248, + "grad_norm": 0.9228314161300659, + "learning_rate": 2.6932773109243696e-05, + "loss": 0.497, + "step": 11541 + }, + { + "epoch": 14.77376, + "grad_norm": 0.9956296682357788, + "learning_rate": 2.6930772308923568e-05, + "loss": 0.5217, + "step": 11542 + }, + { + "epoch": 14.77504, + "grad_norm": 1.0242211818695068, + "learning_rate": 2.6928771508603447e-05, + "loss": 0.4918, + "step": 11543 + }, + { + "epoch": 14.77632, + "grad_norm": 1.0041379928588867, + "learning_rate": 2.6926770708283315e-05, + "loss": 0.5375, + "step": 11544 + }, + { + "epoch": 14.7776, + "grad_norm": 0.9402958750724792, + "learning_rate": 2.6924769907963187e-05, + "loss": 0.4983, + "step": 11545 + }, + { + "epoch": 14.778880000000001, + "grad_norm": 0.9316099882125854, + "learning_rate": 2.692276910764306e-05, + "loss": 0.4897, + "step": 11546 + }, + { + "epoch": 14.78016, + "grad_norm": 1.0062158107757568, + "learning_rate": 2.692076830732293e-05, + "loss": 0.6163, + "step": 11547 + }, + { + "epoch": 14.78144, + "grad_norm": 
0.93266361951828, + "learning_rate": 2.6918767507002802e-05, + "loss": 0.5614, + "step": 11548 + }, + { + "epoch": 14.78272, + "grad_norm": 0.9960542321205139, + "learning_rate": 2.691676670668267e-05, + "loss": 0.556, + "step": 11549 + }, + { + "epoch": 14.784, + "grad_norm": 0.9723289608955383, + "learning_rate": 2.691476590636255e-05, + "loss": 0.5528, + "step": 11550 + }, + { + "epoch": 14.78528, + "grad_norm": 1.003118872642517, + "learning_rate": 2.691276510604242e-05, + "loss": 0.5594, + "step": 11551 + }, + { + "epoch": 14.78656, + "grad_norm": 0.8769537806510925, + "learning_rate": 2.691076430572229e-05, + "loss": 0.4636, + "step": 11552 + }, + { + "epoch": 14.78784, + "grad_norm": 0.929743766784668, + "learning_rate": 2.6908763505402162e-05, + "loss": 0.497, + "step": 11553 + }, + { + "epoch": 14.78912, + "grad_norm": 0.9782609343528748, + "learning_rate": 2.6906762705082034e-05, + "loss": 0.5343, + "step": 11554 + }, + { + "epoch": 14.7904, + "grad_norm": 0.9649235010147095, + "learning_rate": 2.6904761904761905e-05, + "loss": 0.5329, + "step": 11555 + }, + { + "epoch": 14.79168, + "grad_norm": 1.0030887126922607, + "learning_rate": 2.6902761104441777e-05, + "loss": 0.5787, + "step": 11556 + }, + { + "epoch": 14.79296, + "grad_norm": 1.0339874029159546, + "learning_rate": 2.6900760304121653e-05, + "loss": 0.5822, + "step": 11557 + }, + { + "epoch": 14.79424, + "grad_norm": 1.0336726903915405, + "learning_rate": 2.6898759503801524e-05, + "loss": 0.573, + "step": 11558 + }, + { + "epoch": 14.79552, + "grad_norm": 0.9409865140914917, + "learning_rate": 2.6896758703481396e-05, + "loss": 0.5171, + "step": 11559 + }, + { + "epoch": 14.7968, + "grad_norm": 0.9395885467529297, + "learning_rate": 2.6894757903161265e-05, + "loss": 0.52, + "step": 11560 + }, + { + "epoch": 14.79808, + "grad_norm": 1.0146170854568481, + "learning_rate": 2.6892757102841137e-05, + "loss": 0.538, + "step": 11561 + }, + { + "epoch": 14.79936, + "grad_norm": 1.051053524017334, + 
"learning_rate": 2.689075630252101e-05, + "loss": 0.6081, + "step": 11562 + }, + { + "epoch": 14.80064, + "grad_norm": 0.9499222636222839, + "learning_rate": 2.688875550220088e-05, + "loss": 0.5111, + "step": 11563 + }, + { + "epoch": 14.801919999999999, + "grad_norm": 1.0103505849838257, + "learning_rate": 2.6886754701880752e-05, + "loss": 0.5686, + "step": 11564 + }, + { + "epoch": 14.8032, + "grad_norm": 1.0131018161773682, + "learning_rate": 2.6884753901560627e-05, + "loss": 0.5578, + "step": 11565 + }, + { + "epoch": 14.80448, + "grad_norm": 1.0048017501831055, + "learning_rate": 2.68827531012405e-05, + "loss": 0.5235, + "step": 11566 + }, + { + "epoch": 14.80576, + "grad_norm": 1.0479151010513306, + "learning_rate": 2.688075230092037e-05, + "loss": 0.5589, + "step": 11567 + }, + { + "epoch": 14.80704, + "grad_norm": 1.0059313774108887, + "learning_rate": 2.687875150060024e-05, + "loss": 0.5576, + "step": 11568 + }, + { + "epoch": 14.80832, + "grad_norm": 0.9548071622848511, + "learning_rate": 2.687675070028011e-05, + "loss": 0.5122, + "step": 11569 + }, + { + "epoch": 14.8096, + "grad_norm": 0.9280616641044617, + "learning_rate": 2.6874749899959983e-05, + "loss": 0.5034, + "step": 11570 + }, + { + "epoch": 14.810880000000001, + "grad_norm": 0.9565810561180115, + "learning_rate": 2.6872749099639855e-05, + "loss": 0.516, + "step": 11571 + }, + { + "epoch": 14.81216, + "grad_norm": 0.9888594150543213, + "learning_rate": 2.687074829931973e-05, + "loss": 0.5403, + "step": 11572 + }, + { + "epoch": 14.81344, + "grad_norm": 1.0492602586746216, + "learning_rate": 2.6868747498999602e-05, + "loss": 0.5664, + "step": 11573 + }, + { + "epoch": 14.81472, + "grad_norm": 0.9663681983947754, + "learning_rate": 2.6866746698679474e-05, + "loss": 0.5895, + "step": 11574 + }, + { + "epoch": 14.816, + "grad_norm": 0.9314732551574707, + "learning_rate": 2.6864745898359346e-05, + "loss": 0.4985, + "step": 11575 + }, + { + "epoch": 14.81728, + "grad_norm": 0.9417006373405457, + 
"learning_rate": 2.6862745098039214e-05, + "loss": 0.5434, + "step": 11576 + }, + { + "epoch": 14.81856, + "grad_norm": 0.968472421169281, + "learning_rate": 2.6860744297719086e-05, + "loss": 0.5591, + "step": 11577 + }, + { + "epoch": 14.81984, + "grad_norm": 0.9163793325424194, + "learning_rate": 2.6858743497398958e-05, + "loss": 0.5035, + "step": 11578 + }, + { + "epoch": 14.82112, + "grad_norm": 0.9645053744316101, + "learning_rate": 2.6856742697078833e-05, + "loss": 0.5461, + "step": 11579 + }, + { + "epoch": 14.8224, + "grad_norm": 0.9942086338996887, + "learning_rate": 2.6854741896758705e-05, + "loss": 0.5775, + "step": 11580 + }, + { + "epoch": 14.82368, + "grad_norm": 0.9701970219612122, + "learning_rate": 2.6852741096438577e-05, + "loss": 0.5237, + "step": 11581 + }, + { + "epoch": 14.82496, + "grad_norm": 0.9851134419441223, + "learning_rate": 2.685074029611845e-05, + "loss": 0.5184, + "step": 11582 + }, + { + "epoch": 14.82624, + "grad_norm": 0.8879162073135376, + "learning_rate": 2.684873949579832e-05, + "loss": 0.4711, + "step": 11583 + }, + { + "epoch": 14.82752, + "grad_norm": 0.9536734819412231, + "learning_rate": 2.684673869547819e-05, + "loss": 0.5822, + "step": 11584 + }, + { + "epoch": 14.8288, + "grad_norm": 0.9417433142662048, + "learning_rate": 2.684473789515806e-05, + "loss": 0.5429, + "step": 11585 + }, + { + "epoch": 14.83008, + "grad_norm": 0.9906011819839478, + "learning_rate": 2.684273709483794e-05, + "loss": 0.5599, + "step": 11586 + }, + { + "epoch": 14.83136, + "grad_norm": 1.0022450685501099, + "learning_rate": 2.6840736294517808e-05, + "loss": 0.565, + "step": 11587 + }, + { + "epoch": 14.83264, + "grad_norm": 0.9821887016296387, + "learning_rate": 2.683873549419768e-05, + "loss": 0.5372, + "step": 11588 + }, + { + "epoch": 14.833919999999999, + "grad_norm": 0.9912195801734924, + "learning_rate": 2.6836734693877552e-05, + "loss": 0.5902, + "step": 11589 + }, + { + "epoch": 14.8352, + "grad_norm": 0.940270721912384, + 
"learning_rate": 2.6834733893557424e-05, + "loss": 0.5177, + "step": 11590 + }, + { + "epoch": 14.83648, + "grad_norm": 0.9974461793899536, + "learning_rate": 2.6832733093237296e-05, + "loss": 0.5458, + "step": 11591 + }, + { + "epoch": 14.83776, + "grad_norm": 1.0043706893920898, + "learning_rate": 2.6830732292917164e-05, + "loss": 0.6071, + "step": 11592 + }, + { + "epoch": 14.83904, + "grad_norm": 0.982296347618103, + "learning_rate": 2.6828731492597043e-05, + "loss": 0.5237, + "step": 11593 + }, + { + "epoch": 14.84032, + "grad_norm": 1.0043891668319702, + "learning_rate": 2.6826730692276915e-05, + "loss": 0.5551, + "step": 11594 + }, + { + "epoch": 14.8416, + "grad_norm": 1.0041606426239014, + "learning_rate": 2.6824729891956783e-05, + "loss": 0.5376, + "step": 11595 + }, + { + "epoch": 14.84288, + "grad_norm": 0.952582061290741, + "learning_rate": 2.6822729091636655e-05, + "loss": 0.5425, + "step": 11596 + }, + { + "epoch": 14.84416, + "grad_norm": 0.905483603477478, + "learning_rate": 2.6820728291316527e-05, + "loss": 0.5078, + "step": 11597 + }, + { + "epoch": 14.84544, + "grad_norm": 0.9707022309303284, + "learning_rate": 2.68187274909964e-05, + "loss": 0.5593, + "step": 11598 + }, + { + "epoch": 14.84672, + "grad_norm": 0.9420384168624878, + "learning_rate": 2.681672669067627e-05, + "loss": 0.5346, + "step": 11599 + }, + { + "epoch": 14.848, + "grad_norm": 1.0081489086151123, + "learning_rate": 2.6814725890356146e-05, + "loss": 0.5217, + "step": 11600 + }, + { + "epoch": 14.84928, + "grad_norm": 1.0194686651229858, + "learning_rate": 2.6812725090036018e-05, + "loss": 0.6055, + "step": 11601 + }, + { + "epoch": 14.85056, + "grad_norm": 0.9814406037330627, + "learning_rate": 2.681072428971589e-05, + "loss": 0.5336, + "step": 11602 + }, + { + "epoch": 14.85184, + "grad_norm": 0.9600327610969543, + "learning_rate": 2.6808723489395758e-05, + "loss": 0.5383, + "step": 11603 + }, + { + "epoch": 14.85312, + "grad_norm": 0.9688712954521179, + "learning_rate": 
2.680672268907563e-05, + "loss": 0.5513, + "step": 11604 + }, + { + "epoch": 14.8544, + "grad_norm": 0.9412886500358582, + "learning_rate": 2.6804721888755502e-05, + "loss": 0.5537, + "step": 11605 + }, + { + "epoch": 14.85568, + "grad_norm": 0.9556344151496887, + "learning_rate": 2.6802721088435374e-05, + "loss": 0.5508, + "step": 11606 + }, + { + "epoch": 14.85696, + "grad_norm": 0.9847762584686279, + "learning_rate": 2.680072028811525e-05, + "loss": 0.5563, + "step": 11607 + }, + { + "epoch": 14.85824, + "grad_norm": 0.9027768969535828, + "learning_rate": 2.679871948779512e-05, + "loss": 0.5691, + "step": 11608 + }, + { + "epoch": 14.85952, + "grad_norm": 0.998230516910553, + "learning_rate": 2.6796718687474993e-05, + "loss": 0.5422, + "step": 11609 + }, + { + "epoch": 14.8608, + "grad_norm": 1.0053776502609253, + "learning_rate": 2.6794717887154864e-05, + "loss": 0.5396, + "step": 11610 + }, + { + "epoch": 14.86208, + "grad_norm": 1.0294034481048584, + "learning_rate": 2.6792717086834733e-05, + "loss": 0.5769, + "step": 11611 + }, + { + "epoch": 14.86336, + "grad_norm": 0.9711460471153259, + "learning_rate": 2.6790716286514605e-05, + "loss": 0.5277, + "step": 11612 + }, + { + "epoch": 14.86464, + "grad_norm": 0.9948602318763733, + "learning_rate": 2.6788715486194477e-05, + "loss": 0.5497, + "step": 11613 + }, + { + "epoch": 14.86592, + "grad_norm": 0.9797696471214294, + "learning_rate": 2.6786714685874352e-05, + "loss": 0.5707, + "step": 11614 + }, + { + "epoch": 14.8672, + "grad_norm": 1.006882905960083, + "learning_rate": 2.6784713885554224e-05, + "loss": 0.5857, + "step": 11615 + }, + { + "epoch": 14.86848, + "grad_norm": 0.9534305930137634, + "learning_rate": 2.6782713085234096e-05, + "loss": 0.5066, + "step": 11616 + }, + { + "epoch": 14.86976, + "grad_norm": 1.002995491027832, + "learning_rate": 2.6780712284913967e-05, + "loss": 0.5673, + "step": 11617 + }, + { + "epoch": 14.87104, + "grad_norm": 0.9653766751289368, + "learning_rate": 
2.677871148459384e-05, + "loss": 0.5393, + "step": 11618 + }, + { + "epoch": 14.87232, + "grad_norm": 0.9714248776435852, + "learning_rate": 2.6776710684273708e-05, + "loss": 0.5344, + "step": 11619 + }, + { + "epoch": 14.8736, + "grad_norm": 0.9720675349235535, + "learning_rate": 2.677470988395358e-05, + "loss": 0.5118, + "step": 11620 + }, + { + "epoch": 14.87488, + "grad_norm": 0.9434537887573242, + "learning_rate": 2.6772709083633458e-05, + "loss": 0.5116, + "step": 11621 + }, + { + "epoch": 14.87616, + "grad_norm": 0.9508132338523865, + "learning_rate": 2.6770708283313327e-05, + "loss": 0.5432, + "step": 11622 + }, + { + "epoch": 14.87744, + "grad_norm": 0.9770334362983704, + "learning_rate": 2.67687074829932e-05, + "loss": 0.5533, + "step": 11623 + }, + { + "epoch": 14.87872, + "grad_norm": 0.9566938877105713, + "learning_rate": 2.676670668267307e-05, + "loss": 0.5081, + "step": 11624 + }, + { + "epoch": 14.88, + "grad_norm": 1.0178238153457642, + "learning_rate": 2.6764705882352942e-05, + "loss": 0.5921, + "step": 11625 + }, + { + "epoch": 14.88128, + "grad_norm": 0.9234164357185364, + "learning_rate": 2.6762705082032814e-05, + "loss": 0.523, + "step": 11626 + }, + { + "epoch": 14.88256, + "grad_norm": 0.9184479713439941, + "learning_rate": 2.6760704281712683e-05, + "loss": 0.5278, + "step": 11627 + }, + { + "epoch": 14.88384, + "grad_norm": 0.9186223745346069, + "learning_rate": 2.675870348139256e-05, + "loss": 0.4659, + "step": 11628 + }, + { + "epoch": 14.88512, + "grad_norm": 0.9434010982513428, + "learning_rate": 2.6756702681072433e-05, + "loss": 0.517, + "step": 11629 + }, + { + "epoch": 14.8864, + "grad_norm": 0.9369627833366394, + "learning_rate": 2.67547018807523e-05, + "loss": 0.5197, + "step": 11630 + }, + { + "epoch": 14.88768, + "grad_norm": 0.9298577308654785, + "learning_rate": 2.6752701080432173e-05, + "loss": 0.5219, + "step": 11631 + }, + { + "epoch": 14.88896, + "grad_norm": 0.9377476572990417, + "learning_rate": 2.6750700280112045e-05, + 
"loss": 0.5519, + "step": 11632 + }, + { + "epoch": 14.89024, + "grad_norm": 0.919325590133667, + "learning_rate": 2.6748699479791917e-05, + "loss": 0.5225, + "step": 11633 + }, + { + "epoch": 14.89152, + "grad_norm": 0.9516395330429077, + "learning_rate": 2.674669867947179e-05, + "loss": 0.5508, + "step": 11634 + }, + { + "epoch": 14.8928, + "grad_norm": 0.9320101737976074, + "learning_rate": 2.6744697879151664e-05, + "loss": 0.5241, + "step": 11635 + }, + { + "epoch": 14.89408, + "grad_norm": 1.018315315246582, + "learning_rate": 2.6742697078831536e-05, + "loss": 0.5826, + "step": 11636 + }, + { + "epoch": 14.89536, + "grad_norm": 0.973010241985321, + "learning_rate": 2.6740696278511408e-05, + "loss": 0.5401, + "step": 11637 + }, + { + "epoch": 14.89664, + "grad_norm": 0.9724129438400269, + "learning_rate": 2.6738695478191276e-05, + "loss": 0.5432, + "step": 11638 + }, + { + "epoch": 14.89792, + "grad_norm": 0.9453209042549133, + "learning_rate": 2.6736694677871148e-05, + "loss": 0.5506, + "step": 11639 + }, + { + "epoch": 14.8992, + "grad_norm": 1.0045145750045776, + "learning_rate": 2.673469387755102e-05, + "loss": 0.5437, + "step": 11640 + }, + { + "epoch": 14.90048, + "grad_norm": 1.0044034719467163, + "learning_rate": 2.6732693077230892e-05, + "loss": 0.5566, + "step": 11641 + }, + { + "epoch": 14.90176, + "grad_norm": 0.9657561182975769, + "learning_rate": 2.6730692276910767e-05, + "loss": 0.5102, + "step": 11642 + }, + { + "epoch": 14.90304, + "grad_norm": 0.9859712719917297, + "learning_rate": 2.672869147659064e-05, + "loss": 0.5598, + "step": 11643 + }, + { + "epoch": 14.90432, + "grad_norm": 0.9568543434143066, + "learning_rate": 2.672669067627051e-05, + "loss": 0.5439, + "step": 11644 + }, + { + "epoch": 14.9056, + "grad_norm": 1.0132269859313965, + "learning_rate": 2.6724689875950383e-05, + "loss": 0.5572, + "step": 11645 + }, + { + "epoch": 14.90688, + "grad_norm": 0.9919137358665466, + "learning_rate": 2.672268907563025e-05, + "loss": 0.5823, + 
"step": 11646 + }, + { + "epoch": 14.90816, + "grad_norm": 0.9762164354324341, + "learning_rate": 2.6720688275310123e-05, + "loss": 0.5324, + "step": 11647 + }, + { + "epoch": 14.90944, + "grad_norm": 0.9875562191009521, + "learning_rate": 2.6718687474989995e-05, + "loss": 0.5269, + "step": 11648 + }, + { + "epoch": 14.91072, + "grad_norm": 0.9606843590736389, + "learning_rate": 2.671668667466987e-05, + "loss": 0.5141, + "step": 11649 + }, + { + "epoch": 14.912, + "grad_norm": 0.979857861995697, + "learning_rate": 2.6714685874349742e-05, + "loss": 0.5595, + "step": 11650 + }, + { + "epoch": 14.91328, + "grad_norm": 0.9310894012451172, + "learning_rate": 2.6712685074029614e-05, + "loss": 0.5369, + "step": 11651 + }, + { + "epoch": 14.91456, + "grad_norm": 0.9621683955192566, + "learning_rate": 2.6710684273709486e-05, + "loss": 0.5379, + "step": 11652 + }, + { + "epoch": 14.91584, + "grad_norm": 0.9423104524612427, + "learning_rate": 2.6708683473389358e-05, + "loss": 0.5239, + "step": 11653 + }, + { + "epoch": 14.91712, + "grad_norm": 0.9782088398933411, + "learning_rate": 2.6706682673069226e-05, + "loss": 0.5157, + "step": 11654 + }, + { + "epoch": 14.9184, + "grad_norm": 1.0111697912216187, + "learning_rate": 2.6704681872749098e-05, + "loss": 0.5385, + "step": 11655 + }, + { + "epoch": 14.91968, + "grad_norm": 0.9448067545890808, + "learning_rate": 2.6702681072428977e-05, + "loss": 0.5221, + "step": 11656 + }, + { + "epoch": 14.920960000000001, + "grad_norm": 0.9532007575035095, + "learning_rate": 2.6700680272108845e-05, + "loss": 0.5573, + "step": 11657 + }, + { + "epoch": 14.92224, + "grad_norm": 0.9769750237464905, + "learning_rate": 2.6698679471788717e-05, + "loss": 0.5437, + "step": 11658 + }, + { + "epoch": 14.92352, + "grad_norm": 0.9078409671783447, + "learning_rate": 2.669667867146859e-05, + "loss": 0.4773, + "step": 11659 + }, + { + "epoch": 14.9248, + "grad_norm": 0.9596759676933289, + "learning_rate": 2.669467787114846e-05, + "loss": 0.5117, + "step": 
11660 + }, + { + "epoch": 14.92608, + "grad_norm": 0.9927992820739746, + "learning_rate": 2.6692677070828332e-05, + "loss": 0.5405, + "step": 11661 + }, + { + "epoch": 14.92736, + "grad_norm": 0.9593184590339661, + "learning_rate": 2.66906762705082e-05, + "loss": 0.4832, + "step": 11662 + }, + { + "epoch": 14.92864, + "grad_norm": 0.9789624214172363, + "learning_rate": 2.668867547018808e-05, + "loss": 0.5316, + "step": 11663 + }, + { + "epoch": 14.92992, + "grad_norm": 0.9484257698059082, + "learning_rate": 2.668667466986795e-05, + "loss": 0.5088, + "step": 11664 + }, + { + "epoch": 14.9312, + "grad_norm": 0.987421989440918, + "learning_rate": 2.668467386954782e-05, + "loss": 0.5787, + "step": 11665 + }, + { + "epoch": 14.93248, + "grad_norm": 0.973088264465332, + "learning_rate": 2.6682673069227692e-05, + "loss": 0.5346, + "step": 11666 + }, + { + "epoch": 14.93376, + "grad_norm": 0.985235333442688, + "learning_rate": 2.6680672268907564e-05, + "loss": 0.5391, + "step": 11667 + }, + { + "epoch": 14.93504, + "grad_norm": 1.0299049615859985, + "learning_rate": 2.6678671468587435e-05, + "loss": 0.5586, + "step": 11668 + }, + { + "epoch": 14.93632, + "grad_norm": 0.9780097603797913, + "learning_rate": 2.6676670668267307e-05, + "loss": 0.5194, + "step": 11669 + }, + { + "epoch": 14.9376, + "grad_norm": 1.037703037261963, + "learning_rate": 2.6674669867947183e-05, + "loss": 0.5629, + "step": 11670 + }, + { + "epoch": 14.93888, + "grad_norm": 0.9950383305549622, + "learning_rate": 2.6672669067627054e-05, + "loss": 0.5533, + "step": 11671 + }, + { + "epoch": 14.94016, + "grad_norm": 0.9457634091377258, + "learning_rate": 2.6670668267306926e-05, + "loss": 0.5002, + "step": 11672 + }, + { + "epoch": 14.94144, + "grad_norm": 1.0216537714004517, + "learning_rate": 2.6668667466986795e-05, + "loss": 0.5612, + "step": 11673 + }, + { + "epoch": 14.94272, + "grad_norm": 0.9794032573699951, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.5796, + "step": 11674 + }, + { + 
"epoch": 14.943999999999999, + "grad_norm": 0.965833306312561, + "learning_rate": 2.666466586634654e-05, + "loss": 0.5874, + "step": 11675 + }, + { + "epoch": 14.94528, + "grad_norm": 0.9718341827392578, + "learning_rate": 2.666266506602641e-05, + "loss": 0.5002, + "step": 11676 + }, + { + "epoch": 14.94656, + "grad_norm": 0.9411438703536987, + "learning_rate": 2.6660664265706282e-05, + "loss": 0.5241, + "step": 11677 + }, + { + "epoch": 14.94784, + "grad_norm": 0.9548771381378174, + "learning_rate": 2.6658663465386157e-05, + "loss": 0.5152, + "step": 11678 + }, + { + "epoch": 14.94912, + "grad_norm": 0.9769624471664429, + "learning_rate": 2.665666266506603e-05, + "loss": 0.5417, + "step": 11679 + }, + { + "epoch": 14.9504, + "grad_norm": 0.9090695977210999, + "learning_rate": 2.66546618647459e-05, + "loss": 0.5338, + "step": 11680 + }, + { + "epoch": 14.95168, + "grad_norm": 0.9659811854362488, + "learning_rate": 2.665266106442577e-05, + "loss": 0.5788, + "step": 11681 + }, + { + "epoch": 14.952960000000001, + "grad_norm": 0.94170081615448, + "learning_rate": 2.665066026410564e-05, + "loss": 0.5491, + "step": 11682 + }, + { + "epoch": 14.95424, + "grad_norm": 0.9707318544387817, + "learning_rate": 2.6648659463785513e-05, + "loss": 0.5736, + "step": 11683 + }, + { + "epoch": 14.95552, + "grad_norm": 0.9650655388832092, + "learning_rate": 2.6646658663465385e-05, + "loss": 0.5422, + "step": 11684 + }, + { + "epoch": 14.9568, + "grad_norm": 0.9502249360084534, + "learning_rate": 2.664465786314526e-05, + "loss": 0.5211, + "step": 11685 + }, + { + "epoch": 14.95808, + "grad_norm": 0.9722442030906677, + "learning_rate": 2.6642657062825132e-05, + "loss": 0.5209, + "step": 11686 + }, + { + "epoch": 14.95936, + "grad_norm": 0.9735600352287292, + "learning_rate": 2.6640656262505004e-05, + "loss": 0.5587, + "step": 11687 + }, + { + "epoch": 14.96064, + "grad_norm": 0.9050617218017578, + "learning_rate": 2.6638655462184876e-05, + "loss": 0.495, + "step": 11688 + }, + { + 
"epoch": 14.96192, + "grad_norm": 1.0128991603851318, + "learning_rate": 2.6636654661864744e-05, + "loss": 0.5692, + "step": 11689 + }, + { + "epoch": 14.9632, + "grad_norm": 0.970399796962738, + "learning_rate": 2.6634653861544616e-05, + "loss": 0.5175, + "step": 11690 + }, + { + "epoch": 14.96448, + "grad_norm": 0.9725888967514038, + "learning_rate": 2.6632653061224488e-05, + "loss": 0.5323, + "step": 11691 + }, + { + "epoch": 14.96576, + "grad_norm": 0.9731080532073975, + "learning_rate": 2.6630652260904367e-05, + "loss": 0.5548, + "step": 11692 + }, + { + "epoch": 14.96704, + "grad_norm": 0.9690375924110413, + "learning_rate": 2.6628651460584235e-05, + "loss": 0.5147, + "step": 11693 + }, + { + "epoch": 14.96832, + "grad_norm": 1.0425267219543457, + "learning_rate": 2.6626650660264107e-05, + "loss": 0.5284, + "step": 11694 + }, + { + "epoch": 14.9696, + "grad_norm": 0.949582576751709, + "learning_rate": 2.662464985994398e-05, + "loss": 0.4982, + "step": 11695 + }, + { + "epoch": 14.97088, + "grad_norm": 0.9666782021522522, + "learning_rate": 2.662264905962385e-05, + "loss": 0.5504, + "step": 11696 + }, + { + "epoch": 14.97216, + "grad_norm": 1.0077440738677979, + "learning_rate": 2.662064825930372e-05, + "loss": 0.5647, + "step": 11697 + }, + { + "epoch": 14.97344, + "grad_norm": 0.930424690246582, + "learning_rate": 2.661864745898359e-05, + "loss": 0.484, + "step": 11698 + }, + { + "epoch": 14.97472, + "grad_norm": 0.9621975421905518, + "learning_rate": 2.661664665866347e-05, + "loss": 0.5133, + "step": 11699 + }, + { + "epoch": 14.975999999999999, + "grad_norm": 0.9930300116539001, + "learning_rate": 2.661464585834334e-05, + "loss": 0.5704, + "step": 11700 + }, + { + "epoch": 14.97728, + "grad_norm": 0.965140163898468, + "learning_rate": 2.661264505802321e-05, + "loss": 0.5111, + "step": 11701 + }, + { + "epoch": 14.97856, + "grad_norm": 0.9955794215202332, + "learning_rate": 2.6610644257703082e-05, + "loss": 0.5608, + "step": 11702 + }, + { + "epoch": 
14.97984, + "grad_norm": 0.9986573457717896, + "learning_rate": 2.6608643457382954e-05, + "loss": 0.5501, + "step": 11703 + }, + { + "epoch": 14.98112, + "grad_norm": 0.9306737780570984, + "learning_rate": 2.6606642657062826e-05, + "loss": 0.5065, + "step": 11704 + }, + { + "epoch": 14.9824, + "grad_norm": 0.9776188135147095, + "learning_rate": 2.6604641856742694e-05, + "loss": 0.5241, + "step": 11705 + }, + { + "epoch": 14.98368, + "grad_norm": 0.9727427363395691, + "learning_rate": 2.6602641056422573e-05, + "loss": 0.515, + "step": 11706 + }, + { + "epoch": 14.98496, + "grad_norm": 1.011610507965088, + "learning_rate": 2.6600640256102445e-05, + "loss": 0.5472, + "step": 11707 + }, + { + "epoch": 14.98624, + "grad_norm": 1.0158885717391968, + "learning_rate": 2.6598639455782316e-05, + "loss": 0.5642, + "step": 11708 + }, + { + "epoch": 14.98752, + "grad_norm": 1.021340250968933, + "learning_rate": 2.6596638655462185e-05, + "loss": 0.5585, + "step": 11709 + }, + { + "epoch": 14.9888, + "grad_norm": 0.91712486743927, + "learning_rate": 2.6594637855142057e-05, + "loss": 0.5085, + "step": 11710 + }, + { + "epoch": 14.99008, + "grad_norm": 0.9759134650230408, + "learning_rate": 2.659263705482193e-05, + "loss": 0.5493, + "step": 11711 + }, + { + "epoch": 14.99136, + "grad_norm": 1.0136007070541382, + "learning_rate": 2.65906362545018e-05, + "loss": 0.5891, + "step": 11712 + }, + { + "epoch": 14.99264, + "grad_norm": 0.9995754361152649, + "learning_rate": 2.6588635454181676e-05, + "loss": 0.4942, + "step": 11713 + }, + { + "epoch": 14.99392, + "grad_norm": 0.9608016610145569, + "learning_rate": 2.6586634653861548e-05, + "loss": 0.5207, + "step": 11714 + }, + { + "epoch": 14.9952, + "grad_norm": 0.9471200108528137, + "learning_rate": 2.658463385354142e-05, + "loss": 0.5334, + "step": 11715 + }, + { + "epoch": 14.99648, + "grad_norm": 0.971295952796936, + "learning_rate": 2.658263305322129e-05, + "loss": 0.5604, + "step": 11716 + }, + { + "epoch": 14.99776, + "grad_norm": 
0.9903369545936584, + "learning_rate": 2.658063225290116e-05, + "loss": 0.6, + "step": 11717 + }, + { + "epoch": 14.99904, + "grad_norm": 0.9256072044372559, + "learning_rate": 2.657863145258103e-05, + "loss": 0.5046, + "step": 11718 + }, + { + "epoch": 15.00032, + "grad_norm": 1.9394627809524536, + "learning_rate": 2.6576630652260904e-05, + "loss": 0.813, + "step": 11719 + }, + { + "epoch": 15.0016, + "grad_norm": 0.9253193140029907, + "learning_rate": 2.657462985194078e-05, + "loss": 0.5114, + "step": 11720 + }, + { + "epoch": 15.00288, + "grad_norm": 0.9425203800201416, + "learning_rate": 2.657262905162065e-05, + "loss": 0.4764, + "step": 11721 + }, + { + "epoch": 15.00416, + "grad_norm": 0.9434118866920471, + "learning_rate": 2.6570628251300522e-05, + "loss": 0.4907, + "step": 11722 + }, + { + "epoch": 15.00544, + "grad_norm": 0.9589188694953918, + "learning_rate": 2.6568627450980394e-05, + "loss": 0.5433, + "step": 11723 + }, + { + "epoch": 15.00672, + "grad_norm": 0.9243043661117554, + "learning_rate": 2.6566626650660266e-05, + "loss": 0.5402, + "step": 11724 + }, + { + "epoch": 15.008, + "grad_norm": 0.9611799120903015, + "learning_rate": 2.6564625850340135e-05, + "loss": 0.5056, + "step": 11725 + }, + { + "epoch": 15.00928, + "grad_norm": 0.8841933608055115, + "learning_rate": 2.6562625050020007e-05, + "loss": 0.4763, + "step": 11726 + }, + { + "epoch": 15.01056, + "grad_norm": 0.9348115921020508, + "learning_rate": 2.6560624249699885e-05, + "loss": 0.4711, + "step": 11727 + }, + { + "epoch": 15.01184, + "grad_norm": 0.9851812720298767, + "learning_rate": 2.6558623449379754e-05, + "loss": 0.5166, + "step": 11728 + }, + { + "epoch": 15.01312, + "grad_norm": 0.967349648475647, + "learning_rate": 2.6556622649059625e-05, + "loss": 0.4667, + "step": 11729 + }, + { + "epoch": 15.0144, + "grad_norm": 1.0413135290145874, + "learning_rate": 2.6554621848739497e-05, + "loss": 0.5798, + "step": 11730 + }, + { + "epoch": 15.01568, + "grad_norm": 0.9787873029708862, + 
"learning_rate": 2.655262104841937e-05, + "loss": 0.5299, + "step": 11731 + }, + { + "epoch": 15.01696, + "grad_norm": 0.9922900199890137, + "learning_rate": 2.655062024809924e-05, + "loss": 0.5363, + "step": 11732 + }, + { + "epoch": 15.01824, + "grad_norm": 0.9843411445617676, + "learning_rate": 2.654861944777911e-05, + "loss": 0.5272, + "step": 11733 + }, + { + "epoch": 15.01952, + "grad_norm": 0.9954893589019775, + "learning_rate": 2.6546618647458988e-05, + "loss": 0.5566, + "step": 11734 + }, + { + "epoch": 15.0208, + "grad_norm": 0.9984802603721619, + "learning_rate": 2.654461784713886e-05, + "loss": 0.561, + "step": 11735 + }, + { + "epoch": 15.02208, + "grad_norm": 0.992762565612793, + "learning_rate": 2.654261704681873e-05, + "loss": 0.4935, + "step": 11736 + }, + { + "epoch": 15.02336, + "grad_norm": 0.9480631351470947, + "learning_rate": 2.65406162464986e-05, + "loss": 0.493, + "step": 11737 + }, + { + "epoch": 15.02464, + "grad_norm": 1.0148262977600098, + "learning_rate": 2.6538615446178472e-05, + "loss": 0.5401, + "step": 11738 + }, + { + "epoch": 15.02592, + "grad_norm": 0.9743524193763733, + "learning_rate": 2.6536614645858344e-05, + "loss": 0.5219, + "step": 11739 + }, + { + "epoch": 15.0272, + "grad_norm": 1.019026279449463, + "learning_rate": 2.6534613845538216e-05, + "loss": 0.5246, + "step": 11740 + }, + { + "epoch": 15.02848, + "grad_norm": 0.9823682308197021, + "learning_rate": 2.653261304521809e-05, + "loss": 0.5357, + "step": 11741 + }, + { + "epoch": 15.02976, + "grad_norm": 0.9612826108932495, + "learning_rate": 2.6530612244897963e-05, + "loss": 0.522, + "step": 11742 + }, + { + "epoch": 15.03104, + "grad_norm": 0.9693880677223206, + "learning_rate": 2.6528611444577835e-05, + "loss": 0.5181, + "step": 11743 + }, + { + "epoch": 15.03232, + "grad_norm": 0.9547830820083618, + "learning_rate": 2.6526610644257703e-05, + "loss": 0.5113, + "step": 11744 + }, + { + "epoch": 15.0336, + "grad_norm": 0.9662365317344666, + "learning_rate": 
2.6524609843937575e-05, + "loss": 0.5069, + "step": 11745 + }, + { + "epoch": 15.03488, + "grad_norm": 0.9782978296279907, + "learning_rate": 2.6522609043617447e-05, + "loss": 0.5354, + "step": 11746 + }, + { + "epoch": 15.03616, + "grad_norm": 0.9016302824020386, + "learning_rate": 2.652060824329732e-05, + "loss": 0.4629, + "step": 11747 + }, + { + "epoch": 15.03744, + "grad_norm": 0.8719879388809204, + "learning_rate": 2.6518607442977194e-05, + "loss": 0.4583, + "step": 11748 + }, + { + "epoch": 15.03872, + "grad_norm": 1.0020391941070557, + "learning_rate": 2.6516606642657066e-05, + "loss": 0.5467, + "step": 11749 + }, + { + "epoch": 15.04, + "grad_norm": 0.999260663986206, + "learning_rate": 2.6514605842336938e-05, + "loss": 0.5304, + "step": 11750 + }, + { + "epoch": 15.04128, + "grad_norm": 1.00063157081604, + "learning_rate": 2.651260504201681e-05, + "loss": 0.5185, + "step": 11751 + }, + { + "epoch": 15.04256, + "grad_norm": 1.0226551294326782, + "learning_rate": 2.6510604241696678e-05, + "loss": 0.5016, + "step": 11752 + }, + { + "epoch": 15.04384, + "grad_norm": 0.99168461561203, + "learning_rate": 2.650860344137655e-05, + "loss": 0.5583, + "step": 11753 + }, + { + "epoch": 15.04512, + "grad_norm": 0.9974046349525452, + "learning_rate": 2.6506602641056422e-05, + "loss": 0.4811, + "step": 11754 + }, + { + "epoch": 15.0464, + "grad_norm": 1.019737958908081, + "learning_rate": 2.6504601840736297e-05, + "loss": 0.5457, + "step": 11755 + }, + { + "epoch": 15.04768, + "grad_norm": 0.9845877289772034, + "learning_rate": 2.650260104041617e-05, + "loss": 0.4997, + "step": 11756 + }, + { + "epoch": 15.04896, + "grad_norm": 0.9450182318687439, + "learning_rate": 2.650060024009604e-05, + "loss": 0.51, + "step": 11757 + }, + { + "epoch": 15.05024, + "grad_norm": 1.0113811492919922, + "learning_rate": 2.6498599439775913e-05, + "loss": 0.5266, + "step": 11758 + }, + { + "epoch": 15.05152, + "grad_norm": 0.9556260108947754, + "learning_rate": 2.6496598639455785e-05, + 
"loss": 0.517, + "step": 11759 + }, + { + "epoch": 15.0528, + "grad_norm": 0.9880708456039429, + "learning_rate": 2.6494597839135653e-05, + "loss": 0.5527, + "step": 11760 + }, + { + "epoch": 15.05408, + "grad_norm": 0.8803279399871826, + "learning_rate": 2.6492597038815525e-05, + "loss": 0.4476, + "step": 11761 + }, + { + "epoch": 15.05536, + "grad_norm": 0.974906861782074, + "learning_rate": 2.6490596238495404e-05, + "loss": 0.5251, + "step": 11762 + }, + { + "epoch": 15.05664, + "grad_norm": 0.9600889086723328, + "learning_rate": 2.6488595438175272e-05, + "loss": 0.4955, + "step": 11763 + }, + { + "epoch": 15.05792, + "grad_norm": 0.9697887301445007, + "learning_rate": 2.6486594637855144e-05, + "loss": 0.5455, + "step": 11764 + }, + { + "epoch": 15.0592, + "grad_norm": 0.9973766207695007, + "learning_rate": 2.6484593837535016e-05, + "loss": 0.4653, + "step": 11765 + }, + { + "epoch": 15.06048, + "grad_norm": 1.0244485139846802, + "learning_rate": 2.6482593037214888e-05, + "loss": 0.5189, + "step": 11766 + }, + { + "epoch": 15.06176, + "grad_norm": 0.9942975640296936, + "learning_rate": 2.648059223689476e-05, + "loss": 0.5736, + "step": 11767 + }, + { + "epoch": 15.06304, + "grad_norm": 0.9399450421333313, + "learning_rate": 2.6478591436574628e-05, + "loss": 0.5184, + "step": 11768 + }, + { + "epoch": 15.06432, + "grad_norm": 0.9376527070999146, + "learning_rate": 2.6476590636254507e-05, + "loss": 0.494, + "step": 11769 + }, + { + "epoch": 15.0656, + "grad_norm": 0.9570677280426025, + "learning_rate": 2.647458983593438e-05, + "loss": 0.5268, + "step": 11770 + }, + { + "epoch": 15.06688, + "grad_norm": 0.9823542833328247, + "learning_rate": 2.6472589035614247e-05, + "loss": 0.5312, + "step": 11771 + }, + { + "epoch": 15.06816, + "grad_norm": 1.0721344947814941, + "learning_rate": 2.647058823529412e-05, + "loss": 0.52, + "step": 11772 + }, + { + "epoch": 15.06944, + "grad_norm": 1.008894681930542, + "learning_rate": 2.646858743497399e-05, + "loss": 0.5164, + 
"step": 11773 + }, + { + "epoch": 15.07072, + "grad_norm": 0.9895505905151367, + "learning_rate": 2.6466586634653862e-05, + "loss": 0.5588, + "step": 11774 + }, + { + "epoch": 15.072, + "grad_norm": 1.0512263774871826, + "learning_rate": 2.6464585834333734e-05, + "loss": 0.5231, + "step": 11775 + }, + { + "epoch": 15.07328, + "grad_norm": 0.891127347946167, + "learning_rate": 2.646258503401361e-05, + "loss": 0.4783, + "step": 11776 + }, + { + "epoch": 15.07456, + "grad_norm": 0.9510053992271423, + "learning_rate": 2.646058423369348e-05, + "loss": 0.5321, + "step": 11777 + }, + { + "epoch": 15.07584, + "grad_norm": 0.954017698764801, + "learning_rate": 2.6458583433373353e-05, + "loss": 0.5153, + "step": 11778 + }, + { + "epoch": 15.07712, + "grad_norm": 1.0456651449203491, + "learning_rate": 2.6456582633053222e-05, + "loss": 0.5779, + "step": 11779 + }, + { + "epoch": 15.0784, + "grad_norm": 0.9539825320243835, + "learning_rate": 2.6454581832733094e-05, + "loss": 0.5285, + "step": 11780 + }, + { + "epoch": 15.07968, + "grad_norm": 0.952126681804657, + "learning_rate": 2.6452581032412965e-05, + "loss": 0.512, + "step": 11781 + }, + { + "epoch": 15.08096, + "grad_norm": 0.9592217803001404, + "learning_rate": 2.6450580232092837e-05, + "loss": 0.4852, + "step": 11782 + }, + { + "epoch": 15.08224, + "grad_norm": 0.9491934180259705, + "learning_rate": 2.644857943177271e-05, + "loss": 0.4663, + "step": 11783 + }, + { + "epoch": 15.08352, + "grad_norm": 0.9953993558883667, + "learning_rate": 2.6446578631452584e-05, + "loss": 0.4667, + "step": 11784 + }, + { + "epoch": 15.0848, + "grad_norm": 1.0062161684036255, + "learning_rate": 2.6444577831132456e-05, + "loss": 0.4812, + "step": 11785 + }, + { + "epoch": 15.08608, + "grad_norm": 0.9338573217391968, + "learning_rate": 2.6442577030812328e-05, + "loss": 0.5201, + "step": 11786 + }, + { + "epoch": 15.08736, + "grad_norm": 0.9554229378700256, + "learning_rate": 2.6440576230492197e-05, + "loss": 0.5064, + "step": 11787 + }, + { 
+ "epoch": 15.08864, + "grad_norm": 0.9917376041412354, + "learning_rate": 2.643857543017207e-05, + "loss": 0.528, + "step": 11788 + }, + { + "epoch": 15.08992, + "grad_norm": 0.9698293209075928, + "learning_rate": 2.643657462985194e-05, + "loss": 0.4922, + "step": 11789 + }, + { + "epoch": 15.0912, + "grad_norm": 1.039360761642456, + "learning_rate": 2.6434573829531812e-05, + "loss": 0.5244, + "step": 11790 + }, + { + "epoch": 15.09248, + "grad_norm": 0.9987143874168396, + "learning_rate": 2.6432573029211687e-05, + "loss": 0.5006, + "step": 11791 + }, + { + "epoch": 15.09376, + "grad_norm": 0.9327332377433777, + "learning_rate": 2.643057222889156e-05, + "loss": 0.4795, + "step": 11792 + }, + { + "epoch": 15.09504, + "grad_norm": 0.9864438772201538, + "learning_rate": 2.642857142857143e-05, + "loss": 0.4949, + "step": 11793 + }, + { + "epoch": 15.09632, + "grad_norm": 0.99493008852005, + "learning_rate": 2.6426570628251303e-05, + "loss": 0.5297, + "step": 11794 + }, + { + "epoch": 15.0976, + "grad_norm": 0.9412096738815308, + "learning_rate": 2.642456982793117e-05, + "loss": 0.4775, + "step": 11795 + }, + { + "epoch": 15.09888, + "grad_norm": 0.9617460370063782, + "learning_rate": 2.6422569027611043e-05, + "loss": 0.4855, + "step": 11796 + }, + { + "epoch": 15.10016, + "grad_norm": 0.982379674911499, + "learning_rate": 2.6420568227290915e-05, + "loss": 0.5227, + "step": 11797 + }, + { + "epoch": 15.10144, + "grad_norm": 1.020799994468689, + "learning_rate": 2.641856742697079e-05, + "loss": 0.579, + "step": 11798 + }, + { + "epoch": 15.10272, + "grad_norm": 1.0289467573165894, + "learning_rate": 2.6416566626650662e-05, + "loss": 0.5353, + "step": 11799 + }, + { + "epoch": 15.104, + "grad_norm": 1.0211548805236816, + "learning_rate": 2.6414565826330534e-05, + "loss": 0.5503, + "step": 11800 + }, + { + "epoch": 15.10528, + "grad_norm": 1.011851191520691, + "learning_rate": 2.6412565026010406e-05, + "loss": 0.5338, + "step": 11801 + }, + { + "epoch": 15.10656, + 
"grad_norm": 1.0263190269470215, + "learning_rate": 2.6410564225690278e-05, + "loss": 0.5643, + "step": 11802 + }, + { + "epoch": 15.10784, + "grad_norm": 0.9878085851669312, + "learning_rate": 2.6408563425370146e-05, + "loss": 0.5279, + "step": 11803 + }, + { + "epoch": 15.10912, + "grad_norm": 0.9296584129333496, + "learning_rate": 2.6406562625050018e-05, + "loss": 0.4767, + "step": 11804 + }, + { + "epoch": 15.1104, + "grad_norm": 0.9973602294921875, + "learning_rate": 2.6404561824729897e-05, + "loss": 0.5266, + "step": 11805 + }, + { + "epoch": 15.11168, + "grad_norm": 0.9674133658409119, + "learning_rate": 2.6402561024409765e-05, + "loss": 0.511, + "step": 11806 + }, + { + "epoch": 15.11296, + "grad_norm": 1.0054680109024048, + "learning_rate": 2.6400560224089637e-05, + "loss": 0.5167, + "step": 11807 + }, + { + "epoch": 15.11424, + "grad_norm": 0.9890505075454712, + "learning_rate": 2.639855942376951e-05, + "loss": 0.5279, + "step": 11808 + }, + { + "epoch": 15.11552, + "grad_norm": 0.9986791610717773, + "learning_rate": 2.639655862344938e-05, + "loss": 0.5025, + "step": 11809 + }, + { + "epoch": 15.1168, + "grad_norm": 0.9749217629432678, + "learning_rate": 2.6394557823129253e-05, + "loss": 0.4886, + "step": 11810 + }, + { + "epoch": 15.11808, + "grad_norm": 0.9737854599952698, + "learning_rate": 2.639255702280912e-05, + "loss": 0.4664, + "step": 11811 + }, + { + "epoch": 15.11936, + "grad_norm": 0.9293329119682312, + "learning_rate": 2.6390556222489e-05, + "loss": 0.4985, + "step": 11812 + }, + { + "epoch": 15.12064, + "grad_norm": 0.9365719556808472, + "learning_rate": 2.638855542216887e-05, + "loss": 0.5201, + "step": 11813 + }, + { + "epoch": 15.12192, + "grad_norm": 0.9597119092941284, + "learning_rate": 2.638655462184874e-05, + "loss": 0.5045, + "step": 11814 + }, + { + "epoch": 15.1232, + "grad_norm": 1.0304876565933228, + "learning_rate": 2.6384553821528612e-05, + "loss": 0.5272, + "step": 11815 + }, + { + "epoch": 15.12448, + "grad_norm": 
0.9947621822357178, + "learning_rate": 2.6382553021208484e-05, + "loss": 0.5527, + "step": 11816 + }, + { + "epoch": 15.12576, + "grad_norm": 0.9298506379127502, + "learning_rate": 2.6380552220888356e-05, + "loss": 0.4879, + "step": 11817 + }, + { + "epoch": 15.12704, + "grad_norm": 1.0342029333114624, + "learning_rate": 2.6378551420568228e-05, + "loss": 0.5352, + "step": 11818 + }, + { + "epoch": 15.12832, + "grad_norm": 0.9854941368103027, + "learning_rate": 2.6376550620248103e-05, + "loss": 0.5068, + "step": 11819 + }, + { + "epoch": 15.1296, + "grad_norm": 1.0126068592071533, + "learning_rate": 2.6374549819927975e-05, + "loss": 0.5505, + "step": 11820 + }, + { + "epoch": 15.13088, + "grad_norm": 1.0106240510940552, + "learning_rate": 2.6372549019607846e-05, + "loss": 0.5167, + "step": 11821 + }, + { + "epoch": 15.13216, + "grad_norm": 1.0682857036590576, + "learning_rate": 2.6370548219287715e-05, + "loss": 0.5697, + "step": 11822 + }, + { + "epoch": 15.13344, + "grad_norm": 1.0406882762908936, + "learning_rate": 2.6368547418967587e-05, + "loss": 0.5549, + "step": 11823 + }, + { + "epoch": 15.13472, + "grad_norm": 0.9883754253387451, + "learning_rate": 2.636654661864746e-05, + "loss": 0.4826, + "step": 11824 + }, + { + "epoch": 15.136, + "grad_norm": 1.0199334621429443, + "learning_rate": 2.636454581832733e-05, + "loss": 0.5189, + "step": 11825 + }, + { + "epoch": 15.13728, + "grad_norm": 1.0423178672790527, + "learning_rate": 2.6362545018007206e-05, + "loss": 0.5777, + "step": 11826 + }, + { + "epoch": 15.13856, + "grad_norm": 1.055101752281189, + "learning_rate": 2.6360544217687078e-05, + "loss": 0.555, + "step": 11827 + }, + { + "epoch": 15.13984, + "grad_norm": 0.9862048625946045, + "learning_rate": 2.635854341736695e-05, + "loss": 0.5281, + "step": 11828 + }, + { + "epoch": 15.14112, + "grad_norm": 0.9423441886901855, + "learning_rate": 2.635654261704682e-05, + "loss": 0.4727, + "step": 11829 + }, + { + "epoch": 15.1424, + "grad_norm": 0.9520114660263062, + 
"learning_rate": 2.635454181672669e-05, + "loss": 0.541, + "step": 11830 + }, + { + "epoch": 15.14368, + "grad_norm": 0.9528147578239441, + "learning_rate": 2.635254101640656e-05, + "loss": 0.5085, + "step": 11831 + }, + { + "epoch": 15.14496, + "grad_norm": 0.9359756112098694, + "learning_rate": 2.6350540216086434e-05, + "loss": 0.4601, + "step": 11832 + }, + { + "epoch": 15.14624, + "grad_norm": 0.9768989086151123, + "learning_rate": 2.634853941576631e-05, + "loss": 0.494, + "step": 11833 + }, + { + "epoch": 15.14752, + "grad_norm": 0.9752456545829773, + "learning_rate": 2.634653861544618e-05, + "loss": 0.5484, + "step": 11834 + }, + { + "epoch": 15.1488, + "grad_norm": 0.9548261165618896, + "learning_rate": 2.6344537815126052e-05, + "loss": 0.5219, + "step": 11835 + }, + { + "epoch": 15.150079999999999, + "grad_norm": 1.0379620790481567, + "learning_rate": 2.6342537014805924e-05, + "loss": 0.5714, + "step": 11836 + }, + { + "epoch": 15.15136, + "grad_norm": 0.9867843389511108, + "learning_rate": 2.6340536214485796e-05, + "loss": 0.5445, + "step": 11837 + }, + { + "epoch": 15.15264, + "grad_norm": 0.9981434941291809, + "learning_rate": 2.6338535414165665e-05, + "loss": 0.4993, + "step": 11838 + }, + { + "epoch": 15.15392, + "grad_norm": 1.0145291090011597, + "learning_rate": 2.6336534613845537e-05, + "loss": 0.52, + "step": 11839 + }, + { + "epoch": 15.1552, + "grad_norm": 0.9970989227294922, + "learning_rate": 2.6334533813525415e-05, + "loss": 0.5516, + "step": 11840 + }, + { + "epoch": 15.15648, + "grad_norm": 0.9794284701347351, + "learning_rate": 2.6332533013205284e-05, + "loss": 0.5283, + "step": 11841 + }, + { + "epoch": 15.15776, + "grad_norm": 1.0003759860992432, + "learning_rate": 2.6330532212885155e-05, + "loss": 0.4844, + "step": 11842 + }, + { + "epoch": 15.15904, + "grad_norm": 0.949975311756134, + "learning_rate": 2.6328531412565027e-05, + "loss": 0.4862, + "step": 11843 + }, + { + "epoch": 15.16032, + "grad_norm": 0.9530737400054932, + 
"learning_rate": 2.63265306122449e-05, + "loss": 0.4736, + "step": 11844 + }, + { + "epoch": 15.1616, + "grad_norm": 0.9621790051460266, + "learning_rate": 2.632452981192477e-05, + "loss": 0.4617, + "step": 11845 + }, + { + "epoch": 15.16288, + "grad_norm": 1.0266780853271484, + "learning_rate": 2.632252901160464e-05, + "loss": 0.5228, + "step": 11846 + }, + { + "epoch": 15.16416, + "grad_norm": 0.9723334908485413, + "learning_rate": 2.6320528211284518e-05, + "loss": 0.4968, + "step": 11847 + }, + { + "epoch": 15.16544, + "grad_norm": 0.9815566539764404, + "learning_rate": 2.631852741096439e-05, + "loss": 0.5549, + "step": 11848 + }, + { + "epoch": 15.16672, + "grad_norm": 0.9635928869247437, + "learning_rate": 2.631652661064426e-05, + "loss": 0.5149, + "step": 11849 + }, + { + "epoch": 15.168, + "grad_norm": 0.9444295167922974, + "learning_rate": 2.631452581032413e-05, + "loss": 0.5338, + "step": 11850 + }, + { + "epoch": 15.16928, + "grad_norm": 0.9650683403015137, + "learning_rate": 2.6312525010004002e-05, + "loss": 0.5011, + "step": 11851 + }, + { + "epoch": 15.17056, + "grad_norm": 0.97637939453125, + "learning_rate": 2.6310524209683874e-05, + "loss": 0.4643, + "step": 11852 + }, + { + "epoch": 15.17184, + "grad_norm": 0.9784893989562988, + "learning_rate": 2.6308523409363746e-05, + "loss": 0.4875, + "step": 11853 + }, + { + "epoch": 15.17312, + "grad_norm": 0.9220801591873169, + "learning_rate": 2.630652260904362e-05, + "loss": 0.4402, + "step": 11854 + }, + { + "epoch": 15.1744, + "grad_norm": 1.0712640285491943, + "learning_rate": 2.6304521808723493e-05, + "loss": 0.5405, + "step": 11855 + }, + { + "epoch": 15.17568, + "grad_norm": 0.9892664551734924, + "learning_rate": 2.6302521008403365e-05, + "loss": 0.5264, + "step": 11856 + }, + { + "epoch": 15.17696, + "grad_norm": 0.9567806720733643, + "learning_rate": 2.6300520208083233e-05, + "loss": 0.4772, + "step": 11857 + }, + { + "epoch": 15.17824, + "grad_norm": 0.9925405979156494, + "learning_rate": 
2.6298519407763105e-05, + "loss": 0.5128, + "step": 11858 + }, + { + "epoch": 15.17952, + "grad_norm": 0.9644914865493774, + "learning_rate": 2.6296518607442977e-05, + "loss": 0.4847, + "step": 11859 + }, + { + "epoch": 15.1808, + "grad_norm": 0.9735480546951294, + "learning_rate": 2.629451780712285e-05, + "loss": 0.5335, + "step": 11860 + }, + { + "epoch": 15.18208, + "grad_norm": 0.99360591173172, + "learning_rate": 2.6292517006802724e-05, + "loss": 0.5185, + "step": 11861 + }, + { + "epoch": 15.18336, + "grad_norm": 0.9777383208274841, + "learning_rate": 2.6290516206482596e-05, + "loss": 0.5173, + "step": 11862 + }, + { + "epoch": 15.18464, + "grad_norm": 0.997850775718689, + "learning_rate": 2.6288515406162468e-05, + "loss": 0.5199, + "step": 11863 + }, + { + "epoch": 15.18592, + "grad_norm": 0.9816862344741821, + "learning_rate": 2.628651460584234e-05, + "loss": 0.5277, + "step": 11864 + }, + { + "epoch": 15.1872, + "grad_norm": 1.0436856746673584, + "learning_rate": 2.6284513805522208e-05, + "loss": 0.5121, + "step": 11865 + }, + { + "epoch": 15.18848, + "grad_norm": 0.9631795287132263, + "learning_rate": 2.628251300520208e-05, + "loss": 0.4858, + "step": 11866 + }, + { + "epoch": 15.18976, + "grad_norm": 0.8962351083755493, + "learning_rate": 2.6280512204881952e-05, + "loss": 0.5049, + "step": 11867 + }, + { + "epoch": 15.19104, + "grad_norm": 0.99549400806427, + "learning_rate": 2.6278511404561827e-05, + "loss": 0.5051, + "step": 11868 + }, + { + "epoch": 15.19232, + "grad_norm": 0.926495373249054, + "learning_rate": 2.62765106042417e-05, + "loss": 0.4831, + "step": 11869 + }, + { + "epoch": 15.1936, + "grad_norm": 1.0090394020080566, + "learning_rate": 2.627450980392157e-05, + "loss": 0.5091, + "step": 11870 + }, + { + "epoch": 15.19488, + "grad_norm": 1.0354359149932861, + "learning_rate": 2.6272509003601443e-05, + "loss": 0.558, + "step": 11871 + }, + { + "epoch": 15.19616, + "grad_norm": 1.04960036277771, + "learning_rate": 2.6270508203281315e-05, + 
"loss": 0.5582, + "step": 11872 + }, + { + "epoch": 15.19744, + "grad_norm": 1.0015323162078857, + "learning_rate": 2.6268507402961183e-05, + "loss": 0.4953, + "step": 11873 + }, + { + "epoch": 15.19872, + "grad_norm": 0.971566915512085, + "learning_rate": 2.6266506602641055e-05, + "loss": 0.5299, + "step": 11874 + }, + { + "epoch": 15.2, + "grad_norm": 0.9685811996459961, + "learning_rate": 2.6264505802320934e-05, + "loss": 0.5111, + "step": 11875 + }, + { + "epoch": 15.20128, + "grad_norm": 0.9589853882789612, + "learning_rate": 2.6262505002000802e-05, + "loss": 0.4904, + "step": 11876 + }, + { + "epoch": 15.20256, + "grad_norm": 1.025733232498169, + "learning_rate": 2.6260504201680674e-05, + "loss": 0.4765, + "step": 11877 + }, + { + "epoch": 15.20384, + "grad_norm": 1.0392835140228271, + "learning_rate": 2.6258503401360546e-05, + "loss": 0.5459, + "step": 11878 + }, + { + "epoch": 15.20512, + "grad_norm": 0.9788011312484741, + "learning_rate": 2.6256502601040418e-05, + "loss": 0.4549, + "step": 11879 + }, + { + "epoch": 15.2064, + "grad_norm": 0.9786105751991272, + "learning_rate": 2.625450180072029e-05, + "loss": 0.5015, + "step": 11880 + }, + { + "epoch": 15.20768, + "grad_norm": 0.9867866635322571, + "learning_rate": 2.6252501000400158e-05, + "loss": 0.5019, + "step": 11881 + }, + { + "epoch": 15.20896, + "grad_norm": 1.0612010955810547, + "learning_rate": 2.6250500200080037e-05, + "loss": 0.5723, + "step": 11882 + }, + { + "epoch": 15.21024, + "grad_norm": 0.9832754731178284, + "learning_rate": 2.624849939975991e-05, + "loss": 0.5225, + "step": 11883 + }, + { + "epoch": 15.21152, + "grad_norm": 1.044143795967102, + "learning_rate": 2.6246498599439777e-05, + "loss": 0.5303, + "step": 11884 + }, + { + "epoch": 15.2128, + "grad_norm": 0.9782372713088989, + "learning_rate": 2.624449779911965e-05, + "loss": 0.5279, + "step": 11885 + }, + { + "epoch": 15.21408, + "grad_norm": 1.020448923110962, + "learning_rate": 2.624249699879952e-05, + "loss": 0.5357, + "step": 
11886 + }, + { + "epoch": 15.21536, + "grad_norm": 1.0035127401351929, + "learning_rate": 2.6240496198479392e-05, + "loss": 0.5383, + "step": 11887 + }, + { + "epoch": 15.21664, + "grad_norm": 0.9856539964675903, + "learning_rate": 2.6238495398159264e-05, + "loss": 0.5267, + "step": 11888 + }, + { + "epoch": 15.21792, + "grad_norm": 0.979160726070404, + "learning_rate": 2.623649459783914e-05, + "loss": 0.4924, + "step": 11889 + }, + { + "epoch": 15.2192, + "grad_norm": 0.9597509503364563, + "learning_rate": 2.623449379751901e-05, + "loss": 0.5109, + "step": 11890 + }, + { + "epoch": 15.22048, + "grad_norm": 0.9971436262130737, + "learning_rate": 2.6232492997198883e-05, + "loss": 0.5233, + "step": 11891 + }, + { + "epoch": 15.22176, + "grad_norm": 0.9758315086364746, + "learning_rate": 2.623049219687875e-05, + "loss": 0.5042, + "step": 11892 + }, + { + "epoch": 15.22304, + "grad_norm": 1.0089176893234253, + "learning_rate": 2.6228491396558624e-05, + "loss": 0.5225, + "step": 11893 + }, + { + "epoch": 15.22432, + "grad_norm": 0.9823707342147827, + "learning_rate": 2.6226490596238495e-05, + "loss": 0.5532, + "step": 11894 + }, + { + "epoch": 15.2256, + "grad_norm": 1.033499836921692, + "learning_rate": 2.6224489795918367e-05, + "loss": 0.5525, + "step": 11895 + }, + { + "epoch": 15.22688, + "grad_norm": 0.9601119160652161, + "learning_rate": 2.622248899559824e-05, + "loss": 0.5249, + "step": 11896 + }, + { + "epoch": 15.22816, + "grad_norm": 0.9555276036262512, + "learning_rate": 2.6220488195278114e-05, + "loss": 0.4677, + "step": 11897 + }, + { + "epoch": 15.22944, + "grad_norm": 1.0126163959503174, + "learning_rate": 2.6218487394957986e-05, + "loss": 0.5543, + "step": 11898 + }, + { + "epoch": 15.23072, + "grad_norm": 1.0423436164855957, + "learning_rate": 2.6216486594637858e-05, + "loss": 0.5366, + "step": 11899 + }, + { + "epoch": 15.232, + "grad_norm": 1.0581696033477783, + "learning_rate": 2.6214485794317727e-05, + "loss": 0.5205, + "step": 11900 + }, + { + 
"epoch": 15.23328, + "grad_norm": 1.026396632194519, + "learning_rate": 2.62124849939976e-05, + "loss": 0.4984, + "step": 11901 + }, + { + "epoch": 15.23456, + "grad_norm": 1.0131559371948242, + "learning_rate": 2.621048419367747e-05, + "loss": 0.498, + "step": 11902 + }, + { + "epoch": 15.23584, + "grad_norm": 1.0236692428588867, + "learning_rate": 2.6208483393357342e-05, + "loss": 0.5357, + "step": 11903 + }, + { + "epoch": 15.23712, + "grad_norm": 0.9787890911102295, + "learning_rate": 2.6206482593037217e-05, + "loss": 0.4933, + "step": 11904 + }, + { + "epoch": 15.2384, + "grad_norm": 0.9795733094215393, + "learning_rate": 2.620448179271709e-05, + "loss": 0.523, + "step": 11905 + }, + { + "epoch": 15.23968, + "grad_norm": 1.0700660943984985, + "learning_rate": 2.620248099239696e-05, + "loss": 0.5248, + "step": 11906 + }, + { + "epoch": 15.24096, + "grad_norm": 0.9715010523796082, + "learning_rate": 2.6200480192076833e-05, + "loss": 0.4913, + "step": 11907 + }, + { + "epoch": 15.24224, + "grad_norm": 0.9805188179016113, + "learning_rate": 2.61984793917567e-05, + "loss": 0.5119, + "step": 11908 + }, + { + "epoch": 15.24352, + "grad_norm": 1.0102686882019043, + "learning_rate": 2.6196478591436573e-05, + "loss": 0.5117, + "step": 11909 + }, + { + "epoch": 15.2448, + "grad_norm": 1.045552372932434, + "learning_rate": 2.6194477791116445e-05, + "loss": 0.563, + "step": 11910 + }, + { + "epoch": 15.24608, + "grad_norm": 0.9204160571098328, + "learning_rate": 2.619247699079632e-05, + "loss": 0.4714, + "step": 11911 + }, + { + "epoch": 15.24736, + "grad_norm": 1.0518666505813599, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.5461, + "step": 11912 + }, + { + "epoch": 15.24864, + "grad_norm": 1.0590685606002808, + "learning_rate": 2.6188475390156064e-05, + "loss": 0.5677, + "step": 11913 + }, + { + "epoch": 15.24992, + "grad_norm": 0.9414172172546387, + "learning_rate": 2.6186474589835936e-05, + "loss": 0.4822, + "step": 11914 + }, + { + "epoch": 15.2512, + 
"grad_norm": 0.9399723410606384, + "learning_rate": 2.6184473789515808e-05, + "loss": 0.509, + "step": 11915 + }, + { + "epoch": 15.25248, + "grad_norm": 0.9907810688018799, + "learning_rate": 2.6182472989195676e-05, + "loss": 0.5117, + "step": 11916 + }, + { + "epoch": 15.25376, + "grad_norm": 1.0035853385925293, + "learning_rate": 2.6180472188875548e-05, + "loss": 0.5313, + "step": 11917 + }, + { + "epoch": 15.25504, + "grad_norm": 0.9608493447303772, + "learning_rate": 2.6178471388555427e-05, + "loss": 0.4925, + "step": 11918 + }, + { + "epoch": 15.25632, + "grad_norm": 0.933178722858429, + "learning_rate": 2.6176470588235295e-05, + "loss": 0.5003, + "step": 11919 + }, + { + "epoch": 15.2576, + "grad_norm": 1.016798734664917, + "learning_rate": 2.6174469787915167e-05, + "loss": 0.5243, + "step": 11920 + }, + { + "epoch": 15.25888, + "grad_norm": 1.0794856548309326, + "learning_rate": 2.617246898759504e-05, + "loss": 0.5542, + "step": 11921 + }, + { + "epoch": 15.26016, + "grad_norm": 0.988269031047821, + "learning_rate": 2.617046818727491e-05, + "loss": 0.5095, + "step": 11922 + }, + { + "epoch": 15.26144, + "grad_norm": 1.0120782852172852, + "learning_rate": 2.6168467386954783e-05, + "loss": 0.5286, + "step": 11923 + }, + { + "epoch": 15.26272, + "grad_norm": 0.9555339813232422, + "learning_rate": 2.616646658663465e-05, + "loss": 0.4668, + "step": 11924 + }, + { + "epoch": 15.264, + "grad_norm": 0.9648301005363464, + "learning_rate": 2.616446578631453e-05, + "loss": 0.5279, + "step": 11925 + }, + { + "epoch": 15.26528, + "grad_norm": 1.041354775428772, + "learning_rate": 2.61624649859944e-05, + "loss": 0.5173, + "step": 11926 + }, + { + "epoch": 15.26656, + "grad_norm": 0.9829549193382263, + "learning_rate": 2.616046418567427e-05, + "loss": 0.5126, + "step": 11927 + }, + { + "epoch": 15.26784, + "grad_norm": 1.0028904676437378, + "learning_rate": 2.6158463385354142e-05, + "loss": 0.5008, + "step": 11928 + }, + { + "epoch": 15.269120000000001, + "grad_norm": 
0.9798859357833862, + "learning_rate": 2.6156462585034014e-05, + "loss": 0.5285, + "step": 11929 + }, + { + "epoch": 15.2704, + "grad_norm": 1.0254261493682861, + "learning_rate": 2.6154461784713886e-05, + "loss": 0.5012, + "step": 11930 + }, + { + "epoch": 15.27168, + "grad_norm": 0.9710574746131897, + "learning_rate": 2.6152460984393757e-05, + "loss": 0.5052, + "step": 11931 + }, + { + "epoch": 15.27296, + "grad_norm": 1.0092233419418335, + "learning_rate": 2.6150460184073633e-05, + "loss": 0.5666, + "step": 11932 + }, + { + "epoch": 15.27424, + "grad_norm": 1.0055060386657715, + "learning_rate": 2.6148459383753505e-05, + "loss": 0.5386, + "step": 11933 + }, + { + "epoch": 15.27552, + "grad_norm": 1.033969759941101, + "learning_rate": 2.6146458583433376e-05, + "loss": 0.5583, + "step": 11934 + }, + { + "epoch": 15.2768, + "grad_norm": 0.95125812292099, + "learning_rate": 2.6144457783113245e-05, + "loss": 0.4865, + "step": 11935 + }, + { + "epoch": 15.27808, + "grad_norm": 1.0022237300872803, + "learning_rate": 2.6142456982793117e-05, + "loss": 0.5342, + "step": 11936 + }, + { + "epoch": 15.27936, + "grad_norm": 0.9819777011871338, + "learning_rate": 2.614045618247299e-05, + "loss": 0.5323, + "step": 11937 + }, + { + "epoch": 15.28064, + "grad_norm": 0.982841968536377, + "learning_rate": 2.613845538215286e-05, + "loss": 0.4979, + "step": 11938 + }, + { + "epoch": 15.28192, + "grad_norm": 1.0762674808502197, + "learning_rate": 2.6136454581832736e-05, + "loss": 0.559, + "step": 11939 + }, + { + "epoch": 15.2832, + "grad_norm": 1.0399508476257324, + "learning_rate": 2.6134453781512608e-05, + "loss": 0.5466, + "step": 11940 + }, + { + "epoch": 15.28448, + "grad_norm": 0.9770174026489258, + "learning_rate": 2.613245298119248e-05, + "loss": 0.5081, + "step": 11941 + }, + { + "epoch": 15.28576, + "grad_norm": 1.06191885471344, + "learning_rate": 2.613045218087235e-05, + "loss": 0.5334, + "step": 11942 + }, + { + "epoch": 15.28704, + "grad_norm": 0.9418057203292847, + 
"learning_rate": 2.612845138055222e-05, + "loss": 0.4954, + "step": 11943 + }, + { + "epoch": 15.28832, + "grad_norm": 0.9617568254470825, + "learning_rate": 2.612645058023209e-05, + "loss": 0.5103, + "step": 11944 + }, + { + "epoch": 15.2896, + "grad_norm": 1.0067156553268433, + "learning_rate": 2.6124449779911963e-05, + "loss": 0.5035, + "step": 11945 + }, + { + "epoch": 15.29088, + "grad_norm": 1.002389907836914, + "learning_rate": 2.612244897959184e-05, + "loss": 0.5344, + "step": 11946 + }, + { + "epoch": 15.292159999999999, + "grad_norm": 1.0987991094589233, + "learning_rate": 2.612044817927171e-05, + "loss": 0.5497, + "step": 11947 + }, + { + "epoch": 15.29344, + "grad_norm": 1.0003621578216553, + "learning_rate": 2.6118447378951582e-05, + "loss": 0.4983, + "step": 11948 + }, + { + "epoch": 15.29472, + "grad_norm": 0.9967955946922302, + "learning_rate": 2.6116446578631454e-05, + "loss": 0.5338, + "step": 11949 + }, + { + "epoch": 15.296, + "grad_norm": 1.0217162370681763, + "learning_rate": 2.6114445778311326e-05, + "loss": 0.5592, + "step": 11950 + }, + { + "epoch": 15.29728, + "grad_norm": 1.0223591327667236, + "learning_rate": 2.6112444977991195e-05, + "loss": 0.5181, + "step": 11951 + }, + { + "epoch": 15.29856, + "grad_norm": 0.9928198456764221, + "learning_rate": 2.6110444177671066e-05, + "loss": 0.5228, + "step": 11952 + }, + { + "epoch": 15.29984, + "grad_norm": 1.0652741193771362, + "learning_rate": 2.6108443377350945e-05, + "loss": 0.5615, + "step": 11953 + }, + { + "epoch": 15.30112, + "grad_norm": 1.0429683923721313, + "learning_rate": 2.6106442577030814e-05, + "loss": 0.5492, + "step": 11954 + }, + { + "epoch": 15.3024, + "grad_norm": 0.9588310718536377, + "learning_rate": 2.6104441776710685e-05, + "loss": 0.4757, + "step": 11955 + }, + { + "epoch": 15.30368, + "grad_norm": 0.9648290872573853, + "learning_rate": 2.6102440976390557e-05, + "loss": 0.4989, + "step": 11956 + }, + { + "epoch": 15.30496, + "grad_norm": 1.0313849449157715, + 
"learning_rate": 2.610044017607043e-05, + "loss": 0.5461, + "step": 11957 + }, + { + "epoch": 15.30624, + "grad_norm": 1.0368759632110596, + "learning_rate": 2.60984393757503e-05, + "loss": 0.5372, + "step": 11958 + }, + { + "epoch": 15.30752, + "grad_norm": 0.9525302052497864, + "learning_rate": 2.609643857543017e-05, + "loss": 0.4805, + "step": 11959 + }, + { + "epoch": 15.3088, + "grad_norm": 0.9847881197929382, + "learning_rate": 2.6094437775110048e-05, + "loss": 0.5264, + "step": 11960 + }, + { + "epoch": 15.31008, + "grad_norm": 0.9874829053878784, + "learning_rate": 2.609243697478992e-05, + "loss": 0.5335, + "step": 11961 + }, + { + "epoch": 15.31136, + "grad_norm": 1.0466731786727905, + "learning_rate": 2.609043617446979e-05, + "loss": 0.5521, + "step": 11962 + }, + { + "epoch": 15.31264, + "grad_norm": 1.0074479579925537, + "learning_rate": 2.608843537414966e-05, + "loss": 0.5141, + "step": 11963 + }, + { + "epoch": 15.31392, + "grad_norm": 1.0698914527893066, + "learning_rate": 2.6086434573829532e-05, + "loss": 0.5662, + "step": 11964 + }, + { + "epoch": 15.3152, + "grad_norm": 0.9645558595657349, + "learning_rate": 2.6084433773509404e-05, + "loss": 0.5126, + "step": 11965 + }, + { + "epoch": 15.31648, + "grad_norm": 0.9718447923660278, + "learning_rate": 2.6082432973189276e-05, + "loss": 0.5003, + "step": 11966 + }, + { + "epoch": 15.31776, + "grad_norm": 0.9619754552841187, + "learning_rate": 2.608043217286915e-05, + "loss": 0.52, + "step": 11967 + }, + { + "epoch": 15.31904, + "grad_norm": 0.940642237663269, + "learning_rate": 2.6078431372549023e-05, + "loss": 0.5113, + "step": 11968 + }, + { + "epoch": 15.32032, + "grad_norm": 1.0012346506118774, + "learning_rate": 2.6076430572228895e-05, + "loss": 0.4932, + "step": 11969 + }, + { + "epoch": 15.3216, + "grad_norm": 0.9923637509346008, + "learning_rate": 2.6074429771908763e-05, + "loss": 0.5567, + "step": 11970 + }, + { + "epoch": 15.32288, + "grad_norm": 0.9815508723258972, + "learning_rate": 
2.6072428971588635e-05, + "loss": 0.5412, + "step": 11971 + }, + { + "epoch": 15.32416, + "grad_norm": 1.0121711492538452, + "learning_rate": 2.6070428171268507e-05, + "loss": 0.54, + "step": 11972 + }, + { + "epoch": 15.32544, + "grad_norm": 1.0295419692993164, + "learning_rate": 2.606842737094838e-05, + "loss": 0.5261, + "step": 11973 + }, + { + "epoch": 15.32672, + "grad_norm": 0.9861733913421631, + "learning_rate": 2.6066426570628254e-05, + "loss": 0.5237, + "step": 11974 + }, + { + "epoch": 15.328, + "grad_norm": 0.9513747692108154, + "learning_rate": 2.6064425770308126e-05, + "loss": 0.5107, + "step": 11975 + }, + { + "epoch": 15.32928, + "grad_norm": 0.9383821487426758, + "learning_rate": 2.6062424969987998e-05, + "loss": 0.5109, + "step": 11976 + }, + { + "epoch": 15.33056, + "grad_norm": 0.9647513628005981, + "learning_rate": 2.606042416966787e-05, + "loss": 0.5326, + "step": 11977 + }, + { + "epoch": 15.33184, + "grad_norm": 1.0194227695465088, + "learning_rate": 2.6058423369347738e-05, + "loss": 0.5523, + "step": 11978 + }, + { + "epoch": 15.33312, + "grad_norm": 0.9552738070487976, + "learning_rate": 2.605642256902761e-05, + "loss": 0.494, + "step": 11979 + }, + { + "epoch": 15.3344, + "grad_norm": 0.9982839822769165, + "learning_rate": 2.6054421768707482e-05, + "loss": 0.5138, + "step": 11980 + }, + { + "epoch": 15.33568, + "grad_norm": 0.926050066947937, + "learning_rate": 2.6052420968387357e-05, + "loss": 0.4827, + "step": 11981 + }, + { + "epoch": 15.33696, + "grad_norm": 0.9754160046577454, + "learning_rate": 2.605042016806723e-05, + "loss": 0.4864, + "step": 11982 + }, + { + "epoch": 15.33824, + "grad_norm": 1.0375244617462158, + "learning_rate": 2.60484193677471e-05, + "loss": 0.5324, + "step": 11983 + }, + { + "epoch": 15.33952, + "grad_norm": 0.998441755771637, + "learning_rate": 2.6046418567426973e-05, + "loss": 0.5374, + "step": 11984 + }, + { + "epoch": 15.3408, + "grad_norm": 0.9597004055976868, + "learning_rate": 2.6044417767106845e-05, + 
"loss": 0.5497, + "step": 11985 + }, + { + "epoch": 15.34208, + "grad_norm": 0.984380304813385, + "learning_rate": 2.6042416966786713e-05, + "loss": 0.4974, + "step": 11986 + }, + { + "epoch": 15.34336, + "grad_norm": 1.0443543195724487, + "learning_rate": 2.6040416166466585e-05, + "loss": 0.5094, + "step": 11987 + }, + { + "epoch": 15.34464, + "grad_norm": 1.033092737197876, + "learning_rate": 2.6038415366146463e-05, + "loss": 0.5502, + "step": 11988 + }, + { + "epoch": 15.34592, + "grad_norm": 1.0299837589263916, + "learning_rate": 2.6036414565826332e-05, + "loss": 0.5757, + "step": 11989 + }, + { + "epoch": 15.3472, + "grad_norm": 0.9337131381034851, + "learning_rate": 2.6034413765506204e-05, + "loss": 0.4797, + "step": 11990 + }, + { + "epoch": 15.34848, + "grad_norm": 0.9382844567298889, + "learning_rate": 2.6032412965186076e-05, + "loss": 0.4631, + "step": 11991 + }, + { + "epoch": 15.34976, + "grad_norm": 0.9595440626144409, + "learning_rate": 2.6030412164865948e-05, + "loss": 0.5302, + "step": 11992 + }, + { + "epoch": 15.35104, + "grad_norm": 0.9686310887336731, + "learning_rate": 2.602841136454582e-05, + "loss": 0.5061, + "step": 11993 + }, + { + "epoch": 15.35232, + "grad_norm": 0.9132741093635559, + "learning_rate": 2.6026410564225688e-05, + "loss": 0.4801, + "step": 11994 + }, + { + "epoch": 15.3536, + "grad_norm": 0.9716625213623047, + "learning_rate": 2.6024409763905566e-05, + "loss": 0.5035, + "step": 11995 + }, + { + "epoch": 15.35488, + "grad_norm": 1.003369927406311, + "learning_rate": 2.602240896358544e-05, + "loss": 0.547, + "step": 11996 + }, + { + "epoch": 15.35616, + "grad_norm": 1.0026856660842896, + "learning_rate": 2.6020408163265307e-05, + "loss": 0.547, + "step": 11997 + }, + { + "epoch": 15.35744, + "grad_norm": 0.9878264665603638, + "learning_rate": 2.601840736294518e-05, + "loss": 0.5284, + "step": 11998 + }, + { + "epoch": 15.35872, + "grad_norm": 1.0457453727722168, + "learning_rate": 2.601640656262505e-05, + "loss": 0.5354, + 
"step": 11999 + }, + { + "epoch": 15.36, + "grad_norm": 1.0366843938827515, + "learning_rate": 2.6014405762304922e-05, + "loss": 0.5955, + "step": 12000 + }, + { + "epoch": 15.36128, + "grad_norm": 0.9786233901977539, + "learning_rate": 2.6012404961984794e-05, + "loss": 0.521, + "step": 12001 + }, + { + "epoch": 15.36256, + "grad_norm": 1.0292565822601318, + "learning_rate": 2.601040416166467e-05, + "loss": 0.5014, + "step": 12002 + }, + { + "epoch": 15.36384, + "grad_norm": 1.0155521631240845, + "learning_rate": 2.600840336134454e-05, + "loss": 0.5361, + "step": 12003 + }, + { + "epoch": 15.36512, + "grad_norm": 0.9816110134124756, + "learning_rate": 2.6006402561024413e-05, + "loss": 0.5233, + "step": 12004 + }, + { + "epoch": 15.3664, + "grad_norm": 0.9563271999359131, + "learning_rate": 2.600440176070428e-05, + "loss": 0.5141, + "step": 12005 + }, + { + "epoch": 15.36768, + "grad_norm": 1.029533863067627, + "learning_rate": 2.6002400960384154e-05, + "loss": 0.5331, + "step": 12006 + }, + { + "epoch": 15.36896, + "grad_norm": 1.0046855211257935, + "learning_rate": 2.6000400160064025e-05, + "loss": 0.535, + "step": 12007 + }, + { + "epoch": 15.37024, + "grad_norm": 0.9804155826568604, + "learning_rate": 2.5998399359743897e-05, + "loss": 0.513, + "step": 12008 + }, + { + "epoch": 15.37152, + "grad_norm": 0.9942843914031982, + "learning_rate": 2.599639855942377e-05, + "loss": 0.5471, + "step": 12009 + }, + { + "epoch": 15.3728, + "grad_norm": 1.0089268684387207, + "learning_rate": 2.5994397759103644e-05, + "loss": 0.5792, + "step": 12010 + }, + { + "epoch": 15.37408, + "grad_norm": 0.9854798913002014, + "learning_rate": 2.5992396958783516e-05, + "loss": 0.5647, + "step": 12011 + }, + { + "epoch": 15.37536, + "grad_norm": 1.0184553861618042, + "learning_rate": 2.5990396158463388e-05, + "loss": 0.4929, + "step": 12012 + }, + { + "epoch": 15.37664, + "grad_norm": 0.9660348296165466, + "learning_rate": 2.5988395358143257e-05, + "loss": 0.5246, + "step": 12013 + }, + { + 
"epoch": 15.37792, + "grad_norm": 0.9935752153396606, + "learning_rate": 2.598639455782313e-05, + "loss": 0.5349, + "step": 12014 + }, + { + "epoch": 15.3792, + "grad_norm": 0.953775942325592, + "learning_rate": 2.5984393757503e-05, + "loss": 0.5124, + "step": 12015 + }, + { + "epoch": 15.38048, + "grad_norm": 0.9608764052391052, + "learning_rate": 2.5982392957182872e-05, + "loss": 0.5077, + "step": 12016 + }, + { + "epoch": 15.38176, + "grad_norm": 0.9942414164543152, + "learning_rate": 2.5980392156862747e-05, + "loss": 0.5343, + "step": 12017 + }, + { + "epoch": 15.38304, + "grad_norm": 0.9368318915367126, + "learning_rate": 2.597839135654262e-05, + "loss": 0.5105, + "step": 12018 + }, + { + "epoch": 15.38432, + "grad_norm": 1.0121771097183228, + "learning_rate": 2.597639055622249e-05, + "loss": 0.5283, + "step": 12019 + }, + { + "epoch": 15.3856, + "grad_norm": 1.025707721710205, + "learning_rate": 2.5974389755902363e-05, + "loss": 0.5341, + "step": 12020 + }, + { + "epoch": 15.38688, + "grad_norm": 1.002144694328308, + "learning_rate": 2.597238895558223e-05, + "loss": 0.5056, + "step": 12021 + }, + { + "epoch": 15.38816, + "grad_norm": 0.9893521070480347, + "learning_rate": 2.5970388155262103e-05, + "loss": 0.5587, + "step": 12022 + }, + { + "epoch": 15.38944, + "grad_norm": 0.9755342602729797, + "learning_rate": 2.5968387354941975e-05, + "loss": 0.5413, + "step": 12023 + }, + { + "epoch": 15.39072, + "grad_norm": 0.972171425819397, + "learning_rate": 2.5966386554621854e-05, + "loss": 0.5081, + "step": 12024 + }, + { + "epoch": 15.392, + "grad_norm": 1.0347623825073242, + "learning_rate": 2.5964385754301722e-05, + "loss": 0.5589, + "step": 12025 + }, + { + "epoch": 15.39328, + "grad_norm": 0.9687240123748779, + "learning_rate": 2.5962384953981594e-05, + "loss": 0.5451, + "step": 12026 + }, + { + "epoch": 15.39456, + "grad_norm": 0.9607928991317749, + "learning_rate": 2.5960384153661466e-05, + "loss": 0.5309, + "step": 12027 + }, + { + "epoch": 15.39584, + 
"grad_norm": 0.9592061042785645, + "learning_rate": 2.5958383353341338e-05, + "loss": 0.5466, + "step": 12028 + }, + { + "epoch": 15.39712, + "grad_norm": 1.015899419784546, + "learning_rate": 2.5956382553021206e-05, + "loss": 0.5245, + "step": 12029 + }, + { + "epoch": 15.3984, + "grad_norm": 0.9896045327186584, + "learning_rate": 2.5954381752701078e-05, + "loss": 0.5316, + "step": 12030 + }, + { + "epoch": 15.39968, + "grad_norm": 0.9522262811660767, + "learning_rate": 2.5952380952380957e-05, + "loss": 0.5192, + "step": 12031 + }, + { + "epoch": 15.40096, + "grad_norm": 0.987667441368103, + "learning_rate": 2.595038015206083e-05, + "loss": 0.5136, + "step": 12032 + }, + { + "epoch": 15.40224, + "grad_norm": 1.0220928192138672, + "learning_rate": 2.5948379351740697e-05, + "loss": 0.57, + "step": 12033 + }, + { + "epoch": 15.40352, + "grad_norm": 1.0101734399795532, + "learning_rate": 2.594637855142057e-05, + "loss": 0.5165, + "step": 12034 + }, + { + "epoch": 15.4048, + "grad_norm": 0.9381194114685059, + "learning_rate": 2.594437775110044e-05, + "loss": 0.4597, + "step": 12035 + }, + { + "epoch": 15.40608, + "grad_norm": 1.0812232494354248, + "learning_rate": 2.5942376950780313e-05, + "loss": 0.5632, + "step": 12036 + }, + { + "epoch": 15.40736, + "grad_norm": 1.0491623878479004, + "learning_rate": 2.594037615046018e-05, + "loss": 0.538, + "step": 12037 + }, + { + "epoch": 15.40864, + "grad_norm": 0.9841446876525879, + "learning_rate": 2.593837535014006e-05, + "loss": 0.5059, + "step": 12038 + }, + { + "epoch": 15.40992, + "grad_norm": 0.9881943464279175, + "learning_rate": 2.593637454981993e-05, + "loss": 0.5154, + "step": 12039 + }, + { + "epoch": 15.411200000000001, + "grad_norm": 1.0402194261550903, + "learning_rate": 2.5934373749499803e-05, + "loss": 0.5756, + "step": 12040 + }, + { + "epoch": 15.41248, + "grad_norm": 1.095178246498108, + "learning_rate": 2.5932372949179672e-05, + "loss": 0.5601, + "step": 12041 + }, + { + "epoch": 15.41376, + "grad_norm": 
1.0817269086837769, + "learning_rate": 2.5930372148859544e-05, + "loss": 0.5595, + "step": 12042 + }, + { + "epoch": 15.41504, + "grad_norm": 0.9906639456748962, + "learning_rate": 2.5928371348539416e-05, + "loss": 0.5067, + "step": 12043 + }, + { + "epoch": 15.41632, + "grad_norm": 0.9647719860076904, + "learning_rate": 2.5926370548219287e-05, + "loss": 0.4848, + "step": 12044 + }, + { + "epoch": 15.4176, + "grad_norm": 0.9558652639389038, + "learning_rate": 2.5924369747899163e-05, + "loss": 0.5381, + "step": 12045 + }, + { + "epoch": 15.41888, + "grad_norm": 0.9958375692367554, + "learning_rate": 2.5922368947579035e-05, + "loss": 0.5672, + "step": 12046 + }, + { + "epoch": 15.42016, + "grad_norm": 0.980514645576477, + "learning_rate": 2.5920368147258906e-05, + "loss": 0.5479, + "step": 12047 + }, + { + "epoch": 15.42144, + "grad_norm": 1.041264533996582, + "learning_rate": 2.5918367346938778e-05, + "loss": 0.5433, + "step": 12048 + }, + { + "epoch": 15.42272, + "grad_norm": 0.9948632121086121, + "learning_rate": 2.5916366546618647e-05, + "loss": 0.5445, + "step": 12049 + }, + { + "epoch": 15.424, + "grad_norm": 0.9686235189437866, + "learning_rate": 2.591436574629852e-05, + "loss": 0.5375, + "step": 12050 + }, + { + "epoch": 15.42528, + "grad_norm": 0.9626991748809814, + "learning_rate": 2.591236494597839e-05, + "loss": 0.5071, + "step": 12051 + }, + { + "epoch": 15.42656, + "grad_norm": 0.9826670289039612, + "learning_rate": 2.5910364145658266e-05, + "loss": 0.52, + "step": 12052 + }, + { + "epoch": 15.42784, + "grad_norm": 0.9592919945716858, + "learning_rate": 2.5908363345338138e-05, + "loss": 0.5125, + "step": 12053 + }, + { + "epoch": 15.42912, + "grad_norm": 0.9787027835845947, + "learning_rate": 2.590636254501801e-05, + "loss": 0.5241, + "step": 12054 + }, + { + "epoch": 15.4304, + "grad_norm": 0.9847190380096436, + "learning_rate": 2.590436174469788e-05, + "loss": 0.4971, + "step": 12055 + }, + { + "epoch": 15.43168, + "grad_norm": 0.9840123653411865, + 
"learning_rate": 2.5902360944377753e-05, + "loss": 0.55, + "step": 12056 + }, + { + "epoch": 15.43296, + "grad_norm": 0.9877153635025024, + "learning_rate": 2.590036014405762e-05, + "loss": 0.4923, + "step": 12057 + }, + { + "epoch": 15.43424, + "grad_norm": 0.9981076717376709, + "learning_rate": 2.5898359343737493e-05, + "loss": 0.5311, + "step": 12058 + }, + { + "epoch": 15.43552, + "grad_norm": 1.1250180006027222, + "learning_rate": 2.5896358543417372e-05, + "loss": 0.5702, + "step": 12059 + }, + { + "epoch": 15.4368, + "grad_norm": 1.031806230545044, + "learning_rate": 2.589435774309724e-05, + "loss": 0.568, + "step": 12060 + }, + { + "epoch": 15.43808, + "grad_norm": 0.9630105495452881, + "learning_rate": 2.5892356942777112e-05, + "loss": 0.4828, + "step": 12061 + }, + { + "epoch": 15.43936, + "grad_norm": 1.0042251348495483, + "learning_rate": 2.5890356142456984e-05, + "loss": 0.4779, + "step": 12062 + }, + { + "epoch": 15.44064, + "grad_norm": 0.9796602725982666, + "learning_rate": 2.5888355342136856e-05, + "loss": 0.4996, + "step": 12063 + }, + { + "epoch": 15.44192, + "grad_norm": 1.0524052381515503, + "learning_rate": 2.5886354541816728e-05, + "loss": 0.5759, + "step": 12064 + }, + { + "epoch": 15.4432, + "grad_norm": 1.0064433813095093, + "learning_rate": 2.5884353741496596e-05, + "loss": 0.4933, + "step": 12065 + }, + { + "epoch": 15.44448, + "grad_norm": 0.9477246403694153, + "learning_rate": 2.5882352941176475e-05, + "loss": 0.5054, + "step": 12066 + }, + { + "epoch": 15.44576, + "grad_norm": 0.9989529848098755, + "learning_rate": 2.5880352140856347e-05, + "loss": 0.5503, + "step": 12067 + }, + { + "epoch": 15.44704, + "grad_norm": 1.0528007745742798, + "learning_rate": 2.5878351340536215e-05, + "loss": 0.5832, + "step": 12068 + }, + { + "epoch": 15.44832, + "grad_norm": 0.9893178939819336, + "learning_rate": 2.5876350540216087e-05, + "loss": 0.4902, + "step": 12069 + }, + { + "epoch": 15.4496, + "grad_norm": 0.9771265387535095, + "learning_rate": 
2.587434973989596e-05, + "loss": 0.531, + "step": 12070 + }, + { + "epoch": 15.45088, + "grad_norm": 1.0311421155929565, + "learning_rate": 2.587234893957583e-05, + "loss": 0.6119, + "step": 12071 + }, + { + "epoch": 15.45216, + "grad_norm": 0.9262276291847229, + "learning_rate": 2.5870348139255703e-05, + "loss": 0.5396, + "step": 12072 + }, + { + "epoch": 15.45344, + "grad_norm": 0.9662413001060486, + "learning_rate": 2.5868347338935578e-05, + "loss": 0.5006, + "step": 12073 + }, + { + "epoch": 15.45472, + "grad_norm": 1.0002639293670654, + "learning_rate": 2.586634653861545e-05, + "loss": 0.5263, + "step": 12074 + }, + { + "epoch": 15.456, + "grad_norm": 1.010391354560852, + "learning_rate": 2.5864345738295322e-05, + "loss": 0.5296, + "step": 12075 + }, + { + "epoch": 15.45728, + "grad_norm": 0.9372914433479309, + "learning_rate": 2.586234493797519e-05, + "loss": 0.4601, + "step": 12076 + }, + { + "epoch": 15.45856, + "grad_norm": 0.9640786647796631, + "learning_rate": 2.5860344137655062e-05, + "loss": 0.5049, + "step": 12077 + }, + { + "epoch": 15.45984, + "grad_norm": 0.9395653009414673, + "learning_rate": 2.5858343337334934e-05, + "loss": 0.4935, + "step": 12078 + }, + { + "epoch": 15.46112, + "grad_norm": 0.9529640078544617, + "learning_rate": 2.5856342537014806e-05, + "loss": 0.5194, + "step": 12079 + }, + { + "epoch": 15.4624, + "grad_norm": 0.9985076189041138, + "learning_rate": 2.585434173669468e-05, + "loss": 0.5531, + "step": 12080 + }, + { + "epoch": 15.46368, + "grad_norm": 0.924523115158081, + "learning_rate": 2.5852340936374553e-05, + "loss": 0.4675, + "step": 12081 + }, + { + "epoch": 15.46496, + "grad_norm": 1.0334564447402954, + "learning_rate": 2.5850340136054425e-05, + "loss": 0.5463, + "step": 12082 + }, + { + "epoch": 15.466239999999999, + "grad_norm": 0.9668273329734802, + "learning_rate": 2.5848339335734297e-05, + "loss": 0.5158, + "step": 12083 + }, + { + "epoch": 15.46752, + "grad_norm": 1.0096427202224731, + "learning_rate": 
2.5846338535414165e-05, + "loss": 0.5181, + "step": 12084 + }, + { + "epoch": 15.4688, + "grad_norm": 0.9349405169487, + "learning_rate": 2.5844337735094037e-05, + "loss": 0.4826, + "step": 12085 + }, + { + "epoch": 15.47008, + "grad_norm": 0.9573792219161987, + "learning_rate": 2.584233693477391e-05, + "loss": 0.4755, + "step": 12086 + }, + { + "epoch": 15.47136, + "grad_norm": 1.014906644821167, + "learning_rate": 2.5840336134453784e-05, + "loss": 0.515, + "step": 12087 + }, + { + "epoch": 15.47264, + "grad_norm": 0.9619317054748535, + "learning_rate": 2.5838335334133656e-05, + "loss": 0.5234, + "step": 12088 + }, + { + "epoch": 15.47392, + "grad_norm": 0.9710426330566406, + "learning_rate": 2.5836334533813528e-05, + "loss": 0.4808, + "step": 12089 + }, + { + "epoch": 15.4752, + "grad_norm": 1.0196417570114136, + "learning_rate": 2.58343337334934e-05, + "loss": 0.5433, + "step": 12090 + }, + { + "epoch": 15.47648, + "grad_norm": 0.9249922037124634, + "learning_rate": 2.583233293317327e-05, + "loss": 0.4858, + "step": 12091 + }, + { + "epoch": 15.47776, + "grad_norm": 1.008486032485962, + "learning_rate": 2.583033213285314e-05, + "loss": 0.5453, + "step": 12092 + }, + { + "epoch": 15.47904, + "grad_norm": 0.9991050362586975, + "learning_rate": 2.5828331332533012e-05, + "loss": 0.5167, + "step": 12093 + }, + { + "epoch": 15.48032, + "grad_norm": 0.9885373115539551, + "learning_rate": 2.582633053221289e-05, + "loss": 0.5235, + "step": 12094 + }, + { + "epoch": 15.4816, + "grad_norm": 0.9774200320243835, + "learning_rate": 2.582432973189276e-05, + "loss": 0.4835, + "step": 12095 + }, + { + "epoch": 15.48288, + "grad_norm": 0.9708588123321533, + "learning_rate": 2.582232893157263e-05, + "loss": 0.5181, + "step": 12096 + }, + { + "epoch": 15.48416, + "grad_norm": 1.0187863111495972, + "learning_rate": 2.5820328131252503e-05, + "loss": 0.5702, + "step": 12097 + }, + { + "epoch": 15.48544, + "grad_norm": 0.9514103531837463, + "learning_rate": 2.5818327330932375e-05, + 
"loss": 0.4906, + "step": 12098 + }, + { + "epoch": 15.48672, + "grad_norm": 0.965501606464386, + "learning_rate": 2.5816326530612246e-05, + "loss": 0.5514, + "step": 12099 + }, + { + "epoch": 15.488, + "grad_norm": 1.0036544799804688, + "learning_rate": 2.5814325730292115e-05, + "loss": 0.4937, + "step": 12100 + }, + { + "epoch": 15.48928, + "grad_norm": 1.047786831855774, + "learning_rate": 2.5812324929971993e-05, + "loss": 0.535, + "step": 12101 + }, + { + "epoch": 15.49056, + "grad_norm": 1.0028244256973267, + "learning_rate": 2.5810324129651865e-05, + "loss": 0.5845, + "step": 12102 + }, + { + "epoch": 15.49184, + "grad_norm": 0.9661297798156738, + "learning_rate": 2.5808323329331734e-05, + "loss": 0.4615, + "step": 12103 + }, + { + "epoch": 15.49312, + "grad_norm": 0.9932764172554016, + "learning_rate": 2.5806322529011606e-05, + "loss": 0.4964, + "step": 12104 + }, + { + "epoch": 15.4944, + "grad_norm": 0.9625186920166016, + "learning_rate": 2.5804321728691478e-05, + "loss": 0.5339, + "step": 12105 + }, + { + "epoch": 15.49568, + "grad_norm": 0.9661415219306946, + "learning_rate": 2.580232092837135e-05, + "loss": 0.5432, + "step": 12106 + }, + { + "epoch": 15.49696, + "grad_norm": 1.0386898517608643, + "learning_rate": 2.580032012805122e-05, + "loss": 0.5427, + "step": 12107 + }, + { + "epoch": 15.49824, + "grad_norm": 0.9474533200263977, + "learning_rate": 2.5798319327731096e-05, + "loss": 0.5278, + "step": 12108 + }, + { + "epoch": 15.49952, + "grad_norm": 1.0017415285110474, + "learning_rate": 2.579631852741097e-05, + "loss": 0.5476, + "step": 12109 + }, + { + "epoch": 15.5008, + "grad_norm": 1.017276406288147, + "learning_rate": 2.579431772709084e-05, + "loss": 0.5015, + "step": 12110 + }, + { + "epoch": 15.50208, + "grad_norm": 1.0097792148590088, + "learning_rate": 2.579231692677071e-05, + "loss": 0.5492, + "step": 12111 + }, + { + "epoch": 15.50336, + "grad_norm": 0.9869034290313721, + "learning_rate": 2.579031612645058e-05, + "loss": 0.535, + "step": 
12112 + }, + { + "epoch": 15.50464, + "grad_norm": 1.0194756984710693, + "learning_rate": 2.5788315326130452e-05, + "loss": 0.5019, + "step": 12113 + }, + { + "epoch": 15.50592, + "grad_norm": 1.0005152225494385, + "learning_rate": 2.5786314525810324e-05, + "loss": 0.525, + "step": 12114 + }, + { + "epoch": 15.5072, + "grad_norm": 0.9427708387374878, + "learning_rate": 2.57843137254902e-05, + "loss": 0.5217, + "step": 12115 + }, + { + "epoch": 15.50848, + "grad_norm": 0.9799354672431946, + "learning_rate": 2.578231292517007e-05, + "loss": 0.4981, + "step": 12116 + }, + { + "epoch": 15.50976, + "grad_norm": 0.9807402491569519, + "learning_rate": 2.5780312124849943e-05, + "loss": 0.5026, + "step": 12117 + }, + { + "epoch": 15.51104, + "grad_norm": 0.9583036303520203, + "learning_rate": 2.5778311324529815e-05, + "loss": 0.5072, + "step": 12118 + }, + { + "epoch": 15.51232, + "grad_norm": 1.0171658992767334, + "learning_rate": 2.5776310524209684e-05, + "loss": 0.5592, + "step": 12119 + }, + { + "epoch": 15.5136, + "grad_norm": 0.9614086747169495, + "learning_rate": 2.5774309723889555e-05, + "loss": 0.4961, + "step": 12120 + }, + { + "epoch": 15.51488, + "grad_norm": 0.9986427426338196, + "learning_rate": 2.5772308923569427e-05, + "loss": 0.4993, + "step": 12121 + }, + { + "epoch": 15.51616, + "grad_norm": 0.9052167534828186, + "learning_rate": 2.57703081232493e-05, + "loss": 0.4783, + "step": 12122 + }, + { + "epoch": 15.51744, + "grad_norm": 0.9978095889091492, + "learning_rate": 2.5768307322929174e-05, + "loss": 0.5733, + "step": 12123 + }, + { + "epoch": 15.51872, + "grad_norm": 0.9685370326042175, + "learning_rate": 2.5766306522609046e-05, + "loss": 0.51, + "step": 12124 + }, + { + "epoch": 15.52, + "grad_norm": 1.008240818977356, + "learning_rate": 2.5764305722288918e-05, + "loss": 0.5129, + "step": 12125 + }, + { + "epoch": 15.52128, + "grad_norm": 0.9818560481071472, + "learning_rate": 2.576230492196879e-05, + "loss": 0.5501, + "step": 12126 + }, + { + "epoch": 
15.52256, + "grad_norm": 0.9996564984321594, + "learning_rate": 2.576030412164866e-05, + "loss": 0.4986, + "step": 12127 + }, + { + "epoch": 15.52384, + "grad_norm": 1.0474154949188232, + "learning_rate": 2.575830332132853e-05, + "loss": 0.5526, + "step": 12128 + }, + { + "epoch": 15.52512, + "grad_norm": 1.0156049728393555, + "learning_rate": 2.5756302521008402e-05, + "loss": 0.5273, + "step": 12129 + }, + { + "epoch": 15.5264, + "grad_norm": 0.9961782693862915, + "learning_rate": 2.5754301720688277e-05, + "loss": 0.491, + "step": 12130 + }, + { + "epoch": 15.52768, + "grad_norm": 1.0487329959869385, + "learning_rate": 2.575230092036815e-05, + "loss": 0.5798, + "step": 12131 + }, + { + "epoch": 15.52896, + "grad_norm": 0.9667060971260071, + "learning_rate": 2.575030012004802e-05, + "loss": 0.5385, + "step": 12132 + }, + { + "epoch": 15.53024, + "grad_norm": 0.9694793820381165, + "learning_rate": 2.5748299319727893e-05, + "loss": 0.5248, + "step": 12133 + }, + { + "epoch": 15.53152, + "grad_norm": 0.9100515246391296, + "learning_rate": 2.5746298519407765e-05, + "loss": 0.4769, + "step": 12134 + }, + { + "epoch": 15.5328, + "grad_norm": 0.9730302095413208, + "learning_rate": 2.5744297719087633e-05, + "loss": 0.5182, + "step": 12135 + }, + { + "epoch": 15.53408, + "grad_norm": 0.9823408126831055, + "learning_rate": 2.5742296918767505e-05, + "loss": 0.5195, + "step": 12136 + }, + { + "epoch": 15.53536, + "grad_norm": 1.023830771446228, + "learning_rate": 2.5740296118447384e-05, + "loss": 0.5501, + "step": 12137 + }, + { + "epoch": 15.53664, + "grad_norm": 0.9773147106170654, + "learning_rate": 2.5738295318127252e-05, + "loss": 0.5068, + "step": 12138 + }, + { + "epoch": 15.53792, + "grad_norm": 0.9705550074577332, + "learning_rate": 2.5736294517807124e-05, + "loss": 0.5073, + "step": 12139 + }, + { + "epoch": 15.5392, + "grad_norm": 0.9751460552215576, + "learning_rate": 2.5734293717486996e-05, + "loss": 0.5036, + "step": 12140 + }, + { + "epoch": 15.54048, + 
"grad_norm": 0.9804943799972534, + "learning_rate": 2.5732292917166868e-05, + "loss": 0.4845, + "step": 12141 + }, + { + "epoch": 15.54176, + "grad_norm": 1.020609974861145, + "learning_rate": 2.573029211684674e-05, + "loss": 0.5516, + "step": 12142 + }, + { + "epoch": 15.54304, + "grad_norm": 0.9483124017715454, + "learning_rate": 2.5728291316526608e-05, + "loss": 0.4902, + "step": 12143 + }, + { + "epoch": 15.54432, + "grad_norm": 1.0113613605499268, + "learning_rate": 2.5726290516206487e-05, + "loss": 0.5527, + "step": 12144 + }, + { + "epoch": 15.5456, + "grad_norm": 1.037793755531311, + "learning_rate": 2.572428971588636e-05, + "loss": 0.543, + "step": 12145 + }, + { + "epoch": 15.54688, + "grad_norm": 1.0010935068130493, + "learning_rate": 2.5722288915566227e-05, + "loss": 0.5157, + "step": 12146 + }, + { + "epoch": 15.54816, + "grad_norm": 0.9994384050369263, + "learning_rate": 2.57202881152461e-05, + "loss": 0.5223, + "step": 12147 + }, + { + "epoch": 15.54944, + "grad_norm": 0.9662322998046875, + "learning_rate": 2.571828731492597e-05, + "loss": 0.5161, + "step": 12148 + }, + { + "epoch": 15.55072, + "grad_norm": 1.0219558477401733, + "learning_rate": 2.5716286514605843e-05, + "loss": 0.534, + "step": 12149 + }, + { + "epoch": 15.552, + "grad_norm": 0.9742558002471924, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.5151, + "step": 12150 + }, + { + "epoch": 15.55328, + "grad_norm": 1.0059118270874023, + "learning_rate": 2.571228491396559e-05, + "loss": 0.5637, + "step": 12151 + }, + { + "epoch": 15.55456, + "grad_norm": 0.9485430121421814, + "learning_rate": 2.571028411364546e-05, + "loss": 0.4919, + "step": 12152 + }, + { + "epoch": 15.55584, + "grad_norm": 0.9600156545639038, + "learning_rate": 2.5708283313325333e-05, + "loss": 0.529, + "step": 12153 + }, + { + "epoch": 15.55712, + "grad_norm": 0.9811621308326721, + "learning_rate": 2.5706282513005202e-05, + "loss": 0.5033, + "step": 12154 + }, + { + "epoch": 15.5584, + "grad_norm": 
1.0301507711410522, + "learning_rate": 2.5704281712685074e-05, + "loss": 0.501, + "step": 12155 + }, + { + "epoch": 15.55968, + "grad_norm": 0.9439396858215332, + "learning_rate": 2.5702280912364946e-05, + "loss": 0.5265, + "step": 12156 + }, + { + "epoch": 15.56096, + "grad_norm": 1.0617479085922241, + "learning_rate": 2.5700280112044817e-05, + "loss": 0.5651, + "step": 12157 + }, + { + "epoch": 15.56224, + "grad_norm": 0.9912658333778381, + "learning_rate": 2.5698279311724693e-05, + "loss": 0.5087, + "step": 12158 + }, + { + "epoch": 15.56352, + "grad_norm": 0.9808444380760193, + "learning_rate": 2.5696278511404565e-05, + "loss": 0.5334, + "step": 12159 + }, + { + "epoch": 15.5648, + "grad_norm": 0.9171379804611206, + "learning_rate": 2.5694277711084436e-05, + "loss": 0.5009, + "step": 12160 + }, + { + "epoch": 15.56608, + "grad_norm": 0.9609277248382568, + "learning_rate": 2.5692276910764308e-05, + "loss": 0.5078, + "step": 12161 + }, + { + "epoch": 15.56736, + "grad_norm": 0.9825374484062195, + "learning_rate": 2.5690276110444177e-05, + "loss": 0.5374, + "step": 12162 + }, + { + "epoch": 15.56864, + "grad_norm": 0.9431901574134827, + "learning_rate": 2.568827531012405e-05, + "loss": 0.4915, + "step": 12163 + }, + { + "epoch": 15.56992, + "grad_norm": 1.0128616094589233, + "learning_rate": 2.568627450980392e-05, + "loss": 0.5483, + "step": 12164 + }, + { + "epoch": 15.5712, + "grad_norm": 1.0459868907928467, + "learning_rate": 2.5684273709483796e-05, + "loss": 0.5579, + "step": 12165 + }, + { + "epoch": 15.57248, + "grad_norm": 1.015820026397705, + "learning_rate": 2.5682272909163668e-05, + "loss": 0.5315, + "step": 12166 + }, + { + "epoch": 15.57376, + "grad_norm": 0.9853371977806091, + "learning_rate": 2.568027210884354e-05, + "loss": 0.513, + "step": 12167 + }, + { + "epoch": 15.57504, + "grad_norm": 0.9857305288314819, + "learning_rate": 2.567827130852341e-05, + "loss": 0.5006, + "step": 12168 + }, + { + "epoch": 15.57632, + "grad_norm": 1.0142581462860107, 
+ "learning_rate": 2.5676270508203283e-05, + "loss": 0.5309, + "step": 12169 + }, + { + "epoch": 15.5776, + "grad_norm": 0.9901478886604309, + "learning_rate": 2.567426970788315e-05, + "loss": 0.5302, + "step": 12170 + }, + { + "epoch": 15.57888, + "grad_norm": 0.9273139238357544, + "learning_rate": 2.5672268907563023e-05, + "loss": 0.4937, + "step": 12171 + }, + { + "epoch": 15.58016, + "grad_norm": 0.984489917755127, + "learning_rate": 2.5670268107242902e-05, + "loss": 0.547, + "step": 12172 + }, + { + "epoch": 15.58144, + "grad_norm": 1.0063872337341309, + "learning_rate": 2.566826730692277e-05, + "loss": 0.5502, + "step": 12173 + }, + { + "epoch": 15.58272, + "grad_norm": 1.0095993280410767, + "learning_rate": 2.5666266506602642e-05, + "loss": 0.5082, + "step": 12174 + }, + { + "epoch": 15.584, + "grad_norm": 0.9566926956176758, + "learning_rate": 2.5664265706282514e-05, + "loss": 0.5288, + "step": 12175 + }, + { + "epoch": 15.585280000000001, + "grad_norm": 0.9695842862129211, + "learning_rate": 2.5662264905962386e-05, + "loss": 0.5461, + "step": 12176 + }, + { + "epoch": 15.58656, + "grad_norm": 0.9567855000495911, + "learning_rate": 2.5660264105642258e-05, + "loss": 0.5013, + "step": 12177 + }, + { + "epoch": 15.58784, + "grad_norm": 0.983958899974823, + "learning_rate": 2.5658263305322126e-05, + "loss": 0.5322, + "step": 12178 + }, + { + "epoch": 15.58912, + "grad_norm": 1.0091723203659058, + "learning_rate": 2.5656262505002005e-05, + "loss": 0.5172, + "step": 12179 + }, + { + "epoch": 15.5904, + "grad_norm": 1.0233546495437622, + "learning_rate": 2.5654261704681877e-05, + "loss": 0.5247, + "step": 12180 + }, + { + "epoch": 15.59168, + "grad_norm": 0.9641100764274597, + "learning_rate": 2.5652260904361745e-05, + "loss": 0.5123, + "step": 12181 + }, + { + "epoch": 15.59296, + "grad_norm": 0.9538204669952393, + "learning_rate": 2.5650260104041617e-05, + "loss": 0.5048, + "step": 12182 + }, + { + "epoch": 15.59424, + "grad_norm": 1.0550053119659424, + 
"learning_rate": 2.564825930372149e-05, + "loss": 0.5824, + "step": 12183 + }, + { + "epoch": 15.59552, + "grad_norm": 0.9931796193122864, + "learning_rate": 2.564625850340136e-05, + "loss": 0.526, + "step": 12184 + }, + { + "epoch": 15.5968, + "grad_norm": 1.0209304094314575, + "learning_rate": 2.5644257703081233e-05, + "loss": 0.5275, + "step": 12185 + }, + { + "epoch": 15.59808, + "grad_norm": 0.9978035688400269, + "learning_rate": 2.5642256902761108e-05, + "loss": 0.5207, + "step": 12186 + }, + { + "epoch": 15.59936, + "grad_norm": 1.0060955286026, + "learning_rate": 2.564025610244098e-05, + "loss": 0.5259, + "step": 12187 + }, + { + "epoch": 15.60064, + "grad_norm": 1.0725250244140625, + "learning_rate": 2.5638255302120852e-05, + "loss": 0.5654, + "step": 12188 + }, + { + "epoch": 15.60192, + "grad_norm": 1.0049766302108765, + "learning_rate": 2.563625450180072e-05, + "loss": 0.5537, + "step": 12189 + }, + { + "epoch": 15.6032, + "grad_norm": 0.9810140132904053, + "learning_rate": 2.5634253701480592e-05, + "loss": 0.4873, + "step": 12190 + }, + { + "epoch": 15.60448, + "grad_norm": 0.9922715425491333, + "learning_rate": 2.5632252901160464e-05, + "loss": 0.5372, + "step": 12191 + }, + { + "epoch": 15.60576, + "grad_norm": 0.9921250939369202, + "learning_rate": 2.5630252100840336e-05, + "loss": 0.523, + "step": 12192 + }, + { + "epoch": 15.60704, + "grad_norm": 1.0218485593795776, + "learning_rate": 2.562825130052021e-05, + "loss": 0.5281, + "step": 12193 + }, + { + "epoch": 15.608319999999999, + "grad_norm": 1.0210494995117188, + "learning_rate": 2.5626250500200083e-05, + "loss": 0.5154, + "step": 12194 + }, + { + "epoch": 15.6096, + "grad_norm": 1.0493415594100952, + "learning_rate": 2.5624249699879955e-05, + "loss": 0.5801, + "step": 12195 + }, + { + "epoch": 15.61088, + "grad_norm": 0.95387864112854, + "learning_rate": 2.5622248899559827e-05, + "loss": 0.5033, + "step": 12196 + }, + { + "epoch": 15.61216, + "grad_norm": 0.9641955494880676, + "learning_rate": 
2.5620248099239695e-05, + "loss": 0.521, + "step": 12197 + }, + { + "epoch": 15.61344, + "grad_norm": 0.979554295539856, + "learning_rate": 2.5618247298919567e-05, + "loss": 0.5124, + "step": 12198 + }, + { + "epoch": 15.61472, + "grad_norm": 1.063732385635376, + "learning_rate": 2.561624649859944e-05, + "loss": 0.5712, + "step": 12199 + }, + { + "epoch": 15.616, + "grad_norm": 1.0626933574676514, + "learning_rate": 2.5614245698279314e-05, + "loss": 0.5761, + "step": 12200 + }, + { + "epoch": 15.617280000000001, + "grad_norm": 1.0086123943328857, + "learning_rate": 2.5612244897959186e-05, + "loss": 0.5195, + "step": 12201 + }, + { + "epoch": 15.61856, + "grad_norm": 0.990342915058136, + "learning_rate": 2.5610244097639058e-05, + "loss": 0.5288, + "step": 12202 + }, + { + "epoch": 15.61984, + "grad_norm": 0.9492666125297546, + "learning_rate": 2.560824329731893e-05, + "loss": 0.4753, + "step": 12203 + }, + { + "epoch": 15.62112, + "grad_norm": 1.011853575706482, + "learning_rate": 2.56062424969988e-05, + "loss": 0.5095, + "step": 12204 + }, + { + "epoch": 15.6224, + "grad_norm": 1.003688931465149, + "learning_rate": 2.560424169667867e-05, + "loss": 0.4808, + "step": 12205 + }, + { + "epoch": 15.62368, + "grad_norm": 0.9985147714614868, + "learning_rate": 2.5602240896358542e-05, + "loss": 0.5343, + "step": 12206 + }, + { + "epoch": 15.62496, + "grad_norm": 0.9690064787864685, + "learning_rate": 2.560024009603842e-05, + "loss": 0.4995, + "step": 12207 + }, + { + "epoch": 15.62624, + "grad_norm": 0.9825969338417053, + "learning_rate": 2.559823929571829e-05, + "loss": 0.5231, + "step": 12208 + }, + { + "epoch": 15.62752, + "grad_norm": 0.995742917060852, + "learning_rate": 2.559623849539816e-05, + "loss": 0.5243, + "step": 12209 + }, + { + "epoch": 15.6288, + "grad_norm": 1.037103295326233, + "learning_rate": 2.5594237695078033e-05, + "loss": 0.5265, + "step": 12210 + }, + { + "epoch": 15.63008, + "grad_norm": 1.0141079425811768, + "learning_rate": 
2.5592236894757904e-05, + "loss": 0.5545, + "step": 12211 + }, + { + "epoch": 15.63136, + "grad_norm": 0.9893406629562378, + "learning_rate": 2.5590236094437776e-05, + "loss": 0.5033, + "step": 12212 + }, + { + "epoch": 15.63264, + "grad_norm": 1.0220900774002075, + "learning_rate": 2.5588235294117645e-05, + "loss": 0.5212, + "step": 12213 + }, + { + "epoch": 15.63392, + "grad_norm": 0.9686633348464966, + "learning_rate": 2.5586234493797523e-05, + "loss": 0.5521, + "step": 12214 + }, + { + "epoch": 15.6352, + "grad_norm": 1.001504898071289, + "learning_rate": 2.5584233693477395e-05, + "loss": 0.5756, + "step": 12215 + }, + { + "epoch": 15.63648, + "grad_norm": 1.0070838928222656, + "learning_rate": 2.5582232893157264e-05, + "loss": 0.5075, + "step": 12216 + }, + { + "epoch": 15.63776, + "grad_norm": 0.9980425834655762, + "learning_rate": 2.5580232092837136e-05, + "loss": 0.5527, + "step": 12217 + }, + { + "epoch": 15.63904, + "grad_norm": 0.9670035243034363, + "learning_rate": 2.5578231292517007e-05, + "loss": 0.49, + "step": 12218 + }, + { + "epoch": 15.64032, + "grad_norm": 0.9716927409172058, + "learning_rate": 2.557623049219688e-05, + "loss": 0.4885, + "step": 12219 + }, + { + "epoch": 15.6416, + "grad_norm": 0.9749883413314819, + "learning_rate": 2.557422969187675e-05, + "loss": 0.5409, + "step": 12220 + }, + { + "epoch": 15.64288, + "grad_norm": 0.9985603094100952, + "learning_rate": 2.5572228891556626e-05, + "loss": 0.5294, + "step": 12221 + }, + { + "epoch": 15.64416, + "grad_norm": 1.0051274299621582, + "learning_rate": 2.5570228091236498e-05, + "loss": 0.5685, + "step": 12222 + }, + { + "epoch": 15.64544, + "grad_norm": 1.0328760147094727, + "learning_rate": 2.556822729091637e-05, + "loss": 0.6053, + "step": 12223 + }, + { + "epoch": 15.64672, + "grad_norm": 1.0340243577957153, + "learning_rate": 2.556622649059624e-05, + "loss": 0.5212, + "step": 12224 + }, + { + "epoch": 15.648, + "grad_norm": 1.0244271755218506, + "learning_rate": 2.556422569027611e-05, 
+ "loss": 0.5555, + "step": 12225 + }, + { + "epoch": 15.64928, + "grad_norm": 0.9867590069770813, + "learning_rate": 2.5562224889955982e-05, + "loss": 0.5331, + "step": 12226 + }, + { + "epoch": 15.65056, + "grad_norm": 1.015299916267395, + "learning_rate": 2.5560224089635854e-05, + "loss": 0.5381, + "step": 12227 + }, + { + "epoch": 15.65184, + "grad_norm": 0.9973687529563904, + "learning_rate": 2.555822328931573e-05, + "loss": 0.5776, + "step": 12228 + }, + { + "epoch": 15.65312, + "grad_norm": 0.9658114314079285, + "learning_rate": 2.55562224889956e-05, + "loss": 0.4938, + "step": 12229 + }, + { + "epoch": 15.6544, + "grad_norm": 0.9462615847587585, + "learning_rate": 2.5554221688675473e-05, + "loss": 0.4878, + "step": 12230 + }, + { + "epoch": 15.65568, + "grad_norm": 0.9629518389701843, + "learning_rate": 2.5552220888355345e-05, + "loss": 0.544, + "step": 12231 + }, + { + "epoch": 15.65696, + "grad_norm": 1.0248364210128784, + "learning_rate": 2.5550220088035213e-05, + "loss": 0.5085, + "step": 12232 + }, + { + "epoch": 15.65824, + "grad_norm": 0.9903928637504578, + "learning_rate": 2.5548219287715085e-05, + "loss": 0.5624, + "step": 12233 + }, + { + "epoch": 15.65952, + "grad_norm": 0.9919432401657104, + "learning_rate": 2.5546218487394957e-05, + "loss": 0.5008, + "step": 12234 + }, + { + "epoch": 15.6608, + "grad_norm": 1.0050952434539795, + "learning_rate": 2.554421768707483e-05, + "loss": 0.5812, + "step": 12235 + }, + { + "epoch": 15.66208, + "grad_norm": 0.9851815104484558, + "learning_rate": 2.5542216886754704e-05, + "loss": 0.5016, + "step": 12236 + }, + { + "epoch": 15.66336, + "grad_norm": 0.9741938710212708, + "learning_rate": 2.5540216086434576e-05, + "loss": 0.5038, + "step": 12237 + }, + { + "epoch": 15.66464, + "grad_norm": 1.0300729274749756, + "learning_rate": 2.5538215286114448e-05, + "loss": 0.5195, + "step": 12238 + }, + { + "epoch": 15.66592, + "grad_norm": 0.9596061110496521, + "learning_rate": 2.553621448579432e-05, + "loss": 0.5052, + 
"step": 12239 + }, + { + "epoch": 15.6672, + "grad_norm": 0.9867904186248779, + "learning_rate": 2.553421368547419e-05, + "loss": 0.5353, + "step": 12240 + }, + { + "epoch": 15.66848, + "grad_norm": 0.9471587538719177, + "learning_rate": 2.553221288515406e-05, + "loss": 0.4976, + "step": 12241 + }, + { + "epoch": 15.66976, + "grad_norm": 0.9685238599777222, + "learning_rate": 2.5530212084833932e-05, + "loss": 0.5168, + "step": 12242 + }, + { + "epoch": 15.67104, + "grad_norm": 0.9731021523475647, + "learning_rate": 2.5528211284513807e-05, + "loss": 0.5298, + "step": 12243 + }, + { + "epoch": 15.67232, + "grad_norm": 0.9390130043029785, + "learning_rate": 2.552621048419368e-05, + "loss": 0.517, + "step": 12244 + }, + { + "epoch": 15.6736, + "grad_norm": 1.0047056674957275, + "learning_rate": 2.552420968387355e-05, + "loss": 0.5198, + "step": 12245 + }, + { + "epoch": 15.67488, + "grad_norm": 1.0100324153900146, + "learning_rate": 2.5522208883553423e-05, + "loss": 0.5573, + "step": 12246 + }, + { + "epoch": 15.67616, + "grad_norm": 1.0420469045639038, + "learning_rate": 2.5520208083233295e-05, + "loss": 0.5394, + "step": 12247 + }, + { + "epoch": 15.67744, + "grad_norm": 1.0220342874526978, + "learning_rate": 2.5518207282913163e-05, + "loss": 0.5864, + "step": 12248 + }, + { + "epoch": 15.67872, + "grad_norm": 1.0437568426132202, + "learning_rate": 2.5516206482593035e-05, + "loss": 0.5309, + "step": 12249 + }, + { + "epoch": 15.68, + "grad_norm": 1.0043889284133911, + "learning_rate": 2.5514205682272914e-05, + "loss": 0.5631, + "step": 12250 + }, + { + "epoch": 15.68128, + "grad_norm": 1.024249792098999, + "learning_rate": 2.5512204881952782e-05, + "loss": 0.5329, + "step": 12251 + }, + { + "epoch": 15.68256, + "grad_norm": 0.9508040547370911, + "learning_rate": 2.5510204081632654e-05, + "loss": 0.5257, + "step": 12252 + }, + { + "epoch": 15.68384, + "grad_norm": 1.0146198272705078, + "learning_rate": 2.5508203281312526e-05, + "loss": 0.5494, + "step": 12253 + }, + { 
+ "epoch": 15.68512, + "grad_norm": 0.9662773609161377, + "learning_rate": 2.5506202480992398e-05, + "loss": 0.4899, + "step": 12254 + }, + { + "epoch": 15.6864, + "grad_norm": 0.9972757697105408, + "learning_rate": 2.550420168067227e-05, + "loss": 0.5409, + "step": 12255 + }, + { + "epoch": 15.68768, + "grad_norm": 0.9856722950935364, + "learning_rate": 2.5502200880352138e-05, + "loss": 0.5048, + "step": 12256 + }, + { + "epoch": 15.68896, + "grad_norm": 1.0038642883300781, + "learning_rate": 2.5500200080032017e-05, + "loss": 0.5522, + "step": 12257 + }, + { + "epoch": 15.69024, + "grad_norm": 1.0079247951507568, + "learning_rate": 2.549819927971189e-05, + "loss": 0.5588, + "step": 12258 + }, + { + "epoch": 15.69152, + "grad_norm": 1.0044342279434204, + "learning_rate": 2.5496198479391757e-05, + "loss": 0.5339, + "step": 12259 + }, + { + "epoch": 15.6928, + "grad_norm": 0.9543859362602234, + "learning_rate": 2.549419767907163e-05, + "loss": 0.5141, + "step": 12260 + }, + { + "epoch": 15.69408, + "grad_norm": 1.0886956453323364, + "learning_rate": 2.54921968787515e-05, + "loss": 0.5861, + "step": 12261 + }, + { + "epoch": 15.69536, + "grad_norm": 0.9997818470001221, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.4868, + "step": 12262 + }, + { + "epoch": 15.69664, + "grad_norm": 0.9534517526626587, + "learning_rate": 2.5488195278111244e-05, + "loss": 0.5402, + "step": 12263 + }, + { + "epoch": 15.69792, + "grad_norm": 0.9827191233634949, + "learning_rate": 2.548619447779112e-05, + "loss": 0.5272, + "step": 12264 + }, + { + "epoch": 15.6992, + "grad_norm": 0.9801238775253296, + "learning_rate": 2.548419367747099e-05, + "loss": 0.5243, + "step": 12265 + }, + { + "epoch": 15.70048, + "grad_norm": 0.9996204376220703, + "learning_rate": 2.5482192877150863e-05, + "loss": 0.535, + "step": 12266 + }, + { + "epoch": 15.70176, + "grad_norm": 1.0119619369506836, + "learning_rate": 2.5480192076830732e-05, + "loss": 0.5261, + "step": 12267 + }, + { + "epoch": 15.70304, + 
"grad_norm": 1.0047118663787842, + "learning_rate": 2.5478191276510604e-05, + "loss": 0.4841, + "step": 12268 + }, + { + "epoch": 15.70432, + "grad_norm": 1.0151153802871704, + "learning_rate": 2.5476190476190476e-05, + "loss": 0.5364, + "step": 12269 + }, + { + "epoch": 15.7056, + "grad_norm": 1.0285605192184448, + "learning_rate": 2.5474189675870347e-05, + "loss": 0.5459, + "step": 12270 + }, + { + "epoch": 15.70688, + "grad_norm": 0.9856500625610352, + "learning_rate": 2.5472188875550223e-05, + "loss": 0.5358, + "step": 12271 + }, + { + "epoch": 15.70816, + "grad_norm": 1.0488793849945068, + "learning_rate": 2.5470188075230095e-05, + "loss": 0.5405, + "step": 12272 + }, + { + "epoch": 15.70944, + "grad_norm": 1.0134892463684082, + "learning_rate": 2.5468187274909966e-05, + "loss": 0.5732, + "step": 12273 + }, + { + "epoch": 15.71072, + "grad_norm": 0.9521108269691467, + "learning_rate": 2.5466186474589838e-05, + "loss": 0.4745, + "step": 12274 + }, + { + "epoch": 15.712, + "grad_norm": 0.9995734095573425, + "learning_rate": 2.5464185674269707e-05, + "loss": 0.5472, + "step": 12275 + }, + { + "epoch": 15.71328, + "grad_norm": 1.0318055152893066, + "learning_rate": 2.546218487394958e-05, + "loss": 0.5725, + "step": 12276 + }, + { + "epoch": 15.71456, + "grad_norm": 1.0022940635681152, + "learning_rate": 2.546018407362945e-05, + "loss": 0.5544, + "step": 12277 + }, + { + "epoch": 15.71584, + "grad_norm": 0.9959270358085632, + "learning_rate": 2.5458183273309326e-05, + "loss": 0.52, + "step": 12278 + }, + { + "epoch": 15.71712, + "grad_norm": 1.0101356506347656, + "learning_rate": 2.5456182472989198e-05, + "loss": 0.5537, + "step": 12279 + }, + { + "epoch": 15.7184, + "grad_norm": 0.9922164678573608, + "learning_rate": 2.545418167266907e-05, + "loss": 0.5343, + "step": 12280 + }, + { + "epoch": 15.71968, + "grad_norm": 1.0045051574707031, + "learning_rate": 2.545218087234894e-05, + "loss": 0.5776, + "step": 12281 + }, + { + "epoch": 15.72096, + "grad_norm": 
0.9495313763618469, + "learning_rate": 2.5450180072028813e-05, + "loss": 0.509, + "step": 12282 + }, + { + "epoch": 15.72224, + "grad_norm": 0.9955906867980957, + "learning_rate": 2.544817927170868e-05, + "loss": 0.5389, + "step": 12283 + }, + { + "epoch": 15.72352, + "grad_norm": 1.0342867374420166, + "learning_rate": 2.5446178471388553e-05, + "loss": 0.5492, + "step": 12284 + }, + { + "epoch": 15.7248, + "grad_norm": 0.966540515422821, + "learning_rate": 2.5444177671068432e-05, + "loss": 0.5449, + "step": 12285 + }, + { + "epoch": 15.72608, + "grad_norm": 0.9552671313285828, + "learning_rate": 2.54421768707483e-05, + "loss": 0.5026, + "step": 12286 + }, + { + "epoch": 15.727360000000001, + "grad_norm": 0.9747936129570007, + "learning_rate": 2.5440176070428172e-05, + "loss": 0.524, + "step": 12287 + }, + { + "epoch": 15.72864, + "grad_norm": 1.0628736019134521, + "learning_rate": 2.5438175270108044e-05, + "loss": 0.5114, + "step": 12288 + }, + { + "epoch": 15.72992, + "grad_norm": 1.016642689704895, + "learning_rate": 2.5436174469787916e-05, + "loss": 0.5479, + "step": 12289 + }, + { + "epoch": 15.7312, + "grad_norm": 0.95323246717453, + "learning_rate": 2.5434173669467788e-05, + "loss": 0.5381, + "step": 12290 + }, + { + "epoch": 15.73248, + "grad_norm": 1.0170929431915283, + "learning_rate": 2.5432172869147656e-05, + "loss": 0.5459, + "step": 12291 + }, + { + "epoch": 15.73376, + "grad_norm": 0.990404486656189, + "learning_rate": 2.5430172068827535e-05, + "loss": 0.5469, + "step": 12292 + }, + { + "epoch": 15.73504, + "grad_norm": 1.013763189315796, + "learning_rate": 2.5428171268507407e-05, + "loss": 0.498, + "step": 12293 + }, + { + "epoch": 15.73632, + "grad_norm": 0.9921509623527527, + "learning_rate": 2.5426170468187275e-05, + "loss": 0.5439, + "step": 12294 + }, + { + "epoch": 15.7376, + "grad_norm": 0.9957842230796814, + "learning_rate": 2.5424169667867147e-05, + "loss": 0.5197, + "step": 12295 + }, + { + "epoch": 15.73888, + "grad_norm": 
0.9728386998176575, + "learning_rate": 2.542216886754702e-05, + "loss": 0.4966, + "step": 12296 + }, + { + "epoch": 15.74016, + "grad_norm": 1.0001862049102783, + "learning_rate": 2.542016806722689e-05, + "loss": 0.5375, + "step": 12297 + }, + { + "epoch": 15.74144, + "grad_norm": 0.9885843992233276, + "learning_rate": 2.5418167266906763e-05, + "loss": 0.545, + "step": 12298 + }, + { + "epoch": 15.74272, + "grad_norm": 1.0801782608032227, + "learning_rate": 2.5416166466586638e-05, + "loss": 0.548, + "step": 12299 + }, + { + "epoch": 15.744, + "grad_norm": 1.014716625213623, + "learning_rate": 2.541416566626651e-05, + "loss": 0.5159, + "step": 12300 + }, + { + "epoch": 15.74528, + "grad_norm": 1.070065975189209, + "learning_rate": 2.5412164865946382e-05, + "loss": 0.5531, + "step": 12301 + }, + { + "epoch": 15.74656, + "grad_norm": 0.9282522201538086, + "learning_rate": 2.541016406562625e-05, + "loss": 0.5189, + "step": 12302 + }, + { + "epoch": 15.74784, + "grad_norm": 0.9973820447921753, + "learning_rate": 2.5408163265306122e-05, + "loss": 0.5093, + "step": 12303 + }, + { + "epoch": 15.74912, + "grad_norm": 0.9329702258110046, + "learning_rate": 2.5406162464985994e-05, + "loss": 0.4989, + "step": 12304 + }, + { + "epoch": 15.750399999999999, + "grad_norm": 0.9909161925315857, + "learning_rate": 2.5404161664665866e-05, + "loss": 0.5578, + "step": 12305 + }, + { + "epoch": 15.75168, + "grad_norm": 1.0105434656143188, + "learning_rate": 2.540216086434574e-05, + "loss": 0.5251, + "step": 12306 + }, + { + "epoch": 15.75296, + "grad_norm": 1.0197798013687134, + "learning_rate": 2.5400160064025613e-05, + "loss": 0.5527, + "step": 12307 + }, + { + "epoch": 15.75424, + "grad_norm": 1.027341604232788, + "learning_rate": 2.5398159263705485e-05, + "loss": 0.5716, + "step": 12308 + }, + { + "epoch": 15.75552, + "grad_norm": 0.9711216688156128, + "learning_rate": 2.5396158463385357e-05, + "loss": 0.5455, + "step": 12309 + }, + { + "epoch": 15.7568, + "grad_norm": 
0.9567652940750122, + "learning_rate": 2.5394157663065225e-05, + "loss": 0.475, + "step": 12310 + }, + { + "epoch": 15.75808, + "grad_norm": 0.9756172895431519, + "learning_rate": 2.5392156862745097e-05, + "loss": 0.534, + "step": 12311 + }, + { + "epoch": 15.759360000000001, + "grad_norm": 1.0259792804718018, + "learning_rate": 2.539015606242497e-05, + "loss": 0.5237, + "step": 12312 + }, + { + "epoch": 15.76064, + "grad_norm": 0.9864662885665894, + "learning_rate": 2.5388155262104844e-05, + "loss": 0.5443, + "step": 12313 + }, + { + "epoch": 15.76192, + "grad_norm": 0.9610289335250854, + "learning_rate": 2.5386154461784716e-05, + "loss": 0.5058, + "step": 12314 + }, + { + "epoch": 15.7632, + "grad_norm": 0.9790290594100952, + "learning_rate": 2.5384153661464588e-05, + "loss": 0.5117, + "step": 12315 + }, + { + "epoch": 15.76448, + "grad_norm": 0.9570515751838684, + "learning_rate": 2.538215286114446e-05, + "loss": 0.5391, + "step": 12316 + }, + { + "epoch": 15.76576, + "grad_norm": 0.979626476764679, + "learning_rate": 2.538015206082433e-05, + "loss": 0.5167, + "step": 12317 + }, + { + "epoch": 15.76704, + "grad_norm": 0.9989627599716187, + "learning_rate": 2.53781512605042e-05, + "loss": 0.5157, + "step": 12318 + }, + { + "epoch": 15.76832, + "grad_norm": 1.0324374437332153, + "learning_rate": 2.5376150460184072e-05, + "loss": 0.5204, + "step": 12319 + }, + { + "epoch": 15.7696, + "grad_norm": 0.9673824906349182, + "learning_rate": 2.537414965986395e-05, + "loss": 0.5313, + "step": 12320 + }, + { + "epoch": 15.77088, + "grad_norm": 1.0042349100112915, + "learning_rate": 2.537214885954382e-05, + "loss": 0.5402, + "step": 12321 + }, + { + "epoch": 15.77216, + "grad_norm": 1.0308949947357178, + "learning_rate": 2.537014805922369e-05, + "loss": 0.5681, + "step": 12322 + }, + { + "epoch": 15.77344, + "grad_norm": 0.9506542086601257, + "learning_rate": 2.5368147258903563e-05, + "loss": 0.5449, + "step": 12323 + }, + { + "epoch": 15.77472, + "grad_norm": 
0.9522877335548401, + "learning_rate": 2.5366146458583434e-05, + "loss": 0.5617, + "step": 12324 + }, + { + "epoch": 15.776, + "grad_norm": 0.9548900127410889, + "learning_rate": 2.5364145658263306e-05, + "loss": 0.5247, + "step": 12325 + }, + { + "epoch": 15.77728, + "grad_norm": 0.9789671301841736, + "learning_rate": 2.5362144857943175e-05, + "loss": 0.5114, + "step": 12326 + }, + { + "epoch": 15.77856, + "grad_norm": 0.942644476890564, + "learning_rate": 2.5360144057623053e-05, + "loss": 0.4917, + "step": 12327 + }, + { + "epoch": 15.77984, + "grad_norm": 0.9783756732940674, + "learning_rate": 2.5358143257302925e-05, + "loss": 0.4895, + "step": 12328 + }, + { + "epoch": 15.78112, + "grad_norm": 0.9837256669998169, + "learning_rate": 2.5356142456982794e-05, + "loss": 0.519, + "step": 12329 + }, + { + "epoch": 15.782399999999999, + "grad_norm": 0.9281373023986816, + "learning_rate": 2.5354141656662666e-05, + "loss": 0.4883, + "step": 12330 + }, + { + "epoch": 15.78368, + "grad_norm": 1.0375429391860962, + "learning_rate": 2.5352140856342537e-05, + "loss": 0.5655, + "step": 12331 + }, + { + "epoch": 15.78496, + "grad_norm": 1.0052881240844727, + "learning_rate": 2.535014005602241e-05, + "loss": 0.5547, + "step": 12332 + }, + { + "epoch": 15.78624, + "grad_norm": 1.0248472690582275, + "learning_rate": 2.534813925570228e-05, + "loss": 0.5778, + "step": 12333 + }, + { + "epoch": 15.78752, + "grad_norm": 1.0107492208480835, + "learning_rate": 2.5346138455382156e-05, + "loss": 0.5068, + "step": 12334 + }, + { + "epoch": 15.7888, + "grad_norm": 0.9795581698417664, + "learning_rate": 2.5344137655062028e-05, + "loss": 0.5389, + "step": 12335 + }, + { + "epoch": 15.79008, + "grad_norm": 0.9817120432853699, + "learning_rate": 2.53421368547419e-05, + "loss": 0.5071, + "step": 12336 + }, + { + "epoch": 15.79136, + "grad_norm": 0.9362112879753113, + "learning_rate": 2.534013605442177e-05, + "loss": 0.5537, + "step": 12337 + }, + { + "epoch": 15.79264, + "grad_norm": 
0.9294254183769226, + "learning_rate": 2.533813525410164e-05, + "loss": 0.4974, + "step": 12338 + }, + { + "epoch": 15.79392, + "grad_norm": 0.9781113862991333, + "learning_rate": 2.5336134453781512e-05, + "loss": 0.4655, + "step": 12339 + }, + { + "epoch": 15.7952, + "grad_norm": 0.9783672094345093, + "learning_rate": 2.5334133653461384e-05, + "loss": 0.5533, + "step": 12340 + }, + { + "epoch": 15.79648, + "grad_norm": 0.9659748673439026, + "learning_rate": 2.5332132853141256e-05, + "loss": 0.5486, + "step": 12341 + }, + { + "epoch": 15.79776, + "grad_norm": 0.9942366480827332, + "learning_rate": 2.533013205282113e-05, + "loss": 0.5433, + "step": 12342 + }, + { + "epoch": 15.79904, + "grad_norm": 0.9866067171096802, + "learning_rate": 2.5328131252501003e-05, + "loss": 0.5127, + "step": 12343 + }, + { + "epoch": 15.80032, + "grad_norm": 1.0013525485992432, + "learning_rate": 2.5326130452180875e-05, + "loss": 0.5194, + "step": 12344 + }, + { + "epoch": 15.8016, + "grad_norm": 0.9939999580383301, + "learning_rate": 2.5324129651860743e-05, + "loss": 0.5284, + "step": 12345 + }, + { + "epoch": 15.80288, + "grad_norm": 0.9597903490066528, + "learning_rate": 2.5322128851540615e-05, + "loss": 0.5152, + "step": 12346 + }, + { + "epoch": 15.80416, + "grad_norm": 0.9546245336532593, + "learning_rate": 2.5320128051220487e-05, + "loss": 0.487, + "step": 12347 + }, + { + "epoch": 15.80544, + "grad_norm": 1.0091270208358765, + "learning_rate": 2.531812725090036e-05, + "loss": 0.5678, + "step": 12348 + }, + { + "epoch": 15.80672, + "grad_norm": 0.9878095984458923, + "learning_rate": 2.5316126450580234e-05, + "loss": 0.5059, + "step": 12349 + }, + { + "epoch": 15.808, + "grad_norm": 1.0209447145462036, + "learning_rate": 2.5314125650260106e-05, + "loss": 0.5348, + "step": 12350 + }, + { + "epoch": 15.80928, + "grad_norm": 1.0176520347595215, + "learning_rate": 2.5312124849939978e-05, + "loss": 0.4969, + "step": 12351 + }, + { + "epoch": 15.81056, + "grad_norm": 1.0019172430038452, 
+ "learning_rate": 2.531012404961985e-05, + "loss": 0.5235, + "step": 12352 + }, + { + "epoch": 15.81184, + "grad_norm": 1.047505259513855, + "learning_rate": 2.530812324929972e-05, + "loss": 0.6304, + "step": 12353 + }, + { + "epoch": 15.81312, + "grad_norm": 0.9699926376342773, + "learning_rate": 2.530612244897959e-05, + "loss": 0.5278, + "step": 12354 + }, + { + "epoch": 15.8144, + "grad_norm": 0.9239999055862427, + "learning_rate": 2.5304121648659462e-05, + "loss": 0.5137, + "step": 12355 + }, + { + "epoch": 15.81568, + "grad_norm": 0.9999382495880127, + "learning_rate": 2.530212084833934e-05, + "loss": 0.5398, + "step": 12356 + }, + { + "epoch": 15.81696, + "grad_norm": 0.9642220139503479, + "learning_rate": 2.530012004801921e-05, + "loss": 0.5051, + "step": 12357 + }, + { + "epoch": 15.81824, + "grad_norm": 0.9850364327430725, + "learning_rate": 2.529811924769908e-05, + "loss": 0.5086, + "step": 12358 + }, + { + "epoch": 15.81952, + "grad_norm": 0.9347919225692749, + "learning_rate": 2.5296118447378953e-05, + "loss": 0.5029, + "step": 12359 + }, + { + "epoch": 15.8208, + "grad_norm": 1.001004695892334, + "learning_rate": 2.5294117647058825e-05, + "loss": 0.5255, + "step": 12360 + }, + { + "epoch": 15.82208, + "grad_norm": 1.0347155332565308, + "learning_rate": 2.5292116846738693e-05, + "loss": 0.5276, + "step": 12361 + }, + { + "epoch": 15.82336, + "grad_norm": 1.0250365734100342, + "learning_rate": 2.5290116046418565e-05, + "loss": 0.5609, + "step": 12362 + }, + { + "epoch": 15.82464, + "grad_norm": 1.0703151226043701, + "learning_rate": 2.5288115246098444e-05, + "loss": 0.5144, + "step": 12363 + }, + { + "epoch": 15.82592, + "grad_norm": 1.0669794082641602, + "learning_rate": 2.5286114445778316e-05, + "loss": 0.5801, + "step": 12364 + }, + { + "epoch": 15.8272, + "grad_norm": 1.0120995044708252, + "learning_rate": 2.5284113645458184e-05, + "loss": 0.4951, + "step": 12365 + }, + { + "epoch": 15.82848, + "grad_norm": 0.9758439660072327, + "learning_rate": 
2.5282112845138056e-05, + "loss": 0.5294, + "step": 12366 + }, + { + "epoch": 15.82976, + "grad_norm": 0.9557347893714905, + "learning_rate": 2.5280112044817928e-05, + "loss": 0.4897, + "step": 12367 + }, + { + "epoch": 15.83104, + "grad_norm": 0.9893117547035217, + "learning_rate": 2.52781112444978e-05, + "loss": 0.5063, + "step": 12368 + }, + { + "epoch": 15.83232, + "grad_norm": 1.008176326751709, + "learning_rate": 2.5276110444177668e-05, + "loss": 0.5255, + "step": 12369 + }, + { + "epoch": 15.8336, + "grad_norm": 0.9538099765777588, + "learning_rate": 2.5274109643857547e-05, + "loss": 0.5154, + "step": 12370 + }, + { + "epoch": 15.83488, + "grad_norm": 0.9922953844070435, + "learning_rate": 2.527210884353742e-05, + "loss": 0.5088, + "step": 12371 + }, + { + "epoch": 15.83616, + "grad_norm": 0.9348974823951721, + "learning_rate": 2.527010804321729e-05, + "loss": 0.4637, + "step": 12372 + }, + { + "epoch": 15.83744, + "grad_norm": 1.0178333520889282, + "learning_rate": 2.526810724289716e-05, + "loss": 0.532, + "step": 12373 + }, + { + "epoch": 15.83872, + "grad_norm": 1.0431467294692993, + "learning_rate": 2.526610644257703e-05, + "loss": 0.5307, + "step": 12374 + }, + { + "epoch": 15.84, + "grad_norm": 1.0939209461212158, + "learning_rate": 2.5264105642256903e-05, + "loss": 0.5397, + "step": 12375 + }, + { + "epoch": 15.84128, + "grad_norm": 1.0265597105026245, + "learning_rate": 2.5262104841936774e-05, + "loss": 0.5354, + "step": 12376 + }, + { + "epoch": 15.84256, + "grad_norm": 0.9757692813873291, + "learning_rate": 2.526010404161665e-05, + "loss": 0.4957, + "step": 12377 + }, + { + "epoch": 15.84384, + "grad_norm": 0.9945160150527954, + "learning_rate": 2.525810324129652e-05, + "loss": 0.5204, + "step": 12378 + }, + { + "epoch": 15.84512, + "grad_norm": 0.9997515082359314, + "learning_rate": 2.5256102440976393e-05, + "loss": 0.5403, + "step": 12379 + }, + { + "epoch": 15.8464, + "grad_norm": 0.9899279475212097, + "learning_rate": 2.5254101640656265e-05, + 
"loss": 0.5597, + "step": 12380 + }, + { + "epoch": 15.84768, + "grad_norm": 0.941484272480011, + "learning_rate": 2.5252100840336134e-05, + "loss": 0.5027, + "step": 12381 + }, + { + "epoch": 15.84896, + "grad_norm": 0.984250545501709, + "learning_rate": 2.5250100040016006e-05, + "loss": 0.5449, + "step": 12382 + }, + { + "epoch": 15.85024, + "grad_norm": 0.9902025461196899, + "learning_rate": 2.5248099239695877e-05, + "loss": 0.5443, + "step": 12383 + }, + { + "epoch": 15.85152, + "grad_norm": 0.9709980487823486, + "learning_rate": 2.5246098439375753e-05, + "loss": 0.5031, + "step": 12384 + }, + { + "epoch": 15.8528, + "grad_norm": 0.9857783317565918, + "learning_rate": 2.5244097639055625e-05, + "loss": 0.5027, + "step": 12385 + }, + { + "epoch": 15.85408, + "grad_norm": 1.0132930278778076, + "learning_rate": 2.5242096838735496e-05, + "loss": 0.5154, + "step": 12386 + }, + { + "epoch": 15.85536, + "grad_norm": 0.9710603952407837, + "learning_rate": 2.5240096038415368e-05, + "loss": 0.5182, + "step": 12387 + }, + { + "epoch": 15.85664, + "grad_norm": 0.9620773196220398, + "learning_rate": 2.523809523809524e-05, + "loss": 0.4874, + "step": 12388 + }, + { + "epoch": 15.85792, + "grad_norm": 0.9953191876411438, + "learning_rate": 2.523609443777511e-05, + "loss": 0.5538, + "step": 12389 + }, + { + "epoch": 15.8592, + "grad_norm": 0.956991970539093, + "learning_rate": 2.523409363745498e-05, + "loss": 0.512, + "step": 12390 + }, + { + "epoch": 15.86048, + "grad_norm": 1.0285273790359497, + "learning_rate": 2.523209283713486e-05, + "loss": 0.5422, + "step": 12391 + }, + { + "epoch": 15.86176, + "grad_norm": 1.0568751096725464, + "learning_rate": 2.5230092036814728e-05, + "loss": 0.5794, + "step": 12392 + }, + { + "epoch": 15.86304, + "grad_norm": 1.0147608518600464, + "learning_rate": 2.52280912364946e-05, + "loss": 0.5809, + "step": 12393 + }, + { + "epoch": 15.86432, + "grad_norm": 0.9740559458732605, + "learning_rate": 2.522609043617447e-05, + "loss": 0.5204, + 
"step": 12394 + }, + { + "epoch": 15.8656, + "grad_norm": 1.0392513275146484, + "learning_rate": 2.5224089635854343e-05, + "loss": 0.5702, + "step": 12395 + }, + { + "epoch": 15.86688, + "grad_norm": 0.9570985436439514, + "learning_rate": 2.5222088835534215e-05, + "loss": 0.5251, + "step": 12396 + }, + { + "epoch": 15.86816, + "grad_norm": 1.0077378749847412, + "learning_rate": 2.5220088035214083e-05, + "loss": 0.5459, + "step": 12397 + }, + { + "epoch": 15.86944, + "grad_norm": 0.9814931154251099, + "learning_rate": 2.5218087234893962e-05, + "loss": 0.5089, + "step": 12398 + }, + { + "epoch": 15.87072, + "grad_norm": 1.0061900615692139, + "learning_rate": 2.5216086434573834e-05, + "loss": 0.534, + "step": 12399 + }, + { + "epoch": 15.872, + "grad_norm": 0.9620992541313171, + "learning_rate": 2.5214085634253702e-05, + "loss": 0.5163, + "step": 12400 + }, + { + "epoch": 15.87328, + "grad_norm": 0.9781507253646851, + "learning_rate": 2.5212084833933574e-05, + "loss": 0.5391, + "step": 12401 + }, + { + "epoch": 15.87456, + "grad_norm": 0.9385673403739929, + "learning_rate": 2.5210084033613446e-05, + "loss": 0.5046, + "step": 12402 + }, + { + "epoch": 15.87584, + "grad_norm": 0.9664863348007202, + "learning_rate": 2.5208083233293318e-05, + "loss": 0.5104, + "step": 12403 + }, + { + "epoch": 15.87712, + "grad_norm": 0.9696258306503296, + "learning_rate": 2.520608243297319e-05, + "loss": 0.4796, + "step": 12404 + }, + { + "epoch": 15.8784, + "grad_norm": 0.9488195776939392, + "learning_rate": 2.5204081632653065e-05, + "loss": 0.4884, + "step": 12405 + }, + { + "epoch": 15.87968, + "grad_norm": 1.0127061605453491, + "learning_rate": 2.5202080832332937e-05, + "loss": 0.5115, + "step": 12406 + }, + { + "epoch": 15.88096, + "grad_norm": 0.9693582057952881, + "learning_rate": 2.520008003201281e-05, + "loss": 0.5386, + "step": 12407 + }, + { + "epoch": 15.88224, + "grad_norm": 0.9904215335845947, + "learning_rate": 2.5198079231692677e-05, + "loss": 0.556, + "step": 12408 + }, 
+ { + "epoch": 15.88352, + "grad_norm": 0.9979194402694702, + "learning_rate": 2.519607843137255e-05, + "loss": 0.5033, + "step": 12409 + }, + { + "epoch": 15.8848, + "grad_norm": 1.007330060005188, + "learning_rate": 2.519407763105242e-05, + "loss": 0.5607, + "step": 12410 + }, + { + "epoch": 15.88608, + "grad_norm": 1.0108498334884644, + "learning_rate": 2.5192076830732293e-05, + "loss": 0.5434, + "step": 12411 + }, + { + "epoch": 15.88736, + "grad_norm": 0.9750909805297852, + "learning_rate": 2.5190076030412168e-05, + "loss": 0.5233, + "step": 12412 + }, + { + "epoch": 15.88864, + "grad_norm": 0.982916533946991, + "learning_rate": 2.518807523009204e-05, + "loss": 0.5241, + "step": 12413 + }, + { + "epoch": 15.88992, + "grad_norm": 0.9335137009620667, + "learning_rate": 2.5186074429771912e-05, + "loss": 0.4939, + "step": 12414 + }, + { + "epoch": 15.8912, + "grad_norm": 0.922966718673706, + "learning_rate": 2.5184073629451784e-05, + "loss": 0.5243, + "step": 12415 + }, + { + "epoch": 15.89248, + "grad_norm": 1.0012139081954956, + "learning_rate": 2.5182072829131652e-05, + "loss": 0.5089, + "step": 12416 + }, + { + "epoch": 15.89376, + "grad_norm": 0.9846066832542419, + "learning_rate": 2.5180072028811524e-05, + "loss": 0.5127, + "step": 12417 + }, + { + "epoch": 15.89504, + "grad_norm": 0.9632936120033264, + "learning_rate": 2.5178071228491396e-05, + "loss": 0.5822, + "step": 12418 + }, + { + "epoch": 15.89632, + "grad_norm": 0.9547779560089111, + "learning_rate": 2.517607042817127e-05, + "loss": 0.5153, + "step": 12419 + }, + { + "epoch": 15.8976, + "grad_norm": 0.9924589991569519, + "learning_rate": 2.5174069627851143e-05, + "loss": 0.558, + "step": 12420 + }, + { + "epoch": 15.89888, + "grad_norm": 0.9812869429588318, + "learning_rate": 2.5172068827531015e-05, + "loss": 0.5061, + "step": 12421 + }, + { + "epoch": 15.90016, + "grad_norm": 1.0087809562683105, + "learning_rate": 2.5170068027210887e-05, + "loss": 0.5524, + "step": 12422 + }, + { + "epoch": 
15.901440000000001, + "grad_norm": 0.9655302166938782, + "learning_rate": 2.516806722689076e-05, + "loss": 0.5159, + "step": 12423 + }, + { + "epoch": 15.90272, + "grad_norm": 1.032762050628662, + "learning_rate": 2.5166066426570627e-05, + "loss": 0.5757, + "step": 12424 + }, + { + "epoch": 15.904, + "grad_norm": 0.9734963178634644, + "learning_rate": 2.51640656262505e-05, + "loss": 0.4966, + "step": 12425 + }, + { + "epoch": 15.90528, + "grad_norm": 0.9694871306419373, + "learning_rate": 2.5162064825930377e-05, + "loss": 0.5307, + "step": 12426 + }, + { + "epoch": 15.90656, + "grad_norm": 1.0066063404083252, + "learning_rate": 2.5160064025610246e-05, + "loss": 0.5717, + "step": 12427 + }, + { + "epoch": 15.90784, + "grad_norm": 0.9474746584892273, + "learning_rate": 2.5158063225290118e-05, + "loss": 0.5199, + "step": 12428 + }, + { + "epoch": 15.90912, + "grad_norm": 0.9809539914131165, + "learning_rate": 2.515606242496999e-05, + "loss": 0.5082, + "step": 12429 + }, + { + "epoch": 15.9104, + "grad_norm": 1.0064231157302856, + "learning_rate": 2.515406162464986e-05, + "loss": 0.4924, + "step": 12430 + }, + { + "epoch": 15.91168, + "grad_norm": 0.9995524883270264, + "learning_rate": 2.5152060824329733e-05, + "loss": 0.5142, + "step": 12431 + }, + { + "epoch": 15.91296, + "grad_norm": 0.9583382606506348, + "learning_rate": 2.5150060024009602e-05, + "loss": 0.4687, + "step": 12432 + }, + { + "epoch": 15.91424, + "grad_norm": 0.9433925747871399, + "learning_rate": 2.514805922368948e-05, + "loss": 0.5122, + "step": 12433 + }, + { + "epoch": 15.91552, + "grad_norm": 0.9556870460510254, + "learning_rate": 2.5146058423369352e-05, + "loss": 0.5444, + "step": 12434 + }, + { + "epoch": 15.9168, + "grad_norm": 1.0067849159240723, + "learning_rate": 2.514405762304922e-05, + "loss": 0.5555, + "step": 12435 + }, + { + "epoch": 15.91808, + "grad_norm": 0.9572538733482361, + "learning_rate": 2.5142056822729093e-05, + "loss": 0.5077, + "step": 12436 + }, + { + "epoch": 15.91936, + 
"grad_norm": 1.0347596406936646, + "learning_rate": 2.5140056022408964e-05, + "loss": 0.5287, + "step": 12437 + }, + { + "epoch": 15.92064, + "grad_norm": 0.9975804090499878, + "learning_rate": 2.5138055222088836e-05, + "loss": 0.5213, + "step": 12438 + }, + { + "epoch": 15.92192, + "grad_norm": 0.9736266136169434, + "learning_rate": 2.5136054421768708e-05, + "loss": 0.5386, + "step": 12439 + }, + { + "epoch": 15.9232, + "grad_norm": 1.077915072441101, + "learning_rate": 2.5134053621448583e-05, + "loss": 0.5772, + "step": 12440 + }, + { + "epoch": 15.924479999999999, + "grad_norm": 1.0086665153503418, + "learning_rate": 2.5132052821128455e-05, + "loss": 0.5173, + "step": 12441 + }, + { + "epoch": 15.92576, + "grad_norm": 1.012130856513977, + "learning_rate": 2.5130052020808327e-05, + "loss": 0.5162, + "step": 12442 + }, + { + "epoch": 15.92704, + "grad_norm": 0.9995039701461792, + "learning_rate": 2.5128051220488196e-05, + "loss": 0.5068, + "step": 12443 + }, + { + "epoch": 15.92832, + "grad_norm": 0.9592408537864685, + "learning_rate": 2.5126050420168067e-05, + "loss": 0.507, + "step": 12444 + }, + { + "epoch": 15.9296, + "grad_norm": 0.9532192945480347, + "learning_rate": 2.512404961984794e-05, + "loss": 0.5134, + "step": 12445 + }, + { + "epoch": 15.93088, + "grad_norm": 1.0366421937942505, + "learning_rate": 2.512204881952781e-05, + "loss": 0.5393, + "step": 12446 + }, + { + "epoch": 15.93216, + "grad_norm": 0.9751626253128052, + "learning_rate": 2.5120048019207686e-05, + "loss": 0.5326, + "step": 12447 + }, + { + "epoch": 15.933440000000001, + "grad_norm": 0.9007095694541931, + "learning_rate": 2.5118047218887558e-05, + "loss": 0.4805, + "step": 12448 + }, + { + "epoch": 15.93472, + "grad_norm": 1.0274094343185425, + "learning_rate": 2.511604641856743e-05, + "loss": 0.5352, + "step": 12449 + }, + { + "epoch": 15.936, + "grad_norm": 0.9921882152557373, + "learning_rate": 2.5114045618247302e-05, + "loss": 0.5379, + "step": 12450 + }, + { + "epoch": 15.93728, + 
"grad_norm": 1.0404365062713623, + "learning_rate": 2.511204481792717e-05, + "loss": 0.5624, + "step": 12451 + }, + { + "epoch": 15.93856, + "grad_norm": 0.9763307571411133, + "learning_rate": 2.5110044017607042e-05, + "loss": 0.5373, + "step": 12452 + }, + { + "epoch": 15.93984, + "grad_norm": 0.939691424369812, + "learning_rate": 2.5108043217286914e-05, + "loss": 0.5621, + "step": 12453 + }, + { + "epoch": 15.94112, + "grad_norm": 0.9770581126213074, + "learning_rate": 2.5106042416966786e-05, + "loss": 0.5202, + "step": 12454 + }, + { + "epoch": 15.9424, + "grad_norm": 1.0360689163208008, + "learning_rate": 2.510404161664666e-05, + "loss": 0.5714, + "step": 12455 + }, + { + "epoch": 15.94368, + "grad_norm": 0.9993544816970825, + "learning_rate": 2.5102040816326533e-05, + "loss": 0.5125, + "step": 12456 + }, + { + "epoch": 15.94496, + "grad_norm": 1.0267115831375122, + "learning_rate": 2.5100040016006405e-05, + "loss": 0.5312, + "step": 12457 + }, + { + "epoch": 15.94624, + "grad_norm": 0.9762638807296753, + "learning_rate": 2.5098039215686277e-05, + "loss": 0.529, + "step": 12458 + }, + { + "epoch": 15.94752, + "grad_norm": 1.0173263549804688, + "learning_rate": 2.5096038415366145e-05, + "loss": 0.5196, + "step": 12459 + }, + { + "epoch": 15.9488, + "grad_norm": 1.0800774097442627, + "learning_rate": 2.5094037615046017e-05, + "loss": 0.6021, + "step": 12460 + }, + { + "epoch": 15.95008, + "grad_norm": 1.0533719062805176, + "learning_rate": 2.509203681472589e-05, + "loss": 0.5971, + "step": 12461 + }, + { + "epoch": 15.95136, + "grad_norm": 1.0251033306121826, + "learning_rate": 2.5090036014405764e-05, + "loss": 0.5547, + "step": 12462 + }, + { + "epoch": 15.95264, + "grad_norm": 0.9884242415428162, + "learning_rate": 2.5088035214085636e-05, + "loss": 0.5788, + "step": 12463 + }, + { + "epoch": 15.95392, + "grad_norm": 0.9535346627235413, + "learning_rate": 2.5086034413765508e-05, + "loss": 0.488, + "step": 12464 + }, + { + "epoch": 15.9552, + "grad_norm": 
0.9252434372901917, + "learning_rate": 2.508403361344538e-05, + "loss": 0.4886, + "step": 12465 + }, + { + "epoch": 15.956479999999999, + "grad_norm": 1.00466787815094, + "learning_rate": 2.508203281312525e-05, + "loss": 0.5489, + "step": 12466 + }, + { + "epoch": 15.95776, + "grad_norm": 0.9779563546180725, + "learning_rate": 2.508003201280512e-05, + "loss": 0.5357, + "step": 12467 + }, + { + "epoch": 15.95904, + "grad_norm": 0.9743767380714417, + "learning_rate": 2.5078031212484992e-05, + "loss": 0.4939, + "step": 12468 + }, + { + "epoch": 15.96032, + "grad_norm": 0.9605897665023804, + "learning_rate": 2.507603041216487e-05, + "loss": 0.4879, + "step": 12469 + }, + { + "epoch": 15.9616, + "grad_norm": 0.9653512835502625, + "learning_rate": 2.507402961184474e-05, + "loss": 0.51, + "step": 12470 + }, + { + "epoch": 15.96288, + "grad_norm": 0.9319489002227783, + "learning_rate": 2.507202881152461e-05, + "loss": 0.4609, + "step": 12471 + }, + { + "epoch": 15.96416, + "grad_norm": 0.994040310382843, + "learning_rate": 2.5070028011204483e-05, + "loss": 0.5493, + "step": 12472 + }, + { + "epoch": 15.96544, + "grad_norm": 1.0005146265029907, + "learning_rate": 2.5068027210884355e-05, + "loss": 0.5584, + "step": 12473 + }, + { + "epoch": 15.96672, + "grad_norm": 0.9767276048660278, + "learning_rate": 2.5066026410564227e-05, + "loss": 0.4839, + "step": 12474 + }, + { + "epoch": 15.968, + "grad_norm": 0.9705262780189514, + "learning_rate": 2.5064025610244095e-05, + "loss": 0.5139, + "step": 12475 + }, + { + "epoch": 15.96928, + "grad_norm": 1.0184811353683472, + "learning_rate": 2.5062024809923974e-05, + "loss": 0.5444, + "step": 12476 + }, + { + "epoch": 15.97056, + "grad_norm": 1.0089943408966064, + "learning_rate": 2.5060024009603845e-05, + "loss": 0.5257, + "step": 12477 + }, + { + "epoch": 15.97184, + "grad_norm": 1.0058419704437256, + "learning_rate": 2.5058023209283714e-05, + "loss": 0.4931, + "step": 12478 + }, + { + "epoch": 15.97312, + "grad_norm": 
1.0035637617111206, + "learning_rate": 2.5056022408963586e-05, + "loss": 0.4789, + "step": 12479 + }, + { + "epoch": 15.9744, + "grad_norm": 1.051369309425354, + "learning_rate": 2.5054021608643458e-05, + "loss": 0.5567, + "step": 12480 + }, + { + "epoch": 15.97568, + "grad_norm": 1.0520974397659302, + "learning_rate": 2.505202080832333e-05, + "loss": 0.5678, + "step": 12481 + }, + { + "epoch": 15.97696, + "grad_norm": 0.9842495918273926, + "learning_rate": 2.50500200080032e-05, + "loss": 0.5092, + "step": 12482 + }, + { + "epoch": 15.97824, + "grad_norm": 0.9604758620262146, + "learning_rate": 2.5048019207683077e-05, + "loss": 0.526, + "step": 12483 + }, + { + "epoch": 15.97952, + "grad_norm": 0.9821440577507019, + "learning_rate": 2.504601840736295e-05, + "loss": 0.4723, + "step": 12484 + }, + { + "epoch": 15.9808, + "grad_norm": 0.9930018782615662, + "learning_rate": 2.504401760704282e-05, + "loss": 0.5205, + "step": 12485 + }, + { + "epoch": 15.98208, + "grad_norm": 1.0078686475753784, + "learning_rate": 2.504201680672269e-05, + "loss": 0.5439, + "step": 12486 + }, + { + "epoch": 15.98336, + "grad_norm": 1.0228537321090698, + "learning_rate": 2.504001600640256e-05, + "loss": 0.5895, + "step": 12487 + }, + { + "epoch": 15.98464, + "grad_norm": 1.0596520900726318, + "learning_rate": 2.5038015206082433e-05, + "loss": 0.5966, + "step": 12488 + }, + { + "epoch": 15.98592, + "grad_norm": 0.9747412800788879, + "learning_rate": 2.5036014405762304e-05, + "loss": 0.5209, + "step": 12489 + }, + { + "epoch": 15.9872, + "grad_norm": 1.090663194656372, + "learning_rate": 2.503401360544218e-05, + "loss": 0.545, + "step": 12490 + }, + { + "epoch": 15.98848, + "grad_norm": 0.9801415205001831, + "learning_rate": 2.503201280512205e-05, + "loss": 0.5044, + "step": 12491 + }, + { + "epoch": 15.98976, + "grad_norm": 0.9882249236106873, + "learning_rate": 2.5030012004801923e-05, + "loss": 0.5395, + "step": 12492 + }, + { + "epoch": 15.99104, + "grad_norm": 0.9727967381477356, + 
"learning_rate": 2.5028011204481795e-05, + "loss": 0.5298, + "step": 12493 + }, + { + "epoch": 15.99232, + "grad_norm": 1.004541277885437, + "learning_rate": 2.5026010404161664e-05, + "loss": 0.5377, + "step": 12494 + }, + { + "epoch": 15.9936, + "grad_norm": 0.9552838206291199, + "learning_rate": 2.5024009603841536e-05, + "loss": 0.5197, + "step": 12495 + }, + { + "epoch": 15.99488, + "grad_norm": 0.9584947228431702, + "learning_rate": 2.5022008803521407e-05, + "loss": 0.5345, + "step": 12496 + }, + { + "epoch": 15.99616, + "grad_norm": 0.9853367209434509, + "learning_rate": 2.5020008003201283e-05, + "loss": 0.5047, + "step": 12497 + }, + { + "epoch": 15.99744, + "grad_norm": 0.9928620457649231, + "learning_rate": 2.5018007202881154e-05, + "loss": 0.5282, + "step": 12498 + }, + { + "epoch": 15.99872, + "grad_norm": 1.0169389247894287, + "learning_rate": 2.5016006402561026e-05, + "loss": 0.5764, + "step": 12499 + }, + { + "epoch": 16.0, + "grad_norm": Infinity, + "learning_rate": 2.5016006402561026e-05, + "loss": 0.9264, + "step": 12500 + }, + { + "epoch": 16.00128, + "grad_norm": 0.9896093010902405, + "learning_rate": 2.5014005602240898e-05, + "loss": 0.4865, + "step": 12501 + }, + { + "epoch": 16.00256, + "grad_norm": 0.9680584669113159, + "learning_rate": 2.501200480192077e-05, + "loss": 0.4852, + "step": 12502 + }, + { + "epoch": 16.00384, + "grad_norm": 0.9140750765800476, + "learning_rate": 2.501000400160064e-05, + "loss": 0.48, + "step": 12503 + }, + { + "epoch": 16.00512, + "grad_norm": 0.9292126893997192, + "learning_rate": 2.500800320128051e-05, + "loss": 0.4647, + "step": 12504 + }, + { + "epoch": 16.0064, + "grad_norm": 0.9622963070869446, + "learning_rate": 2.500600240096039e-05, + "loss": 0.4908, + "step": 12505 + }, + { + "epoch": 16.00768, + "grad_norm": 0.9972251653671265, + "learning_rate": 2.5004001600640257e-05, + "loss": 0.5065, + "step": 12506 + }, + { + "epoch": 16.00896, + "grad_norm": 1.0023291110992432, + "learning_rate": 
2.500200080032013e-05, + "loss": 0.5344, + "step": 12507 + }, + { + "epoch": 16.01024, + "grad_norm": 0.9497984647750854, + "learning_rate": 2.5e-05, + "loss": 0.4935, + "step": 12508 + }, + { + "epoch": 16.01152, + "grad_norm": 0.9904937148094177, + "learning_rate": 2.4997999199679873e-05, + "loss": 0.4871, + "step": 12509 + }, + { + "epoch": 16.0128, + "grad_norm": 0.9689871668815613, + "learning_rate": 2.4995998399359745e-05, + "loss": 0.471, + "step": 12510 + }, + { + "epoch": 16.01408, + "grad_norm": 0.9814728498458862, + "learning_rate": 2.4993997599039617e-05, + "loss": 0.5067, + "step": 12511 + }, + { + "epoch": 16.01536, + "grad_norm": 0.97007817029953, + "learning_rate": 2.499199679871949e-05, + "loss": 0.507, + "step": 12512 + }, + { + "epoch": 16.01664, + "grad_norm": 1.0243626832962036, + "learning_rate": 2.498999599839936e-05, + "loss": 0.5726, + "step": 12513 + }, + { + "epoch": 16.01792, + "grad_norm": 1.0011252164840698, + "learning_rate": 2.4987995198079232e-05, + "loss": 0.4805, + "step": 12514 + }, + { + "epoch": 16.0192, + "grad_norm": 0.9926708936691284, + "learning_rate": 2.4985994397759104e-05, + "loss": 0.5031, + "step": 12515 + }, + { + "epoch": 16.02048, + "grad_norm": 0.9110226035118103, + "learning_rate": 2.4983993597438976e-05, + "loss": 0.4117, + "step": 12516 + }, + { + "epoch": 16.02176, + "grad_norm": 1.0108290910720825, + "learning_rate": 2.498199279711885e-05, + "loss": 0.5429, + "step": 12517 + }, + { + "epoch": 16.02304, + "grad_norm": 0.990385115146637, + "learning_rate": 2.497999199679872e-05, + "loss": 0.4741, + "step": 12518 + }, + { + "epoch": 16.02432, + "grad_norm": 0.982943058013916, + "learning_rate": 2.497799119647859e-05, + "loss": 0.5495, + "step": 12519 + }, + { + "epoch": 16.0256, + "grad_norm": 0.9531762599945068, + "learning_rate": 2.4975990396158463e-05, + "loss": 0.4758, + "step": 12520 + }, + { + "epoch": 16.02688, + "grad_norm": 0.9687629342079163, + "learning_rate": 2.497398959583834e-05, + "loss": 0.503, + 
"step": 12521 + }, + { + "epoch": 16.02816, + "grad_norm": 1.029827356338501, + "learning_rate": 2.4971988795518207e-05, + "loss": 0.5105, + "step": 12522 + }, + { + "epoch": 16.02944, + "grad_norm": 1.062757968902588, + "learning_rate": 2.496998799519808e-05, + "loss": 0.5112, + "step": 12523 + }, + { + "epoch": 16.03072, + "grad_norm": 1.0137965679168701, + "learning_rate": 2.4967987194877954e-05, + "loss": 0.5003, + "step": 12524 + }, + { + "epoch": 16.032, + "grad_norm": 1.0039608478546143, + "learning_rate": 2.4965986394557826e-05, + "loss": 0.5081, + "step": 12525 + }, + { + "epoch": 16.03328, + "grad_norm": 0.9910953044891357, + "learning_rate": 2.4963985594237695e-05, + "loss": 0.5416, + "step": 12526 + }, + { + "epoch": 16.03456, + "grad_norm": 1.0009675025939941, + "learning_rate": 2.4961984793917566e-05, + "loss": 0.5186, + "step": 12527 + }, + { + "epoch": 16.03584, + "grad_norm": 0.9520965814590454, + "learning_rate": 2.4959983993597442e-05, + "loss": 0.499, + "step": 12528 + }, + { + "epoch": 16.03712, + "grad_norm": 0.9937947392463684, + "learning_rate": 2.4957983193277314e-05, + "loss": 0.5301, + "step": 12529 + }, + { + "epoch": 16.0384, + "grad_norm": 1.0274006128311157, + "learning_rate": 2.4955982392957182e-05, + "loss": 0.5275, + "step": 12530 + }, + { + "epoch": 16.03968, + "grad_norm": 0.9785746932029724, + "learning_rate": 2.4953981592637057e-05, + "loss": 0.5235, + "step": 12531 + }, + { + "epoch": 16.04096, + "grad_norm": 0.9842962026596069, + "learning_rate": 2.495198079231693e-05, + "loss": 0.4919, + "step": 12532 + }, + { + "epoch": 16.04224, + "grad_norm": 0.9303730726242065, + "learning_rate": 2.49499799919968e-05, + "loss": 0.4506, + "step": 12533 + }, + { + "epoch": 16.04352, + "grad_norm": 1.0411185026168823, + "learning_rate": 2.494797919167667e-05, + "loss": 0.5188, + "step": 12534 + }, + { + "epoch": 16.0448, + "grad_norm": 0.9499898552894592, + "learning_rate": 2.4945978391356545e-05, + "loss": 0.4906, + "step": 12535 + }, + { 
+ "epoch": 16.04608, + "grad_norm": 0.9907384514808655, + "learning_rate": 2.4943977591036417e-05, + "loss": 0.5185, + "step": 12536 + }, + { + "epoch": 16.04736, + "grad_norm": 0.9777514338493347, + "learning_rate": 2.494197679071629e-05, + "loss": 0.5008, + "step": 12537 + }, + { + "epoch": 16.04864, + "grad_norm": 1.0125657320022583, + "learning_rate": 2.4939975990396157e-05, + "loss": 0.4957, + "step": 12538 + }, + { + "epoch": 16.04992, + "grad_norm": 1.0564517974853516, + "learning_rate": 2.4937975190076032e-05, + "loss": 0.5403, + "step": 12539 + }, + { + "epoch": 16.0512, + "grad_norm": 1.0502736568450928, + "learning_rate": 2.4935974389755904e-05, + "loss": 0.5516, + "step": 12540 + }, + { + "epoch": 16.05248, + "grad_norm": 0.9993465542793274, + "learning_rate": 2.4933973589435776e-05, + "loss": 0.5124, + "step": 12541 + }, + { + "epoch": 16.05376, + "grad_norm": 1.0142031908035278, + "learning_rate": 2.4931972789115648e-05, + "loss": 0.4857, + "step": 12542 + }, + { + "epoch": 16.05504, + "grad_norm": 1.0206118822097778, + "learning_rate": 2.492997198879552e-05, + "loss": 0.5747, + "step": 12543 + }, + { + "epoch": 16.05632, + "grad_norm": 1.0780686140060425, + "learning_rate": 2.492797118847539e-05, + "loss": 0.5194, + "step": 12544 + }, + { + "epoch": 16.0576, + "grad_norm": 0.9203510284423828, + "learning_rate": 2.4925970388155263e-05, + "loss": 0.5112, + "step": 12545 + }, + { + "epoch": 16.05888, + "grad_norm": 0.9816039800643921, + "learning_rate": 2.4923969587835135e-05, + "loss": 0.5326, + "step": 12546 + }, + { + "epoch": 16.06016, + "grad_norm": 1.0037397146224976, + "learning_rate": 2.4921968787515007e-05, + "loss": 0.5013, + "step": 12547 + }, + { + "epoch": 16.06144, + "grad_norm": 1.0208714008331299, + "learning_rate": 2.491996798719488e-05, + "loss": 0.4871, + "step": 12548 + }, + { + "epoch": 16.06272, + "grad_norm": 1.050378441810608, + "learning_rate": 2.491796718687475e-05, + "loss": 0.551, + "step": 12549 + }, + { + "epoch": 16.064, + 
"grad_norm": 1.0114853382110596, + "learning_rate": 2.4915966386554623e-05, + "loss": 0.5621, + "step": 12550 + }, + { + "epoch": 16.06528, + "grad_norm": 0.9831593036651611, + "learning_rate": 2.4913965586234494e-05, + "loss": 0.5084, + "step": 12551 + }, + { + "epoch": 16.06656, + "grad_norm": 0.9667739868164062, + "learning_rate": 2.4911964785914366e-05, + "loss": 0.4955, + "step": 12552 + }, + { + "epoch": 16.06784, + "grad_norm": 1.0448590517044067, + "learning_rate": 2.4909963985594238e-05, + "loss": 0.5371, + "step": 12553 + }, + { + "epoch": 16.06912, + "grad_norm": 1.012335181236267, + "learning_rate": 2.490796318527411e-05, + "loss": 0.505, + "step": 12554 + }, + { + "epoch": 16.0704, + "grad_norm": 1.0684611797332764, + "learning_rate": 2.4905962384953982e-05, + "loss": 0.5381, + "step": 12555 + }, + { + "epoch": 16.07168, + "grad_norm": 1.0244140625, + "learning_rate": 2.4903961584633857e-05, + "loss": 0.4584, + "step": 12556 + }, + { + "epoch": 16.07296, + "grad_norm": 1.0357544422149658, + "learning_rate": 2.4901960784313726e-05, + "loss": 0.5328, + "step": 12557 + }, + { + "epoch": 16.07424, + "grad_norm": 1.0519928932189941, + "learning_rate": 2.4899959983993597e-05, + "loss": 0.4919, + "step": 12558 + }, + { + "epoch": 16.07552, + "grad_norm": 1.0284450054168701, + "learning_rate": 2.489795918367347e-05, + "loss": 0.5221, + "step": 12559 + }, + { + "epoch": 16.0768, + "grad_norm": 0.9368478059768677, + "learning_rate": 2.4895958383353345e-05, + "loss": 0.4828, + "step": 12560 + }, + { + "epoch": 16.07808, + "grad_norm": 0.9959625601768494, + "learning_rate": 2.4893957583033213e-05, + "loss": 0.5332, + "step": 12561 + }, + { + "epoch": 16.07936, + "grad_norm": 1.113194465637207, + "learning_rate": 2.4891956782713085e-05, + "loss": 0.509, + "step": 12562 + }, + { + "epoch": 16.08064, + "grad_norm": 0.9688543081283569, + "learning_rate": 2.488995598239296e-05, + "loss": 0.5131, + "step": 12563 + }, + { + "epoch": 16.08192, + "grad_norm": 
1.0220595598220825, + "learning_rate": 2.4887955182072832e-05, + "loss": 0.4878, + "step": 12564 + }, + { + "epoch": 16.0832, + "grad_norm": 1.0453155040740967, + "learning_rate": 2.48859543817527e-05, + "loss": 0.5074, + "step": 12565 + }, + { + "epoch": 16.08448, + "grad_norm": 1.0528833866119385, + "learning_rate": 2.4883953581432572e-05, + "loss": 0.5193, + "step": 12566 + }, + { + "epoch": 16.08576, + "grad_norm": 1.0217548608779907, + "learning_rate": 2.4881952781112448e-05, + "loss": 0.5449, + "step": 12567 + }, + { + "epoch": 16.087040000000002, + "grad_norm": 1.0063393115997314, + "learning_rate": 2.487995198079232e-05, + "loss": 0.4872, + "step": 12568 + }, + { + "epoch": 16.08832, + "grad_norm": 1.0022279024124146, + "learning_rate": 2.4877951180472188e-05, + "loss": 0.5315, + "step": 12569 + }, + { + "epoch": 16.0896, + "grad_norm": 0.9380884170532227, + "learning_rate": 2.4875950380152063e-05, + "loss": 0.4819, + "step": 12570 + }, + { + "epoch": 16.09088, + "grad_norm": 1.018511176109314, + "learning_rate": 2.4873949579831935e-05, + "loss": 0.5077, + "step": 12571 + }, + { + "epoch": 16.09216, + "grad_norm": 0.9977579116821289, + "learning_rate": 2.4871948779511807e-05, + "loss": 0.491, + "step": 12572 + }, + { + "epoch": 16.09344, + "grad_norm": 1.025163173675537, + "learning_rate": 2.4869947979191675e-05, + "loss": 0.4826, + "step": 12573 + }, + { + "epoch": 16.09472, + "grad_norm": 1.0350524187088013, + "learning_rate": 2.486794717887155e-05, + "loss": 0.5445, + "step": 12574 + }, + { + "epoch": 16.096, + "grad_norm": 1.023727297782898, + "learning_rate": 2.4865946378551422e-05, + "loss": 0.5394, + "step": 12575 + }, + { + "epoch": 16.09728, + "grad_norm": 0.9656332731246948, + "learning_rate": 2.4863945578231294e-05, + "loss": 0.4724, + "step": 12576 + }, + { + "epoch": 16.09856, + "grad_norm": 1.0451605319976807, + "learning_rate": 2.4861944777911166e-05, + "loss": 0.5221, + "step": 12577 + }, + { + "epoch": 16.09984, + "grad_norm": 
1.0490944385528564, + "learning_rate": 2.4859943977591038e-05, + "loss": 0.5555, + "step": 12578 + }, + { + "epoch": 16.10112, + "grad_norm": 1.0549135208129883, + "learning_rate": 2.485794317727091e-05, + "loss": 0.4902, + "step": 12579 + }, + { + "epoch": 16.1024, + "grad_norm": 0.9879729151725769, + "learning_rate": 2.485594237695078e-05, + "loss": 0.4875, + "step": 12580 + }, + { + "epoch": 16.10368, + "grad_norm": 1.011025071144104, + "learning_rate": 2.4853941576630654e-05, + "loss": 0.4903, + "step": 12581 + }, + { + "epoch": 16.10496, + "grad_norm": 0.9577101469039917, + "learning_rate": 2.4851940776310525e-05, + "loss": 0.5068, + "step": 12582 + }, + { + "epoch": 16.10624, + "grad_norm": 0.9981372356414795, + "learning_rate": 2.4849939975990397e-05, + "loss": 0.5308, + "step": 12583 + }, + { + "epoch": 16.10752, + "grad_norm": 0.9399566054344177, + "learning_rate": 2.484793917567027e-05, + "loss": 0.4912, + "step": 12584 + }, + { + "epoch": 16.1088, + "grad_norm": 0.9757966995239258, + "learning_rate": 2.484593837535014e-05, + "loss": 0.5236, + "step": 12585 + }, + { + "epoch": 16.11008, + "grad_norm": 1.0582964420318604, + "learning_rate": 2.4843937575030013e-05, + "loss": 0.5207, + "step": 12586 + }, + { + "epoch": 16.11136, + "grad_norm": 0.9841398596763611, + "learning_rate": 2.4841936774709885e-05, + "loss": 0.5375, + "step": 12587 + }, + { + "epoch": 16.11264, + "grad_norm": 0.9746212959289551, + "learning_rate": 2.4839935974389757e-05, + "loss": 0.4938, + "step": 12588 + }, + { + "epoch": 16.11392, + "grad_norm": 0.9631856679916382, + "learning_rate": 2.483793517406963e-05, + "loss": 0.4965, + "step": 12589 + }, + { + "epoch": 16.1152, + "grad_norm": 0.99147629737854, + "learning_rate": 2.48359343737495e-05, + "loss": 0.5078, + "step": 12590 + }, + { + "epoch": 16.11648, + "grad_norm": 1.013726830482483, + "learning_rate": 2.4833933573429375e-05, + "loss": 0.5205, + "step": 12591 + }, + { + "epoch": 16.11776, + "grad_norm": 1.0126923322677612, + 
"learning_rate": 2.4831932773109244e-05, + "loss": 0.47, + "step": 12592 + }, + { + "epoch": 16.11904, + "grad_norm": 0.9509507417678833, + "learning_rate": 2.4829931972789116e-05, + "loss": 0.5231, + "step": 12593 + }, + { + "epoch": 16.12032, + "grad_norm": 1.1204897165298462, + "learning_rate": 2.4827931172468988e-05, + "loss": 0.537, + "step": 12594 + }, + { + "epoch": 16.1216, + "grad_norm": 1.0702080726623535, + "learning_rate": 2.4825930372148863e-05, + "loss": 0.5298, + "step": 12595 + }, + { + "epoch": 16.12288, + "grad_norm": 0.982305645942688, + "learning_rate": 2.482392957182873e-05, + "loss": 0.5061, + "step": 12596 + }, + { + "epoch": 16.12416, + "grad_norm": 0.9869879484176636, + "learning_rate": 2.4821928771508603e-05, + "loss": 0.5259, + "step": 12597 + }, + { + "epoch": 16.12544, + "grad_norm": 1.026171088218689, + "learning_rate": 2.4819927971188475e-05, + "loss": 0.4958, + "step": 12598 + }, + { + "epoch": 16.12672, + "grad_norm": 0.9694840312004089, + "learning_rate": 2.481792717086835e-05, + "loss": 0.5093, + "step": 12599 + }, + { + "epoch": 16.128, + "grad_norm": 1.0091004371643066, + "learning_rate": 2.481592637054822e-05, + "loss": 0.4943, + "step": 12600 + }, + { + "epoch": 16.12928, + "grad_norm": 1.0162972211837769, + "learning_rate": 2.481392557022809e-05, + "loss": 0.4813, + "step": 12601 + }, + { + "epoch": 16.13056, + "grad_norm": 1.0108006000518799, + "learning_rate": 2.4811924769907966e-05, + "loss": 0.5231, + "step": 12602 + }, + { + "epoch": 16.13184, + "grad_norm": 1.009252667427063, + "learning_rate": 2.4809923969587838e-05, + "loss": 0.5657, + "step": 12603 + }, + { + "epoch": 16.13312, + "grad_norm": 0.9438086748123169, + "learning_rate": 2.4807923169267706e-05, + "loss": 0.4868, + "step": 12604 + }, + { + "epoch": 16.1344, + "grad_norm": 0.9571229815483093, + "learning_rate": 2.4805922368947578e-05, + "loss": 0.4877, + "step": 12605 + }, + { + "epoch": 16.13568, + "grad_norm": 1.0769860744476318, + "learning_rate": 
2.4803921568627453e-05, + "loss": 0.5616, + "step": 12606 + }, + { + "epoch": 16.13696, + "grad_norm": 0.9902837872505188, + "learning_rate": 2.4801920768307325e-05, + "loss": 0.5381, + "step": 12607 + }, + { + "epoch": 16.13824, + "grad_norm": 0.9671089053153992, + "learning_rate": 2.4799919967987194e-05, + "loss": 0.5305, + "step": 12608 + }, + { + "epoch": 16.13952, + "grad_norm": 1.058725118637085, + "learning_rate": 2.479791916766707e-05, + "loss": 0.4747, + "step": 12609 + }, + { + "epoch": 16.1408, + "grad_norm": 1.0563637018203735, + "learning_rate": 2.479591836734694e-05, + "loss": 0.5424, + "step": 12610 + }, + { + "epoch": 16.14208, + "grad_norm": 0.9643771052360535, + "learning_rate": 2.4793917567026813e-05, + "loss": 0.4893, + "step": 12611 + }, + { + "epoch": 16.14336, + "grad_norm": 1.021338939666748, + "learning_rate": 2.479191676670668e-05, + "loss": 0.5032, + "step": 12612 + }, + { + "epoch": 16.14464, + "grad_norm": 0.9763515591621399, + "learning_rate": 2.4789915966386556e-05, + "loss": 0.5038, + "step": 12613 + }, + { + "epoch": 16.14592, + "grad_norm": 1.0414294004440308, + "learning_rate": 2.4787915166066428e-05, + "loss": 0.4937, + "step": 12614 + }, + { + "epoch": 16.1472, + "grad_norm": 1.0470645427703857, + "learning_rate": 2.47859143657463e-05, + "loss": 0.5073, + "step": 12615 + }, + { + "epoch": 16.14848, + "grad_norm": 0.9787927269935608, + "learning_rate": 2.4783913565426172e-05, + "loss": 0.517, + "step": 12616 + }, + { + "epoch": 16.14976, + "grad_norm": 1.0010355710983276, + "learning_rate": 2.4781912765106044e-05, + "loss": 0.4683, + "step": 12617 + }, + { + "epoch": 16.15104, + "grad_norm": 0.978118896484375, + "learning_rate": 2.4779911964785916e-05, + "loss": 0.4818, + "step": 12618 + }, + { + "epoch": 16.15232, + "grad_norm": 0.9748190641403198, + "learning_rate": 2.4777911164465787e-05, + "loss": 0.5617, + "step": 12619 + }, + { + "epoch": 16.1536, + "grad_norm": 0.9519315361976624, + "learning_rate": 2.477591036414566e-05, 
+ "loss": 0.4726, + "step": 12620 + }, + { + "epoch": 16.15488, + "grad_norm": 1.0163748264312744, + "learning_rate": 2.477390956382553e-05, + "loss": 0.5284, + "step": 12621 + }, + { + "epoch": 16.15616, + "grad_norm": 1.035246729850769, + "learning_rate": 2.4771908763505403e-05, + "loss": 0.5376, + "step": 12622 + }, + { + "epoch": 16.15744, + "grad_norm": 1.0474730730056763, + "learning_rate": 2.4769907963185275e-05, + "loss": 0.5061, + "step": 12623 + }, + { + "epoch": 16.15872, + "grad_norm": 0.9941502809524536, + "learning_rate": 2.4767907162865147e-05, + "loss": 0.5582, + "step": 12624 + }, + { + "epoch": 16.16, + "grad_norm": 1.0640026330947876, + "learning_rate": 2.476590636254502e-05, + "loss": 0.513, + "step": 12625 + }, + { + "epoch": 16.16128, + "grad_norm": 1.030907154083252, + "learning_rate": 2.476390556222489e-05, + "loss": 0.4757, + "step": 12626 + }, + { + "epoch": 16.16256, + "grad_norm": 0.996875524520874, + "learning_rate": 2.4761904761904762e-05, + "loss": 0.4995, + "step": 12627 + }, + { + "epoch": 16.16384, + "grad_norm": 0.9607611894607544, + "learning_rate": 2.4759903961584634e-05, + "loss": 0.4947, + "step": 12628 + }, + { + "epoch": 16.16512, + "grad_norm": 0.9883370995521545, + "learning_rate": 2.4757903161264506e-05, + "loss": 0.5161, + "step": 12629 + }, + { + "epoch": 16.1664, + "grad_norm": 1.0071051120758057, + "learning_rate": 2.475590236094438e-05, + "loss": 0.4924, + "step": 12630 + }, + { + "epoch": 16.16768, + "grad_norm": 1.0023144483566284, + "learning_rate": 2.475390156062425e-05, + "loss": 0.4846, + "step": 12631 + }, + { + "epoch": 16.16896, + "grad_norm": 1.0280662775039673, + "learning_rate": 2.475190076030412e-05, + "loss": 0.5245, + "step": 12632 + }, + { + "epoch": 16.17024, + "grad_norm": 1.0610262155532837, + "learning_rate": 2.4749899959983993e-05, + "loss": 0.5165, + "step": 12633 + }, + { + "epoch": 16.17152, + "grad_norm": 0.9850628972053528, + "learning_rate": 2.474789915966387e-05, + "loss": 0.4994, + 
"step": 12634 + }, + { + "epoch": 16.1728, + "grad_norm": 1.0291600227355957, + "learning_rate": 2.4745898359343737e-05, + "loss": 0.5098, + "step": 12635 + }, + { + "epoch": 16.17408, + "grad_norm": 0.9818286895751953, + "learning_rate": 2.474389755902361e-05, + "loss": 0.5086, + "step": 12636 + }, + { + "epoch": 16.17536, + "grad_norm": 0.9875121712684631, + "learning_rate": 2.4741896758703484e-05, + "loss": 0.475, + "step": 12637 + }, + { + "epoch": 16.17664, + "grad_norm": 0.9203592538833618, + "learning_rate": 2.4739895958383356e-05, + "loss": 0.4558, + "step": 12638 + }, + { + "epoch": 16.17792, + "grad_norm": 0.9402249455451965, + "learning_rate": 2.4737895158063225e-05, + "loss": 0.4928, + "step": 12639 + }, + { + "epoch": 16.1792, + "grad_norm": 0.9855098724365234, + "learning_rate": 2.4735894357743096e-05, + "loss": 0.4993, + "step": 12640 + }, + { + "epoch": 16.18048, + "grad_norm": 0.9504002332687378, + "learning_rate": 2.4733893557422972e-05, + "loss": 0.5067, + "step": 12641 + }, + { + "epoch": 16.18176, + "grad_norm": 1.0600100755691528, + "learning_rate": 2.4731892757102844e-05, + "loss": 0.5632, + "step": 12642 + }, + { + "epoch": 16.18304, + "grad_norm": 1.0117945671081543, + "learning_rate": 2.4729891956782712e-05, + "loss": 0.5322, + "step": 12643 + }, + { + "epoch": 16.18432, + "grad_norm": 0.9786906242370605, + "learning_rate": 2.4727891156462587e-05, + "loss": 0.4928, + "step": 12644 + }, + { + "epoch": 16.1856, + "grad_norm": 0.9911783337593079, + "learning_rate": 2.472589035614246e-05, + "loss": 0.4789, + "step": 12645 + }, + { + "epoch": 16.18688, + "grad_norm": 1.0481369495391846, + "learning_rate": 2.472388955582233e-05, + "loss": 0.5374, + "step": 12646 + }, + { + "epoch": 16.18816, + "grad_norm": 0.9751612544059753, + "learning_rate": 2.47218887555022e-05, + "loss": 0.4826, + "step": 12647 + }, + { + "epoch": 16.18944, + "grad_norm": 1.0269287824630737, + "learning_rate": 2.4719887955182075e-05, + "loss": 0.5436, + "step": 12648 + }, + 
{ + "epoch": 16.19072, + "grad_norm": 0.966488242149353, + "learning_rate": 2.4717887154861947e-05, + "loss": 0.4886, + "step": 12649 + }, + { + "epoch": 16.192, + "grad_norm": 1.0197010040283203, + "learning_rate": 2.471588635454182e-05, + "loss": 0.5519, + "step": 12650 + }, + { + "epoch": 16.19328, + "grad_norm": 0.9842081665992737, + "learning_rate": 2.4713885554221687e-05, + "loss": 0.5198, + "step": 12651 + }, + { + "epoch": 16.19456, + "grad_norm": 1.0210785865783691, + "learning_rate": 2.4711884753901562e-05, + "loss": 0.4755, + "step": 12652 + }, + { + "epoch": 16.19584, + "grad_norm": 1.0439190864562988, + "learning_rate": 2.4709883953581434e-05, + "loss": 0.4878, + "step": 12653 + }, + { + "epoch": 16.19712, + "grad_norm": 0.9892864227294922, + "learning_rate": 2.4707883153261306e-05, + "loss": 0.4941, + "step": 12654 + }, + { + "epoch": 16.1984, + "grad_norm": 1.1083271503448486, + "learning_rate": 2.4705882352941178e-05, + "loss": 0.5406, + "step": 12655 + }, + { + "epoch": 16.19968, + "grad_norm": 1.0279945135116577, + "learning_rate": 2.470388155262105e-05, + "loss": 0.5296, + "step": 12656 + }, + { + "epoch": 16.20096, + "grad_norm": 0.9678087830543518, + "learning_rate": 2.470188075230092e-05, + "loss": 0.4879, + "step": 12657 + }, + { + "epoch": 16.20224, + "grad_norm": 1.0200779438018799, + "learning_rate": 2.4699879951980793e-05, + "loss": 0.5342, + "step": 12658 + }, + { + "epoch": 16.20352, + "grad_norm": 1.0152758359909058, + "learning_rate": 2.4697879151660665e-05, + "loss": 0.5535, + "step": 12659 + }, + { + "epoch": 16.2048, + "grad_norm": 1.0245957374572754, + "learning_rate": 2.4695878351340537e-05, + "loss": 0.5309, + "step": 12660 + }, + { + "epoch": 16.20608, + "grad_norm": 1.054991364479065, + "learning_rate": 2.469387755102041e-05, + "loss": 0.5356, + "step": 12661 + }, + { + "epoch": 16.20736, + "grad_norm": 1.0215634107589722, + "learning_rate": 2.4691876750700284e-05, + "loss": 0.5165, + "step": 12662 + }, + { + "epoch": 
16.20864, + "grad_norm": 0.9687957763671875, + "learning_rate": 2.4689875950380153e-05, + "loss": 0.4773, + "step": 12663 + }, + { + "epoch": 16.20992, + "grad_norm": 1.0658154487609863, + "learning_rate": 2.4687875150060024e-05, + "loss": 0.582, + "step": 12664 + }, + { + "epoch": 16.2112, + "grad_norm": 1.0069797039031982, + "learning_rate": 2.4685874349739896e-05, + "loss": 0.5316, + "step": 12665 + }, + { + "epoch": 16.21248, + "grad_norm": 1.0188541412353516, + "learning_rate": 2.468387354941977e-05, + "loss": 0.5182, + "step": 12666 + }, + { + "epoch": 16.21376, + "grad_norm": 1.0730429887771606, + "learning_rate": 2.468187274909964e-05, + "loss": 0.5226, + "step": 12667 + }, + { + "epoch": 16.21504, + "grad_norm": 1.0113489627838135, + "learning_rate": 2.4679871948779512e-05, + "loss": 0.4887, + "step": 12668 + }, + { + "epoch": 16.21632, + "grad_norm": 0.9426849484443665, + "learning_rate": 2.4677871148459387e-05, + "loss": 0.4777, + "step": 12669 + }, + { + "epoch": 16.2176, + "grad_norm": 0.9736514687538147, + "learning_rate": 2.467587034813926e-05, + "loss": 0.4941, + "step": 12670 + }, + { + "epoch": 16.21888, + "grad_norm": 1.0153673887252808, + "learning_rate": 2.4673869547819127e-05, + "loss": 0.5524, + "step": 12671 + }, + { + "epoch": 16.22016, + "grad_norm": 1.018025279045105, + "learning_rate": 2.4671868747499e-05, + "loss": 0.4976, + "step": 12672 + }, + { + "epoch": 16.22144, + "grad_norm": 0.9701156616210938, + "learning_rate": 2.4669867947178875e-05, + "loss": 0.4712, + "step": 12673 + }, + { + "epoch": 16.22272, + "grad_norm": 1.0330697298049927, + "learning_rate": 2.4667867146858746e-05, + "loss": 0.4938, + "step": 12674 + }, + { + "epoch": 16.224, + "grad_norm": 0.9708216190338135, + "learning_rate": 2.4665866346538615e-05, + "loss": 0.4665, + "step": 12675 + }, + { + "epoch": 16.22528, + "grad_norm": 0.9829319715499878, + "learning_rate": 2.466386554621849e-05, + "loss": 0.5134, + "step": 12676 + }, + { + "epoch": 16.22656, + "grad_norm": 
1.0143637657165527, + "learning_rate": 2.4661864745898362e-05, + "loss": 0.5448, + "step": 12677 + }, + { + "epoch": 16.22784, + "grad_norm": 1.0487371683120728, + "learning_rate": 2.4659863945578234e-05, + "loss": 0.5299, + "step": 12678 + }, + { + "epoch": 16.22912, + "grad_norm": 0.9548150300979614, + "learning_rate": 2.4657863145258102e-05, + "loss": 0.4779, + "step": 12679 + }, + { + "epoch": 16.2304, + "grad_norm": 1.0021804571151733, + "learning_rate": 2.4655862344937978e-05, + "loss": 0.497, + "step": 12680 + }, + { + "epoch": 16.23168, + "grad_norm": 0.9995138049125671, + "learning_rate": 2.465386154461785e-05, + "loss": 0.5175, + "step": 12681 + }, + { + "epoch": 16.23296, + "grad_norm": 1.010904312133789, + "learning_rate": 2.465186074429772e-05, + "loss": 0.5227, + "step": 12682 + }, + { + "epoch": 16.23424, + "grad_norm": 0.9441868662834167, + "learning_rate": 2.4649859943977593e-05, + "loss": 0.5053, + "step": 12683 + }, + { + "epoch": 16.23552, + "grad_norm": 1.0118534564971924, + "learning_rate": 2.4647859143657465e-05, + "loss": 0.5208, + "step": 12684 + }, + { + "epoch": 16.2368, + "grad_norm": 1.0776724815368652, + "learning_rate": 2.4645858343337337e-05, + "loss": 0.5097, + "step": 12685 + }, + { + "epoch": 16.23808, + "grad_norm": 1.064697504043579, + "learning_rate": 2.464385754301721e-05, + "loss": 0.5166, + "step": 12686 + }, + { + "epoch": 16.23936, + "grad_norm": 1.0309760570526123, + "learning_rate": 2.464185674269708e-05, + "loss": 0.5428, + "step": 12687 + }, + { + "epoch": 16.24064, + "grad_norm": 0.9784319400787354, + "learning_rate": 2.4639855942376952e-05, + "loss": 0.4882, + "step": 12688 + }, + { + "epoch": 16.24192, + "grad_norm": 1.0319559574127197, + "learning_rate": 2.4637855142056824e-05, + "loss": 0.527, + "step": 12689 + }, + { + "epoch": 16.2432, + "grad_norm": 0.991915762424469, + "learning_rate": 2.4635854341736696e-05, + "loss": 0.523, + "step": 12690 + }, + { + "epoch": 16.24448, + "grad_norm": 1.043216586112976, + 
"learning_rate": 2.4633853541416568e-05, + "loss": 0.4977, + "step": 12691 + }, + { + "epoch": 16.24576, + "grad_norm": 1.063425064086914, + "learning_rate": 2.463185274109644e-05, + "loss": 0.5245, + "step": 12692 + }, + { + "epoch": 16.24704, + "grad_norm": 1.0236256122589111, + "learning_rate": 2.462985194077631e-05, + "loss": 0.5112, + "step": 12693 + }, + { + "epoch": 16.24832, + "grad_norm": 0.992674708366394, + "learning_rate": 2.4627851140456183e-05, + "loss": 0.5062, + "step": 12694 + }, + { + "epoch": 16.2496, + "grad_norm": 1.0538854598999023, + "learning_rate": 2.4625850340136055e-05, + "loss": 0.5244, + "step": 12695 + }, + { + "epoch": 16.25088, + "grad_norm": 1.021213412284851, + "learning_rate": 2.4623849539815927e-05, + "loss": 0.5097, + "step": 12696 + }, + { + "epoch": 16.25216, + "grad_norm": 1.0427947044372559, + "learning_rate": 2.4621848739495802e-05, + "loss": 0.5288, + "step": 12697 + }, + { + "epoch": 16.25344, + "grad_norm": 1.079925537109375, + "learning_rate": 2.461984793917567e-05, + "loss": 0.511, + "step": 12698 + }, + { + "epoch": 16.25472, + "grad_norm": 1.0293481349945068, + "learning_rate": 2.4617847138855543e-05, + "loss": 0.534, + "step": 12699 + }, + { + "epoch": 16.256, + "grad_norm": 1.0903656482696533, + "learning_rate": 2.4615846338535415e-05, + "loss": 0.5344, + "step": 12700 + }, + { + "epoch": 16.25728, + "grad_norm": 0.9660569429397583, + "learning_rate": 2.461384553821529e-05, + "loss": 0.507, + "step": 12701 + }, + { + "epoch": 16.25856, + "grad_norm": 0.9629020094871521, + "learning_rate": 2.461184473789516e-05, + "loss": 0.4976, + "step": 12702 + }, + { + "epoch": 16.25984, + "grad_norm": 0.9647486209869385, + "learning_rate": 2.460984393757503e-05, + "loss": 0.504, + "step": 12703 + }, + { + "epoch": 16.26112, + "grad_norm": 0.996180534362793, + "learning_rate": 2.4607843137254902e-05, + "loss": 0.5255, + "step": 12704 + }, + { + "epoch": 16.2624, + "grad_norm": 0.9972898960113525, + "learning_rate": 
2.4605842336934777e-05, + "loss": 0.5179, + "step": 12705 + }, + { + "epoch": 16.26368, + "grad_norm": 0.97672438621521, + "learning_rate": 2.4603841536614646e-05, + "loss": 0.4798, + "step": 12706 + }, + { + "epoch": 16.26496, + "grad_norm": 1.013742446899414, + "learning_rate": 2.4601840736294518e-05, + "loss": 0.542, + "step": 12707 + }, + { + "epoch": 16.26624, + "grad_norm": 1.0334280729293823, + "learning_rate": 2.4599839935974393e-05, + "loss": 0.517, + "step": 12708 + }, + { + "epoch": 16.26752, + "grad_norm": 0.966270923614502, + "learning_rate": 2.4597839135654265e-05, + "loss": 0.4617, + "step": 12709 + }, + { + "epoch": 16.2688, + "grad_norm": 1.0279269218444824, + "learning_rate": 2.4595838335334133e-05, + "loss": 0.5156, + "step": 12710 + }, + { + "epoch": 16.27008, + "grad_norm": 0.993587851524353, + "learning_rate": 2.4593837535014005e-05, + "loss": 0.5515, + "step": 12711 + }, + { + "epoch": 16.27136, + "grad_norm": 1.055559515953064, + "learning_rate": 2.459183673469388e-05, + "loss": 0.5437, + "step": 12712 + }, + { + "epoch": 16.27264, + "grad_norm": 1.0224788188934326, + "learning_rate": 2.4589835934373752e-05, + "loss": 0.5237, + "step": 12713 + }, + { + "epoch": 16.27392, + "grad_norm": 1.0352107286453247, + "learning_rate": 2.458783513405362e-05, + "loss": 0.518, + "step": 12714 + }, + { + "epoch": 16.2752, + "grad_norm": 1.0856186151504517, + "learning_rate": 2.4585834333733496e-05, + "loss": 0.5168, + "step": 12715 + }, + { + "epoch": 16.27648, + "grad_norm": 1.0590870380401611, + "learning_rate": 2.4583833533413368e-05, + "loss": 0.517, + "step": 12716 + }, + { + "epoch": 16.27776, + "grad_norm": 1.0563161373138428, + "learning_rate": 2.458183273309324e-05, + "loss": 0.5362, + "step": 12717 + }, + { + "epoch": 16.27904, + "grad_norm": 1.0422947406768799, + "learning_rate": 2.4579831932773108e-05, + "loss": 0.5014, + "step": 12718 + }, + { + "epoch": 16.28032, + "grad_norm": 1.0459036827087402, + "learning_rate": 2.4577831132452983e-05, + 
"loss": 0.5276, + "step": 12719 + }, + { + "epoch": 16.2816, + "grad_norm": 1.057701587677002, + "learning_rate": 2.4575830332132855e-05, + "loss": 0.4813, + "step": 12720 + }, + { + "epoch": 16.28288, + "grad_norm": 1.0466841459274292, + "learning_rate": 2.4573829531812727e-05, + "loss": 0.5175, + "step": 12721 + }, + { + "epoch": 16.28416, + "grad_norm": 0.9757436513900757, + "learning_rate": 2.45718287314926e-05, + "loss": 0.4863, + "step": 12722 + }, + { + "epoch": 16.28544, + "grad_norm": 1.0128544569015503, + "learning_rate": 2.456982793117247e-05, + "loss": 0.507, + "step": 12723 + }, + { + "epoch": 16.28672, + "grad_norm": 0.9682973027229309, + "learning_rate": 2.4567827130852343e-05, + "loss": 0.5172, + "step": 12724 + }, + { + "epoch": 16.288, + "grad_norm": 0.9596603512763977, + "learning_rate": 2.4565826330532214e-05, + "loss": 0.522, + "step": 12725 + }, + { + "epoch": 16.28928, + "grad_norm": 0.9290403723716736, + "learning_rate": 2.4563825530212086e-05, + "loss": 0.472, + "step": 12726 + }, + { + "epoch": 16.29056, + "grad_norm": 1.052056908607483, + "learning_rate": 2.4561824729891958e-05, + "loss": 0.5415, + "step": 12727 + }, + { + "epoch": 16.29184, + "grad_norm": 1.0477123260498047, + "learning_rate": 2.455982392957183e-05, + "loss": 0.5393, + "step": 12728 + }, + { + "epoch": 16.29312, + "grad_norm": 0.9668442010879517, + "learning_rate": 2.4557823129251702e-05, + "loss": 0.4741, + "step": 12729 + }, + { + "epoch": 16.2944, + "grad_norm": 1.0437530279159546, + "learning_rate": 2.4555822328931574e-05, + "loss": 0.5, + "step": 12730 + }, + { + "epoch": 16.29568, + "grad_norm": 0.9850262999534607, + "learning_rate": 2.4553821528611446e-05, + "loss": 0.5108, + "step": 12731 + }, + { + "epoch": 16.29696, + "grad_norm": 0.9622449278831482, + "learning_rate": 2.4551820728291317e-05, + "loss": 0.506, + "step": 12732 + }, + { + "epoch": 16.29824, + "grad_norm": 1.0005285739898682, + "learning_rate": 2.454981992797119e-05, + "loss": 0.5083, + "step": 
12733 + }, + { + "epoch": 16.29952, + "grad_norm": 0.9644542336463928, + "learning_rate": 2.454781912765106e-05, + "loss": 0.4986, + "step": 12734 + }, + { + "epoch": 16.3008, + "grad_norm": 1.0753077268600464, + "learning_rate": 2.4545818327330933e-05, + "loss": 0.5464, + "step": 12735 + }, + { + "epoch": 16.30208, + "grad_norm": 1.04744291305542, + "learning_rate": 2.4543817527010808e-05, + "loss": 0.5277, + "step": 12736 + }, + { + "epoch": 16.30336, + "grad_norm": 0.9650123119354248, + "learning_rate": 2.4541816726690677e-05, + "loss": 0.4648, + "step": 12737 + }, + { + "epoch": 16.30464, + "grad_norm": 0.9497844576835632, + "learning_rate": 2.453981592637055e-05, + "loss": 0.5344, + "step": 12738 + }, + { + "epoch": 16.30592, + "grad_norm": 1.0927493572235107, + "learning_rate": 2.453781512605042e-05, + "loss": 0.5106, + "step": 12739 + }, + { + "epoch": 16.3072, + "grad_norm": 1.0164883136749268, + "learning_rate": 2.4535814325730296e-05, + "loss": 0.5394, + "step": 12740 + }, + { + "epoch": 16.30848, + "grad_norm": 0.9408139586448669, + "learning_rate": 2.4533813525410164e-05, + "loss": 0.4882, + "step": 12741 + }, + { + "epoch": 16.30976, + "grad_norm": 0.9942525625228882, + "learning_rate": 2.4531812725090036e-05, + "loss": 0.5058, + "step": 12742 + }, + { + "epoch": 16.31104, + "grad_norm": 1.0041530132293701, + "learning_rate": 2.452981192476991e-05, + "loss": 0.4444, + "step": 12743 + }, + { + "epoch": 16.31232, + "grad_norm": 0.9434515833854675, + "learning_rate": 2.4527811124449783e-05, + "loss": 0.4969, + "step": 12744 + }, + { + "epoch": 16.3136, + "grad_norm": 1.0025871992111206, + "learning_rate": 2.452581032412965e-05, + "loss": 0.4901, + "step": 12745 + }, + { + "epoch": 16.31488, + "grad_norm": 0.9953147768974304, + "learning_rate": 2.4523809523809523e-05, + "loss": 0.512, + "step": 12746 + }, + { + "epoch": 16.31616, + "grad_norm": 1.1160242557525635, + "learning_rate": 2.45218087234894e-05, + "loss": 0.5995, + "step": 12747 + }, + { + 
"epoch": 16.31744, + "grad_norm": 0.9796026349067688, + "learning_rate": 2.451980792316927e-05, + "loss": 0.4865, + "step": 12748 + }, + { + "epoch": 16.31872, + "grad_norm": 1.0219937562942505, + "learning_rate": 2.451780712284914e-05, + "loss": 0.4902, + "step": 12749 + }, + { + "epoch": 16.32, + "grad_norm": 1.036454200744629, + "learning_rate": 2.4515806322529014e-05, + "loss": 0.5686, + "step": 12750 + }, + { + "epoch": 16.32128, + "grad_norm": 0.9680700302124023, + "learning_rate": 2.4513805522208886e-05, + "loss": 0.5085, + "step": 12751 + }, + { + "epoch": 16.32256, + "grad_norm": 1.001165509223938, + "learning_rate": 2.4511804721888758e-05, + "loss": 0.5052, + "step": 12752 + }, + { + "epoch": 16.32384, + "grad_norm": 0.9691818356513977, + "learning_rate": 2.4509803921568626e-05, + "loss": 0.5176, + "step": 12753 + }, + { + "epoch": 16.32512, + "grad_norm": 0.9570626616477966, + "learning_rate": 2.45078031212485e-05, + "loss": 0.4953, + "step": 12754 + }, + { + "epoch": 16.3264, + "grad_norm": 1.052276849746704, + "learning_rate": 2.4505802320928374e-05, + "loss": 0.5756, + "step": 12755 + }, + { + "epoch": 16.32768, + "grad_norm": 0.9954919219017029, + "learning_rate": 2.4503801520608245e-05, + "loss": 0.4592, + "step": 12756 + }, + { + "epoch": 16.32896, + "grad_norm": 1.041548490524292, + "learning_rate": 2.4501800720288117e-05, + "loss": 0.4829, + "step": 12757 + }, + { + "epoch": 16.33024, + "grad_norm": 1.0538829565048218, + "learning_rate": 2.449979991996799e-05, + "loss": 0.5347, + "step": 12758 + }, + { + "epoch": 16.33152, + "grad_norm": 1.0308974981307983, + "learning_rate": 2.449779911964786e-05, + "loss": 0.5045, + "step": 12759 + }, + { + "epoch": 16.3328, + "grad_norm": 1.0308443307876587, + "learning_rate": 2.4495798319327733e-05, + "loss": 0.5121, + "step": 12760 + }, + { + "epoch": 16.33408, + "grad_norm": 1.0774739980697632, + "learning_rate": 2.4493797519007605e-05, + "loss": 0.5345, + "step": 12761 + }, + { + "epoch": 16.33536, + 
"grad_norm": 1.0473657846450806, + "learning_rate": 2.4491796718687477e-05, + "loss": 0.5251, + "step": 12762 + }, + { + "epoch": 16.33664, + "grad_norm": 1.0154544115066528, + "learning_rate": 2.448979591836735e-05, + "loss": 0.498, + "step": 12763 + }, + { + "epoch": 16.33792, + "grad_norm": 0.9398629069328308, + "learning_rate": 2.448779511804722e-05, + "loss": 0.493, + "step": 12764 + }, + { + "epoch": 16.3392, + "grad_norm": 1.0081340074539185, + "learning_rate": 2.4485794317727092e-05, + "loss": 0.5177, + "step": 12765 + }, + { + "epoch": 16.34048, + "grad_norm": 1.0168988704681396, + "learning_rate": 2.4483793517406964e-05, + "loss": 0.5382, + "step": 12766 + }, + { + "epoch": 16.34176, + "grad_norm": 1.0033296346664429, + "learning_rate": 2.4481792717086836e-05, + "loss": 0.5449, + "step": 12767 + }, + { + "epoch": 16.34304, + "grad_norm": 0.9844812154769897, + "learning_rate": 2.4479791916766708e-05, + "loss": 0.4916, + "step": 12768 + }, + { + "epoch": 16.34432, + "grad_norm": 1.042959213256836, + "learning_rate": 2.447779111644658e-05, + "loss": 0.4871, + "step": 12769 + }, + { + "epoch": 16.3456, + "grad_norm": 1.0168309211730957, + "learning_rate": 2.447579031612645e-05, + "loss": 0.4914, + "step": 12770 + }, + { + "epoch": 16.34688, + "grad_norm": 1.0198049545288086, + "learning_rate": 2.4473789515806323e-05, + "loss": 0.4982, + "step": 12771 + }, + { + "epoch": 16.34816, + "grad_norm": 1.0532821416854858, + "learning_rate": 2.4471788715486195e-05, + "loss": 0.5367, + "step": 12772 + }, + { + "epoch": 16.34944, + "grad_norm": 1.0325431823730469, + "learning_rate": 2.4469787915166067e-05, + "loss": 0.5169, + "step": 12773 + }, + { + "epoch": 16.35072, + "grad_norm": 0.9796453714370728, + "learning_rate": 2.446778711484594e-05, + "loss": 0.4951, + "step": 12774 + }, + { + "epoch": 16.352, + "grad_norm": 1.0543535947799683, + "learning_rate": 2.4465786314525814e-05, + "loss": 0.5832, + "step": 12775 + }, + { + "epoch": 16.35328, + "grad_norm": 
1.0445586442947388, + "learning_rate": 2.4463785514205683e-05, + "loss": 0.5339, + "step": 12776 + }, + { + "epoch": 16.35456, + "grad_norm": 1.0602067708969116, + "learning_rate": 2.4461784713885554e-05, + "loss": 0.5414, + "step": 12777 + }, + { + "epoch": 16.35584, + "grad_norm": 1.0592375993728638, + "learning_rate": 2.4459783913565426e-05, + "loss": 0.5309, + "step": 12778 + }, + { + "epoch": 16.35712, + "grad_norm": 1.0167428255081177, + "learning_rate": 2.44577831132453e-05, + "loss": 0.5576, + "step": 12779 + }, + { + "epoch": 16.3584, + "grad_norm": 1.0088391304016113, + "learning_rate": 2.445578231292517e-05, + "loss": 0.4863, + "step": 12780 + }, + { + "epoch": 16.35968, + "grad_norm": 1.0060917139053345, + "learning_rate": 2.4453781512605042e-05, + "loss": 0.4916, + "step": 12781 + }, + { + "epoch": 16.36096, + "grad_norm": 0.9650666117668152, + "learning_rate": 2.4451780712284917e-05, + "loss": 0.483, + "step": 12782 + }, + { + "epoch": 16.36224, + "grad_norm": 1.0193016529083252, + "learning_rate": 2.444977991196479e-05, + "loss": 0.5077, + "step": 12783 + }, + { + "epoch": 16.36352, + "grad_norm": 1.00182044506073, + "learning_rate": 2.4447779111644657e-05, + "loss": 0.4533, + "step": 12784 + }, + { + "epoch": 16.3648, + "grad_norm": 1.016948938369751, + "learning_rate": 2.444577831132453e-05, + "loss": 0.4998, + "step": 12785 + }, + { + "epoch": 16.36608, + "grad_norm": 1.0034083127975464, + "learning_rate": 2.4443777511004404e-05, + "loss": 0.5126, + "step": 12786 + }, + { + "epoch": 16.36736, + "grad_norm": 1.0436108112335205, + "learning_rate": 2.4441776710684276e-05, + "loss": 0.503, + "step": 12787 + }, + { + "epoch": 16.36864, + "grad_norm": 1.076514482498169, + "learning_rate": 2.4439775910364145e-05, + "loss": 0.5608, + "step": 12788 + }, + { + "epoch": 16.36992, + "grad_norm": 1.050675868988037, + "learning_rate": 2.443777511004402e-05, + "loss": 0.5245, + "step": 12789 + }, + { + "epoch": 16.3712, + "grad_norm": 0.9627003073692322, + 
"learning_rate": 2.4435774309723892e-05, + "loss": 0.4655, + "step": 12790 + }, + { + "epoch": 16.37248, + "grad_norm": 1.0105419158935547, + "learning_rate": 2.4433773509403764e-05, + "loss": 0.5192, + "step": 12791 + }, + { + "epoch": 16.37376, + "grad_norm": 1.0531201362609863, + "learning_rate": 2.4431772709083632e-05, + "loss": 0.5213, + "step": 12792 + }, + { + "epoch": 16.37504, + "grad_norm": 1.0447622537612915, + "learning_rate": 2.4429771908763507e-05, + "loss": 0.4838, + "step": 12793 + }, + { + "epoch": 16.37632, + "grad_norm": 1.0392229557037354, + "learning_rate": 2.442777110844338e-05, + "loss": 0.5205, + "step": 12794 + }, + { + "epoch": 16.3776, + "grad_norm": 0.9850015640258789, + "learning_rate": 2.442577030812325e-05, + "loss": 0.5161, + "step": 12795 + }, + { + "epoch": 16.37888, + "grad_norm": 1.0995105504989624, + "learning_rate": 2.4423769507803123e-05, + "loss": 0.5643, + "step": 12796 + }, + { + "epoch": 16.38016, + "grad_norm": 1.0674548149108887, + "learning_rate": 2.4421768707482995e-05, + "loss": 0.5182, + "step": 12797 + }, + { + "epoch": 16.38144, + "grad_norm": 1.1218688488006592, + "learning_rate": 2.4419767907162867e-05, + "loss": 0.5739, + "step": 12798 + }, + { + "epoch": 16.38272, + "grad_norm": 0.980241060256958, + "learning_rate": 2.441776710684274e-05, + "loss": 0.5029, + "step": 12799 + }, + { + "epoch": 16.384, + "grad_norm": 1.0264852046966553, + "learning_rate": 2.441576630652261e-05, + "loss": 0.5188, + "step": 12800 + }, + { + "epoch": 16.38528, + "grad_norm": 1.005523920059204, + "learning_rate": 2.4413765506202482e-05, + "loss": 0.5254, + "step": 12801 + }, + { + "epoch": 16.38656, + "grad_norm": 1.0361162424087524, + "learning_rate": 2.4411764705882354e-05, + "loss": 0.4976, + "step": 12802 + }, + { + "epoch": 16.38784, + "grad_norm": 1.035775899887085, + "learning_rate": 2.4409763905562226e-05, + "loss": 0.5238, + "step": 12803 + }, + { + "epoch": 16.38912, + "grad_norm": 1.037307858467102, + "learning_rate": 
2.4407763105242098e-05, + "loss": 0.5143, + "step": 12804 + }, + { + "epoch": 16.3904, + "grad_norm": 0.9811749458312988, + "learning_rate": 2.440576230492197e-05, + "loss": 0.5074, + "step": 12805 + }, + { + "epoch": 16.39168, + "grad_norm": 1.11452317237854, + "learning_rate": 2.440376150460184e-05, + "loss": 0.4935, + "step": 12806 + }, + { + "epoch": 16.39296, + "grad_norm": 0.9943965077400208, + "learning_rate": 2.4401760704281713e-05, + "loss": 0.5208, + "step": 12807 + }, + { + "epoch": 16.39424, + "grad_norm": 1.129791259765625, + "learning_rate": 2.4399759903961585e-05, + "loss": 0.5443, + "step": 12808 + }, + { + "epoch": 16.39552, + "grad_norm": 1.0286704301834106, + "learning_rate": 2.4397759103641457e-05, + "loss": 0.5206, + "step": 12809 + }, + { + "epoch": 16.3968, + "grad_norm": 1.0669382810592651, + "learning_rate": 2.4395758303321332e-05, + "loss": 0.5445, + "step": 12810 + }, + { + "epoch": 16.39808, + "grad_norm": 1.0054514408111572, + "learning_rate": 2.43937575030012e-05, + "loss": 0.5141, + "step": 12811 + }, + { + "epoch": 16.39936, + "grad_norm": 1.0013278722763062, + "learning_rate": 2.4391756702681073e-05, + "loss": 0.5338, + "step": 12812 + }, + { + "epoch": 16.40064, + "grad_norm": 1.0158824920654297, + "learning_rate": 2.4389755902360945e-05, + "loss": 0.5281, + "step": 12813 + }, + { + "epoch": 16.40192, + "grad_norm": 0.9622918367385864, + "learning_rate": 2.438775510204082e-05, + "loss": 0.4957, + "step": 12814 + }, + { + "epoch": 16.4032, + "grad_norm": 1.0246028900146484, + "learning_rate": 2.438575430172069e-05, + "loss": 0.5386, + "step": 12815 + }, + { + "epoch": 16.40448, + "grad_norm": 1.0182970762252808, + "learning_rate": 2.438375350140056e-05, + "loss": 0.4567, + "step": 12816 + }, + { + "epoch": 16.40576, + "grad_norm": 1.0306717157363892, + "learning_rate": 2.4381752701080432e-05, + "loss": 0.574, + "step": 12817 + }, + { + "epoch": 16.40704, + "grad_norm": 1.003909707069397, + "learning_rate": 2.4379751900760307e-05, + 
"loss": 0.4996, + "step": 12818 + }, + { + "epoch": 16.40832, + "grad_norm": 1.0095973014831543, + "learning_rate": 2.4377751100440176e-05, + "loss": 0.5096, + "step": 12819 + }, + { + "epoch": 16.4096, + "grad_norm": 1.0749574899673462, + "learning_rate": 2.4375750300120048e-05, + "loss": 0.5673, + "step": 12820 + }, + { + "epoch": 16.41088, + "grad_norm": 1.0384900569915771, + "learning_rate": 2.4373749499799923e-05, + "loss": 0.4899, + "step": 12821 + }, + { + "epoch": 16.41216, + "grad_norm": 1.0096982717514038, + "learning_rate": 2.4371748699479795e-05, + "loss": 0.5459, + "step": 12822 + }, + { + "epoch": 16.41344, + "grad_norm": 0.9829618334770203, + "learning_rate": 2.4369747899159663e-05, + "loss": 0.5178, + "step": 12823 + }, + { + "epoch": 16.41472, + "grad_norm": 1.046731948852539, + "learning_rate": 2.4367747098839535e-05, + "loss": 0.5299, + "step": 12824 + }, + { + "epoch": 16.416, + "grad_norm": 1.0445644855499268, + "learning_rate": 2.436574629851941e-05, + "loss": 0.5256, + "step": 12825 + }, + { + "epoch": 16.41728, + "grad_norm": 1.0414682626724243, + "learning_rate": 2.4363745498199282e-05, + "loss": 0.5141, + "step": 12826 + }, + { + "epoch": 16.41856, + "grad_norm": 1.0373891592025757, + "learning_rate": 2.436174469787915e-05, + "loss": 0.4973, + "step": 12827 + }, + { + "epoch": 16.41984, + "grad_norm": 0.9886151552200317, + "learning_rate": 2.4359743897559026e-05, + "loss": 0.5466, + "step": 12828 + }, + { + "epoch": 16.42112, + "grad_norm": 1.0570114850997925, + "learning_rate": 2.4357743097238898e-05, + "loss": 0.5385, + "step": 12829 + }, + { + "epoch": 16.4224, + "grad_norm": 1.0138678550720215, + "learning_rate": 2.435574229691877e-05, + "loss": 0.5263, + "step": 12830 + }, + { + "epoch": 16.42368, + "grad_norm": 1.007300615310669, + "learning_rate": 2.4353741496598638e-05, + "loss": 0.5041, + "step": 12831 + }, + { + "epoch": 16.42496, + "grad_norm": 1.026974081993103, + "learning_rate": 2.4351740696278513e-05, + "loss": 0.521, + 
"step": 12832 + }, + { + "epoch": 16.42624, + "grad_norm": 0.9623899459838867, + "learning_rate": 2.4349739895958385e-05, + "loss": 0.4611, + "step": 12833 + }, + { + "epoch": 16.42752, + "grad_norm": 1.0570974349975586, + "learning_rate": 2.4347739095638257e-05, + "loss": 0.5223, + "step": 12834 + }, + { + "epoch": 16.4288, + "grad_norm": 0.989608883857727, + "learning_rate": 2.434573829531813e-05, + "loss": 0.4902, + "step": 12835 + }, + { + "epoch": 16.43008, + "grad_norm": 1.0434807538986206, + "learning_rate": 2.4343737494998e-05, + "loss": 0.5691, + "step": 12836 + }, + { + "epoch": 16.43136, + "grad_norm": 1.035138487815857, + "learning_rate": 2.4341736694677873e-05, + "loss": 0.5493, + "step": 12837 + }, + { + "epoch": 16.43264, + "grad_norm": 1.1263927221298218, + "learning_rate": 2.4339735894357744e-05, + "loss": 0.5908, + "step": 12838 + }, + { + "epoch": 16.43392, + "grad_norm": 1.0934780836105347, + "learning_rate": 2.4337735094037616e-05, + "loss": 0.5754, + "step": 12839 + }, + { + "epoch": 16.4352, + "grad_norm": 1.0011106729507446, + "learning_rate": 2.4335734293717488e-05, + "loss": 0.4959, + "step": 12840 + }, + { + "epoch": 16.43648, + "grad_norm": 1.0350295305252075, + "learning_rate": 2.433373349339736e-05, + "loss": 0.5492, + "step": 12841 + }, + { + "epoch": 16.43776, + "grad_norm": 1.044190526008606, + "learning_rate": 2.4331732693077232e-05, + "loss": 0.5304, + "step": 12842 + }, + { + "epoch": 16.43904, + "grad_norm": 0.9419692158699036, + "learning_rate": 2.4329731892757104e-05, + "loss": 0.4753, + "step": 12843 + }, + { + "epoch": 16.44032, + "grad_norm": 0.9518587589263916, + "learning_rate": 2.4327731092436976e-05, + "loss": 0.4901, + "step": 12844 + }, + { + "epoch": 16.4416, + "grad_norm": 0.9863883852958679, + "learning_rate": 2.4325730292116847e-05, + "loss": 0.5236, + "step": 12845 + }, + { + "epoch": 16.44288, + "grad_norm": 1.0617551803588867, + "learning_rate": 2.432372949179672e-05, + "loss": 0.5631, + "step": 12846 + }, + { 
+ "epoch": 16.44416, + "grad_norm": 1.0354115962982178, + "learning_rate": 2.432172869147659e-05, + "loss": 0.499, + "step": 12847 + }, + { + "epoch": 16.44544, + "grad_norm": 1.103115439414978, + "learning_rate": 2.4319727891156463e-05, + "loss": 0.5406, + "step": 12848 + }, + { + "epoch": 16.44672, + "grad_norm": 0.9908240437507629, + "learning_rate": 2.4317727090836338e-05, + "loss": 0.478, + "step": 12849 + }, + { + "epoch": 16.448, + "grad_norm": 1.0611495971679688, + "learning_rate": 2.4315726290516207e-05, + "loss": 0.5106, + "step": 12850 + }, + { + "epoch": 16.44928, + "grad_norm": 1.039263367652893, + "learning_rate": 2.431372549019608e-05, + "loss": 0.5012, + "step": 12851 + }, + { + "epoch": 16.45056, + "grad_norm": 1.0724480152130127, + "learning_rate": 2.431172468987595e-05, + "loss": 0.5161, + "step": 12852 + }, + { + "epoch": 16.45184, + "grad_norm": 1.0709526538848877, + "learning_rate": 2.4309723889555826e-05, + "loss": 0.4908, + "step": 12853 + }, + { + "epoch": 16.45312, + "grad_norm": 1.0412678718566895, + "learning_rate": 2.4307723089235694e-05, + "loss": 0.5438, + "step": 12854 + }, + { + "epoch": 16.4544, + "grad_norm": 0.9475101828575134, + "learning_rate": 2.4305722288915566e-05, + "loss": 0.4536, + "step": 12855 + }, + { + "epoch": 16.45568, + "grad_norm": 0.9541323781013489, + "learning_rate": 2.430372148859544e-05, + "loss": 0.5116, + "step": 12856 + }, + { + "epoch": 16.45696, + "grad_norm": 1.0412318706512451, + "learning_rate": 2.4301720688275313e-05, + "loss": 0.538, + "step": 12857 + }, + { + "epoch": 16.45824, + "grad_norm": 1.0488194227218628, + "learning_rate": 2.429971988795518e-05, + "loss": 0.5507, + "step": 12858 + }, + { + "epoch": 16.45952, + "grad_norm": 0.9851821660995483, + "learning_rate": 2.4297719087635053e-05, + "loss": 0.5014, + "step": 12859 + }, + { + "epoch": 16.4608, + "grad_norm": 0.9385454654693604, + "learning_rate": 2.429571828731493e-05, + "loss": 0.4872, + "step": 12860 + }, + { + "epoch": 16.46208, + 
"grad_norm": 1.0448015928268433, + "learning_rate": 2.42937174869948e-05, + "loss": 0.5212, + "step": 12861 + }, + { + "epoch": 16.46336, + "grad_norm": 0.9894904494285583, + "learning_rate": 2.429171668667467e-05, + "loss": 0.5181, + "step": 12862 + }, + { + "epoch": 16.46464, + "grad_norm": 1.0554355382919312, + "learning_rate": 2.4289715886354544e-05, + "loss": 0.557, + "step": 12863 + }, + { + "epoch": 16.46592, + "grad_norm": 1.055113434791565, + "learning_rate": 2.4287715086034416e-05, + "loss": 0.5408, + "step": 12864 + }, + { + "epoch": 16.4672, + "grad_norm": 0.9813659191131592, + "learning_rate": 2.4285714285714288e-05, + "loss": 0.518, + "step": 12865 + }, + { + "epoch": 16.46848, + "grad_norm": 1.0583056211471558, + "learning_rate": 2.4283713485394156e-05, + "loss": 0.5171, + "step": 12866 + }, + { + "epoch": 16.46976, + "grad_norm": 0.9968990087509155, + "learning_rate": 2.428171268507403e-05, + "loss": 0.5158, + "step": 12867 + }, + { + "epoch": 16.47104, + "grad_norm": 0.9917585849761963, + "learning_rate": 2.4279711884753904e-05, + "loss": 0.5027, + "step": 12868 + }, + { + "epoch": 16.47232, + "grad_norm": 0.9894549250602722, + "learning_rate": 2.4277711084433775e-05, + "loss": 0.4815, + "step": 12869 + }, + { + "epoch": 16.4736, + "grad_norm": 1.0116682052612305, + "learning_rate": 2.4275710284113647e-05, + "loss": 0.5373, + "step": 12870 + }, + { + "epoch": 16.47488, + "grad_norm": 0.9584359526634216, + "learning_rate": 2.427370948379352e-05, + "loss": 0.4933, + "step": 12871 + }, + { + "epoch": 16.47616, + "grad_norm": 0.9404587745666504, + "learning_rate": 2.427170868347339e-05, + "loss": 0.4687, + "step": 12872 + }, + { + "epoch": 16.47744, + "grad_norm": 0.9797256588935852, + "learning_rate": 2.4269707883153263e-05, + "loss": 0.5015, + "step": 12873 + }, + { + "epoch": 16.47872, + "grad_norm": 1.0498969554901123, + "learning_rate": 2.4267707082833135e-05, + "loss": 0.5247, + "step": 12874 + }, + { + "epoch": 16.48, + "grad_norm": 
0.9874524474143982, + "learning_rate": 2.4265706282513007e-05, + "loss": 0.4865, + "step": 12875 + }, + { + "epoch": 16.48128, + "grad_norm": 0.9777351021766663, + "learning_rate": 2.426370548219288e-05, + "loss": 0.4579, + "step": 12876 + }, + { + "epoch": 16.48256, + "grad_norm": 1.0781558752059937, + "learning_rate": 2.426170468187275e-05, + "loss": 0.5586, + "step": 12877 + }, + { + "epoch": 16.48384, + "grad_norm": 1.029046893119812, + "learning_rate": 2.4259703881552622e-05, + "loss": 0.5054, + "step": 12878 + }, + { + "epoch": 16.48512, + "grad_norm": 0.9987738728523254, + "learning_rate": 2.4257703081232494e-05, + "loss": 0.4803, + "step": 12879 + }, + { + "epoch": 16.4864, + "grad_norm": 1.0528926849365234, + "learning_rate": 2.4255702280912366e-05, + "loss": 0.5177, + "step": 12880 + }, + { + "epoch": 16.48768, + "grad_norm": 1.0812759399414062, + "learning_rate": 2.4253701480592238e-05, + "loss": 0.5125, + "step": 12881 + }, + { + "epoch": 16.48896, + "grad_norm": 0.9843981266021729, + "learning_rate": 2.425170068027211e-05, + "loss": 0.4758, + "step": 12882 + }, + { + "epoch": 16.49024, + "grad_norm": 0.9296069145202637, + "learning_rate": 2.424969987995198e-05, + "loss": 0.5027, + "step": 12883 + }, + { + "epoch": 16.49152, + "grad_norm": 0.9559636116027832, + "learning_rate": 2.4247699079631853e-05, + "loss": 0.511, + "step": 12884 + }, + { + "epoch": 16.4928, + "grad_norm": 1.0256993770599365, + "learning_rate": 2.4245698279311725e-05, + "loss": 0.5539, + "step": 12885 + }, + { + "epoch": 16.49408, + "grad_norm": 1.0049717426300049, + "learning_rate": 2.4243697478991597e-05, + "loss": 0.5549, + "step": 12886 + }, + { + "epoch": 16.49536, + "grad_norm": 1.005260705947876, + "learning_rate": 2.424169667867147e-05, + "loss": 0.5164, + "step": 12887 + }, + { + "epoch": 16.49664, + "grad_norm": 0.9868449568748474, + "learning_rate": 2.4239695878351344e-05, + "loss": 0.5297, + "step": 12888 + }, + { + "epoch": 16.49792, + "grad_norm": 1.0604848861694336, + 
"learning_rate": 2.4237695078031213e-05, + "loss": 0.5506, + "step": 12889 + }, + { + "epoch": 16.4992, + "grad_norm": 1.0443841218948364, + "learning_rate": 2.4235694277711084e-05, + "loss": 0.5574, + "step": 12890 + }, + { + "epoch": 16.50048, + "grad_norm": 1.0032176971435547, + "learning_rate": 2.4233693477390956e-05, + "loss": 0.5244, + "step": 12891 + }, + { + "epoch": 16.50176, + "grad_norm": 1.0291366577148438, + "learning_rate": 2.423169267707083e-05, + "loss": 0.5231, + "step": 12892 + }, + { + "epoch": 16.50304, + "grad_norm": 1.0801621675491333, + "learning_rate": 2.42296918767507e-05, + "loss": 0.5645, + "step": 12893 + }, + { + "epoch": 16.50432, + "grad_norm": 1.0463597774505615, + "learning_rate": 2.4227691076430572e-05, + "loss": 0.4742, + "step": 12894 + }, + { + "epoch": 16.5056, + "grad_norm": 1.0148074626922607, + "learning_rate": 2.4225690276110447e-05, + "loss": 0.5203, + "step": 12895 + }, + { + "epoch": 16.50688, + "grad_norm": 0.9906325340270996, + "learning_rate": 2.422368947579032e-05, + "loss": 0.4931, + "step": 12896 + }, + { + "epoch": 16.50816, + "grad_norm": 0.9908265471458435, + "learning_rate": 2.4221688675470187e-05, + "loss": 0.5016, + "step": 12897 + }, + { + "epoch": 16.50944, + "grad_norm": 0.9555728435516357, + "learning_rate": 2.421968787515006e-05, + "loss": 0.4763, + "step": 12898 + }, + { + "epoch": 16.51072, + "grad_norm": 0.9900997281074524, + "learning_rate": 2.4217687074829934e-05, + "loss": 0.4907, + "step": 12899 + }, + { + "epoch": 16.512, + "grad_norm": 0.9736559391021729, + "learning_rate": 2.4215686274509806e-05, + "loss": 0.4814, + "step": 12900 + }, + { + "epoch": 16.51328, + "grad_norm": 1.0533136129379272, + "learning_rate": 2.4213685474189675e-05, + "loss": 0.5556, + "step": 12901 + }, + { + "epoch": 16.51456, + "grad_norm": 0.982058584690094, + "learning_rate": 2.421168467386955e-05, + "loss": 0.4757, + "step": 12902 + }, + { + "epoch": 16.51584, + "grad_norm": 1.0134705305099487, + "learning_rate": 
2.4209683873549422e-05, + "loss": 0.5173, + "step": 12903 + }, + { + "epoch": 16.51712, + "grad_norm": 1.0609053373336792, + "learning_rate": 2.4207683073229294e-05, + "loss": 0.5624, + "step": 12904 + }, + { + "epoch": 16.5184, + "grad_norm": 1.0328656435012817, + "learning_rate": 2.4205682272909162e-05, + "loss": 0.5456, + "step": 12905 + }, + { + "epoch": 16.51968, + "grad_norm": 0.997097373008728, + "learning_rate": 2.4203681472589037e-05, + "loss": 0.4958, + "step": 12906 + }, + { + "epoch": 16.52096, + "grad_norm": 1.0784469842910767, + "learning_rate": 2.420168067226891e-05, + "loss": 0.4847, + "step": 12907 + }, + { + "epoch": 16.52224, + "grad_norm": 1.0922346115112305, + "learning_rate": 2.419967987194878e-05, + "loss": 0.5269, + "step": 12908 + }, + { + "epoch": 16.52352, + "grad_norm": 1.0385076999664307, + "learning_rate": 2.4197679071628653e-05, + "loss": 0.472, + "step": 12909 + }, + { + "epoch": 16.5248, + "grad_norm": 0.9775056838989258, + "learning_rate": 2.4195678271308525e-05, + "loss": 0.5025, + "step": 12910 + }, + { + "epoch": 16.52608, + "grad_norm": 1.0673224925994873, + "learning_rate": 2.4193677470988397e-05, + "loss": 0.5495, + "step": 12911 + }, + { + "epoch": 16.52736, + "grad_norm": 1.0127609968185425, + "learning_rate": 2.419167667066827e-05, + "loss": 0.5031, + "step": 12912 + }, + { + "epoch": 16.52864, + "grad_norm": 0.9412071108818054, + "learning_rate": 2.418967587034814e-05, + "loss": 0.4704, + "step": 12913 + }, + { + "epoch": 16.52992, + "grad_norm": 0.9896215200424194, + "learning_rate": 2.4187675070028012e-05, + "loss": 0.4851, + "step": 12914 + }, + { + "epoch": 16.5312, + "grad_norm": 1.0404068231582642, + "learning_rate": 2.4185674269707884e-05, + "loss": 0.5212, + "step": 12915 + }, + { + "epoch": 16.53248, + "grad_norm": 1.0728304386138916, + "learning_rate": 2.4183673469387756e-05, + "loss": 0.5685, + "step": 12916 + }, + { + "epoch": 16.53376, + "grad_norm": 1.0526484251022339, + "learning_rate": 
2.4181672669067628e-05, + "loss": 0.5223, + "step": 12917 + }, + { + "epoch": 16.53504, + "grad_norm": 1.0103858709335327, + "learning_rate": 2.41796718687475e-05, + "loss": 0.5132, + "step": 12918 + }, + { + "epoch": 16.53632, + "grad_norm": 1.032500982284546, + "learning_rate": 2.417767106842737e-05, + "loss": 0.4861, + "step": 12919 + }, + { + "epoch": 16.5376, + "grad_norm": 1.0344327688217163, + "learning_rate": 2.4175670268107243e-05, + "loss": 0.5153, + "step": 12920 + }, + { + "epoch": 16.53888, + "grad_norm": 1.008581280708313, + "learning_rate": 2.4173669467787115e-05, + "loss": 0.5462, + "step": 12921 + }, + { + "epoch": 16.54016, + "grad_norm": 1.00281822681427, + "learning_rate": 2.4171668667466987e-05, + "loss": 0.5099, + "step": 12922 + }, + { + "epoch": 16.54144, + "grad_norm": 1.0224007368087769, + "learning_rate": 2.4169667867146862e-05, + "loss": 0.5216, + "step": 12923 + }, + { + "epoch": 16.54272, + "grad_norm": 0.9643240571022034, + "learning_rate": 2.416766706682673e-05, + "loss": 0.5092, + "step": 12924 + }, + { + "epoch": 16.544, + "grad_norm": 1.013013482093811, + "learning_rate": 2.4165666266506603e-05, + "loss": 0.5128, + "step": 12925 + }, + { + "epoch": 16.545279999999998, + "grad_norm": 1.0628739595413208, + "learning_rate": 2.4163665466186475e-05, + "loss": 0.5326, + "step": 12926 + }, + { + "epoch": 16.54656, + "grad_norm": 0.9615119099617004, + "learning_rate": 2.416166466586635e-05, + "loss": 0.5209, + "step": 12927 + }, + { + "epoch": 16.54784, + "grad_norm": 1.0781883001327515, + "learning_rate": 2.415966386554622e-05, + "loss": 0.5558, + "step": 12928 + }, + { + "epoch": 16.54912, + "grad_norm": 0.9772311449050903, + "learning_rate": 2.415766306522609e-05, + "loss": 0.5129, + "step": 12929 + }, + { + "epoch": 16.5504, + "grad_norm": 1.0191627740859985, + "learning_rate": 2.4155662264905962e-05, + "loss": 0.5298, + "step": 12930 + }, + { + "epoch": 16.55168, + "grad_norm": 1.017059087753296, + "learning_rate": 
2.4153661464585837e-05, + "loss": 0.5203, + "step": 12931 + }, + { + "epoch": 16.55296, + "grad_norm": 0.9743010997772217, + "learning_rate": 2.4151660664265706e-05, + "loss": 0.5098, + "step": 12932 + }, + { + "epoch": 16.55424, + "grad_norm": 1.0007681846618652, + "learning_rate": 2.4149659863945578e-05, + "loss": 0.4947, + "step": 12933 + }, + { + "epoch": 16.55552, + "grad_norm": 1.0352661609649658, + "learning_rate": 2.4147659063625453e-05, + "loss": 0.5854, + "step": 12934 + }, + { + "epoch": 16.5568, + "grad_norm": 1.0481826066970825, + "learning_rate": 2.4145658263305325e-05, + "loss": 0.5548, + "step": 12935 + }, + { + "epoch": 16.55808, + "grad_norm": 0.9723829030990601, + "learning_rate": 2.4143657462985193e-05, + "loss": 0.5207, + "step": 12936 + }, + { + "epoch": 16.55936, + "grad_norm": 1.0767929553985596, + "learning_rate": 2.4141656662665065e-05, + "loss": 0.5427, + "step": 12937 + }, + { + "epoch": 16.56064, + "grad_norm": 1.0616075992584229, + "learning_rate": 2.413965586234494e-05, + "loss": 0.5186, + "step": 12938 + }, + { + "epoch": 16.56192, + "grad_norm": 1.0053772926330566, + "learning_rate": 2.4137655062024812e-05, + "loss": 0.5074, + "step": 12939 + }, + { + "epoch": 16.5632, + "grad_norm": 0.963870108127594, + "learning_rate": 2.413565426170468e-05, + "loss": 0.522, + "step": 12940 + }, + { + "epoch": 16.56448, + "grad_norm": 1.0813841819763184, + "learning_rate": 2.4133653461384556e-05, + "loss": 0.523, + "step": 12941 + }, + { + "epoch": 16.56576, + "grad_norm": 1.0653431415557861, + "learning_rate": 2.4131652661064428e-05, + "loss": 0.5574, + "step": 12942 + }, + { + "epoch": 16.56704, + "grad_norm": 0.9429889917373657, + "learning_rate": 2.41296518607443e-05, + "loss": 0.4838, + "step": 12943 + }, + { + "epoch": 16.56832, + "grad_norm": 1.001644492149353, + "learning_rate": 2.4127651060424168e-05, + "loss": 0.5048, + "step": 12944 + }, + { + "epoch": 16.5696, + "grad_norm": 1.0215190649032593, + "learning_rate": 
2.4125650260104043e-05, + "loss": 0.5575, + "step": 12945 + }, + { + "epoch": 16.57088, + "grad_norm": 1.084291696548462, + "learning_rate": 2.4123649459783915e-05, + "loss": 0.583, + "step": 12946 + }, + { + "epoch": 16.57216, + "grad_norm": 0.9871845245361328, + "learning_rate": 2.4121648659463787e-05, + "loss": 0.4813, + "step": 12947 + }, + { + "epoch": 16.57344, + "grad_norm": 1.0017796754837036, + "learning_rate": 2.411964785914366e-05, + "loss": 0.5364, + "step": 12948 + }, + { + "epoch": 16.57472, + "grad_norm": 0.9939451813697815, + "learning_rate": 2.411764705882353e-05, + "loss": 0.529, + "step": 12949 + }, + { + "epoch": 16.576, + "grad_norm": 1.0428829193115234, + "learning_rate": 2.4115646258503403e-05, + "loss": 0.5333, + "step": 12950 + }, + { + "epoch": 16.577280000000002, + "grad_norm": 1.0062199831008911, + "learning_rate": 2.4113645458183274e-05, + "loss": 0.4992, + "step": 12951 + }, + { + "epoch": 16.57856, + "grad_norm": 1.0425946712493896, + "learning_rate": 2.4111644657863146e-05, + "loss": 0.5304, + "step": 12952 + }, + { + "epoch": 16.57984, + "grad_norm": 1.0212525129318237, + "learning_rate": 2.4109643857543018e-05, + "loss": 0.52, + "step": 12953 + }, + { + "epoch": 16.58112, + "grad_norm": 1.069036841392517, + "learning_rate": 2.410764305722289e-05, + "loss": 0.5671, + "step": 12954 + }, + { + "epoch": 16.5824, + "grad_norm": 1.0660767555236816, + "learning_rate": 2.4105642256902762e-05, + "loss": 0.5542, + "step": 12955 + }, + { + "epoch": 16.58368, + "grad_norm": 1.009103536605835, + "learning_rate": 2.4103641456582634e-05, + "loss": 0.5098, + "step": 12956 + }, + { + "epoch": 16.58496, + "grad_norm": 1.0511337518692017, + "learning_rate": 2.4101640656262506e-05, + "loss": 0.5654, + "step": 12957 + }, + { + "epoch": 16.58624, + "grad_norm": 0.9621989727020264, + "learning_rate": 2.4099639855942377e-05, + "loss": 0.4795, + "step": 12958 + }, + { + "epoch": 16.58752, + "grad_norm": 1.049293041229248, + "learning_rate": 
2.409763905562225e-05, + "loss": 0.5408, + "step": 12959 + }, + { + "epoch": 16.5888, + "grad_norm": 1.0931252241134644, + "learning_rate": 2.409563825530212e-05, + "loss": 0.5048, + "step": 12960 + }, + { + "epoch": 16.59008, + "grad_norm": 1.0250253677368164, + "learning_rate": 2.4093637454981993e-05, + "loss": 0.5235, + "step": 12961 + }, + { + "epoch": 16.59136, + "grad_norm": 1.0003156661987305, + "learning_rate": 2.4091636654661868e-05, + "loss": 0.4948, + "step": 12962 + }, + { + "epoch": 16.59264, + "grad_norm": 1.0581244230270386, + "learning_rate": 2.4089635854341737e-05, + "loss": 0.5393, + "step": 12963 + }, + { + "epoch": 16.59392, + "grad_norm": 1.029645323753357, + "learning_rate": 2.408763505402161e-05, + "loss": 0.5131, + "step": 12964 + }, + { + "epoch": 16.5952, + "grad_norm": 0.9942212700843811, + "learning_rate": 2.408563425370148e-05, + "loss": 0.5036, + "step": 12965 + }, + { + "epoch": 16.59648, + "grad_norm": 1.1347296237945557, + "learning_rate": 2.4083633453381356e-05, + "loss": 0.5557, + "step": 12966 + }, + { + "epoch": 16.59776, + "grad_norm": 1.0200735330581665, + "learning_rate": 2.4081632653061224e-05, + "loss": 0.5125, + "step": 12967 + }, + { + "epoch": 16.59904, + "grad_norm": 1.0105204582214355, + "learning_rate": 2.4079631852741096e-05, + "loss": 0.4875, + "step": 12968 + }, + { + "epoch": 16.60032, + "grad_norm": 0.9845042824745178, + "learning_rate": 2.407763105242097e-05, + "loss": 0.4829, + "step": 12969 + }, + { + "epoch": 16.6016, + "grad_norm": 1.0377755165100098, + "learning_rate": 2.4075630252100843e-05, + "loss": 0.5135, + "step": 12970 + }, + { + "epoch": 16.60288, + "grad_norm": 1.0604853630065918, + "learning_rate": 2.407362945178071e-05, + "loss": 0.5411, + "step": 12971 + }, + { + "epoch": 16.60416, + "grad_norm": 0.9907283782958984, + "learning_rate": 2.4071628651460583e-05, + "loss": 0.4962, + "step": 12972 + }, + { + "epoch": 16.60544, + "grad_norm": 1.0216723680496216, + "learning_rate": 
2.406962785114046e-05, + "loss": 0.5194, + "step": 12973 + }, + { + "epoch": 16.60672, + "grad_norm": 0.9833100438117981, + "learning_rate": 2.406762705082033e-05, + "loss": 0.5267, + "step": 12974 + }, + { + "epoch": 16.608, + "grad_norm": 0.9777483940124512, + "learning_rate": 2.40656262505002e-05, + "loss": 0.5334, + "step": 12975 + }, + { + "epoch": 16.60928, + "grad_norm": 1.0363882780075073, + "learning_rate": 2.4063625450180074e-05, + "loss": 0.5079, + "step": 12976 + }, + { + "epoch": 16.61056, + "grad_norm": 1.0367538928985596, + "learning_rate": 2.4061624649859946e-05, + "loss": 0.5033, + "step": 12977 + }, + { + "epoch": 16.61184, + "grad_norm": 1.020905613899231, + "learning_rate": 2.4059623849539818e-05, + "loss": 0.5121, + "step": 12978 + }, + { + "epoch": 16.61312, + "grad_norm": 0.9727598428726196, + "learning_rate": 2.4057623049219686e-05, + "loss": 0.4858, + "step": 12979 + }, + { + "epoch": 16.6144, + "grad_norm": 1.0102119445800781, + "learning_rate": 2.405562224889956e-05, + "loss": 0.5275, + "step": 12980 + }, + { + "epoch": 16.61568, + "grad_norm": 1.0345350503921509, + "learning_rate": 2.4053621448579433e-05, + "loss": 0.5482, + "step": 12981 + }, + { + "epoch": 16.61696, + "grad_norm": 1.0067529678344727, + "learning_rate": 2.4051620648259305e-05, + "loss": 0.5192, + "step": 12982 + }, + { + "epoch": 16.61824, + "grad_norm": 0.9604501724243164, + "learning_rate": 2.4049619847939174e-05, + "loss": 0.4999, + "step": 12983 + }, + { + "epoch": 16.61952, + "grad_norm": 0.9932731986045837, + "learning_rate": 2.404761904761905e-05, + "loss": 0.5266, + "step": 12984 + }, + { + "epoch": 16.6208, + "grad_norm": 0.9318062663078308, + "learning_rate": 2.404561824729892e-05, + "loss": 0.4916, + "step": 12985 + }, + { + "epoch": 16.62208, + "grad_norm": 1.0295798778533936, + "learning_rate": 2.4043617446978793e-05, + "loss": 0.5223, + "step": 12986 + }, + { + "epoch": 16.62336, + "grad_norm": 1.082562804222107, + "learning_rate": 2.4041616646658665e-05, 
+ "loss": 0.5536, + "step": 12987 + }, + { + "epoch": 16.62464, + "grad_norm": 0.9973915219306946, + "learning_rate": 2.4039615846338536e-05, + "loss": 0.4904, + "step": 12988 + }, + { + "epoch": 16.62592, + "grad_norm": 0.9800871014595032, + "learning_rate": 2.403761504601841e-05, + "loss": 0.516, + "step": 12989 + }, + { + "epoch": 16.6272, + "grad_norm": 1.0225999355316162, + "learning_rate": 2.403561424569828e-05, + "loss": 0.5226, + "step": 12990 + }, + { + "epoch": 16.62848, + "grad_norm": 0.9972506761550903, + "learning_rate": 2.4033613445378152e-05, + "loss": 0.5076, + "step": 12991 + }, + { + "epoch": 16.62976, + "grad_norm": 1.0018681287765503, + "learning_rate": 2.4031612645058024e-05, + "loss": 0.5113, + "step": 12992 + }, + { + "epoch": 16.63104, + "grad_norm": 1.0438956022262573, + "learning_rate": 2.4029611844737896e-05, + "loss": 0.5164, + "step": 12993 + }, + { + "epoch": 16.63232, + "grad_norm": 1.0386935472488403, + "learning_rate": 2.402761104441777e-05, + "loss": 0.5395, + "step": 12994 + }, + { + "epoch": 16.6336, + "grad_norm": 1.0958852767944336, + "learning_rate": 2.402561024409764e-05, + "loss": 0.5501, + "step": 12995 + }, + { + "epoch": 16.63488, + "grad_norm": 0.9334178566932678, + "learning_rate": 2.402360944377751e-05, + "loss": 0.4921, + "step": 12996 + }, + { + "epoch": 16.63616, + "grad_norm": 1.008756399154663, + "learning_rate": 2.4021608643457383e-05, + "loss": 0.5099, + "step": 12997 + }, + { + "epoch": 16.63744, + "grad_norm": 0.9561886787414551, + "learning_rate": 2.401960784313726e-05, + "loss": 0.4785, + "step": 12998 + }, + { + "epoch": 16.63872, + "grad_norm": 1.0232385396957397, + "learning_rate": 2.4017607042817127e-05, + "loss": 0.5235, + "step": 12999 + }, + { + "epoch": 16.64, + "grad_norm": 0.9976513981819153, + "learning_rate": 2.4015606242497e-05, + "loss": 0.5444, + "step": 13000 + }, + { + "epoch": 16.64128, + "grad_norm": 1.0245577096939087, + "learning_rate": 2.4013605442176874e-05, + "loss": 0.5077, + "step": 
13001 + }, + { + "epoch": 16.64256, + "grad_norm": 1.0046392679214478, + "learning_rate": 2.4011604641856746e-05, + "loss": 0.5291, + "step": 13002 + }, + { + "epoch": 16.64384, + "grad_norm": 1.0134575366973877, + "learning_rate": 2.4009603841536614e-05, + "loss": 0.5246, + "step": 13003 + }, + { + "epoch": 16.64512, + "grad_norm": 1.0273959636688232, + "learning_rate": 2.4007603041216486e-05, + "loss": 0.5474, + "step": 13004 + }, + { + "epoch": 16.6464, + "grad_norm": 1.09654700756073, + "learning_rate": 2.400560224089636e-05, + "loss": 0.5338, + "step": 13005 + }, + { + "epoch": 16.64768, + "grad_norm": 0.9553311467170715, + "learning_rate": 2.4003601440576233e-05, + "loss": 0.5073, + "step": 13006 + }, + { + "epoch": 16.64896, + "grad_norm": 1.0136662721633911, + "learning_rate": 2.4001600640256102e-05, + "loss": 0.4933, + "step": 13007 + }, + { + "epoch": 16.65024, + "grad_norm": 0.9974261522293091, + "learning_rate": 2.3999599839935977e-05, + "loss": 0.4961, + "step": 13008 + }, + { + "epoch": 16.65152, + "grad_norm": 1.0348639488220215, + "learning_rate": 2.399759903961585e-05, + "loss": 0.5167, + "step": 13009 + }, + { + "epoch": 16.6528, + "grad_norm": 1.0358352661132812, + "learning_rate": 2.399559823929572e-05, + "loss": 0.5012, + "step": 13010 + }, + { + "epoch": 16.65408, + "grad_norm": 0.9727697372436523, + "learning_rate": 2.399359743897559e-05, + "loss": 0.4838, + "step": 13011 + }, + { + "epoch": 16.65536, + "grad_norm": 1.0957095623016357, + "learning_rate": 2.3991596638655464e-05, + "loss": 0.5401, + "step": 13012 + }, + { + "epoch": 16.65664, + "grad_norm": 1.0502195358276367, + "learning_rate": 2.3989595838335336e-05, + "loss": 0.5342, + "step": 13013 + }, + { + "epoch": 16.65792, + "grad_norm": 0.9960721135139465, + "learning_rate": 2.3987595038015208e-05, + "loss": 0.4892, + "step": 13014 + }, + { + "epoch": 16.6592, + "grad_norm": 0.9923244714736938, + "learning_rate": 2.398559423769508e-05, + "loss": 0.5019, + "step": 13015 + }, + { + 
"epoch": 16.66048, + "grad_norm": 1.0378888845443726, + "learning_rate": 2.3983593437374952e-05, + "loss": 0.4938, + "step": 13016 + }, + { + "epoch": 16.66176, + "grad_norm": 0.9792540669441223, + "learning_rate": 2.3981592637054824e-05, + "loss": 0.5203, + "step": 13017 + }, + { + "epoch": 16.66304, + "grad_norm": 1.0740691423416138, + "learning_rate": 2.3979591836734696e-05, + "loss": 0.4963, + "step": 13018 + }, + { + "epoch": 16.66432, + "grad_norm": 1.0150448083877563, + "learning_rate": 2.3977591036414567e-05, + "loss": 0.5391, + "step": 13019 + }, + { + "epoch": 16.6656, + "grad_norm": 0.9979774355888367, + "learning_rate": 2.397559023609444e-05, + "loss": 0.5039, + "step": 13020 + }, + { + "epoch": 16.66688, + "grad_norm": 0.9845618009567261, + "learning_rate": 2.397358943577431e-05, + "loss": 0.5409, + "step": 13021 + }, + { + "epoch": 16.66816, + "grad_norm": 0.9184175133705139, + "learning_rate": 2.3971588635454183e-05, + "loss": 0.5081, + "step": 13022 + }, + { + "epoch": 16.66944, + "grad_norm": 1.0316041707992554, + "learning_rate": 2.3969587835134055e-05, + "loss": 0.539, + "step": 13023 + }, + { + "epoch": 16.67072, + "grad_norm": 1.0392125844955444, + "learning_rate": 2.3967587034813927e-05, + "loss": 0.5423, + "step": 13024 + }, + { + "epoch": 16.672, + "grad_norm": 0.9866761565208435, + "learning_rate": 2.39655862344938e-05, + "loss": 0.5351, + "step": 13025 + }, + { + "epoch": 16.67328, + "grad_norm": 1.0141420364379883, + "learning_rate": 2.396358543417367e-05, + "loss": 0.5219, + "step": 13026 + }, + { + "epoch": 16.67456, + "grad_norm": 1.0628118515014648, + "learning_rate": 2.3961584633853542e-05, + "loss": 0.5491, + "step": 13027 + }, + { + "epoch": 16.67584, + "grad_norm": 1.0219802856445312, + "learning_rate": 2.3959583833533414e-05, + "loss": 0.5423, + "step": 13028 + }, + { + "epoch": 16.67712, + "grad_norm": 1.0285046100616455, + "learning_rate": 2.395758303321329e-05, + "loss": 0.5219, + "step": 13029 + }, + { + "epoch": 16.6784, + 
"grad_norm": 0.9643558859825134, + "learning_rate": 2.3955582232893158e-05, + "loss": 0.4952, + "step": 13030 + }, + { + "epoch": 16.67968, + "grad_norm": 1.1057844161987305, + "learning_rate": 2.395358143257303e-05, + "loss": 0.5293, + "step": 13031 + }, + { + "epoch": 16.68096, + "grad_norm": 1.0154691934585571, + "learning_rate": 2.39515806322529e-05, + "loss": 0.5028, + "step": 13032 + }, + { + "epoch": 16.68224, + "grad_norm": 0.9593093991279602, + "learning_rate": 2.3949579831932777e-05, + "loss": 0.4931, + "step": 13033 + }, + { + "epoch": 16.68352, + "grad_norm": 1.0532431602478027, + "learning_rate": 2.3947579031612645e-05, + "loss": 0.5077, + "step": 13034 + }, + { + "epoch": 16.6848, + "grad_norm": 0.959784984588623, + "learning_rate": 2.3945578231292517e-05, + "loss": 0.5014, + "step": 13035 + }, + { + "epoch": 16.68608, + "grad_norm": 1.0308352708816528, + "learning_rate": 2.3943577430972392e-05, + "loss": 0.5016, + "step": 13036 + }, + { + "epoch": 16.687359999999998, + "grad_norm": 0.9219370484352112, + "learning_rate": 2.3941576630652264e-05, + "loss": 0.5001, + "step": 13037 + }, + { + "epoch": 16.68864, + "grad_norm": 0.9760138988494873, + "learning_rate": 2.3939575830332133e-05, + "loss": 0.5212, + "step": 13038 + }, + { + "epoch": 16.68992, + "grad_norm": 1.0665395259857178, + "learning_rate": 2.3937575030012005e-05, + "loss": 0.5621, + "step": 13039 + }, + { + "epoch": 16.6912, + "grad_norm": 0.9828092455863953, + "learning_rate": 2.393557422969188e-05, + "loss": 0.5331, + "step": 13040 + }, + { + "epoch": 16.69248, + "grad_norm": 1.0022916793823242, + "learning_rate": 2.393357342937175e-05, + "loss": 0.5121, + "step": 13041 + }, + { + "epoch": 16.69376, + "grad_norm": 0.931938886642456, + "learning_rate": 2.393157262905162e-05, + "loss": 0.4934, + "step": 13042 + }, + { + "epoch": 16.69504, + "grad_norm": 1.0225458145141602, + "learning_rate": 2.3929571828731492e-05, + "loss": 0.5217, + "step": 13043 + }, + { + "epoch": 16.69632, + 
"grad_norm": 1.0572493076324463, + "learning_rate": 2.3927571028411367e-05, + "loss": 0.5562, + "step": 13044 + }, + { + "epoch": 16.6976, + "grad_norm": 0.9570837616920471, + "learning_rate": 2.392557022809124e-05, + "loss": 0.4917, + "step": 13045 + }, + { + "epoch": 16.69888, + "grad_norm": 0.9432888627052307, + "learning_rate": 2.3923569427771108e-05, + "loss": 0.4917, + "step": 13046 + }, + { + "epoch": 16.70016, + "grad_norm": 0.9940226078033447, + "learning_rate": 2.3921568627450983e-05, + "loss": 0.5083, + "step": 13047 + }, + { + "epoch": 16.70144, + "grad_norm": 0.9966458082199097, + "learning_rate": 2.3919567827130855e-05, + "loss": 0.5097, + "step": 13048 + }, + { + "epoch": 16.70272, + "grad_norm": 1.0165756940841675, + "learning_rate": 2.3917567026810727e-05, + "loss": 0.5624, + "step": 13049 + }, + { + "epoch": 16.704, + "grad_norm": 1.0315126180648804, + "learning_rate": 2.3915566226490595e-05, + "loss": 0.5547, + "step": 13050 + }, + { + "epoch": 16.70528, + "grad_norm": 1.0564510822296143, + "learning_rate": 2.391356542617047e-05, + "loss": 0.5763, + "step": 13051 + }, + { + "epoch": 16.70656, + "grad_norm": 1.0285402536392212, + "learning_rate": 2.3911564625850342e-05, + "loss": 0.5526, + "step": 13052 + }, + { + "epoch": 16.70784, + "grad_norm": 0.9533161520957947, + "learning_rate": 2.3909563825530214e-05, + "loss": 0.5125, + "step": 13053 + }, + { + "epoch": 16.70912, + "grad_norm": 1.0255242586135864, + "learning_rate": 2.3907563025210086e-05, + "loss": 0.5404, + "step": 13054 + }, + { + "epoch": 16.7104, + "grad_norm": 0.9758644104003906, + "learning_rate": 2.3905562224889958e-05, + "loss": 0.4887, + "step": 13055 + }, + { + "epoch": 16.71168, + "grad_norm": 0.9290502667427063, + "learning_rate": 2.390356142456983e-05, + "loss": 0.4621, + "step": 13056 + }, + { + "epoch": 16.71296, + "grad_norm": 1.0182784795761108, + "learning_rate": 2.39015606242497e-05, + "loss": 0.5575, + "step": 13057 + }, + { + "epoch": 16.71424, + "grad_norm": 
1.022233009338379, + "learning_rate": 2.3899559823929573e-05, + "loss": 0.5254, + "step": 13058 + }, + { + "epoch": 16.71552, + "grad_norm": 1.0084254741668701, + "learning_rate": 2.3897559023609445e-05, + "loss": 0.5153, + "step": 13059 + }, + { + "epoch": 16.7168, + "grad_norm": 0.9809401631355286, + "learning_rate": 2.3895558223289317e-05, + "loss": 0.5125, + "step": 13060 + }, + { + "epoch": 16.71808, + "grad_norm": 0.9509784579277039, + "learning_rate": 2.389355742296919e-05, + "loss": 0.4663, + "step": 13061 + }, + { + "epoch": 16.71936, + "grad_norm": 1.1078130006790161, + "learning_rate": 2.389155662264906e-05, + "loss": 0.5595, + "step": 13062 + }, + { + "epoch": 16.72064, + "grad_norm": 1.0031615495681763, + "learning_rate": 2.3889555822328933e-05, + "loss": 0.5288, + "step": 13063 + }, + { + "epoch": 16.72192, + "grad_norm": 0.9790761470794678, + "learning_rate": 2.3887555022008804e-05, + "loss": 0.4976, + "step": 13064 + }, + { + "epoch": 16.7232, + "grad_norm": 0.9689204096794128, + "learning_rate": 2.3885554221688676e-05, + "loss": 0.5193, + "step": 13065 + }, + { + "epoch": 16.72448, + "grad_norm": 1.0444097518920898, + "learning_rate": 2.3883553421368548e-05, + "loss": 0.5472, + "step": 13066 + }, + { + "epoch": 16.72576, + "grad_norm": 1.0322445631027222, + "learning_rate": 2.388155262104842e-05, + "loss": 0.5432, + "step": 13067 + }, + { + "epoch": 16.72704, + "grad_norm": 0.9607560038566589, + "learning_rate": 2.3879551820728295e-05, + "loss": 0.4699, + "step": 13068 + }, + { + "epoch": 16.72832, + "grad_norm": 1.0295300483703613, + "learning_rate": 2.3877551020408164e-05, + "loss": 0.5077, + "step": 13069 + }, + { + "epoch": 16.7296, + "grad_norm": 1.0513696670532227, + "learning_rate": 2.3875550220088036e-05, + "loss": 0.5587, + "step": 13070 + }, + { + "epoch": 16.73088, + "grad_norm": 1.0276129245758057, + "learning_rate": 2.3873549419767907e-05, + "loss": 0.5124, + "step": 13071 + }, + { + "epoch": 16.73216, + "grad_norm": 
0.9594248533248901, + "learning_rate": 2.3871548619447783e-05, + "loss": 0.4769, + "step": 13072 + }, + { + "epoch": 16.73344, + "grad_norm": 1.0701425075531006, + "learning_rate": 2.386954781912765e-05, + "loss": 0.5459, + "step": 13073 + }, + { + "epoch": 16.73472, + "grad_norm": 1.044277548789978, + "learning_rate": 2.3867547018807523e-05, + "loss": 0.5692, + "step": 13074 + }, + { + "epoch": 16.736, + "grad_norm": 1.0274498462677002, + "learning_rate": 2.3865546218487398e-05, + "loss": 0.5312, + "step": 13075 + }, + { + "epoch": 16.73728, + "grad_norm": 1.0216330289840698, + "learning_rate": 2.386354541816727e-05, + "loss": 0.5514, + "step": 13076 + }, + { + "epoch": 16.73856, + "grad_norm": 1.0108169317245483, + "learning_rate": 2.386154461784714e-05, + "loss": 0.537, + "step": 13077 + }, + { + "epoch": 16.73984, + "grad_norm": 1.015303134918213, + "learning_rate": 2.385954381752701e-05, + "loss": 0.522, + "step": 13078 + }, + { + "epoch": 16.74112, + "grad_norm": 1.0391994714736938, + "learning_rate": 2.3857543017206886e-05, + "loss": 0.5815, + "step": 13079 + }, + { + "epoch": 16.7424, + "grad_norm": 1.0588417053222656, + "learning_rate": 2.3855542216886757e-05, + "loss": 0.5296, + "step": 13080 + }, + { + "epoch": 16.74368, + "grad_norm": 0.9854347109794617, + "learning_rate": 2.3853541416566626e-05, + "loss": 0.5049, + "step": 13081 + }, + { + "epoch": 16.74496, + "grad_norm": 1.0034525394439697, + "learning_rate": 2.38515406162465e-05, + "loss": 0.5144, + "step": 13082 + }, + { + "epoch": 16.74624, + "grad_norm": 1.0394407510757446, + "learning_rate": 2.3849539815926373e-05, + "loss": 0.4741, + "step": 13083 + }, + { + "epoch": 16.74752, + "grad_norm": 0.9602055549621582, + "learning_rate": 2.3847539015606245e-05, + "loss": 0.5129, + "step": 13084 + }, + { + "epoch": 16.7488, + "grad_norm": 1.01785147190094, + "learning_rate": 2.3845538215286113e-05, + "loss": 0.5215, + "step": 13085 + }, + { + "epoch": 16.75008, + "grad_norm": 1.021774411201477, + 
"learning_rate": 2.384353741496599e-05, + "loss": 0.5161, + "step": 13086 + }, + { + "epoch": 16.75136, + "grad_norm": 1.0218771696090698, + "learning_rate": 2.384153661464586e-05, + "loss": 0.5066, + "step": 13087 + }, + { + "epoch": 16.75264, + "grad_norm": 0.9877253174781799, + "learning_rate": 2.3839535814325732e-05, + "loss": 0.4995, + "step": 13088 + }, + { + "epoch": 16.75392, + "grad_norm": 1.0353214740753174, + "learning_rate": 2.3837535014005604e-05, + "loss": 0.4538, + "step": 13089 + }, + { + "epoch": 16.7552, + "grad_norm": 1.052165150642395, + "learning_rate": 2.3835534213685476e-05, + "loss": 0.5353, + "step": 13090 + }, + { + "epoch": 16.75648, + "grad_norm": 0.9751322269439697, + "learning_rate": 2.3833533413365348e-05, + "loss": 0.4861, + "step": 13091 + }, + { + "epoch": 16.75776, + "grad_norm": 1.0614244937896729, + "learning_rate": 2.383153261304522e-05, + "loss": 0.562, + "step": 13092 + }, + { + "epoch": 16.75904, + "grad_norm": 1.001287817955017, + "learning_rate": 2.382953181272509e-05, + "loss": 0.5248, + "step": 13093 + }, + { + "epoch": 16.76032, + "grad_norm": 1.015852451324463, + "learning_rate": 2.3827531012404963e-05, + "loss": 0.5339, + "step": 13094 + }, + { + "epoch": 16.7616, + "grad_norm": 1.0684832334518433, + "learning_rate": 2.3825530212084835e-05, + "loss": 0.5295, + "step": 13095 + }, + { + "epoch": 16.76288, + "grad_norm": 0.9663676619529724, + "learning_rate": 2.3823529411764707e-05, + "loss": 0.4559, + "step": 13096 + }, + { + "epoch": 16.76416, + "grad_norm": 0.949224591255188, + "learning_rate": 2.382152861144458e-05, + "loss": 0.5483, + "step": 13097 + }, + { + "epoch": 16.76544, + "grad_norm": 1.01194167137146, + "learning_rate": 2.381952781112445e-05, + "loss": 0.5269, + "step": 13098 + }, + { + "epoch": 16.76672, + "grad_norm": 1.0378564596176147, + "learning_rate": 2.3817527010804323e-05, + "loss": 0.5227, + "step": 13099 + }, + { + "epoch": 16.768, + "grad_norm": 1.0125080347061157, + "learning_rate": 
2.3815526210484195e-05, + "loss": 0.5238, + "step": 13100 + }, + { + "epoch": 16.76928, + "grad_norm": 1.0728344917297363, + "learning_rate": 2.3813525410164066e-05, + "loss": 0.5434, + "step": 13101 + }, + { + "epoch": 16.77056, + "grad_norm": 0.9800313711166382, + "learning_rate": 2.381152460984394e-05, + "loss": 0.4794, + "step": 13102 + }, + { + "epoch": 16.77184, + "grad_norm": 1.0031391382217407, + "learning_rate": 2.380952380952381e-05, + "loss": 0.505, + "step": 13103 + }, + { + "epoch": 16.77312, + "grad_norm": 0.9527682662010193, + "learning_rate": 2.3807523009203682e-05, + "loss": 0.5031, + "step": 13104 + }, + { + "epoch": 16.7744, + "grad_norm": 1.0169705152511597, + "learning_rate": 2.3805522208883554e-05, + "loss": 0.4872, + "step": 13105 + }, + { + "epoch": 16.77568, + "grad_norm": 0.9913395643234253, + "learning_rate": 2.3803521408563426e-05, + "loss": 0.5042, + "step": 13106 + }, + { + "epoch": 16.77696, + "grad_norm": 1.0133116245269775, + "learning_rate": 2.38015206082433e-05, + "loss": 0.5202, + "step": 13107 + }, + { + "epoch": 16.77824, + "grad_norm": 1.0359431505203247, + "learning_rate": 2.379951980792317e-05, + "loss": 0.5379, + "step": 13108 + }, + { + "epoch": 16.77952, + "grad_norm": 0.9904114603996277, + "learning_rate": 2.379751900760304e-05, + "loss": 0.486, + "step": 13109 + }, + { + "epoch": 16.7808, + "grad_norm": 0.9596001505851746, + "learning_rate": 2.3795518207282913e-05, + "loss": 0.4627, + "step": 13110 + }, + { + "epoch": 16.78208, + "grad_norm": 1.0215861797332764, + "learning_rate": 2.379351740696279e-05, + "loss": 0.5149, + "step": 13111 + }, + { + "epoch": 16.78336, + "grad_norm": 1.0180429220199585, + "learning_rate": 2.3791516606642657e-05, + "loss": 0.5304, + "step": 13112 + }, + { + "epoch": 16.78464, + "grad_norm": 0.942301332950592, + "learning_rate": 2.378951580632253e-05, + "loss": 0.5205, + "step": 13113 + }, + { + "epoch": 16.78592, + "grad_norm": 0.9987159371376038, + "learning_rate": 2.3787515006002404e-05, 
+ "loss": 0.5267, + "step": 13114 + }, + { + "epoch": 16.7872, + "grad_norm": 1.0503463745117188, + "learning_rate": 2.3785514205682276e-05, + "loss": 0.5245, + "step": 13115 + }, + { + "epoch": 16.78848, + "grad_norm": 1.0316410064697266, + "learning_rate": 2.3783513405362144e-05, + "loss": 0.5262, + "step": 13116 + }, + { + "epoch": 16.78976, + "grad_norm": 0.9606108665466309, + "learning_rate": 2.3781512605042016e-05, + "loss": 0.5006, + "step": 13117 + }, + { + "epoch": 16.79104, + "grad_norm": 1.021532654762268, + "learning_rate": 2.377951180472189e-05, + "loss": 0.5441, + "step": 13118 + }, + { + "epoch": 16.79232, + "grad_norm": 0.9799433350563049, + "learning_rate": 2.3777511004401763e-05, + "loss": 0.5082, + "step": 13119 + }, + { + "epoch": 16.7936, + "grad_norm": 0.9904650449752808, + "learning_rate": 2.3775510204081632e-05, + "loss": 0.55, + "step": 13120 + }, + { + "epoch": 16.79488, + "grad_norm": 1.006087303161621, + "learning_rate": 2.3773509403761507e-05, + "loss": 0.5431, + "step": 13121 + }, + { + "epoch": 16.79616, + "grad_norm": 1.0185682773590088, + "learning_rate": 2.377150860344138e-05, + "loss": 0.5338, + "step": 13122 + }, + { + "epoch": 16.79744, + "grad_norm": 1.039462924003601, + "learning_rate": 2.376950780312125e-05, + "loss": 0.5525, + "step": 13123 + }, + { + "epoch": 16.79872, + "grad_norm": 1.048966884613037, + "learning_rate": 2.376750700280112e-05, + "loss": 0.5458, + "step": 13124 + }, + { + "epoch": 16.8, + "grad_norm": 1.0096321105957031, + "learning_rate": 2.3765506202480994e-05, + "loss": 0.5175, + "step": 13125 + }, + { + "epoch": 16.80128, + "grad_norm": 1.0281754732131958, + "learning_rate": 2.3763505402160866e-05, + "loss": 0.54, + "step": 13126 + }, + { + "epoch": 16.80256, + "grad_norm": 1.0194934606552124, + "learning_rate": 2.3761504601840738e-05, + "loss": 0.5144, + "step": 13127 + }, + { + "epoch": 16.80384, + "grad_norm": 1.0557507276535034, + "learning_rate": 2.375950380152061e-05, + "loss": 0.537, + "step": 
13128 + }, + { + "epoch": 16.80512, + "grad_norm": 0.926566481590271, + "learning_rate": 2.3757503001200482e-05, + "loss": 0.4973, + "step": 13129 + }, + { + "epoch": 16.8064, + "grad_norm": 1.016688585281372, + "learning_rate": 2.3755502200880354e-05, + "loss": 0.524, + "step": 13130 + }, + { + "epoch": 16.80768, + "grad_norm": 1.0396153926849365, + "learning_rate": 2.3753501400560226e-05, + "loss": 0.5364, + "step": 13131 + }, + { + "epoch": 16.80896, + "grad_norm": 0.9891914129257202, + "learning_rate": 2.3751500600240097e-05, + "loss": 0.5359, + "step": 13132 + }, + { + "epoch": 16.81024, + "grad_norm": 1.0839710235595703, + "learning_rate": 2.374949979991997e-05, + "loss": 0.5577, + "step": 13133 + }, + { + "epoch": 16.81152, + "grad_norm": 0.9661899209022522, + "learning_rate": 2.374749899959984e-05, + "loss": 0.4845, + "step": 13134 + }, + { + "epoch": 16.8128, + "grad_norm": 0.9867687821388245, + "learning_rate": 2.3745498199279713e-05, + "loss": 0.5227, + "step": 13135 + }, + { + "epoch": 16.81408, + "grad_norm": 1.0412554740905762, + "learning_rate": 2.3743497398959585e-05, + "loss": 0.5154, + "step": 13136 + }, + { + "epoch": 16.81536, + "grad_norm": 0.9606292843818665, + "learning_rate": 2.3741496598639457e-05, + "loss": 0.4987, + "step": 13137 + }, + { + "epoch": 16.81664, + "grad_norm": 0.9731855392456055, + "learning_rate": 2.373949579831933e-05, + "loss": 0.4813, + "step": 13138 + }, + { + "epoch": 16.81792, + "grad_norm": 1.0362776517868042, + "learning_rate": 2.37374949979992e-05, + "loss": 0.5173, + "step": 13139 + }, + { + "epoch": 16.8192, + "grad_norm": 1.0115454196929932, + "learning_rate": 2.3735494197679072e-05, + "loss": 0.4787, + "step": 13140 + }, + { + "epoch": 16.82048, + "grad_norm": 1.0400524139404297, + "learning_rate": 2.3733493397358944e-05, + "loss": 0.5218, + "step": 13141 + }, + { + "epoch": 16.82176, + "grad_norm": 1.0309675931930542, + "learning_rate": 2.373149259703882e-05, + "loss": 0.5008, + "step": 13142 + }, + { + 
"epoch": 16.82304, + "grad_norm": 0.9864956140518188, + "learning_rate": 2.3729491796718688e-05, + "loss": 0.4848, + "step": 13143 + }, + { + "epoch": 16.82432, + "grad_norm": 0.9826066493988037, + "learning_rate": 2.372749099639856e-05, + "loss": 0.4918, + "step": 13144 + }, + { + "epoch": 16.8256, + "grad_norm": 1.0326327085494995, + "learning_rate": 2.372549019607843e-05, + "loss": 0.5138, + "step": 13145 + }, + { + "epoch": 16.82688, + "grad_norm": 0.9792665839195251, + "learning_rate": 2.3723489395758307e-05, + "loss": 0.5088, + "step": 13146 + }, + { + "epoch": 16.82816, + "grad_norm": 0.9995619654655457, + "learning_rate": 2.3721488595438175e-05, + "loss": 0.5337, + "step": 13147 + }, + { + "epoch": 16.829439999999998, + "grad_norm": 1.030884027481079, + "learning_rate": 2.3719487795118047e-05, + "loss": 0.541, + "step": 13148 + }, + { + "epoch": 16.83072, + "grad_norm": 1.0245966911315918, + "learning_rate": 2.3717486994797922e-05, + "loss": 0.4978, + "step": 13149 + }, + { + "epoch": 16.832, + "grad_norm": 1.035640835762024, + "learning_rate": 2.3715486194477794e-05, + "loss": 0.5688, + "step": 13150 + }, + { + "epoch": 16.83328, + "grad_norm": 1.0522053241729736, + "learning_rate": 2.3713485394157663e-05, + "loss": 0.5527, + "step": 13151 + }, + { + "epoch": 16.83456, + "grad_norm": 1.0623236894607544, + "learning_rate": 2.3711484593837535e-05, + "loss": 0.5676, + "step": 13152 + }, + { + "epoch": 16.83584, + "grad_norm": 0.9654913544654846, + "learning_rate": 2.370948379351741e-05, + "loss": 0.5085, + "step": 13153 + }, + { + "epoch": 16.83712, + "grad_norm": 0.9618136286735535, + "learning_rate": 2.370748299319728e-05, + "loss": 0.5171, + "step": 13154 + }, + { + "epoch": 16.8384, + "grad_norm": 1.009698748588562, + "learning_rate": 2.370548219287715e-05, + "loss": 0.4982, + "step": 13155 + }, + { + "epoch": 16.83968, + "grad_norm": 1.0301358699798584, + "learning_rate": 2.3703481392557022e-05, + "loss": 0.5463, + "step": 13156 + }, + { + "epoch": 
16.84096, + "grad_norm": 1.008836269378662, + "learning_rate": 2.3701480592236897e-05, + "loss": 0.4709, + "step": 13157 + }, + { + "epoch": 16.84224, + "grad_norm": 1.0866061449050903, + "learning_rate": 2.369947979191677e-05, + "loss": 0.5407, + "step": 13158 + }, + { + "epoch": 16.84352, + "grad_norm": 1.0506260395050049, + "learning_rate": 2.3697478991596638e-05, + "loss": 0.5297, + "step": 13159 + }, + { + "epoch": 16.8448, + "grad_norm": 1.0603467226028442, + "learning_rate": 2.3695478191276513e-05, + "loss": 0.5368, + "step": 13160 + }, + { + "epoch": 16.84608, + "grad_norm": 0.9975579977035522, + "learning_rate": 2.3693477390956385e-05, + "loss": 0.5138, + "step": 13161 + }, + { + "epoch": 16.84736, + "grad_norm": 1.0625996589660645, + "learning_rate": 2.3691476590636257e-05, + "loss": 0.5583, + "step": 13162 + }, + { + "epoch": 16.84864, + "grad_norm": 1.0248199701309204, + "learning_rate": 2.3689475790316125e-05, + "loss": 0.5415, + "step": 13163 + }, + { + "epoch": 16.84992, + "grad_norm": 1.0083729028701782, + "learning_rate": 2.3687474989996e-05, + "loss": 0.5248, + "step": 13164 + }, + { + "epoch": 16.8512, + "grad_norm": 1.0073386430740356, + "learning_rate": 2.3685474189675872e-05, + "loss": 0.523, + "step": 13165 + }, + { + "epoch": 16.85248, + "grad_norm": 1.023363471031189, + "learning_rate": 2.3683473389355744e-05, + "loss": 0.5001, + "step": 13166 + }, + { + "epoch": 16.85376, + "grad_norm": 0.9759731888771057, + "learning_rate": 2.3681472589035616e-05, + "loss": 0.5141, + "step": 13167 + }, + { + "epoch": 16.85504, + "grad_norm": 1.0707515478134155, + "learning_rate": 2.3679471788715488e-05, + "loss": 0.5621, + "step": 13168 + }, + { + "epoch": 16.85632, + "grad_norm": 1.0116777420043945, + "learning_rate": 2.367747098839536e-05, + "loss": 0.5197, + "step": 13169 + }, + { + "epoch": 16.8576, + "grad_norm": 1.0196561813354492, + "learning_rate": 2.367547018807523e-05, + "loss": 0.5688, + "step": 13170 + }, + { + "epoch": 16.85888, + 
"grad_norm": 1.0545904636383057, + "learning_rate": 2.3673469387755103e-05, + "loss": 0.5704, + "step": 13171 + }, + { + "epoch": 16.86016, + "grad_norm": 1.0500190258026123, + "learning_rate": 2.3671468587434975e-05, + "loss": 0.5012, + "step": 13172 + }, + { + "epoch": 16.86144, + "grad_norm": 0.9819349646568298, + "learning_rate": 2.3669467787114847e-05, + "loss": 0.5261, + "step": 13173 + }, + { + "epoch": 16.86272, + "grad_norm": 0.993928074836731, + "learning_rate": 2.366746698679472e-05, + "loss": 0.5433, + "step": 13174 + }, + { + "epoch": 16.864, + "grad_norm": 0.9953247308731079, + "learning_rate": 2.366546618647459e-05, + "loss": 0.5125, + "step": 13175 + }, + { + "epoch": 16.86528, + "grad_norm": 1.0047765970230103, + "learning_rate": 2.3663465386154463e-05, + "loss": 0.5015, + "step": 13176 + }, + { + "epoch": 16.86656, + "grad_norm": 1.0829325914382935, + "learning_rate": 2.3661464585834334e-05, + "loss": 0.5706, + "step": 13177 + }, + { + "epoch": 16.86784, + "grad_norm": 1.0480440855026245, + "learning_rate": 2.3659463785514206e-05, + "loss": 0.5776, + "step": 13178 + }, + { + "epoch": 16.86912, + "grad_norm": 1.0479168891906738, + "learning_rate": 2.3657462985194078e-05, + "loss": 0.5047, + "step": 13179 + }, + { + "epoch": 16.8704, + "grad_norm": 0.9871065616607666, + "learning_rate": 2.365546218487395e-05, + "loss": 0.4929, + "step": 13180 + }, + { + "epoch": 16.87168, + "grad_norm": 1.0528372526168823, + "learning_rate": 2.3653461384553825e-05, + "loss": 0.5289, + "step": 13181 + }, + { + "epoch": 16.87296, + "grad_norm": 1.0125209093093872, + "learning_rate": 2.3651460584233694e-05, + "loss": 0.5539, + "step": 13182 + }, + { + "epoch": 16.87424, + "grad_norm": 1.0450409650802612, + "learning_rate": 2.3649459783913565e-05, + "loss": 0.5374, + "step": 13183 + }, + { + "epoch": 16.87552, + "grad_norm": 1.061608910560608, + "learning_rate": 2.3647458983593437e-05, + "loss": 0.52, + "step": 13184 + }, + { + "epoch": 16.8768, + "grad_norm": 
1.019355058670044, + "learning_rate": 2.3645458183273313e-05, + "loss": 0.499, + "step": 13185 + }, + { + "epoch": 16.87808, + "grad_norm": 0.9892388582229614, + "learning_rate": 2.364345738295318e-05, + "loss": 0.5473, + "step": 13186 + }, + { + "epoch": 16.87936, + "grad_norm": 1.001673936843872, + "learning_rate": 2.3641456582633053e-05, + "loss": 0.4966, + "step": 13187 + }, + { + "epoch": 16.88064, + "grad_norm": 1.078865647315979, + "learning_rate": 2.3639455782312928e-05, + "loss": 0.5499, + "step": 13188 + }, + { + "epoch": 16.88192, + "grad_norm": 0.9952927827835083, + "learning_rate": 2.36374549819928e-05, + "loss": 0.5131, + "step": 13189 + }, + { + "epoch": 16.8832, + "grad_norm": 0.9789214730262756, + "learning_rate": 2.363545418167267e-05, + "loss": 0.513, + "step": 13190 + }, + { + "epoch": 16.88448, + "grad_norm": 1.0156183242797852, + "learning_rate": 2.363345338135254e-05, + "loss": 0.5402, + "step": 13191 + }, + { + "epoch": 16.88576, + "grad_norm": 1.0390909910202026, + "learning_rate": 2.3631452581032416e-05, + "loss": 0.571, + "step": 13192 + }, + { + "epoch": 16.88704, + "grad_norm": 1.0474461317062378, + "learning_rate": 2.3629451780712287e-05, + "loss": 0.5456, + "step": 13193 + }, + { + "epoch": 16.88832, + "grad_norm": 1.0926357507705688, + "learning_rate": 2.3627450980392156e-05, + "loss": 0.51, + "step": 13194 + }, + { + "epoch": 16.8896, + "grad_norm": 1.0534778833389282, + "learning_rate": 2.362545018007203e-05, + "loss": 0.5292, + "step": 13195 + }, + { + "epoch": 16.89088, + "grad_norm": 1.0174009799957275, + "learning_rate": 2.3623449379751903e-05, + "loss": 0.5295, + "step": 13196 + }, + { + "epoch": 16.89216, + "grad_norm": 1.018369436264038, + "learning_rate": 2.3621448579431775e-05, + "loss": 0.5362, + "step": 13197 + }, + { + "epoch": 16.89344, + "grad_norm": 1.008921504020691, + "learning_rate": 2.3619447779111643e-05, + "loss": 0.5035, + "step": 13198 + }, + { + "epoch": 16.89472, + "grad_norm": 1.040116548538208, + 
"learning_rate": 2.361744697879152e-05, + "loss": 0.5335, + "step": 13199 + }, + { + "epoch": 16.896, + "grad_norm": 1.0576521158218384, + "learning_rate": 2.361544617847139e-05, + "loss": 0.5574, + "step": 13200 + }, + { + "epoch": 16.89728, + "grad_norm": 1.048539161682129, + "learning_rate": 2.3613445378151262e-05, + "loss": 0.5106, + "step": 13201 + }, + { + "epoch": 16.89856, + "grad_norm": 1.0253418684005737, + "learning_rate": 2.3611444577831134e-05, + "loss": 0.5217, + "step": 13202 + }, + { + "epoch": 16.89984, + "grad_norm": 1.0338140726089478, + "learning_rate": 2.3609443777511006e-05, + "loss": 0.5126, + "step": 13203 + }, + { + "epoch": 16.90112, + "grad_norm": 1.0068703889846802, + "learning_rate": 2.3607442977190878e-05, + "loss": 0.5127, + "step": 13204 + }, + { + "epoch": 16.9024, + "grad_norm": 0.9766025543212891, + "learning_rate": 2.360544217687075e-05, + "loss": 0.4986, + "step": 13205 + }, + { + "epoch": 16.90368, + "grad_norm": 1.0029348134994507, + "learning_rate": 2.360344137655062e-05, + "loss": 0.509, + "step": 13206 + }, + { + "epoch": 16.90496, + "grad_norm": 0.9763104915618896, + "learning_rate": 2.3601440576230493e-05, + "loss": 0.4673, + "step": 13207 + }, + { + "epoch": 16.90624, + "grad_norm": 0.9850437045097351, + "learning_rate": 2.3599439775910365e-05, + "loss": 0.544, + "step": 13208 + }, + { + "epoch": 16.90752, + "grad_norm": 1.063561201095581, + "learning_rate": 2.3597438975590237e-05, + "loss": 0.5269, + "step": 13209 + }, + { + "epoch": 16.9088, + "grad_norm": 1.0321555137634277, + "learning_rate": 2.359543817527011e-05, + "loss": 0.5038, + "step": 13210 + }, + { + "epoch": 16.91008, + "grad_norm": 1.0148087739944458, + "learning_rate": 2.359343737494998e-05, + "loss": 0.5437, + "step": 13211 + }, + { + "epoch": 16.91136, + "grad_norm": 0.9977015852928162, + "learning_rate": 2.3591436574629853e-05, + "loss": 0.5425, + "step": 13212 + }, + { + "epoch": 16.91264, + "grad_norm": 1.0292987823486328, + "learning_rate": 
2.3589435774309725e-05, + "loss": 0.5345, + "step": 13213 + }, + { + "epoch": 16.91392, + "grad_norm": 0.9989826679229736, + "learning_rate": 2.3587434973989596e-05, + "loss": 0.5079, + "step": 13214 + }, + { + "epoch": 16.9152, + "grad_norm": 0.987629771232605, + "learning_rate": 2.3585434173669468e-05, + "loss": 0.5053, + "step": 13215 + }, + { + "epoch": 16.91648, + "grad_norm": 1.0139482021331787, + "learning_rate": 2.358343337334934e-05, + "loss": 0.5206, + "step": 13216 + }, + { + "epoch": 16.91776, + "grad_norm": 0.9933033585548401, + "learning_rate": 2.3581432573029212e-05, + "loss": 0.5358, + "step": 13217 + }, + { + "epoch": 16.91904, + "grad_norm": 1.0258675813674927, + "learning_rate": 2.3579431772709084e-05, + "loss": 0.5204, + "step": 13218 + }, + { + "epoch": 16.92032, + "grad_norm": 1.0290369987487793, + "learning_rate": 2.3577430972388956e-05, + "loss": 0.5262, + "step": 13219 + }, + { + "epoch": 16.9216, + "grad_norm": 1.0307328701019287, + "learning_rate": 2.357543017206883e-05, + "loss": 0.5321, + "step": 13220 + }, + { + "epoch": 16.92288, + "grad_norm": 1.0345529317855835, + "learning_rate": 2.35734293717487e-05, + "loss": 0.5526, + "step": 13221 + }, + { + "epoch": 16.92416, + "grad_norm": 1.028792381286621, + "learning_rate": 2.357142857142857e-05, + "loss": 0.5265, + "step": 13222 + }, + { + "epoch": 16.925440000000002, + "grad_norm": 1.0457243919372559, + "learning_rate": 2.3569427771108443e-05, + "loss": 0.5508, + "step": 13223 + }, + { + "epoch": 16.92672, + "grad_norm": 1.011638879776001, + "learning_rate": 2.356742697078832e-05, + "loss": 0.5209, + "step": 13224 + }, + { + "epoch": 16.928, + "grad_norm": 1.0729031562805176, + "learning_rate": 2.3565426170468187e-05, + "loss": 0.5676, + "step": 13225 + }, + { + "epoch": 16.92928, + "grad_norm": 0.988837718963623, + "learning_rate": 2.356342537014806e-05, + "loss": 0.5399, + "step": 13226 + }, + { + "epoch": 16.93056, + "grad_norm": 0.9590950608253479, + "learning_rate": 
2.3561424569827934e-05, + "loss": 0.4938, + "step": 13227 + }, + { + "epoch": 16.93184, + "grad_norm": 0.9690542817115784, + "learning_rate": 2.3559423769507806e-05, + "loss": 0.4957, + "step": 13228 + }, + { + "epoch": 16.93312, + "grad_norm": 1.0184234380722046, + "learning_rate": 2.3557422969187674e-05, + "loss": 0.5152, + "step": 13229 + }, + { + "epoch": 16.9344, + "grad_norm": 1.0626813173294067, + "learning_rate": 2.3555422168867546e-05, + "loss": 0.558, + "step": 13230 + }, + { + "epoch": 16.93568, + "grad_norm": 0.9686869978904724, + "learning_rate": 2.355342136854742e-05, + "loss": 0.5276, + "step": 13231 + }, + { + "epoch": 16.93696, + "grad_norm": 0.9972003102302551, + "learning_rate": 2.3551420568227293e-05, + "loss": 0.5422, + "step": 13232 + }, + { + "epoch": 16.93824, + "grad_norm": 1.024174690246582, + "learning_rate": 2.3549419767907162e-05, + "loss": 0.5273, + "step": 13233 + }, + { + "epoch": 16.93952, + "grad_norm": 1.0208815336227417, + "learning_rate": 2.3547418967587037e-05, + "loss": 0.5221, + "step": 13234 + }, + { + "epoch": 16.9408, + "grad_norm": 0.9970613718032837, + "learning_rate": 2.354541816726691e-05, + "loss": 0.5001, + "step": 13235 + }, + { + "epoch": 16.94208, + "grad_norm": 1.0984002351760864, + "learning_rate": 2.354341736694678e-05, + "loss": 0.5119, + "step": 13236 + }, + { + "epoch": 16.94336, + "grad_norm": 1.0116389989852905, + "learning_rate": 2.354141656662665e-05, + "loss": 0.541, + "step": 13237 + }, + { + "epoch": 16.94464, + "grad_norm": 1.0177326202392578, + "learning_rate": 2.3539415766306524e-05, + "loss": 0.5292, + "step": 13238 + }, + { + "epoch": 16.94592, + "grad_norm": 0.9551138877868652, + "learning_rate": 2.3537414965986396e-05, + "loss": 0.4805, + "step": 13239 + }, + { + "epoch": 16.9472, + "grad_norm": 1.0099093914031982, + "learning_rate": 2.3535414165666268e-05, + "loss": 0.5158, + "step": 13240 + }, + { + "epoch": 16.94848, + "grad_norm": 1.0355970859527588, + "learning_rate": 
2.353341336534614e-05, + "loss": 0.517, + "step": 13241 + }, + { + "epoch": 16.94976, + "grad_norm": 1.0377235412597656, + "learning_rate": 2.3531412565026012e-05, + "loss": 0.5256, + "step": 13242 + }, + { + "epoch": 16.95104, + "grad_norm": 0.973661482334137, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.4841, + "step": 13243 + }, + { + "epoch": 16.95232, + "grad_norm": 0.950306236743927, + "learning_rate": 2.3527410964385756e-05, + "loss": 0.5096, + "step": 13244 + }, + { + "epoch": 16.9536, + "grad_norm": 1.070381999015808, + "learning_rate": 2.3525410164065627e-05, + "loss": 0.5756, + "step": 13245 + }, + { + "epoch": 16.95488, + "grad_norm": 1.0381155014038086, + "learning_rate": 2.35234093637455e-05, + "loss": 0.5439, + "step": 13246 + }, + { + "epoch": 16.95616, + "grad_norm": 1.0218827724456787, + "learning_rate": 2.352140856342537e-05, + "loss": 0.5185, + "step": 13247 + }, + { + "epoch": 16.95744, + "grad_norm": 1.0366841554641724, + "learning_rate": 2.3519407763105243e-05, + "loss": 0.5542, + "step": 13248 + }, + { + "epoch": 16.95872, + "grad_norm": 0.9674044251441956, + "learning_rate": 2.3517406962785115e-05, + "loss": 0.4658, + "step": 13249 + }, + { + "epoch": 16.96, + "grad_norm": 0.9855802655220032, + "learning_rate": 2.3515406162464987e-05, + "loss": 0.5267, + "step": 13250 + }, + { + "epoch": 16.96128, + "grad_norm": 1.0081998109817505, + "learning_rate": 2.351340536214486e-05, + "loss": 0.5058, + "step": 13251 + }, + { + "epoch": 16.96256, + "grad_norm": 0.9830219149589539, + "learning_rate": 2.351140456182473e-05, + "loss": 0.4796, + "step": 13252 + }, + { + "epoch": 16.96384, + "grad_norm": 0.97882479429245, + "learning_rate": 2.3509403761504602e-05, + "loss": 0.4897, + "step": 13253 + }, + { + "epoch": 16.96512, + "grad_norm": 1.0170930624008179, + "learning_rate": 2.3507402961184474e-05, + "loss": 0.513, + "step": 13254 + }, + { + "epoch": 16.9664, + "grad_norm": 1.030259132385254, + "learning_rate": 2.350540216086435e-05, + 
"loss": 0.4973, + "step": 13255 + }, + { + "epoch": 16.96768, + "grad_norm": 1.0129339694976807, + "learning_rate": 2.3503401360544218e-05, + "loss": 0.4936, + "step": 13256 + }, + { + "epoch": 16.96896, + "grad_norm": 1.0687109231948853, + "learning_rate": 2.350140056022409e-05, + "loss": 0.5226, + "step": 13257 + }, + { + "epoch": 16.97024, + "grad_norm": 1.0357649326324463, + "learning_rate": 2.349939975990396e-05, + "loss": 0.5376, + "step": 13258 + }, + { + "epoch": 16.97152, + "grad_norm": 1.0332026481628418, + "learning_rate": 2.3497398959583837e-05, + "loss": 0.4773, + "step": 13259 + }, + { + "epoch": 16.9728, + "grad_norm": 0.9495790600776672, + "learning_rate": 2.3495398159263705e-05, + "loss": 0.4737, + "step": 13260 + }, + { + "epoch": 16.97408, + "grad_norm": 1.0137715339660645, + "learning_rate": 2.3493397358943577e-05, + "loss": 0.5459, + "step": 13261 + }, + { + "epoch": 16.97536, + "grad_norm": 0.9835166931152344, + "learning_rate": 2.349139655862345e-05, + "loss": 0.5019, + "step": 13262 + }, + { + "epoch": 16.97664, + "grad_norm": 1.000338077545166, + "learning_rate": 2.3489395758303324e-05, + "loss": 0.5223, + "step": 13263 + }, + { + "epoch": 16.97792, + "grad_norm": 0.9843599796295166, + "learning_rate": 2.3487394957983193e-05, + "loss": 0.4935, + "step": 13264 + }, + { + "epoch": 16.9792, + "grad_norm": 1.0332714319229126, + "learning_rate": 2.3485394157663065e-05, + "loss": 0.5263, + "step": 13265 + }, + { + "epoch": 16.98048, + "grad_norm": 1.008151650428772, + "learning_rate": 2.348339335734294e-05, + "loss": 0.5458, + "step": 13266 + }, + { + "epoch": 16.98176, + "grad_norm": 1.0616765022277832, + "learning_rate": 2.348139255702281e-05, + "loss": 0.5493, + "step": 13267 + }, + { + "epoch": 16.98304, + "grad_norm": 1.007568359375, + "learning_rate": 2.347939175670268e-05, + "loss": 0.5289, + "step": 13268 + }, + { + "epoch": 16.98432, + "grad_norm": 1.0306227207183838, + "learning_rate": 2.3477390956382552e-05, + "loss": 0.5618, + "step": 
13269 + }, + { + "epoch": 16.9856, + "grad_norm": 1.0478519201278687, + "learning_rate": 2.3475390156062427e-05, + "loss": 0.5575, + "step": 13270 + }, + { + "epoch": 16.98688, + "grad_norm": 0.9916083216667175, + "learning_rate": 2.34733893557423e-05, + "loss": 0.5117, + "step": 13271 + }, + { + "epoch": 16.98816, + "grad_norm": 1.0166401863098145, + "learning_rate": 2.3471388555422168e-05, + "loss": 0.5053, + "step": 13272 + }, + { + "epoch": 16.98944, + "grad_norm": 0.9944577217102051, + "learning_rate": 2.3469387755102043e-05, + "loss": 0.5063, + "step": 13273 + }, + { + "epoch": 16.99072, + "grad_norm": 1.0171867609024048, + "learning_rate": 2.3467386954781915e-05, + "loss": 0.5275, + "step": 13274 + }, + { + "epoch": 16.992, + "grad_norm": 1.002347469329834, + "learning_rate": 2.3465386154461786e-05, + "loss": 0.504, + "step": 13275 + }, + { + "epoch": 16.99328, + "grad_norm": 0.9947987794876099, + "learning_rate": 2.3463385354141655e-05, + "loss": 0.4975, + "step": 13276 + }, + { + "epoch": 16.99456, + "grad_norm": 1.0772300958633423, + "learning_rate": 2.346138455382153e-05, + "loss": 0.5841, + "step": 13277 + }, + { + "epoch": 16.99584, + "grad_norm": 1.0796716213226318, + "learning_rate": 2.3459383753501402e-05, + "loss": 0.5478, + "step": 13278 + }, + { + "epoch": 16.99712, + "grad_norm": 1.0109952688217163, + "learning_rate": 2.3457382953181274e-05, + "loss": 0.497, + "step": 13279 + }, + { + "epoch": 16.9984, + "grad_norm": 1.00393545627594, + "learning_rate": 2.3455382152861146e-05, + "loss": 0.4905, + "step": 13280 + }, + { + "epoch": 16.99968, + "grad_norm": 1.0317285060882568, + "learning_rate": 2.3453381352541018e-05, + "loss": 0.5052, + "step": 13281 + }, + { + "epoch": 17.00096, + "grad_norm": 2.4376354217529297, + "learning_rate": 2.345138055222089e-05, + "loss": 0.999, + "step": 13282 + }, + { + "epoch": 17.00224, + "grad_norm": 0.9784718155860901, + "learning_rate": 2.344937975190076e-05, + "loss": 0.4915, + "step": 13283 + }, + { + "epoch": 
17.00352, + "grad_norm": 1.0082448720932007, + "learning_rate": 2.3447378951580633e-05, + "loss": 0.4813, + "step": 13284 + }, + { + "epoch": 17.0048, + "grad_norm": 1.0144582986831665, + "learning_rate": 2.3445378151260505e-05, + "loss": 0.5056, + "step": 13285 + }, + { + "epoch": 17.00608, + "grad_norm": 0.9649600386619568, + "learning_rate": 2.3443377350940377e-05, + "loss": 0.4901, + "step": 13286 + }, + { + "epoch": 17.00736, + "grad_norm": 0.9816195368766785, + "learning_rate": 2.344137655062025e-05, + "loss": 0.491, + "step": 13287 + }, + { + "epoch": 17.00864, + "grad_norm": 0.9808714985847473, + "learning_rate": 2.343937575030012e-05, + "loss": 0.4976, + "step": 13288 + }, + { + "epoch": 17.00992, + "grad_norm": 0.9497635364532471, + "learning_rate": 2.3437374949979992e-05, + "loss": 0.4637, + "step": 13289 + }, + { + "epoch": 17.0112, + "grad_norm": 1.0557515621185303, + "learning_rate": 2.3435374149659864e-05, + "loss": 0.5267, + "step": 13290 + }, + { + "epoch": 17.01248, + "grad_norm": 1.0407460927963257, + "learning_rate": 2.3433373349339736e-05, + "loss": 0.5172, + "step": 13291 + }, + { + "epoch": 17.01376, + "grad_norm": 1.0044418573379517, + "learning_rate": 2.3431372549019608e-05, + "loss": 0.5482, + "step": 13292 + }, + { + "epoch": 17.01504, + "grad_norm": 1.036288857460022, + "learning_rate": 2.342937174869948e-05, + "loss": 0.54, + "step": 13293 + }, + { + "epoch": 17.01632, + "grad_norm": 1.0211939811706543, + "learning_rate": 2.3427370948379355e-05, + "loss": 0.4912, + "step": 13294 + }, + { + "epoch": 17.0176, + "grad_norm": 0.9974328279495239, + "learning_rate": 2.3425370148059224e-05, + "loss": 0.4945, + "step": 13295 + }, + { + "epoch": 17.01888, + "grad_norm": 1.0395445823669434, + "learning_rate": 2.3423369347739095e-05, + "loss": 0.5235, + "step": 13296 + }, + { + "epoch": 17.02016, + "grad_norm": 1.0685923099517822, + "learning_rate": 2.3421368547418967e-05, + "loss": 0.5011, + "step": 13297 + }, + { + "epoch": 17.02144, + 
"grad_norm": 0.9720517992973328, + "learning_rate": 2.3419367747098843e-05, + "loss": 0.4791, + "step": 13298 + }, + { + "epoch": 17.02272, + "grad_norm": 1.0298750400543213, + "learning_rate": 2.341736694677871e-05, + "loss": 0.5083, + "step": 13299 + }, + { + "epoch": 17.024, + "grad_norm": 1.0151759386062622, + "learning_rate": 2.3415366146458583e-05, + "loss": 0.4656, + "step": 13300 + }, + { + "epoch": 17.02528, + "grad_norm": 1.1034287214279175, + "learning_rate": 2.3413365346138458e-05, + "loss": 0.5559, + "step": 13301 + }, + { + "epoch": 17.02656, + "grad_norm": 0.9982373714447021, + "learning_rate": 2.341136454581833e-05, + "loss": 0.4586, + "step": 13302 + }, + { + "epoch": 17.02784, + "grad_norm": 1.0372883081436157, + "learning_rate": 2.34093637454982e-05, + "loss": 0.4949, + "step": 13303 + }, + { + "epoch": 17.02912, + "grad_norm": 1.0765658617019653, + "learning_rate": 2.340736294517807e-05, + "loss": 0.5206, + "step": 13304 + }, + { + "epoch": 17.0304, + "grad_norm": 1.0272283554077148, + "learning_rate": 2.3405362144857946e-05, + "loss": 0.4676, + "step": 13305 + }, + { + "epoch": 17.03168, + "grad_norm": 1.091143012046814, + "learning_rate": 2.3403361344537817e-05, + "loss": 0.5431, + "step": 13306 + }, + { + "epoch": 17.03296, + "grad_norm": 0.9813713431358337, + "learning_rate": 2.3401360544217686e-05, + "loss": 0.4549, + "step": 13307 + }, + { + "epoch": 17.03424, + "grad_norm": 0.9769755601882935, + "learning_rate": 2.339935974389756e-05, + "loss": 0.468, + "step": 13308 + }, + { + "epoch": 17.03552, + "grad_norm": 1.0127575397491455, + "learning_rate": 2.3397358943577433e-05, + "loss": 0.5391, + "step": 13309 + }, + { + "epoch": 17.0368, + "grad_norm": 1.0613768100738525, + "learning_rate": 2.3395358143257305e-05, + "loss": 0.5372, + "step": 13310 + }, + { + "epoch": 17.03808, + "grad_norm": 0.9925457239151001, + "learning_rate": 2.3393357342937173e-05, + "loss": 0.4997, + "step": 13311 + }, + { + "epoch": 17.03936, + "grad_norm": 
0.9370602965354919, + "learning_rate": 2.339135654261705e-05, + "loss": 0.474, + "step": 13312 + }, + { + "epoch": 17.04064, + "grad_norm": 1.0538114309310913, + "learning_rate": 2.338935574229692e-05, + "loss": 0.4834, + "step": 13313 + }, + { + "epoch": 17.04192, + "grad_norm": 1.0021767616271973, + "learning_rate": 2.3387354941976792e-05, + "loss": 0.503, + "step": 13314 + }, + { + "epoch": 17.0432, + "grad_norm": 1.0035548210144043, + "learning_rate": 2.3385354141656664e-05, + "loss": 0.4991, + "step": 13315 + }, + { + "epoch": 17.04448, + "grad_norm": 1.0278247594833374, + "learning_rate": 2.3383353341336536e-05, + "loss": 0.4885, + "step": 13316 + }, + { + "epoch": 17.04576, + "grad_norm": 1.0547192096710205, + "learning_rate": 2.3381352541016408e-05, + "loss": 0.5255, + "step": 13317 + }, + { + "epoch": 17.04704, + "grad_norm": 1.0186964273452759, + "learning_rate": 2.337935174069628e-05, + "loss": 0.5052, + "step": 13318 + }, + { + "epoch": 17.04832, + "grad_norm": 1.0712857246398926, + "learning_rate": 2.337735094037615e-05, + "loss": 0.4951, + "step": 13319 + }, + { + "epoch": 17.0496, + "grad_norm": 1.0552232265472412, + "learning_rate": 2.3375350140056023e-05, + "loss": 0.5391, + "step": 13320 + }, + { + "epoch": 17.05088, + "grad_norm": 0.9899628758430481, + "learning_rate": 2.3373349339735895e-05, + "loss": 0.4996, + "step": 13321 + }, + { + "epoch": 17.05216, + "grad_norm": 1.085460901260376, + "learning_rate": 2.3371348539415767e-05, + "loss": 0.5247, + "step": 13322 + }, + { + "epoch": 17.05344, + "grad_norm": 1.0540425777435303, + "learning_rate": 2.336934773909564e-05, + "loss": 0.5785, + "step": 13323 + }, + { + "epoch": 17.05472, + "grad_norm": 1.0214723348617554, + "learning_rate": 2.336734693877551e-05, + "loss": 0.5305, + "step": 13324 + }, + { + "epoch": 17.056, + "grad_norm": 1.0059022903442383, + "learning_rate": 2.3365346138455383e-05, + "loss": 0.487, + "step": 13325 + }, + { + "epoch": 17.05728, + "grad_norm": 1.063796877861023, + 
"learning_rate": 2.3363345338135258e-05, + "loss": 0.5328, + "step": 13326 + }, + { + "epoch": 17.05856, + "grad_norm": 1.0171808004379272, + "learning_rate": 2.3361344537815126e-05, + "loss": 0.4785, + "step": 13327 + }, + { + "epoch": 17.05984, + "grad_norm": 1.0254714488983154, + "learning_rate": 2.3359343737494998e-05, + "loss": 0.501, + "step": 13328 + }, + { + "epoch": 17.06112, + "grad_norm": 1.0111802816390991, + "learning_rate": 2.335734293717487e-05, + "loss": 0.5116, + "step": 13329 + }, + { + "epoch": 17.0624, + "grad_norm": 1.0492706298828125, + "learning_rate": 2.3355342136854745e-05, + "loss": 0.5129, + "step": 13330 + }, + { + "epoch": 17.06368, + "grad_norm": 1.0869669914245605, + "learning_rate": 2.3353341336534614e-05, + "loss": 0.4696, + "step": 13331 + }, + { + "epoch": 17.06496, + "grad_norm": 1.0306788682937622, + "learning_rate": 2.3351340536214486e-05, + "loss": 0.4977, + "step": 13332 + }, + { + "epoch": 17.06624, + "grad_norm": 0.9910968542098999, + "learning_rate": 2.334933973589436e-05, + "loss": 0.4801, + "step": 13333 + }, + { + "epoch": 17.06752, + "grad_norm": 0.9870019555091858, + "learning_rate": 2.3347338935574233e-05, + "loss": 0.4426, + "step": 13334 + }, + { + "epoch": 17.0688, + "grad_norm": 1.0062360763549805, + "learning_rate": 2.33453381352541e-05, + "loss": 0.5187, + "step": 13335 + }, + { + "epoch": 17.07008, + "grad_norm": 1.0343031883239746, + "learning_rate": 2.3343337334933973e-05, + "loss": 0.5119, + "step": 13336 + }, + { + "epoch": 17.07136, + "grad_norm": 1.0217961072921753, + "learning_rate": 2.334133653461385e-05, + "loss": 0.5541, + "step": 13337 + }, + { + "epoch": 17.07264, + "grad_norm": 1.0784296989440918, + "learning_rate": 2.333933573429372e-05, + "loss": 0.5345, + "step": 13338 + }, + { + "epoch": 17.07392, + "grad_norm": 1.029133677482605, + "learning_rate": 2.333733493397359e-05, + "loss": 0.5253, + "step": 13339 + }, + { + "epoch": 17.0752, + "grad_norm": 1.0548806190490723, + "learning_rate": 
2.3335334133653464e-05, + "loss": 0.5053, + "step": 13340 + }, + { + "epoch": 17.07648, + "grad_norm": 1.0157487392425537, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.4751, + "step": 13341 + }, + { + "epoch": 17.07776, + "grad_norm": 1.0173470973968506, + "learning_rate": 2.3331332533013208e-05, + "loss": 0.4752, + "step": 13342 + }, + { + "epoch": 17.07904, + "grad_norm": 0.9745169281959534, + "learning_rate": 2.3329331732693076e-05, + "loss": 0.486, + "step": 13343 + }, + { + "epoch": 17.08032, + "grad_norm": 0.9814679622650146, + "learning_rate": 2.332733093237295e-05, + "loss": 0.5124, + "step": 13344 + }, + { + "epoch": 17.0816, + "grad_norm": 0.9977136850357056, + "learning_rate": 2.3325330132052823e-05, + "loss": 0.4984, + "step": 13345 + }, + { + "epoch": 17.08288, + "grad_norm": 0.9431197643280029, + "learning_rate": 2.3323329331732695e-05, + "loss": 0.4411, + "step": 13346 + }, + { + "epoch": 17.08416, + "grad_norm": 1.0017660856246948, + "learning_rate": 2.3321328531412567e-05, + "loss": 0.5211, + "step": 13347 + }, + { + "epoch": 17.08544, + "grad_norm": 0.9581298232078552, + "learning_rate": 2.331932773109244e-05, + "loss": 0.5069, + "step": 13348 + }, + { + "epoch": 17.08672, + "grad_norm": 1.0260916948318481, + "learning_rate": 2.331732693077231e-05, + "loss": 0.5305, + "step": 13349 + }, + { + "epoch": 17.088, + "grad_norm": 1.032827615737915, + "learning_rate": 2.3315326130452183e-05, + "loss": 0.5041, + "step": 13350 + }, + { + "epoch": 17.08928, + "grad_norm": 1.0370451211929321, + "learning_rate": 2.3313325330132054e-05, + "loss": 0.4872, + "step": 13351 + }, + { + "epoch": 17.09056, + "grad_norm": 1.0834031105041504, + "learning_rate": 2.3311324529811926e-05, + "loss": 0.5399, + "step": 13352 + }, + { + "epoch": 17.09184, + "grad_norm": 1.0523909330368042, + "learning_rate": 2.3309323729491798e-05, + "loss": 0.4924, + "step": 13353 + }, + { + "epoch": 17.09312, + "grad_norm": 1.0100966691970825, + "learning_rate": 
2.330732292917167e-05, + "loss": 0.5309, + "step": 13354 + }, + { + "epoch": 17.0944, + "grad_norm": 1.0047094821929932, + "learning_rate": 2.3305322128851542e-05, + "loss": 0.4504, + "step": 13355 + }, + { + "epoch": 17.09568, + "grad_norm": 1.0484347343444824, + "learning_rate": 2.3303321328531414e-05, + "loss": 0.5239, + "step": 13356 + }, + { + "epoch": 17.09696, + "grad_norm": 1.0204596519470215, + "learning_rate": 2.3301320528211286e-05, + "loss": 0.5148, + "step": 13357 + }, + { + "epoch": 17.09824, + "grad_norm": 0.9914649128913879, + "learning_rate": 2.3299319727891157e-05, + "loss": 0.46, + "step": 13358 + }, + { + "epoch": 17.09952, + "grad_norm": 1.0509700775146484, + "learning_rate": 2.329731892757103e-05, + "loss": 0.4891, + "step": 13359 + }, + { + "epoch": 17.1008, + "grad_norm": 1.0409367084503174, + "learning_rate": 2.32953181272509e-05, + "loss": 0.5121, + "step": 13360 + }, + { + "epoch": 17.10208, + "grad_norm": 1.0810407400131226, + "learning_rate": 2.3293317326930776e-05, + "loss": 0.4976, + "step": 13361 + }, + { + "epoch": 17.10336, + "grad_norm": 1.0771454572677612, + "learning_rate": 2.3291316526610645e-05, + "loss": 0.4679, + "step": 13362 + }, + { + "epoch": 17.10464, + "grad_norm": 1.1189974546432495, + "learning_rate": 2.3289315726290517e-05, + "loss": 0.572, + "step": 13363 + }, + { + "epoch": 17.10592, + "grad_norm": 1.017249584197998, + "learning_rate": 2.328731492597039e-05, + "loss": 0.5068, + "step": 13364 + }, + { + "epoch": 17.1072, + "grad_norm": 0.990572988986969, + "learning_rate": 2.3285314125650264e-05, + "loss": 0.4766, + "step": 13365 + }, + { + "epoch": 17.10848, + "grad_norm": 1.0602976083755493, + "learning_rate": 2.3283313325330132e-05, + "loss": 0.4712, + "step": 13366 + }, + { + "epoch": 17.10976, + "grad_norm": 1.0583609342575073, + "learning_rate": 2.3281312525010004e-05, + "loss": 0.5533, + "step": 13367 + }, + { + "epoch": 17.11104, + "grad_norm": 1.0422335863113403, + "learning_rate": 2.327931172468988e-05, + 
"loss": 0.5399, + "step": 13368 + }, + { + "epoch": 17.11232, + "grad_norm": 0.9954720139503479, + "learning_rate": 2.327731092436975e-05, + "loss": 0.4705, + "step": 13369 + }, + { + "epoch": 17.1136, + "grad_norm": 1.0625905990600586, + "learning_rate": 2.327531012404962e-05, + "loss": 0.5102, + "step": 13370 + }, + { + "epoch": 17.11488, + "grad_norm": 1.03073251247406, + "learning_rate": 2.327330932372949e-05, + "loss": 0.5117, + "step": 13371 + }, + { + "epoch": 17.11616, + "grad_norm": 0.9936673045158386, + "learning_rate": 2.3271308523409367e-05, + "loss": 0.5368, + "step": 13372 + }, + { + "epoch": 17.11744, + "grad_norm": 1.014151692390442, + "learning_rate": 2.326930772308924e-05, + "loss": 0.4523, + "step": 13373 + }, + { + "epoch": 17.11872, + "grad_norm": 1.0433018207550049, + "learning_rate": 2.3267306922769107e-05, + "loss": 0.5037, + "step": 13374 + }, + { + "epoch": 17.12, + "grad_norm": 1.0825138092041016, + "learning_rate": 2.326530612244898e-05, + "loss": 0.5088, + "step": 13375 + }, + { + "epoch": 17.12128, + "grad_norm": 1.0675740242004395, + "learning_rate": 2.3263305322128854e-05, + "loss": 0.5024, + "step": 13376 + }, + { + "epoch": 17.12256, + "grad_norm": 1.002501130104065, + "learning_rate": 2.3261304521808726e-05, + "loss": 0.4748, + "step": 13377 + }, + { + "epoch": 17.12384, + "grad_norm": 1.042352557182312, + "learning_rate": 2.3259303721488595e-05, + "loss": 0.4869, + "step": 13378 + }, + { + "epoch": 17.12512, + "grad_norm": 1.0172719955444336, + "learning_rate": 2.325730292116847e-05, + "loss": 0.4917, + "step": 13379 + }, + { + "epoch": 17.1264, + "grad_norm": 0.9971911907196045, + "learning_rate": 2.325530212084834e-05, + "loss": 0.4934, + "step": 13380 + }, + { + "epoch": 17.12768, + "grad_norm": 0.9829162359237671, + "learning_rate": 2.3253301320528213e-05, + "loss": 0.5106, + "step": 13381 + }, + { + "epoch": 17.12896, + "grad_norm": 1.0853031873703003, + "learning_rate": 2.3251300520208082e-05, + "loss": 0.5248, + "step": 
13382 + }, + { + "epoch": 17.13024, + "grad_norm": 1.0231006145477295, + "learning_rate": 2.3249299719887957e-05, + "loss": 0.5012, + "step": 13383 + }, + { + "epoch": 17.13152, + "grad_norm": 1.0397844314575195, + "learning_rate": 2.324729891956783e-05, + "loss": 0.5402, + "step": 13384 + }, + { + "epoch": 17.1328, + "grad_norm": 1.0465348958969116, + "learning_rate": 2.32452981192477e-05, + "loss": 0.5115, + "step": 13385 + }, + { + "epoch": 17.13408, + "grad_norm": 1.0421459674835205, + "learning_rate": 2.3243297318927573e-05, + "loss": 0.4913, + "step": 13386 + }, + { + "epoch": 17.13536, + "grad_norm": 0.991023063659668, + "learning_rate": 2.3241296518607445e-05, + "loss": 0.5016, + "step": 13387 + }, + { + "epoch": 17.13664, + "grad_norm": 1.0662450790405273, + "learning_rate": 2.3239295718287316e-05, + "loss": 0.5424, + "step": 13388 + }, + { + "epoch": 17.13792, + "grad_norm": 1.0365347862243652, + "learning_rate": 2.323729491796719e-05, + "loss": 0.534, + "step": 13389 + }, + { + "epoch": 17.1392, + "grad_norm": 1.0068080425262451, + "learning_rate": 2.323529411764706e-05, + "loss": 0.4582, + "step": 13390 + }, + { + "epoch": 17.14048, + "grad_norm": 0.9789687991142273, + "learning_rate": 2.3233293317326932e-05, + "loss": 0.4683, + "step": 13391 + }, + { + "epoch": 17.14176, + "grad_norm": 0.9680244326591492, + "learning_rate": 2.3231292517006804e-05, + "loss": 0.47, + "step": 13392 + }, + { + "epoch": 17.14304, + "grad_norm": 1.0483875274658203, + "learning_rate": 2.3229291716686676e-05, + "loss": 0.5163, + "step": 13393 + }, + { + "epoch": 17.14432, + "grad_norm": 0.998321533203125, + "learning_rate": 2.3227290916366548e-05, + "loss": 0.5058, + "step": 13394 + }, + { + "epoch": 17.1456, + "grad_norm": 1.0452215671539307, + "learning_rate": 2.322529011604642e-05, + "loss": 0.5377, + "step": 13395 + }, + { + "epoch": 17.14688, + "grad_norm": 1.0813066959381104, + "learning_rate": 2.322328931572629e-05, + "loss": 0.4744, + "step": 13396 + }, + { + "epoch": 
17.14816, + "grad_norm": 1.0850361585617065, + "learning_rate": 2.3221288515406163e-05, + "loss": 0.5534, + "step": 13397 + }, + { + "epoch": 17.14944, + "grad_norm": 1.034781575202942, + "learning_rate": 2.3219287715086035e-05, + "loss": 0.5279, + "step": 13398 + }, + { + "epoch": 17.15072, + "grad_norm": 1.0639398097991943, + "learning_rate": 2.3217286914765907e-05, + "loss": 0.5009, + "step": 13399 + }, + { + "epoch": 17.152, + "grad_norm": 1.0536409616470337, + "learning_rate": 2.3215286114445782e-05, + "loss": 0.5132, + "step": 13400 + }, + { + "epoch": 17.15328, + "grad_norm": 1.0476963520050049, + "learning_rate": 2.321328531412565e-05, + "loss": 0.5241, + "step": 13401 + }, + { + "epoch": 17.15456, + "grad_norm": 1.0148664712905884, + "learning_rate": 2.3211284513805522e-05, + "loss": 0.4843, + "step": 13402 + }, + { + "epoch": 17.15584, + "grad_norm": 0.9978867769241333, + "learning_rate": 2.3209283713485394e-05, + "loss": 0.5003, + "step": 13403 + }, + { + "epoch": 17.15712, + "grad_norm": 1.0880756378173828, + "learning_rate": 2.320728291316527e-05, + "loss": 0.5188, + "step": 13404 + }, + { + "epoch": 17.1584, + "grad_norm": 1.064671516418457, + "learning_rate": 2.3205282112845138e-05, + "loss": 0.5898, + "step": 13405 + }, + { + "epoch": 17.15968, + "grad_norm": 1.0149790048599243, + "learning_rate": 2.320328131252501e-05, + "loss": 0.4774, + "step": 13406 + }, + { + "epoch": 17.16096, + "grad_norm": 0.992540180683136, + "learning_rate": 2.3201280512204885e-05, + "loss": 0.4796, + "step": 13407 + }, + { + "epoch": 17.16224, + "grad_norm": 1.0193244218826294, + "learning_rate": 2.3199279711884757e-05, + "loss": 0.503, + "step": 13408 + }, + { + "epoch": 17.16352, + "grad_norm": 1.0211812257766724, + "learning_rate": 2.3197278911564625e-05, + "loss": 0.5344, + "step": 13409 + }, + { + "epoch": 17.1648, + "grad_norm": 0.9962586164474487, + "learning_rate": 2.3195278111244497e-05, + "loss": 0.477, + "step": 13410 + }, + { + "epoch": 17.16608, + 
"grad_norm": 1.0423258543014526, + "learning_rate": 2.3193277310924373e-05, + "loss": 0.5273, + "step": 13411 + }, + { + "epoch": 17.16736, + "grad_norm": 1.0666061639785767, + "learning_rate": 2.3191276510604244e-05, + "loss": 0.4738, + "step": 13412 + }, + { + "epoch": 17.16864, + "grad_norm": 1.0447877645492554, + "learning_rate": 2.3189275710284113e-05, + "loss": 0.5042, + "step": 13413 + }, + { + "epoch": 17.16992, + "grad_norm": 1.041363000869751, + "learning_rate": 2.3187274909963988e-05, + "loss": 0.5165, + "step": 13414 + }, + { + "epoch": 17.1712, + "grad_norm": 0.9531150460243225, + "learning_rate": 2.318527410964386e-05, + "loss": 0.4656, + "step": 13415 + }, + { + "epoch": 17.17248, + "grad_norm": 0.975502610206604, + "learning_rate": 2.3183273309323732e-05, + "loss": 0.4421, + "step": 13416 + }, + { + "epoch": 17.17376, + "grad_norm": 1.0709878206253052, + "learning_rate": 2.31812725090036e-05, + "loss": 0.5181, + "step": 13417 + }, + { + "epoch": 17.17504, + "grad_norm": 1.0228393077850342, + "learning_rate": 2.3179271708683476e-05, + "loss": 0.488, + "step": 13418 + }, + { + "epoch": 17.17632, + "grad_norm": 0.9908788800239563, + "learning_rate": 2.3177270908363347e-05, + "loss": 0.5135, + "step": 13419 + }, + { + "epoch": 17.1776, + "grad_norm": 1.0089093446731567, + "learning_rate": 2.317527010804322e-05, + "loss": 0.4801, + "step": 13420 + }, + { + "epoch": 17.17888, + "grad_norm": 1.0426452159881592, + "learning_rate": 2.317326930772309e-05, + "loss": 0.5075, + "step": 13421 + }, + { + "epoch": 17.18016, + "grad_norm": 1.021355390548706, + "learning_rate": 2.3171268507402963e-05, + "loss": 0.4907, + "step": 13422 + }, + { + "epoch": 17.18144, + "grad_norm": 1.0270839929580688, + "learning_rate": 2.3169267707082835e-05, + "loss": 0.4559, + "step": 13423 + }, + { + "epoch": 17.18272, + "grad_norm": 0.9954252243041992, + "learning_rate": 2.3167266906762707e-05, + "loss": 0.5026, + "step": 13424 + }, + { + "epoch": 17.184, + "grad_norm": 
0.9519053101539612, + "learning_rate": 2.316526610644258e-05, + "loss": 0.465, + "step": 13425 + }, + { + "epoch": 17.18528, + "grad_norm": 0.966871976852417, + "learning_rate": 2.316326530612245e-05, + "loss": 0.4922, + "step": 13426 + }, + { + "epoch": 17.18656, + "grad_norm": 1.1276544332504272, + "learning_rate": 2.3161264505802322e-05, + "loss": 0.5007, + "step": 13427 + }, + { + "epoch": 17.18784, + "grad_norm": 1.0328774452209473, + "learning_rate": 2.3159263705482194e-05, + "loss": 0.4846, + "step": 13428 + }, + { + "epoch": 17.18912, + "grad_norm": 1.0253841876983643, + "learning_rate": 2.3157262905162066e-05, + "loss": 0.48, + "step": 13429 + }, + { + "epoch": 17.1904, + "grad_norm": 1.1044909954071045, + "learning_rate": 2.3155262104841938e-05, + "loss": 0.4854, + "step": 13430 + }, + { + "epoch": 17.19168, + "grad_norm": 1.1031279563903809, + "learning_rate": 2.315326130452181e-05, + "loss": 0.5431, + "step": 13431 + }, + { + "epoch": 17.19296, + "grad_norm": 1.0845425128936768, + "learning_rate": 2.315126050420168e-05, + "loss": 0.5446, + "step": 13432 + }, + { + "epoch": 17.19424, + "grad_norm": 1.0719091892242432, + "learning_rate": 2.3149259703881553e-05, + "loss": 0.525, + "step": 13433 + }, + { + "epoch": 17.19552, + "grad_norm": 1.115065097808838, + "learning_rate": 2.3147258903561425e-05, + "loss": 0.5301, + "step": 13434 + }, + { + "epoch": 17.1968, + "grad_norm": 1.0398420095443726, + "learning_rate": 2.3145258103241297e-05, + "loss": 0.5075, + "step": 13435 + }, + { + "epoch": 17.19808, + "grad_norm": 0.9958305358886719, + "learning_rate": 2.314325730292117e-05, + "loss": 0.4951, + "step": 13436 + }, + { + "epoch": 17.19936, + "grad_norm": 1.0259790420532227, + "learning_rate": 2.314125650260104e-05, + "loss": 0.4844, + "step": 13437 + }, + { + "epoch": 17.20064, + "grad_norm": 1.1417039632797241, + "learning_rate": 2.3139255702280913e-05, + "loss": 0.5455, + "step": 13438 + }, + { + "epoch": 17.20192, + "grad_norm": 1.0283827781677246, + 
"learning_rate": 2.3137254901960788e-05, + "loss": 0.4931, + "step": 13439 + }, + { + "epoch": 17.2032, + "grad_norm": 0.9671483635902405, + "learning_rate": 2.3135254101640656e-05, + "loss": 0.4683, + "step": 13440 + }, + { + "epoch": 17.20448, + "grad_norm": 0.9877151250839233, + "learning_rate": 2.3133253301320528e-05, + "loss": 0.4812, + "step": 13441 + }, + { + "epoch": 17.20576, + "grad_norm": 1.0239356756210327, + "learning_rate": 2.31312525010004e-05, + "loss": 0.4782, + "step": 13442 + }, + { + "epoch": 17.20704, + "grad_norm": 1.1128602027893066, + "learning_rate": 2.3129251700680275e-05, + "loss": 0.5034, + "step": 13443 + }, + { + "epoch": 17.20832, + "grad_norm": 1.0178550481796265, + "learning_rate": 2.3127250900360144e-05, + "loss": 0.5063, + "step": 13444 + }, + { + "epoch": 17.209600000000002, + "grad_norm": 1.0738757848739624, + "learning_rate": 2.3125250100040016e-05, + "loss": 0.5127, + "step": 13445 + }, + { + "epoch": 17.21088, + "grad_norm": 1.0246436595916748, + "learning_rate": 2.312324929971989e-05, + "loss": 0.5128, + "step": 13446 + }, + { + "epoch": 17.21216, + "grad_norm": 1.0808448791503906, + "learning_rate": 2.3121248499399763e-05, + "loss": 0.5116, + "step": 13447 + }, + { + "epoch": 17.21344, + "grad_norm": 1.0280914306640625, + "learning_rate": 2.311924769907963e-05, + "loss": 0.5175, + "step": 13448 + }, + { + "epoch": 17.21472, + "grad_norm": 1.0112377405166626, + "learning_rate": 2.3117246898759503e-05, + "loss": 0.4666, + "step": 13449 + }, + { + "epoch": 17.216, + "grad_norm": 1.0588704347610474, + "learning_rate": 2.311524609843938e-05, + "loss": 0.4821, + "step": 13450 + }, + { + "epoch": 17.21728, + "grad_norm": 1.005056381225586, + "learning_rate": 2.311324529811925e-05, + "loss": 0.5253, + "step": 13451 + }, + { + "epoch": 17.21856, + "grad_norm": 0.979955792427063, + "learning_rate": 2.311124449779912e-05, + "loss": 0.4832, + "step": 13452 + }, + { + "epoch": 17.21984, + "grad_norm": 0.989325225353241, + 
"learning_rate": 2.3109243697478994e-05, + "loss": 0.514, + "step": 13453 + }, + { + "epoch": 17.22112, + "grad_norm": 1.1037169694900513, + "learning_rate": 2.3107242897158866e-05, + "loss": 0.5589, + "step": 13454 + }, + { + "epoch": 17.2224, + "grad_norm": 1.101965069770813, + "learning_rate": 2.3105242096838738e-05, + "loss": 0.5082, + "step": 13455 + }, + { + "epoch": 17.22368, + "grad_norm": 1.0133678913116455, + "learning_rate": 2.3103241296518606e-05, + "loss": 0.5253, + "step": 13456 + }, + { + "epoch": 17.22496, + "grad_norm": 0.9965882301330566, + "learning_rate": 2.310124049619848e-05, + "loss": 0.4828, + "step": 13457 + }, + { + "epoch": 17.22624, + "grad_norm": 1.0856853723526, + "learning_rate": 2.3099239695878353e-05, + "loss": 0.533, + "step": 13458 + }, + { + "epoch": 17.22752, + "grad_norm": 0.9921742081642151, + "learning_rate": 2.3097238895558225e-05, + "loss": 0.5221, + "step": 13459 + }, + { + "epoch": 17.2288, + "grad_norm": 1.0070276260375977, + "learning_rate": 2.3095238095238097e-05, + "loss": 0.4822, + "step": 13460 + }, + { + "epoch": 17.23008, + "grad_norm": 1.0681109428405762, + "learning_rate": 2.309323729491797e-05, + "loss": 0.5146, + "step": 13461 + }, + { + "epoch": 17.23136, + "grad_norm": 1.0588202476501465, + "learning_rate": 2.309123649459784e-05, + "loss": 0.484, + "step": 13462 + }, + { + "epoch": 17.23264, + "grad_norm": 1.072527527809143, + "learning_rate": 2.3089235694277712e-05, + "loss": 0.5442, + "step": 13463 + }, + { + "epoch": 17.23392, + "grad_norm": 1.0427266359329224, + "learning_rate": 2.3087234893957584e-05, + "loss": 0.4935, + "step": 13464 + }, + { + "epoch": 17.2352, + "grad_norm": 1.0477319955825806, + "learning_rate": 2.3085234093637456e-05, + "loss": 0.4786, + "step": 13465 + }, + { + "epoch": 17.23648, + "grad_norm": 1.1097187995910645, + "learning_rate": 2.3083233293317328e-05, + "loss": 0.5198, + "step": 13466 + }, + { + "epoch": 17.23776, + "grad_norm": 1.0430151224136353, + "learning_rate": 
2.30812324929972e-05, + "loss": 0.5408, + "step": 13467 + }, + { + "epoch": 17.23904, + "grad_norm": 1.084143042564392, + "learning_rate": 2.3079231692677072e-05, + "loss": 0.5259, + "step": 13468 + }, + { + "epoch": 17.24032, + "grad_norm": 1.034934163093567, + "learning_rate": 2.3077230892356944e-05, + "loss": 0.5068, + "step": 13469 + }, + { + "epoch": 17.2416, + "grad_norm": 0.9962584972381592, + "learning_rate": 2.3075230092036815e-05, + "loss": 0.4753, + "step": 13470 + }, + { + "epoch": 17.24288, + "grad_norm": 1.0584473609924316, + "learning_rate": 2.3073229291716687e-05, + "loss": 0.4369, + "step": 13471 + }, + { + "epoch": 17.24416, + "grad_norm": 1.0417580604553223, + "learning_rate": 2.307122849139656e-05, + "loss": 0.547, + "step": 13472 + }, + { + "epoch": 17.24544, + "grad_norm": 0.9787613749504089, + "learning_rate": 2.306922769107643e-05, + "loss": 0.4825, + "step": 13473 + }, + { + "epoch": 17.24672, + "grad_norm": 0.97577303647995, + "learning_rate": 2.3067226890756306e-05, + "loss": 0.4792, + "step": 13474 + }, + { + "epoch": 17.248, + "grad_norm": 1.0570039749145508, + "learning_rate": 2.3065226090436175e-05, + "loss": 0.4757, + "step": 13475 + }, + { + "epoch": 17.24928, + "grad_norm": 0.9857975840568542, + "learning_rate": 2.3063225290116047e-05, + "loss": 0.541, + "step": 13476 + }, + { + "epoch": 17.25056, + "grad_norm": 1.0609130859375, + "learning_rate": 2.306122448979592e-05, + "loss": 0.547, + "step": 13477 + }, + { + "epoch": 17.25184, + "grad_norm": 1.0602465867996216, + "learning_rate": 2.3059223689475794e-05, + "loss": 0.5039, + "step": 13478 + }, + { + "epoch": 17.25312, + "grad_norm": 1.033346176147461, + "learning_rate": 2.3057222889155662e-05, + "loss": 0.4925, + "step": 13479 + }, + { + "epoch": 17.2544, + "grad_norm": 1.0470037460327148, + "learning_rate": 2.3055222088835534e-05, + "loss": 0.5261, + "step": 13480 + }, + { + "epoch": 17.25568, + "grad_norm": 1.0750315189361572, + "learning_rate": 2.305322128851541e-05, + 
"loss": 0.5026, + "step": 13481 + }, + { + "epoch": 17.25696, + "grad_norm": 1.0455011129379272, + "learning_rate": 2.305122048819528e-05, + "loss": 0.4955, + "step": 13482 + }, + { + "epoch": 17.25824, + "grad_norm": 0.9804509282112122, + "learning_rate": 2.304921968787515e-05, + "loss": 0.4493, + "step": 13483 + }, + { + "epoch": 17.25952, + "grad_norm": 1.0425822734832764, + "learning_rate": 2.304721888755502e-05, + "loss": 0.5265, + "step": 13484 + }, + { + "epoch": 17.2608, + "grad_norm": 1.0288803577423096, + "learning_rate": 2.3045218087234897e-05, + "loss": 0.5335, + "step": 13485 + }, + { + "epoch": 17.26208, + "grad_norm": 1.0013490915298462, + "learning_rate": 2.304321728691477e-05, + "loss": 0.5016, + "step": 13486 + }, + { + "epoch": 17.26336, + "grad_norm": 0.9813631176948547, + "learning_rate": 2.3041216486594637e-05, + "loss": 0.4945, + "step": 13487 + }, + { + "epoch": 17.26464, + "grad_norm": 1.004841923713684, + "learning_rate": 2.303921568627451e-05, + "loss": 0.5357, + "step": 13488 + }, + { + "epoch": 17.26592, + "grad_norm": 1.007587194442749, + "learning_rate": 2.3037214885954384e-05, + "loss": 0.4896, + "step": 13489 + }, + { + "epoch": 17.2672, + "grad_norm": 1.0714519023895264, + "learning_rate": 2.3035214085634256e-05, + "loss": 0.5111, + "step": 13490 + }, + { + "epoch": 17.26848, + "grad_norm": 1.0348763465881348, + "learning_rate": 2.3033213285314124e-05, + "loss": 0.4855, + "step": 13491 + }, + { + "epoch": 17.26976, + "grad_norm": 1.0198546648025513, + "learning_rate": 2.3031212484994e-05, + "loss": 0.5217, + "step": 13492 + }, + { + "epoch": 17.27104, + "grad_norm": 1.0204856395721436, + "learning_rate": 2.302921168467387e-05, + "loss": 0.5029, + "step": 13493 + }, + { + "epoch": 17.27232, + "grad_norm": 1.0740776062011719, + "learning_rate": 2.3027210884353743e-05, + "loss": 0.5458, + "step": 13494 + }, + { + "epoch": 17.2736, + "grad_norm": 1.0617952346801758, + "learning_rate": 2.3025210084033612e-05, + "loss": 0.5018, + "step": 
13495 + }, + { + "epoch": 17.27488, + "grad_norm": 0.9945394992828369, + "learning_rate": 2.3023209283713487e-05, + "loss": 0.5174, + "step": 13496 + }, + { + "epoch": 17.27616, + "grad_norm": 1.033125400543213, + "learning_rate": 2.302120848339336e-05, + "loss": 0.5133, + "step": 13497 + }, + { + "epoch": 17.27744, + "grad_norm": 1.0384323596954346, + "learning_rate": 2.301920768307323e-05, + "loss": 0.5283, + "step": 13498 + }, + { + "epoch": 17.27872, + "grad_norm": 1.0113089084625244, + "learning_rate": 2.3017206882753103e-05, + "loss": 0.4772, + "step": 13499 + }, + { + "epoch": 17.28, + "grad_norm": 1.0131186246871948, + "learning_rate": 2.3015206082432975e-05, + "loss": 0.4923, + "step": 13500 + }, + { + "epoch": 17.28128, + "grad_norm": 1.0085313320159912, + "learning_rate": 2.3013205282112846e-05, + "loss": 0.525, + "step": 13501 + }, + { + "epoch": 17.28256, + "grad_norm": 0.9978111386299133, + "learning_rate": 2.3011204481792718e-05, + "loss": 0.5065, + "step": 13502 + }, + { + "epoch": 17.28384, + "grad_norm": 1.0002354383468628, + "learning_rate": 2.300920368147259e-05, + "loss": 0.4846, + "step": 13503 + }, + { + "epoch": 17.28512, + "grad_norm": 1.0522061586380005, + "learning_rate": 2.3007202881152462e-05, + "loss": 0.5395, + "step": 13504 + }, + { + "epoch": 17.2864, + "grad_norm": 1.0922776460647583, + "learning_rate": 2.3005202080832334e-05, + "loss": 0.5106, + "step": 13505 + }, + { + "epoch": 17.28768, + "grad_norm": 1.027981162071228, + "learning_rate": 2.3003201280512206e-05, + "loss": 0.4703, + "step": 13506 + }, + { + "epoch": 17.28896, + "grad_norm": 1.0434186458587646, + "learning_rate": 2.3001200480192078e-05, + "loss": 0.5184, + "step": 13507 + }, + { + "epoch": 17.29024, + "grad_norm": 1.038582682609558, + "learning_rate": 2.299919967987195e-05, + "loss": 0.5318, + "step": 13508 + }, + { + "epoch": 17.29152, + "grad_norm": 1.0460221767425537, + "learning_rate": 2.299719887955182e-05, + "loss": 0.5173, + "step": 13509 + }, + { + 
"epoch": 17.2928, + "grad_norm": 1.0172295570373535, + "learning_rate": 2.2995198079231693e-05, + "loss": 0.4987, + "step": 13510 + }, + { + "epoch": 17.29408, + "grad_norm": 1.1028789281845093, + "learning_rate": 2.2993197278911565e-05, + "loss": 0.545, + "step": 13511 + }, + { + "epoch": 17.29536, + "grad_norm": 1.0653462409973145, + "learning_rate": 2.2991196478591437e-05, + "loss": 0.5482, + "step": 13512 + }, + { + "epoch": 17.29664, + "grad_norm": 1.0706826448440552, + "learning_rate": 2.2989195678271312e-05, + "loss": 0.5027, + "step": 13513 + }, + { + "epoch": 17.29792, + "grad_norm": 1.0168458223342896, + "learning_rate": 2.298719487795118e-05, + "loss": 0.5167, + "step": 13514 + }, + { + "epoch": 17.2992, + "grad_norm": 0.9857696890830994, + "learning_rate": 2.2985194077631052e-05, + "loss": 0.5102, + "step": 13515 + }, + { + "epoch": 17.30048, + "grad_norm": 1.0181854963302612, + "learning_rate": 2.2983193277310924e-05, + "loss": 0.4949, + "step": 13516 + }, + { + "epoch": 17.30176, + "grad_norm": 1.10274338722229, + "learning_rate": 2.29811924769908e-05, + "loss": 0.5449, + "step": 13517 + }, + { + "epoch": 17.30304, + "grad_norm": 1.0308619737625122, + "learning_rate": 2.2979191676670668e-05, + "loss": 0.5152, + "step": 13518 + }, + { + "epoch": 17.30432, + "grad_norm": 1.0274208784103394, + "learning_rate": 2.297719087635054e-05, + "loss": 0.5126, + "step": 13519 + }, + { + "epoch": 17.3056, + "grad_norm": 1.0590589046478271, + "learning_rate": 2.2975190076030415e-05, + "loss": 0.516, + "step": 13520 + }, + { + "epoch": 17.30688, + "grad_norm": 1.0562139749526978, + "learning_rate": 2.2973189275710287e-05, + "loss": 0.5627, + "step": 13521 + }, + { + "epoch": 17.30816, + "grad_norm": 1.004634141921997, + "learning_rate": 2.2971188475390155e-05, + "loss": 0.49, + "step": 13522 + }, + { + "epoch": 17.30944, + "grad_norm": 1.0177441835403442, + "learning_rate": 2.2969187675070027e-05, + "loss": 0.4957, + "step": 13523 + }, + { + "epoch": 17.31072, + 
"grad_norm": 1.0136632919311523, + "learning_rate": 2.2967186874749903e-05, + "loss": 0.507, + "step": 13524 + }, + { + "epoch": 17.312, + "grad_norm": 0.9950380325317383, + "learning_rate": 2.2965186074429774e-05, + "loss": 0.4692, + "step": 13525 + }, + { + "epoch": 17.31328, + "grad_norm": 1.0641790628433228, + "learning_rate": 2.2963185274109643e-05, + "loss": 0.4797, + "step": 13526 + }, + { + "epoch": 17.31456, + "grad_norm": 1.120981216430664, + "learning_rate": 2.2961184473789518e-05, + "loss": 0.5234, + "step": 13527 + }, + { + "epoch": 17.31584, + "grad_norm": 1.044729471206665, + "learning_rate": 2.295918367346939e-05, + "loss": 0.4913, + "step": 13528 + }, + { + "epoch": 17.31712, + "grad_norm": 1.029380202293396, + "learning_rate": 2.2957182873149262e-05, + "loss": 0.5046, + "step": 13529 + }, + { + "epoch": 17.3184, + "grad_norm": 1.0592548847198486, + "learning_rate": 2.295518207282913e-05, + "loss": 0.4982, + "step": 13530 + }, + { + "epoch": 17.31968, + "grad_norm": 1.0422892570495605, + "learning_rate": 2.2953181272509006e-05, + "loss": 0.5258, + "step": 13531 + }, + { + "epoch": 17.32096, + "grad_norm": 1.0550442934036255, + "learning_rate": 2.2951180472188877e-05, + "loss": 0.5656, + "step": 13532 + }, + { + "epoch": 17.32224, + "grad_norm": 1.009332299232483, + "learning_rate": 2.294917967186875e-05, + "loss": 0.473, + "step": 13533 + }, + { + "epoch": 17.32352, + "grad_norm": 1.0183335542678833, + "learning_rate": 2.294717887154862e-05, + "loss": 0.5133, + "step": 13534 + }, + { + "epoch": 17.3248, + "grad_norm": 1.013777732849121, + "learning_rate": 2.2945178071228493e-05, + "loss": 0.5414, + "step": 13535 + }, + { + "epoch": 17.32608, + "grad_norm": 1.0590869188308716, + "learning_rate": 2.2943177270908365e-05, + "loss": 0.5051, + "step": 13536 + }, + { + "epoch": 17.32736, + "grad_norm": 1.0427137613296509, + "learning_rate": 2.2941176470588237e-05, + "loss": 0.5346, + "step": 13537 + }, + { + "epoch": 17.32864, + "grad_norm": 
0.9942336678504944, + "learning_rate": 2.293917567026811e-05, + "loss": 0.4948, + "step": 13538 + }, + { + "epoch": 17.32992, + "grad_norm": 1.0325044393539429, + "learning_rate": 2.293717486994798e-05, + "loss": 0.5118, + "step": 13539 + }, + { + "epoch": 17.3312, + "grad_norm": 1.0152339935302734, + "learning_rate": 2.2935174069627852e-05, + "loss": 0.5144, + "step": 13540 + }, + { + "epoch": 17.33248, + "grad_norm": 1.0435190200805664, + "learning_rate": 2.2933173269307724e-05, + "loss": 0.5059, + "step": 13541 + }, + { + "epoch": 17.33376, + "grad_norm": 1.0658721923828125, + "learning_rate": 2.2931172468987596e-05, + "loss": 0.5129, + "step": 13542 + }, + { + "epoch": 17.33504, + "grad_norm": 1.0623787641525269, + "learning_rate": 2.2929171668667468e-05, + "loss": 0.4906, + "step": 13543 + }, + { + "epoch": 17.33632, + "grad_norm": 1.0778841972351074, + "learning_rate": 2.292717086834734e-05, + "loss": 0.5309, + "step": 13544 + }, + { + "epoch": 17.3376, + "grad_norm": 1.048268437385559, + "learning_rate": 2.292517006802721e-05, + "loss": 0.5067, + "step": 13545 + }, + { + "epoch": 17.33888, + "grad_norm": 1.0772168636322021, + "learning_rate": 2.2923169267707083e-05, + "loss": 0.5369, + "step": 13546 + }, + { + "epoch": 17.34016, + "grad_norm": 1.0442776679992676, + "learning_rate": 2.2921168467386955e-05, + "loss": 0.5185, + "step": 13547 + }, + { + "epoch": 17.34144, + "grad_norm": 1.0044589042663574, + "learning_rate": 2.2919167667066827e-05, + "loss": 0.4811, + "step": 13548 + }, + { + "epoch": 17.34272, + "grad_norm": 1.067563533782959, + "learning_rate": 2.29171668667467e-05, + "loss": 0.5228, + "step": 13549 + }, + { + "epoch": 17.344, + "grad_norm": 1.026764988899231, + "learning_rate": 2.291516606642657e-05, + "loss": 0.5043, + "step": 13550 + }, + { + "epoch": 17.34528, + "grad_norm": 1.0351366996765137, + "learning_rate": 2.2913165266106443e-05, + "loss": 0.5151, + "step": 13551 + }, + { + "epoch": 17.34656, + "grad_norm": 1.0662847757339478, + 
"learning_rate": 2.2911164465786318e-05, + "loss": 0.5239, + "step": 13552 + }, + { + "epoch": 17.34784, + "grad_norm": 1.1130237579345703, + "learning_rate": 2.2909163665466186e-05, + "loss": 0.5437, + "step": 13553 + }, + { + "epoch": 17.34912, + "grad_norm": 1.0121694803237915, + "learning_rate": 2.2907162865146058e-05, + "loss": 0.4844, + "step": 13554 + }, + { + "epoch": 17.3504, + "grad_norm": 1.0110938549041748, + "learning_rate": 2.290516206482593e-05, + "loss": 0.4779, + "step": 13555 + }, + { + "epoch": 17.35168, + "grad_norm": 1.0952173471450806, + "learning_rate": 2.2903161264505805e-05, + "loss": 0.5603, + "step": 13556 + }, + { + "epoch": 17.35296, + "grad_norm": 1.018991231918335, + "learning_rate": 2.2901160464185674e-05, + "loss": 0.5099, + "step": 13557 + }, + { + "epoch": 17.35424, + "grad_norm": 1.0593852996826172, + "learning_rate": 2.2899159663865546e-05, + "loss": 0.5467, + "step": 13558 + }, + { + "epoch": 17.35552, + "grad_norm": 1.0670006275177002, + "learning_rate": 2.289715886354542e-05, + "loss": 0.495, + "step": 13559 + }, + { + "epoch": 17.3568, + "grad_norm": 1.0311278104782104, + "learning_rate": 2.2895158063225293e-05, + "loss": 0.481, + "step": 13560 + }, + { + "epoch": 17.35808, + "grad_norm": 1.0428582429885864, + "learning_rate": 2.289315726290516e-05, + "loss": 0.4989, + "step": 13561 + }, + { + "epoch": 17.35936, + "grad_norm": 1.0917315483093262, + "learning_rate": 2.2891156462585033e-05, + "loss": 0.5338, + "step": 13562 + }, + { + "epoch": 17.36064, + "grad_norm": 0.999539852142334, + "learning_rate": 2.288915566226491e-05, + "loss": 0.5455, + "step": 13563 + }, + { + "epoch": 17.36192, + "grad_norm": 1.0183244943618774, + "learning_rate": 2.288715486194478e-05, + "loss": 0.515, + "step": 13564 + }, + { + "epoch": 17.3632, + "grad_norm": 1.0455024242401123, + "learning_rate": 2.288515406162465e-05, + "loss": 0.5141, + "step": 13565 + }, + { + "epoch": 17.36448, + "grad_norm": 1.0188703536987305, + "learning_rate": 
2.2883153261304524e-05, + "loss": 0.4722, + "step": 13566 + }, + { + "epoch": 17.36576, + "grad_norm": 1.0280158519744873, + "learning_rate": 2.2881152460984396e-05, + "loss": 0.5013, + "step": 13567 + }, + { + "epoch": 17.36704, + "grad_norm": 1.048551082611084, + "learning_rate": 2.2879151660664268e-05, + "loss": 0.5232, + "step": 13568 + }, + { + "epoch": 17.36832, + "grad_norm": 0.9960765838623047, + "learning_rate": 2.2877150860344136e-05, + "loss": 0.5111, + "step": 13569 + }, + { + "epoch": 17.3696, + "grad_norm": 0.9852240085601807, + "learning_rate": 2.287515006002401e-05, + "loss": 0.4906, + "step": 13570 + }, + { + "epoch": 17.37088, + "grad_norm": 1.0797041654586792, + "learning_rate": 2.2873149259703883e-05, + "loss": 0.5014, + "step": 13571 + }, + { + "epoch": 17.37216, + "grad_norm": 1.0308854579925537, + "learning_rate": 2.2871148459383755e-05, + "loss": 0.4957, + "step": 13572 + }, + { + "epoch": 17.37344, + "grad_norm": 1.0648754835128784, + "learning_rate": 2.2869147659063627e-05, + "loss": 0.4862, + "step": 13573 + }, + { + "epoch": 17.37472, + "grad_norm": 1.0644230842590332, + "learning_rate": 2.28671468587435e-05, + "loss": 0.5365, + "step": 13574 + }, + { + "epoch": 17.376, + "grad_norm": 1.071189522743225, + "learning_rate": 2.286514605842337e-05, + "loss": 0.5117, + "step": 13575 + }, + { + "epoch": 17.37728, + "grad_norm": 1.0346791744232178, + "learning_rate": 2.2863145258103242e-05, + "loss": 0.532, + "step": 13576 + }, + { + "epoch": 17.37856, + "grad_norm": 1.0855563879013062, + "learning_rate": 2.2861144457783114e-05, + "loss": 0.5115, + "step": 13577 + }, + { + "epoch": 17.37984, + "grad_norm": 0.9857675433158875, + "learning_rate": 2.2859143657462986e-05, + "loss": 0.473, + "step": 13578 + }, + { + "epoch": 17.38112, + "grad_norm": 0.993156373500824, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.4923, + "step": 13579 + }, + { + "epoch": 17.3824, + "grad_norm": 1.099673867225647, + "learning_rate": 2.285514205682273e-05, + 
"loss": 0.5255, + "step": 13580 + }, + { + "epoch": 17.38368, + "grad_norm": 1.0772475004196167, + "learning_rate": 2.2853141256502602e-05, + "loss": 0.5019, + "step": 13581 + }, + { + "epoch": 17.38496, + "grad_norm": 1.078121304512024, + "learning_rate": 2.2851140456182474e-05, + "loss": 0.5288, + "step": 13582 + }, + { + "epoch": 17.38624, + "grad_norm": 0.9957267642021179, + "learning_rate": 2.2849139655862345e-05, + "loss": 0.4716, + "step": 13583 + }, + { + "epoch": 17.38752, + "grad_norm": 1.0318092107772827, + "learning_rate": 2.2847138855542217e-05, + "loss": 0.4994, + "step": 13584 + }, + { + "epoch": 17.3888, + "grad_norm": 1.078931212425232, + "learning_rate": 2.284513805522209e-05, + "loss": 0.538, + "step": 13585 + }, + { + "epoch": 17.39008, + "grad_norm": 1.092041254043579, + "learning_rate": 2.284313725490196e-05, + "loss": 0.5409, + "step": 13586 + }, + { + "epoch": 17.39136, + "grad_norm": 1.0563862323760986, + "learning_rate": 2.2841136454581836e-05, + "loss": 0.4859, + "step": 13587 + }, + { + "epoch": 17.39264, + "grad_norm": 1.045304298400879, + "learning_rate": 2.2839135654261705e-05, + "loss": 0.4956, + "step": 13588 + }, + { + "epoch": 17.39392, + "grad_norm": 1.0128166675567627, + "learning_rate": 2.2837134853941577e-05, + "loss": 0.476, + "step": 13589 + }, + { + "epoch": 17.3952, + "grad_norm": 0.9987632036209106, + "learning_rate": 2.283513405362145e-05, + "loss": 0.4838, + "step": 13590 + }, + { + "epoch": 17.39648, + "grad_norm": 1.012587547302246, + "learning_rate": 2.2833133253301324e-05, + "loss": 0.4904, + "step": 13591 + }, + { + "epoch": 17.39776, + "grad_norm": 1.0492074489593506, + "learning_rate": 2.2831132452981192e-05, + "loss": 0.5372, + "step": 13592 + }, + { + "epoch": 17.39904, + "grad_norm": 1.0137112140655518, + "learning_rate": 2.2829131652661064e-05, + "loss": 0.4681, + "step": 13593 + }, + { + "epoch": 17.40032, + "grad_norm": 1.0790600776672363, + "learning_rate": 2.282713085234094e-05, + "loss": 0.5596, + 
"step": 13594 + }, + { + "epoch": 17.4016, + "grad_norm": 0.9867367744445801, + "learning_rate": 2.282513005202081e-05, + "loss": 0.4537, + "step": 13595 + }, + { + "epoch": 17.40288, + "grad_norm": 1.0197902917861938, + "learning_rate": 2.282312925170068e-05, + "loss": 0.5013, + "step": 13596 + }, + { + "epoch": 17.40416, + "grad_norm": 1.0174394845962524, + "learning_rate": 2.282112845138055e-05, + "loss": 0.512, + "step": 13597 + }, + { + "epoch": 17.40544, + "grad_norm": 1.000441312789917, + "learning_rate": 2.2819127651060427e-05, + "loss": 0.4996, + "step": 13598 + }, + { + "epoch": 17.40672, + "grad_norm": 1.0385953187942505, + "learning_rate": 2.28171268507403e-05, + "loss": 0.5161, + "step": 13599 + }, + { + "epoch": 17.408, + "grad_norm": 1.1014004945755005, + "learning_rate": 2.2815126050420167e-05, + "loss": 0.4972, + "step": 13600 + }, + { + "epoch": 17.40928, + "grad_norm": 1.0908480882644653, + "learning_rate": 2.281312525010004e-05, + "loss": 0.5294, + "step": 13601 + }, + { + "epoch": 17.41056, + "grad_norm": 1.0491387844085693, + "learning_rate": 2.2811124449779914e-05, + "loss": 0.5295, + "step": 13602 + }, + { + "epoch": 17.41184, + "grad_norm": 1.1002565622329712, + "learning_rate": 2.2809123649459786e-05, + "loss": 0.5095, + "step": 13603 + }, + { + "epoch": 17.41312, + "grad_norm": 1.077600359916687, + "learning_rate": 2.2807122849139654e-05, + "loss": 0.485, + "step": 13604 + }, + { + "epoch": 17.4144, + "grad_norm": 1.1393417119979858, + "learning_rate": 2.280512204881953e-05, + "loss": 0.5553, + "step": 13605 + }, + { + "epoch": 17.41568, + "grad_norm": 1.087091088294983, + "learning_rate": 2.28031212484994e-05, + "loss": 0.5519, + "step": 13606 + }, + { + "epoch": 17.41696, + "grad_norm": 1.0322144031524658, + "learning_rate": 2.2801120448179273e-05, + "loss": 0.5036, + "step": 13607 + }, + { + "epoch": 17.41824, + "grad_norm": 1.0630682706832886, + "learning_rate": 2.2799119647859142e-05, + "loss": 0.5113, + "step": 13608 + }, + { + 
"epoch": 17.41952, + "grad_norm": 1.0402500629425049, + "learning_rate": 2.2797118847539017e-05, + "loss": 0.4692, + "step": 13609 + }, + { + "epoch": 17.4208, + "grad_norm": 1.02988600730896, + "learning_rate": 2.279511804721889e-05, + "loss": 0.4878, + "step": 13610 + }, + { + "epoch": 17.42208, + "grad_norm": 1.0270915031433105, + "learning_rate": 2.279311724689876e-05, + "loss": 0.4991, + "step": 13611 + }, + { + "epoch": 17.42336, + "grad_norm": 0.9723539352416992, + "learning_rate": 2.2791116446578633e-05, + "loss": 0.4838, + "step": 13612 + }, + { + "epoch": 17.42464, + "grad_norm": 1.06536066532135, + "learning_rate": 2.2789115646258505e-05, + "loss": 0.5087, + "step": 13613 + }, + { + "epoch": 17.42592, + "grad_norm": 1.0233323574066162, + "learning_rate": 2.2787114845938376e-05, + "loss": 0.5379, + "step": 13614 + }, + { + "epoch": 17.4272, + "grad_norm": 1.0208466053009033, + "learning_rate": 2.2785114045618248e-05, + "loss": 0.4942, + "step": 13615 + }, + { + "epoch": 17.42848, + "grad_norm": 1.020458698272705, + "learning_rate": 2.278311324529812e-05, + "loss": 0.5093, + "step": 13616 + }, + { + "epoch": 17.42976, + "grad_norm": 1.0593698024749756, + "learning_rate": 2.2781112444977992e-05, + "loss": 0.5102, + "step": 13617 + }, + { + "epoch": 17.43104, + "grad_norm": 1.0400246381759644, + "learning_rate": 2.2779111644657864e-05, + "loss": 0.5102, + "step": 13618 + }, + { + "epoch": 17.43232, + "grad_norm": 1.0877729654312134, + "learning_rate": 2.2777110844337736e-05, + "loss": 0.5376, + "step": 13619 + }, + { + "epoch": 17.4336, + "grad_norm": 1.0149822235107422, + "learning_rate": 2.2775110044017608e-05, + "loss": 0.5096, + "step": 13620 + }, + { + "epoch": 17.43488, + "grad_norm": 1.0287227630615234, + "learning_rate": 2.277310924369748e-05, + "loss": 0.5049, + "step": 13621 + }, + { + "epoch": 17.43616, + "grad_norm": 0.98603755235672, + "learning_rate": 2.277110844337735e-05, + "loss": 0.4833, + "step": 13622 + }, + { + "epoch": 17.43744, + 
"grad_norm": 1.0274131298065186, + "learning_rate": 2.2769107643057223e-05, + "loss": 0.4929, + "step": 13623 + }, + { + "epoch": 17.43872, + "grad_norm": 1.006752610206604, + "learning_rate": 2.2767106842737095e-05, + "loss": 0.4812, + "step": 13624 + }, + { + "epoch": 17.44, + "grad_norm": 1.0845232009887695, + "learning_rate": 2.2765106042416967e-05, + "loss": 0.5392, + "step": 13625 + }, + { + "epoch": 17.44128, + "grad_norm": 1.0171184539794922, + "learning_rate": 2.2763105242096842e-05, + "loss": 0.4821, + "step": 13626 + }, + { + "epoch": 17.44256, + "grad_norm": 1.019047498703003, + "learning_rate": 2.276110444177671e-05, + "loss": 0.5066, + "step": 13627 + }, + { + "epoch": 17.44384, + "grad_norm": 0.9561603665351868, + "learning_rate": 2.2759103641456582e-05, + "loss": 0.4448, + "step": 13628 + }, + { + "epoch": 17.44512, + "grad_norm": 1.011527180671692, + "learning_rate": 2.2757102841136454e-05, + "loss": 0.4852, + "step": 13629 + }, + { + "epoch": 17.4464, + "grad_norm": 1.019681453704834, + "learning_rate": 2.275510204081633e-05, + "loss": 0.4853, + "step": 13630 + }, + { + "epoch": 17.44768, + "grad_norm": 1.005802035331726, + "learning_rate": 2.2753101240496198e-05, + "loss": 0.4922, + "step": 13631 + }, + { + "epoch": 17.44896, + "grad_norm": 1.0661100149154663, + "learning_rate": 2.275110044017607e-05, + "loss": 0.5282, + "step": 13632 + }, + { + "epoch": 17.45024, + "grad_norm": 1.1106263399124146, + "learning_rate": 2.2749099639855945e-05, + "loss": 0.5464, + "step": 13633 + }, + { + "epoch": 17.45152, + "grad_norm": 1.0604326725006104, + "learning_rate": 2.2747098839535817e-05, + "loss": 0.4823, + "step": 13634 + }, + { + "epoch": 17.4528, + "grad_norm": 1.089737057685852, + "learning_rate": 2.2745098039215685e-05, + "loss": 0.5092, + "step": 13635 + }, + { + "epoch": 17.45408, + "grad_norm": 1.071022391319275, + "learning_rate": 2.2743097238895557e-05, + "loss": 0.5489, + "step": 13636 + }, + { + "epoch": 17.45536, + "grad_norm": 
1.0083519220352173, + "learning_rate": 2.2741096438575433e-05, + "loss": 0.479, + "step": 13637 + }, + { + "epoch": 17.45664, + "grad_norm": 1.0257757902145386, + "learning_rate": 2.2739095638255304e-05, + "loss": 0.5014, + "step": 13638 + }, + { + "epoch": 17.45792, + "grad_norm": 1.0760114192962646, + "learning_rate": 2.2737094837935173e-05, + "loss": 0.57, + "step": 13639 + }, + { + "epoch": 17.4592, + "grad_norm": 1.0728118419647217, + "learning_rate": 2.2735094037615048e-05, + "loss": 0.5392, + "step": 13640 + }, + { + "epoch": 17.46048, + "grad_norm": 1.043785572052002, + "learning_rate": 2.273309323729492e-05, + "loss": 0.5259, + "step": 13641 + }, + { + "epoch": 17.46176, + "grad_norm": 1.0340181589126587, + "learning_rate": 2.2731092436974792e-05, + "loss": 0.5326, + "step": 13642 + }, + { + "epoch": 17.46304, + "grad_norm": 1.094905138015747, + "learning_rate": 2.272909163665466e-05, + "loss": 0.5099, + "step": 13643 + }, + { + "epoch": 17.46432, + "grad_norm": 1.0578844547271729, + "learning_rate": 2.2727090836334536e-05, + "loss": 0.5155, + "step": 13644 + }, + { + "epoch": 17.4656, + "grad_norm": 1.028712272644043, + "learning_rate": 2.2725090036014407e-05, + "loss": 0.487, + "step": 13645 + }, + { + "epoch": 17.46688, + "grad_norm": 1.080743670463562, + "learning_rate": 2.272308923569428e-05, + "loss": 0.5096, + "step": 13646 + }, + { + "epoch": 17.46816, + "grad_norm": 1.0488895177841187, + "learning_rate": 2.272108843537415e-05, + "loss": 0.477, + "step": 13647 + }, + { + "epoch": 17.46944, + "grad_norm": 1.0804858207702637, + "learning_rate": 2.2719087635054023e-05, + "loss": 0.5193, + "step": 13648 + }, + { + "epoch": 17.47072, + "grad_norm": 1.074198603630066, + "learning_rate": 2.2717086834733895e-05, + "loss": 0.518, + "step": 13649 + }, + { + "epoch": 17.472, + "grad_norm": 1.0862224102020264, + "learning_rate": 2.2715086034413767e-05, + "loss": 0.5612, + "step": 13650 + }, + { + "epoch": 17.47328, + "grad_norm": 1.0850809812545776, + 
"learning_rate": 2.271308523409364e-05, + "loss": 0.5247, + "step": 13651 + }, + { + "epoch": 17.47456, + "grad_norm": 1.0903500318527222, + "learning_rate": 2.271108443377351e-05, + "loss": 0.5088, + "step": 13652 + }, + { + "epoch": 17.47584, + "grad_norm": 1.0023061037063599, + "learning_rate": 2.2709083633453382e-05, + "loss": 0.5279, + "step": 13653 + }, + { + "epoch": 17.47712, + "grad_norm": 0.980223536491394, + "learning_rate": 2.2707082833133254e-05, + "loss": 0.5039, + "step": 13654 + }, + { + "epoch": 17.4784, + "grad_norm": 1.0036488771438599, + "learning_rate": 2.2705082032813126e-05, + "loss": 0.5017, + "step": 13655 + }, + { + "epoch": 17.47968, + "grad_norm": 1.0191391706466675, + "learning_rate": 2.2703081232492998e-05, + "loss": 0.4988, + "step": 13656 + }, + { + "epoch": 17.48096, + "grad_norm": 0.9996136426925659, + "learning_rate": 2.270108043217287e-05, + "loss": 0.5057, + "step": 13657 + }, + { + "epoch": 17.48224, + "grad_norm": 1.033124327659607, + "learning_rate": 2.2699079631852745e-05, + "loss": 0.5378, + "step": 13658 + }, + { + "epoch": 17.48352, + "grad_norm": 1.1039026975631714, + "learning_rate": 2.2697078831532613e-05, + "loss": 0.5528, + "step": 13659 + }, + { + "epoch": 17.4848, + "grad_norm": 1.0512938499450684, + "learning_rate": 2.2695078031212485e-05, + "loss": 0.4989, + "step": 13660 + }, + { + "epoch": 17.48608, + "grad_norm": 1.054443120956421, + "learning_rate": 2.2693077230892357e-05, + "loss": 0.5054, + "step": 13661 + }, + { + "epoch": 17.48736, + "grad_norm": 1.0660535097122192, + "learning_rate": 2.2691076430572232e-05, + "loss": 0.504, + "step": 13662 + }, + { + "epoch": 17.48864, + "grad_norm": 0.9800325036048889, + "learning_rate": 2.26890756302521e-05, + "loss": 0.4377, + "step": 13663 + }, + { + "epoch": 17.48992, + "grad_norm": 1.013704538345337, + "learning_rate": 2.2687074829931973e-05, + "loss": 0.5319, + "step": 13664 + }, + { + "epoch": 17.4912, + "grad_norm": 1.0099461078643799, + "learning_rate": 
2.2685074029611848e-05, + "loss": 0.4664, + "step": 13665 + }, + { + "epoch": 17.49248, + "grad_norm": 1.0758755207061768, + "learning_rate": 2.268307322929172e-05, + "loss": 0.5314, + "step": 13666 + }, + { + "epoch": 17.49376, + "grad_norm": 1.0263993740081787, + "learning_rate": 2.2681072428971588e-05, + "loss": 0.4989, + "step": 13667 + }, + { + "epoch": 17.49504, + "grad_norm": 1.1413928270339966, + "learning_rate": 2.267907162865146e-05, + "loss": 0.5251, + "step": 13668 + }, + { + "epoch": 17.49632, + "grad_norm": 0.9962292909622192, + "learning_rate": 2.2677070828331335e-05, + "loss": 0.485, + "step": 13669 + }, + { + "epoch": 17.4976, + "grad_norm": 0.9963700771331787, + "learning_rate": 2.2675070028011207e-05, + "loss": 0.5073, + "step": 13670 + }, + { + "epoch": 17.49888, + "grad_norm": 1.0219050645828247, + "learning_rate": 2.2673069227691076e-05, + "loss": 0.5144, + "step": 13671 + }, + { + "epoch": 17.50016, + "grad_norm": 0.9802215695381165, + "learning_rate": 2.267106842737095e-05, + "loss": 0.5066, + "step": 13672 + }, + { + "epoch": 17.50144, + "grad_norm": 0.9986078143119812, + "learning_rate": 2.2669067627050823e-05, + "loss": 0.4959, + "step": 13673 + }, + { + "epoch": 17.50272, + "grad_norm": 1.017099142074585, + "learning_rate": 2.2667066826730695e-05, + "loss": 0.49, + "step": 13674 + }, + { + "epoch": 17.504, + "grad_norm": 1.0355932712554932, + "learning_rate": 2.2665066026410563e-05, + "loss": 0.5282, + "step": 13675 + }, + { + "epoch": 17.50528, + "grad_norm": 0.9949687123298645, + "learning_rate": 2.266306522609044e-05, + "loss": 0.4931, + "step": 13676 + }, + { + "epoch": 17.50656, + "grad_norm": 0.975506603717804, + "learning_rate": 2.266106442577031e-05, + "loss": 0.525, + "step": 13677 + }, + { + "epoch": 17.50784, + "grad_norm": 1.0513207912445068, + "learning_rate": 2.2659063625450182e-05, + "loss": 0.5377, + "step": 13678 + }, + { + "epoch": 17.50912, + "grad_norm": 1.0164129734039307, + "learning_rate": 2.2657062825130054e-05, + 
"loss": 0.4787, + "step": 13679 + }, + { + "epoch": 17.5104, + "grad_norm": 1.022405982017517, + "learning_rate": 2.2655062024809926e-05, + "loss": 0.4741, + "step": 13680 + }, + { + "epoch": 17.51168, + "grad_norm": 1.0068978071212769, + "learning_rate": 2.2653061224489798e-05, + "loss": 0.4911, + "step": 13681 + }, + { + "epoch": 17.51296, + "grad_norm": 0.9547240138053894, + "learning_rate": 2.265106042416967e-05, + "loss": 0.4298, + "step": 13682 + }, + { + "epoch": 17.51424, + "grad_norm": 0.969224750995636, + "learning_rate": 2.264905962384954e-05, + "loss": 0.4678, + "step": 13683 + }, + { + "epoch": 17.51552, + "grad_norm": 0.9585568904876709, + "learning_rate": 2.2647058823529413e-05, + "loss": 0.4658, + "step": 13684 + }, + { + "epoch": 17.5168, + "grad_norm": 1.0375971794128418, + "learning_rate": 2.2645058023209285e-05, + "loss": 0.5459, + "step": 13685 + }, + { + "epoch": 17.51808, + "grad_norm": 1.1199681758880615, + "learning_rate": 2.2643057222889157e-05, + "loss": 0.5432, + "step": 13686 + }, + { + "epoch": 17.51936, + "grad_norm": 1.0504252910614014, + "learning_rate": 2.264105642256903e-05, + "loss": 0.531, + "step": 13687 + }, + { + "epoch": 17.52064, + "grad_norm": 1.0494788885116577, + "learning_rate": 2.26390556222489e-05, + "loss": 0.5018, + "step": 13688 + }, + { + "epoch": 17.52192, + "grad_norm": 1.0414574146270752, + "learning_rate": 2.2637054821928772e-05, + "loss": 0.5317, + "step": 13689 + }, + { + "epoch": 17.5232, + "grad_norm": 1.058502197265625, + "learning_rate": 2.2635054021608644e-05, + "loss": 0.5139, + "step": 13690 + }, + { + "epoch": 17.52448, + "grad_norm": 1.1048195362091064, + "learning_rate": 2.2633053221288516e-05, + "loss": 0.5667, + "step": 13691 + }, + { + "epoch": 17.52576, + "grad_norm": 1.0545276403427124, + "learning_rate": 2.2631052420968388e-05, + "loss": 0.4954, + "step": 13692 + }, + { + "epoch": 17.52704, + "grad_norm": 1.0869113206863403, + "learning_rate": 2.2629051620648263e-05, + "loss": 0.564, + 
"step": 13693 + }, + { + "epoch": 17.52832, + "grad_norm": 1.0492844581604004, + "learning_rate": 2.2627050820328132e-05, + "loss": 0.54, + "step": 13694 + }, + { + "epoch": 17.5296, + "grad_norm": 1.0702095031738281, + "learning_rate": 2.2625050020008004e-05, + "loss": 0.5171, + "step": 13695 + }, + { + "epoch": 17.53088, + "grad_norm": 1.0479813814163208, + "learning_rate": 2.2623049219687875e-05, + "loss": 0.5329, + "step": 13696 + }, + { + "epoch": 17.53216, + "grad_norm": 1.0335677862167358, + "learning_rate": 2.262104841936775e-05, + "loss": 0.5585, + "step": 13697 + }, + { + "epoch": 17.53344, + "grad_norm": 1.0604044198989868, + "learning_rate": 2.261904761904762e-05, + "loss": 0.5967, + "step": 13698 + }, + { + "epoch": 17.53472, + "grad_norm": 0.9840775728225708, + "learning_rate": 2.261704681872749e-05, + "loss": 0.496, + "step": 13699 + }, + { + "epoch": 17.536, + "grad_norm": 0.9827370643615723, + "learning_rate": 2.2615046018407366e-05, + "loss": 0.4696, + "step": 13700 + }, + { + "epoch": 17.53728, + "grad_norm": 1.0692596435546875, + "learning_rate": 2.2613045218087238e-05, + "loss": 0.5455, + "step": 13701 + }, + { + "epoch": 17.53856, + "grad_norm": 1.096990704536438, + "learning_rate": 2.2611044417767107e-05, + "loss": 0.5268, + "step": 13702 + }, + { + "epoch": 17.53984, + "grad_norm": 1.0311455726623535, + "learning_rate": 2.260904361744698e-05, + "loss": 0.5189, + "step": 13703 + }, + { + "epoch": 17.54112, + "grad_norm": 0.9897575378417969, + "learning_rate": 2.2607042817126854e-05, + "loss": 0.4777, + "step": 13704 + }, + { + "epoch": 17.5424, + "grad_norm": 1.0446470975875854, + "learning_rate": 2.2605042016806726e-05, + "loss": 0.5085, + "step": 13705 + }, + { + "epoch": 17.54368, + "grad_norm": 1.0381147861480713, + "learning_rate": 2.2603041216486594e-05, + "loss": 0.5308, + "step": 13706 + }, + { + "epoch": 17.54496, + "grad_norm": 1.0593794584274292, + "learning_rate": 2.260104041616647e-05, + "loss": 0.4676, + "step": 13707 + }, + { + 
"epoch": 17.54624, + "grad_norm": 1.0510727167129517, + "learning_rate": 2.259903961584634e-05, + "loss": 0.5292, + "step": 13708 + }, + { + "epoch": 17.54752, + "grad_norm": 1.047689437866211, + "learning_rate": 2.2597038815526213e-05, + "loss": 0.5205, + "step": 13709 + }, + { + "epoch": 17.5488, + "grad_norm": 1.0579311847686768, + "learning_rate": 2.259503801520608e-05, + "loss": 0.5061, + "step": 13710 + }, + { + "epoch": 17.55008, + "grad_norm": 1.0636060237884521, + "learning_rate": 2.2593037214885957e-05, + "loss": 0.5463, + "step": 13711 + }, + { + "epoch": 17.55136, + "grad_norm": 0.990552544593811, + "learning_rate": 2.259103641456583e-05, + "loss": 0.4527, + "step": 13712 + }, + { + "epoch": 17.55264, + "grad_norm": 1.0721439123153687, + "learning_rate": 2.25890356142457e-05, + "loss": 0.5183, + "step": 13713 + }, + { + "epoch": 17.55392, + "grad_norm": 1.0679312944412231, + "learning_rate": 2.258703481392557e-05, + "loss": 0.557, + "step": 13714 + }, + { + "epoch": 17.5552, + "grad_norm": 0.9733208417892456, + "learning_rate": 2.2585034013605444e-05, + "loss": 0.507, + "step": 13715 + }, + { + "epoch": 17.55648, + "grad_norm": 1.1034499406814575, + "learning_rate": 2.2583033213285316e-05, + "loss": 0.5612, + "step": 13716 + }, + { + "epoch": 17.557760000000002, + "grad_norm": 1.0827926397323608, + "learning_rate": 2.2581032412965188e-05, + "loss": 0.5139, + "step": 13717 + }, + { + "epoch": 17.55904, + "grad_norm": 0.9990882873535156, + "learning_rate": 2.257903161264506e-05, + "loss": 0.4797, + "step": 13718 + }, + { + "epoch": 17.56032, + "grad_norm": 0.9691088199615479, + "learning_rate": 2.257703081232493e-05, + "loss": 0.4385, + "step": 13719 + }, + { + "epoch": 17.5616, + "grad_norm": 1.0070648193359375, + "learning_rate": 2.2575030012004803e-05, + "loss": 0.4986, + "step": 13720 + }, + { + "epoch": 17.56288, + "grad_norm": 1.0735926628112793, + "learning_rate": 2.2573029211684675e-05, + "loss": 0.5415, + "step": 13721 + }, + { + "epoch": 
17.56416, + "grad_norm": 1.0106046199798584, + "learning_rate": 2.2571028411364547e-05, + "loss": 0.4826, + "step": 13722 + }, + { + "epoch": 17.56544, + "grad_norm": 1.0311856269836426, + "learning_rate": 2.256902761104442e-05, + "loss": 0.5534, + "step": 13723 + }, + { + "epoch": 17.56672, + "grad_norm": 1.014507532119751, + "learning_rate": 2.256702681072429e-05, + "loss": 0.4764, + "step": 13724 + }, + { + "epoch": 17.568, + "grad_norm": 1.0640530586242676, + "learning_rate": 2.2565026010404163e-05, + "loss": 0.5272, + "step": 13725 + }, + { + "epoch": 17.56928, + "grad_norm": 1.1166555881500244, + "learning_rate": 2.2563025210084035e-05, + "loss": 0.572, + "step": 13726 + }, + { + "epoch": 17.57056, + "grad_norm": 1.061682105064392, + "learning_rate": 2.2561024409763906e-05, + "loss": 0.5373, + "step": 13727 + }, + { + "epoch": 17.57184, + "grad_norm": 1.0302103757858276, + "learning_rate": 2.2559023609443778e-05, + "loss": 0.4871, + "step": 13728 + }, + { + "epoch": 17.57312, + "grad_norm": 1.040071964263916, + "learning_rate": 2.255702280912365e-05, + "loss": 0.5096, + "step": 13729 + }, + { + "epoch": 17.5744, + "grad_norm": 1.0365045070648193, + "learning_rate": 2.2555022008803522e-05, + "loss": 0.4992, + "step": 13730 + }, + { + "epoch": 17.57568, + "grad_norm": 1.0997463464736938, + "learning_rate": 2.2553021208483394e-05, + "loss": 0.532, + "step": 13731 + }, + { + "epoch": 17.57696, + "grad_norm": 1.0069595575332642, + "learning_rate": 2.255102040816327e-05, + "loss": 0.4886, + "step": 13732 + }, + { + "epoch": 17.57824, + "grad_norm": 1.1098484992980957, + "learning_rate": 2.2549019607843138e-05, + "loss": 0.526, + "step": 13733 + }, + { + "epoch": 17.57952, + "grad_norm": 1.034224033355713, + "learning_rate": 2.254701880752301e-05, + "loss": 0.5107, + "step": 13734 + }, + { + "epoch": 17.5808, + "grad_norm": 0.9972882866859436, + "learning_rate": 2.254501800720288e-05, + "loss": 0.4979, + "step": 13735 + }, + { + "epoch": 17.58208, + "grad_norm": 
1.033673644065857, + "learning_rate": 2.2543017206882756e-05, + "loss": 0.491, + "step": 13736 + }, + { + "epoch": 17.58336, + "grad_norm": 1.0336534976959229, + "learning_rate": 2.2541016406562625e-05, + "loss": 0.5155, + "step": 13737 + }, + { + "epoch": 17.58464, + "grad_norm": 1.0323609113693237, + "learning_rate": 2.2539015606242497e-05, + "loss": 0.5079, + "step": 13738 + }, + { + "epoch": 17.58592, + "grad_norm": 1.0115076303482056, + "learning_rate": 2.2537014805922372e-05, + "loss": 0.5356, + "step": 13739 + }, + { + "epoch": 17.5872, + "grad_norm": 0.9737477898597717, + "learning_rate": 2.2535014005602244e-05, + "loss": 0.4799, + "step": 13740 + }, + { + "epoch": 17.58848, + "grad_norm": 1.0207149982452393, + "learning_rate": 2.2533013205282112e-05, + "loss": 0.4796, + "step": 13741 + }, + { + "epoch": 17.58976, + "grad_norm": 1.0256177186965942, + "learning_rate": 2.2531012404961984e-05, + "loss": 0.5228, + "step": 13742 + }, + { + "epoch": 17.59104, + "grad_norm": 1.0296961069107056, + "learning_rate": 2.252901160464186e-05, + "loss": 0.5046, + "step": 13743 + }, + { + "epoch": 17.59232, + "grad_norm": 1.0195798873901367, + "learning_rate": 2.252701080432173e-05, + "loss": 0.4933, + "step": 13744 + }, + { + "epoch": 17.5936, + "grad_norm": 1.1169263124465942, + "learning_rate": 2.25250100040016e-05, + "loss": 0.5553, + "step": 13745 + }, + { + "epoch": 17.59488, + "grad_norm": 1.0939453840255737, + "learning_rate": 2.2523009203681475e-05, + "loss": 0.5122, + "step": 13746 + }, + { + "epoch": 17.59616, + "grad_norm": 1.0578433275222778, + "learning_rate": 2.2521008403361347e-05, + "loss": 0.5133, + "step": 13747 + }, + { + "epoch": 17.59744, + "grad_norm": 1.0464441776275635, + "learning_rate": 2.251900760304122e-05, + "loss": 0.5562, + "step": 13748 + }, + { + "epoch": 17.59872, + "grad_norm": 1.0032329559326172, + "learning_rate": 2.2517006802721087e-05, + "loss": 0.5132, + "step": 13749 + }, + { + "epoch": 17.6, + "grad_norm": 1.0605465173721313, + 
"learning_rate": 2.2515006002400962e-05, + "loss": 0.5201, + "step": 13750 + }, + { + "epoch": 17.60128, + "grad_norm": 1.0465607643127441, + "learning_rate": 2.2513005202080834e-05, + "loss": 0.5203, + "step": 13751 + }, + { + "epoch": 17.60256, + "grad_norm": 1.0221221446990967, + "learning_rate": 2.2511004401760706e-05, + "loss": 0.4927, + "step": 13752 + }, + { + "epoch": 17.60384, + "grad_norm": 0.970077633857727, + "learning_rate": 2.2509003601440578e-05, + "loss": 0.4861, + "step": 13753 + }, + { + "epoch": 17.60512, + "grad_norm": 1.0048155784606934, + "learning_rate": 2.250700280112045e-05, + "loss": 0.479, + "step": 13754 + }, + { + "epoch": 17.6064, + "grad_norm": 1.0469225645065308, + "learning_rate": 2.2505002000800322e-05, + "loss": 0.5249, + "step": 13755 + }, + { + "epoch": 17.60768, + "grad_norm": 1.0476946830749512, + "learning_rate": 2.2503001200480194e-05, + "loss": 0.4966, + "step": 13756 + }, + { + "epoch": 17.60896, + "grad_norm": 1.0547471046447754, + "learning_rate": 2.2501000400160065e-05, + "loss": 0.496, + "step": 13757 + }, + { + "epoch": 17.61024, + "grad_norm": 1.0567193031311035, + "learning_rate": 2.2498999599839937e-05, + "loss": 0.5211, + "step": 13758 + }, + { + "epoch": 17.61152, + "grad_norm": 1.0392005443572998, + "learning_rate": 2.249699879951981e-05, + "loss": 0.5055, + "step": 13759 + }, + { + "epoch": 17.6128, + "grad_norm": 1.090825080871582, + "learning_rate": 2.249499799919968e-05, + "loss": 0.5679, + "step": 13760 + }, + { + "epoch": 17.61408, + "grad_norm": 1.0225039720535278, + "learning_rate": 2.2492997198879553e-05, + "loss": 0.4868, + "step": 13761 + }, + { + "epoch": 17.61536, + "grad_norm": 1.011189341545105, + "learning_rate": 2.2490996398559425e-05, + "loss": 0.5217, + "step": 13762 + }, + { + "epoch": 17.61664, + "grad_norm": 1.0232630968093872, + "learning_rate": 2.2488995598239297e-05, + "loss": 0.4865, + "step": 13763 + }, + { + "epoch": 17.61792, + "grad_norm": 1.0764962434768677, + "learning_rate": 
2.248699479791917e-05, + "loss": 0.5524, + "step": 13764 + }, + { + "epoch": 17.6192, + "grad_norm": 1.0205249786376953, + "learning_rate": 2.248499399759904e-05, + "loss": 0.4819, + "step": 13765 + }, + { + "epoch": 17.62048, + "grad_norm": 1.089855670928955, + "learning_rate": 2.2482993197278912e-05, + "loss": 0.5628, + "step": 13766 + }, + { + "epoch": 17.62176, + "grad_norm": 1.1043068170547485, + "learning_rate": 2.2480992396958784e-05, + "loss": 0.5314, + "step": 13767 + }, + { + "epoch": 17.62304, + "grad_norm": 1.0724689960479736, + "learning_rate": 2.2478991596638656e-05, + "loss": 0.5377, + "step": 13768 + }, + { + "epoch": 17.62432, + "grad_norm": 1.0592753887176514, + "learning_rate": 2.2476990796318528e-05, + "loss": 0.5109, + "step": 13769 + }, + { + "epoch": 17.6256, + "grad_norm": 1.0443251132965088, + "learning_rate": 2.24749899959984e-05, + "loss": 0.54, + "step": 13770 + }, + { + "epoch": 17.62688, + "grad_norm": 1.0627379417419434, + "learning_rate": 2.2472989195678275e-05, + "loss": 0.5206, + "step": 13771 + }, + { + "epoch": 17.62816, + "grad_norm": 1.033038854598999, + "learning_rate": 2.2470988395358143e-05, + "loss": 0.4665, + "step": 13772 + }, + { + "epoch": 17.62944, + "grad_norm": 1.1007275581359863, + "learning_rate": 2.2468987595038015e-05, + "loss": 0.5408, + "step": 13773 + }, + { + "epoch": 17.63072, + "grad_norm": 1.051284670829773, + "learning_rate": 2.2466986794717887e-05, + "loss": 0.509, + "step": 13774 + }, + { + "epoch": 17.632, + "grad_norm": 1.059944748878479, + "learning_rate": 2.2464985994397762e-05, + "loss": 0.5193, + "step": 13775 + }, + { + "epoch": 17.63328, + "grad_norm": 1.0539425611495972, + "learning_rate": 2.246298519407763e-05, + "loss": 0.5161, + "step": 13776 + }, + { + "epoch": 17.63456, + "grad_norm": 0.9573315382003784, + "learning_rate": 2.2460984393757503e-05, + "loss": 0.4629, + "step": 13777 + }, + { + "epoch": 17.63584, + "grad_norm": 1.0628639459609985, + "learning_rate": 2.2458983593437378e-05, + 
"loss": 0.5524, + "step": 13778 + }, + { + "epoch": 17.63712, + "grad_norm": 0.9728816747665405, + "learning_rate": 2.245698279311725e-05, + "loss": 0.4881, + "step": 13779 + }, + { + "epoch": 17.6384, + "grad_norm": 0.996763288974762, + "learning_rate": 2.2454981992797118e-05, + "loss": 0.4589, + "step": 13780 + }, + { + "epoch": 17.63968, + "grad_norm": 1.0928605794906616, + "learning_rate": 2.245298119247699e-05, + "loss": 0.5651, + "step": 13781 + }, + { + "epoch": 17.64096, + "grad_norm": 1.067138433456421, + "learning_rate": 2.2450980392156865e-05, + "loss": 0.5257, + "step": 13782 + }, + { + "epoch": 17.64224, + "grad_norm": 1.0764461755752563, + "learning_rate": 2.2448979591836737e-05, + "loss": 0.5539, + "step": 13783 + }, + { + "epoch": 17.64352, + "grad_norm": 1.0611051321029663, + "learning_rate": 2.2446978791516606e-05, + "loss": 0.5756, + "step": 13784 + }, + { + "epoch": 17.6448, + "grad_norm": 1.0139261484146118, + "learning_rate": 2.244497799119648e-05, + "loss": 0.4869, + "step": 13785 + }, + { + "epoch": 17.64608, + "grad_norm": 1.0263339281082153, + "learning_rate": 2.2442977190876353e-05, + "loss": 0.5112, + "step": 13786 + }, + { + "epoch": 17.64736, + "grad_norm": 1.041606068611145, + "learning_rate": 2.2440976390556225e-05, + "loss": 0.5362, + "step": 13787 + }, + { + "epoch": 17.64864, + "grad_norm": 1.01603102684021, + "learning_rate": 2.2438975590236093e-05, + "loss": 0.4863, + "step": 13788 + }, + { + "epoch": 17.64992, + "grad_norm": 1.0228776931762695, + "learning_rate": 2.2436974789915968e-05, + "loss": 0.5174, + "step": 13789 + }, + { + "epoch": 17.6512, + "grad_norm": 1.0253472328186035, + "learning_rate": 2.243497398959584e-05, + "loss": 0.502, + "step": 13790 + }, + { + "epoch": 17.65248, + "grad_norm": 1.0483967065811157, + "learning_rate": 2.2432973189275712e-05, + "loss": 0.5241, + "step": 13791 + }, + { + "epoch": 17.65376, + "grad_norm": 0.9999913573265076, + "learning_rate": 2.2430972388955584e-05, + "loss": 0.5112, + 
"step": 13792 + }, + { + "epoch": 17.65504, + "grad_norm": 0.9996472001075745, + "learning_rate": 2.2428971588635456e-05, + "loss": 0.4946, + "step": 13793 + }, + { + "epoch": 17.65632, + "grad_norm": 1.0371149778366089, + "learning_rate": 2.2426970788315328e-05, + "loss": 0.5217, + "step": 13794 + }, + { + "epoch": 17.6576, + "grad_norm": 1.040626883506775, + "learning_rate": 2.24249699879952e-05, + "loss": 0.5226, + "step": 13795 + }, + { + "epoch": 17.65888, + "grad_norm": 1.0527793169021606, + "learning_rate": 2.242296918767507e-05, + "loss": 0.5302, + "step": 13796 + }, + { + "epoch": 17.66016, + "grad_norm": 1.0551269054412842, + "learning_rate": 2.2420968387354943e-05, + "loss": 0.524, + "step": 13797 + }, + { + "epoch": 17.66144, + "grad_norm": 1.0026994943618774, + "learning_rate": 2.2418967587034815e-05, + "loss": 0.4582, + "step": 13798 + }, + { + "epoch": 17.66272, + "grad_norm": 1.0396666526794434, + "learning_rate": 2.2416966786714687e-05, + "loss": 0.49, + "step": 13799 + }, + { + "epoch": 17.664, + "grad_norm": 1.0443949699401855, + "learning_rate": 2.241496598639456e-05, + "loss": 0.4863, + "step": 13800 + }, + { + "epoch": 17.66528, + "grad_norm": 1.02041757106781, + "learning_rate": 2.241296518607443e-05, + "loss": 0.5075, + "step": 13801 + }, + { + "epoch": 17.66656, + "grad_norm": 1.0015771389007568, + "learning_rate": 2.2410964385754302e-05, + "loss": 0.4828, + "step": 13802 + }, + { + "epoch": 17.667839999999998, + "grad_norm": 1.0166736841201782, + "learning_rate": 2.2408963585434174e-05, + "loss": 0.4961, + "step": 13803 + }, + { + "epoch": 17.66912, + "grad_norm": 1.0742013454437256, + "learning_rate": 2.2406962785114046e-05, + "loss": 0.5438, + "step": 13804 + }, + { + "epoch": 17.6704, + "grad_norm": 1.0339466333389282, + "learning_rate": 2.2404961984793918e-05, + "loss": 0.5245, + "step": 13805 + }, + { + "epoch": 17.67168, + "grad_norm": 1.0903747081756592, + "learning_rate": 2.2402961184473793e-05, + "loss": 0.5367, + "step": 13806 + 
}, + { + "epoch": 17.67296, + "grad_norm": 1.049980878829956, + "learning_rate": 2.2400960384153662e-05, + "loss": 0.5121, + "step": 13807 + }, + { + "epoch": 17.67424, + "grad_norm": 1.027825117111206, + "learning_rate": 2.2398959583833534e-05, + "loss": 0.4781, + "step": 13808 + }, + { + "epoch": 17.67552, + "grad_norm": 1.0278798341751099, + "learning_rate": 2.2396958783513405e-05, + "loss": 0.4927, + "step": 13809 + }, + { + "epoch": 17.6768, + "grad_norm": 1.1051456928253174, + "learning_rate": 2.239495798319328e-05, + "loss": 0.5276, + "step": 13810 + }, + { + "epoch": 17.67808, + "grad_norm": 1.029605507850647, + "learning_rate": 2.239295718287315e-05, + "loss": 0.4806, + "step": 13811 + }, + { + "epoch": 17.67936, + "grad_norm": 1.0395790338516235, + "learning_rate": 2.239095638255302e-05, + "loss": 0.4941, + "step": 13812 + }, + { + "epoch": 17.68064, + "grad_norm": 1.017499566078186, + "learning_rate": 2.2388955582232896e-05, + "loss": 0.5287, + "step": 13813 + }, + { + "epoch": 17.68192, + "grad_norm": 1.0549243688583374, + "learning_rate": 2.2386954781912768e-05, + "loss": 0.5473, + "step": 13814 + }, + { + "epoch": 17.6832, + "grad_norm": 0.9981441497802734, + "learning_rate": 2.2384953981592637e-05, + "loss": 0.5042, + "step": 13815 + }, + { + "epoch": 17.68448, + "grad_norm": 1.0279474258422852, + "learning_rate": 2.238295318127251e-05, + "loss": 0.5113, + "step": 13816 + }, + { + "epoch": 17.68576, + "grad_norm": 1.023318886756897, + "learning_rate": 2.2380952380952384e-05, + "loss": 0.5169, + "step": 13817 + }, + { + "epoch": 17.68704, + "grad_norm": 1.0410925149917603, + "learning_rate": 2.2378951580632256e-05, + "loss": 0.5097, + "step": 13818 + }, + { + "epoch": 17.68832, + "grad_norm": 1.0423842668533325, + "learning_rate": 2.2376950780312124e-05, + "loss": 0.5164, + "step": 13819 + }, + { + "epoch": 17.6896, + "grad_norm": 1.0338177680969238, + "learning_rate": 2.2374949979991996e-05, + "loss": 0.5043, + "step": 13820 + }, + { + "epoch": 
17.69088, + "grad_norm": 1.0916544198989868, + "learning_rate": 2.237294917967187e-05, + "loss": 0.5311, + "step": 13821 + }, + { + "epoch": 17.69216, + "grad_norm": 1.0973963737487793, + "learning_rate": 2.2370948379351743e-05, + "loss": 0.5184, + "step": 13822 + }, + { + "epoch": 17.69344, + "grad_norm": 1.0347501039505005, + "learning_rate": 2.236894757903161e-05, + "loss": 0.4996, + "step": 13823 + }, + { + "epoch": 17.69472, + "grad_norm": 0.9996992349624634, + "learning_rate": 2.2366946778711487e-05, + "loss": 0.4913, + "step": 13824 + }, + { + "epoch": 17.696, + "grad_norm": 1.081697702407837, + "learning_rate": 2.236494597839136e-05, + "loss": 0.5354, + "step": 13825 + }, + { + "epoch": 17.69728, + "grad_norm": 1.0013635158538818, + "learning_rate": 2.236294517807123e-05, + "loss": 0.5452, + "step": 13826 + }, + { + "epoch": 17.69856, + "grad_norm": 1.0059187412261963, + "learning_rate": 2.23609443777511e-05, + "loss": 0.5179, + "step": 13827 + }, + { + "epoch": 17.699840000000002, + "grad_norm": 1.0418435335159302, + "learning_rate": 2.2358943577430974e-05, + "loss": 0.4985, + "step": 13828 + }, + { + "epoch": 17.70112, + "grad_norm": 1.1040401458740234, + "learning_rate": 2.2356942777110846e-05, + "loss": 0.5672, + "step": 13829 + }, + { + "epoch": 17.7024, + "grad_norm": 1.0777002573013306, + "learning_rate": 2.2354941976790718e-05, + "loss": 0.5346, + "step": 13830 + }, + { + "epoch": 17.70368, + "grad_norm": 1.057003378868103, + "learning_rate": 2.235294117647059e-05, + "loss": 0.529, + "step": 13831 + }, + { + "epoch": 17.70496, + "grad_norm": 1.0604835748672485, + "learning_rate": 2.235094037615046e-05, + "loss": 0.504, + "step": 13832 + }, + { + "epoch": 17.70624, + "grad_norm": 1.0369086265563965, + "learning_rate": 2.2348939575830333e-05, + "loss": 0.512, + "step": 13833 + }, + { + "epoch": 17.70752, + "grad_norm": 1.0409032106399536, + "learning_rate": 2.2346938775510205e-05, + "loss": 0.4881, + "step": 13834 + }, + { + "epoch": 17.7088, + 
"grad_norm": 1.0097156763076782, + "learning_rate": 2.2344937975190077e-05, + "loss": 0.5367, + "step": 13835 + }, + { + "epoch": 17.71008, + "grad_norm": 0.9942651391029358, + "learning_rate": 2.234293717486995e-05, + "loss": 0.4977, + "step": 13836 + }, + { + "epoch": 17.71136, + "grad_norm": 1.0652927160263062, + "learning_rate": 2.234093637454982e-05, + "loss": 0.5629, + "step": 13837 + }, + { + "epoch": 17.71264, + "grad_norm": 1.0204808712005615, + "learning_rate": 2.2338935574229693e-05, + "loss": 0.4821, + "step": 13838 + }, + { + "epoch": 17.71392, + "grad_norm": 1.051592230796814, + "learning_rate": 2.2336934773909565e-05, + "loss": 0.4978, + "step": 13839 + }, + { + "epoch": 17.7152, + "grad_norm": 1.0325385332107544, + "learning_rate": 2.2334933973589436e-05, + "loss": 0.5587, + "step": 13840 + }, + { + "epoch": 17.71648, + "grad_norm": 0.9920241236686707, + "learning_rate": 2.2332933173269308e-05, + "loss": 0.5126, + "step": 13841 + }, + { + "epoch": 17.71776, + "grad_norm": 0.9792672395706177, + "learning_rate": 2.233093237294918e-05, + "loss": 0.4736, + "step": 13842 + }, + { + "epoch": 17.71904, + "grad_norm": 1.0789382457733154, + "learning_rate": 2.2328931572629052e-05, + "loss": 0.5335, + "step": 13843 + }, + { + "epoch": 17.72032, + "grad_norm": 1.0387771129608154, + "learning_rate": 2.2326930772308924e-05, + "loss": 0.5135, + "step": 13844 + }, + { + "epoch": 17.7216, + "grad_norm": 1.0767048597335815, + "learning_rate": 2.23249299719888e-05, + "loss": 0.5385, + "step": 13845 + }, + { + "epoch": 17.72288, + "grad_norm": 1.025465965270996, + "learning_rate": 2.2322929171668668e-05, + "loss": 0.4867, + "step": 13846 + }, + { + "epoch": 17.72416, + "grad_norm": 1.0004117488861084, + "learning_rate": 2.232092837134854e-05, + "loss": 0.5034, + "step": 13847 + }, + { + "epoch": 17.72544, + "grad_norm": 1.0588462352752686, + "learning_rate": 2.231892757102841e-05, + "loss": 0.5126, + "step": 13848 + }, + { + "epoch": 17.72672, + "grad_norm": 
1.0342752933502197, + "learning_rate": 2.2316926770708286e-05, + "loss": 0.5027, + "step": 13849 + }, + { + "epoch": 17.728, + "grad_norm": 1.0341858863830566, + "learning_rate": 2.2314925970388155e-05, + "loss": 0.512, + "step": 13850 + }, + { + "epoch": 17.72928, + "grad_norm": 0.9943906664848328, + "learning_rate": 2.2312925170068027e-05, + "loss": 0.4869, + "step": 13851 + }, + { + "epoch": 17.73056, + "grad_norm": 1.0998703241348267, + "learning_rate": 2.2310924369747902e-05, + "loss": 0.5629, + "step": 13852 + }, + { + "epoch": 17.73184, + "grad_norm": 1.0420924425125122, + "learning_rate": 2.2308923569427774e-05, + "loss": 0.505, + "step": 13853 + }, + { + "epoch": 17.73312, + "grad_norm": 1.0193160772323608, + "learning_rate": 2.2306922769107642e-05, + "loss": 0.5082, + "step": 13854 + }, + { + "epoch": 17.7344, + "grad_norm": 1.0729001760482788, + "learning_rate": 2.2304921968787514e-05, + "loss": 0.4944, + "step": 13855 + }, + { + "epoch": 17.73568, + "grad_norm": 0.9842273592948914, + "learning_rate": 2.230292116846739e-05, + "loss": 0.4526, + "step": 13856 + }, + { + "epoch": 17.73696, + "grad_norm": 1.1139209270477295, + "learning_rate": 2.230092036814726e-05, + "loss": 0.5272, + "step": 13857 + }, + { + "epoch": 17.73824, + "grad_norm": 1.0628962516784668, + "learning_rate": 2.229891956782713e-05, + "loss": 0.5155, + "step": 13858 + }, + { + "epoch": 17.73952, + "grad_norm": 1.0056374073028564, + "learning_rate": 2.2296918767507005e-05, + "loss": 0.4806, + "step": 13859 + }, + { + "epoch": 17.7408, + "grad_norm": 1.0062882900238037, + "learning_rate": 2.2294917967186877e-05, + "loss": 0.5315, + "step": 13860 + }, + { + "epoch": 17.74208, + "grad_norm": 1.046491265296936, + "learning_rate": 2.229291716686675e-05, + "loss": 0.5553, + "step": 13861 + }, + { + "epoch": 17.74336, + "grad_norm": 1.0050787925720215, + "learning_rate": 2.2290916366546617e-05, + "loss": 0.5059, + "step": 13862 + }, + { + "epoch": 17.74464, + "grad_norm": 1.0405722856521606, + 
"learning_rate": 2.2288915566226492e-05, + "loss": 0.5053, + "step": 13863 + }, + { + "epoch": 17.74592, + "grad_norm": 1.0229580402374268, + "learning_rate": 2.2286914765906364e-05, + "loss": 0.5597, + "step": 13864 + }, + { + "epoch": 17.7472, + "grad_norm": 1.025348424911499, + "learning_rate": 2.2284913965586236e-05, + "loss": 0.5118, + "step": 13865 + }, + { + "epoch": 17.74848, + "grad_norm": 1.0150192975997925, + "learning_rate": 2.2282913165266108e-05, + "loss": 0.4987, + "step": 13866 + }, + { + "epoch": 17.74976, + "grad_norm": 1.0555064678192139, + "learning_rate": 2.228091236494598e-05, + "loss": 0.5402, + "step": 13867 + }, + { + "epoch": 17.75104, + "grad_norm": 1.0361968278884888, + "learning_rate": 2.2278911564625852e-05, + "loss": 0.5342, + "step": 13868 + }, + { + "epoch": 17.75232, + "grad_norm": 1.0574556589126587, + "learning_rate": 2.2276910764305724e-05, + "loss": 0.5591, + "step": 13869 + }, + { + "epoch": 17.7536, + "grad_norm": 1.021784782409668, + "learning_rate": 2.2274909963985595e-05, + "loss": 0.538, + "step": 13870 + }, + { + "epoch": 17.75488, + "grad_norm": 0.9862391352653503, + "learning_rate": 2.2272909163665467e-05, + "loss": 0.4779, + "step": 13871 + }, + { + "epoch": 17.75616, + "grad_norm": 1.0507396459579468, + "learning_rate": 2.227090836334534e-05, + "loss": 0.4927, + "step": 13872 + }, + { + "epoch": 17.75744, + "grad_norm": 0.9640161991119385, + "learning_rate": 2.226890756302521e-05, + "loss": 0.4648, + "step": 13873 + }, + { + "epoch": 17.75872, + "grad_norm": 1.0764763355255127, + "learning_rate": 2.2266906762705083e-05, + "loss": 0.5556, + "step": 13874 + }, + { + "epoch": 17.76, + "grad_norm": 1.0238678455352783, + "learning_rate": 2.2264905962384955e-05, + "loss": 0.5237, + "step": 13875 + }, + { + "epoch": 17.76128, + "grad_norm": 1.0710828304290771, + "learning_rate": 2.2262905162064827e-05, + "loss": 0.579, + "step": 13876 + }, + { + "epoch": 17.76256, + "grad_norm": 1.02402663230896, + "learning_rate": 
2.22609043617447e-05, + "loss": 0.5118, + "step": 13877 + }, + { + "epoch": 17.76384, + "grad_norm": 1.037920355796814, + "learning_rate": 2.225890356142457e-05, + "loss": 0.5287, + "step": 13878 + }, + { + "epoch": 17.76512, + "grad_norm": 0.9906246066093445, + "learning_rate": 2.2256902761104442e-05, + "loss": 0.5074, + "step": 13879 + }, + { + "epoch": 17.7664, + "grad_norm": 1.0306636095046997, + "learning_rate": 2.2254901960784314e-05, + "loss": 0.4908, + "step": 13880 + }, + { + "epoch": 17.76768, + "grad_norm": 1.0256168842315674, + "learning_rate": 2.2252901160464186e-05, + "loss": 0.4749, + "step": 13881 + }, + { + "epoch": 17.76896, + "grad_norm": 1.0150201320648193, + "learning_rate": 2.2250900360144058e-05, + "loss": 0.5489, + "step": 13882 + }, + { + "epoch": 17.77024, + "grad_norm": 1.1066970825195312, + "learning_rate": 2.224889955982393e-05, + "loss": 0.583, + "step": 13883 + }, + { + "epoch": 17.77152, + "grad_norm": 0.9935269355773926, + "learning_rate": 2.2246898759503805e-05, + "loss": 0.471, + "step": 13884 + }, + { + "epoch": 17.7728, + "grad_norm": 1.0902072191238403, + "learning_rate": 2.2244897959183673e-05, + "loss": 0.5516, + "step": 13885 + }, + { + "epoch": 17.77408, + "grad_norm": 1.0285429954528809, + "learning_rate": 2.2242897158863545e-05, + "loss": 0.5101, + "step": 13886 + }, + { + "epoch": 17.77536, + "grad_norm": 0.9977599382400513, + "learning_rate": 2.2240896358543417e-05, + "loss": 0.5045, + "step": 13887 + }, + { + "epoch": 17.77664, + "grad_norm": 1.0018473863601685, + "learning_rate": 2.2238895558223292e-05, + "loss": 0.5293, + "step": 13888 + }, + { + "epoch": 17.77792, + "grad_norm": 1.027341604232788, + "learning_rate": 2.223689475790316e-05, + "loss": 0.5137, + "step": 13889 + }, + { + "epoch": 17.7792, + "grad_norm": 1.0242568254470825, + "learning_rate": 2.2234893957583033e-05, + "loss": 0.5404, + "step": 13890 + }, + { + "epoch": 17.78048, + "grad_norm": 0.9905862212181091, + "learning_rate": 2.2232893157262908e-05, 
+ "loss": 0.5376, + "step": 13891 + }, + { + "epoch": 17.78176, + "grad_norm": 0.919065535068512, + "learning_rate": 2.223089235694278e-05, + "loss": 0.4514, + "step": 13892 + }, + { + "epoch": 17.78304, + "grad_norm": 0.9959515333175659, + "learning_rate": 2.2228891556622648e-05, + "loss": 0.4923, + "step": 13893 + }, + { + "epoch": 17.78432, + "grad_norm": 1.0490683317184448, + "learning_rate": 2.222689075630252e-05, + "loss": 0.4993, + "step": 13894 + }, + { + "epoch": 17.7856, + "grad_norm": 1.04204261302948, + "learning_rate": 2.2224889955982395e-05, + "loss": 0.523, + "step": 13895 + }, + { + "epoch": 17.78688, + "grad_norm": 1.001023530960083, + "learning_rate": 2.2222889155662267e-05, + "loss": 0.524, + "step": 13896 + }, + { + "epoch": 17.78816, + "grad_norm": 1.0491348505020142, + "learning_rate": 2.2220888355342136e-05, + "loss": 0.4963, + "step": 13897 + }, + { + "epoch": 17.78944, + "grad_norm": 1.046494960784912, + "learning_rate": 2.221888755502201e-05, + "loss": 0.5521, + "step": 13898 + }, + { + "epoch": 17.79072, + "grad_norm": 1.0331013202667236, + "learning_rate": 2.2216886754701883e-05, + "loss": 0.5306, + "step": 13899 + }, + { + "epoch": 17.792, + "grad_norm": 0.9884205460548401, + "learning_rate": 2.2214885954381755e-05, + "loss": 0.469, + "step": 13900 + }, + { + "epoch": 17.79328, + "grad_norm": 1.1054868698120117, + "learning_rate": 2.2212885154061623e-05, + "loss": 0.4882, + "step": 13901 + }, + { + "epoch": 17.79456, + "grad_norm": 1.0739738941192627, + "learning_rate": 2.2210884353741498e-05, + "loss": 0.5627, + "step": 13902 + }, + { + "epoch": 17.79584, + "grad_norm": 0.9757269024848938, + "learning_rate": 2.220888355342137e-05, + "loss": 0.474, + "step": 13903 + }, + { + "epoch": 17.79712, + "grad_norm": 0.9898340702056885, + "learning_rate": 2.2206882753101242e-05, + "loss": 0.4801, + "step": 13904 + }, + { + "epoch": 17.7984, + "grad_norm": 0.9995481967926025, + "learning_rate": 2.2204881952781114e-05, + "loss": 0.468, + "step": 
13905 + }, + { + "epoch": 17.79968, + "grad_norm": 1.0464537143707275, + "learning_rate": 2.2202881152460986e-05, + "loss": 0.4884, + "step": 13906 + }, + { + "epoch": 17.80096, + "grad_norm": 1.0073833465576172, + "learning_rate": 2.2200880352140858e-05, + "loss": 0.5333, + "step": 13907 + }, + { + "epoch": 17.80224, + "grad_norm": 1.025449514389038, + "learning_rate": 2.219887955182073e-05, + "loss": 0.5314, + "step": 13908 + }, + { + "epoch": 17.80352, + "grad_norm": 0.9930814504623413, + "learning_rate": 2.21968787515006e-05, + "loss": 0.5318, + "step": 13909 + }, + { + "epoch": 17.8048, + "grad_norm": 0.9848999977111816, + "learning_rate": 2.2194877951180473e-05, + "loss": 0.4798, + "step": 13910 + }, + { + "epoch": 17.80608, + "grad_norm": 1.0277302265167236, + "learning_rate": 2.2192877150860345e-05, + "loss": 0.4967, + "step": 13911 + }, + { + "epoch": 17.80736, + "grad_norm": 1.0796613693237305, + "learning_rate": 2.2190876350540217e-05, + "loss": 0.5304, + "step": 13912 + }, + { + "epoch": 17.80864, + "grad_norm": 1.055225133895874, + "learning_rate": 2.218887555022009e-05, + "loss": 0.5225, + "step": 13913 + }, + { + "epoch": 17.809919999999998, + "grad_norm": 1.039272665977478, + "learning_rate": 2.218687474989996e-05, + "loss": 0.4892, + "step": 13914 + }, + { + "epoch": 17.8112, + "grad_norm": 1.0163494348526, + "learning_rate": 2.2184873949579832e-05, + "loss": 0.4807, + "step": 13915 + }, + { + "epoch": 17.81248, + "grad_norm": 1.0309205055236816, + "learning_rate": 2.2182873149259704e-05, + "loss": 0.534, + "step": 13916 + }, + { + "epoch": 17.81376, + "grad_norm": 0.9438468813896179, + "learning_rate": 2.2180872348939576e-05, + "loss": 0.4871, + "step": 13917 + }, + { + "epoch": 17.81504, + "grad_norm": 1.0279080867767334, + "learning_rate": 2.2178871548619448e-05, + "loss": 0.474, + "step": 13918 + }, + { + "epoch": 17.81632, + "grad_norm": 1.0761826038360596, + "learning_rate": 2.2176870748299323e-05, + "loss": 0.53, + "step": 13919 + }, + { + 
"epoch": 17.8176, + "grad_norm": 1.025966763496399, + "learning_rate": 2.217486994797919e-05, + "loss": 0.505, + "step": 13920 + }, + { + "epoch": 17.81888, + "grad_norm": 1.0639162063598633, + "learning_rate": 2.2172869147659064e-05, + "loss": 0.5237, + "step": 13921 + }, + { + "epoch": 17.82016, + "grad_norm": 1.0175074338912964, + "learning_rate": 2.2170868347338935e-05, + "loss": 0.5159, + "step": 13922 + }, + { + "epoch": 17.82144, + "grad_norm": 1.073966145515442, + "learning_rate": 2.216886754701881e-05, + "loss": 0.5687, + "step": 13923 + }, + { + "epoch": 17.82272, + "grad_norm": 1.0590076446533203, + "learning_rate": 2.216686674669868e-05, + "loss": 0.4935, + "step": 13924 + }, + { + "epoch": 17.824, + "grad_norm": 0.9631067514419556, + "learning_rate": 2.216486594637855e-05, + "loss": 0.4657, + "step": 13925 + }, + { + "epoch": 17.82528, + "grad_norm": 1.0229510068893433, + "learning_rate": 2.2162865146058426e-05, + "loss": 0.4885, + "step": 13926 + }, + { + "epoch": 17.82656, + "grad_norm": 1.0246883630752563, + "learning_rate": 2.2160864345738298e-05, + "loss": 0.5242, + "step": 13927 + }, + { + "epoch": 17.82784, + "grad_norm": 1.0278314352035522, + "learning_rate": 2.2158863545418167e-05, + "loss": 0.5112, + "step": 13928 + }, + { + "epoch": 17.82912, + "grad_norm": 1.0408381223678589, + "learning_rate": 2.215686274509804e-05, + "loss": 0.5122, + "step": 13929 + }, + { + "epoch": 17.8304, + "grad_norm": 0.9866704940795898, + "learning_rate": 2.2154861944777914e-05, + "loss": 0.5143, + "step": 13930 + }, + { + "epoch": 17.83168, + "grad_norm": 1.005631446838379, + "learning_rate": 2.2152861144457786e-05, + "loss": 0.4981, + "step": 13931 + }, + { + "epoch": 17.83296, + "grad_norm": 0.9910889267921448, + "learning_rate": 2.2150860344137654e-05, + "loss": 0.4984, + "step": 13932 + }, + { + "epoch": 17.83424, + "grad_norm": 1.0972979068756104, + "learning_rate": 2.2148859543817526e-05, + "loss": 0.5075, + "step": 13933 + }, + { + "epoch": 17.83552, + 
"grad_norm": 1.0541285276412964, + "learning_rate": 2.21468587434974e-05, + "loss": 0.484, + "step": 13934 + }, + { + "epoch": 17.8368, + "grad_norm": 1.0120394229888916, + "learning_rate": 2.2144857943177273e-05, + "loss": 0.4929, + "step": 13935 + }, + { + "epoch": 17.83808, + "grad_norm": 1.0291119813919067, + "learning_rate": 2.214285714285714e-05, + "loss": 0.5132, + "step": 13936 + }, + { + "epoch": 17.83936, + "grad_norm": 1.034484624862671, + "learning_rate": 2.2140856342537017e-05, + "loss": 0.4899, + "step": 13937 + }, + { + "epoch": 17.84064, + "grad_norm": 1.0646271705627441, + "learning_rate": 2.213885554221689e-05, + "loss": 0.4747, + "step": 13938 + }, + { + "epoch": 17.841920000000002, + "grad_norm": 1.0118857622146606, + "learning_rate": 2.213685474189676e-05, + "loss": 0.4824, + "step": 13939 + }, + { + "epoch": 17.8432, + "grad_norm": 1.0535541772842407, + "learning_rate": 2.213485394157663e-05, + "loss": 0.4842, + "step": 13940 + }, + { + "epoch": 17.84448, + "grad_norm": 1.028926968574524, + "learning_rate": 2.2132853141256504e-05, + "loss": 0.5243, + "step": 13941 + }, + { + "epoch": 17.84576, + "grad_norm": 1.0521934032440186, + "learning_rate": 2.2130852340936376e-05, + "loss": 0.5343, + "step": 13942 + }, + { + "epoch": 17.84704, + "grad_norm": 1.0421346426010132, + "learning_rate": 2.2128851540616248e-05, + "loss": 0.4903, + "step": 13943 + }, + { + "epoch": 17.84832, + "grad_norm": 1.0207431316375732, + "learning_rate": 2.212685074029612e-05, + "loss": 0.509, + "step": 13944 + }, + { + "epoch": 17.8496, + "grad_norm": 1.0445460081100464, + "learning_rate": 2.212484993997599e-05, + "loss": 0.5096, + "step": 13945 + }, + { + "epoch": 17.85088, + "grad_norm": 1.0498018264770508, + "learning_rate": 2.2122849139655863e-05, + "loss": 0.4767, + "step": 13946 + }, + { + "epoch": 17.85216, + "grad_norm": 1.0488752126693726, + "learning_rate": 2.2120848339335735e-05, + "loss": 0.5048, + "step": 13947 + }, + { + "epoch": 17.85344, + "grad_norm": 
0.981802225112915, + "learning_rate": 2.2118847539015607e-05, + "loss": 0.5113, + "step": 13948 + }, + { + "epoch": 17.85472, + "grad_norm": 0.988862156867981, + "learning_rate": 2.211684673869548e-05, + "loss": 0.4887, + "step": 13949 + }, + { + "epoch": 17.856, + "grad_norm": 1.0477064847946167, + "learning_rate": 2.211484593837535e-05, + "loss": 0.5259, + "step": 13950 + }, + { + "epoch": 17.85728, + "grad_norm": 1.0353902578353882, + "learning_rate": 2.2112845138055223e-05, + "loss": 0.5022, + "step": 13951 + }, + { + "epoch": 17.85856, + "grad_norm": 1.0380783081054688, + "learning_rate": 2.2110844337735094e-05, + "loss": 0.5197, + "step": 13952 + }, + { + "epoch": 17.85984, + "grad_norm": 0.9977851510047913, + "learning_rate": 2.2108843537414966e-05, + "loss": 0.4844, + "step": 13953 + }, + { + "epoch": 17.86112, + "grad_norm": 1.031292200088501, + "learning_rate": 2.2106842737094838e-05, + "loss": 0.5341, + "step": 13954 + }, + { + "epoch": 17.8624, + "grad_norm": 1.0284069776535034, + "learning_rate": 2.210484193677471e-05, + "loss": 0.4888, + "step": 13955 + }, + { + "epoch": 17.86368, + "grad_norm": 1.0368765592575073, + "learning_rate": 2.2102841136454582e-05, + "loss": 0.5599, + "step": 13956 + }, + { + "epoch": 17.86496, + "grad_norm": 0.9683583378791809, + "learning_rate": 2.2100840336134454e-05, + "loss": 0.5076, + "step": 13957 + }, + { + "epoch": 17.86624, + "grad_norm": 1.0280176401138306, + "learning_rate": 2.209883953581433e-05, + "loss": 0.5281, + "step": 13958 + }, + { + "epoch": 17.86752, + "grad_norm": 1.078447937965393, + "learning_rate": 2.2096838735494197e-05, + "loss": 0.5285, + "step": 13959 + }, + { + "epoch": 17.8688, + "grad_norm": 1.0445505380630493, + "learning_rate": 2.209483793517407e-05, + "loss": 0.5509, + "step": 13960 + }, + { + "epoch": 17.87008, + "grad_norm": 1.0536412000656128, + "learning_rate": 2.209283713485394e-05, + "loss": 0.5003, + "step": 13961 + }, + { + "epoch": 17.87136, + "grad_norm": 0.9787836074829102, + 
"learning_rate": 2.2090836334533816e-05, + "loss": 0.4461, + "step": 13962 + }, + { + "epoch": 17.87264, + "grad_norm": 1.0095018148422241, + "learning_rate": 2.2088835534213685e-05, + "loss": 0.4955, + "step": 13963 + }, + { + "epoch": 17.87392, + "grad_norm": 1.0202372074127197, + "learning_rate": 2.2086834733893557e-05, + "loss": 0.5101, + "step": 13964 + }, + { + "epoch": 17.8752, + "grad_norm": 1.010644555091858, + "learning_rate": 2.2084833933573432e-05, + "loss": 0.508, + "step": 13965 + }, + { + "epoch": 17.87648, + "grad_norm": 1.0773608684539795, + "learning_rate": 2.2082833133253304e-05, + "loss": 0.4861, + "step": 13966 + }, + { + "epoch": 17.87776, + "grad_norm": 1.0331664085388184, + "learning_rate": 2.2080832332933172e-05, + "loss": 0.5204, + "step": 13967 + }, + { + "epoch": 17.87904, + "grad_norm": 1.081826090812683, + "learning_rate": 2.2078831532613044e-05, + "loss": 0.5497, + "step": 13968 + }, + { + "epoch": 17.88032, + "grad_norm": 1.0755976438522339, + "learning_rate": 2.207683073229292e-05, + "loss": 0.5408, + "step": 13969 + }, + { + "epoch": 17.8816, + "grad_norm": 1.0068060159683228, + "learning_rate": 2.207482993197279e-05, + "loss": 0.5003, + "step": 13970 + }, + { + "epoch": 17.88288, + "grad_norm": 0.9964351058006287, + "learning_rate": 2.207282913165266e-05, + "loss": 0.4895, + "step": 13971 + }, + { + "epoch": 17.88416, + "grad_norm": 1.0196988582611084, + "learning_rate": 2.2070828331332535e-05, + "loss": 0.5039, + "step": 13972 + }, + { + "epoch": 17.88544, + "grad_norm": 1.0305639505386353, + "learning_rate": 2.2068827531012407e-05, + "loss": 0.5213, + "step": 13973 + }, + { + "epoch": 17.88672, + "grad_norm": 1.0133947134017944, + "learning_rate": 2.206682673069228e-05, + "loss": 0.5076, + "step": 13974 + }, + { + "epoch": 17.888, + "grad_norm": 1.0059137344360352, + "learning_rate": 2.2064825930372147e-05, + "loss": 0.5173, + "step": 13975 + }, + { + "epoch": 17.88928, + "grad_norm": 1.0029923915863037, + "learning_rate": 
2.2062825130052022e-05, + "loss": 0.5354, + "step": 13976 + }, + { + "epoch": 17.89056, + "grad_norm": 0.9647006392478943, + "learning_rate": 2.2060824329731894e-05, + "loss": 0.5142, + "step": 13977 + }, + { + "epoch": 17.89184, + "grad_norm": 1.0146187543869019, + "learning_rate": 2.2058823529411766e-05, + "loss": 0.5561, + "step": 13978 + }, + { + "epoch": 17.89312, + "grad_norm": 1.0384979248046875, + "learning_rate": 2.2056822729091638e-05, + "loss": 0.5139, + "step": 13979 + }, + { + "epoch": 17.8944, + "grad_norm": 0.9908726215362549, + "learning_rate": 2.205482192877151e-05, + "loss": 0.5435, + "step": 13980 + }, + { + "epoch": 17.89568, + "grad_norm": 1.0110816955566406, + "learning_rate": 2.2052821128451382e-05, + "loss": 0.5114, + "step": 13981 + }, + { + "epoch": 17.89696, + "grad_norm": 1.0027389526367188, + "learning_rate": 2.2050820328131254e-05, + "loss": 0.5125, + "step": 13982 + }, + { + "epoch": 17.89824, + "grad_norm": 0.9981145262718201, + "learning_rate": 2.2048819527811125e-05, + "loss": 0.5052, + "step": 13983 + }, + { + "epoch": 17.89952, + "grad_norm": 1.0781457424163818, + "learning_rate": 2.2046818727490997e-05, + "loss": 0.5885, + "step": 13984 + }, + { + "epoch": 17.9008, + "grad_norm": 1.0383636951446533, + "learning_rate": 2.204481792717087e-05, + "loss": 0.5163, + "step": 13985 + }, + { + "epoch": 17.90208, + "grad_norm": 1.020270824432373, + "learning_rate": 2.2042817126850744e-05, + "loss": 0.5207, + "step": 13986 + }, + { + "epoch": 17.90336, + "grad_norm": 1.0009729862213135, + "learning_rate": 2.2040816326530613e-05, + "loss": 0.6061, + "step": 13987 + }, + { + "epoch": 17.90464, + "grad_norm": 1.0783268213272095, + "learning_rate": 2.2038815526210485e-05, + "loss": 0.521, + "step": 13988 + }, + { + "epoch": 17.90592, + "grad_norm": 1.0087323188781738, + "learning_rate": 2.2036814725890357e-05, + "loss": 0.4846, + "step": 13989 + }, + { + "epoch": 17.9072, + "grad_norm": 1.005253553390503, + "learning_rate": 
2.2034813925570232e-05, + "loss": 0.5024, + "step": 13990 + }, + { + "epoch": 17.90848, + "grad_norm": 1.0598344802856445, + "learning_rate": 2.20328131252501e-05, + "loss": 0.5176, + "step": 13991 + }, + { + "epoch": 17.90976, + "grad_norm": 1.010216474533081, + "learning_rate": 2.2030812324929972e-05, + "loss": 0.4949, + "step": 13992 + }, + { + "epoch": 17.91104, + "grad_norm": 1.0626128911972046, + "learning_rate": 2.2028811524609844e-05, + "loss": 0.5698, + "step": 13993 + }, + { + "epoch": 17.91232, + "grad_norm": 1.0089972019195557, + "learning_rate": 2.202681072428972e-05, + "loss": 0.4999, + "step": 13994 + }, + { + "epoch": 17.9136, + "grad_norm": 1.0127496719360352, + "learning_rate": 2.2024809923969588e-05, + "loss": 0.5169, + "step": 13995 + }, + { + "epoch": 17.91488, + "grad_norm": 1.058738350868225, + "learning_rate": 2.202280912364946e-05, + "loss": 0.4963, + "step": 13996 + }, + { + "epoch": 17.91616, + "grad_norm": 1.0318009853363037, + "learning_rate": 2.2020808323329335e-05, + "loss": 0.517, + "step": 13997 + }, + { + "epoch": 17.91744, + "grad_norm": 1.0412254333496094, + "learning_rate": 2.2018807523009207e-05, + "loss": 0.5259, + "step": 13998 + }, + { + "epoch": 17.91872, + "grad_norm": 1.061491847038269, + "learning_rate": 2.2016806722689075e-05, + "loss": 0.5451, + "step": 13999 + }, + { + "epoch": 17.92, + "grad_norm": 1.0782477855682373, + "learning_rate": 2.2014805922368947e-05, + "loss": 0.5462, + "step": 14000 + }, + { + "epoch": 17.92128, + "grad_norm": 1.0478885173797607, + "learning_rate": 2.2012805122048822e-05, + "loss": 0.5489, + "step": 14001 + }, + { + "epoch": 17.92256, + "grad_norm": 1.0254915952682495, + "learning_rate": 2.2010804321728694e-05, + "loss": 0.5149, + "step": 14002 + }, + { + "epoch": 17.92384, + "grad_norm": 1.0770113468170166, + "learning_rate": 2.2008803521408563e-05, + "loss": 0.5494, + "step": 14003 + }, + { + "epoch": 17.92512, + "grad_norm": 1.0335642099380493, + "learning_rate": 2.2006802721088438e-05, 
+ "loss": 0.4951, + "step": 14004 + }, + { + "epoch": 17.9264, + "grad_norm": 1.0393280982971191, + "learning_rate": 2.200480192076831e-05, + "loss": 0.5505, + "step": 14005 + }, + { + "epoch": 17.92768, + "grad_norm": 0.9929527044296265, + "learning_rate": 2.200280112044818e-05, + "loss": 0.4796, + "step": 14006 + }, + { + "epoch": 17.92896, + "grad_norm": 0.9717233777046204, + "learning_rate": 2.200080032012805e-05, + "loss": 0.4864, + "step": 14007 + }, + { + "epoch": 17.93024, + "grad_norm": 1.0142488479614258, + "learning_rate": 2.1998799519807925e-05, + "loss": 0.5637, + "step": 14008 + }, + { + "epoch": 17.93152, + "grad_norm": 1.0879509449005127, + "learning_rate": 2.1996798719487797e-05, + "loss": 0.5425, + "step": 14009 + }, + { + "epoch": 17.9328, + "grad_norm": 1.0561232566833496, + "learning_rate": 2.199479791916767e-05, + "loss": 0.4915, + "step": 14010 + }, + { + "epoch": 17.93408, + "grad_norm": 1.0183889865875244, + "learning_rate": 2.199279711884754e-05, + "loss": 0.4996, + "step": 14011 + }, + { + "epoch": 17.93536, + "grad_norm": 0.9836266040802002, + "learning_rate": 2.1990796318527413e-05, + "loss": 0.4174, + "step": 14012 + }, + { + "epoch": 17.93664, + "grad_norm": 1.0205684900283813, + "learning_rate": 2.1988795518207285e-05, + "loss": 0.5009, + "step": 14013 + }, + { + "epoch": 17.93792, + "grad_norm": 1.0839546918869019, + "learning_rate": 2.1986794717887156e-05, + "loss": 0.5532, + "step": 14014 + }, + { + "epoch": 17.9392, + "grad_norm": 0.9612104296684265, + "learning_rate": 2.1984793917567028e-05, + "loss": 0.4711, + "step": 14015 + }, + { + "epoch": 17.94048, + "grad_norm": 1.0516858100891113, + "learning_rate": 2.19827931172469e-05, + "loss": 0.5215, + "step": 14016 + }, + { + "epoch": 17.94176, + "grad_norm": 1.1283824443817139, + "learning_rate": 2.1980792316926772e-05, + "loss": 0.6009, + "step": 14017 + }, + { + "epoch": 17.94304, + "grad_norm": 0.9951545000076294, + "learning_rate": 2.1978791516606644e-05, + "loss": 0.4589, + 
"step": 14018 + }, + { + "epoch": 17.94432, + "grad_norm": 0.9906835556030273, + "learning_rate": 2.1976790716286516e-05, + "loss": 0.5127, + "step": 14019 + }, + { + "epoch": 17.9456, + "grad_norm": 1.0642714500427246, + "learning_rate": 2.1974789915966388e-05, + "loss": 0.5101, + "step": 14020 + }, + { + "epoch": 17.94688, + "grad_norm": 1.098826289176941, + "learning_rate": 2.197278911564626e-05, + "loss": 0.5939, + "step": 14021 + }, + { + "epoch": 17.94816, + "grad_norm": 1.0480844974517822, + "learning_rate": 2.197078831532613e-05, + "loss": 0.5587, + "step": 14022 + }, + { + "epoch": 17.94944, + "grad_norm": 1.0224511623382568, + "learning_rate": 2.1968787515006003e-05, + "loss": 0.4883, + "step": 14023 + }, + { + "epoch": 17.95072, + "grad_norm": 1.0356502532958984, + "learning_rate": 2.1966786714685875e-05, + "loss": 0.5223, + "step": 14024 + }, + { + "epoch": 17.951999999999998, + "grad_norm": 1.0777286291122437, + "learning_rate": 2.196478591436575e-05, + "loss": 0.5237, + "step": 14025 + }, + { + "epoch": 17.95328, + "grad_norm": 1.1121007204055786, + "learning_rate": 2.196278511404562e-05, + "loss": 0.5715, + "step": 14026 + }, + { + "epoch": 17.95456, + "grad_norm": 1.068606972694397, + "learning_rate": 2.196078431372549e-05, + "loss": 0.5258, + "step": 14027 + }, + { + "epoch": 17.95584, + "grad_norm": 0.943596363067627, + "learning_rate": 2.1958783513405362e-05, + "loss": 0.4419, + "step": 14028 + }, + { + "epoch": 17.95712, + "grad_norm": 0.9647386074066162, + "learning_rate": 2.1956782713085238e-05, + "loss": 0.5067, + "step": 14029 + }, + { + "epoch": 17.9584, + "grad_norm": 0.9991288781166077, + "learning_rate": 2.1954781912765106e-05, + "loss": 0.4659, + "step": 14030 + }, + { + "epoch": 17.95968, + "grad_norm": 1.0615739822387695, + "learning_rate": 2.1952781112444978e-05, + "loss": 0.5157, + "step": 14031 + }, + { + "epoch": 17.96096, + "grad_norm": 1.0784728527069092, + "learning_rate": 2.1950780312124853e-05, + "loss": 0.5038, + "step": 
14032 + }, + { + "epoch": 17.96224, + "grad_norm": 1.0348066091537476, + "learning_rate": 2.1948779511804725e-05, + "loss": 0.5264, + "step": 14033 + }, + { + "epoch": 17.96352, + "grad_norm": 0.9429087042808533, + "learning_rate": 2.1946778711484594e-05, + "loss": 0.4768, + "step": 14034 + }, + { + "epoch": 17.9648, + "grad_norm": 1.0070911645889282, + "learning_rate": 2.1944777911164465e-05, + "loss": 0.491, + "step": 14035 + }, + { + "epoch": 17.96608, + "grad_norm": 1.0489453077316284, + "learning_rate": 2.194277711084434e-05, + "loss": 0.5086, + "step": 14036 + }, + { + "epoch": 17.96736, + "grad_norm": 0.9969155788421631, + "learning_rate": 2.1940776310524212e-05, + "loss": 0.5171, + "step": 14037 + }, + { + "epoch": 17.96864, + "grad_norm": 1.0200451612472534, + "learning_rate": 2.193877551020408e-05, + "loss": 0.4763, + "step": 14038 + }, + { + "epoch": 17.96992, + "grad_norm": 1.1074117422103882, + "learning_rate": 2.1936774709883956e-05, + "loss": 0.5653, + "step": 14039 + }, + { + "epoch": 17.9712, + "grad_norm": 0.9458688497543335, + "learning_rate": 2.1934773909563828e-05, + "loss": 0.4885, + "step": 14040 + }, + { + "epoch": 17.97248, + "grad_norm": 0.9838624000549316, + "learning_rate": 2.19327731092437e-05, + "loss": 0.4665, + "step": 14041 + }, + { + "epoch": 17.97376, + "grad_norm": 0.9943659901618958, + "learning_rate": 2.193077230892357e-05, + "loss": 0.5485, + "step": 14042 + }, + { + "epoch": 17.97504, + "grad_norm": 1.0677356719970703, + "learning_rate": 2.1928771508603444e-05, + "loss": 0.5672, + "step": 14043 + }, + { + "epoch": 17.97632, + "grad_norm": 0.9993770122528076, + "learning_rate": 2.1926770708283315e-05, + "loss": 0.5041, + "step": 14044 + }, + { + "epoch": 17.9776, + "grad_norm": 1.0094391107559204, + "learning_rate": 2.1924769907963187e-05, + "loss": 0.5, + "step": 14045 + }, + { + "epoch": 17.97888, + "grad_norm": 0.9990932941436768, + "learning_rate": 2.1922769107643056e-05, + "loss": 0.5015, + "step": 14046 + }, + { + 
"epoch": 17.98016, + "grad_norm": 0.9945876002311707, + "learning_rate": 2.192076830732293e-05, + "loss": 0.4928, + "step": 14047 + }, + { + "epoch": 17.98144, + "grad_norm": 1.1232936382293701, + "learning_rate": 2.1918767507002803e-05, + "loss": 0.5627, + "step": 14048 + }, + { + "epoch": 17.98272, + "grad_norm": 1.0336296558380127, + "learning_rate": 2.1916766706682675e-05, + "loss": 0.514, + "step": 14049 + }, + { + "epoch": 17.984, + "grad_norm": 1.111663818359375, + "learning_rate": 2.1914765906362547e-05, + "loss": 0.5868, + "step": 14050 + }, + { + "epoch": 17.98528, + "grad_norm": 1.0357799530029297, + "learning_rate": 2.191276510604242e-05, + "loss": 0.5266, + "step": 14051 + }, + { + "epoch": 17.98656, + "grad_norm": 1.0117226839065552, + "learning_rate": 2.191076430572229e-05, + "loss": 0.5048, + "step": 14052 + }, + { + "epoch": 17.98784, + "grad_norm": 0.9796508550643921, + "learning_rate": 2.1908763505402162e-05, + "loss": 0.5113, + "step": 14053 + }, + { + "epoch": 17.98912, + "grad_norm": 1.028061032295227, + "learning_rate": 2.1906762705082034e-05, + "loss": 0.4864, + "step": 14054 + }, + { + "epoch": 17.9904, + "grad_norm": 1.0755497217178345, + "learning_rate": 2.1904761904761906e-05, + "loss": 0.5686, + "step": 14055 + }, + { + "epoch": 17.99168, + "grad_norm": 1.0548936128616333, + "learning_rate": 2.1902761104441778e-05, + "loss": 0.5476, + "step": 14056 + }, + { + "epoch": 17.99296, + "grad_norm": 1.0421359539031982, + "learning_rate": 2.190076030412165e-05, + "loss": 0.5111, + "step": 14057 + }, + { + "epoch": 17.99424, + "grad_norm": 1.0015569925308228, + "learning_rate": 2.189875950380152e-05, + "loss": 0.5427, + "step": 14058 + }, + { + "epoch": 17.99552, + "grad_norm": 1.0161628723144531, + "learning_rate": 2.1896758703481393e-05, + "loss": 0.5307, + "step": 14059 + }, + { + "epoch": 17.9968, + "grad_norm": 1.0471055507659912, + "learning_rate": 2.1894757903161265e-05, + "loss": 0.5164, + "step": 14060 + }, + { + "epoch": 17.99808, + 
"grad_norm": 1.1203445196151733, + "learning_rate": 2.1892757102841137e-05, + "loss": 0.557, + "step": 14061 + }, + { + "epoch": 17.99936, + "grad_norm": 1.0305187702178955, + "learning_rate": 2.189075630252101e-05, + "loss": 0.499, + "step": 14062 + }, + { + "epoch": 18.00064, + "grad_norm": 2.1932191848754883, + "learning_rate": 2.188875550220088e-05, + "loss": 0.9041, + "step": 14063 + }, + { + "epoch": 18.00192, + "grad_norm": 1.0409256219863892, + "learning_rate": 2.1886754701880756e-05, + "loss": 0.5222, + "step": 14064 + }, + { + "epoch": 18.0032, + "grad_norm": 1.000382900238037, + "learning_rate": 2.1884753901560624e-05, + "loss": 0.5282, + "step": 14065 + }, + { + "epoch": 18.00448, + "grad_norm": 0.9593517780303955, + "learning_rate": 2.1882753101240496e-05, + "loss": 0.4422, + "step": 14066 + }, + { + "epoch": 18.00576, + "grad_norm": 1.0473594665527344, + "learning_rate": 2.1880752300920368e-05, + "loss": 0.5121, + "step": 14067 + }, + { + "epoch": 18.00704, + "grad_norm": 1.0217726230621338, + "learning_rate": 2.1878751500600243e-05, + "loss": 0.4877, + "step": 14068 + }, + { + "epoch": 18.00832, + "grad_norm": 0.9588239789009094, + "learning_rate": 2.1876750700280112e-05, + "loss": 0.4513, + "step": 14069 + }, + { + "epoch": 18.0096, + "grad_norm": 0.9952200651168823, + "learning_rate": 2.1874749899959984e-05, + "loss": 0.4666, + "step": 14070 + }, + { + "epoch": 18.01088, + "grad_norm": 1.0490752458572388, + "learning_rate": 2.187274909963986e-05, + "loss": 0.5474, + "step": 14071 + }, + { + "epoch": 18.01216, + "grad_norm": 1.0104339122772217, + "learning_rate": 2.187074829931973e-05, + "loss": 0.5175, + "step": 14072 + }, + { + "epoch": 18.01344, + "grad_norm": 1.013339877128601, + "learning_rate": 2.18687474989996e-05, + "loss": 0.5109, + "step": 14073 + }, + { + "epoch": 18.01472, + "grad_norm": 1.0241261720657349, + "learning_rate": 2.186674669867947e-05, + "loss": 0.5022, + "step": 14074 + }, + { + "epoch": 18.016, + "grad_norm": 
1.0270612239837646, + "learning_rate": 2.1864745898359346e-05, + "loss": 0.4703, + "step": 14075 + }, + { + "epoch": 18.01728, + "grad_norm": 1.0369747877120972, + "learning_rate": 2.1862745098039218e-05, + "loss": 0.5075, + "step": 14076 + }, + { + "epoch": 18.01856, + "grad_norm": 1.053289771080017, + "learning_rate": 2.1860744297719087e-05, + "loss": 0.5392, + "step": 14077 + }, + { + "epoch": 18.01984, + "grad_norm": 1.0217084884643555, + "learning_rate": 2.1858743497398962e-05, + "loss": 0.4777, + "step": 14078 + }, + { + "epoch": 18.02112, + "grad_norm": 0.9881057143211365, + "learning_rate": 2.1856742697078834e-05, + "loss": 0.4531, + "step": 14079 + }, + { + "epoch": 18.0224, + "grad_norm": 1.0281909704208374, + "learning_rate": 2.1854741896758706e-05, + "loss": 0.528, + "step": 14080 + }, + { + "epoch": 18.02368, + "grad_norm": 0.9666361808776855, + "learning_rate": 2.1852741096438574e-05, + "loss": 0.4326, + "step": 14081 + }, + { + "epoch": 18.02496, + "grad_norm": 1.0549180507659912, + "learning_rate": 2.185074029611845e-05, + "loss": 0.5504, + "step": 14082 + }, + { + "epoch": 18.02624, + "grad_norm": 0.9973421692848206, + "learning_rate": 2.184873949579832e-05, + "loss": 0.4681, + "step": 14083 + }, + { + "epoch": 18.02752, + "grad_norm": 1.0544997453689575, + "learning_rate": 2.1846738695478193e-05, + "loss": 0.4742, + "step": 14084 + }, + { + "epoch": 18.0288, + "grad_norm": 1.0692542791366577, + "learning_rate": 2.1844737895158065e-05, + "loss": 0.5059, + "step": 14085 + }, + { + "epoch": 18.03008, + "grad_norm": 1.024929165840149, + "learning_rate": 2.1842737094837937e-05, + "loss": 0.4643, + "step": 14086 + }, + { + "epoch": 18.03136, + "grad_norm": 1.0285801887512207, + "learning_rate": 2.184073629451781e-05, + "loss": 0.5188, + "step": 14087 + }, + { + "epoch": 18.03264, + "grad_norm": 1.0161205530166626, + "learning_rate": 2.183873549419768e-05, + "loss": 0.485, + "step": 14088 + }, + { + "epoch": 18.03392, + "grad_norm": 1.0766185522079468, + 
"learning_rate": 2.1836734693877552e-05, + "loss": 0.5295, + "step": 14089 + }, + { + "epoch": 18.0352, + "grad_norm": 1.038028359413147, + "learning_rate": 2.1834733893557424e-05, + "loss": 0.4863, + "step": 14090 + }, + { + "epoch": 18.03648, + "grad_norm": 1.0519113540649414, + "learning_rate": 2.1832733093237296e-05, + "loss": 0.5356, + "step": 14091 + }, + { + "epoch": 18.03776, + "grad_norm": 0.9999911189079285, + "learning_rate": 2.1830732292917168e-05, + "loss": 0.455, + "step": 14092 + }, + { + "epoch": 18.03904, + "grad_norm": 1.067815899848938, + "learning_rate": 2.182873149259704e-05, + "loss": 0.4824, + "step": 14093 + }, + { + "epoch": 18.04032, + "grad_norm": 1.0944994688034058, + "learning_rate": 2.1826730692276912e-05, + "loss": 0.468, + "step": 14094 + }, + { + "epoch": 18.0416, + "grad_norm": 1.105892300605774, + "learning_rate": 2.1824729891956784e-05, + "loss": 0.5392, + "step": 14095 + }, + { + "epoch": 18.04288, + "grad_norm": 1.038599967956543, + "learning_rate": 2.1822729091636655e-05, + "loss": 0.5009, + "step": 14096 + }, + { + "epoch": 18.04416, + "grad_norm": 1.0022287368774414, + "learning_rate": 2.1820728291316527e-05, + "loss": 0.473, + "step": 14097 + }, + { + "epoch": 18.04544, + "grad_norm": 1.0601282119750977, + "learning_rate": 2.18187274909964e-05, + "loss": 0.5105, + "step": 14098 + }, + { + "epoch": 18.04672, + "grad_norm": 1.0425713062286377, + "learning_rate": 2.181672669067627e-05, + "loss": 0.448, + "step": 14099 + }, + { + "epoch": 18.048, + "grad_norm": 1.0768316984176636, + "learning_rate": 2.1814725890356143e-05, + "loss": 0.5035, + "step": 14100 + }, + { + "epoch": 18.04928, + "grad_norm": 1.0370937585830688, + "learning_rate": 2.1812725090036015e-05, + "loss": 0.4883, + "step": 14101 + }, + { + "epoch": 18.05056, + "grad_norm": 1.0263327360153198, + "learning_rate": 2.1810724289715887e-05, + "loss": 0.5022, + "step": 14102 + }, + { + "epoch": 18.05184, + "grad_norm": 0.9880037307739258, + "learning_rate": 
2.1808723489395762e-05, + "loss": 0.4505, + "step": 14103 + }, + { + "epoch": 18.05312, + "grad_norm": 1.0344130992889404, + "learning_rate": 2.180672268907563e-05, + "loss": 0.5191, + "step": 14104 + }, + { + "epoch": 18.0544, + "grad_norm": 1.0637491941452026, + "learning_rate": 2.1804721888755502e-05, + "loss": 0.4676, + "step": 14105 + }, + { + "epoch": 18.05568, + "grad_norm": 1.0038073062896729, + "learning_rate": 2.1802721088435374e-05, + "loss": 0.4869, + "step": 14106 + }, + { + "epoch": 18.05696, + "grad_norm": 1.0341646671295166, + "learning_rate": 2.180072028811525e-05, + "loss": 0.4734, + "step": 14107 + }, + { + "epoch": 18.05824, + "grad_norm": 1.0689383745193481, + "learning_rate": 2.1798719487795118e-05, + "loss": 0.4993, + "step": 14108 + }, + { + "epoch": 18.05952, + "grad_norm": 1.0641204118728638, + "learning_rate": 2.179671868747499e-05, + "loss": 0.5123, + "step": 14109 + }, + { + "epoch": 18.0608, + "grad_norm": 1.047566533088684, + "learning_rate": 2.1794717887154865e-05, + "loss": 0.5061, + "step": 14110 + }, + { + "epoch": 18.06208, + "grad_norm": 1.0506558418273926, + "learning_rate": 2.1792717086834737e-05, + "loss": 0.4856, + "step": 14111 + }, + { + "epoch": 18.06336, + "grad_norm": 0.985125720500946, + "learning_rate": 2.1790716286514605e-05, + "loss": 0.4718, + "step": 14112 + }, + { + "epoch": 18.06464, + "grad_norm": 1.012773036956787, + "learning_rate": 2.1788715486194477e-05, + "loss": 0.4976, + "step": 14113 + }, + { + "epoch": 18.06592, + "grad_norm": 0.9665741920471191, + "learning_rate": 2.1786714685874352e-05, + "loss": 0.4588, + "step": 14114 + }, + { + "epoch": 18.0672, + "grad_norm": 1.0558841228485107, + "learning_rate": 2.1784713885554224e-05, + "loss": 0.5345, + "step": 14115 + }, + { + "epoch": 18.06848, + "grad_norm": 1.1132677793502808, + "learning_rate": 2.1782713085234093e-05, + "loss": 0.5223, + "step": 14116 + }, + { + "epoch": 18.06976, + "grad_norm": 0.9984009265899658, + "learning_rate": 
2.1780712284913968e-05, + "loss": 0.4445, + "step": 14117 + }, + { + "epoch": 18.07104, + "grad_norm": 1.0193313360214233, + "learning_rate": 2.177871148459384e-05, + "loss": 0.4674, + "step": 14118 + }, + { + "epoch": 18.07232, + "grad_norm": 1.009688138961792, + "learning_rate": 2.177671068427371e-05, + "loss": 0.449, + "step": 14119 + }, + { + "epoch": 18.0736, + "grad_norm": 0.9733455777168274, + "learning_rate": 2.177470988395358e-05, + "loss": 0.4378, + "step": 14120 + }, + { + "epoch": 18.07488, + "grad_norm": 1.0891609191894531, + "learning_rate": 2.1772709083633455e-05, + "loss": 0.4709, + "step": 14121 + }, + { + "epoch": 18.07616, + "grad_norm": 1.1113479137420654, + "learning_rate": 2.1770708283313327e-05, + "loss": 0.5089, + "step": 14122 + }, + { + "epoch": 18.07744, + "grad_norm": 1.1308077573776245, + "learning_rate": 2.17687074829932e-05, + "loss": 0.5187, + "step": 14123 + }, + { + "epoch": 18.07872, + "grad_norm": 1.1170744895935059, + "learning_rate": 2.176670668267307e-05, + "loss": 0.528, + "step": 14124 + }, + { + "epoch": 18.08, + "grad_norm": 1.0902800559997559, + "learning_rate": 2.1764705882352943e-05, + "loss": 0.5186, + "step": 14125 + }, + { + "epoch": 18.08128, + "grad_norm": 1.0982624292373657, + "learning_rate": 2.1762705082032815e-05, + "loss": 0.5318, + "step": 14126 + }, + { + "epoch": 18.08256, + "grad_norm": 1.066246747970581, + "learning_rate": 2.1760704281712686e-05, + "loss": 0.4535, + "step": 14127 + }, + { + "epoch": 18.08384, + "grad_norm": 1.0559955835342407, + "learning_rate": 2.1758703481392558e-05, + "loss": 0.5188, + "step": 14128 + }, + { + "epoch": 18.08512, + "grad_norm": 1.0126625299453735, + "learning_rate": 2.175670268107243e-05, + "loss": 0.4805, + "step": 14129 + }, + { + "epoch": 18.0864, + "grad_norm": 1.0629305839538574, + "learning_rate": 2.1754701880752302e-05, + "loss": 0.492, + "step": 14130 + }, + { + "epoch": 18.08768, + "grad_norm": 1.0466065406799316, + "learning_rate": 2.1752701080432174e-05, + 
"loss": 0.5372, + "step": 14131 + }, + { + "epoch": 18.08896, + "grad_norm": 1.072338581085205, + "learning_rate": 2.1750700280112046e-05, + "loss": 0.5614, + "step": 14132 + }, + { + "epoch": 18.09024, + "grad_norm": 1.0793755054473877, + "learning_rate": 2.1748699479791918e-05, + "loss": 0.5204, + "step": 14133 + }, + { + "epoch": 18.09152, + "grad_norm": 1.063680648803711, + "learning_rate": 2.174669867947179e-05, + "loss": 0.4683, + "step": 14134 + }, + { + "epoch": 18.0928, + "grad_norm": 1.070969820022583, + "learning_rate": 2.174469787915166e-05, + "loss": 0.515, + "step": 14135 + }, + { + "epoch": 18.09408, + "grad_norm": 1.0573210716247559, + "learning_rate": 2.1742697078831533e-05, + "loss": 0.5124, + "step": 14136 + }, + { + "epoch": 18.09536, + "grad_norm": 0.9679538607597351, + "learning_rate": 2.1740696278511405e-05, + "loss": 0.4349, + "step": 14137 + }, + { + "epoch": 18.09664, + "grad_norm": 1.0116899013519287, + "learning_rate": 2.173869547819128e-05, + "loss": 0.5204, + "step": 14138 + }, + { + "epoch": 18.09792, + "grad_norm": 1.0702815055847168, + "learning_rate": 2.173669467787115e-05, + "loss": 0.5164, + "step": 14139 + }, + { + "epoch": 18.0992, + "grad_norm": 1.0475645065307617, + "learning_rate": 2.173469387755102e-05, + "loss": 0.5105, + "step": 14140 + }, + { + "epoch": 18.10048, + "grad_norm": 1.0068633556365967, + "learning_rate": 2.1732693077230892e-05, + "loss": 0.4967, + "step": 14141 + }, + { + "epoch": 18.10176, + "grad_norm": 1.0416619777679443, + "learning_rate": 2.1730692276910768e-05, + "loss": 0.4917, + "step": 14142 + }, + { + "epoch": 18.10304, + "grad_norm": 0.996741533279419, + "learning_rate": 2.1728691476590636e-05, + "loss": 0.4628, + "step": 14143 + }, + { + "epoch": 18.10432, + "grad_norm": 1.0638675689697266, + "learning_rate": 2.1726690676270508e-05, + "loss": 0.5081, + "step": 14144 + }, + { + "epoch": 18.1056, + "grad_norm": 1.0904065370559692, + "learning_rate": 2.1724689875950383e-05, + "loss": 0.4946, + 
"step": 14145 + }, + { + "epoch": 18.10688, + "grad_norm": 1.032350778579712, + "learning_rate": 2.1722689075630255e-05, + "loss": 0.4844, + "step": 14146 + }, + { + "epoch": 18.10816, + "grad_norm": 1.0421230792999268, + "learning_rate": 2.1720688275310124e-05, + "loss": 0.4894, + "step": 14147 + }, + { + "epoch": 18.10944, + "grad_norm": 1.130836009979248, + "learning_rate": 2.1718687474989995e-05, + "loss": 0.5482, + "step": 14148 + }, + { + "epoch": 18.11072, + "grad_norm": 1.1370742321014404, + "learning_rate": 2.171668667466987e-05, + "loss": 0.5494, + "step": 14149 + }, + { + "epoch": 18.112, + "grad_norm": 1.0708554983139038, + "learning_rate": 2.1714685874349742e-05, + "loss": 0.5181, + "step": 14150 + }, + { + "epoch": 18.11328, + "grad_norm": 1.0703370571136475, + "learning_rate": 2.171268507402961e-05, + "loss": 0.5118, + "step": 14151 + }, + { + "epoch": 18.11456, + "grad_norm": 1.0429341793060303, + "learning_rate": 2.1710684273709486e-05, + "loss": 0.4991, + "step": 14152 + }, + { + "epoch": 18.11584, + "grad_norm": 1.0283002853393555, + "learning_rate": 2.1708683473389358e-05, + "loss": 0.481, + "step": 14153 + }, + { + "epoch": 18.11712, + "grad_norm": 1.0293543338775635, + "learning_rate": 2.170668267306923e-05, + "loss": 0.5489, + "step": 14154 + }, + { + "epoch": 18.1184, + "grad_norm": 1.0159963369369507, + "learning_rate": 2.17046818727491e-05, + "loss": 0.4763, + "step": 14155 + }, + { + "epoch": 18.11968, + "grad_norm": 1.0897349119186401, + "learning_rate": 2.1702681072428974e-05, + "loss": 0.5192, + "step": 14156 + }, + { + "epoch": 18.12096, + "grad_norm": 1.015059471130371, + "learning_rate": 2.1700680272108845e-05, + "loss": 0.4625, + "step": 14157 + }, + { + "epoch": 18.12224, + "grad_norm": 1.0573114156723022, + "learning_rate": 2.1698679471788717e-05, + "loss": 0.5278, + "step": 14158 + }, + { + "epoch": 18.12352, + "grad_norm": 1.0435305833816528, + "learning_rate": 2.1696678671468586e-05, + "loss": 0.4909, + "step": 14159 + }, + { 
+ "epoch": 18.1248, + "grad_norm": 1.0256880521774292, + "learning_rate": 2.169467787114846e-05, + "loss": 0.4836, + "step": 14160 + }, + { + "epoch": 18.12608, + "grad_norm": 1.0937752723693848, + "learning_rate": 2.1692677070828333e-05, + "loss": 0.5833, + "step": 14161 + }, + { + "epoch": 18.12736, + "grad_norm": 1.005741834640503, + "learning_rate": 2.1690676270508205e-05, + "loss": 0.4886, + "step": 14162 + }, + { + "epoch": 18.12864, + "grad_norm": 1.0671494007110596, + "learning_rate": 2.1688675470188077e-05, + "loss": 0.4829, + "step": 14163 + }, + { + "epoch": 18.12992, + "grad_norm": 1.0546166896820068, + "learning_rate": 2.168667466986795e-05, + "loss": 0.5001, + "step": 14164 + }, + { + "epoch": 18.1312, + "grad_norm": 1.0181262493133545, + "learning_rate": 2.168467386954782e-05, + "loss": 0.4707, + "step": 14165 + }, + { + "epoch": 18.13248, + "grad_norm": 1.008811116218567, + "learning_rate": 2.1682673069227692e-05, + "loss": 0.4869, + "step": 14166 + }, + { + "epoch": 18.13376, + "grad_norm": 1.00252366065979, + "learning_rate": 2.1680672268907564e-05, + "loss": 0.4692, + "step": 14167 + }, + { + "epoch": 18.13504, + "grad_norm": 1.0217723846435547, + "learning_rate": 2.1678671468587436e-05, + "loss": 0.5281, + "step": 14168 + }, + { + "epoch": 18.13632, + "grad_norm": 1.0290299654006958, + "learning_rate": 2.1676670668267308e-05, + "loss": 0.4796, + "step": 14169 + }, + { + "epoch": 18.1376, + "grad_norm": 1.0708261728286743, + "learning_rate": 2.167466986794718e-05, + "loss": 0.4746, + "step": 14170 + }, + { + "epoch": 18.13888, + "grad_norm": 1.0332971811294556, + "learning_rate": 2.167266906762705e-05, + "loss": 0.485, + "step": 14171 + }, + { + "epoch": 18.14016, + "grad_norm": 1.0188629627227783, + "learning_rate": 2.1670668267306923e-05, + "loss": 0.4689, + "step": 14172 + }, + { + "epoch": 18.14144, + "grad_norm": 1.0185388326644897, + "learning_rate": 2.1668667466986795e-05, + "loss": 0.5077, + "step": 14173 + }, + { + "epoch": 18.14272, + 
"grad_norm": 1.0127745866775513, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.4879, + "step": 14174 + }, + { + "epoch": 18.144, + "grad_norm": 1.0742979049682617, + "learning_rate": 2.166466586634654e-05, + "loss": 0.5216, + "step": 14175 + }, + { + "epoch": 18.14528, + "grad_norm": 1.0509237051010132, + "learning_rate": 2.166266506602641e-05, + "loss": 0.4848, + "step": 14176 + }, + { + "epoch": 18.14656, + "grad_norm": 1.0471779108047485, + "learning_rate": 2.1660664265706286e-05, + "loss": 0.5013, + "step": 14177 + }, + { + "epoch": 18.14784, + "grad_norm": 1.0413541793823242, + "learning_rate": 2.1658663465386154e-05, + "loss": 0.4817, + "step": 14178 + }, + { + "epoch": 18.14912, + "grad_norm": 1.007503867149353, + "learning_rate": 2.1656662665066026e-05, + "loss": 0.4622, + "step": 14179 + }, + { + "epoch": 18.1504, + "grad_norm": 1.0402361154556274, + "learning_rate": 2.1654661864745898e-05, + "loss": 0.5028, + "step": 14180 + }, + { + "epoch": 18.15168, + "grad_norm": 1.0889134407043457, + "learning_rate": 2.1652661064425773e-05, + "loss": 0.5201, + "step": 14181 + }, + { + "epoch": 18.15296, + "grad_norm": 1.0945775508880615, + "learning_rate": 2.1650660264105642e-05, + "loss": 0.4916, + "step": 14182 + }, + { + "epoch": 18.15424, + "grad_norm": 1.025062918663025, + "learning_rate": 2.1648659463785514e-05, + "loss": 0.5115, + "step": 14183 + }, + { + "epoch": 18.15552, + "grad_norm": 1.0531753301620483, + "learning_rate": 2.164665866346539e-05, + "loss": 0.4737, + "step": 14184 + }, + { + "epoch": 18.1568, + "grad_norm": 1.0863885879516602, + "learning_rate": 2.164465786314526e-05, + "loss": 0.5113, + "step": 14185 + }, + { + "epoch": 18.158079999999998, + "grad_norm": 1.0436323881149292, + "learning_rate": 2.164265706282513e-05, + "loss": 0.5154, + "step": 14186 + }, + { + "epoch": 18.15936, + "grad_norm": 1.0325742959976196, + "learning_rate": 2.1640656262505e-05, + "loss": 0.4873, + "step": 14187 + }, + { + "epoch": 18.16064, + "grad_norm": 
1.0843673944473267, + "learning_rate": 2.1638655462184876e-05, + "loss": 0.4948, + "step": 14188 + }, + { + "epoch": 18.16192, + "grad_norm": 1.0292965173721313, + "learning_rate": 2.1636654661864748e-05, + "loss": 0.5001, + "step": 14189 + }, + { + "epoch": 18.1632, + "grad_norm": 1.010935664176941, + "learning_rate": 2.1634653861544617e-05, + "loss": 0.4492, + "step": 14190 + }, + { + "epoch": 18.16448, + "grad_norm": 1.0702733993530273, + "learning_rate": 2.1632653061224492e-05, + "loss": 0.486, + "step": 14191 + }, + { + "epoch": 18.16576, + "grad_norm": 1.0180492401123047, + "learning_rate": 2.1630652260904364e-05, + "loss": 0.4948, + "step": 14192 + }, + { + "epoch": 18.16704, + "grad_norm": 1.0777477025985718, + "learning_rate": 2.1628651460584236e-05, + "loss": 0.5253, + "step": 14193 + }, + { + "epoch": 18.16832, + "grad_norm": 1.03486168384552, + "learning_rate": 2.1626650660264104e-05, + "loss": 0.4609, + "step": 14194 + }, + { + "epoch": 18.1696, + "grad_norm": 1.0693778991699219, + "learning_rate": 2.162464985994398e-05, + "loss": 0.5002, + "step": 14195 + }, + { + "epoch": 18.17088, + "grad_norm": 1.0229696035385132, + "learning_rate": 2.162264905962385e-05, + "loss": 0.4825, + "step": 14196 + }, + { + "epoch": 18.17216, + "grad_norm": 1.0002144575119019, + "learning_rate": 2.1620648259303723e-05, + "loss": 0.4239, + "step": 14197 + }, + { + "epoch": 18.17344, + "grad_norm": 1.0359293222427368, + "learning_rate": 2.1618647458983595e-05, + "loss": 0.4582, + "step": 14198 + }, + { + "epoch": 18.17472, + "grad_norm": 1.0600343942642212, + "learning_rate": 2.1616646658663467e-05, + "loss": 0.5052, + "step": 14199 + }, + { + "epoch": 18.176, + "grad_norm": 1.0161858797073364, + "learning_rate": 2.161464585834334e-05, + "loss": 0.4748, + "step": 14200 + }, + { + "epoch": 18.17728, + "grad_norm": 1.0281199216842651, + "learning_rate": 2.161264505802321e-05, + "loss": 0.4505, + "step": 14201 + }, + { + "epoch": 18.17856, + "grad_norm": 1.0303138494491577, + 
"learning_rate": 2.1610644257703082e-05, + "loss": 0.5022, + "step": 14202 + }, + { + "epoch": 18.17984, + "grad_norm": 1.0298250913619995, + "learning_rate": 2.1608643457382954e-05, + "loss": 0.4955, + "step": 14203 + }, + { + "epoch": 18.18112, + "grad_norm": 1.0383046865463257, + "learning_rate": 2.1606642657062826e-05, + "loss": 0.503, + "step": 14204 + }, + { + "epoch": 18.1824, + "grad_norm": 1.0609245300292969, + "learning_rate": 2.1604641856742698e-05, + "loss": 0.4975, + "step": 14205 + }, + { + "epoch": 18.18368, + "grad_norm": 1.0384618043899536, + "learning_rate": 2.160264105642257e-05, + "loss": 0.5371, + "step": 14206 + }, + { + "epoch": 18.18496, + "grad_norm": 1.044183373451233, + "learning_rate": 2.160064025610244e-05, + "loss": 0.452, + "step": 14207 + }, + { + "epoch": 18.18624, + "grad_norm": 1.0944687128067017, + "learning_rate": 2.1598639455782314e-05, + "loss": 0.5241, + "step": 14208 + }, + { + "epoch": 18.18752, + "grad_norm": 1.0087506771087646, + "learning_rate": 2.1596638655462185e-05, + "loss": 0.4625, + "step": 14209 + }, + { + "epoch": 18.1888, + "grad_norm": 1.0070358514785767, + "learning_rate": 2.1594637855142057e-05, + "loss": 0.4719, + "step": 14210 + }, + { + "epoch": 18.19008, + "grad_norm": 1.0301635265350342, + "learning_rate": 2.159263705482193e-05, + "loss": 0.448, + "step": 14211 + }, + { + "epoch": 18.19136, + "grad_norm": 1.077388048171997, + "learning_rate": 2.15906362545018e-05, + "loss": 0.5245, + "step": 14212 + }, + { + "epoch": 18.19264, + "grad_norm": 1.0530251264572144, + "learning_rate": 2.1588635454181673e-05, + "loss": 0.5032, + "step": 14213 + }, + { + "epoch": 18.19392, + "grad_norm": 1.058570384979248, + "learning_rate": 2.1586634653861545e-05, + "loss": 0.5114, + "step": 14214 + }, + { + "epoch": 18.1952, + "grad_norm": 1.0303319692611694, + "learning_rate": 2.1584633853541417e-05, + "loss": 0.4486, + "step": 14215 + }, + { + "epoch": 18.19648, + "grad_norm": 1.0652695894241333, + "learning_rate": 
2.1582633053221292e-05, + "loss": 0.5292, + "step": 14216 + }, + { + "epoch": 18.19776, + "grad_norm": 1.0437886714935303, + "learning_rate": 2.158063225290116e-05, + "loss": 0.5014, + "step": 14217 + }, + { + "epoch": 18.19904, + "grad_norm": 1.0610305070877075, + "learning_rate": 2.1578631452581032e-05, + "loss": 0.5458, + "step": 14218 + }, + { + "epoch": 18.20032, + "grad_norm": 0.995061457157135, + "learning_rate": 2.1576630652260904e-05, + "loss": 0.4919, + "step": 14219 + }, + { + "epoch": 18.2016, + "grad_norm": 1.0778111219406128, + "learning_rate": 2.157462985194078e-05, + "loss": 0.5306, + "step": 14220 + }, + { + "epoch": 18.20288, + "grad_norm": 1.0350569486618042, + "learning_rate": 2.1572629051620648e-05, + "loss": 0.5296, + "step": 14221 + }, + { + "epoch": 18.20416, + "grad_norm": 1.0401970148086548, + "learning_rate": 2.157062825130052e-05, + "loss": 0.5086, + "step": 14222 + }, + { + "epoch": 18.20544, + "grad_norm": 1.0326118469238281, + "learning_rate": 2.1568627450980395e-05, + "loss": 0.5022, + "step": 14223 + }, + { + "epoch": 18.20672, + "grad_norm": 1.0625585317611694, + "learning_rate": 2.1566626650660267e-05, + "loss": 0.5218, + "step": 14224 + }, + { + "epoch": 18.208, + "grad_norm": 1.0629734992980957, + "learning_rate": 2.1564625850340135e-05, + "loss": 0.5225, + "step": 14225 + }, + { + "epoch": 18.20928, + "grad_norm": 1.0436673164367676, + "learning_rate": 2.1562625050020007e-05, + "loss": 0.4921, + "step": 14226 + }, + { + "epoch": 18.21056, + "grad_norm": 1.0012400150299072, + "learning_rate": 2.1560624249699882e-05, + "loss": 0.4427, + "step": 14227 + }, + { + "epoch": 18.21184, + "grad_norm": 1.1108235120773315, + "learning_rate": 2.1558623449379754e-05, + "loss": 0.5484, + "step": 14228 + }, + { + "epoch": 18.21312, + "grad_norm": 1.080520510673523, + "learning_rate": 2.1556622649059623e-05, + "loss": 0.5021, + "step": 14229 + }, + { + "epoch": 18.2144, + "grad_norm": 1.0559675693511963, + "learning_rate": 
2.1554621848739498e-05, + "loss": 0.4703, + "step": 14230 + }, + { + "epoch": 18.21568, + "grad_norm": 1.0659006834030151, + "learning_rate": 2.155262104841937e-05, + "loss": 0.5139, + "step": 14231 + }, + { + "epoch": 18.21696, + "grad_norm": 1.0316319465637207, + "learning_rate": 2.155062024809924e-05, + "loss": 0.4966, + "step": 14232 + }, + { + "epoch": 18.21824, + "grad_norm": 1.0869457721710205, + "learning_rate": 2.154861944777911e-05, + "loss": 0.5206, + "step": 14233 + }, + { + "epoch": 18.21952, + "grad_norm": 0.9864334464073181, + "learning_rate": 2.1546618647458985e-05, + "loss": 0.4524, + "step": 14234 + }, + { + "epoch": 18.2208, + "grad_norm": 1.0641202926635742, + "learning_rate": 2.1544617847138857e-05, + "loss": 0.5093, + "step": 14235 + }, + { + "epoch": 18.22208, + "grad_norm": 0.9992152452468872, + "learning_rate": 2.154261704681873e-05, + "loss": 0.448, + "step": 14236 + }, + { + "epoch": 18.22336, + "grad_norm": 1.1175810098648071, + "learning_rate": 2.15406162464986e-05, + "loss": 0.4918, + "step": 14237 + }, + { + "epoch": 18.22464, + "grad_norm": 1.0763076543807983, + "learning_rate": 2.1538615446178473e-05, + "loss": 0.4894, + "step": 14238 + }, + { + "epoch": 18.22592, + "grad_norm": 1.0875471830368042, + "learning_rate": 2.1536614645858344e-05, + "loss": 0.5458, + "step": 14239 + }, + { + "epoch": 18.2272, + "grad_norm": 1.0565011501312256, + "learning_rate": 2.1534613845538216e-05, + "loss": 0.488, + "step": 14240 + }, + { + "epoch": 18.22848, + "grad_norm": 1.0078827142715454, + "learning_rate": 2.1532613045218088e-05, + "loss": 0.4727, + "step": 14241 + }, + { + "epoch": 18.22976, + "grad_norm": 1.0515302419662476, + "learning_rate": 2.153061224489796e-05, + "loss": 0.4798, + "step": 14242 + }, + { + "epoch": 18.23104, + "grad_norm": 1.0603551864624023, + "learning_rate": 2.1528611444577832e-05, + "loss": 0.5047, + "step": 14243 + }, + { + "epoch": 18.23232, + "grad_norm": 1.032323956489563, + "learning_rate": 2.1526610644257704e-05, 
+ "loss": 0.4825, + "step": 14244 + }, + { + "epoch": 18.2336, + "grad_norm": 1.0973374843597412, + "learning_rate": 2.1524609843937576e-05, + "loss": 0.4856, + "step": 14245 + }, + { + "epoch": 18.23488, + "grad_norm": 1.1034042835235596, + "learning_rate": 2.1522609043617447e-05, + "loss": 0.5269, + "step": 14246 + }, + { + "epoch": 18.23616, + "grad_norm": 1.0152696371078491, + "learning_rate": 2.152060824329732e-05, + "loss": 0.47, + "step": 14247 + }, + { + "epoch": 18.23744, + "grad_norm": 1.0043752193450928, + "learning_rate": 2.151860744297719e-05, + "loss": 0.5078, + "step": 14248 + }, + { + "epoch": 18.23872, + "grad_norm": 1.1099114418029785, + "learning_rate": 2.1516606642657063e-05, + "loss": 0.528, + "step": 14249 + }, + { + "epoch": 18.24, + "grad_norm": 1.0465238094329834, + "learning_rate": 2.1514605842336935e-05, + "loss": 0.5168, + "step": 14250 + }, + { + "epoch": 18.24128, + "grad_norm": 1.0789815187454224, + "learning_rate": 2.151260504201681e-05, + "loss": 0.5096, + "step": 14251 + }, + { + "epoch": 18.24256, + "grad_norm": 1.1060971021652222, + "learning_rate": 2.151060424169668e-05, + "loss": 0.5233, + "step": 14252 + }, + { + "epoch": 18.24384, + "grad_norm": 1.0800647735595703, + "learning_rate": 2.150860344137655e-05, + "loss": 0.5079, + "step": 14253 + }, + { + "epoch": 18.24512, + "grad_norm": 0.9907492995262146, + "learning_rate": 2.1506602641056422e-05, + "loss": 0.4908, + "step": 14254 + }, + { + "epoch": 18.2464, + "grad_norm": 1.0035676956176758, + "learning_rate": 2.1504601840736298e-05, + "loss": 0.4876, + "step": 14255 + }, + { + "epoch": 18.24768, + "grad_norm": 1.0252283811569214, + "learning_rate": 2.1502601040416166e-05, + "loss": 0.4663, + "step": 14256 + }, + { + "epoch": 18.24896, + "grad_norm": 1.023335337638855, + "learning_rate": 2.1500600240096038e-05, + "loss": 0.494, + "step": 14257 + }, + { + "epoch": 18.25024, + "grad_norm": 1.0962717533111572, + "learning_rate": 2.1498599439775913e-05, + "loss": 0.5021, + 
"step": 14258 + }, + { + "epoch": 18.25152, + "grad_norm": 1.0230556726455688, + "learning_rate": 2.1496598639455785e-05, + "loss": 0.5067, + "step": 14259 + }, + { + "epoch": 18.2528, + "grad_norm": 0.9827935099601746, + "learning_rate": 2.1494597839135653e-05, + "loss": 0.4315, + "step": 14260 + }, + { + "epoch": 18.25408, + "grad_norm": 1.0110362768173218, + "learning_rate": 2.1492597038815525e-05, + "loss": 0.4864, + "step": 14261 + }, + { + "epoch": 18.25536, + "grad_norm": 1.0713372230529785, + "learning_rate": 2.14905962384954e-05, + "loss": 0.5581, + "step": 14262 + }, + { + "epoch": 18.25664, + "grad_norm": 1.0925161838531494, + "learning_rate": 2.1488595438175272e-05, + "loss": 0.4843, + "step": 14263 + }, + { + "epoch": 18.25792, + "grad_norm": 1.0575942993164062, + "learning_rate": 2.148659463785514e-05, + "loss": 0.5034, + "step": 14264 + }, + { + "epoch": 18.2592, + "grad_norm": 1.080422282218933, + "learning_rate": 2.1484593837535016e-05, + "loss": 0.5236, + "step": 14265 + }, + { + "epoch": 18.26048, + "grad_norm": 1.0370043516159058, + "learning_rate": 2.1482593037214888e-05, + "loss": 0.4743, + "step": 14266 + }, + { + "epoch": 18.26176, + "grad_norm": 1.0181622505187988, + "learning_rate": 2.148059223689476e-05, + "loss": 0.449, + "step": 14267 + }, + { + "epoch": 18.26304, + "grad_norm": 1.0741147994995117, + "learning_rate": 2.147859143657463e-05, + "loss": 0.544, + "step": 14268 + }, + { + "epoch": 18.26432, + "grad_norm": 1.0276820659637451, + "learning_rate": 2.1476590636254504e-05, + "loss": 0.4783, + "step": 14269 + }, + { + "epoch": 18.2656, + "grad_norm": 1.094807744026184, + "learning_rate": 2.1474589835934375e-05, + "loss": 0.488, + "step": 14270 + }, + { + "epoch": 18.26688, + "grad_norm": 1.0752545595169067, + "learning_rate": 2.1472589035614247e-05, + "loss": 0.4985, + "step": 14271 + }, + { + "epoch": 18.26816, + "grad_norm": 1.1067184209823608, + "learning_rate": 2.1470588235294116e-05, + "loss": 0.4627, + "step": 14272 + }, + { + 
"epoch": 18.26944, + "grad_norm": 1.0492066144943237, + "learning_rate": 2.146858743497399e-05, + "loss": 0.5197, + "step": 14273 + }, + { + "epoch": 18.27072, + "grad_norm": 1.0525888204574585, + "learning_rate": 2.1466586634653863e-05, + "loss": 0.5238, + "step": 14274 + }, + { + "epoch": 18.272, + "grad_norm": 1.0903679132461548, + "learning_rate": 2.1464585834333735e-05, + "loss": 0.5231, + "step": 14275 + }, + { + "epoch": 18.27328, + "grad_norm": 1.1250532865524292, + "learning_rate": 2.1462585034013607e-05, + "loss": 0.5518, + "step": 14276 + }, + { + "epoch": 18.27456, + "grad_norm": 1.0479706525802612, + "learning_rate": 2.146058423369348e-05, + "loss": 0.4735, + "step": 14277 + }, + { + "epoch": 18.27584, + "grad_norm": 1.100510597229004, + "learning_rate": 2.145858343337335e-05, + "loss": 0.5421, + "step": 14278 + }, + { + "epoch": 18.27712, + "grad_norm": 1.0894125699996948, + "learning_rate": 2.1456582633053222e-05, + "loss": 0.4891, + "step": 14279 + }, + { + "epoch": 18.2784, + "grad_norm": 1.0481358766555786, + "learning_rate": 2.1454581832733094e-05, + "loss": 0.4968, + "step": 14280 + }, + { + "epoch": 18.27968, + "grad_norm": 1.0981334447860718, + "learning_rate": 2.1452581032412966e-05, + "loss": 0.506, + "step": 14281 + }, + { + "epoch": 18.28096, + "grad_norm": 1.0324424505233765, + "learning_rate": 2.1450580232092838e-05, + "loss": 0.5348, + "step": 14282 + }, + { + "epoch": 18.28224, + "grad_norm": 0.9970609545707703, + "learning_rate": 2.144857943177271e-05, + "loss": 0.5035, + "step": 14283 + }, + { + "epoch": 18.28352, + "grad_norm": 1.0144636631011963, + "learning_rate": 2.144657863145258e-05, + "loss": 0.499, + "step": 14284 + }, + { + "epoch": 18.2848, + "grad_norm": 1.009353756904602, + "learning_rate": 2.1444577831132453e-05, + "loss": 0.5351, + "step": 14285 + }, + { + "epoch": 18.28608, + "grad_norm": 0.9811288118362427, + "learning_rate": 2.1442577030812325e-05, + "loss": 0.4582, + "step": 14286 + }, + { + "epoch": 18.28736, + 
"grad_norm": 1.0266668796539307, + "learning_rate": 2.1440576230492197e-05, + "loss": 0.4908, + "step": 14287 + }, + { + "epoch": 18.28864, + "grad_norm": 1.0499911308288574, + "learning_rate": 2.143857543017207e-05, + "loss": 0.5122, + "step": 14288 + }, + { + "epoch": 18.28992, + "grad_norm": 1.0684877634048462, + "learning_rate": 2.143657462985194e-05, + "loss": 0.5143, + "step": 14289 + }, + { + "epoch": 18.2912, + "grad_norm": 1.0018022060394287, + "learning_rate": 2.1434573829531816e-05, + "loss": 0.4595, + "step": 14290 + }, + { + "epoch": 18.29248, + "grad_norm": 1.0234333276748657, + "learning_rate": 2.1432573029211684e-05, + "loss": 0.4952, + "step": 14291 + }, + { + "epoch": 18.29376, + "grad_norm": 1.015306830406189, + "learning_rate": 2.1430572228891556e-05, + "loss": 0.4971, + "step": 14292 + }, + { + "epoch": 18.29504, + "grad_norm": 1.069213628768921, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.5289, + "step": 14293 + }, + { + "epoch": 18.29632, + "grad_norm": 1.0142141580581665, + "learning_rate": 2.1426570628251303e-05, + "loss": 0.502, + "step": 14294 + }, + { + "epoch": 18.2976, + "grad_norm": 1.0849502086639404, + "learning_rate": 2.1424569827931172e-05, + "loss": 0.4753, + "step": 14295 + }, + { + "epoch": 18.29888, + "grad_norm": 1.010981798171997, + "learning_rate": 2.1422569027611044e-05, + "loss": 0.4907, + "step": 14296 + }, + { + "epoch": 18.300159999999998, + "grad_norm": 1.0261670351028442, + "learning_rate": 2.142056822729092e-05, + "loss": 0.4917, + "step": 14297 + }, + { + "epoch": 18.30144, + "grad_norm": 1.057773232460022, + "learning_rate": 2.141856742697079e-05, + "loss": 0.5051, + "step": 14298 + }, + { + "epoch": 18.30272, + "grad_norm": 1.0207065343856812, + "learning_rate": 2.141656662665066e-05, + "loss": 0.4841, + "step": 14299 + }, + { + "epoch": 18.304, + "grad_norm": 1.052815318107605, + "learning_rate": 2.141456582633053e-05, + "loss": 0.4918, + "step": 14300 + }, + { + "epoch": 18.30528, + "grad_norm": 
1.0421321392059326, + "learning_rate": 2.1412565026010406e-05, + "loss": 0.5357, + "step": 14301 + }, + { + "epoch": 18.30656, + "grad_norm": 1.0197725296020508, + "learning_rate": 2.1410564225690278e-05, + "loss": 0.4925, + "step": 14302 + }, + { + "epoch": 18.30784, + "grad_norm": 1.0184986591339111, + "learning_rate": 2.1408563425370147e-05, + "loss": 0.4907, + "step": 14303 + }, + { + "epoch": 18.30912, + "grad_norm": 1.0371520519256592, + "learning_rate": 2.1406562625050022e-05, + "loss": 0.469, + "step": 14304 + }, + { + "epoch": 18.3104, + "grad_norm": 1.0472283363342285, + "learning_rate": 2.1404561824729894e-05, + "loss": 0.4778, + "step": 14305 + }, + { + "epoch": 18.31168, + "grad_norm": 1.0578492879867554, + "learning_rate": 2.1402561024409766e-05, + "loss": 0.5058, + "step": 14306 + }, + { + "epoch": 18.31296, + "grad_norm": 1.044542670249939, + "learning_rate": 2.1400560224089634e-05, + "loss": 0.5086, + "step": 14307 + }, + { + "epoch": 18.31424, + "grad_norm": 1.0305312871932983, + "learning_rate": 2.139855942376951e-05, + "loss": 0.4927, + "step": 14308 + }, + { + "epoch": 18.31552, + "grad_norm": 1.1355465650558472, + "learning_rate": 2.139655862344938e-05, + "loss": 0.5151, + "step": 14309 + }, + { + "epoch": 18.3168, + "grad_norm": 1.0777387619018555, + "learning_rate": 2.1394557823129253e-05, + "loss": 0.5034, + "step": 14310 + }, + { + "epoch": 18.31808, + "grad_norm": 1.1117095947265625, + "learning_rate": 2.1392557022809125e-05, + "loss": 0.5293, + "step": 14311 + }, + { + "epoch": 18.31936, + "grad_norm": 1.0630356073379517, + "learning_rate": 2.1390556222488997e-05, + "loss": 0.5152, + "step": 14312 + }, + { + "epoch": 18.32064, + "grad_norm": 1.0686274766921997, + "learning_rate": 2.138855542216887e-05, + "loss": 0.4792, + "step": 14313 + }, + { + "epoch": 18.32192, + "grad_norm": 1.0616755485534668, + "learning_rate": 2.138655462184874e-05, + "loss": 0.4932, + "step": 14314 + }, + { + "epoch": 18.3232, + "grad_norm": 1.079272985458374, + 
"learning_rate": 2.1384553821528612e-05, + "loss": 0.4785, + "step": 14315 + }, + { + "epoch": 18.32448, + "grad_norm": 1.0683273077011108, + "learning_rate": 2.1382553021208484e-05, + "loss": 0.517, + "step": 14316 + }, + { + "epoch": 18.32576, + "grad_norm": 1.0856717824935913, + "learning_rate": 2.1380552220888356e-05, + "loss": 0.489, + "step": 14317 + }, + { + "epoch": 18.32704, + "grad_norm": 1.0788426399230957, + "learning_rate": 2.137855142056823e-05, + "loss": 0.4987, + "step": 14318 + }, + { + "epoch": 18.32832, + "grad_norm": 1.0643631219863892, + "learning_rate": 2.13765506202481e-05, + "loss": 0.4891, + "step": 14319 + }, + { + "epoch": 18.3296, + "grad_norm": 1.1336820125579834, + "learning_rate": 2.137454981992797e-05, + "loss": 0.5683, + "step": 14320 + }, + { + "epoch": 18.33088, + "grad_norm": 1.0837833881378174, + "learning_rate": 2.1372549019607844e-05, + "loss": 0.4934, + "step": 14321 + }, + { + "epoch": 18.332160000000002, + "grad_norm": 1.0664664506912231, + "learning_rate": 2.137054821928772e-05, + "loss": 0.5126, + "step": 14322 + }, + { + "epoch": 18.33344, + "grad_norm": 1.0915263891220093, + "learning_rate": 2.1368547418967587e-05, + "loss": 0.5302, + "step": 14323 + }, + { + "epoch": 18.33472, + "grad_norm": 1.0284439325332642, + "learning_rate": 2.136654661864746e-05, + "loss": 0.4844, + "step": 14324 + }, + { + "epoch": 18.336, + "grad_norm": 1.0181630849838257, + "learning_rate": 2.136454581832733e-05, + "loss": 0.5208, + "step": 14325 + }, + { + "epoch": 18.33728, + "grad_norm": 1.0594981908798218, + "learning_rate": 2.1362545018007206e-05, + "loss": 0.5249, + "step": 14326 + }, + { + "epoch": 18.33856, + "grad_norm": 1.0511082410812378, + "learning_rate": 2.1360544217687075e-05, + "loss": 0.5147, + "step": 14327 + }, + { + "epoch": 18.33984, + "grad_norm": 1.042351484298706, + "learning_rate": 2.1358543417366947e-05, + "loss": 0.498, + "step": 14328 + }, + { + "epoch": 18.34112, + "grad_norm": 1.043094515800476, + "learning_rate": 
2.1356542617046822e-05, + "loss": 0.4971, + "step": 14329 + }, + { + "epoch": 18.3424, + "grad_norm": 1.0453187227249146, + "learning_rate": 2.1354541816726694e-05, + "loss": 0.5092, + "step": 14330 + }, + { + "epoch": 18.34368, + "grad_norm": 1.0761491060256958, + "learning_rate": 2.1352541016406562e-05, + "loss": 0.5334, + "step": 14331 + }, + { + "epoch": 18.34496, + "grad_norm": 0.980190098285675, + "learning_rate": 2.1350540216086434e-05, + "loss": 0.4569, + "step": 14332 + }, + { + "epoch": 18.34624, + "grad_norm": 1.0307832956314087, + "learning_rate": 2.134853941576631e-05, + "loss": 0.4986, + "step": 14333 + }, + { + "epoch": 18.34752, + "grad_norm": 1.0275218486785889, + "learning_rate": 2.134653861544618e-05, + "loss": 0.4659, + "step": 14334 + }, + { + "epoch": 18.3488, + "grad_norm": 1.1118059158325195, + "learning_rate": 2.134453781512605e-05, + "loss": 0.5302, + "step": 14335 + }, + { + "epoch": 18.35008, + "grad_norm": 1.1193236112594604, + "learning_rate": 2.1342537014805925e-05, + "loss": 0.5402, + "step": 14336 + }, + { + "epoch": 18.35136, + "grad_norm": 1.0722352266311646, + "learning_rate": 2.1340536214485797e-05, + "loss": 0.458, + "step": 14337 + }, + { + "epoch": 18.35264, + "grad_norm": 1.0813183784484863, + "learning_rate": 2.133853541416567e-05, + "loss": 0.5093, + "step": 14338 + }, + { + "epoch": 18.35392, + "grad_norm": 1.0807271003723145, + "learning_rate": 2.1336534613845537e-05, + "loss": 0.5005, + "step": 14339 + }, + { + "epoch": 18.3552, + "grad_norm": 1.1022156476974487, + "learning_rate": 2.1334533813525412e-05, + "loss": 0.5259, + "step": 14340 + }, + { + "epoch": 18.35648, + "grad_norm": 1.0361143350601196, + "learning_rate": 2.1332533013205284e-05, + "loss": 0.5219, + "step": 14341 + }, + { + "epoch": 18.35776, + "grad_norm": 1.035565972328186, + "learning_rate": 2.1330532212885156e-05, + "loss": 0.4797, + "step": 14342 + }, + { + "epoch": 18.35904, + "grad_norm": 1.088816523551941, + "learning_rate": 
2.1328531412565028e-05, + "loss": 0.4918, + "step": 14343 + }, + { + "epoch": 18.36032, + "grad_norm": 1.0977002382278442, + "learning_rate": 2.13265306122449e-05, + "loss": 0.5464, + "step": 14344 + }, + { + "epoch": 18.3616, + "grad_norm": 1.0365880727767944, + "learning_rate": 2.132452981192477e-05, + "loss": 0.5105, + "step": 14345 + }, + { + "epoch": 18.36288, + "grad_norm": 1.0541160106658936, + "learning_rate": 2.1322529011604643e-05, + "loss": 0.5156, + "step": 14346 + }, + { + "epoch": 18.36416, + "grad_norm": 1.0975985527038574, + "learning_rate": 2.1320528211284515e-05, + "loss": 0.5529, + "step": 14347 + }, + { + "epoch": 18.36544, + "grad_norm": 1.104960322380066, + "learning_rate": 2.1318527410964387e-05, + "loss": 0.5088, + "step": 14348 + }, + { + "epoch": 18.36672, + "grad_norm": 1.0740171670913696, + "learning_rate": 2.131652661064426e-05, + "loss": 0.508, + "step": 14349 + }, + { + "epoch": 18.368, + "grad_norm": 1.1651798486709595, + "learning_rate": 2.131452581032413e-05, + "loss": 0.5813, + "step": 14350 + }, + { + "epoch": 18.36928, + "grad_norm": 1.0417823791503906, + "learning_rate": 2.1312525010004003e-05, + "loss": 0.5102, + "step": 14351 + }, + { + "epoch": 18.37056, + "grad_norm": 1.0664454698562622, + "learning_rate": 2.1310524209683874e-05, + "loss": 0.544, + "step": 14352 + }, + { + "epoch": 18.37184, + "grad_norm": 1.0032341480255127, + "learning_rate": 2.1308523409363746e-05, + "loss": 0.4847, + "step": 14353 + }, + { + "epoch": 18.37312, + "grad_norm": 1.0495922565460205, + "learning_rate": 2.1306522609043618e-05, + "loss": 0.4937, + "step": 14354 + }, + { + "epoch": 18.3744, + "grad_norm": 1.0448967218399048, + "learning_rate": 2.130452180872349e-05, + "loss": 0.5093, + "step": 14355 + }, + { + "epoch": 18.37568, + "grad_norm": 1.141594648361206, + "learning_rate": 2.1302521008403362e-05, + "loss": 0.5038, + "step": 14356 + }, + { + "epoch": 18.37696, + "grad_norm": 1.0529148578643799, + "learning_rate": 2.1300520208083237e-05, + 
"loss": 0.5381, + "step": 14357 + }, + { + "epoch": 18.37824, + "grad_norm": 1.064096450805664, + "learning_rate": 2.1298519407763106e-05, + "loss": 0.4849, + "step": 14358 + }, + { + "epoch": 18.37952, + "grad_norm": 1.0783796310424805, + "learning_rate": 2.1296518607442977e-05, + "loss": 0.483, + "step": 14359 + }, + { + "epoch": 18.3808, + "grad_norm": 1.0386112928390503, + "learning_rate": 2.129451780712285e-05, + "loss": 0.5142, + "step": 14360 + }, + { + "epoch": 18.38208, + "grad_norm": 1.104622721672058, + "learning_rate": 2.1292517006802725e-05, + "loss": 0.5947, + "step": 14361 + }, + { + "epoch": 18.38336, + "grad_norm": 1.0468897819519043, + "learning_rate": 2.1290516206482593e-05, + "loss": 0.4647, + "step": 14362 + }, + { + "epoch": 18.38464, + "grad_norm": 1.0939141511917114, + "learning_rate": 2.1288515406162465e-05, + "loss": 0.5201, + "step": 14363 + }, + { + "epoch": 18.38592, + "grad_norm": 1.0154157876968384, + "learning_rate": 2.128651460584234e-05, + "loss": 0.4947, + "step": 14364 + }, + { + "epoch": 18.3872, + "grad_norm": 1.0433145761489868, + "learning_rate": 2.1284513805522212e-05, + "loss": 0.4675, + "step": 14365 + }, + { + "epoch": 18.38848, + "grad_norm": 1.0545381307601929, + "learning_rate": 2.128251300520208e-05, + "loss": 0.5372, + "step": 14366 + }, + { + "epoch": 18.38976, + "grad_norm": 1.0403685569763184, + "learning_rate": 2.1280512204881952e-05, + "loss": 0.5044, + "step": 14367 + }, + { + "epoch": 18.39104, + "grad_norm": 1.0547093152999878, + "learning_rate": 2.1278511404561828e-05, + "loss": 0.5217, + "step": 14368 + }, + { + "epoch": 18.39232, + "grad_norm": 1.0452656745910645, + "learning_rate": 2.12765106042417e-05, + "loss": 0.5011, + "step": 14369 + }, + { + "epoch": 18.3936, + "grad_norm": 1.124005675315857, + "learning_rate": 2.1274509803921568e-05, + "loss": 0.5562, + "step": 14370 + }, + { + "epoch": 18.39488, + "grad_norm": 1.0460633039474487, + "learning_rate": 2.1272509003601443e-05, + "loss": 0.4712, + 
"step": 14371 + }, + { + "epoch": 18.39616, + "grad_norm": 1.0718797445297241, + "learning_rate": 2.1270508203281315e-05, + "loss": 0.5197, + "step": 14372 + }, + { + "epoch": 18.39744, + "grad_norm": 1.036122441291809, + "learning_rate": 2.1268507402961187e-05, + "loss": 0.4993, + "step": 14373 + }, + { + "epoch": 18.39872, + "grad_norm": 1.087517499923706, + "learning_rate": 2.1266506602641055e-05, + "loss": 0.5288, + "step": 14374 + }, + { + "epoch": 18.4, + "grad_norm": 1.0326744318008423, + "learning_rate": 2.126450580232093e-05, + "loss": 0.4968, + "step": 14375 + }, + { + "epoch": 18.40128, + "grad_norm": 1.0811477899551392, + "learning_rate": 2.1262505002000802e-05, + "loss": 0.5151, + "step": 14376 + }, + { + "epoch": 18.40256, + "grad_norm": 1.0682767629623413, + "learning_rate": 2.1260504201680674e-05, + "loss": 0.5454, + "step": 14377 + }, + { + "epoch": 18.40384, + "grad_norm": 1.0771913528442383, + "learning_rate": 2.1258503401360543e-05, + "loss": 0.5331, + "step": 14378 + }, + { + "epoch": 18.40512, + "grad_norm": 1.0536373853683472, + "learning_rate": 2.1256502601040418e-05, + "loss": 0.5086, + "step": 14379 + }, + { + "epoch": 18.4064, + "grad_norm": 1.0189549922943115, + "learning_rate": 2.125450180072029e-05, + "loss": 0.497, + "step": 14380 + }, + { + "epoch": 18.40768, + "grad_norm": 1.0349353551864624, + "learning_rate": 2.1252501000400162e-05, + "loss": 0.4787, + "step": 14381 + }, + { + "epoch": 18.40896, + "grad_norm": 1.0494370460510254, + "learning_rate": 2.1250500200080034e-05, + "loss": 0.5013, + "step": 14382 + }, + { + "epoch": 18.41024, + "grad_norm": 1.0900458097457886, + "learning_rate": 2.1248499399759905e-05, + "loss": 0.5222, + "step": 14383 + }, + { + "epoch": 18.41152, + "grad_norm": 1.0162975788116455, + "learning_rate": 2.1246498599439777e-05, + "loss": 0.4884, + "step": 14384 + }, + { + "epoch": 18.4128, + "grad_norm": 1.079209804534912, + "learning_rate": 2.124449779911965e-05, + "loss": 0.5416, + "step": 14385 + }, + { + 
"epoch": 18.41408, + "grad_norm": 1.0865408182144165, + "learning_rate": 2.124249699879952e-05, + "loss": 0.5044, + "step": 14386 + }, + { + "epoch": 18.41536, + "grad_norm": 1.0583715438842773, + "learning_rate": 2.1240496198479393e-05, + "loss": 0.5338, + "step": 14387 + }, + { + "epoch": 18.41664, + "grad_norm": 1.0370880365371704, + "learning_rate": 2.1238495398159265e-05, + "loss": 0.4833, + "step": 14388 + }, + { + "epoch": 18.41792, + "grad_norm": 1.0636683702468872, + "learning_rate": 2.1236494597839137e-05, + "loss": 0.4915, + "step": 14389 + }, + { + "epoch": 18.4192, + "grad_norm": 1.079131841659546, + "learning_rate": 2.123449379751901e-05, + "loss": 0.4931, + "step": 14390 + }, + { + "epoch": 18.42048, + "grad_norm": 1.0714484453201294, + "learning_rate": 2.123249299719888e-05, + "loss": 0.4701, + "step": 14391 + }, + { + "epoch": 18.42176, + "grad_norm": 1.1398820877075195, + "learning_rate": 2.1230492196878752e-05, + "loss": 0.5147, + "step": 14392 + }, + { + "epoch": 18.42304, + "grad_norm": 1.0825884342193604, + "learning_rate": 2.1228491396558624e-05, + "loss": 0.4938, + "step": 14393 + }, + { + "epoch": 18.42432, + "grad_norm": 1.0806691646575928, + "learning_rate": 2.1226490596238496e-05, + "loss": 0.5118, + "step": 14394 + }, + { + "epoch": 18.4256, + "grad_norm": 1.0657216310501099, + "learning_rate": 2.1224489795918368e-05, + "loss": 0.5168, + "step": 14395 + }, + { + "epoch": 18.42688, + "grad_norm": 1.0217489004135132, + "learning_rate": 2.1222488995598243e-05, + "loss": 0.4725, + "step": 14396 + }, + { + "epoch": 18.42816, + "grad_norm": 1.0732911825180054, + "learning_rate": 2.122048819527811e-05, + "loss": 0.5417, + "step": 14397 + }, + { + "epoch": 18.42944, + "grad_norm": 1.091342568397522, + "learning_rate": 2.1218487394957983e-05, + "loss": 0.5097, + "step": 14398 + }, + { + "epoch": 18.43072, + "grad_norm": 1.0096945762634277, + "learning_rate": 2.1216486594637855e-05, + "loss": 0.4584, + "step": 14399 + }, + { + "epoch": 18.432, + 
"grad_norm": 0.9952658414840698, + "learning_rate": 2.121448579431773e-05, + "loss": 0.4642, + "step": 14400 + }, + { + "epoch": 18.43328, + "grad_norm": 1.108870506286621, + "learning_rate": 2.12124849939976e-05, + "loss": 0.5275, + "step": 14401 + }, + { + "epoch": 18.43456, + "grad_norm": 1.1087889671325684, + "learning_rate": 2.121048419367747e-05, + "loss": 0.5089, + "step": 14402 + }, + { + "epoch": 18.43584, + "grad_norm": 1.057420253753662, + "learning_rate": 2.1208483393357346e-05, + "loss": 0.513, + "step": 14403 + }, + { + "epoch": 18.43712, + "grad_norm": 1.1254922151565552, + "learning_rate": 2.1206482593037218e-05, + "loss": 0.5404, + "step": 14404 + }, + { + "epoch": 18.4384, + "grad_norm": 1.0429891347885132, + "learning_rate": 2.1204481792717086e-05, + "loss": 0.4935, + "step": 14405 + }, + { + "epoch": 18.43968, + "grad_norm": 1.089011549949646, + "learning_rate": 2.1202480992396958e-05, + "loss": 0.5318, + "step": 14406 + }, + { + "epoch": 18.44096, + "grad_norm": 1.080314040184021, + "learning_rate": 2.1200480192076833e-05, + "loss": 0.5239, + "step": 14407 + }, + { + "epoch": 18.44224, + "grad_norm": 1.047875165939331, + "learning_rate": 2.1198479391756705e-05, + "loss": 0.4918, + "step": 14408 + }, + { + "epoch": 18.44352, + "grad_norm": 1.0228180885314941, + "learning_rate": 2.1196478591436574e-05, + "loss": 0.4958, + "step": 14409 + }, + { + "epoch": 18.4448, + "grad_norm": 1.0188748836517334, + "learning_rate": 2.119447779111645e-05, + "loss": 0.4798, + "step": 14410 + }, + { + "epoch": 18.44608, + "grad_norm": 1.0608867406845093, + "learning_rate": 2.119247699079632e-05, + "loss": 0.5117, + "step": 14411 + }, + { + "epoch": 18.44736, + "grad_norm": 1.0850874185562134, + "learning_rate": 2.1190476190476193e-05, + "loss": 0.5156, + "step": 14412 + }, + { + "epoch": 18.44864, + "grad_norm": 1.093931794166565, + "learning_rate": 2.118847539015606e-05, + "loss": 0.5186, + "step": 14413 + }, + { + "epoch": 18.44992, + "grad_norm": 
1.0857752561569214, + "learning_rate": 2.1186474589835936e-05, + "loss": 0.5215, + "step": 14414 + }, + { + "epoch": 18.4512, + "grad_norm": 1.0228488445281982, + "learning_rate": 2.1184473789515808e-05, + "loss": 0.4972, + "step": 14415 + }, + { + "epoch": 18.45248, + "grad_norm": 0.979611337184906, + "learning_rate": 2.118247298919568e-05, + "loss": 0.4884, + "step": 14416 + }, + { + "epoch": 18.45376, + "grad_norm": 1.0769535303115845, + "learning_rate": 2.1180472188875552e-05, + "loss": 0.53, + "step": 14417 + }, + { + "epoch": 18.45504, + "grad_norm": 1.103134274482727, + "learning_rate": 2.1178471388555424e-05, + "loss": 0.5321, + "step": 14418 + }, + { + "epoch": 18.45632, + "grad_norm": 1.0389997959136963, + "learning_rate": 2.1176470588235296e-05, + "loss": 0.4775, + "step": 14419 + }, + { + "epoch": 18.4576, + "grad_norm": 1.014994740486145, + "learning_rate": 2.1174469787915168e-05, + "loss": 0.4521, + "step": 14420 + }, + { + "epoch": 18.45888, + "grad_norm": 1.0282139778137207, + "learning_rate": 2.117246898759504e-05, + "loss": 0.5012, + "step": 14421 + }, + { + "epoch": 18.46016, + "grad_norm": 1.0462567806243896, + "learning_rate": 2.117046818727491e-05, + "loss": 0.5489, + "step": 14422 + }, + { + "epoch": 18.46144, + "grad_norm": 1.0593222379684448, + "learning_rate": 2.1168467386954783e-05, + "loss": 0.528, + "step": 14423 + }, + { + "epoch": 18.46272, + "grad_norm": 1.0701686143875122, + "learning_rate": 2.1166466586634655e-05, + "loss": 0.4987, + "step": 14424 + }, + { + "epoch": 18.464, + "grad_norm": 1.0602664947509766, + "learning_rate": 2.1164465786314527e-05, + "loss": 0.503, + "step": 14425 + }, + { + "epoch": 18.46528, + "grad_norm": 1.018227458000183, + "learning_rate": 2.11624649859944e-05, + "loss": 0.4681, + "step": 14426 + }, + { + "epoch": 18.46656, + "grad_norm": 1.078520655632019, + "learning_rate": 2.116046418567427e-05, + "loss": 0.5599, + "step": 14427 + }, + { + "epoch": 18.46784, + "grad_norm": 1.0530571937561035, + 
"learning_rate": 2.1158463385354142e-05, + "loss": 0.4804, + "step": 14428 + }, + { + "epoch": 18.46912, + "grad_norm": 1.029566764831543, + "learning_rate": 2.1156462585034014e-05, + "loss": 0.4644, + "step": 14429 + }, + { + "epoch": 18.4704, + "grad_norm": 1.0319150686264038, + "learning_rate": 2.1154461784713886e-05, + "loss": 0.4924, + "step": 14430 + }, + { + "epoch": 18.47168, + "grad_norm": 1.114824891090393, + "learning_rate": 2.115246098439376e-05, + "loss": 0.57, + "step": 14431 + }, + { + "epoch": 18.47296, + "grad_norm": 1.093182921409607, + "learning_rate": 2.115046018407363e-05, + "loss": 0.552, + "step": 14432 + }, + { + "epoch": 18.47424, + "grad_norm": 1.050522804260254, + "learning_rate": 2.11484593837535e-05, + "loss": 0.4954, + "step": 14433 + }, + { + "epoch": 18.47552, + "grad_norm": 1.0565167665481567, + "learning_rate": 2.1146458583433374e-05, + "loss": 0.5101, + "step": 14434 + }, + { + "epoch": 18.4768, + "grad_norm": 1.065156102180481, + "learning_rate": 2.114445778311325e-05, + "loss": 0.5184, + "step": 14435 + }, + { + "epoch": 18.47808, + "grad_norm": 0.9988115429878235, + "learning_rate": 2.1142456982793117e-05, + "loss": 0.4639, + "step": 14436 + }, + { + "epoch": 18.47936, + "grad_norm": 1.0403028726577759, + "learning_rate": 2.114045618247299e-05, + "loss": 0.4858, + "step": 14437 + }, + { + "epoch": 18.48064, + "grad_norm": 1.0715668201446533, + "learning_rate": 2.113845538215286e-05, + "loss": 0.5176, + "step": 14438 + }, + { + "epoch": 18.48192, + "grad_norm": 0.9925726652145386, + "learning_rate": 2.1136454581832736e-05, + "loss": 0.4261, + "step": 14439 + }, + { + "epoch": 18.4832, + "grad_norm": 1.02934730052948, + "learning_rate": 2.1134453781512605e-05, + "loss": 0.491, + "step": 14440 + }, + { + "epoch": 18.48448, + "grad_norm": 1.0200088024139404, + "learning_rate": 2.1132452981192476e-05, + "loss": 0.4953, + "step": 14441 + }, + { + "epoch": 18.48576, + "grad_norm": 1.080019235610962, + "learning_rate": 
2.1130452180872352e-05, + "loss": 0.5132, + "step": 14442 + }, + { + "epoch": 18.48704, + "grad_norm": 1.0620362758636475, + "learning_rate": 2.1128451380552224e-05, + "loss": 0.5103, + "step": 14443 + }, + { + "epoch": 18.48832, + "grad_norm": 1.0634804964065552, + "learning_rate": 2.1126450580232092e-05, + "loss": 0.5085, + "step": 14444 + }, + { + "epoch": 18.4896, + "grad_norm": 1.110755443572998, + "learning_rate": 2.1124449779911964e-05, + "loss": 0.532, + "step": 14445 + }, + { + "epoch": 18.49088, + "grad_norm": 1.0549490451812744, + "learning_rate": 2.112244897959184e-05, + "loss": 0.4641, + "step": 14446 + }, + { + "epoch": 18.49216, + "grad_norm": 1.0575416088104248, + "learning_rate": 2.112044817927171e-05, + "loss": 0.4892, + "step": 14447 + }, + { + "epoch": 18.49344, + "grad_norm": 1.1010053157806396, + "learning_rate": 2.111844737895158e-05, + "loss": 0.4996, + "step": 14448 + }, + { + "epoch": 18.49472, + "grad_norm": 1.0477495193481445, + "learning_rate": 2.1116446578631455e-05, + "loss": 0.4811, + "step": 14449 + }, + { + "epoch": 18.496, + "grad_norm": 1.125105619430542, + "learning_rate": 2.1114445778311327e-05, + "loss": 0.5519, + "step": 14450 + }, + { + "epoch": 18.49728, + "grad_norm": 1.1108721494674683, + "learning_rate": 2.11124449779912e-05, + "loss": 0.5117, + "step": 14451 + }, + { + "epoch": 18.49856, + "grad_norm": 1.0684090852737427, + "learning_rate": 2.1110444177671067e-05, + "loss": 0.4912, + "step": 14452 + }, + { + "epoch": 18.49984, + "grad_norm": 1.1287049055099487, + "learning_rate": 2.1108443377350942e-05, + "loss": 0.5469, + "step": 14453 + }, + { + "epoch": 18.50112, + "grad_norm": 1.048940658569336, + "learning_rate": 2.1106442577030814e-05, + "loss": 0.4936, + "step": 14454 + }, + { + "epoch": 18.5024, + "grad_norm": 1.0817766189575195, + "learning_rate": 2.1104441776710686e-05, + "loss": 0.5038, + "step": 14455 + }, + { + "epoch": 18.50368, + "grad_norm": 1.0768336057662964, + "learning_rate": 2.1102440976390558e-05, 
+ "loss": 0.5887, + "step": 14456 + }, + { + "epoch": 18.50496, + "grad_norm": 1.0072085857391357, + "learning_rate": 2.110044017607043e-05, + "loss": 0.478, + "step": 14457 + }, + { + "epoch": 18.50624, + "grad_norm": 0.9800832867622375, + "learning_rate": 2.10984393757503e-05, + "loss": 0.4799, + "step": 14458 + }, + { + "epoch": 18.50752, + "grad_norm": 1.042318344116211, + "learning_rate": 2.1096438575430173e-05, + "loss": 0.5065, + "step": 14459 + }, + { + "epoch": 18.5088, + "grad_norm": 1.1013107299804688, + "learning_rate": 2.1094437775110045e-05, + "loss": 0.6008, + "step": 14460 + }, + { + "epoch": 18.51008, + "grad_norm": 0.9804616570472717, + "learning_rate": 2.1092436974789917e-05, + "loss": 0.4585, + "step": 14461 + }, + { + "epoch": 18.51136, + "grad_norm": 1.0614451169967651, + "learning_rate": 2.109043617446979e-05, + "loss": 0.4966, + "step": 14462 + }, + { + "epoch": 18.51264, + "grad_norm": 0.9891878962516785, + "learning_rate": 2.108843537414966e-05, + "loss": 0.516, + "step": 14463 + }, + { + "epoch": 18.51392, + "grad_norm": 1.0645172595977783, + "learning_rate": 2.1086434573829533e-05, + "loss": 0.5074, + "step": 14464 + }, + { + "epoch": 18.5152, + "grad_norm": 1.0641040802001953, + "learning_rate": 2.1084433773509404e-05, + "loss": 0.5201, + "step": 14465 + }, + { + "epoch": 18.51648, + "grad_norm": 0.9872971177101135, + "learning_rate": 2.1082432973189276e-05, + "loss": 0.4698, + "step": 14466 + }, + { + "epoch": 18.51776, + "grad_norm": 1.167199730873108, + "learning_rate": 2.1080432172869148e-05, + "loss": 0.5102, + "step": 14467 + }, + { + "epoch": 18.51904, + "grad_norm": 1.1635416746139526, + "learning_rate": 2.107843137254902e-05, + "loss": 0.5385, + "step": 14468 + }, + { + "epoch": 18.52032, + "grad_norm": 1.1156731843948364, + "learning_rate": 2.1076430572228892e-05, + "loss": 0.5442, + "step": 14469 + }, + { + "epoch": 18.5216, + "grad_norm": 1.0268969535827637, + "learning_rate": 2.1074429771908767e-05, + "loss": 0.4385, + 
"step": 14470 + }, + { + "epoch": 18.52288, + "grad_norm": 1.0554258823394775, + "learning_rate": 2.1072428971588636e-05, + "loss": 0.5172, + "step": 14471 + }, + { + "epoch": 18.52416, + "grad_norm": 1.07488214969635, + "learning_rate": 2.1070428171268507e-05, + "loss": 0.5373, + "step": 14472 + }, + { + "epoch": 18.52544, + "grad_norm": 0.9701617360115051, + "learning_rate": 2.106842737094838e-05, + "loss": 0.4699, + "step": 14473 + }, + { + "epoch": 18.52672, + "grad_norm": 1.0536491870880127, + "learning_rate": 2.1066426570628255e-05, + "loss": 0.5075, + "step": 14474 + }, + { + "epoch": 18.528, + "grad_norm": 1.1077998876571655, + "learning_rate": 2.1064425770308123e-05, + "loss": 0.559, + "step": 14475 + }, + { + "epoch": 18.52928, + "grad_norm": 1.043217658996582, + "learning_rate": 2.1062424969987995e-05, + "loss": 0.4921, + "step": 14476 + }, + { + "epoch": 18.53056, + "grad_norm": 1.0634691715240479, + "learning_rate": 2.106042416966787e-05, + "loss": 0.4999, + "step": 14477 + }, + { + "epoch": 18.53184, + "grad_norm": 1.024617075920105, + "learning_rate": 2.1058423369347742e-05, + "loss": 0.5021, + "step": 14478 + }, + { + "epoch": 18.53312, + "grad_norm": 1.026159405708313, + "learning_rate": 2.105642256902761e-05, + "loss": 0.5201, + "step": 14479 + }, + { + "epoch": 18.5344, + "grad_norm": 1.085721731185913, + "learning_rate": 2.1054421768707482e-05, + "loss": 0.5542, + "step": 14480 + }, + { + "epoch": 18.53568, + "grad_norm": 1.0542292594909668, + "learning_rate": 2.1052420968387358e-05, + "loss": 0.5239, + "step": 14481 + }, + { + "epoch": 18.53696, + "grad_norm": 1.0141193866729736, + "learning_rate": 2.105042016806723e-05, + "loss": 0.5116, + "step": 14482 + }, + { + "epoch": 18.538240000000002, + "grad_norm": 1.106048822402954, + "learning_rate": 2.1048419367747098e-05, + "loss": 0.5843, + "step": 14483 + }, + { + "epoch": 18.53952, + "grad_norm": 1.0620492696762085, + "learning_rate": 2.1046418567426973e-05, + "loss": 0.5049, + "step": 14484 + 
}, + { + "epoch": 18.5408, + "grad_norm": 1.0204299688339233, + "learning_rate": 2.1044417767106845e-05, + "loss": 0.507, + "step": 14485 + }, + { + "epoch": 18.54208, + "grad_norm": 1.0955986976623535, + "learning_rate": 2.1042416966786717e-05, + "loss": 0.5323, + "step": 14486 + }, + { + "epoch": 18.54336, + "grad_norm": 1.0099011659622192, + "learning_rate": 2.1040416166466585e-05, + "loss": 0.5007, + "step": 14487 + }, + { + "epoch": 18.54464, + "grad_norm": 0.9762060642242432, + "learning_rate": 2.103841536614646e-05, + "loss": 0.4201, + "step": 14488 + }, + { + "epoch": 18.54592, + "grad_norm": 1.060850739479065, + "learning_rate": 2.1036414565826332e-05, + "loss": 0.4815, + "step": 14489 + }, + { + "epoch": 18.5472, + "grad_norm": 1.077972173690796, + "learning_rate": 2.1034413765506204e-05, + "loss": 0.5148, + "step": 14490 + }, + { + "epoch": 18.54848, + "grad_norm": 1.1198941469192505, + "learning_rate": 2.1032412965186073e-05, + "loss": 0.5361, + "step": 14491 + }, + { + "epoch": 18.54976, + "grad_norm": 1.1299846172332764, + "learning_rate": 2.1030412164865948e-05, + "loss": 0.5398, + "step": 14492 + }, + { + "epoch": 18.55104, + "grad_norm": 0.9961510896682739, + "learning_rate": 2.102841136454582e-05, + "loss": 0.425, + "step": 14493 + }, + { + "epoch": 18.55232, + "grad_norm": 1.0336405038833618, + "learning_rate": 2.102641056422569e-05, + "loss": 0.5029, + "step": 14494 + }, + { + "epoch": 18.5536, + "grad_norm": 1.073876142501831, + "learning_rate": 2.1024409763905564e-05, + "loss": 0.4941, + "step": 14495 + }, + { + "epoch": 18.55488, + "grad_norm": 1.0787442922592163, + "learning_rate": 2.1022408963585435e-05, + "loss": 0.5394, + "step": 14496 + }, + { + "epoch": 18.55616, + "grad_norm": 1.0297425985336304, + "learning_rate": 2.1020408163265307e-05, + "loss": 0.4841, + "step": 14497 + }, + { + "epoch": 18.55744, + "grad_norm": 1.0471930503845215, + "learning_rate": 2.101840736294518e-05, + "loss": 0.4956, + "step": 14498 + }, + { + "epoch": 
18.55872, + "grad_norm": 1.0681778192520142, + "learning_rate": 2.101640656262505e-05, + "loss": 0.5614, + "step": 14499 + }, + { + "epoch": 18.56, + "grad_norm": 1.0257924795150757, + "learning_rate": 2.1014405762304923e-05, + "loss": 0.4542, + "step": 14500 + }, + { + "epoch": 18.56128, + "grad_norm": 1.028685450553894, + "learning_rate": 2.1012404961984795e-05, + "loss": 0.4824, + "step": 14501 + }, + { + "epoch": 18.56256, + "grad_norm": 1.0713627338409424, + "learning_rate": 2.1010404161664667e-05, + "loss": 0.543, + "step": 14502 + }, + { + "epoch": 18.56384, + "grad_norm": 1.0989772081375122, + "learning_rate": 2.100840336134454e-05, + "loss": 0.5345, + "step": 14503 + }, + { + "epoch": 18.56512, + "grad_norm": 1.0844435691833496, + "learning_rate": 2.100640256102441e-05, + "loss": 0.514, + "step": 14504 + }, + { + "epoch": 18.5664, + "grad_norm": 1.0661566257476807, + "learning_rate": 2.1004401760704282e-05, + "loss": 0.5177, + "step": 14505 + }, + { + "epoch": 18.56768, + "grad_norm": 1.0676720142364502, + "learning_rate": 2.1002400960384154e-05, + "loss": 0.5007, + "step": 14506 + }, + { + "epoch": 18.56896, + "grad_norm": 1.0638859272003174, + "learning_rate": 2.1000400160064026e-05, + "loss": 0.5185, + "step": 14507 + }, + { + "epoch": 18.57024, + "grad_norm": 1.020068883895874, + "learning_rate": 2.0998399359743898e-05, + "loss": 0.4734, + "step": 14508 + }, + { + "epoch": 18.57152, + "grad_norm": 1.103440523147583, + "learning_rate": 2.0996398559423773e-05, + "loss": 0.5404, + "step": 14509 + }, + { + "epoch": 18.5728, + "grad_norm": 1.064695119857788, + "learning_rate": 2.099439775910364e-05, + "loss": 0.5363, + "step": 14510 + }, + { + "epoch": 18.57408, + "grad_norm": 1.0438659191131592, + "learning_rate": 2.0992396958783513e-05, + "loss": 0.534, + "step": 14511 + }, + { + "epoch": 18.57536, + "grad_norm": 1.0252081155776978, + "learning_rate": 2.0990396158463385e-05, + "loss": 0.4815, + "step": 14512 + }, + { + "epoch": 18.57664, + "grad_norm": 
1.0469212532043457, + "learning_rate": 2.098839535814326e-05, + "loss": 0.5046, + "step": 14513 + }, + { + "epoch": 18.57792, + "grad_norm": 1.011010766029358, + "learning_rate": 2.098639455782313e-05, + "loss": 0.4726, + "step": 14514 + }, + { + "epoch": 18.5792, + "grad_norm": 1.0593057870864868, + "learning_rate": 2.0984393757503e-05, + "loss": 0.5198, + "step": 14515 + }, + { + "epoch": 18.58048, + "grad_norm": 1.0215234756469727, + "learning_rate": 2.0982392957182876e-05, + "loss": 0.5168, + "step": 14516 + }, + { + "epoch": 18.58176, + "grad_norm": 1.0414518117904663, + "learning_rate": 2.0980392156862748e-05, + "loss": 0.5049, + "step": 14517 + }, + { + "epoch": 18.58304, + "grad_norm": 1.1356487274169922, + "learning_rate": 2.0978391356542616e-05, + "loss": 0.5467, + "step": 14518 + }, + { + "epoch": 18.584319999999998, + "grad_norm": 1.0748487710952759, + "learning_rate": 2.0976390556222488e-05, + "loss": 0.5602, + "step": 14519 + }, + { + "epoch": 18.5856, + "grad_norm": 1.0394898653030396, + "learning_rate": 2.0974389755902363e-05, + "loss": 0.5286, + "step": 14520 + }, + { + "epoch": 18.58688, + "grad_norm": 1.0443836450576782, + "learning_rate": 2.0972388955582235e-05, + "loss": 0.5007, + "step": 14521 + }, + { + "epoch": 18.58816, + "grad_norm": 1.083796739578247, + "learning_rate": 2.0970388155262104e-05, + "loss": 0.5159, + "step": 14522 + }, + { + "epoch": 18.58944, + "grad_norm": 1.0455292463302612, + "learning_rate": 2.096838735494198e-05, + "loss": 0.4783, + "step": 14523 + }, + { + "epoch": 18.59072, + "grad_norm": 1.0481806993484497, + "learning_rate": 2.096638655462185e-05, + "loss": 0.5356, + "step": 14524 + }, + { + "epoch": 18.592, + "grad_norm": 0.9991112947463989, + "learning_rate": 2.0964385754301723e-05, + "loss": 0.479, + "step": 14525 + }, + { + "epoch": 18.59328, + "grad_norm": 1.079669713973999, + "learning_rate": 2.096238495398159e-05, + "loss": 0.5138, + "step": 14526 + }, + { + "epoch": 18.59456, + "grad_norm": 
1.0523369312286377, + "learning_rate": 2.0960384153661466e-05, + "loss": 0.4889, + "step": 14527 + }, + { + "epoch": 18.59584, + "grad_norm": 1.1188772916793823, + "learning_rate": 2.0958383353341338e-05, + "loss": 0.5201, + "step": 14528 + }, + { + "epoch": 18.59712, + "grad_norm": 1.111080527305603, + "learning_rate": 2.095638255302121e-05, + "loss": 0.4918, + "step": 14529 + }, + { + "epoch": 18.5984, + "grad_norm": 1.019318699836731, + "learning_rate": 2.0954381752701082e-05, + "loss": 0.4547, + "step": 14530 + }, + { + "epoch": 18.59968, + "grad_norm": 1.0236701965332031, + "learning_rate": 2.0952380952380954e-05, + "loss": 0.4956, + "step": 14531 + }, + { + "epoch": 18.60096, + "grad_norm": 1.06638503074646, + "learning_rate": 2.0950380152060826e-05, + "loss": 0.4941, + "step": 14532 + }, + { + "epoch": 18.60224, + "grad_norm": 1.0441734790802002, + "learning_rate": 2.0948379351740697e-05, + "loss": 0.4714, + "step": 14533 + }, + { + "epoch": 18.60352, + "grad_norm": 1.0851494073867798, + "learning_rate": 2.094637855142057e-05, + "loss": 0.5392, + "step": 14534 + }, + { + "epoch": 18.6048, + "grad_norm": 1.0086928606033325, + "learning_rate": 2.094437775110044e-05, + "loss": 0.4795, + "step": 14535 + }, + { + "epoch": 18.60608, + "grad_norm": 1.0636036396026611, + "learning_rate": 2.0942376950780313e-05, + "loss": 0.5342, + "step": 14536 + }, + { + "epoch": 18.60736, + "grad_norm": 1.0684483051300049, + "learning_rate": 2.0940376150460185e-05, + "loss": 0.5094, + "step": 14537 + }, + { + "epoch": 18.60864, + "grad_norm": 1.0535985231399536, + "learning_rate": 2.0938375350140057e-05, + "loss": 0.4892, + "step": 14538 + }, + { + "epoch": 18.60992, + "grad_norm": 1.0609737634658813, + "learning_rate": 2.093637454981993e-05, + "loss": 0.5016, + "step": 14539 + }, + { + "epoch": 18.6112, + "grad_norm": 1.0694406032562256, + "learning_rate": 2.09343737494998e-05, + "loss": 0.5134, + "step": 14540 + }, + { + "epoch": 18.61248, + "grad_norm": 1.0116039514541626, + 
"learning_rate": 2.0932372949179672e-05, + "loss": 0.4585, + "step": 14541 + }, + { + "epoch": 18.61376, + "grad_norm": 1.125255823135376, + "learning_rate": 2.0930372148859544e-05, + "loss": 0.5355, + "step": 14542 + }, + { + "epoch": 18.61504, + "grad_norm": 1.111659288406372, + "learning_rate": 2.0928371348539416e-05, + "loss": 0.5312, + "step": 14543 + }, + { + "epoch": 18.61632, + "grad_norm": 1.1183501482009888, + "learning_rate": 2.092637054821929e-05, + "loss": 0.5408, + "step": 14544 + }, + { + "epoch": 18.6176, + "grad_norm": 1.0673730373382568, + "learning_rate": 2.092436974789916e-05, + "loss": 0.5031, + "step": 14545 + }, + { + "epoch": 18.61888, + "grad_norm": 1.039713978767395, + "learning_rate": 2.092236894757903e-05, + "loss": 0.5115, + "step": 14546 + }, + { + "epoch": 18.62016, + "grad_norm": 1.0131070613861084, + "learning_rate": 2.0920368147258903e-05, + "loss": 0.4742, + "step": 14547 + }, + { + "epoch": 18.62144, + "grad_norm": 1.0833195447921753, + "learning_rate": 2.091836734693878e-05, + "loss": 0.5393, + "step": 14548 + }, + { + "epoch": 18.62272, + "grad_norm": 1.193190097808838, + "learning_rate": 2.0916366546618647e-05, + "loss": 0.5043, + "step": 14549 + }, + { + "epoch": 18.624, + "grad_norm": 1.0585458278656006, + "learning_rate": 2.091436574629852e-05, + "loss": 0.4841, + "step": 14550 + }, + { + "epoch": 18.62528, + "grad_norm": 1.0493146181106567, + "learning_rate": 2.091236494597839e-05, + "loss": 0.5084, + "step": 14551 + }, + { + "epoch": 18.62656, + "grad_norm": 1.0405904054641724, + "learning_rate": 2.0910364145658266e-05, + "loss": 0.5273, + "step": 14552 + }, + { + "epoch": 18.62784, + "grad_norm": 1.0221456289291382, + "learning_rate": 2.0908363345338135e-05, + "loss": 0.4846, + "step": 14553 + }, + { + "epoch": 18.62912, + "grad_norm": 1.0705711841583252, + "learning_rate": 2.0906362545018006e-05, + "loss": 0.5271, + "step": 14554 + }, + { + "epoch": 18.6304, + "grad_norm": 1.0186147689819336, + "learning_rate": 
2.0904361744697882e-05, + "loss": 0.4886, + "step": 14555 + }, + { + "epoch": 18.63168, + "grad_norm": 1.0867236852645874, + "learning_rate": 2.0902360944377754e-05, + "loss": 0.517, + "step": 14556 + }, + { + "epoch": 18.63296, + "grad_norm": 1.061066746711731, + "learning_rate": 2.0900360144057622e-05, + "loss": 0.5189, + "step": 14557 + }, + { + "epoch": 18.63424, + "grad_norm": 1.0720415115356445, + "learning_rate": 2.0898359343737494e-05, + "loss": 0.5382, + "step": 14558 + }, + { + "epoch": 18.63552, + "grad_norm": 1.061384916305542, + "learning_rate": 2.089635854341737e-05, + "loss": 0.5331, + "step": 14559 + }, + { + "epoch": 18.6368, + "grad_norm": 1.0454514026641846, + "learning_rate": 2.089435774309724e-05, + "loss": 0.4793, + "step": 14560 + }, + { + "epoch": 18.63808, + "grad_norm": 1.0604616403579712, + "learning_rate": 2.089235694277711e-05, + "loss": 0.4637, + "step": 14561 + }, + { + "epoch": 18.63936, + "grad_norm": 1.0945936441421509, + "learning_rate": 2.0890356142456985e-05, + "loss": 0.5413, + "step": 14562 + }, + { + "epoch": 18.64064, + "grad_norm": 1.0144293308258057, + "learning_rate": 2.0888355342136857e-05, + "loss": 0.5071, + "step": 14563 + }, + { + "epoch": 18.64192, + "grad_norm": 1.0849329233169556, + "learning_rate": 2.088635454181673e-05, + "loss": 0.5343, + "step": 14564 + }, + { + "epoch": 18.6432, + "grad_norm": 0.9900054931640625, + "learning_rate": 2.0884353741496597e-05, + "loss": 0.4709, + "step": 14565 + }, + { + "epoch": 18.64448, + "grad_norm": 1.0568320751190186, + "learning_rate": 2.0882352941176472e-05, + "loss": 0.4829, + "step": 14566 + }, + { + "epoch": 18.64576, + "grad_norm": 1.1182734966278076, + "learning_rate": 2.0880352140856344e-05, + "loss": 0.5742, + "step": 14567 + }, + { + "epoch": 18.64704, + "grad_norm": 1.1429896354675293, + "learning_rate": 2.0878351340536216e-05, + "loss": 0.5731, + "step": 14568 + }, + { + "epoch": 18.64832, + "grad_norm": 1.0689339637756348, + "learning_rate": 
2.0876350540216088e-05, + "loss": 0.496, + "step": 14569 + }, + { + "epoch": 18.6496, + "grad_norm": 1.0403820276260376, + "learning_rate": 2.087434973989596e-05, + "loss": 0.4286, + "step": 14570 + }, + { + "epoch": 18.65088, + "grad_norm": 1.0727622509002686, + "learning_rate": 2.087234893957583e-05, + "loss": 0.5124, + "step": 14571 + }, + { + "epoch": 18.65216, + "grad_norm": 1.0290236473083496, + "learning_rate": 2.0870348139255703e-05, + "loss": 0.4925, + "step": 14572 + }, + { + "epoch": 18.65344, + "grad_norm": 1.0462875366210938, + "learning_rate": 2.0868347338935575e-05, + "loss": 0.4666, + "step": 14573 + }, + { + "epoch": 18.65472, + "grad_norm": 1.0664476156234741, + "learning_rate": 2.0866346538615447e-05, + "loss": 0.5112, + "step": 14574 + }, + { + "epoch": 18.656, + "grad_norm": 1.0481314659118652, + "learning_rate": 2.086434573829532e-05, + "loss": 0.4763, + "step": 14575 + }, + { + "epoch": 18.65728, + "grad_norm": 1.0135314464569092, + "learning_rate": 2.086234493797519e-05, + "loss": 0.4885, + "step": 14576 + }, + { + "epoch": 18.65856, + "grad_norm": 0.9977083206176758, + "learning_rate": 2.0860344137655063e-05, + "loss": 0.4949, + "step": 14577 + }, + { + "epoch": 18.65984, + "grad_norm": 1.043670892715454, + "learning_rate": 2.0858343337334934e-05, + "loss": 0.5031, + "step": 14578 + }, + { + "epoch": 18.66112, + "grad_norm": 1.0433337688446045, + "learning_rate": 2.0856342537014806e-05, + "loss": 0.5061, + "step": 14579 + }, + { + "epoch": 18.6624, + "grad_norm": 1.006962537765503, + "learning_rate": 2.0854341736694678e-05, + "loss": 0.5039, + "step": 14580 + }, + { + "epoch": 18.66368, + "grad_norm": 1.046007513999939, + "learning_rate": 2.085234093637455e-05, + "loss": 0.5434, + "step": 14581 + }, + { + "epoch": 18.66496, + "grad_norm": 1.0263768434524536, + "learning_rate": 2.0850340136054422e-05, + "loss": 0.5117, + "step": 14582 + }, + { + "epoch": 18.66624, + "grad_norm": 1.0261712074279785, + "learning_rate": 2.0848339335734297e-05, 
+ "loss": 0.4705, + "step": 14583 + }, + { + "epoch": 18.66752, + "grad_norm": 1.0606975555419922, + "learning_rate": 2.0846338535414166e-05, + "loss": 0.4931, + "step": 14584 + }, + { + "epoch": 18.6688, + "grad_norm": 1.0955727100372314, + "learning_rate": 2.0844337735094037e-05, + "loss": 0.5149, + "step": 14585 + }, + { + "epoch": 18.67008, + "grad_norm": 0.9943707585334778, + "learning_rate": 2.084233693477391e-05, + "loss": 0.4668, + "step": 14586 + }, + { + "epoch": 18.67136, + "grad_norm": 1.082153081893921, + "learning_rate": 2.0840336134453785e-05, + "loss": 0.5313, + "step": 14587 + }, + { + "epoch": 18.67264, + "grad_norm": 1.0203741788864136, + "learning_rate": 2.0838335334133653e-05, + "loss": 0.4759, + "step": 14588 + }, + { + "epoch": 18.67392, + "grad_norm": 1.0411763191223145, + "learning_rate": 2.0836334533813525e-05, + "loss": 0.5068, + "step": 14589 + }, + { + "epoch": 18.6752, + "grad_norm": 1.0571635961532593, + "learning_rate": 2.08343337334934e-05, + "loss": 0.5386, + "step": 14590 + }, + { + "epoch": 18.67648, + "grad_norm": 1.0805469751358032, + "learning_rate": 2.0832332933173272e-05, + "loss": 0.5289, + "step": 14591 + }, + { + "epoch": 18.67776, + "grad_norm": 1.01336669921875, + "learning_rate": 2.083033213285314e-05, + "loss": 0.4634, + "step": 14592 + }, + { + "epoch": 18.67904, + "grad_norm": 1.0346022844314575, + "learning_rate": 2.0828331332533012e-05, + "loss": 0.504, + "step": 14593 + }, + { + "epoch": 18.680320000000002, + "grad_norm": 1.0586795806884766, + "learning_rate": 2.0826330532212888e-05, + "loss": 0.4977, + "step": 14594 + }, + { + "epoch": 18.6816, + "grad_norm": 1.0512717962265015, + "learning_rate": 2.082432973189276e-05, + "loss": 0.4652, + "step": 14595 + }, + { + "epoch": 18.68288, + "grad_norm": 1.0366772413253784, + "learning_rate": 2.0822328931572628e-05, + "loss": 0.4688, + "step": 14596 + }, + { + "epoch": 18.68416, + "grad_norm": 1.096342921257019, + "learning_rate": 2.0820328131252503e-05, + "loss": 
0.5036, + "step": 14597 + }, + { + "epoch": 18.68544, + "grad_norm": 1.0564435720443726, + "learning_rate": 2.0818327330932375e-05, + "loss": 0.4801, + "step": 14598 + }, + { + "epoch": 18.68672, + "grad_norm": 1.0218154191970825, + "learning_rate": 2.0816326530612247e-05, + "loss": 0.5102, + "step": 14599 + }, + { + "epoch": 18.688, + "grad_norm": 1.0529659986495972, + "learning_rate": 2.0814325730292115e-05, + "loss": 0.5007, + "step": 14600 + }, + { + "epoch": 18.68928, + "grad_norm": 1.0413590669631958, + "learning_rate": 2.081232492997199e-05, + "loss": 0.4678, + "step": 14601 + }, + { + "epoch": 18.69056, + "grad_norm": 1.0885825157165527, + "learning_rate": 2.0810324129651862e-05, + "loss": 0.5116, + "step": 14602 + }, + { + "epoch": 18.69184, + "grad_norm": 1.0246473550796509, + "learning_rate": 2.0808323329331734e-05, + "loss": 0.4818, + "step": 14603 + }, + { + "epoch": 18.69312, + "grad_norm": 1.1003787517547607, + "learning_rate": 2.0806322529011603e-05, + "loss": 0.5168, + "step": 14604 + }, + { + "epoch": 18.6944, + "grad_norm": 1.0684752464294434, + "learning_rate": 2.0804321728691478e-05, + "loss": 0.4637, + "step": 14605 + }, + { + "epoch": 18.69568, + "grad_norm": 1.1062880754470825, + "learning_rate": 2.080232092837135e-05, + "loss": 0.5151, + "step": 14606 + }, + { + "epoch": 18.69696, + "grad_norm": 1.0668532848358154, + "learning_rate": 2.080032012805122e-05, + "loss": 0.5465, + "step": 14607 + }, + { + "epoch": 18.69824, + "grad_norm": 1.0295997858047485, + "learning_rate": 2.0798319327731094e-05, + "loss": 0.4645, + "step": 14608 + }, + { + "epoch": 18.69952, + "grad_norm": 1.020206093788147, + "learning_rate": 2.0796318527410965e-05, + "loss": 0.4971, + "step": 14609 + }, + { + "epoch": 18.7008, + "grad_norm": 1.0235021114349365, + "learning_rate": 2.0794317727090837e-05, + "loss": 0.5198, + "step": 14610 + }, + { + "epoch": 18.70208, + "grad_norm": 1.0551282167434692, + "learning_rate": 2.079231692677071e-05, + "loss": 0.5083, + "step": 
14611 + }, + { + "epoch": 18.70336, + "grad_norm": 1.0582573413848877, + "learning_rate": 2.079031612645058e-05, + "loss": 0.5274, + "step": 14612 + }, + { + "epoch": 18.70464, + "grad_norm": 1.0594959259033203, + "learning_rate": 2.0788315326130453e-05, + "loss": 0.4954, + "step": 14613 + }, + { + "epoch": 18.70592, + "grad_norm": 1.0503864288330078, + "learning_rate": 2.0786314525810325e-05, + "loss": 0.5453, + "step": 14614 + }, + { + "epoch": 18.7072, + "grad_norm": 1.0637823343276978, + "learning_rate": 2.0784313725490197e-05, + "loss": 0.5736, + "step": 14615 + }, + { + "epoch": 18.70848, + "grad_norm": 1.0056936740875244, + "learning_rate": 2.078231292517007e-05, + "loss": 0.4663, + "step": 14616 + }, + { + "epoch": 18.70976, + "grad_norm": 1.081607699394226, + "learning_rate": 2.078031212484994e-05, + "loss": 0.5333, + "step": 14617 + }, + { + "epoch": 18.71104, + "grad_norm": 1.1000036001205444, + "learning_rate": 2.0778311324529812e-05, + "loss": 0.5505, + "step": 14618 + }, + { + "epoch": 18.71232, + "grad_norm": 1.0323201417922974, + "learning_rate": 2.0776310524209684e-05, + "loss": 0.4896, + "step": 14619 + }, + { + "epoch": 18.7136, + "grad_norm": 1.0627686977386475, + "learning_rate": 2.0774309723889556e-05, + "loss": 0.5104, + "step": 14620 + }, + { + "epoch": 18.71488, + "grad_norm": 1.0364760160446167, + "learning_rate": 2.0772308923569428e-05, + "loss": 0.5054, + "step": 14621 + }, + { + "epoch": 18.71616, + "grad_norm": 1.0415117740631104, + "learning_rate": 2.0770308123249303e-05, + "loss": 0.4796, + "step": 14622 + }, + { + "epoch": 18.71744, + "grad_norm": 1.0772327184677124, + "learning_rate": 2.076830732292917e-05, + "loss": 0.527, + "step": 14623 + }, + { + "epoch": 18.71872, + "grad_norm": 1.140191674232483, + "learning_rate": 2.0766306522609043e-05, + "loss": 0.5161, + "step": 14624 + }, + { + "epoch": 18.72, + "grad_norm": 1.1282105445861816, + "learning_rate": 2.0764305722288915e-05, + "loss": 0.5677, + "step": 14625 + }, + { + 
"epoch": 18.72128, + "grad_norm": 1.060404658317566, + "learning_rate": 2.076230492196879e-05, + "loss": 0.5116, + "step": 14626 + }, + { + "epoch": 18.72256, + "grad_norm": 1.0670077800750732, + "learning_rate": 2.076030412164866e-05, + "loss": 0.4953, + "step": 14627 + }, + { + "epoch": 18.72384, + "grad_norm": 1.036635398864746, + "learning_rate": 2.075830332132853e-05, + "loss": 0.4991, + "step": 14628 + }, + { + "epoch": 18.72512, + "grad_norm": 1.0683354139328003, + "learning_rate": 2.0756302521008406e-05, + "loss": 0.5511, + "step": 14629 + }, + { + "epoch": 18.7264, + "grad_norm": 1.094277024269104, + "learning_rate": 2.0754301720688278e-05, + "loss": 0.5233, + "step": 14630 + }, + { + "epoch": 18.72768, + "grad_norm": 1.093468189239502, + "learning_rate": 2.0752300920368146e-05, + "loss": 0.5616, + "step": 14631 + }, + { + "epoch": 18.72896, + "grad_norm": 1.0831431150436401, + "learning_rate": 2.0750300120048018e-05, + "loss": 0.5404, + "step": 14632 + }, + { + "epoch": 18.73024, + "grad_norm": 1.1620758771896362, + "learning_rate": 2.0748299319727893e-05, + "loss": 0.5539, + "step": 14633 + }, + { + "epoch": 18.73152, + "grad_norm": 1.077682614326477, + "learning_rate": 2.0746298519407765e-05, + "loss": 0.4842, + "step": 14634 + }, + { + "epoch": 18.7328, + "grad_norm": 1.0813980102539062, + "learning_rate": 2.0744297719087634e-05, + "loss": 0.4992, + "step": 14635 + }, + { + "epoch": 18.73408, + "grad_norm": 1.1097936630249023, + "learning_rate": 2.074229691876751e-05, + "loss": 0.569, + "step": 14636 + }, + { + "epoch": 18.73536, + "grad_norm": 1.0540353059768677, + "learning_rate": 2.074029611844738e-05, + "loss": 0.4809, + "step": 14637 + }, + { + "epoch": 18.73664, + "grad_norm": 1.0522795915603638, + "learning_rate": 2.0738295318127253e-05, + "loss": 0.5611, + "step": 14638 + }, + { + "epoch": 18.73792, + "grad_norm": 1.0796175003051758, + "learning_rate": 2.073629451780712e-05, + "loss": 0.522, + "step": 14639 + }, + { + "epoch": 18.7392, + 
"grad_norm": 1.0437400341033936, + "learning_rate": 2.0734293717486996e-05, + "loss": 0.503, + "step": 14640 + }, + { + "epoch": 18.74048, + "grad_norm": 1.0430114269256592, + "learning_rate": 2.0732292917166868e-05, + "loss": 0.4834, + "step": 14641 + }, + { + "epoch": 18.74176, + "grad_norm": 1.015177845954895, + "learning_rate": 2.073029211684674e-05, + "loss": 0.4529, + "step": 14642 + }, + { + "epoch": 18.74304, + "grad_norm": 1.0426579713821411, + "learning_rate": 2.0728291316526612e-05, + "loss": 0.4777, + "step": 14643 + }, + { + "epoch": 18.74432, + "grad_norm": 1.106436848640442, + "learning_rate": 2.0726290516206484e-05, + "loss": 0.5592, + "step": 14644 + }, + { + "epoch": 18.7456, + "grad_norm": 1.096053123474121, + "learning_rate": 2.0724289715886356e-05, + "loss": 0.5066, + "step": 14645 + }, + { + "epoch": 18.74688, + "grad_norm": 1.077257752418518, + "learning_rate": 2.0722288915566227e-05, + "loss": 0.4936, + "step": 14646 + }, + { + "epoch": 18.74816, + "grad_norm": 1.0427172183990479, + "learning_rate": 2.07202881152461e-05, + "loss": 0.5277, + "step": 14647 + }, + { + "epoch": 18.74944, + "grad_norm": 1.0899598598480225, + "learning_rate": 2.071828731492597e-05, + "loss": 0.5181, + "step": 14648 + }, + { + "epoch": 18.75072, + "grad_norm": 1.05521559715271, + "learning_rate": 2.0716286514605843e-05, + "loss": 0.5036, + "step": 14649 + }, + { + "epoch": 18.752, + "grad_norm": 1.0131468772888184, + "learning_rate": 2.0714285714285718e-05, + "loss": 0.4956, + "step": 14650 + }, + { + "epoch": 18.75328, + "grad_norm": 1.0845153331756592, + "learning_rate": 2.0712284913965587e-05, + "loss": 0.5164, + "step": 14651 + }, + { + "epoch": 18.75456, + "grad_norm": 1.0508294105529785, + "learning_rate": 2.071028411364546e-05, + "loss": 0.4958, + "step": 14652 + }, + { + "epoch": 18.75584, + "grad_norm": 1.0884485244750977, + "learning_rate": 2.070828331332533e-05, + "loss": 0.5257, + "step": 14653 + }, + { + "epoch": 18.75712, + "grad_norm": 
1.0835343599319458, + "learning_rate": 2.0706282513005206e-05, + "loss": 0.5102, + "step": 14654 + }, + { + "epoch": 18.7584, + "grad_norm": 1.0350970029830933, + "learning_rate": 2.0704281712685074e-05, + "loss": 0.4823, + "step": 14655 + }, + { + "epoch": 18.75968, + "grad_norm": 1.0485219955444336, + "learning_rate": 2.0702280912364946e-05, + "loss": 0.4848, + "step": 14656 + }, + { + "epoch": 18.76096, + "grad_norm": 1.053740382194519, + "learning_rate": 2.0700280112044818e-05, + "loss": 0.5062, + "step": 14657 + }, + { + "epoch": 18.76224, + "grad_norm": 1.0165501832962036, + "learning_rate": 2.0698279311724693e-05, + "loss": 0.4833, + "step": 14658 + }, + { + "epoch": 18.76352, + "grad_norm": 1.0042489767074585, + "learning_rate": 2.069627851140456e-05, + "loss": 0.4763, + "step": 14659 + }, + { + "epoch": 18.7648, + "grad_norm": 1.03883695602417, + "learning_rate": 2.0694277711084433e-05, + "loss": 0.5079, + "step": 14660 + }, + { + "epoch": 18.76608, + "grad_norm": 1.0785133838653564, + "learning_rate": 2.069227691076431e-05, + "loss": 0.5352, + "step": 14661 + }, + { + "epoch": 18.76736, + "grad_norm": 1.0216038227081299, + "learning_rate": 2.069027611044418e-05, + "loss": 0.4591, + "step": 14662 + }, + { + "epoch": 18.76864, + "grad_norm": 1.0611780881881714, + "learning_rate": 2.068827531012405e-05, + "loss": 0.5309, + "step": 14663 + }, + { + "epoch": 18.76992, + "grad_norm": 1.055109977722168, + "learning_rate": 2.068627450980392e-05, + "loss": 0.5455, + "step": 14664 + }, + { + "epoch": 18.7712, + "grad_norm": 1.064369559288025, + "learning_rate": 2.0684273709483796e-05, + "loss": 0.5099, + "step": 14665 + }, + { + "epoch": 18.77248, + "grad_norm": 1.0237936973571777, + "learning_rate": 2.0682272909163668e-05, + "loss": 0.4916, + "step": 14666 + }, + { + "epoch": 18.77376, + "grad_norm": 1.1315503120422363, + "learning_rate": 2.0680272108843536e-05, + "loss": 0.5178, + "step": 14667 + }, + { + "epoch": 18.77504, + "grad_norm": 1.0587965250015259, + 
"learning_rate": 2.0678271308523412e-05, + "loss": 0.5022, + "step": 14668 + }, + { + "epoch": 18.77632, + "grad_norm": 1.0879102945327759, + "learning_rate": 2.0676270508203284e-05, + "loss": 0.4688, + "step": 14669 + }, + { + "epoch": 18.7776, + "grad_norm": 1.0429378747940063, + "learning_rate": 2.0674269707883155e-05, + "loss": 0.483, + "step": 14670 + }, + { + "epoch": 18.77888, + "grad_norm": 1.0255515575408936, + "learning_rate": 2.0672268907563024e-05, + "loss": 0.49, + "step": 14671 + }, + { + "epoch": 18.78016, + "grad_norm": 1.1541240215301514, + "learning_rate": 2.06702681072429e-05, + "loss": 0.5842, + "step": 14672 + }, + { + "epoch": 18.78144, + "grad_norm": 1.0616282224655151, + "learning_rate": 2.066826730692277e-05, + "loss": 0.4856, + "step": 14673 + }, + { + "epoch": 18.78272, + "grad_norm": 1.0617082118988037, + "learning_rate": 2.0666266506602643e-05, + "loss": 0.4908, + "step": 14674 + }, + { + "epoch": 18.784, + "grad_norm": 1.1243386268615723, + "learning_rate": 2.0664265706282515e-05, + "loss": 0.5216, + "step": 14675 + }, + { + "epoch": 18.78528, + "grad_norm": 1.103163480758667, + "learning_rate": 2.0662264905962387e-05, + "loss": 0.5228, + "step": 14676 + }, + { + "epoch": 18.78656, + "grad_norm": 1.0311192274093628, + "learning_rate": 2.066026410564226e-05, + "loss": 0.5204, + "step": 14677 + }, + { + "epoch": 18.78784, + "grad_norm": 1.1256968975067139, + "learning_rate": 2.065826330532213e-05, + "loss": 0.5192, + "step": 14678 + }, + { + "epoch": 18.78912, + "grad_norm": 1.0992389917373657, + "learning_rate": 2.0656262505002002e-05, + "loss": 0.5015, + "step": 14679 + }, + { + "epoch": 18.790399999999998, + "grad_norm": 1.0932484865188599, + "learning_rate": 2.0654261704681874e-05, + "loss": 0.5712, + "step": 14680 + }, + { + "epoch": 18.79168, + "grad_norm": 1.0706785917282104, + "learning_rate": 2.0652260904361746e-05, + "loss": 0.5222, + "step": 14681 + }, + { + "epoch": 18.79296, + "grad_norm": 1.0165241956710815, + 
"learning_rate": 2.0650260104041618e-05, + "loss": 0.4922, + "step": 14682 + }, + { + "epoch": 18.79424, + "grad_norm": 1.0056289434432983, + "learning_rate": 2.064825930372149e-05, + "loss": 0.4751, + "step": 14683 + }, + { + "epoch": 18.79552, + "grad_norm": 1.0656893253326416, + "learning_rate": 2.064625850340136e-05, + "loss": 0.4966, + "step": 14684 + }, + { + "epoch": 18.7968, + "grad_norm": 1.0573127269744873, + "learning_rate": 2.0644257703081233e-05, + "loss": 0.4845, + "step": 14685 + }, + { + "epoch": 18.79808, + "grad_norm": 1.0371816158294678, + "learning_rate": 2.0642256902761105e-05, + "loss": 0.4863, + "step": 14686 + }, + { + "epoch": 18.79936, + "grad_norm": 1.0891573429107666, + "learning_rate": 2.0640256102440977e-05, + "loss": 0.5613, + "step": 14687 + }, + { + "epoch": 18.80064, + "grad_norm": 1.053895115852356, + "learning_rate": 2.063825530212085e-05, + "loss": 0.5053, + "step": 14688 + }, + { + "epoch": 18.80192, + "grad_norm": 1.0640217065811157, + "learning_rate": 2.0636254501800724e-05, + "loss": 0.5254, + "step": 14689 + }, + { + "epoch": 18.8032, + "grad_norm": 1.1202378273010254, + "learning_rate": 2.0634253701480593e-05, + "loss": 0.5121, + "step": 14690 + }, + { + "epoch": 18.80448, + "grad_norm": 1.0097864866256714, + "learning_rate": 2.0632252901160464e-05, + "loss": 0.4796, + "step": 14691 + }, + { + "epoch": 18.80576, + "grad_norm": 1.1119987964630127, + "learning_rate": 2.0630252100840336e-05, + "loss": 0.5685, + "step": 14692 + }, + { + "epoch": 18.80704, + "grad_norm": 1.0607649087905884, + "learning_rate": 2.062825130052021e-05, + "loss": 0.519, + "step": 14693 + }, + { + "epoch": 18.80832, + "grad_norm": 1.0458481311798096, + "learning_rate": 2.062625050020008e-05, + "loss": 0.4971, + "step": 14694 + }, + { + "epoch": 18.8096, + "grad_norm": 1.0886461734771729, + "learning_rate": 2.0624249699879952e-05, + "loss": 0.5249, + "step": 14695 + }, + { + "epoch": 18.81088, + "grad_norm": 1.0727638006210327, + "learning_rate": 
2.0622248899559827e-05, + "loss": 0.5262, + "step": 14696 + }, + { + "epoch": 18.81216, + "grad_norm": 1.00741446018219, + "learning_rate": 2.06202480992397e-05, + "loss": 0.5311, + "step": 14697 + }, + { + "epoch": 18.81344, + "grad_norm": 1.0804330110549927, + "learning_rate": 2.0618247298919567e-05, + "loss": 0.547, + "step": 14698 + }, + { + "epoch": 18.81472, + "grad_norm": 1.0355829000473022, + "learning_rate": 2.061624649859944e-05, + "loss": 0.5129, + "step": 14699 + }, + { + "epoch": 18.816, + "grad_norm": 1.0258690118789673, + "learning_rate": 2.0614245698279315e-05, + "loss": 0.4925, + "step": 14700 + }, + { + "epoch": 18.81728, + "grad_norm": 1.0901426076889038, + "learning_rate": 2.0612244897959186e-05, + "loss": 0.4997, + "step": 14701 + }, + { + "epoch": 18.81856, + "grad_norm": 1.072962760925293, + "learning_rate": 2.0610244097639055e-05, + "loss": 0.5183, + "step": 14702 + }, + { + "epoch": 18.81984, + "grad_norm": 1.004193902015686, + "learning_rate": 2.060824329731893e-05, + "loss": 0.5215, + "step": 14703 + }, + { + "epoch": 18.82112, + "grad_norm": 1.0526185035705566, + "learning_rate": 2.0606242496998802e-05, + "loss": 0.5038, + "step": 14704 + }, + { + "epoch": 18.822400000000002, + "grad_norm": 1.0493122339248657, + "learning_rate": 2.0604241696678674e-05, + "loss": 0.5235, + "step": 14705 + }, + { + "epoch": 18.82368, + "grad_norm": 0.9919229745864868, + "learning_rate": 2.0602240896358542e-05, + "loss": 0.496, + "step": 14706 + }, + { + "epoch": 18.82496, + "grad_norm": 1.027870535850525, + "learning_rate": 2.0600240096038418e-05, + "loss": 0.4837, + "step": 14707 + }, + { + "epoch": 18.82624, + "grad_norm": 1.0324792861938477, + "learning_rate": 2.059823929571829e-05, + "loss": 0.5041, + "step": 14708 + }, + { + "epoch": 18.82752, + "grad_norm": 1.0613089799880981, + "learning_rate": 2.059623849539816e-05, + "loss": 0.5293, + "step": 14709 + }, + { + "epoch": 18.8288, + "grad_norm": 1.0319715738296509, + "learning_rate": 
2.0594237695078033e-05, + "loss": 0.51, + "step": 14710 + }, + { + "epoch": 18.83008, + "grad_norm": 1.0587095022201538, + "learning_rate": 2.0592236894757905e-05, + "loss": 0.5019, + "step": 14711 + }, + { + "epoch": 18.83136, + "grad_norm": 1.0132025480270386, + "learning_rate": 2.0590236094437777e-05, + "loss": 0.4797, + "step": 14712 + }, + { + "epoch": 18.83264, + "grad_norm": 0.9891220331192017, + "learning_rate": 2.058823529411765e-05, + "loss": 0.4877, + "step": 14713 + }, + { + "epoch": 18.83392, + "grad_norm": 1.0485026836395264, + "learning_rate": 2.058623449379752e-05, + "loss": 0.5021, + "step": 14714 + }, + { + "epoch": 18.8352, + "grad_norm": 1.0996410846710205, + "learning_rate": 2.0584233693477392e-05, + "loss": 0.5152, + "step": 14715 + }, + { + "epoch": 18.83648, + "grad_norm": 1.065560221672058, + "learning_rate": 2.0582232893157264e-05, + "loss": 0.5149, + "step": 14716 + }, + { + "epoch": 18.83776, + "grad_norm": 1.12687349319458, + "learning_rate": 2.0580232092837136e-05, + "loss": 0.5378, + "step": 14717 + }, + { + "epoch": 18.83904, + "grad_norm": 1.1263388395309448, + "learning_rate": 2.0578231292517008e-05, + "loss": 0.5573, + "step": 14718 + }, + { + "epoch": 18.84032, + "grad_norm": 1.0918395519256592, + "learning_rate": 2.057623049219688e-05, + "loss": 0.5072, + "step": 14719 + }, + { + "epoch": 18.8416, + "grad_norm": 1.0265675783157349, + "learning_rate": 2.057422969187675e-05, + "loss": 0.506, + "step": 14720 + }, + { + "epoch": 18.84288, + "grad_norm": 0.9929207563400269, + "learning_rate": 2.0572228891556623e-05, + "loss": 0.4728, + "step": 14721 + }, + { + "epoch": 18.84416, + "grad_norm": 1.0715802907943726, + "learning_rate": 2.0570228091236495e-05, + "loss": 0.4897, + "step": 14722 + }, + { + "epoch": 18.84544, + "grad_norm": 0.9983776211738586, + "learning_rate": 2.0568227290916367e-05, + "loss": 0.481, + "step": 14723 + }, + { + "epoch": 18.84672, + "grad_norm": 1.0175827741622925, + "learning_rate": 2.056622649059624e-05, + 
"loss": 0.5271, + "step": 14724 + }, + { + "epoch": 18.848, + "grad_norm": 1.0712765455245972, + "learning_rate": 2.056422569027611e-05, + "loss": 0.5686, + "step": 14725 + }, + { + "epoch": 18.84928, + "grad_norm": 1.0037692785263062, + "learning_rate": 2.0562224889955983e-05, + "loss": 0.4798, + "step": 14726 + }, + { + "epoch": 18.85056, + "grad_norm": 0.9978422522544861, + "learning_rate": 2.0560224089635855e-05, + "loss": 0.44, + "step": 14727 + }, + { + "epoch": 18.85184, + "grad_norm": 1.021209955215454, + "learning_rate": 2.055822328931573e-05, + "loss": 0.4934, + "step": 14728 + }, + { + "epoch": 18.85312, + "grad_norm": 1.0505516529083252, + "learning_rate": 2.05562224889956e-05, + "loss": 0.5043, + "step": 14729 + }, + { + "epoch": 18.8544, + "grad_norm": 1.0562058687210083, + "learning_rate": 2.055422168867547e-05, + "loss": 0.5147, + "step": 14730 + }, + { + "epoch": 18.85568, + "grad_norm": 1.0676435232162476, + "learning_rate": 2.0552220888355342e-05, + "loss": 0.5092, + "step": 14731 + }, + { + "epoch": 18.85696, + "grad_norm": 1.0419723987579346, + "learning_rate": 2.0550220088035217e-05, + "loss": 0.529, + "step": 14732 + }, + { + "epoch": 18.85824, + "grad_norm": 1.0583957433700562, + "learning_rate": 2.0548219287715086e-05, + "loss": 0.4971, + "step": 14733 + }, + { + "epoch": 18.85952, + "grad_norm": 1.0169494152069092, + "learning_rate": 2.0546218487394958e-05, + "loss": 0.508, + "step": 14734 + }, + { + "epoch": 18.8608, + "grad_norm": 1.019565224647522, + "learning_rate": 2.0544217687074833e-05, + "loss": 0.4782, + "step": 14735 + }, + { + "epoch": 18.86208, + "grad_norm": 1.087472915649414, + "learning_rate": 2.0542216886754705e-05, + "loss": 0.5106, + "step": 14736 + }, + { + "epoch": 18.86336, + "grad_norm": 1.0848655700683594, + "learning_rate": 2.0540216086434573e-05, + "loss": 0.4834, + "step": 14737 + }, + { + "epoch": 18.86464, + "grad_norm": 1.049041509628296, + "learning_rate": 2.0538215286114445e-05, + "loss": 0.5162, + "step": 
14738 + }, + { + "epoch": 18.86592, + "grad_norm": 1.0465503931045532, + "learning_rate": 2.053621448579432e-05, + "loss": 0.5309, + "step": 14739 + }, + { + "epoch": 18.8672, + "grad_norm": 1.0960649251937866, + "learning_rate": 2.0534213685474192e-05, + "loss": 0.4926, + "step": 14740 + }, + { + "epoch": 18.86848, + "grad_norm": 1.0588141679763794, + "learning_rate": 2.053221288515406e-05, + "loss": 0.5142, + "step": 14741 + }, + { + "epoch": 18.86976, + "grad_norm": 1.0418102741241455, + "learning_rate": 2.0530212084833936e-05, + "loss": 0.4991, + "step": 14742 + }, + { + "epoch": 18.87104, + "grad_norm": 1.0588616132736206, + "learning_rate": 2.0528211284513808e-05, + "loss": 0.5113, + "step": 14743 + }, + { + "epoch": 18.87232, + "grad_norm": 1.0481520891189575, + "learning_rate": 2.052621048419368e-05, + "loss": 0.474, + "step": 14744 + }, + { + "epoch": 18.8736, + "grad_norm": 1.0571566820144653, + "learning_rate": 2.0524209683873548e-05, + "loss": 0.5072, + "step": 14745 + }, + { + "epoch": 18.87488, + "grad_norm": 1.0196195840835571, + "learning_rate": 2.0522208883553423e-05, + "loss": 0.535, + "step": 14746 + }, + { + "epoch": 18.87616, + "grad_norm": 1.1058493852615356, + "learning_rate": 2.0520208083233295e-05, + "loss": 0.5374, + "step": 14747 + }, + { + "epoch": 18.87744, + "grad_norm": 1.0693318843841553, + "learning_rate": 2.0518207282913167e-05, + "loss": 0.4748, + "step": 14748 + }, + { + "epoch": 18.87872, + "grad_norm": 1.0735576152801514, + "learning_rate": 2.051620648259304e-05, + "loss": 0.5005, + "step": 14749 + }, + { + "epoch": 18.88, + "grad_norm": 1.0688385963439941, + "learning_rate": 2.051420568227291e-05, + "loss": 0.5158, + "step": 14750 + }, + { + "epoch": 18.88128, + "grad_norm": 1.0036412477493286, + "learning_rate": 2.0512204881952783e-05, + "loss": 0.5041, + "step": 14751 + }, + { + "epoch": 18.88256, + "grad_norm": 1.089450478553772, + "learning_rate": 2.0510204081632654e-05, + "loss": 0.5485, + "step": 14752 + }, + { + 
"epoch": 18.88384, + "grad_norm": 1.050765872001648, + "learning_rate": 2.0508203281312526e-05, + "loss": 0.496, + "step": 14753 + }, + { + "epoch": 18.88512, + "grad_norm": 1.0228477716445923, + "learning_rate": 2.0506202480992398e-05, + "loss": 0.5171, + "step": 14754 + }, + { + "epoch": 18.8864, + "grad_norm": 1.0970298051834106, + "learning_rate": 2.050420168067227e-05, + "loss": 0.5334, + "step": 14755 + }, + { + "epoch": 18.88768, + "grad_norm": 1.0475279092788696, + "learning_rate": 2.0502200880352142e-05, + "loss": 0.4951, + "step": 14756 + }, + { + "epoch": 18.88896, + "grad_norm": 1.0471172332763672, + "learning_rate": 2.0500200080032014e-05, + "loss": 0.5046, + "step": 14757 + }, + { + "epoch": 18.89024, + "grad_norm": 1.0250929594039917, + "learning_rate": 2.0498199279711886e-05, + "loss": 0.4712, + "step": 14758 + }, + { + "epoch": 18.89152, + "grad_norm": 1.0119768381118774, + "learning_rate": 2.0496198479391757e-05, + "loss": 0.5076, + "step": 14759 + }, + { + "epoch": 18.8928, + "grad_norm": 1.1097235679626465, + "learning_rate": 2.049419767907163e-05, + "loss": 0.5243, + "step": 14760 + }, + { + "epoch": 18.89408, + "grad_norm": 1.0308051109313965, + "learning_rate": 2.04921968787515e-05, + "loss": 0.5459, + "step": 14761 + }, + { + "epoch": 18.89536, + "grad_norm": 1.027444839477539, + "learning_rate": 2.0490196078431373e-05, + "loss": 0.532, + "step": 14762 + }, + { + "epoch": 18.89664, + "grad_norm": 1.0802733898162842, + "learning_rate": 2.0488195278111248e-05, + "loss": 0.5034, + "step": 14763 + }, + { + "epoch": 18.89792, + "grad_norm": 1.0943602323532104, + "learning_rate": 2.0486194477791117e-05, + "loss": 0.5108, + "step": 14764 + }, + { + "epoch": 18.8992, + "grad_norm": 1.0663763284683228, + "learning_rate": 2.048419367747099e-05, + "loss": 0.4789, + "step": 14765 + }, + { + "epoch": 18.90048, + "grad_norm": 1.0287508964538574, + "learning_rate": 2.048219287715086e-05, + "loss": 0.4964, + "step": 14766 + }, + { + "epoch": 18.90176, + 
"grad_norm": 1.0889275074005127, + "learning_rate": 2.0480192076830736e-05, + "loss": 0.513, + "step": 14767 + }, + { + "epoch": 18.90304, + "grad_norm": 1.0352449417114258, + "learning_rate": 2.0478191276510604e-05, + "loss": 0.4871, + "step": 14768 + }, + { + "epoch": 18.90432, + "grad_norm": 1.0806697607040405, + "learning_rate": 2.0476190476190476e-05, + "loss": 0.5069, + "step": 14769 + }, + { + "epoch": 18.9056, + "grad_norm": 1.0483494997024536, + "learning_rate": 2.0474189675870348e-05, + "loss": 0.493, + "step": 14770 + }, + { + "epoch": 18.90688, + "grad_norm": 0.9899161458015442, + "learning_rate": 2.0472188875550223e-05, + "loss": 0.4689, + "step": 14771 + }, + { + "epoch": 18.90816, + "grad_norm": 0.9991376399993896, + "learning_rate": 2.047018807523009e-05, + "loss": 0.441, + "step": 14772 + }, + { + "epoch": 18.90944, + "grad_norm": 1.0828230381011963, + "learning_rate": 2.0468187274909963e-05, + "loss": 0.4842, + "step": 14773 + }, + { + "epoch": 18.91072, + "grad_norm": 1.0319201946258545, + "learning_rate": 2.046618647458984e-05, + "loss": 0.4966, + "step": 14774 + }, + { + "epoch": 18.912, + "grad_norm": 1.1114946603775024, + "learning_rate": 2.046418567426971e-05, + "loss": 0.5483, + "step": 14775 + }, + { + "epoch": 18.91328, + "grad_norm": 1.1752065420150757, + "learning_rate": 2.046218487394958e-05, + "loss": 0.5939, + "step": 14776 + }, + { + "epoch": 18.91456, + "grad_norm": 1.0463483333587646, + "learning_rate": 2.046018407362945e-05, + "loss": 0.509, + "step": 14777 + }, + { + "epoch": 18.91584, + "grad_norm": 1.0437871217727661, + "learning_rate": 2.0458183273309326e-05, + "loss": 0.4781, + "step": 14778 + }, + { + "epoch": 18.91712, + "grad_norm": 1.054076910018921, + "learning_rate": 2.0456182472989198e-05, + "loss": 0.5121, + "step": 14779 + }, + { + "epoch": 18.9184, + "grad_norm": 1.0515813827514648, + "learning_rate": 2.0454181672669066e-05, + "loss": 0.5137, + "step": 14780 + }, + { + "epoch": 18.91968, + "grad_norm": 
1.0603755712509155, + "learning_rate": 2.045218087234894e-05, + "loss": 0.4898, + "step": 14781 + }, + { + "epoch": 18.92096, + "grad_norm": 1.0575968027114868, + "learning_rate": 2.0450180072028814e-05, + "loss": 0.497, + "step": 14782 + }, + { + "epoch": 18.92224, + "grad_norm": 1.0913869142532349, + "learning_rate": 2.0448179271708685e-05, + "loss": 0.5125, + "step": 14783 + }, + { + "epoch": 18.92352, + "grad_norm": 1.018833041191101, + "learning_rate": 2.0446178471388554e-05, + "loss": 0.4733, + "step": 14784 + }, + { + "epoch": 18.9248, + "grad_norm": 1.0295926332473755, + "learning_rate": 2.044417767106843e-05, + "loss": 0.4817, + "step": 14785 + }, + { + "epoch": 18.92608, + "grad_norm": 1.0770633220672607, + "learning_rate": 2.04421768707483e-05, + "loss": 0.4856, + "step": 14786 + }, + { + "epoch": 18.92736, + "grad_norm": 1.0353119373321533, + "learning_rate": 2.0440176070428173e-05, + "loss": 0.5007, + "step": 14787 + }, + { + "epoch": 18.92864, + "grad_norm": 1.082660436630249, + "learning_rate": 2.0438175270108045e-05, + "loss": 0.5117, + "step": 14788 + }, + { + "epoch": 18.92992, + "grad_norm": 1.0934934616088867, + "learning_rate": 2.0436174469787917e-05, + "loss": 0.5001, + "step": 14789 + }, + { + "epoch": 18.9312, + "grad_norm": 1.0217633247375488, + "learning_rate": 2.043417366946779e-05, + "loss": 0.5105, + "step": 14790 + }, + { + "epoch": 18.932479999999998, + "grad_norm": 0.9955496788024902, + "learning_rate": 2.043217286914766e-05, + "loss": 0.4753, + "step": 14791 + }, + { + "epoch": 18.93376, + "grad_norm": 1.0870311260223389, + "learning_rate": 2.0430172068827532e-05, + "loss": 0.5214, + "step": 14792 + }, + { + "epoch": 18.93504, + "grad_norm": 1.0062708854675293, + "learning_rate": 2.0428171268507404e-05, + "loss": 0.471, + "step": 14793 + }, + { + "epoch": 18.93632, + "grad_norm": 1.0651724338531494, + "learning_rate": 2.0426170468187276e-05, + "loss": 0.5747, + "step": 14794 + }, + { + "epoch": 18.9376, + "grad_norm": 
0.9868699908256531, + "learning_rate": 2.0424169667867148e-05, + "loss": 0.4606, + "step": 14795 + }, + { + "epoch": 18.93888, + "grad_norm": 1.0331077575683594, + "learning_rate": 2.042216886754702e-05, + "loss": 0.5139, + "step": 14796 + }, + { + "epoch": 18.94016, + "grad_norm": 1.0682381391525269, + "learning_rate": 2.042016806722689e-05, + "loss": 0.4899, + "step": 14797 + }, + { + "epoch": 18.94144, + "grad_norm": 1.1549947261810303, + "learning_rate": 2.0418167266906763e-05, + "loss": 0.5366, + "step": 14798 + }, + { + "epoch": 18.94272, + "grad_norm": 1.0490201711654663, + "learning_rate": 2.0416166466586635e-05, + "loss": 0.5038, + "step": 14799 + }, + { + "epoch": 18.944, + "grad_norm": 1.097759485244751, + "learning_rate": 2.0414165666266507e-05, + "loss": 0.5136, + "step": 14800 + }, + { + "epoch": 18.94528, + "grad_norm": 1.0571385622024536, + "learning_rate": 2.041216486594638e-05, + "loss": 0.4652, + "step": 14801 + }, + { + "epoch": 18.94656, + "grad_norm": 1.071684718132019, + "learning_rate": 2.0410164065626254e-05, + "loss": 0.5625, + "step": 14802 + }, + { + "epoch": 18.94784, + "grad_norm": 1.0939278602600098, + "learning_rate": 2.0408163265306123e-05, + "loss": 0.5401, + "step": 14803 + }, + { + "epoch": 18.94912, + "grad_norm": 1.045653223991394, + "learning_rate": 2.0406162464985994e-05, + "loss": 0.5289, + "step": 14804 + }, + { + "epoch": 18.9504, + "grad_norm": 1.070016622543335, + "learning_rate": 2.0404161664665866e-05, + "loss": 0.542, + "step": 14805 + }, + { + "epoch": 18.95168, + "grad_norm": 0.992372989654541, + "learning_rate": 2.040216086434574e-05, + "loss": 0.4731, + "step": 14806 + }, + { + "epoch": 18.95296, + "grad_norm": 1.062925100326538, + "learning_rate": 2.040016006402561e-05, + "loss": 0.5159, + "step": 14807 + }, + { + "epoch": 18.95424, + "grad_norm": 1.0661933422088623, + "learning_rate": 2.0398159263705482e-05, + "loss": 0.5155, + "step": 14808 + }, + { + "epoch": 18.95552, + "grad_norm": 1.0385395288467407, + 
"learning_rate": 2.0396158463385357e-05, + "loss": 0.4745, + "step": 14809 + }, + { + "epoch": 18.9568, + "grad_norm": 1.0102157592773438, + "learning_rate": 2.039415766306523e-05, + "loss": 0.4902, + "step": 14810 + }, + { + "epoch": 18.95808, + "grad_norm": 1.0179786682128906, + "learning_rate": 2.0392156862745097e-05, + "loss": 0.4641, + "step": 14811 + }, + { + "epoch": 18.95936, + "grad_norm": 1.062633991241455, + "learning_rate": 2.039015606242497e-05, + "loss": 0.5406, + "step": 14812 + }, + { + "epoch": 18.96064, + "grad_norm": 1.0476435422897339, + "learning_rate": 2.0388155262104844e-05, + "loss": 0.5176, + "step": 14813 + }, + { + "epoch": 18.96192, + "grad_norm": 1.0489333868026733, + "learning_rate": 2.0386154461784716e-05, + "loss": 0.5355, + "step": 14814 + }, + { + "epoch": 18.9632, + "grad_norm": 1.0147716999053955, + "learning_rate": 2.0384153661464585e-05, + "loss": 0.5249, + "step": 14815 + }, + { + "epoch": 18.964480000000002, + "grad_norm": 1.0759072303771973, + "learning_rate": 2.038215286114446e-05, + "loss": 0.4902, + "step": 14816 + }, + { + "epoch": 18.96576, + "grad_norm": 1.095579743385315, + "learning_rate": 2.0380152060824332e-05, + "loss": 0.5045, + "step": 14817 + }, + { + "epoch": 18.96704, + "grad_norm": 1.0580558776855469, + "learning_rate": 2.0378151260504204e-05, + "loss": 0.5302, + "step": 14818 + }, + { + "epoch": 18.96832, + "grad_norm": 1.0755257606506348, + "learning_rate": 2.0376150460184072e-05, + "loss": 0.5294, + "step": 14819 + }, + { + "epoch": 18.9696, + "grad_norm": 1.0983492136001587, + "learning_rate": 2.0374149659863947e-05, + "loss": 0.5071, + "step": 14820 + }, + { + "epoch": 18.97088, + "grad_norm": 1.050110936164856, + "learning_rate": 2.037214885954382e-05, + "loss": 0.4988, + "step": 14821 + }, + { + "epoch": 18.97216, + "grad_norm": 1.052294135093689, + "learning_rate": 2.037014805922369e-05, + "loss": 0.5414, + "step": 14822 + }, + { + "epoch": 18.97344, + "grad_norm": 1.1228139400482178, + 
"learning_rate": 2.036814725890356e-05, + "loss": 0.535, + "step": 14823 + }, + { + "epoch": 18.97472, + "grad_norm": 1.0272384881973267, + "learning_rate": 2.0366146458583435e-05, + "loss": 0.4597, + "step": 14824 + }, + { + "epoch": 18.976, + "grad_norm": 1.0405932664871216, + "learning_rate": 2.0364145658263307e-05, + "loss": 0.4597, + "step": 14825 + }, + { + "epoch": 18.97728, + "grad_norm": 1.1259437799453735, + "learning_rate": 2.036214485794318e-05, + "loss": 0.5336, + "step": 14826 + }, + { + "epoch": 18.97856, + "grad_norm": 1.0937503576278687, + "learning_rate": 2.036014405762305e-05, + "loss": 0.4824, + "step": 14827 + }, + { + "epoch": 18.97984, + "grad_norm": 1.0712480545043945, + "learning_rate": 2.0358143257302922e-05, + "loss": 0.5119, + "step": 14828 + }, + { + "epoch": 18.98112, + "grad_norm": 1.1139438152313232, + "learning_rate": 2.0356142456982794e-05, + "loss": 0.5282, + "step": 14829 + }, + { + "epoch": 18.9824, + "grad_norm": 1.103958010673523, + "learning_rate": 2.0354141656662666e-05, + "loss": 0.5595, + "step": 14830 + }, + { + "epoch": 18.98368, + "grad_norm": 1.0063518285751343, + "learning_rate": 2.0352140856342538e-05, + "loss": 0.4573, + "step": 14831 + }, + { + "epoch": 18.98496, + "grad_norm": 1.033522605895996, + "learning_rate": 2.035014005602241e-05, + "loss": 0.4847, + "step": 14832 + }, + { + "epoch": 18.98624, + "grad_norm": 1.0984948873519897, + "learning_rate": 2.034813925570228e-05, + "loss": 0.5332, + "step": 14833 + }, + { + "epoch": 18.98752, + "grad_norm": 1.1030117273330688, + "learning_rate": 2.0346138455382153e-05, + "loss": 0.538, + "step": 14834 + }, + { + "epoch": 18.9888, + "grad_norm": 1.097759485244751, + "learning_rate": 2.0344137655062025e-05, + "loss": 0.5191, + "step": 14835 + }, + { + "epoch": 18.99008, + "grad_norm": 1.111032485961914, + "learning_rate": 2.0342136854741897e-05, + "loss": 0.4796, + "step": 14836 + }, + { + "epoch": 18.99136, + "grad_norm": 1.1036945581436157, + "learning_rate": 
2.034013605442177e-05, + "loss": 0.5219, + "step": 14837 + }, + { + "epoch": 18.99264, + "grad_norm": 1.1179360151290894, + "learning_rate": 2.033813525410164e-05, + "loss": 0.5537, + "step": 14838 + }, + { + "epoch": 18.99392, + "grad_norm": 1.048097848892212, + "learning_rate": 2.0336134453781513e-05, + "loss": 0.5053, + "step": 14839 + }, + { + "epoch": 18.9952, + "grad_norm": 1.0695916414260864, + "learning_rate": 2.0334133653461385e-05, + "loss": 0.5526, + "step": 14840 + }, + { + "epoch": 18.99648, + "grad_norm": 1.0661946535110474, + "learning_rate": 2.033213285314126e-05, + "loss": 0.5055, + "step": 14841 + }, + { + "epoch": 18.99776, + "grad_norm": 1.0300498008728027, + "learning_rate": 2.033013205282113e-05, + "loss": 0.4859, + "step": 14842 + }, + { + "epoch": 18.99904, + "grad_norm": 1.1209156513214111, + "learning_rate": 2.0328131252501e-05, + "loss": 0.5599, + "step": 14843 + }, + { + "epoch": 19.00032, + "grad_norm": null, + "learning_rate": 2.0328131252501e-05, + "loss": 0.9375, + "step": 14844 + }, + { + "epoch": 19.0016, + "grad_norm": 0.9681119918823242, + "learning_rate": 2.0326130452180872e-05, + "loss": 0.501, + "step": 14845 + }, + { + "epoch": 19.00288, + "grad_norm": 1.0563879013061523, + "learning_rate": 2.0324129651860747e-05, + "loss": 0.4601, + "step": 14846 + }, + { + "epoch": 19.00416, + "grad_norm": 1.0467544794082642, + "learning_rate": 2.0322128851540616e-05, + "loss": 0.5335, + "step": 14847 + }, + { + "epoch": 19.00544, + "grad_norm": 1.031529188156128, + "learning_rate": 2.0320128051220488e-05, + "loss": 0.463, + "step": 14848 + }, + { + "epoch": 19.00672, + "grad_norm": 1.036137580871582, + "learning_rate": 2.0318127250900363e-05, + "loss": 0.4777, + "step": 14849 + }, + { + "epoch": 19.008, + "grad_norm": 1.0127391815185547, + "learning_rate": 2.0316126450580235e-05, + "loss": 0.4845, + "step": 14850 + }, + { + "epoch": 19.00928, + "grad_norm": 1.0757827758789062, + "learning_rate": 2.0314125650260103e-05, + "loss": 
0.5228, + "step": 14851 + }, + { + "epoch": 19.01056, + "grad_norm": 1.035569190979004, + "learning_rate": 2.0312124849939975e-05, + "loss": 0.4674, + "step": 14852 + }, + { + "epoch": 19.01184, + "grad_norm": 1.0188312530517578, + "learning_rate": 2.031012404961985e-05, + "loss": 0.486, + "step": 14853 + }, + { + "epoch": 19.01312, + "grad_norm": 1.087680697441101, + "learning_rate": 2.0308123249299722e-05, + "loss": 0.4803, + "step": 14854 + }, + { + "epoch": 19.0144, + "grad_norm": 1.090846061706543, + "learning_rate": 2.030612244897959e-05, + "loss": 0.473, + "step": 14855 + }, + { + "epoch": 19.01568, + "grad_norm": 1.1319047212600708, + "learning_rate": 2.0304121648659466e-05, + "loss": 0.5024, + "step": 14856 + }, + { + "epoch": 19.01696, + "grad_norm": 1.1232948303222656, + "learning_rate": 2.0302120848339338e-05, + "loss": 0.5176, + "step": 14857 + }, + { + "epoch": 19.01824, + "grad_norm": 1.08295476436615, + "learning_rate": 2.030012004801921e-05, + "loss": 0.4821, + "step": 14858 + }, + { + "epoch": 19.01952, + "grad_norm": 1.0505188703536987, + "learning_rate": 2.0298119247699078e-05, + "loss": 0.4906, + "step": 14859 + }, + { + "epoch": 19.0208, + "grad_norm": 1.0271227359771729, + "learning_rate": 2.0296118447378953e-05, + "loss": 0.4796, + "step": 14860 + }, + { + "epoch": 19.02208, + "grad_norm": 1.0881240367889404, + "learning_rate": 2.0294117647058825e-05, + "loss": 0.4729, + "step": 14861 + }, + { + "epoch": 19.02336, + "grad_norm": 1.1086496114730835, + "learning_rate": 2.0292116846738697e-05, + "loss": 0.4996, + "step": 14862 + }, + { + "epoch": 19.02464, + "grad_norm": 1.087936520576477, + "learning_rate": 2.029011604641857e-05, + "loss": 0.4926, + "step": 14863 + }, + { + "epoch": 19.02592, + "grad_norm": 1.062474250793457, + "learning_rate": 2.028811524609844e-05, + "loss": 0.495, + "step": 14864 + }, + { + "epoch": 19.0272, + "grad_norm": 1.049436092376709, + "learning_rate": 2.0286114445778313e-05, + "loss": 0.4552, + "step": 14865 + }, + 
{ + "epoch": 19.02848, + "grad_norm": 1.0556187629699707, + "learning_rate": 2.0284113645458184e-05, + "loss": 0.4789, + "step": 14866 + }, + { + "epoch": 19.02976, + "grad_norm": 0.9977188110351562, + "learning_rate": 2.0282112845138056e-05, + "loss": 0.4788, + "step": 14867 + }, + { + "epoch": 19.03104, + "grad_norm": 1.0484732389450073, + "learning_rate": 2.0280112044817928e-05, + "loss": 0.4596, + "step": 14868 + }, + { + "epoch": 19.03232, + "grad_norm": 1.135587453842163, + "learning_rate": 2.02781112444978e-05, + "loss": 0.4965, + "step": 14869 + }, + { + "epoch": 19.0336, + "grad_norm": 1.0908023118972778, + "learning_rate": 2.0276110444177672e-05, + "loss": 0.4776, + "step": 14870 + }, + { + "epoch": 19.03488, + "grad_norm": 1.1048251390457153, + "learning_rate": 2.0274109643857544e-05, + "loss": 0.5342, + "step": 14871 + }, + { + "epoch": 19.03616, + "grad_norm": 1.1822048425674438, + "learning_rate": 2.0272108843537416e-05, + "loss": 0.5458, + "step": 14872 + }, + { + "epoch": 19.03744, + "grad_norm": 1.0560060739517212, + "learning_rate": 2.0270108043217287e-05, + "loss": 0.477, + "step": 14873 + }, + { + "epoch": 19.03872, + "grad_norm": 1.0473270416259766, + "learning_rate": 2.026810724289716e-05, + "loss": 0.519, + "step": 14874 + }, + { + "epoch": 19.04, + "grad_norm": 1.0593985319137573, + "learning_rate": 2.026610644257703e-05, + "loss": 0.4724, + "step": 14875 + }, + { + "epoch": 19.04128, + "grad_norm": 1.1364285945892334, + "learning_rate": 2.0264105642256903e-05, + "loss": 0.4902, + "step": 14876 + }, + { + "epoch": 19.04256, + "grad_norm": 1.1284868717193604, + "learning_rate": 2.0262104841936778e-05, + "loss": 0.5167, + "step": 14877 + }, + { + "epoch": 19.04384, + "grad_norm": 1.0828670263290405, + "learning_rate": 2.0260104041616647e-05, + "loss": 0.4799, + "step": 14878 + }, + { + "epoch": 19.04512, + "grad_norm": 1.0774503946304321, + "learning_rate": 2.025810324129652e-05, + "loss": 0.516, + "step": 14879 + }, + { + "epoch": 19.0464, + 
"grad_norm": 1.074471354484558, + "learning_rate": 2.025610244097639e-05, + "loss": 0.5001, + "step": 14880 + }, + { + "epoch": 19.04768, + "grad_norm": 1.0600991249084473, + "learning_rate": 2.0254101640656266e-05, + "loss": 0.475, + "step": 14881 + }, + { + "epoch": 19.04896, + "grad_norm": 1.0403833389282227, + "learning_rate": 2.0252100840336134e-05, + "loss": 0.4729, + "step": 14882 + }, + { + "epoch": 19.05024, + "grad_norm": 1.0715597867965698, + "learning_rate": 2.0250100040016006e-05, + "loss": 0.5526, + "step": 14883 + }, + { + "epoch": 19.05152, + "grad_norm": 1.0039360523223877, + "learning_rate": 2.0248099239695878e-05, + "loss": 0.4799, + "step": 14884 + }, + { + "epoch": 19.0528, + "grad_norm": 0.990351676940918, + "learning_rate": 2.0246098439375753e-05, + "loss": 0.4442, + "step": 14885 + }, + { + "epoch": 19.05408, + "grad_norm": 1.0488803386688232, + "learning_rate": 2.024409763905562e-05, + "loss": 0.4884, + "step": 14886 + }, + { + "epoch": 19.05536, + "grad_norm": 1.0708200931549072, + "learning_rate": 2.0242096838735493e-05, + "loss": 0.5011, + "step": 14887 + }, + { + "epoch": 19.05664, + "grad_norm": 1.0574580430984497, + "learning_rate": 2.024009603841537e-05, + "loss": 0.4919, + "step": 14888 + }, + { + "epoch": 19.05792, + "grad_norm": 1.0483922958374023, + "learning_rate": 2.023809523809524e-05, + "loss": 0.5121, + "step": 14889 + }, + { + "epoch": 19.0592, + "grad_norm": 1.039752721786499, + "learning_rate": 2.023609443777511e-05, + "loss": 0.4642, + "step": 14890 + }, + { + "epoch": 19.06048, + "grad_norm": 1.0242259502410889, + "learning_rate": 2.023409363745498e-05, + "loss": 0.4452, + "step": 14891 + }, + { + "epoch": 19.06176, + "grad_norm": 1.0347176790237427, + "learning_rate": 2.0232092837134856e-05, + "loss": 0.5101, + "step": 14892 + }, + { + "epoch": 19.06304, + "grad_norm": 1.0924898386001587, + "learning_rate": 2.0230092036814728e-05, + "loss": 0.4904, + "step": 14893 + }, + { + "epoch": 19.06432, + "grad_norm": 
1.0558232069015503, + "learning_rate": 2.0228091236494596e-05, + "loss": 0.4689, + "step": 14894 + }, + { + "epoch": 19.0656, + "grad_norm": 1.082149863243103, + "learning_rate": 2.022609043617447e-05, + "loss": 0.5111, + "step": 14895 + }, + { + "epoch": 19.06688, + "grad_norm": 1.0433650016784668, + "learning_rate": 2.0224089635854344e-05, + "loss": 0.4458, + "step": 14896 + }, + { + "epoch": 19.06816, + "grad_norm": 1.053171157836914, + "learning_rate": 2.0222088835534215e-05, + "loss": 0.4724, + "step": 14897 + }, + { + "epoch": 19.06944, + "grad_norm": 1.0355948209762573, + "learning_rate": 2.0220088035214084e-05, + "loss": 0.5265, + "step": 14898 + }, + { + "epoch": 19.07072, + "grad_norm": 1.0293118953704834, + "learning_rate": 2.021808723489396e-05, + "loss": 0.4963, + "step": 14899 + }, + { + "epoch": 19.072, + "grad_norm": 1.065530776977539, + "learning_rate": 2.021608643457383e-05, + "loss": 0.4682, + "step": 14900 + }, + { + "epoch": 19.07328, + "grad_norm": 1.0934525728225708, + "learning_rate": 2.0214085634253703e-05, + "loss": 0.4689, + "step": 14901 + }, + { + "epoch": 19.07456, + "grad_norm": 0.997524082660675, + "learning_rate": 2.0212084833933575e-05, + "loss": 0.4693, + "step": 14902 + }, + { + "epoch": 19.07584, + "grad_norm": 1.1037408113479614, + "learning_rate": 2.0210084033613447e-05, + "loss": 0.5157, + "step": 14903 + }, + { + "epoch": 19.07712, + "grad_norm": 1.0279779434204102, + "learning_rate": 2.020808323329332e-05, + "loss": 0.4996, + "step": 14904 + }, + { + "epoch": 19.0784, + "grad_norm": 1.0512694120407104, + "learning_rate": 2.020608243297319e-05, + "loss": 0.4589, + "step": 14905 + }, + { + "epoch": 19.07968, + "grad_norm": 1.048998475074768, + "learning_rate": 2.0204081632653062e-05, + "loss": 0.4715, + "step": 14906 + }, + { + "epoch": 19.08096, + "grad_norm": 1.1140145063400269, + "learning_rate": 2.0202080832332934e-05, + "loss": 0.5188, + "step": 14907 + }, + { + "epoch": 19.08224, + "grad_norm": 1.1645013093948364, + 
"learning_rate": 2.0200080032012806e-05, + "loss": 0.5306, + "step": 14908 + }, + { + "epoch": 19.08352, + "grad_norm": 1.1035782098770142, + "learning_rate": 2.0198079231692678e-05, + "loss": 0.5419, + "step": 14909 + }, + { + "epoch": 19.0848, + "grad_norm": 1.1250030994415283, + "learning_rate": 2.019607843137255e-05, + "loss": 0.4853, + "step": 14910 + }, + { + "epoch": 19.08608, + "grad_norm": 1.019386887550354, + "learning_rate": 2.019407763105242e-05, + "loss": 0.4822, + "step": 14911 + }, + { + "epoch": 19.08736, + "grad_norm": 1.0403432846069336, + "learning_rate": 2.0192076830732293e-05, + "loss": 0.4773, + "step": 14912 + }, + { + "epoch": 19.08864, + "grad_norm": 1.073562741279602, + "learning_rate": 2.0190076030412165e-05, + "loss": 0.5333, + "step": 14913 + }, + { + "epoch": 19.08992, + "grad_norm": 1.0300809144973755, + "learning_rate": 2.0188075230092037e-05, + "loss": 0.4463, + "step": 14914 + }, + { + "epoch": 19.0912, + "grad_norm": 1.0957305431365967, + "learning_rate": 2.018607442977191e-05, + "loss": 0.5052, + "step": 14915 + }, + { + "epoch": 19.09248, + "grad_norm": 1.155129313468933, + "learning_rate": 2.0184073629451784e-05, + "loss": 0.5286, + "step": 14916 + }, + { + "epoch": 19.09376, + "grad_norm": 1.0918629169464111, + "learning_rate": 2.0182072829131653e-05, + "loss": 0.5577, + "step": 14917 + }, + { + "epoch": 19.09504, + "grad_norm": 1.0622458457946777, + "learning_rate": 2.0180072028811524e-05, + "loss": 0.4989, + "step": 14918 + }, + { + "epoch": 19.09632, + "grad_norm": 1.0885812044143677, + "learning_rate": 2.0178071228491396e-05, + "loss": 0.522, + "step": 14919 + }, + { + "epoch": 19.0976, + "grad_norm": 1.0788886547088623, + "learning_rate": 2.017607042817127e-05, + "loss": 0.5262, + "step": 14920 + }, + { + "epoch": 19.09888, + "grad_norm": 1.0732629299163818, + "learning_rate": 2.017406962785114e-05, + "loss": 0.4768, + "step": 14921 + }, + { + "epoch": 19.10016, + "grad_norm": 1.0360231399536133, + "learning_rate": 
2.0172068827531012e-05, + "loss": 0.4685, + "step": 14922 + }, + { + "epoch": 19.10144, + "grad_norm": 1.0716302394866943, + "learning_rate": 2.0170068027210887e-05, + "loss": 0.497, + "step": 14923 + }, + { + "epoch": 19.10272, + "grad_norm": 1.0575581789016724, + "learning_rate": 2.016806722689076e-05, + "loss": 0.4984, + "step": 14924 + }, + { + "epoch": 19.104, + "grad_norm": 1.0178310871124268, + "learning_rate": 2.0166066426570627e-05, + "loss": 0.4571, + "step": 14925 + }, + { + "epoch": 19.10528, + "grad_norm": 1.0144866704940796, + "learning_rate": 2.01640656262505e-05, + "loss": 0.4566, + "step": 14926 + }, + { + "epoch": 19.10656, + "grad_norm": 1.0763506889343262, + "learning_rate": 2.0162064825930374e-05, + "loss": 0.4842, + "step": 14927 + }, + { + "epoch": 19.10784, + "grad_norm": 1.0165890455245972, + "learning_rate": 2.0160064025610246e-05, + "loss": 0.4678, + "step": 14928 + }, + { + "epoch": 19.10912, + "grad_norm": 1.0530028343200684, + "learning_rate": 2.0158063225290115e-05, + "loss": 0.4977, + "step": 14929 + }, + { + "epoch": 19.1104, + "grad_norm": 1.109286904335022, + "learning_rate": 2.015606242496999e-05, + "loss": 0.5079, + "step": 14930 + }, + { + "epoch": 19.11168, + "grad_norm": 1.081906795501709, + "learning_rate": 2.0154061624649862e-05, + "loss": 0.4783, + "step": 14931 + }, + { + "epoch": 19.11296, + "grad_norm": 1.0961225032806396, + "learning_rate": 2.0152060824329734e-05, + "loss": 0.5479, + "step": 14932 + }, + { + "epoch": 19.11424, + "grad_norm": 1.105312705039978, + "learning_rate": 2.0150060024009602e-05, + "loss": 0.5046, + "step": 14933 + }, + { + "epoch": 19.11552, + "grad_norm": 1.146582841873169, + "learning_rate": 2.0148059223689477e-05, + "loss": 0.5447, + "step": 14934 + }, + { + "epoch": 19.1168, + "grad_norm": 1.0598522424697876, + "learning_rate": 2.014605842336935e-05, + "loss": 0.4873, + "step": 14935 + }, + { + "epoch": 19.11808, + "grad_norm": 1.0884562730789185, + "learning_rate": 2.014405762304922e-05, + 
"loss": 0.49, + "step": 14936 + }, + { + "epoch": 19.11936, + "grad_norm": 1.1092275381088257, + "learning_rate": 2.014205682272909e-05, + "loss": 0.5157, + "step": 14937 + }, + { + "epoch": 19.12064, + "grad_norm": 1.0648193359375, + "learning_rate": 2.0140056022408965e-05, + "loss": 0.4773, + "step": 14938 + }, + { + "epoch": 19.12192, + "grad_norm": 1.0710915327072144, + "learning_rate": 2.0138055222088837e-05, + "loss": 0.5003, + "step": 14939 + }, + { + "epoch": 19.1232, + "grad_norm": 1.0918967723846436, + "learning_rate": 2.013605442176871e-05, + "loss": 0.4891, + "step": 14940 + }, + { + "epoch": 19.12448, + "grad_norm": 1.0287532806396484, + "learning_rate": 2.013405362144858e-05, + "loss": 0.4392, + "step": 14941 + }, + { + "epoch": 19.12576, + "grad_norm": 1.050429105758667, + "learning_rate": 2.0132052821128452e-05, + "loss": 0.4599, + "step": 14942 + }, + { + "epoch": 19.12704, + "grad_norm": 1.1075915098190308, + "learning_rate": 2.0130052020808324e-05, + "loss": 0.5304, + "step": 14943 + }, + { + "epoch": 19.12832, + "grad_norm": 1.040389060974121, + "learning_rate": 2.0128051220488196e-05, + "loss": 0.5047, + "step": 14944 + }, + { + "epoch": 19.1296, + "grad_norm": 1.0519245862960815, + "learning_rate": 2.0126050420168068e-05, + "loss": 0.4767, + "step": 14945 + }, + { + "epoch": 19.13088, + "grad_norm": 1.1048765182495117, + "learning_rate": 2.012404961984794e-05, + "loss": 0.5269, + "step": 14946 + }, + { + "epoch": 19.13216, + "grad_norm": 1.0859326124191284, + "learning_rate": 2.012204881952781e-05, + "loss": 0.5006, + "step": 14947 + }, + { + "epoch": 19.13344, + "grad_norm": 1.0915594100952148, + "learning_rate": 2.0120048019207683e-05, + "loss": 0.4768, + "step": 14948 + }, + { + "epoch": 19.13472, + "grad_norm": 1.1056619882583618, + "learning_rate": 2.0118047218887555e-05, + "loss": 0.5421, + "step": 14949 + }, + { + "epoch": 19.136, + "grad_norm": 1.124221920967102, + "learning_rate": 2.0116046418567427e-05, + "loss": 0.4826, + "step": 
14950 + }, + { + "epoch": 19.13728, + "grad_norm": 1.0759928226470947, + "learning_rate": 2.01140456182473e-05, + "loss": 0.5218, + "step": 14951 + }, + { + "epoch": 19.13856, + "grad_norm": 1.0738402605056763, + "learning_rate": 2.011204481792717e-05, + "loss": 0.4804, + "step": 14952 + }, + { + "epoch": 19.13984, + "grad_norm": 1.070008397102356, + "learning_rate": 2.0110044017607043e-05, + "loss": 0.5434, + "step": 14953 + }, + { + "epoch": 19.14112, + "grad_norm": 1.1064351797103882, + "learning_rate": 2.0108043217286915e-05, + "loss": 0.5187, + "step": 14954 + }, + { + "epoch": 19.1424, + "grad_norm": 1.0281625986099243, + "learning_rate": 2.010604241696679e-05, + "loss": 0.4896, + "step": 14955 + }, + { + "epoch": 19.14368, + "grad_norm": 1.088550329208374, + "learning_rate": 2.010404161664666e-05, + "loss": 0.5228, + "step": 14956 + }, + { + "epoch": 19.14496, + "grad_norm": 1.1894739866256714, + "learning_rate": 2.010204081632653e-05, + "loss": 0.5813, + "step": 14957 + }, + { + "epoch": 19.14624, + "grad_norm": 1.1209919452667236, + "learning_rate": 2.0100040016006402e-05, + "loss": 0.483, + "step": 14958 + }, + { + "epoch": 19.14752, + "grad_norm": 1.055732011795044, + "learning_rate": 2.0098039215686277e-05, + "loss": 0.5229, + "step": 14959 + }, + { + "epoch": 19.1488, + "grad_norm": 1.0904511213302612, + "learning_rate": 2.0096038415366146e-05, + "loss": 0.521, + "step": 14960 + }, + { + "epoch": 19.15008, + "grad_norm": 1.0930554866790771, + "learning_rate": 2.0094037615046018e-05, + "loss": 0.4945, + "step": 14961 + }, + { + "epoch": 19.15136, + "grad_norm": 1.0355908870697021, + "learning_rate": 2.0092036814725893e-05, + "loss": 0.5065, + "step": 14962 + }, + { + "epoch": 19.15264, + "grad_norm": 1.0164462327957153, + "learning_rate": 2.0090036014405765e-05, + "loss": 0.4642, + "step": 14963 + }, + { + "epoch": 19.15392, + "grad_norm": 1.0890769958496094, + "learning_rate": 2.0088035214085633e-05, + "loss": 0.4984, + "step": 14964 + }, + { + 
"epoch": 19.1552, + "grad_norm": 1.0881518125534058, + "learning_rate": 2.0086034413765505e-05, + "loss": 0.5328, + "step": 14965 + }, + { + "epoch": 19.15648, + "grad_norm": 1.0909157991409302, + "learning_rate": 2.008403361344538e-05, + "loss": 0.4777, + "step": 14966 + }, + { + "epoch": 19.15776, + "grad_norm": 1.0730060338974, + "learning_rate": 2.0082032813125252e-05, + "loss": 0.4747, + "step": 14967 + }, + { + "epoch": 19.15904, + "grad_norm": 1.0788432359695435, + "learning_rate": 2.008003201280512e-05, + "loss": 0.4639, + "step": 14968 + }, + { + "epoch": 19.16032, + "grad_norm": 1.0355757474899292, + "learning_rate": 2.0078031212484996e-05, + "loss": 0.4812, + "step": 14969 + }, + { + "epoch": 19.1616, + "grad_norm": 1.1290099620819092, + "learning_rate": 2.0076030412164868e-05, + "loss": 0.5178, + "step": 14970 + }, + { + "epoch": 19.16288, + "grad_norm": 1.0541107654571533, + "learning_rate": 2.007402961184474e-05, + "loss": 0.4936, + "step": 14971 + }, + { + "epoch": 19.16416, + "grad_norm": 1.0474649667739868, + "learning_rate": 2.0072028811524608e-05, + "loss": 0.5262, + "step": 14972 + }, + { + "epoch": 19.16544, + "grad_norm": 1.0278681516647339, + "learning_rate": 2.0070028011204483e-05, + "loss": 0.4955, + "step": 14973 + }, + { + "epoch": 19.16672, + "grad_norm": 1.0625478029251099, + "learning_rate": 2.0068027210884355e-05, + "loss": 0.4842, + "step": 14974 + }, + { + "epoch": 19.168, + "grad_norm": 1.0970510244369507, + "learning_rate": 2.0066026410564227e-05, + "loss": 0.5195, + "step": 14975 + }, + { + "epoch": 19.16928, + "grad_norm": 1.0882428884506226, + "learning_rate": 2.00640256102441e-05, + "loss": 0.5226, + "step": 14976 + }, + { + "epoch": 19.17056, + "grad_norm": 1.0383604764938354, + "learning_rate": 2.006202480992397e-05, + "loss": 0.4514, + "step": 14977 + }, + { + "epoch": 19.17184, + "grad_norm": 1.1663484573364258, + "learning_rate": 2.0060024009603843e-05, + "loss": 0.5614, + "step": 14978 + }, + { + "epoch": 19.17312, + 
"grad_norm": 1.0208680629730225, + "learning_rate": 2.0058023209283714e-05, + "loss": 0.4491, + "step": 14979 + }, + { + "epoch": 19.1744, + "grad_norm": 1.0704641342163086, + "learning_rate": 2.0056022408963586e-05, + "loss": 0.5215, + "step": 14980 + }, + { + "epoch": 19.17568, + "grad_norm": 1.1021260023117065, + "learning_rate": 2.0054021608643458e-05, + "loss": 0.4645, + "step": 14981 + }, + { + "epoch": 19.17696, + "grad_norm": 1.0811761617660522, + "learning_rate": 2.005202080832333e-05, + "loss": 0.5263, + "step": 14982 + }, + { + "epoch": 19.17824, + "grad_norm": 1.1112900972366333, + "learning_rate": 2.0050020008003205e-05, + "loss": 0.4977, + "step": 14983 + }, + { + "epoch": 19.17952, + "grad_norm": 1.1169459819793701, + "learning_rate": 2.0048019207683074e-05, + "loss": 0.4697, + "step": 14984 + }, + { + "epoch": 19.1808, + "grad_norm": 1.056740164756775, + "learning_rate": 2.0046018407362946e-05, + "loss": 0.4776, + "step": 14985 + }, + { + "epoch": 19.18208, + "grad_norm": 1.0898573398590088, + "learning_rate": 2.0044017607042817e-05, + "loss": 0.5211, + "step": 14986 + }, + { + "epoch": 19.18336, + "grad_norm": 1.0396350622177124, + "learning_rate": 2.0042016806722693e-05, + "loss": 0.4809, + "step": 14987 + }, + { + "epoch": 19.18464, + "grad_norm": 1.1024669408798218, + "learning_rate": 2.004001600640256e-05, + "loss": 0.4961, + "step": 14988 + }, + { + "epoch": 19.18592, + "grad_norm": 1.1737849712371826, + "learning_rate": 2.0038015206082433e-05, + "loss": 0.5079, + "step": 14989 + }, + { + "epoch": 19.1872, + "grad_norm": 1.1294634342193604, + "learning_rate": 2.0036014405762308e-05, + "loss": 0.5342, + "step": 14990 + }, + { + "epoch": 19.18848, + "grad_norm": 1.021791696548462, + "learning_rate": 2.003401360544218e-05, + "loss": 0.497, + "step": 14991 + }, + { + "epoch": 19.18976, + "grad_norm": 1.0725295543670654, + "learning_rate": 2.003201280512205e-05, + "loss": 0.4671, + "step": 14992 + }, + { + "epoch": 19.19104, + "grad_norm": 
1.098502278327942, + "learning_rate": 2.003001200480192e-05, + "loss": 0.5108, + "step": 14993 + }, + { + "epoch": 19.19232, + "grad_norm": 1.060045599937439, + "learning_rate": 2.0028011204481796e-05, + "loss": 0.4673, + "step": 14994 + }, + { + "epoch": 19.1936, + "grad_norm": 1.056288242340088, + "learning_rate": 2.0026010404161667e-05, + "loss": 0.4897, + "step": 14995 + }, + { + "epoch": 19.19488, + "grad_norm": 1.0819458961486816, + "learning_rate": 2.0024009603841536e-05, + "loss": 0.5011, + "step": 14996 + }, + { + "epoch": 19.19616, + "grad_norm": 1.0354195833206177, + "learning_rate": 2.0022008803521408e-05, + "loss": 0.4803, + "step": 14997 + }, + { + "epoch": 19.19744, + "grad_norm": 1.0913057327270508, + "learning_rate": 2.0020008003201283e-05, + "loss": 0.4986, + "step": 14998 + }, + { + "epoch": 19.19872, + "grad_norm": 1.101427674293518, + "learning_rate": 2.0018007202881155e-05, + "loss": 0.4715, + "step": 14999 + }, + { + "epoch": 19.2, + "grad_norm": 0.9858012199401855, + "learning_rate": 2.0016006402561023e-05, + "loss": 0.4406, + "step": 15000 + }, + { + "epoch": 19.20128, + "grad_norm": 1.187526822090149, + "learning_rate": 2.00140056022409e-05, + "loss": 0.5555, + "step": 15001 + }, + { + "epoch": 19.20256, + "grad_norm": 1.087991714477539, + "learning_rate": 2.001200480192077e-05, + "loss": 0.5228, + "step": 15002 + }, + { + "epoch": 19.20384, + "grad_norm": 1.0227059125900269, + "learning_rate": 2.0010004001600642e-05, + "loss": 0.4722, + "step": 15003 + }, + { + "epoch": 19.20512, + "grad_norm": 1.0761233568191528, + "learning_rate": 2.000800320128051e-05, + "loss": 0.5027, + "step": 15004 + }, + { + "epoch": 19.2064, + "grad_norm": 1.0522879362106323, + "learning_rate": 2.0006002400960386e-05, + "loss": 0.5058, + "step": 15005 + }, + { + "epoch": 19.20768, + "grad_norm": 1.104921579360962, + "learning_rate": 2.0004001600640258e-05, + "loss": 0.4954, + "step": 15006 + }, + { + "epoch": 19.20896, + "grad_norm": 1.0269452333450317, + 
"learning_rate": 2.000200080032013e-05, + "loss": 0.4919, + "step": 15007 + }, + { + "epoch": 19.21024, + "grad_norm": 1.1063481569290161, + "learning_rate": 2e-05, + "loss": 0.5197, + "step": 15008 + }, + { + "epoch": 19.21152, + "grad_norm": 1.0551246404647827, + "learning_rate": 1.9997999199679873e-05, + "loss": 0.5092, + "step": 15009 + }, + { + "epoch": 19.2128, + "grad_norm": 1.0208076238632202, + "learning_rate": 1.9995998399359745e-05, + "loss": 0.4658, + "step": 15010 + }, + { + "epoch": 19.21408, + "grad_norm": 1.0514510869979858, + "learning_rate": 1.9993997599039617e-05, + "loss": 0.4883, + "step": 15011 + }, + { + "epoch": 19.21536, + "grad_norm": 1.0635521411895752, + "learning_rate": 1.999199679871949e-05, + "loss": 0.4738, + "step": 15012 + }, + { + "epoch": 19.21664, + "grad_norm": 1.0084950923919678, + "learning_rate": 1.998999599839936e-05, + "loss": 0.4461, + "step": 15013 + }, + { + "epoch": 19.21792, + "grad_norm": 1.0877736806869507, + "learning_rate": 1.9987995198079233e-05, + "loss": 0.5056, + "step": 15014 + }, + { + "epoch": 19.2192, + "grad_norm": 1.0358392000198364, + "learning_rate": 1.9985994397759105e-05, + "loss": 0.4948, + "step": 15015 + }, + { + "epoch": 19.22048, + "grad_norm": 1.0706851482391357, + "learning_rate": 1.9983993597438976e-05, + "loss": 0.5167, + "step": 15016 + }, + { + "epoch": 19.22176, + "grad_norm": 1.0379141569137573, + "learning_rate": 1.998199279711885e-05, + "loss": 0.5038, + "step": 15017 + }, + { + "epoch": 19.22304, + "grad_norm": 1.045020580291748, + "learning_rate": 1.997999199679872e-05, + "loss": 0.4624, + "step": 15018 + }, + { + "epoch": 19.22432, + "grad_norm": 1.0447577238082886, + "learning_rate": 1.9977991196478592e-05, + "loss": 0.494, + "step": 15019 + }, + { + "epoch": 19.2256, + "grad_norm": 1.057941198348999, + "learning_rate": 1.9975990396158464e-05, + "loss": 0.4793, + "step": 15020 + }, + { + "epoch": 19.22688, + "grad_norm": 1.0662115812301636, + "learning_rate": 
1.9973989595838336e-05, + "loss": 0.4976, + "step": 15021 + }, + { + "epoch": 19.22816, + "grad_norm": 1.040855050086975, + "learning_rate": 1.997198879551821e-05, + "loss": 0.4681, + "step": 15022 + }, + { + "epoch": 19.22944, + "grad_norm": 1.0539730787277222, + "learning_rate": 1.996998799519808e-05, + "loss": 0.4807, + "step": 15023 + }, + { + "epoch": 19.23072, + "grad_norm": 1.0343507528305054, + "learning_rate": 1.996798719487795e-05, + "loss": 0.4931, + "step": 15024 + }, + { + "epoch": 19.232, + "grad_norm": 1.0538980960845947, + "learning_rate": 1.9965986394557823e-05, + "loss": 0.4921, + "step": 15025 + }, + { + "epoch": 19.23328, + "grad_norm": 1.0909112691879272, + "learning_rate": 1.99639855942377e-05, + "loss": 0.4804, + "step": 15026 + }, + { + "epoch": 19.23456, + "grad_norm": 0.9821348786354065, + "learning_rate": 1.9961984793917567e-05, + "loss": 0.445, + "step": 15027 + }, + { + "epoch": 19.23584, + "grad_norm": 0.9998717904090881, + "learning_rate": 1.995998399359744e-05, + "loss": 0.4519, + "step": 15028 + }, + { + "epoch": 19.23712, + "grad_norm": 1.1057769060134888, + "learning_rate": 1.9957983193277314e-05, + "loss": 0.5291, + "step": 15029 + }, + { + "epoch": 19.2384, + "grad_norm": 1.1260607242584229, + "learning_rate": 1.9955982392957186e-05, + "loss": 0.4936, + "step": 15030 + }, + { + "epoch": 19.23968, + "grad_norm": 1.0743900537490845, + "learning_rate": 1.9953981592637054e-05, + "loss": 0.5174, + "step": 15031 + }, + { + "epoch": 19.24096, + "grad_norm": 1.0485973358154297, + "learning_rate": 1.9951980792316926e-05, + "loss": 0.4953, + "step": 15032 + }, + { + "epoch": 19.24224, + "grad_norm": 1.104485273361206, + "learning_rate": 1.99499799919968e-05, + "loss": 0.5035, + "step": 15033 + }, + { + "epoch": 19.24352, + "grad_norm": 1.1098350286483765, + "learning_rate": 1.9947979191676673e-05, + "loss": 0.5428, + "step": 15034 + }, + { + "epoch": 19.2448, + "grad_norm": 1.1056627035140991, + "learning_rate": 1.9945978391356542e-05, + 
"loss": 0.5034, + "step": 15035 + }, + { + "epoch": 19.24608, + "grad_norm": 1.1753102540969849, + "learning_rate": 1.9943977591036417e-05, + "loss": 0.559, + "step": 15036 + }, + { + "epoch": 19.24736, + "grad_norm": 1.1483614444732666, + "learning_rate": 1.994197679071629e-05, + "loss": 0.5179, + "step": 15037 + }, + { + "epoch": 19.24864, + "grad_norm": 1.0651777982711792, + "learning_rate": 1.993997599039616e-05, + "loss": 0.4794, + "step": 15038 + }, + { + "epoch": 19.24992, + "grad_norm": 1.1044447422027588, + "learning_rate": 1.993797519007603e-05, + "loss": 0.4939, + "step": 15039 + }, + { + "epoch": 19.2512, + "grad_norm": 1.1543649435043335, + "learning_rate": 1.9935974389755904e-05, + "loss": 0.5337, + "step": 15040 + }, + { + "epoch": 19.25248, + "grad_norm": 1.0800881385803223, + "learning_rate": 1.9933973589435776e-05, + "loss": 0.4684, + "step": 15041 + }, + { + "epoch": 19.25376, + "grad_norm": 1.0753213167190552, + "learning_rate": 1.9931972789115648e-05, + "loss": 0.5192, + "step": 15042 + }, + { + "epoch": 19.25504, + "grad_norm": 1.0312221050262451, + "learning_rate": 1.992997198879552e-05, + "loss": 0.484, + "step": 15043 + }, + { + "epoch": 19.25632, + "grad_norm": 1.0647677183151245, + "learning_rate": 1.9927971188475392e-05, + "loss": 0.4847, + "step": 15044 + }, + { + "epoch": 19.2576, + "grad_norm": 1.090179443359375, + "learning_rate": 1.9925970388155264e-05, + "loss": 0.4858, + "step": 15045 + }, + { + "epoch": 19.25888, + "grad_norm": 1.055359125137329, + "learning_rate": 1.9923969587835136e-05, + "loss": 0.46, + "step": 15046 + }, + { + "epoch": 19.26016, + "grad_norm": 1.0518981218338013, + "learning_rate": 1.9921968787515007e-05, + "loss": 0.4574, + "step": 15047 + }, + { + "epoch": 19.26144, + "grad_norm": 1.1169776916503906, + "learning_rate": 1.991996798719488e-05, + "loss": 0.4914, + "step": 15048 + }, + { + "epoch": 19.26272, + "grad_norm": 1.026633620262146, + "learning_rate": 1.991796718687475e-05, + "loss": 0.4853, + "step": 
15049 + }, + { + "epoch": 19.264, + "grad_norm": 1.0704096555709839, + "learning_rate": 1.9915966386554623e-05, + "loss": 0.5028, + "step": 15050 + }, + { + "epoch": 19.26528, + "grad_norm": 1.075821042060852, + "learning_rate": 1.9913965586234495e-05, + "loss": 0.4707, + "step": 15051 + }, + { + "epoch": 19.26656, + "grad_norm": 1.025315523147583, + "learning_rate": 1.9911964785914367e-05, + "loss": 0.4675, + "step": 15052 + }, + { + "epoch": 19.26784, + "grad_norm": 1.0183051824569702, + "learning_rate": 1.990996398559424e-05, + "loss": 0.4757, + "step": 15053 + }, + { + "epoch": 19.26912, + "grad_norm": 1.0708404779434204, + "learning_rate": 1.990796318527411e-05, + "loss": 0.5087, + "step": 15054 + }, + { + "epoch": 19.2704, + "grad_norm": 1.080012559890747, + "learning_rate": 1.9905962384953982e-05, + "loss": 0.5115, + "step": 15055 + }, + { + "epoch": 19.27168, + "grad_norm": 1.0551676750183105, + "learning_rate": 1.9903961584633854e-05, + "loss": 0.4924, + "step": 15056 + }, + { + "epoch": 19.27296, + "grad_norm": 1.0453039407730103, + "learning_rate": 1.9901960784313726e-05, + "loss": 0.501, + "step": 15057 + }, + { + "epoch": 19.27424, + "grad_norm": 1.0461045503616333, + "learning_rate": 1.9899959983993598e-05, + "loss": 0.4755, + "step": 15058 + }, + { + "epoch": 19.27552, + "grad_norm": 1.0412358045578003, + "learning_rate": 1.989795918367347e-05, + "loss": 0.4581, + "step": 15059 + }, + { + "epoch": 19.2768, + "grad_norm": 1.027969479560852, + "learning_rate": 1.989595838335334e-05, + "loss": 0.4646, + "step": 15060 + }, + { + "epoch": 19.27808, + "grad_norm": 1.1006345748901367, + "learning_rate": 1.9893957583033217e-05, + "loss": 0.4882, + "step": 15061 + }, + { + "epoch": 19.27936, + "grad_norm": 1.0730010271072388, + "learning_rate": 1.9891956782713085e-05, + "loss": 0.4754, + "step": 15062 + }, + { + "epoch": 19.28064, + "grad_norm": 0.998073399066925, + "learning_rate": 1.9889955982392957e-05, + "loss": 0.4791, + "step": 15063 + }, + { + "epoch": 
19.28192, + "grad_norm": 1.0365452766418457, + "learning_rate": 1.988795518207283e-05, + "loss": 0.4787, + "step": 15064 + }, + { + "epoch": 19.2832, + "grad_norm": 1.0752921104431152, + "learning_rate": 1.9885954381752704e-05, + "loss": 0.5146, + "step": 15065 + }, + { + "epoch": 19.28448, + "grad_norm": 0.9972426295280457, + "learning_rate": 1.9883953581432573e-05, + "loss": 0.4596, + "step": 15066 + }, + { + "epoch": 19.28576, + "grad_norm": 1.1111432313919067, + "learning_rate": 1.9881952781112445e-05, + "loss": 0.4724, + "step": 15067 + }, + { + "epoch": 19.28704, + "grad_norm": 1.0094146728515625, + "learning_rate": 1.987995198079232e-05, + "loss": 0.4806, + "step": 15068 + }, + { + "epoch": 19.28832, + "grad_norm": 1.0732576847076416, + "learning_rate": 1.987795118047219e-05, + "loss": 0.4957, + "step": 15069 + }, + { + "epoch": 19.2896, + "grad_norm": 1.131595253944397, + "learning_rate": 1.987595038015206e-05, + "loss": 0.518, + "step": 15070 + }, + { + "epoch": 19.29088, + "grad_norm": 1.0386974811553955, + "learning_rate": 1.9873949579831932e-05, + "loss": 0.4767, + "step": 15071 + }, + { + "epoch": 19.29216, + "grad_norm": 1.0035613775253296, + "learning_rate": 1.9871948779511807e-05, + "loss": 0.4594, + "step": 15072 + }, + { + "epoch": 19.29344, + "grad_norm": 1.0777531862258911, + "learning_rate": 1.986994797919168e-05, + "loss": 0.4983, + "step": 15073 + }, + { + "epoch": 19.29472, + "grad_norm": 1.0419715642929077, + "learning_rate": 1.9867947178871548e-05, + "loss": 0.4667, + "step": 15074 + }, + { + "epoch": 19.296, + "grad_norm": 1.090698003768921, + "learning_rate": 1.9865946378551423e-05, + "loss": 0.5553, + "step": 15075 + }, + { + "epoch": 19.29728, + "grad_norm": 1.0526976585388184, + "learning_rate": 1.9863945578231295e-05, + "loss": 0.5112, + "step": 15076 + }, + { + "epoch": 19.29856, + "grad_norm": 1.0763030052185059, + "learning_rate": 1.9861944777911167e-05, + "loss": 0.4721, + "step": 15077 + }, + { + "epoch": 19.29984, + 
"grad_norm": 1.032845377922058, + "learning_rate": 1.9859943977591035e-05, + "loss": 0.5084, + "step": 15078 + }, + { + "epoch": 19.30112, + "grad_norm": 1.0398248434066772, + "learning_rate": 1.985794317727091e-05, + "loss": 0.4519, + "step": 15079 + }, + { + "epoch": 19.3024, + "grad_norm": 1.0678154230117798, + "learning_rate": 1.9855942376950782e-05, + "loss": 0.5367, + "step": 15080 + }, + { + "epoch": 19.30368, + "grad_norm": 1.0743778944015503, + "learning_rate": 1.9853941576630654e-05, + "loss": 0.4997, + "step": 15081 + }, + { + "epoch": 19.30496, + "grad_norm": 1.0013165473937988, + "learning_rate": 1.9851940776310526e-05, + "loss": 0.4498, + "step": 15082 + }, + { + "epoch": 19.30624, + "grad_norm": 1.052975058555603, + "learning_rate": 1.9849939975990398e-05, + "loss": 0.4922, + "step": 15083 + }, + { + "epoch": 19.30752, + "grad_norm": 1.0732128620147705, + "learning_rate": 1.984793917567027e-05, + "loss": 0.4821, + "step": 15084 + }, + { + "epoch": 19.3088, + "grad_norm": 1.0122027397155762, + "learning_rate": 1.984593837535014e-05, + "loss": 0.4702, + "step": 15085 + }, + { + "epoch": 19.31008, + "grad_norm": 1.1038622856140137, + "learning_rate": 1.9843937575030013e-05, + "loss": 0.5435, + "step": 15086 + }, + { + "epoch": 19.31136, + "grad_norm": 1.1267859935760498, + "learning_rate": 1.9841936774709885e-05, + "loss": 0.5358, + "step": 15087 + }, + { + "epoch": 19.31264, + "grad_norm": 1.109407663345337, + "learning_rate": 1.9839935974389757e-05, + "loss": 0.4993, + "step": 15088 + }, + { + "epoch": 19.31392, + "grad_norm": 1.0586917400360107, + "learning_rate": 1.983793517406963e-05, + "loss": 0.541, + "step": 15089 + }, + { + "epoch": 19.3152, + "grad_norm": 1.0636937618255615, + "learning_rate": 1.98359343737495e-05, + "loss": 0.5048, + "step": 15090 + }, + { + "epoch": 19.31648, + "grad_norm": 1.0607376098632812, + "learning_rate": 1.9833933573429373e-05, + "loss": 0.485, + "step": 15091 + }, + { + "epoch": 19.31776, + "grad_norm": 
1.100722074508667, + "learning_rate": 1.9831932773109244e-05, + "loss": 0.5225, + "step": 15092 + }, + { + "epoch": 19.31904, + "grad_norm": 1.06232750415802, + "learning_rate": 1.9829931972789116e-05, + "loss": 0.5353, + "step": 15093 + }, + { + "epoch": 19.32032, + "grad_norm": 1.0231244564056396, + "learning_rate": 1.9827931172468988e-05, + "loss": 0.5138, + "step": 15094 + }, + { + "epoch": 19.3216, + "grad_norm": 1.1289072036743164, + "learning_rate": 1.982593037214886e-05, + "loss": 0.5095, + "step": 15095 + }, + { + "epoch": 19.32288, + "grad_norm": 1.0838286876678467, + "learning_rate": 1.9823929571828735e-05, + "loss": 0.5244, + "step": 15096 + }, + { + "epoch": 19.32416, + "grad_norm": 0.9947683811187744, + "learning_rate": 1.9821928771508604e-05, + "loss": 0.4371, + "step": 15097 + }, + { + "epoch": 19.32544, + "grad_norm": 0.9954429864883423, + "learning_rate": 1.9819927971188476e-05, + "loss": 0.4654, + "step": 15098 + }, + { + "epoch": 19.32672, + "grad_norm": 0.9967473149299622, + "learning_rate": 1.9817927170868347e-05, + "loss": 0.4414, + "step": 15099 + }, + { + "epoch": 19.328, + "grad_norm": 1.0796631574630737, + "learning_rate": 1.9815926370548223e-05, + "loss": 0.4671, + "step": 15100 + }, + { + "epoch": 19.32928, + "grad_norm": 1.1318116188049316, + "learning_rate": 1.981392557022809e-05, + "loss": 0.487, + "step": 15101 + }, + { + "epoch": 19.33056, + "grad_norm": 1.1136324405670166, + "learning_rate": 1.9811924769907963e-05, + "loss": 0.502, + "step": 15102 + }, + { + "epoch": 19.33184, + "grad_norm": 1.093737244606018, + "learning_rate": 1.9809923969587835e-05, + "loss": 0.5176, + "step": 15103 + }, + { + "epoch": 19.33312, + "grad_norm": 1.1789205074310303, + "learning_rate": 1.980792316926771e-05, + "loss": 0.494, + "step": 15104 + }, + { + "epoch": 19.3344, + "grad_norm": 1.1485955715179443, + "learning_rate": 1.980592236894758e-05, + "loss": 0.5493, + "step": 15105 + }, + { + "epoch": 19.33568, + "grad_norm": 1.1409004926681519, + 
"learning_rate": 1.980392156862745e-05, + "loss": 0.5134, + "step": 15106 + }, + { + "epoch": 19.33696, + "grad_norm": 1.1257143020629883, + "learning_rate": 1.9801920768307326e-05, + "loss": 0.5045, + "step": 15107 + }, + { + "epoch": 19.33824, + "grad_norm": 1.071676254272461, + "learning_rate": 1.9799919967987197e-05, + "loss": 0.4601, + "step": 15108 + }, + { + "epoch": 19.33952, + "grad_norm": 1.0796996355056763, + "learning_rate": 1.9797919167667066e-05, + "loss": 0.5009, + "step": 15109 + }, + { + "epoch": 19.3408, + "grad_norm": 1.1070489883422852, + "learning_rate": 1.9795918367346938e-05, + "loss": 0.5274, + "step": 15110 + }, + { + "epoch": 19.34208, + "grad_norm": 1.0755043029785156, + "learning_rate": 1.9793917567026813e-05, + "loss": 0.5438, + "step": 15111 + }, + { + "epoch": 19.34336, + "grad_norm": 1.077373743057251, + "learning_rate": 1.9791916766706685e-05, + "loss": 0.4636, + "step": 15112 + }, + { + "epoch": 19.34464, + "grad_norm": 1.0781618356704712, + "learning_rate": 1.9789915966386553e-05, + "loss": 0.5117, + "step": 15113 + }, + { + "epoch": 19.34592, + "grad_norm": 1.0397669076919556, + "learning_rate": 1.978791516606643e-05, + "loss": 0.4753, + "step": 15114 + }, + { + "epoch": 19.3472, + "grad_norm": 1.0098005533218384, + "learning_rate": 1.97859143657463e-05, + "loss": 0.4595, + "step": 15115 + }, + { + "epoch": 19.34848, + "grad_norm": 1.054352879524231, + "learning_rate": 1.9783913565426172e-05, + "loss": 0.5363, + "step": 15116 + }, + { + "epoch": 19.34976, + "grad_norm": 1.1141222715377808, + "learning_rate": 1.978191276510604e-05, + "loss": 0.5044, + "step": 15117 + }, + { + "epoch": 19.35104, + "grad_norm": 1.093037486076355, + "learning_rate": 1.9779911964785916e-05, + "loss": 0.5358, + "step": 15118 + }, + { + "epoch": 19.35232, + "grad_norm": 1.0386536121368408, + "learning_rate": 1.9777911164465788e-05, + "loss": 0.4794, + "step": 15119 + }, + { + "epoch": 19.3536, + "grad_norm": 1.0772889852523804, + "learning_rate": 
1.977591036414566e-05, + "loss": 0.4993, + "step": 15120 + }, + { + "epoch": 19.35488, + "grad_norm": 1.0881242752075195, + "learning_rate": 1.977390956382553e-05, + "loss": 0.5044, + "step": 15121 + }, + { + "epoch": 19.35616, + "grad_norm": 1.0567618608474731, + "learning_rate": 1.9771908763505403e-05, + "loss": 0.497, + "step": 15122 + }, + { + "epoch": 19.35744, + "grad_norm": 1.0052014589309692, + "learning_rate": 1.9769907963185275e-05, + "loss": 0.4805, + "step": 15123 + }, + { + "epoch": 19.35872, + "grad_norm": 1.0756340026855469, + "learning_rate": 1.9767907162865147e-05, + "loss": 0.4962, + "step": 15124 + }, + { + "epoch": 19.36, + "grad_norm": 1.0965667963027954, + "learning_rate": 1.976590636254502e-05, + "loss": 0.4453, + "step": 15125 + }, + { + "epoch": 19.36128, + "grad_norm": 1.1271300315856934, + "learning_rate": 1.976390556222489e-05, + "loss": 0.4642, + "step": 15126 + }, + { + "epoch": 19.36256, + "grad_norm": 1.1453865766525269, + "learning_rate": 1.9761904761904763e-05, + "loss": 0.5407, + "step": 15127 + }, + { + "epoch": 19.36384, + "grad_norm": 1.1263774633407593, + "learning_rate": 1.9759903961584635e-05, + "loss": 0.5207, + "step": 15128 + }, + { + "epoch": 19.36512, + "grad_norm": 1.1071971654891968, + "learning_rate": 1.9757903161264506e-05, + "loss": 0.5006, + "step": 15129 + }, + { + "epoch": 19.3664, + "grad_norm": 1.0227842330932617, + "learning_rate": 1.975590236094438e-05, + "loss": 0.4618, + "step": 15130 + }, + { + "epoch": 19.36768, + "grad_norm": 1.094109058380127, + "learning_rate": 1.975390156062425e-05, + "loss": 0.4808, + "step": 15131 + }, + { + "epoch": 19.36896, + "grad_norm": 1.1060868501663208, + "learning_rate": 1.9751900760304122e-05, + "loss": 0.4996, + "step": 15132 + }, + { + "epoch": 19.37024, + "grad_norm": 1.075696587562561, + "learning_rate": 1.9749899959983994e-05, + "loss": 0.5082, + "step": 15133 + }, + { + "epoch": 19.37152, + "grad_norm": 1.0798813104629517, + "learning_rate": 1.9747899159663866e-05, 
+ "loss": 0.4954, + "step": 15134 + }, + { + "epoch": 19.3728, + "grad_norm": 1.1250864267349243, + "learning_rate": 1.974589835934374e-05, + "loss": 0.5539, + "step": 15135 + }, + { + "epoch": 19.37408, + "grad_norm": 1.0421397686004639, + "learning_rate": 1.974389755902361e-05, + "loss": 0.4999, + "step": 15136 + }, + { + "epoch": 19.37536, + "grad_norm": 1.0826746225357056, + "learning_rate": 1.974189675870348e-05, + "loss": 0.5566, + "step": 15137 + }, + { + "epoch": 19.37664, + "grad_norm": 1.078742504119873, + "learning_rate": 1.9739895958383353e-05, + "loss": 0.5022, + "step": 15138 + }, + { + "epoch": 19.37792, + "grad_norm": 1.0308053493499756, + "learning_rate": 1.973789515806323e-05, + "loss": 0.4586, + "step": 15139 + }, + { + "epoch": 19.3792, + "grad_norm": 1.0350420475006104, + "learning_rate": 1.9735894357743097e-05, + "loss": 0.5138, + "step": 15140 + }, + { + "epoch": 19.38048, + "grad_norm": 1.1027657985687256, + "learning_rate": 1.973389355742297e-05, + "loss": 0.538, + "step": 15141 + }, + { + "epoch": 19.38176, + "grad_norm": 1.0577800273895264, + "learning_rate": 1.9731892757102844e-05, + "loss": 0.522, + "step": 15142 + }, + { + "epoch": 19.38304, + "grad_norm": 1.043058156967163, + "learning_rate": 1.9729891956782716e-05, + "loss": 0.467, + "step": 15143 + }, + { + "epoch": 19.38432, + "grad_norm": 1.0484575033187866, + "learning_rate": 1.9727891156462584e-05, + "loss": 0.524, + "step": 15144 + }, + { + "epoch": 19.3856, + "grad_norm": 1.0222641229629517, + "learning_rate": 1.9725890356142456e-05, + "loss": 0.5035, + "step": 15145 + }, + { + "epoch": 19.38688, + "grad_norm": 0.9348340630531311, + "learning_rate": 1.972388955582233e-05, + "loss": 0.4127, + "step": 15146 + }, + { + "epoch": 19.38816, + "grad_norm": 1.0297220945358276, + "learning_rate": 1.9721888755502203e-05, + "loss": 0.4698, + "step": 15147 + }, + { + "epoch": 19.38944, + "grad_norm": 1.1119678020477295, + "learning_rate": 1.9719887955182072e-05, + "loss": 0.5864, + 
"step": 15148 + }, + { + "epoch": 19.39072, + "grad_norm": 1.0301072597503662, + "learning_rate": 1.9717887154861947e-05, + "loss": 0.4696, + "step": 15149 + }, + { + "epoch": 19.392, + "grad_norm": 1.0419156551361084, + "learning_rate": 1.971588635454182e-05, + "loss": 0.4809, + "step": 15150 + }, + { + "epoch": 19.39328, + "grad_norm": 1.1309643983840942, + "learning_rate": 1.971388555422169e-05, + "loss": 0.5239, + "step": 15151 + }, + { + "epoch": 19.39456, + "grad_norm": 1.0294731855392456, + "learning_rate": 1.971188475390156e-05, + "loss": 0.4653, + "step": 15152 + }, + { + "epoch": 19.39584, + "grad_norm": 1.0680971145629883, + "learning_rate": 1.9709883953581434e-05, + "loss": 0.5049, + "step": 15153 + }, + { + "epoch": 19.39712, + "grad_norm": 1.0883816480636597, + "learning_rate": 1.9707883153261306e-05, + "loss": 0.4632, + "step": 15154 + }, + { + "epoch": 19.3984, + "grad_norm": 1.0957615375518799, + "learning_rate": 1.9705882352941178e-05, + "loss": 0.4984, + "step": 15155 + }, + { + "epoch": 19.39968, + "grad_norm": 1.0659648180007935, + "learning_rate": 1.970388155262105e-05, + "loss": 0.5045, + "step": 15156 + }, + { + "epoch": 19.40096, + "grad_norm": 1.0844171047210693, + "learning_rate": 1.9701880752300922e-05, + "loss": 0.4732, + "step": 15157 + }, + { + "epoch": 19.40224, + "grad_norm": 1.1332968473434448, + "learning_rate": 1.9699879951980794e-05, + "loss": 0.5328, + "step": 15158 + }, + { + "epoch": 19.40352, + "grad_norm": 1.1378060579299927, + "learning_rate": 1.9697879151660666e-05, + "loss": 0.5104, + "step": 15159 + }, + { + "epoch": 19.4048, + "grad_norm": 1.137235164642334, + "learning_rate": 1.9695878351340537e-05, + "loss": 0.5345, + "step": 15160 + }, + { + "epoch": 19.40608, + "grad_norm": 1.0622361898422241, + "learning_rate": 1.969387755102041e-05, + "loss": 0.4613, + "step": 15161 + }, + { + "epoch": 19.40736, + "grad_norm": 1.0199161767959595, + "learning_rate": 1.969187675070028e-05, + "loss": 0.4496, + "step": 15162 + }, + { 
+ "epoch": 19.40864, + "grad_norm": 1.075202226638794, + "learning_rate": 1.9689875950380153e-05, + "loss": 0.521, + "step": 15163 + }, + { + "epoch": 19.40992, + "grad_norm": 1.0620388984680176, + "learning_rate": 1.9687875150060025e-05, + "loss": 0.4758, + "step": 15164 + }, + { + "epoch": 19.4112, + "grad_norm": 1.073757529258728, + "learning_rate": 1.9685874349739897e-05, + "loss": 0.4867, + "step": 15165 + }, + { + "epoch": 19.41248, + "grad_norm": 1.063657522201538, + "learning_rate": 1.968387354941977e-05, + "loss": 0.5093, + "step": 15166 + }, + { + "epoch": 19.41376, + "grad_norm": 1.1278645992279053, + "learning_rate": 1.968187274909964e-05, + "loss": 0.4889, + "step": 15167 + }, + { + "epoch": 19.41504, + "grad_norm": 1.0807961225509644, + "learning_rate": 1.9679871948779512e-05, + "loss": 0.5224, + "step": 15168 + }, + { + "epoch": 19.41632, + "grad_norm": 1.0827388763427734, + "learning_rate": 1.9677871148459384e-05, + "loss": 0.5286, + "step": 15169 + }, + { + "epoch": 19.4176, + "grad_norm": 1.0594857931137085, + "learning_rate": 1.9675870348139256e-05, + "loss": 0.4851, + "step": 15170 + }, + { + "epoch": 19.41888, + "grad_norm": 1.0610501766204834, + "learning_rate": 1.9673869547819128e-05, + "loss": 0.4941, + "step": 15171 + }, + { + "epoch": 19.42016, + "grad_norm": 1.06877863407135, + "learning_rate": 1.9671868747499e-05, + "loss": 0.4593, + "step": 15172 + }, + { + "epoch": 19.42144, + "grad_norm": 1.0728821754455566, + "learning_rate": 1.966986794717887e-05, + "loss": 0.5086, + "step": 15173 + }, + { + "epoch": 19.422719999999998, + "grad_norm": 1.062900185585022, + "learning_rate": 1.9667867146858747e-05, + "loss": 0.4906, + "step": 15174 + }, + { + "epoch": 19.424, + "grad_norm": 1.0392909049987793, + "learning_rate": 1.9665866346538615e-05, + "loss": 0.4578, + "step": 15175 + }, + { + "epoch": 19.42528, + "grad_norm": 1.0775206089019775, + "learning_rate": 1.9663865546218487e-05, + "loss": 0.4938, + "step": 15176 + }, + { + "epoch": 
19.42656, + "grad_norm": 1.0761048793792725, + "learning_rate": 1.966186474589836e-05, + "loss": 0.5025, + "step": 15177 + }, + { + "epoch": 19.42784, + "grad_norm": 1.1409354209899902, + "learning_rate": 1.9659863945578234e-05, + "loss": 0.545, + "step": 15178 + }, + { + "epoch": 19.42912, + "grad_norm": 1.0487301349639893, + "learning_rate": 1.9657863145258103e-05, + "loss": 0.4613, + "step": 15179 + }, + { + "epoch": 19.4304, + "grad_norm": 1.12277090549469, + "learning_rate": 1.9655862344937975e-05, + "loss": 0.5116, + "step": 15180 + }, + { + "epoch": 19.43168, + "grad_norm": 1.0349456071853638, + "learning_rate": 1.965386154461785e-05, + "loss": 0.4875, + "step": 15181 + }, + { + "epoch": 19.43296, + "grad_norm": 1.0730128288269043, + "learning_rate": 1.965186074429772e-05, + "loss": 0.4988, + "step": 15182 + }, + { + "epoch": 19.43424, + "grad_norm": 1.111451268196106, + "learning_rate": 1.964985994397759e-05, + "loss": 0.4995, + "step": 15183 + }, + { + "epoch": 19.43552, + "grad_norm": 1.0789837837219238, + "learning_rate": 1.9647859143657462e-05, + "loss": 0.5073, + "step": 15184 + }, + { + "epoch": 19.4368, + "grad_norm": 1.0748100280761719, + "learning_rate": 1.9645858343337337e-05, + "loss": 0.4898, + "step": 15185 + }, + { + "epoch": 19.43808, + "grad_norm": 1.031887173652649, + "learning_rate": 1.964385754301721e-05, + "loss": 0.4586, + "step": 15186 + }, + { + "epoch": 19.43936, + "grad_norm": 1.1339378356933594, + "learning_rate": 1.9641856742697078e-05, + "loss": 0.5464, + "step": 15187 + }, + { + "epoch": 19.44064, + "grad_norm": 1.0379207134246826, + "learning_rate": 1.9639855942376953e-05, + "loss": 0.4321, + "step": 15188 + }, + { + "epoch": 19.44192, + "grad_norm": 1.0902132987976074, + "learning_rate": 1.9637855142056825e-05, + "loss": 0.5081, + "step": 15189 + }, + { + "epoch": 19.4432, + "grad_norm": 1.0853605270385742, + "learning_rate": 1.9635854341736697e-05, + "loss": 0.4636, + "step": 15190 + }, + { + "epoch": 19.44448, + "grad_norm": 
1.0282881259918213, + "learning_rate": 1.9633853541416565e-05, + "loss": 0.5072, + "step": 15191 + }, + { + "epoch": 19.44576, + "grad_norm": 1.1072465181350708, + "learning_rate": 1.963185274109644e-05, + "loss": 0.5032, + "step": 15192 + }, + { + "epoch": 19.44704, + "grad_norm": 1.0915943384170532, + "learning_rate": 1.9629851940776312e-05, + "loss": 0.5344, + "step": 15193 + }, + { + "epoch": 19.44832, + "grad_norm": 1.1000624895095825, + "learning_rate": 1.9627851140456184e-05, + "loss": 0.449, + "step": 15194 + }, + { + "epoch": 19.4496, + "grad_norm": 1.1032449007034302, + "learning_rate": 1.9625850340136056e-05, + "loss": 0.5057, + "step": 15195 + }, + { + "epoch": 19.45088, + "grad_norm": 1.0936930179595947, + "learning_rate": 1.9623849539815928e-05, + "loss": 0.5342, + "step": 15196 + }, + { + "epoch": 19.45216, + "grad_norm": 1.1406618356704712, + "learning_rate": 1.96218487394958e-05, + "loss": 0.514, + "step": 15197 + }, + { + "epoch": 19.45344, + "grad_norm": 1.1138516664505005, + "learning_rate": 1.961984793917567e-05, + "loss": 0.4917, + "step": 15198 + }, + { + "epoch": 19.454720000000002, + "grad_norm": 1.051370620727539, + "learning_rate": 1.9617847138855543e-05, + "loss": 0.497, + "step": 15199 + }, + { + "epoch": 19.456, + "grad_norm": 1.126387357711792, + "learning_rate": 1.9615846338535415e-05, + "loss": 0.5121, + "step": 15200 + }, + { + "epoch": 19.45728, + "grad_norm": 1.0682495832443237, + "learning_rate": 1.9613845538215287e-05, + "loss": 0.5013, + "step": 15201 + }, + { + "epoch": 19.45856, + "grad_norm": 1.1218750476837158, + "learning_rate": 1.961184473789516e-05, + "loss": 0.5307, + "step": 15202 + }, + { + "epoch": 19.45984, + "grad_norm": 1.0630134344100952, + "learning_rate": 1.960984393757503e-05, + "loss": 0.4964, + "step": 15203 + }, + { + "epoch": 19.46112, + "grad_norm": 1.0802773237228394, + "learning_rate": 1.9607843137254903e-05, + "loss": 0.493, + "step": 15204 + }, + { + "epoch": 19.4624, + "grad_norm": 
1.061515212059021, + "learning_rate": 1.9605842336934774e-05, + "loss": 0.5217, + "step": 15205 + }, + { + "epoch": 19.46368, + "grad_norm": 1.0091540813446045, + "learning_rate": 1.9603841536614646e-05, + "loss": 0.4742, + "step": 15206 + }, + { + "epoch": 19.46496, + "grad_norm": 1.0679512023925781, + "learning_rate": 1.9601840736294518e-05, + "loss": 0.5139, + "step": 15207 + }, + { + "epoch": 19.46624, + "grad_norm": 1.019675850868225, + "learning_rate": 1.959983993597439e-05, + "loss": 0.4558, + "step": 15208 + }, + { + "epoch": 19.46752, + "grad_norm": 1.1162679195404053, + "learning_rate": 1.9597839135654265e-05, + "loss": 0.4691, + "step": 15209 + }, + { + "epoch": 19.4688, + "grad_norm": 1.0296701192855835, + "learning_rate": 1.9595838335334134e-05, + "loss": 0.4811, + "step": 15210 + }, + { + "epoch": 19.47008, + "grad_norm": 1.0854653120040894, + "learning_rate": 1.9593837535014005e-05, + "loss": 0.5192, + "step": 15211 + }, + { + "epoch": 19.47136, + "grad_norm": 1.088734745979309, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.507, + "step": 15212 + }, + { + "epoch": 19.47264, + "grad_norm": 1.077596664428711, + "learning_rate": 1.9589835934373753e-05, + "loss": 0.4915, + "step": 15213 + }, + { + "epoch": 19.47392, + "grad_norm": 1.1279915571212769, + "learning_rate": 1.958783513405362e-05, + "loss": 0.5085, + "step": 15214 + }, + { + "epoch": 19.4752, + "grad_norm": 1.0682718753814697, + "learning_rate": 1.9585834333733493e-05, + "loss": 0.4905, + "step": 15215 + }, + { + "epoch": 19.47648, + "grad_norm": 1.0742157697677612, + "learning_rate": 1.9583833533413365e-05, + "loss": 0.4806, + "step": 15216 + }, + { + "epoch": 19.47776, + "grad_norm": 1.042440414428711, + "learning_rate": 1.958183273309324e-05, + "loss": 0.5013, + "step": 15217 + }, + { + "epoch": 19.47904, + "grad_norm": 1.071058750152588, + "learning_rate": 1.957983193277311e-05, + "loss": 0.5188, + "step": 15218 + }, + { + "epoch": 19.48032, + "grad_norm": 1.0596333742141724, + 
"learning_rate": 1.957783113245298e-05, + "loss": 0.5147, + "step": 15219 + }, + { + "epoch": 19.4816, + "grad_norm": 1.0863871574401855, + "learning_rate": 1.9575830332132856e-05, + "loss": 0.5211, + "step": 15220 + }, + { + "epoch": 19.48288, + "grad_norm": 1.0600311756134033, + "learning_rate": 1.9573829531812727e-05, + "loss": 0.4686, + "step": 15221 + }, + { + "epoch": 19.48416, + "grad_norm": 1.0868990421295166, + "learning_rate": 1.9571828731492596e-05, + "loss": 0.5152, + "step": 15222 + }, + { + "epoch": 19.48544, + "grad_norm": 1.065700888633728, + "learning_rate": 1.9569827931172468e-05, + "loss": 0.4934, + "step": 15223 + }, + { + "epoch": 19.48672, + "grad_norm": 1.0733686685562134, + "learning_rate": 1.9567827130852343e-05, + "loss": 0.4662, + "step": 15224 + }, + { + "epoch": 19.488, + "grad_norm": 1.0808594226837158, + "learning_rate": 1.9565826330532215e-05, + "loss": 0.497, + "step": 15225 + }, + { + "epoch": 19.48928, + "grad_norm": 1.066211223602295, + "learning_rate": 1.9563825530212083e-05, + "loss": 0.549, + "step": 15226 + }, + { + "epoch": 19.49056, + "grad_norm": 1.0072721242904663, + "learning_rate": 1.956182472989196e-05, + "loss": 0.4312, + "step": 15227 + }, + { + "epoch": 19.49184, + "grad_norm": 1.107842206954956, + "learning_rate": 1.955982392957183e-05, + "loss": 0.4793, + "step": 15228 + }, + { + "epoch": 19.49312, + "grad_norm": 1.1411617994308472, + "learning_rate": 1.9557823129251702e-05, + "loss": 0.534, + "step": 15229 + }, + { + "epoch": 19.4944, + "grad_norm": 1.0612369775772095, + "learning_rate": 1.955582232893157e-05, + "loss": 0.5087, + "step": 15230 + }, + { + "epoch": 19.49568, + "grad_norm": 1.0148245096206665, + "learning_rate": 1.9553821528611446e-05, + "loss": 0.4518, + "step": 15231 + }, + { + "epoch": 19.49696, + "grad_norm": 1.133119821548462, + "learning_rate": 1.9551820728291318e-05, + "loss": 0.5301, + "step": 15232 + }, + { + "epoch": 19.49824, + "grad_norm": 1.0392405986785889, + "learning_rate": 
1.954981992797119e-05, + "loss": 0.4912, + "step": 15233 + }, + { + "epoch": 19.49952, + "grad_norm": 1.0500421524047852, + "learning_rate": 1.954781912765106e-05, + "loss": 0.4866, + "step": 15234 + }, + { + "epoch": 19.5008, + "grad_norm": 1.0911407470703125, + "learning_rate": 1.9545818327330933e-05, + "loss": 0.5114, + "step": 15235 + }, + { + "epoch": 19.50208, + "grad_norm": 1.0419782400131226, + "learning_rate": 1.9543817527010805e-05, + "loss": 0.4925, + "step": 15236 + }, + { + "epoch": 19.50336, + "grad_norm": 1.1287118196487427, + "learning_rate": 1.9541816726690677e-05, + "loss": 0.5297, + "step": 15237 + }, + { + "epoch": 19.50464, + "grad_norm": 1.0417295694351196, + "learning_rate": 1.953981592637055e-05, + "loss": 0.4834, + "step": 15238 + }, + { + "epoch": 19.50592, + "grad_norm": 1.098235011100769, + "learning_rate": 1.953781512605042e-05, + "loss": 0.5075, + "step": 15239 + }, + { + "epoch": 19.5072, + "grad_norm": 1.0871201753616333, + "learning_rate": 1.9535814325730293e-05, + "loss": 0.4985, + "step": 15240 + }, + { + "epoch": 19.50848, + "grad_norm": 1.1194583177566528, + "learning_rate": 1.9533813525410165e-05, + "loss": 0.5293, + "step": 15241 + }, + { + "epoch": 19.50976, + "grad_norm": 1.0736286640167236, + "learning_rate": 1.9531812725090036e-05, + "loss": 0.488, + "step": 15242 + }, + { + "epoch": 19.51104, + "grad_norm": 1.0650372505187988, + "learning_rate": 1.9529811924769908e-05, + "loss": 0.4828, + "step": 15243 + }, + { + "epoch": 19.51232, + "grad_norm": 1.0762523412704468, + "learning_rate": 1.952781112444978e-05, + "loss": 0.5154, + "step": 15244 + }, + { + "epoch": 19.5136, + "grad_norm": 1.1131715774536133, + "learning_rate": 1.9525810324129652e-05, + "loss": 0.4885, + "step": 15245 + }, + { + "epoch": 19.51488, + "grad_norm": 1.0484470129013062, + "learning_rate": 1.9523809523809524e-05, + "loss": 0.4878, + "step": 15246 + }, + { + "epoch": 19.51616, + "grad_norm": 1.0681511163711548, + "learning_rate": 
1.9521808723489396e-05, + "loss": 0.5208, + "step": 15247 + }, + { + "epoch": 19.51744, + "grad_norm": 1.0821027755737305, + "learning_rate": 1.951980792316927e-05, + "loss": 0.53, + "step": 15248 + }, + { + "epoch": 19.51872, + "grad_norm": 1.14222252368927, + "learning_rate": 1.951780712284914e-05, + "loss": 0.5255, + "step": 15249 + }, + { + "epoch": 19.52, + "grad_norm": 1.0315016508102417, + "learning_rate": 1.951580632252901e-05, + "loss": 0.4661, + "step": 15250 + }, + { + "epoch": 19.52128, + "grad_norm": 1.0619421005249023, + "learning_rate": 1.9513805522208883e-05, + "loss": 0.5088, + "step": 15251 + }, + { + "epoch": 19.52256, + "grad_norm": 1.1743899583816528, + "learning_rate": 1.951180472188876e-05, + "loss": 0.5224, + "step": 15252 + }, + { + "epoch": 19.52384, + "grad_norm": 1.0858947038650513, + "learning_rate": 1.9509803921568627e-05, + "loss": 0.4706, + "step": 15253 + }, + { + "epoch": 19.52512, + "grad_norm": 1.1015316247940063, + "learning_rate": 1.95078031212485e-05, + "loss": 0.5343, + "step": 15254 + }, + { + "epoch": 19.5264, + "grad_norm": 1.0959560871124268, + "learning_rate": 1.9505802320928374e-05, + "loss": 0.4969, + "step": 15255 + }, + { + "epoch": 19.52768, + "grad_norm": 1.0881457328796387, + "learning_rate": 1.9503801520608246e-05, + "loss": 0.5351, + "step": 15256 + }, + { + "epoch": 19.52896, + "grad_norm": 1.0661768913269043, + "learning_rate": 1.9501800720288114e-05, + "loss": 0.4861, + "step": 15257 + }, + { + "epoch": 19.53024, + "grad_norm": 1.1012307405471802, + "learning_rate": 1.9499799919967986e-05, + "loss": 0.5191, + "step": 15258 + }, + { + "epoch": 19.53152, + "grad_norm": 1.0949671268463135, + "learning_rate": 1.949779911964786e-05, + "loss": 0.5407, + "step": 15259 + }, + { + "epoch": 19.5328, + "grad_norm": 1.0634304285049438, + "learning_rate": 1.9495798319327733e-05, + "loss": 0.4695, + "step": 15260 + }, + { + "epoch": 19.53408, + "grad_norm": 1.0616800785064697, + "learning_rate": 1.9493797519007602e-05, + 
"loss": 0.4984, + "step": 15261 + }, + { + "epoch": 19.53536, + "grad_norm": 1.1313798427581787, + "learning_rate": 1.9491796718687477e-05, + "loss": 0.5477, + "step": 15262 + }, + { + "epoch": 19.53664, + "grad_norm": 1.0650761127471924, + "learning_rate": 1.948979591836735e-05, + "loss": 0.5243, + "step": 15263 + }, + { + "epoch": 19.53792, + "grad_norm": 1.0772321224212646, + "learning_rate": 1.948779511804722e-05, + "loss": 0.4764, + "step": 15264 + }, + { + "epoch": 19.5392, + "grad_norm": 1.0623836517333984, + "learning_rate": 1.948579431772709e-05, + "loss": 0.5006, + "step": 15265 + }, + { + "epoch": 19.54048, + "grad_norm": 0.9955456852912903, + "learning_rate": 1.9483793517406964e-05, + "loss": 0.4675, + "step": 15266 + }, + { + "epoch": 19.54176, + "grad_norm": 1.0686817169189453, + "learning_rate": 1.9481792717086836e-05, + "loss": 0.5182, + "step": 15267 + }, + { + "epoch": 19.54304, + "grad_norm": 1.1312791109085083, + "learning_rate": 1.9479791916766708e-05, + "loss": 0.5157, + "step": 15268 + }, + { + "epoch": 19.54432, + "grad_norm": 1.1050148010253906, + "learning_rate": 1.947779111644658e-05, + "loss": 0.5053, + "step": 15269 + }, + { + "epoch": 19.5456, + "grad_norm": 1.0860576629638672, + "learning_rate": 1.9475790316126452e-05, + "loss": 0.4846, + "step": 15270 + }, + { + "epoch": 19.54688, + "grad_norm": 1.0629215240478516, + "learning_rate": 1.9473789515806324e-05, + "loss": 0.5131, + "step": 15271 + }, + { + "epoch": 19.54816, + "grad_norm": 1.05284583568573, + "learning_rate": 1.9471788715486196e-05, + "loss": 0.4852, + "step": 15272 + }, + { + "epoch": 19.54944, + "grad_norm": 1.0399121046066284, + "learning_rate": 1.9469787915166067e-05, + "loss": 0.4853, + "step": 15273 + }, + { + "epoch": 19.55072, + "grad_norm": 1.0178717374801636, + "learning_rate": 1.946778711484594e-05, + "loss": 0.4905, + "step": 15274 + }, + { + "epoch": 19.552, + "grad_norm": 1.0732840299606323, + "learning_rate": 1.946578631452581e-05, + "loss": 0.5017, + 
"step": 15275 + }, + { + "epoch": 19.55328, + "grad_norm": 1.1350263357162476, + "learning_rate": 1.9463785514205683e-05, + "loss": 0.5214, + "step": 15276 + }, + { + "epoch": 19.55456, + "grad_norm": 1.050378441810608, + "learning_rate": 1.9461784713885555e-05, + "loss": 0.4801, + "step": 15277 + }, + { + "epoch": 19.55584, + "grad_norm": 1.0464566946029663, + "learning_rate": 1.9459783913565427e-05, + "loss": 0.4978, + "step": 15278 + }, + { + "epoch": 19.55712, + "grad_norm": 1.0169779062271118, + "learning_rate": 1.94577831132453e-05, + "loss": 0.4731, + "step": 15279 + }, + { + "epoch": 19.5584, + "grad_norm": 1.0549458265304565, + "learning_rate": 1.945578231292517e-05, + "loss": 0.498, + "step": 15280 + }, + { + "epoch": 19.55968, + "grad_norm": 1.0301837921142578, + "learning_rate": 1.9453781512605042e-05, + "loss": 0.5407, + "step": 15281 + }, + { + "epoch": 19.56096, + "grad_norm": 1.0875555276870728, + "learning_rate": 1.9451780712284914e-05, + "loss": 0.509, + "step": 15282 + }, + { + "epoch": 19.56224, + "grad_norm": 1.072562336921692, + "learning_rate": 1.9449779911964786e-05, + "loss": 0.4804, + "step": 15283 + }, + { + "epoch": 19.56352, + "grad_norm": 1.0816986560821533, + "learning_rate": 1.9447779111644658e-05, + "loss": 0.5124, + "step": 15284 + }, + { + "epoch": 19.564799999999998, + "grad_norm": 1.0577350854873657, + "learning_rate": 1.944577831132453e-05, + "loss": 0.5007, + "step": 15285 + }, + { + "epoch": 19.56608, + "grad_norm": 1.0548467636108398, + "learning_rate": 1.94437775110044e-05, + "loss": 0.4651, + "step": 15286 + }, + { + "epoch": 19.56736, + "grad_norm": 1.0653223991394043, + "learning_rate": 1.9441776710684277e-05, + "loss": 0.4804, + "step": 15287 + }, + { + "epoch": 19.56864, + "grad_norm": 1.1180814504623413, + "learning_rate": 1.9439775910364145e-05, + "loss": 0.4905, + "step": 15288 + }, + { + "epoch": 19.56992, + "grad_norm": 1.0680838823318481, + "learning_rate": 1.9437775110044017e-05, + "loss": 0.4631, + "step": 
15289 + }, + { + "epoch": 19.5712, + "grad_norm": 1.106482982635498, + "learning_rate": 1.943577430972389e-05, + "loss": 0.4835, + "step": 15290 + }, + { + "epoch": 19.57248, + "grad_norm": 1.105340838432312, + "learning_rate": 1.9433773509403764e-05, + "loss": 0.5132, + "step": 15291 + }, + { + "epoch": 19.57376, + "grad_norm": 1.0390416383743286, + "learning_rate": 1.9431772709083633e-05, + "loss": 0.4297, + "step": 15292 + }, + { + "epoch": 19.57504, + "grad_norm": 1.0809773206710815, + "learning_rate": 1.9429771908763505e-05, + "loss": 0.5219, + "step": 15293 + }, + { + "epoch": 19.57632, + "grad_norm": 0.9835126996040344, + "learning_rate": 1.942777110844338e-05, + "loss": 0.4615, + "step": 15294 + }, + { + "epoch": 19.5776, + "grad_norm": 1.0638185739517212, + "learning_rate": 1.942577030812325e-05, + "loss": 0.511, + "step": 15295 + }, + { + "epoch": 19.57888, + "grad_norm": 1.027719259262085, + "learning_rate": 1.942376950780312e-05, + "loss": 0.4703, + "step": 15296 + }, + { + "epoch": 19.58016, + "grad_norm": 1.035800814628601, + "learning_rate": 1.9421768707482992e-05, + "loss": 0.4904, + "step": 15297 + }, + { + "epoch": 19.58144, + "grad_norm": 1.0650843381881714, + "learning_rate": 1.9419767907162867e-05, + "loss": 0.512, + "step": 15298 + }, + { + "epoch": 19.58272, + "grad_norm": 1.0700833797454834, + "learning_rate": 1.941776710684274e-05, + "loss": 0.494, + "step": 15299 + }, + { + "epoch": 19.584, + "grad_norm": 1.0548943281173706, + "learning_rate": 1.9415766306522608e-05, + "loss": 0.4706, + "step": 15300 + }, + { + "epoch": 19.58528, + "grad_norm": 1.083795189857483, + "learning_rate": 1.9413765506202483e-05, + "loss": 0.4939, + "step": 15301 + }, + { + "epoch": 19.58656, + "grad_norm": 1.0601279735565186, + "learning_rate": 1.9411764705882355e-05, + "loss": 0.4932, + "step": 15302 + }, + { + "epoch": 19.58784, + "grad_norm": 1.109287977218628, + "learning_rate": 1.9409763905562226e-05, + "loss": 0.5211, + "step": 15303 + }, + { + "epoch": 
19.58912, + "grad_norm": 1.0838240385055542, + "learning_rate": 1.9407763105242095e-05, + "loss": 0.4992, + "step": 15304 + }, + { + "epoch": 19.5904, + "grad_norm": 1.0001522302627563, + "learning_rate": 1.940576230492197e-05, + "loss": 0.4945, + "step": 15305 + }, + { + "epoch": 19.59168, + "grad_norm": 1.0223277807235718, + "learning_rate": 1.9403761504601842e-05, + "loss": 0.4509, + "step": 15306 + }, + { + "epoch": 19.59296, + "grad_norm": 1.0435092449188232, + "learning_rate": 1.9401760704281714e-05, + "loss": 0.4684, + "step": 15307 + }, + { + "epoch": 19.59424, + "grad_norm": 1.0610374212265015, + "learning_rate": 1.9399759903961586e-05, + "loss": 0.5055, + "step": 15308 + }, + { + "epoch": 19.59552, + "grad_norm": 1.1668132543563843, + "learning_rate": 1.9397759103641458e-05, + "loss": 0.5376, + "step": 15309 + }, + { + "epoch": 19.5968, + "grad_norm": 1.0593454837799072, + "learning_rate": 1.939575830332133e-05, + "loss": 0.4956, + "step": 15310 + }, + { + "epoch": 19.59808, + "grad_norm": 1.0181503295898438, + "learning_rate": 1.93937575030012e-05, + "loss": 0.4587, + "step": 15311 + }, + { + "epoch": 19.59936, + "grad_norm": 1.0744041204452515, + "learning_rate": 1.9391756702681073e-05, + "loss": 0.4853, + "step": 15312 + }, + { + "epoch": 19.60064, + "grad_norm": 1.075843334197998, + "learning_rate": 1.9389755902360945e-05, + "loss": 0.5086, + "step": 15313 + }, + { + "epoch": 19.60192, + "grad_norm": 1.0893293619155884, + "learning_rate": 1.9387755102040817e-05, + "loss": 0.5133, + "step": 15314 + }, + { + "epoch": 19.6032, + "grad_norm": 1.1128917932510376, + "learning_rate": 1.9385754301720692e-05, + "loss": 0.524, + "step": 15315 + }, + { + "epoch": 19.60448, + "grad_norm": 1.0819975137710571, + "learning_rate": 1.938375350140056e-05, + "loss": 0.4789, + "step": 15316 + }, + { + "epoch": 19.60576, + "grad_norm": 1.0744059085845947, + "learning_rate": 1.9381752701080432e-05, + "loss": 0.4927, + "step": 15317 + }, + { + "epoch": 19.60704, + 
"grad_norm": 1.0915199518203735, + "learning_rate": 1.9379751900760304e-05, + "loss": 0.5235, + "step": 15318 + }, + { + "epoch": 19.60832, + "grad_norm": 1.0718075037002563, + "learning_rate": 1.937775110044018e-05, + "loss": 0.4807, + "step": 15319 + }, + { + "epoch": 19.6096, + "grad_norm": 1.1020848751068115, + "learning_rate": 1.9375750300120048e-05, + "loss": 0.5598, + "step": 15320 + }, + { + "epoch": 19.61088, + "grad_norm": 1.1813325881958008, + "learning_rate": 1.937374949979992e-05, + "loss": 0.5096, + "step": 15321 + }, + { + "epoch": 19.61216, + "grad_norm": 1.097054362297058, + "learning_rate": 1.9371748699479795e-05, + "loss": 0.4994, + "step": 15322 + }, + { + "epoch": 19.61344, + "grad_norm": 1.1227426528930664, + "learning_rate": 1.9369747899159667e-05, + "loss": 0.517, + "step": 15323 + }, + { + "epoch": 19.61472, + "grad_norm": 1.0609750747680664, + "learning_rate": 1.9367747098839535e-05, + "loss": 0.4834, + "step": 15324 + }, + { + "epoch": 19.616, + "grad_norm": 1.1133859157562256, + "learning_rate": 1.9365746298519407e-05, + "loss": 0.5274, + "step": 15325 + }, + { + "epoch": 19.61728, + "grad_norm": 1.035995602607727, + "learning_rate": 1.9363745498199283e-05, + "loss": 0.4571, + "step": 15326 + }, + { + "epoch": 19.61856, + "grad_norm": 1.0632821321487427, + "learning_rate": 1.9361744697879154e-05, + "loss": 0.4931, + "step": 15327 + }, + { + "epoch": 19.61984, + "grad_norm": 1.1032476425170898, + "learning_rate": 1.9359743897559023e-05, + "loss": 0.4829, + "step": 15328 + }, + { + "epoch": 19.62112, + "grad_norm": 1.1157118082046509, + "learning_rate": 1.9357743097238895e-05, + "loss": 0.5142, + "step": 15329 + }, + { + "epoch": 19.6224, + "grad_norm": 1.0863420963287354, + "learning_rate": 1.935574229691877e-05, + "loss": 0.5191, + "step": 15330 + }, + { + "epoch": 19.62368, + "grad_norm": 1.044219970703125, + "learning_rate": 1.9353741496598642e-05, + "loss": 0.4757, + "step": 15331 + }, + { + "epoch": 19.62496, + "grad_norm": 
1.0467369556427002, + "learning_rate": 1.935174069627851e-05, + "loss": 0.5285, + "step": 15332 + }, + { + "epoch": 19.62624, + "grad_norm": 1.1456965208053589, + "learning_rate": 1.9349739895958386e-05, + "loss": 0.5265, + "step": 15333 + }, + { + "epoch": 19.62752, + "grad_norm": 1.1731479167938232, + "learning_rate": 1.9347739095638257e-05, + "loss": 0.5279, + "step": 15334 + }, + { + "epoch": 19.6288, + "grad_norm": 1.0172317028045654, + "learning_rate": 1.934573829531813e-05, + "loss": 0.4484, + "step": 15335 + }, + { + "epoch": 19.63008, + "grad_norm": 1.009181022644043, + "learning_rate": 1.9343737494997998e-05, + "loss": 0.4591, + "step": 15336 + }, + { + "epoch": 19.63136, + "grad_norm": 1.1361030340194702, + "learning_rate": 1.9341736694677873e-05, + "loss": 0.5232, + "step": 15337 + }, + { + "epoch": 19.63264, + "grad_norm": 1.0451850891113281, + "learning_rate": 1.9339735894357745e-05, + "loss": 0.4996, + "step": 15338 + }, + { + "epoch": 19.63392, + "grad_norm": 1.0648033618927002, + "learning_rate": 1.9337735094037617e-05, + "loss": 0.5315, + "step": 15339 + }, + { + "epoch": 19.6352, + "grad_norm": 1.0276292562484741, + "learning_rate": 1.933573429371749e-05, + "loss": 0.4351, + "step": 15340 + }, + { + "epoch": 19.63648, + "grad_norm": 1.0635597705841064, + "learning_rate": 1.933373349339736e-05, + "loss": 0.5164, + "step": 15341 + }, + { + "epoch": 19.63776, + "grad_norm": 1.0696072578430176, + "learning_rate": 1.9331732693077232e-05, + "loss": 0.4814, + "step": 15342 + }, + { + "epoch": 19.63904, + "grad_norm": 1.0718597173690796, + "learning_rate": 1.9329731892757104e-05, + "loss": 0.5205, + "step": 15343 + }, + { + "epoch": 19.64032, + "grad_norm": 1.0730284452438354, + "learning_rate": 1.9327731092436976e-05, + "loss": 0.5375, + "step": 15344 + }, + { + "epoch": 19.6416, + "grad_norm": 1.0293726921081543, + "learning_rate": 1.9325730292116848e-05, + "loss": 0.4832, + "step": 15345 + }, + { + "epoch": 19.64288, + "grad_norm": 1.0186909437179565, 
+ "learning_rate": 1.932372949179672e-05, + "loss": 0.4701, + "step": 15346 + }, + { + "epoch": 19.64416, + "grad_norm": 1.0497504472732544, + "learning_rate": 1.932172869147659e-05, + "loss": 0.4562, + "step": 15347 + }, + { + "epoch": 19.64544, + "grad_norm": 1.1007732152938843, + "learning_rate": 1.9319727891156463e-05, + "loss": 0.5467, + "step": 15348 + }, + { + "epoch": 19.64672, + "grad_norm": 1.1249552965164185, + "learning_rate": 1.9317727090836335e-05, + "loss": 0.4875, + "step": 15349 + }, + { + "epoch": 19.648, + "grad_norm": 1.0523337125778198, + "learning_rate": 1.9315726290516207e-05, + "loss": 0.4888, + "step": 15350 + }, + { + "epoch": 19.64928, + "grad_norm": 1.131332516670227, + "learning_rate": 1.931372549019608e-05, + "loss": 0.5072, + "step": 15351 + }, + { + "epoch": 19.65056, + "grad_norm": 1.137013554573059, + "learning_rate": 1.931172468987595e-05, + "loss": 0.4821, + "step": 15352 + }, + { + "epoch": 19.65184, + "grad_norm": 1.0466101169586182, + "learning_rate": 1.9309723889555823e-05, + "loss": 0.4602, + "step": 15353 + }, + { + "epoch": 19.65312, + "grad_norm": 0.9879499673843384, + "learning_rate": 1.9307723089235698e-05, + "loss": 0.4225, + "step": 15354 + }, + { + "epoch": 19.6544, + "grad_norm": 1.0248045921325684, + "learning_rate": 1.9305722288915566e-05, + "loss": 0.4888, + "step": 15355 + }, + { + "epoch": 19.65568, + "grad_norm": 1.0874507427215576, + "learning_rate": 1.9303721488595438e-05, + "loss": 0.5084, + "step": 15356 + }, + { + "epoch": 19.65696, + "grad_norm": 1.0725950002670288, + "learning_rate": 1.930172068827531e-05, + "loss": 0.5525, + "step": 15357 + }, + { + "epoch": 19.65824, + "grad_norm": 1.0607742071151733, + "learning_rate": 1.9299719887955185e-05, + "loss": 0.5004, + "step": 15358 + }, + { + "epoch": 19.65952, + "grad_norm": 1.1186076402664185, + "learning_rate": 1.9297719087635054e-05, + "loss": 0.5398, + "step": 15359 + }, + { + "epoch": 19.660800000000002, + "grad_norm": 1.0648303031921387, + 
"learning_rate": 1.9295718287314926e-05, + "loss": 0.4627, + "step": 15360 + }, + { + "epoch": 19.66208, + "grad_norm": 1.0890082120895386, + "learning_rate": 1.92937174869948e-05, + "loss": 0.5041, + "step": 15361 + }, + { + "epoch": 19.66336, + "grad_norm": 1.079013705253601, + "learning_rate": 1.9291716686674673e-05, + "loss": 0.5148, + "step": 15362 + }, + { + "epoch": 19.66464, + "grad_norm": 1.1085528135299683, + "learning_rate": 1.928971588635454e-05, + "loss": 0.534, + "step": 15363 + }, + { + "epoch": 19.66592, + "grad_norm": 1.06136953830719, + "learning_rate": 1.9287715086034413e-05, + "loss": 0.4668, + "step": 15364 + }, + { + "epoch": 19.6672, + "grad_norm": 1.0138074159622192, + "learning_rate": 1.928571428571429e-05, + "loss": 0.4922, + "step": 15365 + }, + { + "epoch": 19.66848, + "grad_norm": 1.0837033987045288, + "learning_rate": 1.928371348539416e-05, + "loss": 0.5442, + "step": 15366 + }, + { + "epoch": 19.66976, + "grad_norm": 1.1243685483932495, + "learning_rate": 1.928171268507403e-05, + "loss": 0.4993, + "step": 15367 + }, + { + "epoch": 19.67104, + "grad_norm": 1.0624139308929443, + "learning_rate": 1.9279711884753904e-05, + "loss": 0.4684, + "step": 15368 + }, + { + "epoch": 19.67232, + "grad_norm": 1.0839632749557495, + "learning_rate": 1.9277711084433776e-05, + "loss": 0.4872, + "step": 15369 + }, + { + "epoch": 19.6736, + "grad_norm": 1.078340768814087, + "learning_rate": 1.9275710284113648e-05, + "loss": 0.4678, + "step": 15370 + }, + { + "epoch": 19.67488, + "grad_norm": 1.0516289472579956, + "learning_rate": 1.9273709483793516e-05, + "loss": 0.4567, + "step": 15371 + }, + { + "epoch": 19.67616, + "grad_norm": 1.0610841512680054, + "learning_rate": 1.927170868347339e-05, + "loss": 0.4731, + "step": 15372 + }, + { + "epoch": 19.67744, + "grad_norm": 1.0811412334442139, + "learning_rate": 1.9269707883153263e-05, + "loss": 0.4978, + "step": 15373 + }, + { + "epoch": 19.67872, + "grad_norm": 1.0839446783065796, + "learning_rate": 
1.9267707082833135e-05, + "loss": 0.4874, + "step": 15374 + }, + { + "epoch": 19.68, + "grad_norm": 1.06401526927948, + "learning_rate": 1.9265706282513007e-05, + "loss": 0.508, + "step": 15375 + }, + { + "epoch": 19.68128, + "grad_norm": 1.0685803890228271, + "learning_rate": 1.926370548219288e-05, + "loss": 0.507, + "step": 15376 + }, + { + "epoch": 19.68256, + "grad_norm": 1.075534462928772, + "learning_rate": 1.926170468187275e-05, + "loss": 0.5059, + "step": 15377 + }, + { + "epoch": 19.68384, + "grad_norm": 1.0783944129943848, + "learning_rate": 1.9259703881552623e-05, + "loss": 0.4845, + "step": 15378 + }, + { + "epoch": 19.68512, + "grad_norm": 1.0071983337402344, + "learning_rate": 1.9257703081232494e-05, + "loss": 0.4961, + "step": 15379 + }, + { + "epoch": 19.6864, + "grad_norm": 1.0298351049423218, + "learning_rate": 1.9255702280912366e-05, + "loss": 0.512, + "step": 15380 + }, + { + "epoch": 19.68768, + "grad_norm": 1.0420536994934082, + "learning_rate": 1.9253701480592238e-05, + "loss": 0.5143, + "step": 15381 + }, + { + "epoch": 19.68896, + "grad_norm": 1.153039813041687, + "learning_rate": 1.925170068027211e-05, + "loss": 0.5164, + "step": 15382 + }, + { + "epoch": 19.69024, + "grad_norm": 1.0602798461914062, + "learning_rate": 1.9249699879951982e-05, + "loss": 0.5358, + "step": 15383 + }, + { + "epoch": 19.69152, + "grad_norm": 1.0772628784179688, + "learning_rate": 1.9247699079631854e-05, + "loss": 0.4898, + "step": 15384 + }, + { + "epoch": 19.6928, + "grad_norm": 1.1267281770706177, + "learning_rate": 1.9245698279311726e-05, + "loss": 0.5324, + "step": 15385 + }, + { + "epoch": 19.69408, + "grad_norm": 1.059523344039917, + "learning_rate": 1.9243697478991597e-05, + "loss": 0.4948, + "step": 15386 + }, + { + "epoch": 19.69536, + "grad_norm": 1.0987871885299683, + "learning_rate": 1.924169667867147e-05, + "loss": 0.4947, + "step": 15387 + }, + { + "epoch": 19.69664, + "grad_norm": 1.0897448062896729, + "learning_rate": 1.923969587835134e-05, + 
"loss": 0.5271, + "step": 15388 + }, + { + "epoch": 19.69792, + "grad_norm": 1.0368708372116089, + "learning_rate": 1.9237695078031213e-05, + "loss": 0.4986, + "step": 15389 + }, + { + "epoch": 19.6992, + "grad_norm": 1.063676357269287, + "learning_rate": 1.9235694277711085e-05, + "loss": 0.4886, + "step": 15390 + }, + { + "epoch": 19.70048, + "grad_norm": 1.1547781229019165, + "learning_rate": 1.9233693477390957e-05, + "loss": 0.5081, + "step": 15391 + }, + { + "epoch": 19.70176, + "grad_norm": 1.0699111223220825, + "learning_rate": 1.923169267707083e-05, + "loss": 0.4777, + "step": 15392 + }, + { + "epoch": 19.70304, + "grad_norm": 1.0858879089355469, + "learning_rate": 1.9229691876750704e-05, + "loss": 0.5082, + "step": 15393 + }, + { + "epoch": 19.70432, + "grad_norm": 1.145923376083374, + "learning_rate": 1.9227691076430572e-05, + "loss": 0.5587, + "step": 15394 + }, + { + "epoch": 19.7056, + "grad_norm": 1.0954890251159668, + "learning_rate": 1.9225690276110444e-05, + "loss": 0.5212, + "step": 15395 + }, + { + "epoch": 19.706879999999998, + "grad_norm": 1.0601500272750854, + "learning_rate": 1.9223689475790316e-05, + "loss": 0.4863, + "step": 15396 + }, + { + "epoch": 19.70816, + "grad_norm": 1.0982835292816162, + "learning_rate": 1.922168867547019e-05, + "loss": 0.4968, + "step": 15397 + }, + { + "epoch": 19.70944, + "grad_norm": 1.0785760879516602, + "learning_rate": 1.921968787515006e-05, + "loss": 0.5218, + "step": 15398 + }, + { + "epoch": 19.71072, + "grad_norm": 1.069139003753662, + "learning_rate": 1.921768707482993e-05, + "loss": 0.4671, + "step": 15399 + }, + { + "epoch": 19.712, + "grad_norm": 1.085194706916809, + "learning_rate": 1.9215686274509807e-05, + "loss": 0.5103, + "step": 15400 + }, + { + "epoch": 19.71328, + "grad_norm": 1.1250481605529785, + "learning_rate": 1.921368547418968e-05, + "loss": 0.5061, + "step": 15401 + }, + { + "epoch": 19.71456, + "grad_norm": 1.0836914777755737, + "learning_rate": 1.9211684673869547e-05, + "loss": 
0.4801, + "step": 15402 + }, + { + "epoch": 19.71584, + "grad_norm": 1.1382417678833008, + "learning_rate": 1.920968387354942e-05, + "loss": 0.5333, + "step": 15403 + }, + { + "epoch": 19.71712, + "grad_norm": 1.0880017280578613, + "learning_rate": 1.9207683073229294e-05, + "loss": 0.5191, + "step": 15404 + }, + { + "epoch": 19.7184, + "grad_norm": 1.089227557182312, + "learning_rate": 1.9205682272909166e-05, + "loss": 0.4923, + "step": 15405 + }, + { + "epoch": 19.71968, + "grad_norm": 1.0711054801940918, + "learning_rate": 1.9203681472589035e-05, + "loss": 0.5067, + "step": 15406 + }, + { + "epoch": 19.72096, + "grad_norm": 1.0419178009033203, + "learning_rate": 1.920168067226891e-05, + "loss": 0.4903, + "step": 15407 + }, + { + "epoch": 19.72224, + "grad_norm": 1.0852587223052979, + "learning_rate": 1.919967987194878e-05, + "loss": 0.5343, + "step": 15408 + }, + { + "epoch": 19.72352, + "grad_norm": 1.0830180644989014, + "learning_rate": 1.9197679071628653e-05, + "loss": 0.5289, + "step": 15409 + }, + { + "epoch": 19.7248, + "grad_norm": 1.050003170967102, + "learning_rate": 1.9195678271308522e-05, + "loss": 0.4923, + "step": 15410 + }, + { + "epoch": 19.72608, + "grad_norm": 1.1027849912643433, + "learning_rate": 1.9193677470988397e-05, + "loss": 0.5719, + "step": 15411 + }, + { + "epoch": 19.72736, + "grad_norm": 1.0838650465011597, + "learning_rate": 1.919167667066827e-05, + "loss": 0.5167, + "step": 15412 + }, + { + "epoch": 19.72864, + "grad_norm": 1.0345534086227417, + "learning_rate": 1.918967587034814e-05, + "loss": 0.4831, + "step": 15413 + }, + { + "epoch": 19.72992, + "grad_norm": 1.0454968214035034, + "learning_rate": 1.9187675070028013e-05, + "loss": 0.5205, + "step": 15414 + }, + { + "epoch": 19.7312, + "grad_norm": 1.0153090953826904, + "learning_rate": 1.9185674269707885e-05, + "loss": 0.5044, + "step": 15415 + }, + { + "epoch": 19.73248, + "grad_norm": 1.0405809879302979, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.4734, + "step": 
15416 + }, + { + "epoch": 19.73376, + "grad_norm": 1.0736857652664185, + "learning_rate": 1.918167266906763e-05, + "loss": 0.5145, + "step": 15417 + }, + { + "epoch": 19.73504, + "grad_norm": 1.0757614374160767, + "learning_rate": 1.91796718687475e-05, + "loss": 0.5051, + "step": 15418 + }, + { + "epoch": 19.73632, + "grad_norm": 1.074829339981079, + "learning_rate": 1.9177671068427372e-05, + "loss": 0.5238, + "step": 15419 + }, + { + "epoch": 19.7376, + "grad_norm": 1.0481175184249878, + "learning_rate": 1.9175670268107244e-05, + "loss": 0.501, + "step": 15420 + }, + { + "epoch": 19.73888, + "grad_norm": 1.1353001594543457, + "learning_rate": 1.9173669467787116e-05, + "loss": 0.4727, + "step": 15421 + }, + { + "epoch": 19.74016, + "grad_norm": 1.078800082206726, + "learning_rate": 1.9171668667466988e-05, + "loss": 0.4876, + "step": 15422 + }, + { + "epoch": 19.74144, + "grad_norm": 1.0725644826889038, + "learning_rate": 1.916966786714686e-05, + "loss": 0.489, + "step": 15423 + }, + { + "epoch": 19.74272, + "grad_norm": 1.0209654569625854, + "learning_rate": 1.916766706682673e-05, + "loss": 0.4757, + "step": 15424 + }, + { + "epoch": 19.744, + "grad_norm": 1.0331294536590576, + "learning_rate": 1.9165666266506603e-05, + "loss": 0.5106, + "step": 15425 + }, + { + "epoch": 19.74528, + "grad_norm": 1.0943084955215454, + "learning_rate": 1.9163665466186475e-05, + "loss": 0.4626, + "step": 15426 + }, + { + "epoch": 19.74656, + "grad_norm": 1.099189281463623, + "learning_rate": 1.9161664665866347e-05, + "loss": 0.4718, + "step": 15427 + }, + { + "epoch": 19.74784, + "grad_norm": 1.0979981422424316, + "learning_rate": 1.9159663865546222e-05, + "loss": 0.4987, + "step": 15428 + }, + { + "epoch": 19.74912, + "grad_norm": 1.0483365058898926, + "learning_rate": 1.915766306522609e-05, + "loss": 0.4775, + "step": 15429 + }, + { + "epoch": 19.7504, + "grad_norm": 1.0271607637405396, + "learning_rate": 1.9155662264905962e-05, + "loss": 0.4741, + "step": 15430 + }, + { + "epoch": 
19.75168, + "grad_norm": 1.0611716508865356, + "learning_rate": 1.9153661464585834e-05, + "loss": 0.5134, + "step": 15431 + }, + { + "epoch": 19.75296, + "grad_norm": 1.0353494882583618, + "learning_rate": 1.915166066426571e-05, + "loss": 0.4661, + "step": 15432 + }, + { + "epoch": 19.75424, + "grad_norm": 1.0064977407455444, + "learning_rate": 1.9149659863945578e-05, + "loss": 0.4732, + "step": 15433 + }, + { + "epoch": 19.75552, + "grad_norm": 1.0152562856674194, + "learning_rate": 1.914765906362545e-05, + "loss": 0.4847, + "step": 15434 + }, + { + "epoch": 19.7568, + "grad_norm": 1.1019326448440552, + "learning_rate": 1.9145658263305325e-05, + "loss": 0.5283, + "step": 15435 + }, + { + "epoch": 19.75808, + "grad_norm": 1.0624425411224365, + "learning_rate": 1.9143657462985197e-05, + "loss": 0.4991, + "step": 15436 + }, + { + "epoch": 19.75936, + "grad_norm": 1.075663685798645, + "learning_rate": 1.9141656662665065e-05, + "loss": 0.5066, + "step": 15437 + }, + { + "epoch": 19.76064, + "grad_norm": 1.1295933723449707, + "learning_rate": 1.9139655862344937e-05, + "loss": 0.575, + "step": 15438 + }, + { + "epoch": 19.76192, + "grad_norm": 1.036911129951477, + "learning_rate": 1.9137655062024813e-05, + "loss": 0.4768, + "step": 15439 + }, + { + "epoch": 19.7632, + "grad_norm": 1.107503056526184, + "learning_rate": 1.9135654261704684e-05, + "loss": 0.5262, + "step": 15440 + }, + { + "epoch": 19.76448, + "grad_norm": 1.0643945932388306, + "learning_rate": 1.9133653461384553e-05, + "loss": 0.4138, + "step": 15441 + }, + { + "epoch": 19.76576, + "grad_norm": 1.0519750118255615, + "learning_rate": 1.9131652661064425e-05, + "loss": 0.4887, + "step": 15442 + }, + { + "epoch": 19.76704, + "grad_norm": 1.0428272485733032, + "learning_rate": 1.91296518607443e-05, + "loss": 0.4986, + "step": 15443 + }, + { + "epoch": 19.76832, + "grad_norm": 1.0842911005020142, + "learning_rate": 1.9127651060424172e-05, + "loss": 0.5026, + "step": 15444 + }, + { + "epoch": 19.7696, + 
"grad_norm": 1.1030406951904297, + "learning_rate": 1.912565026010404e-05, + "loss": 0.4867, + "step": 15445 + }, + { + "epoch": 19.77088, + "grad_norm": 1.0993808507919312, + "learning_rate": 1.9123649459783916e-05, + "loss": 0.4984, + "step": 15446 + }, + { + "epoch": 19.77216, + "grad_norm": 1.1211193799972534, + "learning_rate": 1.9121648659463787e-05, + "loss": 0.503, + "step": 15447 + }, + { + "epoch": 19.77344, + "grad_norm": 1.1349544525146484, + "learning_rate": 1.911964785914366e-05, + "loss": 0.5267, + "step": 15448 + }, + { + "epoch": 19.77472, + "grad_norm": 1.1778333187103271, + "learning_rate": 1.9117647058823528e-05, + "loss": 0.5236, + "step": 15449 + }, + { + "epoch": 19.776, + "grad_norm": 1.1410446166992188, + "learning_rate": 1.9115646258503403e-05, + "loss": 0.4975, + "step": 15450 + }, + { + "epoch": 19.77728, + "grad_norm": 1.0365501642227173, + "learning_rate": 1.9113645458183275e-05, + "loss": 0.4599, + "step": 15451 + }, + { + "epoch": 19.77856, + "grad_norm": 1.1023868322372437, + "learning_rate": 1.9111644657863147e-05, + "loss": 0.5273, + "step": 15452 + }, + { + "epoch": 19.77984, + "grad_norm": 1.103507399559021, + "learning_rate": 1.910964385754302e-05, + "loss": 0.4749, + "step": 15453 + }, + { + "epoch": 19.78112, + "grad_norm": 1.0735512971878052, + "learning_rate": 1.910764305722289e-05, + "loss": 0.4875, + "step": 15454 + }, + { + "epoch": 19.7824, + "grad_norm": 1.0677052736282349, + "learning_rate": 1.9105642256902762e-05, + "loss": 0.5173, + "step": 15455 + }, + { + "epoch": 19.78368, + "grad_norm": 1.0987626314163208, + "learning_rate": 1.9103641456582634e-05, + "loss": 0.5271, + "step": 15456 + }, + { + "epoch": 19.78496, + "grad_norm": 1.0896143913269043, + "learning_rate": 1.9101640656262506e-05, + "loss": 0.533, + "step": 15457 + }, + { + "epoch": 19.78624, + "grad_norm": 1.1035916805267334, + "learning_rate": 1.9099639855942378e-05, + "loss": 0.5359, + "step": 15458 + }, + { + "epoch": 19.78752, + "grad_norm": 
1.1171396970748901, + "learning_rate": 1.909763905562225e-05, + "loss": 0.4939, + "step": 15459 + }, + { + "epoch": 19.7888, + "grad_norm": 1.0793712139129639, + "learning_rate": 1.909563825530212e-05, + "loss": 0.5262, + "step": 15460 + }, + { + "epoch": 19.79008, + "grad_norm": 1.0609002113342285, + "learning_rate": 1.9093637454981993e-05, + "loss": 0.4997, + "step": 15461 + }, + { + "epoch": 19.79136, + "grad_norm": 1.010659098625183, + "learning_rate": 1.9091636654661865e-05, + "loss": 0.4642, + "step": 15462 + }, + { + "epoch": 19.79264, + "grad_norm": 1.043384075164795, + "learning_rate": 1.9089635854341737e-05, + "loss": 0.5313, + "step": 15463 + }, + { + "epoch": 19.79392, + "grad_norm": 1.0543464422225952, + "learning_rate": 1.908763505402161e-05, + "loss": 0.4636, + "step": 15464 + }, + { + "epoch": 19.7952, + "grad_norm": 1.094262957572937, + "learning_rate": 1.908563425370148e-05, + "loss": 0.5249, + "step": 15465 + }, + { + "epoch": 19.79648, + "grad_norm": 1.1226550340652466, + "learning_rate": 1.9083633453381353e-05, + "loss": 0.55, + "step": 15466 + }, + { + "epoch": 19.79776, + "grad_norm": 1.1305582523345947, + "learning_rate": 1.9081632653061228e-05, + "loss": 0.5418, + "step": 15467 + }, + { + "epoch": 19.79904, + "grad_norm": 1.1155486106872559, + "learning_rate": 1.9079631852741096e-05, + "loss": 0.5226, + "step": 15468 + }, + { + "epoch": 19.80032, + "grad_norm": 1.1939311027526855, + "learning_rate": 1.9077631052420968e-05, + "loss": 0.5122, + "step": 15469 + }, + { + "epoch": 19.8016, + "grad_norm": 1.0629066228866577, + "learning_rate": 1.907563025210084e-05, + "loss": 0.4641, + "step": 15470 + }, + { + "epoch": 19.802880000000002, + "grad_norm": 1.05988609790802, + "learning_rate": 1.9073629451780715e-05, + "loss": 0.5368, + "step": 15471 + }, + { + "epoch": 19.80416, + "grad_norm": 1.1533082723617554, + "learning_rate": 1.9071628651460584e-05, + "loss": 0.578, + "step": 15472 + }, + { + "epoch": 19.80544, + "grad_norm": 
1.1095813512802124, + "learning_rate": 1.9069627851140456e-05, + "loss": 0.4966, + "step": 15473 + }, + { + "epoch": 19.80672, + "grad_norm": 1.067834734916687, + "learning_rate": 1.906762705082033e-05, + "loss": 0.5486, + "step": 15474 + }, + { + "epoch": 19.808, + "grad_norm": 1.0650924444198608, + "learning_rate": 1.9065626250500203e-05, + "loss": 0.5034, + "step": 15475 + }, + { + "epoch": 19.80928, + "grad_norm": 1.0607784986495972, + "learning_rate": 1.906362545018007e-05, + "loss": 0.5092, + "step": 15476 + }, + { + "epoch": 19.81056, + "grad_norm": 1.0503934621810913, + "learning_rate": 1.9061624649859943e-05, + "loss": 0.4871, + "step": 15477 + }, + { + "epoch": 19.81184, + "grad_norm": 1.1574764251708984, + "learning_rate": 1.905962384953982e-05, + "loss": 0.5786, + "step": 15478 + }, + { + "epoch": 19.81312, + "grad_norm": 1.142246127128601, + "learning_rate": 1.905762304921969e-05, + "loss": 0.4866, + "step": 15479 + }, + { + "epoch": 19.8144, + "grad_norm": 1.0628846883773804, + "learning_rate": 1.905562224889956e-05, + "loss": 0.4879, + "step": 15480 + }, + { + "epoch": 19.81568, + "grad_norm": 1.1084834337234497, + "learning_rate": 1.9053621448579434e-05, + "loss": 0.5393, + "step": 15481 + }, + { + "epoch": 19.81696, + "grad_norm": 1.2051427364349365, + "learning_rate": 1.9051620648259306e-05, + "loss": 0.5179, + "step": 15482 + }, + { + "epoch": 19.81824, + "grad_norm": 1.0791866779327393, + "learning_rate": 1.9049619847939178e-05, + "loss": 0.463, + "step": 15483 + }, + { + "epoch": 19.81952, + "grad_norm": 1.0924525260925293, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.4755, + "step": 15484 + }, + { + "epoch": 19.8208, + "grad_norm": 1.0845119953155518, + "learning_rate": 1.904561824729892e-05, + "loss": 0.5183, + "step": 15485 + }, + { + "epoch": 19.82208, + "grad_norm": 1.0582520961761475, + "learning_rate": 1.9043617446978793e-05, + "loss": 0.4608, + "step": 15486 + }, + { + "epoch": 19.82336, + "grad_norm": 1.0958303213119507, + 
"learning_rate": 1.9041616646658665e-05, + "loss": 0.4605, + "step": 15487 + }, + { + "epoch": 19.82464, + "grad_norm": 1.0015331506729126, + "learning_rate": 1.9039615846338537e-05, + "loss": 0.4579, + "step": 15488 + }, + { + "epoch": 19.82592, + "grad_norm": 1.1324008703231812, + "learning_rate": 1.903761504601841e-05, + "loss": 0.5423, + "step": 15489 + }, + { + "epoch": 19.8272, + "grad_norm": 1.0755281448364258, + "learning_rate": 1.903561424569828e-05, + "loss": 0.4549, + "step": 15490 + }, + { + "epoch": 19.82848, + "grad_norm": 1.1091095209121704, + "learning_rate": 1.9033613445378152e-05, + "loss": 0.5028, + "step": 15491 + }, + { + "epoch": 19.82976, + "grad_norm": 1.0952885150909424, + "learning_rate": 1.9031612645058024e-05, + "loss": 0.479, + "step": 15492 + }, + { + "epoch": 19.83104, + "grad_norm": 1.0651510953903198, + "learning_rate": 1.9029611844737896e-05, + "loss": 0.5052, + "step": 15493 + }, + { + "epoch": 19.83232, + "grad_norm": 1.1032085418701172, + "learning_rate": 1.9027611044417768e-05, + "loss": 0.5447, + "step": 15494 + }, + { + "epoch": 19.8336, + "grad_norm": 1.0811024904251099, + "learning_rate": 1.902561024409764e-05, + "loss": 0.4788, + "step": 15495 + }, + { + "epoch": 19.83488, + "grad_norm": 1.0974313020706177, + "learning_rate": 1.9023609443777512e-05, + "loss": 0.5162, + "step": 15496 + }, + { + "epoch": 19.83616, + "grad_norm": 1.0592910051345825, + "learning_rate": 1.9021608643457384e-05, + "loss": 0.4786, + "step": 15497 + }, + { + "epoch": 19.83744, + "grad_norm": 1.1197059154510498, + "learning_rate": 1.9019607843137255e-05, + "loss": 0.5087, + "step": 15498 + }, + { + "epoch": 19.83872, + "grad_norm": 1.0962718725204468, + "learning_rate": 1.9017607042817127e-05, + "loss": 0.5393, + "step": 15499 + }, + { + "epoch": 19.84, + "grad_norm": 1.1129200458526611, + "learning_rate": 1.9015606242497e-05, + "loss": 0.5287, + "step": 15500 + }, + { + "epoch": 19.84128, + "grad_norm": 1.049560785293579, + "learning_rate": 
1.901360544217687e-05, + "loss": 0.4888, + "step": 15501 + }, + { + "epoch": 19.84256, + "grad_norm": 1.0872515439987183, + "learning_rate": 1.9011604641856743e-05, + "loss": 0.4652, + "step": 15502 + }, + { + "epoch": 19.84384, + "grad_norm": 1.112954020500183, + "learning_rate": 1.9009603841536615e-05, + "loss": 0.5514, + "step": 15503 + }, + { + "epoch": 19.84512, + "grad_norm": 1.0758328437805176, + "learning_rate": 1.9007603041216487e-05, + "loss": 0.4836, + "step": 15504 + }, + { + "epoch": 19.8464, + "grad_norm": 1.0734128952026367, + "learning_rate": 1.900560224089636e-05, + "loss": 0.5046, + "step": 15505 + }, + { + "epoch": 19.84768, + "grad_norm": 1.1305335760116577, + "learning_rate": 1.9003601440576234e-05, + "loss": 0.5106, + "step": 15506 + }, + { + "epoch": 19.84896, + "grad_norm": 1.1036813259124756, + "learning_rate": 1.9001600640256102e-05, + "loss": 0.5227, + "step": 15507 + }, + { + "epoch": 19.85024, + "grad_norm": 1.1305910348892212, + "learning_rate": 1.8999599839935974e-05, + "loss": 0.5204, + "step": 15508 + }, + { + "epoch": 19.85152, + "grad_norm": 1.0744602680206299, + "learning_rate": 1.8997599039615846e-05, + "loss": 0.4987, + "step": 15509 + }, + { + "epoch": 19.8528, + "grad_norm": 1.0705610513687134, + "learning_rate": 1.899559823929572e-05, + "loss": 0.5388, + "step": 15510 + }, + { + "epoch": 19.85408, + "grad_norm": 1.0107085704803467, + "learning_rate": 1.899359743897559e-05, + "loss": 0.4689, + "step": 15511 + }, + { + "epoch": 19.85536, + "grad_norm": 1.0287785530090332, + "learning_rate": 1.899159663865546e-05, + "loss": 0.4932, + "step": 15512 + }, + { + "epoch": 19.85664, + "grad_norm": 1.0263909101486206, + "learning_rate": 1.8989595838335337e-05, + "loss": 0.4614, + "step": 15513 + }, + { + "epoch": 19.85792, + "grad_norm": 1.0720847845077515, + "learning_rate": 1.898759503801521e-05, + "loss": 0.5024, + "step": 15514 + }, + { + "epoch": 19.8592, + "grad_norm": 1.1379327774047852, + "learning_rate": 
1.8985594237695077e-05, + "loss": 0.4958, + "step": 15515 + }, + { + "epoch": 19.86048, + "grad_norm": 1.0998201370239258, + "learning_rate": 1.898359343737495e-05, + "loss": 0.5257, + "step": 15516 + }, + { + "epoch": 19.86176, + "grad_norm": 1.068842887878418, + "learning_rate": 1.8981592637054824e-05, + "loss": 0.4743, + "step": 15517 + }, + { + "epoch": 19.86304, + "grad_norm": 1.0820146799087524, + "learning_rate": 1.8979591836734696e-05, + "loss": 0.5448, + "step": 15518 + }, + { + "epoch": 19.86432, + "grad_norm": 1.0579636096954346, + "learning_rate": 1.8977591036414564e-05, + "loss": 0.5176, + "step": 15519 + }, + { + "epoch": 19.8656, + "grad_norm": 1.0649138689041138, + "learning_rate": 1.897559023609444e-05, + "loss": 0.4429, + "step": 15520 + }, + { + "epoch": 19.86688, + "grad_norm": 1.0768613815307617, + "learning_rate": 1.897358943577431e-05, + "loss": 0.4847, + "step": 15521 + }, + { + "epoch": 19.86816, + "grad_norm": 1.0212260484695435, + "learning_rate": 1.8971588635454183e-05, + "loss": 0.4501, + "step": 15522 + }, + { + "epoch": 19.86944, + "grad_norm": 1.0247191190719604, + "learning_rate": 1.8969587835134052e-05, + "loss": 0.4715, + "step": 15523 + }, + { + "epoch": 19.87072, + "grad_norm": 1.0731992721557617, + "learning_rate": 1.8967587034813927e-05, + "loss": 0.4961, + "step": 15524 + }, + { + "epoch": 19.872, + "grad_norm": 1.0773158073425293, + "learning_rate": 1.89655862344938e-05, + "loss": 0.4927, + "step": 15525 + }, + { + "epoch": 19.87328, + "grad_norm": 1.0747638940811157, + "learning_rate": 1.896358543417367e-05, + "loss": 0.5224, + "step": 15526 + }, + { + "epoch": 19.87456, + "grad_norm": 1.0612146854400635, + "learning_rate": 1.8961584633853543e-05, + "loss": 0.5373, + "step": 15527 + }, + { + "epoch": 19.87584, + "grad_norm": 1.1125435829162598, + "learning_rate": 1.8959583833533415e-05, + "loss": 0.5768, + "step": 15528 + }, + { + "epoch": 19.87712, + "grad_norm": 1.0910027027130127, + "learning_rate": 
1.8957583033213286e-05, + "loss": 0.5322, + "step": 15529 + }, + { + "epoch": 19.8784, + "grad_norm": 1.1110457181930542, + "learning_rate": 1.8955582232893158e-05, + "loss": 0.5189, + "step": 15530 + }, + { + "epoch": 19.87968, + "grad_norm": 1.0447136163711548, + "learning_rate": 1.895358143257303e-05, + "loss": 0.4795, + "step": 15531 + }, + { + "epoch": 19.88096, + "grad_norm": 1.0677382946014404, + "learning_rate": 1.8951580632252902e-05, + "loss": 0.4929, + "step": 15532 + }, + { + "epoch": 19.88224, + "grad_norm": 1.038959264755249, + "learning_rate": 1.8949579831932774e-05, + "loss": 0.4779, + "step": 15533 + }, + { + "epoch": 19.88352, + "grad_norm": 1.0871193408966064, + "learning_rate": 1.8947579031612646e-05, + "loss": 0.48, + "step": 15534 + }, + { + "epoch": 19.8848, + "grad_norm": 1.1005915403366089, + "learning_rate": 1.8945578231292518e-05, + "loss": 0.5181, + "step": 15535 + }, + { + "epoch": 19.88608, + "grad_norm": 1.0428812503814697, + "learning_rate": 1.894357743097239e-05, + "loss": 0.4534, + "step": 15536 + }, + { + "epoch": 19.88736, + "grad_norm": 1.0860538482666016, + "learning_rate": 1.894157663065226e-05, + "loss": 0.5012, + "step": 15537 + }, + { + "epoch": 19.88864, + "grad_norm": 1.0798299312591553, + "learning_rate": 1.8939575830332133e-05, + "loss": 0.5091, + "step": 15538 + }, + { + "epoch": 19.88992, + "grad_norm": 1.0804452896118164, + "learning_rate": 1.8937575030012005e-05, + "loss": 0.4978, + "step": 15539 + }, + { + "epoch": 19.8912, + "grad_norm": 1.069659948348999, + "learning_rate": 1.8935574229691877e-05, + "loss": 0.4902, + "step": 15540 + }, + { + "epoch": 19.89248, + "grad_norm": 1.1372871398925781, + "learning_rate": 1.8933573429371752e-05, + "loss": 0.5438, + "step": 15541 + }, + { + "epoch": 19.89376, + "grad_norm": 1.079403281211853, + "learning_rate": 1.893157262905162e-05, + "loss": 0.4882, + "step": 15542 + }, + { + "epoch": 19.89504, + "grad_norm": 1.0226234197616577, + "learning_rate": 1.8929571828731492e-05, 
+ "loss": 0.4323, + "step": 15543 + }, + { + "epoch": 19.89632, + "grad_norm": 1.0797722339630127, + "learning_rate": 1.8927571028411364e-05, + "loss": 0.507, + "step": 15544 + }, + { + "epoch": 19.8976, + "grad_norm": 1.0766708850860596, + "learning_rate": 1.892557022809124e-05, + "loss": 0.5058, + "step": 15545 + }, + { + "epoch": 19.89888, + "grad_norm": 1.0884517431259155, + "learning_rate": 1.8923569427771108e-05, + "loss": 0.5136, + "step": 15546 + }, + { + "epoch": 19.90016, + "grad_norm": 1.1171706914901733, + "learning_rate": 1.892156862745098e-05, + "loss": 0.5082, + "step": 15547 + }, + { + "epoch": 19.90144, + "grad_norm": 1.065116286277771, + "learning_rate": 1.8919567827130855e-05, + "loss": 0.5113, + "step": 15548 + }, + { + "epoch": 19.90272, + "grad_norm": 1.0674129724502563, + "learning_rate": 1.8917567026810727e-05, + "loss": 0.4844, + "step": 15549 + }, + { + "epoch": 19.904, + "grad_norm": 1.123174786567688, + "learning_rate": 1.8915566226490595e-05, + "loss": 0.527, + "step": 15550 + }, + { + "epoch": 19.90528, + "grad_norm": 1.113181710243225, + "learning_rate": 1.8913565426170467e-05, + "loss": 0.5275, + "step": 15551 + }, + { + "epoch": 19.90656, + "grad_norm": 1.080833077430725, + "learning_rate": 1.8911564625850343e-05, + "loss": 0.5218, + "step": 15552 + }, + { + "epoch": 19.90784, + "grad_norm": 1.1273305416107178, + "learning_rate": 1.8909563825530214e-05, + "loss": 0.5157, + "step": 15553 + }, + { + "epoch": 19.90912, + "grad_norm": 1.1139534711837769, + "learning_rate": 1.8907563025210083e-05, + "loss": 0.5408, + "step": 15554 + }, + { + "epoch": 19.9104, + "grad_norm": 1.0617810487747192, + "learning_rate": 1.8905562224889955e-05, + "loss": 0.4972, + "step": 15555 + }, + { + "epoch": 19.91168, + "grad_norm": 1.0944743156433105, + "learning_rate": 1.890356142456983e-05, + "loss": 0.5016, + "step": 15556 + }, + { + "epoch": 19.912959999999998, + "grad_norm": 1.0922056436538696, + "learning_rate": 1.8901560624249702e-05, + "loss": 
0.5228, + "step": 15557 + }, + { + "epoch": 19.91424, + "grad_norm": 1.060390830039978, + "learning_rate": 1.889955982392957e-05, + "loss": 0.5006, + "step": 15558 + }, + { + "epoch": 19.91552, + "grad_norm": 1.0017435550689697, + "learning_rate": 1.8897559023609446e-05, + "loss": 0.4723, + "step": 15559 + }, + { + "epoch": 19.9168, + "grad_norm": 1.0966525077819824, + "learning_rate": 1.8895558223289317e-05, + "loss": 0.5312, + "step": 15560 + }, + { + "epoch": 19.91808, + "grad_norm": 1.037953495979309, + "learning_rate": 1.889355742296919e-05, + "loss": 0.5364, + "step": 15561 + }, + { + "epoch": 19.91936, + "grad_norm": 1.0990053415298462, + "learning_rate": 1.8891556622649058e-05, + "loss": 0.515, + "step": 15562 + }, + { + "epoch": 19.92064, + "grad_norm": 1.1318585872650146, + "learning_rate": 1.8889555822328933e-05, + "loss": 0.5202, + "step": 15563 + }, + { + "epoch": 19.92192, + "grad_norm": 1.0594831705093384, + "learning_rate": 1.8887555022008805e-05, + "loss": 0.5013, + "step": 15564 + }, + { + "epoch": 19.9232, + "grad_norm": 1.0701175928115845, + "learning_rate": 1.8885554221688677e-05, + "loss": 0.4641, + "step": 15565 + }, + { + "epoch": 19.92448, + "grad_norm": 1.095017433166504, + "learning_rate": 1.888355342136855e-05, + "loss": 0.5022, + "step": 15566 + }, + { + "epoch": 19.92576, + "grad_norm": 1.1008120775222778, + "learning_rate": 1.888155262104842e-05, + "loss": 0.5144, + "step": 15567 + }, + { + "epoch": 19.92704, + "grad_norm": 1.0026847124099731, + "learning_rate": 1.8879551820728292e-05, + "loss": 0.4596, + "step": 15568 + }, + { + "epoch": 19.92832, + "grad_norm": 1.0756937265396118, + "learning_rate": 1.8877551020408164e-05, + "loss": 0.4879, + "step": 15569 + }, + { + "epoch": 19.9296, + "grad_norm": 1.0815765857696533, + "learning_rate": 1.8875550220088036e-05, + "loss": 0.533, + "step": 15570 + }, + { + "epoch": 19.93088, + "grad_norm": 1.0457710027694702, + "learning_rate": 1.8873549419767908e-05, + "loss": 0.5204, + "step": 15571 
+ }, + { + "epoch": 19.93216, + "grad_norm": 1.008851408958435, + "learning_rate": 1.887154861944778e-05, + "loss": 0.5, + "step": 15572 + }, + { + "epoch": 19.93344, + "grad_norm": 1.0376039743423462, + "learning_rate": 1.886954781912765e-05, + "loss": 0.5194, + "step": 15573 + }, + { + "epoch": 19.93472, + "grad_norm": 1.0573559999465942, + "learning_rate": 1.8867547018807523e-05, + "loss": 0.4944, + "step": 15574 + }, + { + "epoch": 19.936, + "grad_norm": 1.035343885421753, + "learning_rate": 1.8865546218487395e-05, + "loss": 0.4731, + "step": 15575 + }, + { + "epoch": 19.93728, + "grad_norm": 1.033947229385376, + "learning_rate": 1.8863545418167267e-05, + "loss": 0.4792, + "step": 15576 + }, + { + "epoch": 19.93856, + "grad_norm": 1.0770082473754883, + "learning_rate": 1.886154461784714e-05, + "loss": 0.5264, + "step": 15577 + }, + { + "epoch": 19.93984, + "grad_norm": 1.0262564420700073, + "learning_rate": 1.885954381752701e-05, + "loss": 0.5304, + "step": 15578 + }, + { + "epoch": 19.94112, + "grad_norm": 1.0958056449890137, + "learning_rate": 1.8857543017206883e-05, + "loss": 0.5294, + "step": 15579 + }, + { + "epoch": 19.9424, + "grad_norm": 1.1022257804870605, + "learning_rate": 1.8855542216886758e-05, + "loss": 0.4546, + "step": 15580 + }, + { + "epoch": 19.94368, + "grad_norm": 1.0951906442642212, + "learning_rate": 1.8853541416566626e-05, + "loss": 0.5054, + "step": 15581 + }, + { + "epoch": 19.944960000000002, + "grad_norm": 1.0561892986297607, + "learning_rate": 1.8851540616246498e-05, + "loss": 0.4944, + "step": 15582 + }, + { + "epoch": 19.94624, + "grad_norm": 1.084155559539795, + "learning_rate": 1.884953981592637e-05, + "loss": 0.5098, + "step": 15583 + }, + { + "epoch": 19.94752, + "grad_norm": 1.0809601545333862, + "learning_rate": 1.8847539015606245e-05, + "loss": 0.5657, + "step": 15584 + }, + { + "epoch": 19.9488, + "grad_norm": 1.118355393409729, + "learning_rate": 1.8845538215286114e-05, + "loss": 0.5551, + "step": 15585 + }, + { + 
"epoch": 19.95008, + "grad_norm": 1.0390595197677612, + "learning_rate": 1.8843537414965986e-05, + "loss": 0.476, + "step": 15586 + }, + { + "epoch": 19.95136, + "grad_norm": 1.0548921823501587, + "learning_rate": 1.884153661464586e-05, + "loss": 0.5211, + "step": 15587 + }, + { + "epoch": 19.95264, + "grad_norm": 1.0728892087936401, + "learning_rate": 1.8839535814325733e-05, + "loss": 0.4795, + "step": 15588 + }, + { + "epoch": 19.95392, + "grad_norm": 1.0571420192718506, + "learning_rate": 1.88375350140056e-05, + "loss": 0.5225, + "step": 15589 + }, + { + "epoch": 19.9552, + "grad_norm": 1.0640194416046143, + "learning_rate": 1.8835534213685473e-05, + "loss": 0.4759, + "step": 15590 + }, + { + "epoch": 19.95648, + "grad_norm": 1.0471768379211426, + "learning_rate": 1.883353341336535e-05, + "loss": 0.4839, + "step": 15591 + }, + { + "epoch": 19.95776, + "grad_norm": 1.0839452743530273, + "learning_rate": 1.883153261304522e-05, + "loss": 0.522, + "step": 15592 + }, + { + "epoch": 19.95904, + "grad_norm": 1.0934643745422363, + "learning_rate": 1.882953181272509e-05, + "loss": 0.5176, + "step": 15593 + }, + { + "epoch": 19.96032, + "grad_norm": 1.0967458486557007, + "learning_rate": 1.8827531012404964e-05, + "loss": 0.5198, + "step": 15594 + }, + { + "epoch": 19.9616, + "grad_norm": 1.0761650800704956, + "learning_rate": 1.8825530212084836e-05, + "loss": 0.5091, + "step": 15595 + }, + { + "epoch": 19.96288, + "grad_norm": 1.0557310581207275, + "learning_rate": 1.8823529411764708e-05, + "loss": 0.496, + "step": 15596 + }, + { + "epoch": 19.96416, + "grad_norm": 1.0711382627487183, + "learning_rate": 1.8821528611444576e-05, + "loss": 0.4703, + "step": 15597 + }, + { + "epoch": 19.96544, + "grad_norm": 1.1365162134170532, + "learning_rate": 1.881952781112445e-05, + "loss": 0.4936, + "step": 15598 + }, + { + "epoch": 19.96672, + "grad_norm": 1.141715168952942, + "learning_rate": 1.8817527010804323e-05, + "loss": 0.4871, + "step": 15599 + }, + { + "epoch": 19.968, + 
"grad_norm": 1.146036148071289, + "learning_rate": 1.8815526210484195e-05, + "loss": 0.5403, + "step": 15600 + }, + { + "epoch": 19.96928, + "grad_norm": 1.123439073562622, + "learning_rate": 1.8813525410164067e-05, + "loss": 0.5526, + "step": 15601 + }, + { + "epoch": 19.97056, + "grad_norm": 1.0493305921554565, + "learning_rate": 1.881152460984394e-05, + "loss": 0.4881, + "step": 15602 + }, + { + "epoch": 19.97184, + "grad_norm": 1.0788578987121582, + "learning_rate": 1.880952380952381e-05, + "loss": 0.481, + "step": 15603 + }, + { + "epoch": 19.97312, + "grad_norm": 1.089673399925232, + "learning_rate": 1.8807523009203682e-05, + "loss": 0.5252, + "step": 15604 + }, + { + "epoch": 19.9744, + "grad_norm": 1.0892081260681152, + "learning_rate": 1.8805522208883554e-05, + "loss": 0.4939, + "step": 15605 + }, + { + "epoch": 19.97568, + "grad_norm": 1.0614848136901855, + "learning_rate": 1.8803521408563426e-05, + "loss": 0.5036, + "step": 15606 + }, + { + "epoch": 19.97696, + "grad_norm": 1.028151512145996, + "learning_rate": 1.8801520608243298e-05, + "loss": 0.4862, + "step": 15607 + }, + { + "epoch": 19.97824, + "grad_norm": 1.0651766061782837, + "learning_rate": 1.879951980792317e-05, + "loss": 0.4842, + "step": 15608 + }, + { + "epoch": 19.97952, + "grad_norm": 1.0863500833511353, + "learning_rate": 1.8797519007603042e-05, + "loss": 0.4905, + "step": 15609 + }, + { + "epoch": 19.9808, + "grad_norm": 1.0720291137695312, + "learning_rate": 1.8795518207282914e-05, + "loss": 0.4991, + "step": 15610 + }, + { + "epoch": 19.98208, + "grad_norm": 1.0834654569625854, + "learning_rate": 1.8793517406962785e-05, + "loss": 0.5061, + "step": 15611 + }, + { + "epoch": 19.98336, + "grad_norm": 1.0607835054397583, + "learning_rate": 1.8791516606642657e-05, + "loss": 0.4861, + "step": 15612 + }, + { + "epoch": 19.98464, + "grad_norm": 1.1219713687896729, + "learning_rate": 1.878951580632253e-05, + "loss": 0.5357, + "step": 15613 + }, + { + "epoch": 19.98592, + "grad_norm": 
1.078185796737671, + "learning_rate": 1.87875150060024e-05, + "loss": 0.5251, + "step": 15614 + }, + { + "epoch": 19.9872, + "grad_norm": 1.1310948133468628, + "learning_rate": 1.8785514205682273e-05, + "loss": 0.5558, + "step": 15615 + }, + { + "epoch": 19.98848, + "grad_norm": 1.0436378717422485, + "learning_rate": 1.8783513405362145e-05, + "loss": 0.4855, + "step": 15616 + }, + { + "epoch": 19.98976, + "grad_norm": 1.074711799621582, + "learning_rate": 1.8781512605042017e-05, + "loss": 0.4971, + "step": 15617 + }, + { + "epoch": 19.99104, + "grad_norm": 1.123464584350586, + "learning_rate": 1.877951180472189e-05, + "loss": 0.5288, + "step": 15618 + }, + { + "epoch": 19.99232, + "grad_norm": 1.015176773071289, + "learning_rate": 1.8777511004401764e-05, + "loss": 0.5137, + "step": 15619 + }, + { + "epoch": 19.9936, + "grad_norm": 1.0888524055480957, + "learning_rate": 1.8775510204081632e-05, + "loss": 0.4858, + "step": 15620 + }, + { + "epoch": 19.99488, + "grad_norm": 1.0319979190826416, + "learning_rate": 1.8773509403761504e-05, + "loss": 0.4687, + "step": 15621 + }, + { + "epoch": 19.99616, + "grad_norm": 1.116672396659851, + "learning_rate": 1.8771508603441376e-05, + "loss": 0.5169, + "step": 15622 + }, + { + "epoch": 19.99744, + "grad_norm": 1.0477895736694336, + "learning_rate": 1.876950780312125e-05, + "loss": 0.5392, + "step": 15623 + }, + { + "epoch": 19.99872, + "grad_norm": 1.069499135017395, + "learning_rate": 1.876750700280112e-05, + "loss": 0.4781, + "step": 15624 + }, + { + "epoch": 20.0, + "grad_norm": 2.609760046005249, + "learning_rate": 1.876550620248099e-05, + "loss": 1.014, + "step": 15625 + }, + { + "epoch": 20.00128, + "grad_norm": 1.0402753353118896, + "learning_rate": 1.8763505402160867e-05, + "loss": 0.4348, + "step": 15626 + }, + { + "epoch": 20.00256, + "grad_norm": 1.0104725360870361, + "learning_rate": 1.876150460184074e-05, + "loss": 0.4641, + "step": 15627 + }, + { + "epoch": 20.00384, + "grad_norm": 1.1191972494125366, + 
"learning_rate": 1.8759503801520607e-05, + "loss": 0.5, + "step": 15628 + }, + { + "epoch": 20.00512, + "grad_norm": 1.0969229936599731, + "learning_rate": 1.875750300120048e-05, + "loss": 0.5265, + "step": 15629 + }, + { + "epoch": 20.0064, + "grad_norm": 1.0212663412094116, + "learning_rate": 1.8755502200880354e-05, + "loss": 0.4381, + "step": 15630 + }, + { + "epoch": 20.00768, + "grad_norm": 1.1064248085021973, + "learning_rate": 1.8753501400560226e-05, + "loss": 0.5045, + "step": 15631 + }, + { + "epoch": 20.00896, + "grad_norm": 1.134947419166565, + "learning_rate": 1.8751500600240094e-05, + "loss": 0.5008, + "step": 15632 + }, + { + "epoch": 20.01024, + "grad_norm": 1.0822664499282837, + "learning_rate": 1.874949979991997e-05, + "loss": 0.4941, + "step": 15633 + }, + { + "epoch": 20.01152, + "grad_norm": 1.0499417781829834, + "learning_rate": 1.874749899959984e-05, + "loss": 0.4508, + "step": 15634 + }, + { + "epoch": 20.0128, + "grad_norm": 1.050943374633789, + "learning_rate": 1.8745498199279713e-05, + "loss": 0.5099, + "step": 15635 + }, + { + "epoch": 20.01408, + "grad_norm": 1.016408085823059, + "learning_rate": 1.8743497398959582e-05, + "loss": 0.4515, + "step": 15636 + }, + { + "epoch": 20.01536, + "grad_norm": 1.0423884391784668, + "learning_rate": 1.8741496598639457e-05, + "loss": 0.5248, + "step": 15637 + }, + { + "epoch": 20.01664, + "grad_norm": 0.9838659763336182, + "learning_rate": 1.873949579831933e-05, + "loss": 0.4533, + "step": 15638 + }, + { + "epoch": 20.01792, + "grad_norm": 1.0517929792404175, + "learning_rate": 1.87374949979992e-05, + "loss": 0.4678, + "step": 15639 + }, + { + "epoch": 20.0192, + "grad_norm": 1.0576074123382568, + "learning_rate": 1.8735494197679073e-05, + "loss": 0.471, + "step": 15640 + }, + { + "epoch": 20.02048, + "grad_norm": 1.119922161102295, + "learning_rate": 1.8733493397358945e-05, + "loss": 0.5088, + "step": 15641 + }, + { + "epoch": 20.02176, + "grad_norm": 1.0552582740783691, + "learning_rate": 
1.8731492597038816e-05, + "loss": 0.477, + "step": 15642 + }, + { + "epoch": 20.02304, + "grad_norm": 1.0699156522750854, + "learning_rate": 1.8729491796718688e-05, + "loss": 0.5483, + "step": 15643 + }, + { + "epoch": 20.02432, + "grad_norm": 1.020215630531311, + "learning_rate": 1.872749099639856e-05, + "loss": 0.4916, + "step": 15644 + }, + { + "epoch": 20.0256, + "grad_norm": 1.036571741104126, + "learning_rate": 1.8725490196078432e-05, + "loss": 0.4523, + "step": 15645 + }, + { + "epoch": 20.02688, + "grad_norm": 1.095461130142212, + "learning_rate": 1.8723489395758304e-05, + "loss": 0.4832, + "step": 15646 + }, + { + "epoch": 20.02816, + "grad_norm": 1.053385853767395, + "learning_rate": 1.872148859543818e-05, + "loss": 0.4627, + "step": 15647 + }, + { + "epoch": 20.02944, + "grad_norm": 1.0644296407699585, + "learning_rate": 1.8719487795118048e-05, + "loss": 0.4749, + "step": 15648 + }, + { + "epoch": 20.03072, + "grad_norm": 1.083106517791748, + "learning_rate": 1.871748699479792e-05, + "loss": 0.4926, + "step": 15649 + }, + { + "epoch": 20.032, + "grad_norm": 1.06119704246521, + "learning_rate": 1.871548619447779e-05, + "loss": 0.4924, + "step": 15650 + }, + { + "epoch": 20.03328, + "grad_norm": 1.0661925077438354, + "learning_rate": 1.8713485394157667e-05, + "loss": 0.4581, + "step": 15651 + }, + { + "epoch": 20.03456, + "grad_norm": 1.1228007078170776, + "learning_rate": 1.8711484593837535e-05, + "loss": 0.5334, + "step": 15652 + }, + { + "epoch": 20.03584, + "grad_norm": 1.0884519815444946, + "learning_rate": 1.8709483793517407e-05, + "loss": 0.4621, + "step": 15653 + }, + { + "epoch": 20.03712, + "grad_norm": 1.0557575225830078, + "learning_rate": 1.8707482993197282e-05, + "loss": 0.4675, + "step": 15654 + }, + { + "epoch": 20.0384, + "grad_norm": 1.0507951974868774, + "learning_rate": 1.8705482192877154e-05, + "loss": 0.4968, + "step": 15655 + }, + { + "epoch": 20.03968, + "grad_norm": 1.0624488592147827, + "learning_rate": 1.8703481392557022e-05, + 
"loss": 0.4994, + "step": 15656 + }, + { + "epoch": 20.04096, + "grad_norm": 1.0612828731536865, + "learning_rate": 1.8701480592236894e-05, + "loss": 0.4935, + "step": 15657 + }, + { + "epoch": 20.04224, + "grad_norm": 1.0293620824813843, + "learning_rate": 1.869947979191677e-05, + "loss": 0.4321, + "step": 15658 + }, + { + "epoch": 20.04352, + "grad_norm": 1.074039101600647, + "learning_rate": 1.869747899159664e-05, + "loss": 0.474, + "step": 15659 + }, + { + "epoch": 20.0448, + "grad_norm": 1.0148733854293823, + "learning_rate": 1.869547819127651e-05, + "loss": 0.478, + "step": 15660 + }, + { + "epoch": 20.04608, + "grad_norm": 1.1181195974349976, + "learning_rate": 1.8693477390956382e-05, + "loss": 0.5195, + "step": 15661 + }, + { + "epoch": 20.04736, + "grad_norm": 1.0959631204605103, + "learning_rate": 1.8691476590636257e-05, + "loss": 0.5102, + "step": 15662 + }, + { + "epoch": 20.04864, + "grad_norm": 1.070541262626648, + "learning_rate": 1.868947579031613e-05, + "loss": 0.4831, + "step": 15663 + }, + { + "epoch": 20.04992, + "grad_norm": 1.032403826713562, + "learning_rate": 1.8687474989995997e-05, + "loss": 0.4576, + "step": 15664 + }, + { + "epoch": 20.0512, + "grad_norm": 0.9710387587547302, + "learning_rate": 1.8685474189675873e-05, + "loss": 0.464, + "step": 15665 + }, + { + "epoch": 20.05248, + "grad_norm": 1.1018273830413818, + "learning_rate": 1.8683473389355744e-05, + "loss": 0.4972, + "step": 15666 + }, + { + "epoch": 20.05376, + "grad_norm": 0.9961934089660645, + "learning_rate": 1.8681472589035616e-05, + "loss": 0.4921, + "step": 15667 + }, + { + "epoch": 20.05504, + "grad_norm": 1.1243575811386108, + "learning_rate": 1.8679471788715485e-05, + "loss": 0.4818, + "step": 15668 + }, + { + "epoch": 20.05632, + "grad_norm": 1.05106520652771, + "learning_rate": 1.867747098839536e-05, + "loss": 0.4697, + "step": 15669 + }, + { + "epoch": 20.0576, + "grad_norm": 1.0926669836044312, + "learning_rate": 1.8675470188075232e-05, + "loss": 0.5248, + "step": 
15670 + }, + { + "epoch": 20.05888, + "grad_norm": 1.1060155630111694, + "learning_rate": 1.8673469387755104e-05, + "loss": 0.4986, + "step": 15671 + }, + { + "epoch": 20.06016, + "grad_norm": 1.0984371900558472, + "learning_rate": 1.8671468587434976e-05, + "loss": 0.5017, + "step": 15672 + }, + { + "epoch": 20.06144, + "grad_norm": 1.0190678834915161, + "learning_rate": 1.8669467787114847e-05, + "loss": 0.4497, + "step": 15673 + }, + { + "epoch": 20.06272, + "grad_norm": 1.1226532459259033, + "learning_rate": 1.866746698679472e-05, + "loss": 0.5061, + "step": 15674 + }, + { + "epoch": 20.064, + "grad_norm": 1.016611099243164, + "learning_rate": 1.866546618647459e-05, + "loss": 0.4425, + "step": 15675 + }, + { + "epoch": 20.06528, + "grad_norm": 1.1587402820587158, + "learning_rate": 1.8663465386154463e-05, + "loss": 0.4797, + "step": 15676 + }, + { + "epoch": 20.06656, + "grad_norm": 1.0697815418243408, + "learning_rate": 1.8661464585834335e-05, + "loss": 0.4623, + "step": 15677 + }, + { + "epoch": 20.06784, + "grad_norm": 1.0413057804107666, + "learning_rate": 1.8659463785514207e-05, + "loss": 0.4997, + "step": 15678 + }, + { + "epoch": 20.06912, + "grad_norm": 1.0416499376296997, + "learning_rate": 1.865746298519408e-05, + "loss": 0.4707, + "step": 15679 + }, + { + "epoch": 20.0704, + "grad_norm": 1.034729242324829, + "learning_rate": 1.865546218487395e-05, + "loss": 0.4702, + "step": 15680 + }, + { + "epoch": 20.07168, + "grad_norm": 1.1289206743240356, + "learning_rate": 1.8653461384553822e-05, + "loss": 0.4843, + "step": 15681 + }, + { + "epoch": 20.07296, + "grad_norm": 1.0934054851531982, + "learning_rate": 1.8651460584233694e-05, + "loss": 0.4927, + "step": 15682 + }, + { + "epoch": 20.07424, + "grad_norm": 1.0436177253723145, + "learning_rate": 1.8649459783913566e-05, + "loss": 0.49, + "step": 15683 + }, + { + "epoch": 20.07552, + "grad_norm": 1.0504745244979858, + "learning_rate": 1.8647458983593438e-05, + "loss": 0.4885, + "step": 15684 + }, + { + 
"epoch": 20.0768, + "grad_norm": 1.0551018714904785, + "learning_rate": 1.864545818327331e-05, + "loss": 0.4789, + "step": 15685 + }, + { + "epoch": 20.07808, + "grad_norm": 1.0579137802124023, + "learning_rate": 1.8643457382953185e-05, + "loss": 0.4515, + "step": 15686 + }, + { + "epoch": 20.07936, + "grad_norm": 1.1196202039718628, + "learning_rate": 1.8641456582633053e-05, + "loss": 0.5234, + "step": 15687 + }, + { + "epoch": 20.08064, + "grad_norm": 1.0107567310333252, + "learning_rate": 1.8639455782312925e-05, + "loss": 0.4665, + "step": 15688 + }, + { + "epoch": 20.08192, + "grad_norm": 1.0458314418792725, + "learning_rate": 1.8637454981992797e-05, + "loss": 0.4644, + "step": 15689 + }, + { + "epoch": 20.0832, + "grad_norm": 1.0873018503189087, + "learning_rate": 1.8635454181672672e-05, + "loss": 0.4421, + "step": 15690 + }, + { + "epoch": 20.08448, + "grad_norm": 1.0975297689437866, + "learning_rate": 1.863345338135254e-05, + "loss": 0.5099, + "step": 15691 + }, + { + "epoch": 20.08576, + "grad_norm": 1.0815826654434204, + "learning_rate": 1.8631452581032413e-05, + "loss": 0.4522, + "step": 15692 + }, + { + "epoch": 20.087040000000002, + "grad_norm": 1.0389448404312134, + "learning_rate": 1.8629451780712288e-05, + "loss": 0.4587, + "step": 15693 + }, + { + "epoch": 20.08832, + "grad_norm": 1.0936323404312134, + "learning_rate": 1.862745098039216e-05, + "loss": 0.5339, + "step": 15694 + }, + { + "epoch": 20.0896, + "grad_norm": 1.065900444984436, + "learning_rate": 1.8625450180072028e-05, + "loss": 0.4701, + "step": 15695 + }, + { + "epoch": 20.09088, + "grad_norm": 1.0863001346588135, + "learning_rate": 1.86234493797519e-05, + "loss": 0.5019, + "step": 15696 + }, + { + "epoch": 20.09216, + "grad_norm": 1.1467665433883667, + "learning_rate": 1.8621448579431775e-05, + "loss": 0.5244, + "step": 15697 + }, + { + "epoch": 20.09344, + "grad_norm": 1.0635558366775513, + "learning_rate": 1.8619447779111647e-05, + "loss": 0.4646, + "step": 15698 + }, + { + "epoch": 
20.09472, + "grad_norm": 1.0072330236434937, + "learning_rate": 1.8617446978791516e-05, + "loss": 0.4558, + "step": 15699 + }, + { + "epoch": 20.096, + "grad_norm": 1.0939109325408936, + "learning_rate": 1.861544617847139e-05, + "loss": 0.5301, + "step": 15700 + }, + { + "epoch": 20.09728, + "grad_norm": 1.1008801460266113, + "learning_rate": 1.8613445378151263e-05, + "loss": 0.4859, + "step": 15701 + }, + { + "epoch": 20.09856, + "grad_norm": 1.1168580055236816, + "learning_rate": 1.8611444577831135e-05, + "loss": 0.4658, + "step": 15702 + }, + { + "epoch": 20.09984, + "grad_norm": 1.0580940246582031, + "learning_rate": 1.8609443777511003e-05, + "loss": 0.4434, + "step": 15703 + }, + { + "epoch": 20.10112, + "grad_norm": 1.0258312225341797, + "learning_rate": 1.860744297719088e-05, + "loss": 0.4644, + "step": 15704 + }, + { + "epoch": 20.1024, + "grad_norm": 1.0801990032196045, + "learning_rate": 1.860544217687075e-05, + "loss": 0.4959, + "step": 15705 + }, + { + "epoch": 20.10368, + "grad_norm": 1.0814316272735596, + "learning_rate": 1.8603441376550622e-05, + "loss": 0.4898, + "step": 15706 + }, + { + "epoch": 20.10496, + "grad_norm": 1.116225004196167, + "learning_rate": 1.8601440576230494e-05, + "loss": 0.4865, + "step": 15707 + }, + { + "epoch": 20.10624, + "grad_norm": 1.147642970085144, + "learning_rate": 1.8599439775910366e-05, + "loss": 0.5007, + "step": 15708 + }, + { + "epoch": 20.10752, + "grad_norm": 1.1778041124343872, + "learning_rate": 1.8597438975590238e-05, + "loss": 0.5123, + "step": 15709 + }, + { + "epoch": 20.1088, + "grad_norm": 1.0506484508514404, + "learning_rate": 1.859543817527011e-05, + "loss": 0.4695, + "step": 15710 + }, + { + "epoch": 20.11008, + "grad_norm": 1.0645596981048584, + "learning_rate": 1.859343737494998e-05, + "loss": 0.4784, + "step": 15711 + }, + { + "epoch": 20.11136, + "grad_norm": 1.086012840270996, + "learning_rate": 1.8591436574629853e-05, + "loss": 0.4975, + "step": 15712 + }, + { + "epoch": 20.11264, + 
"grad_norm": 1.0438339710235596, + "learning_rate": 1.8589435774309725e-05, + "loss": 0.4973, + "step": 15713 + }, + { + "epoch": 20.11392, + "grad_norm": 1.1068918704986572, + "learning_rate": 1.8587434973989597e-05, + "loss": 0.4846, + "step": 15714 + }, + { + "epoch": 20.1152, + "grad_norm": 1.1290284395217896, + "learning_rate": 1.858543417366947e-05, + "loss": 0.5336, + "step": 15715 + }, + { + "epoch": 20.11648, + "grad_norm": 1.1011111736297607, + "learning_rate": 1.858343337334934e-05, + "loss": 0.4703, + "step": 15716 + }, + { + "epoch": 20.11776, + "grad_norm": 1.1040534973144531, + "learning_rate": 1.8581432573029212e-05, + "loss": 0.4649, + "step": 15717 + }, + { + "epoch": 20.11904, + "grad_norm": 1.0694252252578735, + "learning_rate": 1.8579431772709084e-05, + "loss": 0.4411, + "step": 15718 + }, + { + "epoch": 20.12032, + "grad_norm": 1.0621991157531738, + "learning_rate": 1.8577430972388956e-05, + "loss": 0.4636, + "step": 15719 + }, + { + "epoch": 20.1216, + "grad_norm": 1.0658694505691528, + "learning_rate": 1.8575430172068828e-05, + "loss": 0.4973, + "step": 15720 + }, + { + "epoch": 20.12288, + "grad_norm": 1.0541012287139893, + "learning_rate": 1.85734293717487e-05, + "loss": 0.492, + "step": 15721 + }, + { + "epoch": 20.12416, + "grad_norm": 1.139356255531311, + "learning_rate": 1.8571428571428572e-05, + "loss": 0.4854, + "step": 15722 + }, + { + "epoch": 20.12544, + "grad_norm": 1.0738463401794434, + "learning_rate": 1.8569427771108444e-05, + "loss": 0.5184, + "step": 15723 + }, + { + "epoch": 20.12672, + "grad_norm": 1.042602777481079, + "learning_rate": 1.8567426970788315e-05, + "loss": 0.4965, + "step": 15724 + }, + { + "epoch": 20.128, + "grad_norm": 1.100381851196289, + "learning_rate": 1.856542617046819e-05, + "loss": 0.52, + "step": 15725 + }, + { + "epoch": 20.12928, + "grad_norm": 1.0966511964797974, + "learning_rate": 1.856342537014806e-05, + "loss": 0.5214, + "step": 15726 + }, + { + "epoch": 20.13056, + "grad_norm": 
1.1034797430038452, + "learning_rate": 1.856142456982793e-05, + "loss": 0.4959, + "step": 15727 + }, + { + "epoch": 20.13184, + "grad_norm": 1.1695610284805298, + "learning_rate": 1.8559423769507803e-05, + "loss": 0.5517, + "step": 15728 + }, + { + "epoch": 20.13312, + "grad_norm": 1.1129649877548218, + "learning_rate": 1.8557422969187678e-05, + "loss": 0.4974, + "step": 15729 + }, + { + "epoch": 20.1344, + "grad_norm": 1.130495309829712, + "learning_rate": 1.8555422168867547e-05, + "loss": 0.4873, + "step": 15730 + }, + { + "epoch": 20.13568, + "grad_norm": 1.099576473236084, + "learning_rate": 1.855342136854742e-05, + "loss": 0.4751, + "step": 15731 + }, + { + "epoch": 20.13696, + "grad_norm": 1.0953304767608643, + "learning_rate": 1.8551420568227294e-05, + "loss": 0.5135, + "step": 15732 + }, + { + "epoch": 20.13824, + "grad_norm": 1.0430271625518799, + "learning_rate": 1.8549419767907166e-05, + "loss": 0.4849, + "step": 15733 + }, + { + "epoch": 20.13952, + "grad_norm": 1.1032804250717163, + "learning_rate": 1.8547418967587034e-05, + "loss": 0.5093, + "step": 15734 + }, + { + "epoch": 20.1408, + "grad_norm": 1.1103456020355225, + "learning_rate": 1.8545418167266906e-05, + "loss": 0.5148, + "step": 15735 + }, + { + "epoch": 20.14208, + "grad_norm": 1.1591553688049316, + "learning_rate": 1.854341736694678e-05, + "loss": 0.5335, + "step": 15736 + }, + { + "epoch": 20.14336, + "grad_norm": 1.0609407424926758, + "learning_rate": 1.8541416566626653e-05, + "loss": 0.469, + "step": 15737 + }, + { + "epoch": 20.14464, + "grad_norm": 1.0772405862808228, + "learning_rate": 1.853941576630652e-05, + "loss": 0.522, + "step": 15738 + }, + { + "epoch": 20.14592, + "grad_norm": 1.2019360065460205, + "learning_rate": 1.8537414965986397e-05, + "loss": 0.5215, + "step": 15739 + }, + { + "epoch": 20.1472, + "grad_norm": 1.0603306293487549, + "learning_rate": 1.853541416566627e-05, + "loss": 0.4832, + "step": 15740 + }, + { + "epoch": 20.14848, + "grad_norm": 1.1025315523147583, + 
"learning_rate": 1.853341336534614e-05, + "loss": 0.5234, + "step": 15741 + }, + { + "epoch": 20.14976, + "grad_norm": 1.0431936979293823, + "learning_rate": 1.853141256502601e-05, + "loss": 0.464, + "step": 15742 + }, + { + "epoch": 20.15104, + "grad_norm": 1.0916297435760498, + "learning_rate": 1.8529411764705884e-05, + "loss": 0.4768, + "step": 15743 + }, + { + "epoch": 20.15232, + "grad_norm": 1.0731911659240723, + "learning_rate": 1.8527410964385756e-05, + "loss": 0.4871, + "step": 15744 + }, + { + "epoch": 20.1536, + "grad_norm": 1.060876488685608, + "learning_rate": 1.8525410164065628e-05, + "loss": 0.4926, + "step": 15745 + }, + { + "epoch": 20.15488, + "grad_norm": 1.0614646673202515, + "learning_rate": 1.85234093637455e-05, + "loss": 0.4415, + "step": 15746 + }, + { + "epoch": 20.15616, + "grad_norm": 1.1642632484436035, + "learning_rate": 1.852140856342537e-05, + "loss": 0.5138, + "step": 15747 + }, + { + "epoch": 20.15744, + "grad_norm": 1.0110687017440796, + "learning_rate": 1.8519407763105243e-05, + "loss": 0.4631, + "step": 15748 + }, + { + "epoch": 20.15872, + "grad_norm": 1.130932331085205, + "learning_rate": 1.8517406962785115e-05, + "loss": 0.4553, + "step": 15749 + }, + { + "epoch": 20.16, + "grad_norm": 0.9995177388191223, + "learning_rate": 1.8515406162464987e-05, + "loss": 0.4915, + "step": 15750 + }, + { + "epoch": 20.16128, + "grad_norm": 1.0752531290054321, + "learning_rate": 1.851340536214486e-05, + "loss": 0.5099, + "step": 15751 + }, + { + "epoch": 20.16256, + "grad_norm": 1.0507272481918335, + "learning_rate": 1.851140456182473e-05, + "loss": 0.4755, + "step": 15752 + }, + { + "epoch": 20.16384, + "grad_norm": 1.1398614645004272, + "learning_rate": 1.8509403761504603e-05, + "loss": 0.5324, + "step": 15753 + }, + { + "epoch": 20.16512, + "grad_norm": 0.984160840511322, + "learning_rate": 1.8507402961184475e-05, + "loss": 0.4188, + "step": 15754 + }, + { + "epoch": 20.1664, + "grad_norm": 1.1006124019622803, + "learning_rate": 
1.8505402160864346e-05, + "loss": 0.5033, + "step": 15755 + }, + { + "epoch": 20.16768, + "grad_norm": 1.0734970569610596, + "learning_rate": 1.8503401360544218e-05, + "loss": 0.4785, + "step": 15756 + }, + { + "epoch": 20.16896, + "grad_norm": 1.1505695581436157, + "learning_rate": 1.850140056022409e-05, + "loss": 0.5547, + "step": 15757 + }, + { + "epoch": 20.17024, + "grad_norm": 1.147254228591919, + "learning_rate": 1.8499399759903962e-05, + "loss": 0.5077, + "step": 15758 + }, + { + "epoch": 20.17152, + "grad_norm": 1.0394471883773804, + "learning_rate": 1.8497398959583834e-05, + "loss": 0.465, + "step": 15759 + }, + { + "epoch": 20.1728, + "grad_norm": 1.0904779434204102, + "learning_rate": 1.849539815926371e-05, + "loss": 0.4909, + "step": 15760 + }, + { + "epoch": 20.17408, + "grad_norm": 1.2574462890625, + "learning_rate": 1.8493397358943578e-05, + "loss": 0.4928, + "step": 15761 + }, + { + "epoch": 20.17536, + "grad_norm": 1.0924097299575806, + "learning_rate": 1.849139655862345e-05, + "loss": 0.4794, + "step": 15762 + }, + { + "epoch": 20.17664, + "grad_norm": 1.0930646657943726, + "learning_rate": 1.848939575830332e-05, + "loss": 0.4895, + "step": 15763 + }, + { + "epoch": 20.17792, + "grad_norm": 1.0378696918487549, + "learning_rate": 1.8487394957983196e-05, + "loss": 0.5068, + "step": 15764 + }, + { + "epoch": 20.1792, + "grad_norm": 1.121130347251892, + "learning_rate": 1.8485394157663065e-05, + "loss": 0.49, + "step": 15765 + }, + { + "epoch": 20.18048, + "grad_norm": 1.1667362451553345, + "learning_rate": 1.8483393357342937e-05, + "loss": 0.4976, + "step": 15766 + }, + { + "epoch": 20.18176, + "grad_norm": 1.0337846279144287, + "learning_rate": 1.8481392557022812e-05, + "loss": 0.476, + "step": 15767 + }, + { + "epoch": 20.18304, + "grad_norm": 1.0466781854629517, + "learning_rate": 1.8479391756702684e-05, + "loss": 0.4418, + "step": 15768 + }, + { + "epoch": 20.18432, + "grad_norm": 1.1099622249603271, + "learning_rate": 1.8477390956382552e-05, + 
"loss": 0.5132, + "step": 15769 + }, + { + "epoch": 20.1856, + "grad_norm": 1.1434911489486694, + "learning_rate": 1.8475390156062424e-05, + "loss": 0.5056, + "step": 15770 + }, + { + "epoch": 20.18688, + "grad_norm": 1.0886666774749756, + "learning_rate": 1.84733893557423e-05, + "loss": 0.447, + "step": 15771 + }, + { + "epoch": 20.18816, + "grad_norm": 1.051747441291809, + "learning_rate": 1.847138855542217e-05, + "loss": 0.4715, + "step": 15772 + }, + { + "epoch": 20.18944, + "grad_norm": 1.1362502574920654, + "learning_rate": 1.846938775510204e-05, + "loss": 0.4843, + "step": 15773 + }, + { + "epoch": 20.19072, + "grad_norm": 1.0374624729156494, + "learning_rate": 1.846738695478191e-05, + "loss": 0.4653, + "step": 15774 + }, + { + "epoch": 20.192, + "grad_norm": 1.105280876159668, + "learning_rate": 1.8465386154461787e-05, + "loss": 0.4934, + "step": 15775 + }, + { + "epoch": 20.19328, + "grad_norm": 1.1130280494689941, + "learning_rate": 1.846338535414166e-05, + "loss": 0.5158, + "step": 15776 + }, + { + "epoch": 20.19456, + "grad_norm": 1.0840026140213013, + "learning_rate": 1.8461384553821527e-05, + "loss": 0.4837, + "step": 15777 + }, + { + "epoch": 20.19584, + "grad_norm": 1.0840171575546265, + "learning_rate": 1.8459383753501402e-05, + "loss": 0.4841, + "step": 15778 + }, + { + "epoch": 20.19712, + "grad_norm": 1.044407844543457, + "learning_rate": 1.8457382953181274e-05, + "loss": 0.4895, + "step": 15779 + }, + { + "epoch": 20.1984, + "grad_norm": 1.110978603363037, + "learning_rate": 1.8455382152861146e-05, + "loss": 0.4946, + "step": 15780 + }, + { + "epoch": 20.19968, + "grad_norm": 1.154561161994934, + "learning_rate": 1.8453381352541015e-05, + "loss": 0.5201, + "step": 15781 + }, + { + "epoch": 20.20096, + "grad_norm": 1.1208866834640503, + "learning_rate": 1.845138055222089e-05, + "loss": 0.4845, + "step": 15782 + }, + { + "epoch": 20.20224, + "grad_norm": 1.111349105834961, + "learning_rate": 1.8449379751900762e-05, + "loss": 0.4799, + "step": 
15783 + }, + { + "epoch": 20.20352, + "grad_norm": 1.1092686653137207, + "learning_rate": 1.8447378951580634e-05, + "loss": 0.4641, + "step": 15784 + }, + { + "epoch": 20.2048, + "grad_norm": 1.198625922203064, + "learning_rate": 1.8445378151260505e-05, + "loss": 0.498, + "step": 15785 + }, + { + "epoch": 20.20608, + "grad_norm": 1.128433108329773, + "learning_rate": 1.8443377350940377e-05, + "loss": 0.4709, + "step": 15786 + }, + { + "epoch": 20.20736, + "grad_norm": 1.02711820602417, + "learning_rate": 1.844137655062025e-05, + "loss": 0.4623, + "step": 15787 + }, + { + "epoch": 20.20864, + "grad_norm": 1.1294431686401367, + "learning_rate": 1.843937575030012e-05, + "loss": 0.4966, + "step": 15788 + }, + { + "epoch": 20.20992, + "grad_norm": 1.0591932535171509, + "learning_rate": 1.8437374949979993e-05, + "loss": 0.4802, + "step": 15789 + }, + { + "epoch": 20.2112, + "grad_norm": 1.1523479223251343, + "learning_rate": 1.8435374149659865e-05, + "loss": 0.5155, + "step": 15790 + }, + { + "epoch": 20.21248, + "grad_norm": 1.1084343194961548, + "learning_rate": 1.8433373349339737e-05, + "loss": 0.5098, + "step": 15791 + }, + { + "epoch": 20.21376, + "grad_norm": 1.1509208679199219, + "learning_rate": 1.843137254901961e-05, + "loss": 0.5099, + "step": 15792 + }, + { + "epoch": 20.21504, + "grad_norm": 1.1288859844207764, + "learning_rate": 1.842937174869948e-05, + "loss": 0.482, + "step": 15793 + }, + { + "epoch": 20.21632, + "grad_norm": 1.039110779762268, + "learning_rate": 1.8427370948379352e-05, + "loss": 0.4622, + "step": 15794 + }, + { + "epoch": 20.2176, + "grad_norm": 1.1868407726287842, + "learning_rate": 1.8425370148059224e-05, + "loss": 0.5312, + "step": 15795 + }, + { + "epoch": 20.21888, + "grad_norm": 1.0792521238327026, + "learning_rate": 1.8423369347739096e-05, + "loss": 0.4415, + "step": 15796 + }, + { + "epoch": 20.22016, + "grad_norm": 1.0646893978118896, + "learning_rate": 1.8421368547418968e-05, + "loss": 0.4616, + "step": 15797 + }, + { + "epoch": 
20.22144, + "grad_norm": 1.0488739013671875, + "learning_rate": 1.841936774709884e-05, + "loss": 0.4913, + "step": 15798 + }, + { + "epoch": 20.22272, + "grad_norm": 1.152377963066101, + "learning_rate": 1.8417366946778715e-05, + "loss": 0.5146, + "step": 15799 + }, + { + "epoch": 20.224, + "grad_norm": 1.0589985847473145, + "learning_rate": 1.8415366146458583e-05, + "loss": 0.4561, + "step": 15800 + }, + { + "epoch": 20.22528, + "grad_norm": 1.1091581583023071, + "learning_rate": 1.8413365346138455e-05, + "loss": 0.5, + "step": 15801 + }, + { + "epoch": 20.22656, + "grad_norm": 1.1588554382324219, + "learning_rate": 1.8411364545818327e-05, + "loss": 0.4701, + "step": 15802 + }, + { + "epoch": 20.22784, + "grad_norm": 1.0531977415084839, + "learning_rate": 1.8409363745498202e-05, + "loss": 0.4541, + "step": 15803 + }, + { + "epoch": 20.22912, + "grad_norm": 1.1038832664489746, + "learning_rate": 1.840736294517807e-05, + "loss": 0.4773, + "step": 15804 + }, + { + "epoch": 20.2304, + "grad_norm": 1.0699522495269775, + "learning_rate": 1.8405362144857943e-05, + "loss": 0.4831, + "step": 15805 + }, + { + "epoch": 20.23168, + "grad_norm": 1.085437297821045, + "learning_rate": 1.8403361344537818e-05, + "loss": 0.4914, + "step": 15806 + }, + { + "epoch": 20.23296, + "grad_norm": 1.104048728942871, + "learning_rate": 1.840136054421769e-05, + "loss": 0.4827, + "step": 15807 + }, + { + "epoch": 20.23424, + "grad_norm": 1.0480152368545532, + "learning_rate": 1.8399359743897558e-05, + "loss": 0.4724, + "step": 15808 + }, + { + "epoch": 20.23552, + "grad_norm": 1.1070879697799683, + "learning_rate": 1.839735894357743e-05, + "loss": 0.485, + "step": 15809 + }, + { + "epoch": 20.2368, + "grad_norm": 1.0816839933395386, + "learning_rate": 1.8395358143257305e-05, + "loss": 0.4814, + "step": 15810 + }, + { + "epoch": 20.23808, + "grad_norm": 1.0380091667175293, + "learning_rate": 1.8393357342937177e-05, + "loss": 0.483, + "step": 15811 + }, + { + "epoch": 20.23936, + "grad_norm": 
1.0764389038085938, + "learning_rate": 1.8391356542617046e-05, + "loss": 0.4797, + "step": 15812 + }, + { + "epoch": 20.24064, + "grad_norm": 1.1139464378356934, + "learning_rate": 1.838935574229692e-05, + "loss": 0.5232, + "step": 15813 + }, + { + "epoch": 20.24192, + "grad_norm": 1.0700820684432983, + "learning_rate": 1.8387354941976793e-05, + "loss": 0.434, + "step": 15814 + }, + { + "epoch": 20.2432, + "grad_norm": 1.0398091077804565, + "learning_rate": 1.8385354141656665e-05, + "loss": 0.4793, + "step": 15815 + }, + { + "epoch": 20.24448, + "grad_norm": 1.1150001287460327, + "learning_rate": 1.8383353341336533e-05, + "loss": 0.4747, + "step": 15816 + }, + { + "epoch": 20.24576, + "grad_norm": 1.1604424715042114, + "learning_rate": 1.8381352541016408e-05, + "loss": 0.5042, + "step": 15817 + }, + { + "epoch": 20.24704, + "grad_norm": 1.0625085830688477, + "learning_rate": 1.837935174069628e-05, + "loss": 0.4773, + "step": 15818 + }, + { + "epoch": 20.24832, + "grad_norm": 1.1010764837265015, + "learning_rate": 1.8377350940376152e-05, + "loss": 0.458, + "step": 15819 + }, + { + "epoch": 20.2496, + "grad_norm": 1.1224896907806396, + "learning_rate": 1.8375350140056024e-05, + "loss": 0.4832, + "step": 15820 + }, + { + "epoch": 20.25088, + "grad_norm": 1.1224029064178467, + "learning_rate": 1.8373349339735896e-05, + "loss": 0.4807, + "step": 15821 + }, + { + "epoch": 20.25216, + "grad_norm": 1.0098991394042969, + "learning_rate": 1.8371348539415768e-05, + "loss": 0.4576, + "step": 15822 + }, + { + "epoch": 20.25344, + "grad_norm": 1.065501093864441, + "learning_rate": 1.836934773909564e-05, + "loss": 0.4252, + "step": 15823 + }, + { + "epoch": 20.25472, + "grad_norm": 1.1191290616989136, + "learning_rate": 1.836734693877551e-05, + "loss": 0.4997, + "step": 15824 + }, + { + "epoch": 20.256, + "grad_norm": 1.107366919517517, + "learning_rate": 1.8365346138455383e-05, + "loss": 0.4822, + "step": 15825 + }, + { + "epoch": 20.25728, + "grad_norm": 1.0112433433532715, + 
"learning_rate": 1.8363345338135255e-05, + "loss": 0.4796, + "step": 15826 + }, + { + "epoch": 20.25856, + "grad_norm": 1.03781259059906, + "learning_rate": 1.8361344537815127e-05, + "loss": 0.4841, + "step": 15827 + }, + { + "epoch": 20.25984, + "grad_norm": 1.0869375467300415, + "learning_rate": 1.8359343737495e-05, + "loss": 0.5392, + "step": 15828 + }, + { + "epoch": 20.26112, + "grad_norm": 1.1285794973373413, + "learning_rate": 1.835734293717487e-05, + "loss": 0.4978, + "step": 15829 + }, + { + "epoch": 20.2624, + "grad_norm": 1.1054590940475464, + "learning_rate": 1.8355342136854742e-05, + "loss": 0.4876, + "step": 15830 + }, + { + "epoch": 20.26368, + "grad_norm": 1.0893003940582275, + "learning_rate": 1.8353341336534614e-05, + "loss": 0.4894, + "step": 15831 + }, + { + "epoch": 20.26496, + "grad_norm": 1.1795381307601929, + "learning_rate": 1.8351340536214486e-05, + "loss": 0.4894, + "step": 15832 + }, + { + "epoch": 20.26624, + "grad_norm": 1.0786329507827759, + "learning_rate": 1.8349339735894358e-05, + "loss": 0.4816, + "step": 15833 + }, + { + "epoch": 20.26752, + "grad_norm": 1.0577181577682495, + "learning_rate": 1.834733893557423e-05, + "loss": 0.4729, + "step": 15834 + }, + { + "epoch": 20.2688, + "grad_norm": 1.0976567268371582, + "learning_rate": 1.8345338135254102e-05, + "loss": 0.5178, + "step": 15835 + }, + { + "epoch": 20.27008, + "grad_norm": 1.0734076499938965, + "learning_rate": 1.8343337334933974e-05, + "loss": 0.4922, + "step": 15836 + }, + { + "epoch": 20.27136, + "grad_norm": 1.062116265296936, + "learning_rate": 1.8341336534613845e-05, + "loss": 0.4958, + "step": 15837 + }, + { + "epoch": 20.27264, + "grad_norm": 1.0870201587677002, + "learning_rate": 1.833933573429372e-05, + "loss": 0.4803, + "step": 15838 + }, + { + "epoch": 20.27392, + "grad_norm": 1.0811505317687988, + "learning_rate": 1.833733493397359e-05, + "loss": 0.5093, + "step": 15839 + }, + { + "epoch": 20.2752, + "grad_norm": 1.1151394844055176, + "learning_rate": 
1.833533413365346e-05, + "loss": 0.5054, + "step": 15840 + }, + { + "epoch": 20.27648, + "grad_norm": 1.1511766910552979, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.4998, + "step": 15841 + }, + { + "epoch": 20.27776, + "grad_norm": 1.0190060138702393, + "learning_rate": 1.8331332533013208e-05, + "loss": 0.4492, + "step": 15842 + }, + { + "epoch": 20.27904, + "grad_norm": 1.0209211111068726, + "learning_rate": 1.8329331732693077e-05, + "loss": 0.4544, + "step": 15843 + }, + { + "epoch": 20.28032, + "grad_norm": 1.0691982507705688, + "learning_rate": 1.832733093237295e-05, + "loss": 0.4621, + "step": 15844 + }, + { + "epoch": 20.2816, + "grad_norm": 1.1807547807693481, + "learning_rate": 1.8325330132052824e-05, + "loss": 0.5496, + "step": 15845 + }, + { + "epoch": 20.28288, + "grad_norm": 1.0593502521514893, + "learning_rate": 1.8323329331732696e-05, + "loss": 0.439, + "step": 15846 + }, + { + "epoch": 20.28416, + "grad_norm": 1.0794156789779663, + "learning_rate": 1.8321328531412564e-05, + "loss": 0.5034, + "step": 15847 + }, + { + "epoch": 20.28544, + "grad_norm": 1.1507887840270996, + "learning_rate": 1.8319327731092436e-05, + "loss": 0.5477, + "step": 15848 + }, + { + "epoch": 20.28672, + "grad_norm": 0.9900954365730286, + "learning_rate": 1.831732693077231e-05, + "loss": 0.4778, + "step": 15849 + }, + { + "epoch": 20.288, + "grad_norm": 1.0622221231460571, + "learning_rate": 1.8315326130452183e-05, + "loss": 0.4582, + "step": 15850 + }, + { + "epoch": 20.28928, + "grad_norm": 1.0999888181686401, + "learning_rate": 1.831332533013205e-05, + "loss": 0.4872, + "step": 15851 + }, + { + "epoch": 20.29056, + "grad_norm": 1.0076533555984497, + "learning_rate": 1.8311324529811927e-05, + "loss": 0.4649, + "step": 15852 + }, + { + "epoch": 20.29184, + "grad_norm": 1.1547635793685913, + "learning_rate": 1.83093237294918e-05, + "loss": 0.4873, + "step": 15853 + }, + { + "epoch": 20.29312, + "grad_norm": 1.1193164587020874, + "learning_rate": 
1.830732292917167e-05, + "loss": 0.4748, + "step": 15854 + }, + { + "epoch": 20.2944, + "grad_norm": 1.1040410995483398, + "learning_rate": 1.830532212885154e-05, + "loss": 0.4852, + "step": 15855 + }, + { + "epoch": 20.29568, + "grad_norm": 1.0710041522979736, + "learning_rate": 1.8303321328531414e-05, + "loss": 0.4598, + "step": 15856 + }, + { + "epoch": 20.29696, + "grad_norm": 1.1397336721420288, + "learning_rate": 1.8301320528211286e-05, + "loss": 0.5641, + "step": 15857 + }, + { + "epoch": 20.29824, + "grad_norm": 1.1837016344070435, + "learning_rate": 1.8299319727891158e-05, + "loss": 0.4928, + "step": 15858 + }, + { + "epoch": 20.29952, + "grad_norm": 1.112987756729126, + "learning_rate": 1.829731892757103e-05, + "loss": 0.4575, + "step": 15859 + }, + { + "epoch": 20.3008, + "grad_norm": 1.0400470495224, + "learning_rate": 1.82953181272509e-05, + "loss": 0.4527, + "step": 15860 + }, + { + "epoch": 20.30208, + "grad_norm": 1.0332235097885132, + "learning_rate": 1.8293317326930773e-05, + "loss": 0.4948, + "step": 15861 + }, + { + "epoch": 20.30336, + "grad_norm": 1.1508287191390991, + "learning_rate": 1.8291316526610645e-05, + "loss": 0.519, + "step": 15862 + }, + { + "epoch": 20.30464, + "grad_norm": 1.0921905040740967, + "learning_rate": 1.8289315726290517e-05, + "loss": 0.465, + "step": 15863 + }, + { + "epoch": 20.30592, + "grad_norm": 1.1255881786346436, + "learning_rate": 1.828731492597039e-05, + "loss": 0.5087, + "step": 15864 + }, + { + "epoch": 20.3072, + "grad_norm": 1.13408362865448, + "learning_rate": 1.828531412565026e-05, + "loss": 0.5263, + "step": 15865 + }, + { + "epoch": 20.30848, + "grad_norm": 1.1082432270050049, + "learning_rate": 1.8283313325330133e-05, + "loss": 0.477, + "step": 15866 + }, + { + "epoch": 20.30976, + "grad_norm": 1.0318244695663452, + "learning_rate": 1.8281312525010005e-05, + "loss": 0.4939, + "step": 15867 + }, + { + "epoch": 20.31104, + "grad_norm": 1.0345925092697144, + "learning_rate": 1.8279311724689876e-05, + 
"loss": 0.4818, + "step": 15868 + }, + { + "epoch": 20.31232, + "grad_norm": 1.1669683456420898, + "learning_rate": 1.8277310924369748e-05, + "loss": 0.5096, + "step": 15869 + }, + { + "epoch": 20.3136, + "grad_norm": 1.1364049911499023, + "learning_rate": 1.827531012404962e-05, + "loss": 0.4886, + "step": 15870 + }, + { + "epoch": 20.31488, + "grad_norm": 1.096252202987671, + "learning_rate": 1.8273309323729492e-05, + "loss": 0.4738, + "step": 15871 + }, + { + "epoch": 20.31616, + "grad_norm": 1.098500370979309, + "learning_rate": 1.8271308523409364e-05, + "loss": 0.5058, + "step": 15872 + }, + { + "epoch": 20.31744, + "grad_norm": 1.0478065013885498, + "learning_rate": 1.826930772308924e-05, + "loss": 0.4372, + "step": 15873 + }, + { + "epoch": 20.31872, + "grad_norm": 1.1461261510849, + "learning_rate": 1.8267306922769108e-05, + "loss": 0.5234, + "step": 15874 + }, + { + "epoch": 20.32, + "grad_norm": 1.1147645711898804, + "learning_rate": 1.826530612244898e-05, + "loss": 0.4923, + "step": 15875 + }, + { + "epoch": 20.32128, + "grad_norm": 1.076523780822754, + "learning_rate": 1.826330532212885e-05, + "loss": 0.4899, + "step": 15876 + }, + { + "epoch": 20.32256, + "grad_norm": 1.062248706817627, + "learning_rate": 1.8261304521808726e-05, + "loss": 0.4891, + "step": 15877 + }, + { + "epoch": 20.32384, + "grad_norm": 1.0742130279541016, + "learning_rate": 1.8259303721488595e-05, + "loss": 0.5267, + "step": 15878 + }, + { + "epoch": 20.32512, + "grad_norm": 1.046400547027588, + "learning_rate": 1.8257302921168467e-05, + "loss": 0.4743, + "step": 15879 + }, + { + "epoch": 20.3264, + "grad_norm": 1.1239981651306152, + "learning_rate": 1.8255302120848342e-05, + "loss": 0.4765, + "step": 15880 + }, + { + "epoch": 20.32768, + "grad_norm": 1.074862003326416, + "learning_rate": 1.8253301320528214e-05, + "loss": 0.4618, + "step": 15881 + }, + { + "epoch": 20.32896, + "grad_norm": 1.0675029754638672, + "learning_rate": 1.8251300520208082e-05, + "loss": 0.501, + "step": 
15882 + }, + { + "epoch": 20.33024, + "grad_norm": 1.0445536375045776, + "learning_rate": 1.8249299719887954e-05, + "loss": 0.5028, + "step": 15883 + }, + { + "epoch": 20.33152, + "grad_norm": 1.1353448629379272, + "learning_rate": 1.824729891956783e-05, + "loss": 0.4808, + "step": 15884 + }, + { + "epoch": 20.3328, + "grad_norm": 1.0861884355545044, + "learning_rate": 1.82452981192477e-05, + "loss": 0.4717, + "step": 15885 + }, + { + "epoch": 20.33408, + "grad_norm": 1.0767872333526611, + "learning_rate": 1.824329731892757e-05, + "loss": 0.4831, + "step": 15886 + }, + { + "epoch": 20.33536, + "grad_norm": 1.1319317817687988, + "learning_rate": 1.824129651860744e-05, + "loss": 0.5856, + "step": 15887 + }, + { + "epoch": 20.33664, + "grad_norm": 1.1070101261138916, + "learning_rate": 1.8239295718287317e-05, + "loss": 0.4876, + "step": 15888 + }, + { + "epoch": 20.33792, + "grad_norm": 1.0813465118408203, + "learning_rate": 1.823729491796719e-05, + "loss": 0.4761, + "step": 15889 + }, + { + "epoch": 20.3392, + "grad_norm": 1.1044111251831055, + "learning_rate": 1.8235294117647057e-05, + "loss": 0.5044, + "step": 15890 + }, + { + "epoch": 20.34048, + "grad_norm": 1.0086759328842163, + "learning_rate": 1.8233293317326932e-05, + "loss": 0.5094, + "step": 15891 + }, + { + "epoch": 20.34176, + "grad_norm": 1.0328330993652344, + "learning_rate": 1.8231292517006804e-05, + "loss": 0.4408, + "step": 15892 + }, + { + "epoch": 20.34304, + "grad_norm": 1.073428988456726, + "learning_rate": 1.8229291716686676e-05, + "loss": 0.4879, + "step": 15893 + }, + { + "epoch": 20.34432, + "grad_norm": 1.0707123279571533, + "learning_rate": 1.8227290916366545e-05, + "loss": 0.4873, + "step": 15894 + }, + { + "epoch": 20.3456, + "grad_norm": 0.9961058497428894, + "learning_rate": 1.822529011604642e-05, + "loss": 0.4728, + "step": 15895 + }, + { + "epoch": 20.34688, + "grad_norm": 1.1604456901550293, + "learning_rate": 1.8223289315726292e-05, + "loss": 0.4744, + "step": 15896 + }, + { + 
"epoch": 20.34816, + "grad_norm": 1.1048893928527832, + "learning_rate": 1.8221288515406164e-05, + "loss": 0.4744, + "step": 15897 + }, + { + "epoch": 20.34944, + "grad_norm": 1.0889872312545776, + "learning_rate": 1.8219287715086035e-05, + "loss": 0.4792, + "step": 15898 + }, + { + "epoch": 20.35072, + "grad_norm": 1.1309471130371094, + "learning_rate": 1.8217286914765907e-05, + "loss": 0.5117, + "step": 15899 + }, + { + "epoch": 20.352, + "grad_norm": 1.1387243270874023, + "learning_rate": 1.821528611444578e-05, + "loss": 0.5127, + "step": 15900 + }, + { + "epoch": 20.35328, + "grad_norm": 1.1357046365737915, + "learning_rate": 1.821328531412565e-05, + "loss": 0.5016, + "step": 15901 + }, + { + "epoch": 20.35456, + "grad_norm": 1.1048427820205688, + "learning_rate": 1.8211284513805523e-05, + "loss": 0.4902, + "step": 15902 + }, + { + "epoch": 20.35584, + "grad_norm": 1.0928595066070557, + "learning_rate": 1.8209283713485395e-05, + "loss": 0.4721, + "step": 15903 + }, + { + "epoch": 20.35712, + "grad_norm": 1.0246015787124634, + "learning_rate": 1.8207282913165267e-05, + "loss": 0.4882, + "step": 15904 + }, + { + "epoch": 20.3584, + "grad_norm": 1.0948463678359985, + "learning_rate": 1.820528211284514e-05, + "loss": 0.4531, + "step": 15905 + }, + { + "epoch": 20.35968, + "grad_norm": 1.0775697231292725, + "learning_rate": 1.820328131252501e-05, + "loss": 0.4587, + "step": 15906 + }, + { + "epoch": 20.36096, + "grad_norm": 1.0711140632629395, + "learning_rate": 1.8201280512204882e-05, + "loss": 0.4533, + "step": 15907 + }, + { + "epoch": 20.36224, + "grad_norm": 1.0861189365386963, + "learning_rate": 1.8199279711884754e-05, + "loss": 0.4906, + "step": 15908 + }, + { + "epoch": 20.36352, + "grad_norm": 1.1456248760223389, + "learning_rate": 1.8197278911564626e-05, + "loss": 0.4666, + "step": 15909 + }, + { + "epoch": 20.3648, + "grad_norm": 1.126597285270691, + "learning_rate": 1.8195278111244498e-05, + "loss": 0.5546, + "step": 15910 + }, + { + "epoch": 20.36608, + 
"grad_norm": 1.1376280784606934, + "learning_rate": 1.819327731092437e-05, + "loss": 0.497, + "step": 15911 + }, + { + "epoch": 20.36736, + "grad_norm": 1.084020972251892, + "learning_rate": 1.8191276510604245e-05, + "loss": 0.4715, + "step": 15912 + }, + { + "epoch": 20.36864, + "grad_norm": 1.0613341331481934, + "learning_rate": 1.8189275710284113e-05, + "loss": 0.4906, + "step": 15913 + }, + { + "epoch": 20.36992, + "grad_norm": 1.0506778955459595, + "learning_rate": 1.8187274909963985e-05, + "loss": 0.4739, + "step": 15914 + }, + { + "epoch": 20.3712, + "grad_norm": 1.0626603364944458, + "learning_rate": 1.8185274109643857e-05, + "loss": 0.4892, + "step": 15915 + }, + { + "epoch": 20.37248, + "grad_norm": 1.02902090549469, + "learning_rate": 1.8183273309323732e-05, + "loss": 0.5032, + "step": 15916 + }, + { + "epoch": 20.37376, + "grad_norm": 1.0012882947921753, + "learning_rate": 1.81812725090036e-05, + "loss": 0.4414, + "step": 15917 + }, + { + "epoch": 20.37504, + "grad_norm": 1.0998730659484863, + "learning_rate": 1.8179271708683473e-05, + "loss": 0.5046, + "step": 15918 + }, + { + "epoch": 20.37632, + "grad_norm": 1.138268232345581, + "learning_rate": 1.8177270908363348e-05, + "loss": 0.5413, + "step": 15919 + }, + { + "epoch": 20.3776, + "grad_norm": 1.1356502771377563, + "learning_rate": 1.817527010804322e-05, + "loss": 0.5459, + "step": 15920 + }, + { + "epoch": 20.37888, + "grad_norm": 1.0949009656906128, + "learning_rate": 1.8173269307723088e-05, + "loss": 0.481, + "step": 15921 + }, + { + "epoch": 20.38016, + "grad_norm": 1.0819274187088013, + "learning_rate": 1.817126850740296e-05, + "loss": 0.4975, + "step": 15922 + }, + { + "epoch": 20.38144, + "grad_norm": 1.084596037864685, + "learning_rate": 1.8169267707082835e-05, + "loss": 0.5315, + "step": 15923 + }, + { + "epoch": 20.38272, + "grad_norm": 1.1176440715789795, + "learning_rate": 1.8167266906762707e-05, + "loss": 0.4893, + "step": 15924 + }, + { + "epoch": 20.384, + "grad_norm": 
1.1294463872909546, + "learning_rate": 1.8165266106442576e-05, + "loss": 0.4869, + "step": 15925 + }, + { + "epoch": 20.38528, + "grad_norm": 1.177349328994751, + "learning_rate": 1.816326530612245e-05, + "loss": 0.476, + "step": 15926 + }, + { + "epoch": 20.38656, + "grad_norm": 1.15972101688385, + "learning_rate": 1.8161264505802323e-05, + "loss": 0.5329, + "step": 15927 + }, + { + "epoch": 20.38784, + "grad_norm": 1.0567160844802856, + "learning_rate": 1.8159263705482195e-05, + "loss": 0.4679, + "step": 15928 + }, + { + "epoch": 20.38912, + "grad_norm": 1.0844935178756714, + "learning_rate": 1.8157262905162063e-05, + "loss": 0.4588, + "step": 15929 + }, + { + "epoch": 20.3904, + "grad_norm": 1.057117223739624, + "learning_rate": 1.8155262104841938e-05, + "loss": 0.4974, + "step": 15930 + }, + { + "epoch": 20.39168, + "grad_norm": 1.1074366569519043, + "learning_rate": 1.815326130452181e-05, + "loss": 0.5375, + "step": 15931 + }, + { + "epoch": 20.39296, + "grad_norm": 1.081174612045288, + "learning_rate": 1.8151260504201682e-05, + "loss": 0.5087, + "step": 15932 + }, + { + "epoch": 20.39424, + "grad_norm": 1.090725064277649, + "learning_rate": 1.8149259703881554e-05, + "loss": 0.5162, + "step": 15933 + }, + { + "epoch": 20.39552, + "grad_norm": 1.1822116374969482, + "learning_rate": 1.8147258903561426e-05, + "loss": 0.4763, + "step": 15934 + }, + { + "epoch": 20.3968, + "grad_norm": 1.0998727083206177, + "learning_rate": 1.8145258103241298e-05, + "loss": 0.481, + "step": 15935 + }, + { + "epoch": 20.39808, + "grad_norm": 1.1375815868377686, + "learning_rate": 1.814325730292117e-05, + "loss": 0.5338, + "step": 15936 + }, + { + "epoch": 20.39936, + "grad_norm": 1.1332123279571533, + "learning_rate": 1.814125650260104e-05, + "loss": 0.5234, + "step": 15937 + }, + { + "epoch": 20.40064, + "grad_norm": 1.1074390411376953, + "learning_rate": 1.8139255702280913e-05, + "loss": 0.4858, + "step": 15938 + }, + { + "epoch": 20.40192, + "grad_norm": 1.059393286705017, + 
"learning_rate": 1.8137254901960785e-05, + "loss": 0.4912, + "step": 15939 + }, + { + "epoch": 20.4032, + "grad_norm": 1.132832646369934, + "learning_rate": 1.8135254101640657e-05, + "loss": 0.5001, + "step": 15940 + }, + { + "epoch": 20.40448, + "grad_norm": 1.0979952812194824, + "learning_rate": 1.813325330132053e-05, + "loss": 0.5107, + "step": 15941 + }, + { + "epoch": 20.40576, + "grad_norm": 1.1360996961593628, + "learning_rate": 1.81312525010004e-05, + "loss": 0.4633, + "step": 15942 + }, + { + "epoch": 20.40704, + "grad_norm": 1.0994714498519897, + "learning_rate": 1.8129251700680272e-05, + "loss": 0.4984, + "step": 15943 + }, + { + "epoch": 20.40832, + "grad_norm": 1.0364211797714233, + "learning_rate": 1.8127250900360144e-05, + "loss": 0.4605, + "step": 15944 + }, + { + "epoch": 20.4096, + "grad_norm": 1.0742835998535156, + "learning_rate": 1.8125250100040016e-05, + "loss": 0.4562, + "step": 15945 + }, + { + "epoch": 20.41088, + "grad_norm": 1.151170015335083, + "learning_rate": 1.8123249299719888e-05, + "loss": 0.4731, + "step": 15946 + }, + { + "epoch": 20.41216, + "grad_norm": 1.179980993270874, + "learning_rate": 1.812124849939976e-05, + "loss": 0.4855, + "step": 15947 + }, + { + "epoch": 20.41344, + "grad_norm": 1.1587342023849487, + "learning_rate": 1.8119247699079632e-05, + "loss": 0.5534, + "step": 15948 + }, + { + "epoch": 20.41472, + "grad_norm": 1.099782109260559, + "learning_rate": 1.8117246898759504e-05, + "loss": 0.4801, + "step": 15949 + }, + { + "epoch": 20.416, + "grad_norm": 1.149043321609497, + "learning_rate": 1.8115246098439375e-05, + "loss": 0.5156, + "step": 15950 + }, + { + "epoch": 20.41728, + "grad_norm": 1.0656511783599854, + "learning_rate": 1.811324529811925e-05, + "loss": 0.4889, + "step": 15951 + }, + { + "epoch": 20.41856, + "grad_norm": 1.1039444208145142, + "learning_rate": 1.811124449779912e-05, + "loss": 0.4805, + "step": 15952 + }, + { + "epoch": 20.41984, + "grad_norm": 1.2415475845336914, + "learning_rate": 
1.810924369747899e-05, + "loss": 0.5199, + "step": 15953 + }, + { + "epoch": 20.42112, + "grad_norm": 1.1294299364089966, + "learning_rate": 1.8107242897158863e-05, + "loss": 0.5036, + "step": 15954 + }, + { + "epoch": 20.4224, + "grad_norm": 1.0674455165863037, + "learning_rate": 1.8105242096838738e-05, + "loss": 0.5114, + "step": 15955 + }, + { + "epoch": 20.42368, + "grad_norm": 1.1650712490081787, + "learning_rate": 1.8103241296518607e-05, + "loss": 0.4882, + "step": 15956 + }, + { + "epoch": 20.42496, + "grad_norm": 1.055827260017395, + "learning_rate": 1.810124049619848e-05, + "loss": 0.4596, + "step": 15957 + }, + { + "epoch": 20.42624, + "grad_norm": 1.0579880475997925, + "learning_rate": 1.8099239695878354e-05, + "loss": 0.4844, + "step": 15958 + }, + { + "epoch": 20.42752, + "grad_norm": 1.1154268980026245, + "learning_rate": 1.8097238895558226e-05, + "loss": 0.5422, + "step": 15959 + }, + { + "epoch": 20.4288, + "grad_norm": 1.1454224586486816, + "learning_rate": 1.8095238095238094e-05, + "loss": 0.5215, + "step": 15960 + }, + { + "epoch": 20.43008, + "grad_norm": 1.1174713373184204, + "learning_rate": 1.8093237294917966e-05, + "loss": 0.5044, + "step": 15961 + }, + { + "epoch": 20.43136, + "grad_norm": 1.1929234266281128, + "learning_rate": 1.809123649459784e-05, + "loss": 0.5606, + "step": 15962 + }, + { + "epoch": 20.43264, + "grad_norm": 1.2372697591781616, + "learning_rate": 1.8089235694277713e-05, + "loss": 0.535, + "step": 15963 + }, + { + "epoch": 20.43392, + "grad_norm": 1.1086331605911255, + "learning_rate": 1.808723489395758e-05, + "loss": 0.5191, + "step": 15964 + }, + { + "epoch": 20.4352, + "grad_norm": 1.1372125148773193, + "learning_rate": 1.8085234093637457e-05, + "loss": 0.5431, + "step": 15965 + }, + { + "epoch": 20.43648, + "grad_norm": 1.093470811843872, + "learning_rate": 1.808323329331733e-05, + "loss": 0.4876, + "step": 15966 + }, + { + "epoch": 20.43776, + "grad_norm": 1.1044193506240845, + "learning_rate": 1.80812324929972e-05, 
+ "loss": 0.4605, + "step": 15967 + }, + { + "epoch": 20.43904, + "grad_norm": 1.069149136543274, + "learning_rate": 1.807923169267707e-05, + "loss": 0.4948, + "step": 15968 + }, + { + "epoch": 20.44032, + "grad_norm": 1.0932533740997314, + "learning_rate": 1.8077230892356944e-05, + "loss": 0.4867, + "step": 15969 + }, + { + "epoch": 20.4416, + "grad_norm": 1.1302661895751953, + "learning_rate": 1.8075230092036816e-05, + "loss": 0.4979, + "step": 15970 + }, + { + "epoch": 20.44288, + "grad_norm": 1.0727626085281372, + "learning_rate": 1.8073229291716688e-05, + "loss": 0.4647, + "step": 15971 + }, + { + "epoch": 20.44416, + "grad_norm": 1.1052957773208618, + "learning_rate": 1.807122849139656e-05, + "loss": 0.5308, + "step": 15972 + }, + { + "epoch": 20.44544, + "grad_norm": 1.0423941612243652, + "learning_rate": 1.806922769107643e-05, + "loss": 0.4499, + "step": 15973 + }, + { + "epoch": 20.44672, + "grad_norm": 1.0597472190856934, + "learning_rate": 1.8067226890756303e-05, + "loss": 0.4767, + "step": 15974 + }, + { + "epoch": 20.448, + "grad_norm": 1.0862598419189453, + "learning_rate": 1.8065226090436175e-05, + "loss": 0.5064, + "step": 15975 + }, + { + "epoch": 20.44928, + "grad_norm": 1.124342441558838, + "learning_rate": 1.8063225290116047e-05, + "loss": 0.5078, + "step": 15976 + }, + { + "epoch": 20.45056, + "grad_norm": 1.1150842905044556, + "learning_rate": 1.806122448979592e-05, + "loss": 0.5201, + "step": 15977 + }, + { + "epoch": 20.45184, + "grad_norm": 1.101412296295166, + "learning_rate": 1.805922368947579e-05, + "loss": 0.5034, + "step": 15978 + }, + { + "epoch": 20.45312, + "grad_norm": 1.0810297727584839, + "learning_rate": 1.8057222889155666e-05, + "loss": 0.5046, + "step": 15979 + }, + { + "epoch": 20.4544, + "grad_norm": 1.1513960361480713, + "learning_rate": 1.8055222088835534e-05, + "loss": 0.5528, + "step": 15980 + }, + { + "epoch": 20.45568, + "grad_norm": 1.0624333620071411, + "learning_rate": 1.8053221288515406e-05, + "loss": 0.4878, + 
"step": 15981 + }, + { + "epoch": 20.45696, + "grad_norm": 1.1114479303359985, + "learning_rate": 1.8051220488195278e-05, + "loss": 0.5168, + "step": 15982 + }, + { + "epoch": 20.45824, + "grad_norm": 1.1178675889968872, + "learning_rate": 1.8049219687875153e-05, + "loss": 0.5037, + "step": 15983 + }, + { + "epoch": 20.45952, + "grad_norm": 1.145005226135254, + "learning_rate": 1.8047218887555022e-05, + "loss": 0.5133, + "step": 15984 + }, + { + "epoch": 20.4608, + "grad_norm": 1.016296625137329, + "learning_rate": 1.8045218087234894e-05, + "loss": 0.4644, + "step": 15985 + }, + { + "epoch": 20.46208, + "grad_norm": 1.0797942876815796, + "learning_rate": 1.804321728691477e-05, + "loss": 0.457, + "step": 15986 + }, + { + "epoch": 20.46336, + "grad_norm": 1.0780278444290161, + "learning_rate": 1.804121648659464e-05, + "loss": 0.5116, + "step": 15987 + }, + { + "epoch": 20.46464, + "grad_norm": 1.1817333698272705, + "learning_rate": 1.803921568627451e-05, + "loss": 0.4981, + "step": 15988 + }, + { + "epoch": 20.46592, + "grad_norm": 1.0998475551605225, + "learning_rate": 1.803721488595438e-05, + "loss": 0.4816, + "step": 15989 + }, + { + "epoch": 20.4672, + "grad_norm": 1.1314204931259155, + "learning_rate": 1.8035214085634256e-05, + "loss": 0.5686, + "step": 15990 + }, + { + "epoch": 20.46848, + "grad_norm": 1.0639474391937256, + "learning_rate": 1.803321328531413e-05, + "loss": 0.4855, + "step": 15991 + }, + { + "epoch": 20.46976, + "grad_norm": 0.9945549964904785, + "learning_rate": 1.8031212484993997e-05, + "loss": 0.4438, + "step": 15992 + }, + { + "epoch": 20.47104, + "grad_norm": 1.022700548171997, + "learning_rate": 1.8029211684673872e-05, + "loss": 0.47, + "step": 15993 + }, + { + "epoch": 20.47232, + "grad_norm": 1.0951484441757202, + "learning_rate": 1.8027210884353744e-05, + "loss": 0.5013, + "step": 15994 + }, + { + "epoch": 20.4736, + "grad_norm": 1.067020058631897, + "learning_rate": 1.8025210084033616e-05, + "loss": 0.5355, + "step": 15995 + }, + { + 
"epoch": 20.47488, + "grad_norm": 1.0288721323013306, + "learning_rate": 1.8023209283713484e-05, + "loss": 0.4643, + "step": 15996 + }, + { + "epoch": 20.47616, + "grad_norm": 1.031306505203247, + "learning_rate": 1.802120848339336e-05, + "loss": 0.4628, + "step": 15997 + }, + { + "epoch": 20.47744, + "grad_norm": 1.183764100074768, + "learning_rate": 1.801920768307323e-05, + "loss": 0.5168, + "step": 15998 + }, + { + "epoch": 20.47872, + "grad_norm": 0.9885333180427551, + "learning_rate": 1.8017206882753103e-05, + "loss": 0.4841, + "step": 15999 + }, + { + "epoch": 20.48, + "grad_norm": 1.01718008518219, + "learning_rate": 1.801520608243297e-05, + "loss": 0.5065, + "step": 16000 + }, + { + "epoch": 20.48128, + "grad_norm": 1.058000922203064, + "learning_rate": 1.8013205282112847e-05, + "loss": 0.4702, + "step": 16001 + }, + { + "epoch": 20.48256, + "grad_norm": 1.0549604892730713, + "learning_rate": 1.801120448179272e-05, + "loss": 0.4522, + "step": 16002 + }, + { + "epoch": 20.48384, + "grad_norm": 1.2326083183288574, + "learning_rate": 1.800920368147259e-05, + "loss": 0.5391, + "step": 16003 + }, + { + "epoch": 20.48512, + "grad_norm": 1.0707967281341553, + "learning_rate": 1.8007202881152462e-05, + "loss": 0.4603, + "step": 16004 + }, + { + "epoch": 20.4864, + "grad_norm": 1.141228199005127, + "learning_rate": 1.8005202080832334e-05, + "loss": 0.5162, + "step": 16005 + }, + { + "epoch": 20.48768, + "grad_norm": 1.074081540107727, + "learning_rate": 1.8003201280512206e-05, + "loss": 0.4736, + "step": 16006 + }, + { + "epoch": 20.48896, + "grad_norm": 1.0647474527359009, + "learning_rate": 1.8001200480192078e-05, + "loss": 0.4778, + "step": 16007 + }, + { + "epoch": 20.49024, + "grad_norm": 1.1163427829742432, + "learning_rate": 1.799919967987195e-05, + "loss": 0.5002, + "step": 16008 + }, + { + "epoch": 20.49152, + "grad_norm": 1.0871421098709106, + "learning_rate": 1.7997198879551822e-05, + "loss": 0.485, + "step": 16009 + }, + { + "epoch": 20.4928, + 
"grad_norm": 1.0807521343231201, + "learning_rate": 1.7995198079231694e-05, + "loss": 0.4932, + "step": 16010 + }, + { + "epoch": 20.49408, + "grad_norm": 1.119227409362793, + "learning_rate": 1.7993197278911565e-05, + "loss": 0.5201, + "step": 16011 + }, + { + "epoch": 20.49536, + "grad_norm": 1.0895061492919922, + "learning_rate": 1.7991196478591437e-05, + "loss": 0.495, + "step": 16012 + }, + { + "epoch": 20.49664, + "grad_norm": 1.0847043991088867, + "learning_rate": 1.798919567827131e-05, + "loss": 0.484, + "step": 16013 + }, + { + "epoch": 20.49792, + "grad_norm": 1.0497854948043823, + "learning_rate": 1.798719487795118e-05, + "loss": 0.4332, + "step": 16014 + }, + { + "epoch": 20.4992, + "grad_norm": 1.1417521238327026, + "learning_rate": 1.7985194077631053e-05, + "loss": 0.518, + "step": 16015 + }, + { + "epoch": 20.50048, + "grad_norm": 1.1823201179504395, + "learning_rate": 1.7983193277310925e-05, + "loss": 0.4732, + "step": 16016 + }, + { + "epoch": 20.50176, + "grad_norm": 1.0755385160446167, + "learning_rate": 1.7981192476990797e-05, + "loss": 0.4858, + "step": 16017 + }, + { + "epoch": 20.50304, + "grad_norm": 1.1058756113052368, + "learning_rate": 1.7979191676670672e-05, + "loss": 0.4979, + "step": 16018 + }, + { + "epoch": 20.50432, + "grad_norm": 1.109181523323059, + "learning_rate": 1.797719087635054e-05, + "loss": 0.5062, + "step": 16019 + }, + { + "epoch": 20.5056, + "grad_norm": 1.0815677642822266, + "learning_rate": 1.7975190076030412e-05, + "loss": 0.5043, + "step": 16020 + }, + { + "epoch": 20.50688, + "grad_norm": 1.1085656881332397, + "learning_rate": 1.7973189275710284e-05, + "loss": 0.4851, + "step": 16021 + }, + { + "epoch": 20.50816, + "grad_norm": 1.077195644378662, + "learning_rate": 1.797118847539016e-05, + "loss": 0.4885, + "step": 16022 + }, + { + "epoch": 20.50944, + "grad_norm": 0.9872754812240601, + "learning_rate": 1.7969187675070028e-05, + "loss": 0.4926, + "step": 16023 + }, + { + "epoch": 20.51072, + "grad_norm": 
1.0659494400024414, + "learning_rate": 1.79671868747499e-05, + "loss": 0.447, + "step": 16024 + }, + { + "epoch": 20.512, + "grad_norm": 1.0598735809326172, + "learning_rate": 1.7965186074429775e-05, + "loss": 0.487, + "step": 16025 + }, + { + "epoch": 20.51328, + "grad_norm": 1.0480681657791138, + "learning_rate": 1.7963185274109647e-05, + "loss": 0.4795, + "step": 16026 + }, + { + "epoch": 20.51456, + "grad_norm": 1.0445165634155273, + "learning_rate": 1.7961184473789515e-05, + "loss": 0.4792, + "step": 16027 + }, + { + "epoch": 20.51584, + "grad_norm": 1.084743857383728, + "learning_rate": 1.7959183673469387e-05, + "loss": 0.5358, + "step": 16028 + }, + { + "epoch": 20.51712, + "grad_norm": 1.127386212348938, + "learning_rate": 1.7957182873149262e-05, + "loss": 0.514, + "step": 16029 + }, + { + "epoch": 20.5184, + "grad_norm": 1.1147735118865967, + "learning_rate": 1.7955182072829134e-05, + "loss": 0.53, + "step": 16030 + }, + { + "epoch": 20.51968, + "grad_norm": 1.0256705284118652, + "learning_rate": 1.7953181272509003e-05, + "loss": 0.4541, + "step": 16031 + }, + { + "epoch": 20.52096, + "grad_norm": 1.1226087808609009, + "learning_rate": 1.7951180472188878e-05, + "loss": 0.5, + "step": 16032 + }, + { + "epoch": 20.52224, + "grad_norm": 1.1469954252243042, + "learning_rate": 1.794917967186875e-05, + "loss": 0.5799, + "step": 16033 + }, + { + "epoch": 20.52352, + "grad_norm": 1.1341044902801514, + "learning_rate": 1.794717887154862e-05, + "loss": 0.547, + "step": 16034 + }, + { + "epoch": 20.5248, + "grad_norm": 1.123470425605774, + "learning_rate": 1.794517807122849e-05, + "loss": 0.4843, + "step": 16035 + }, + { + "epoch": 20.52608, + "grad_norm": 1.1044238805770874, + "learning_rate": 1.7943177270908365e-05, + "loss": 0.5044, + "step": 16036 + }, + { + "epoch": 20.52736, + "grad_norm": 1.021741271018982, + "learning_rate": 1.7941176470588237e-05, + "loss": 0.4652, + "step": 16037 + }, + { + "epoch": 20.52864, + "grad_norm": 1.1132144927978516, + 
"learning_rate": 1.793917567026811e-05, + "loss": 0.486, + "step": 16038 + }, + { + "epoch": 20.52992, + "grad_norm": 1.0857055187225342, + "learning_rate": 1.793717486994798e-05, + "loss": 0.5323, + "step": 16039 + }, + { + "epoch": 20.5312, + "grad_norm": 1.1283453702926636, + "learning_rate": 1.7935174069627853e-05, + "loss": 0.4848, + "step": 16040 + }, + { + "epoch": 20.53248, + "grad_norm": 1.175715684890747, + "learning_rate": 1.7933173269307725e-05, + "loss": 0.5741, + "step": 16041 + }, + { + "epoch": 20.53376, + "grad_norm": 1.1087825298309326, + "learning_rate": 1.7931172468987596e-05, + "loss": 0.4799, + "step": 16042 + }, + { + "epoch": 20.53504, + "grad_norm": 1.1053529977798462, + "learning_rate": 1.7929171668667468e-05, + "loss": 0.5016, + "step": 16043 + }, + { + "epoch": 20.53632, + "grad_norm": 1.1228110790252686, + "learning_rate": 1.792717086834734e-05, + "loss": 0.5231, + "step": 16044 + }, + { + "epoch": 20.5376, + "grad_norm": 1.127894639968872, + "learning_rate": 1.7925170068027212e-05, + "loss": 0.5069, + "step": 16045 + }, + { + "epoch": 20.53888, + "grad_norm": 1.0892380475997925, + "learning_rate": 1.7923169267707084e-05, + "loss": 0.5249, + "step": 16046 + }, + { + "epoch": 20.54016, + "grad_norm": 1.0959818363189697, + "learning_rate": 1.7921168467386956e-05, + "loss": 0.511, + "step": 16047 + }, + { + "epoch": 20.54144, + "grad_norm": 1.0303337574005127, + "learning_rate": 1.7919167667066828e-05, + "loss": 0.4818, + "step": 16048 + }, + { + "epoch": 20.54272, + "grad_norm": 1.0872350931167603, + "learning_rate": 1.79171668667467e-05, + "loss": 0.4872, + "step": 16049 + }, + { + "epoch": 20.544, + "grad_norm": 1.132896065711975, + "learning_rate": 1.791516606642657e-05, + "loss": 0.5216, + "step": 16050 + }, + { + "epoch": 20.545279999999998, + "grad_norm": 1.0700019598007202, + "learning_rate": 1.7913165266106443e-05, + "loss": 0.4918, + "step": 16051 + }, + { + "epoch": 20.54656, + "grad_norm": 1.0675747394561768, + "learning_rate": 
1.7911164465786315e-05, + "loss": 0.4883, + "step": 16052 + }, + { + "epoch": 20.54784, + "grad_norm": 1.1021859645843506, + "learning_rate": 1.7909163665466187e-05, + "loss": 0.4742, + "step": 16053 + }, + { + "epoch": 20.54912, + "grad_norm": 1.0696967840194702, + "learning_rate": 1.790716286514606e-05, + "loss": 0.4873, + "step": 16054 + }, + { + "epoch": 20.5504, + "grad_norm": 1.0992937088012695, + "learning_rate": 1.790516206482593e-05, + "loss": 0.5003, + "step": 16055 + }, + { + "epoch": 20.55168, + "grad_norm": 1.0972262620925903, + "learning_rate": 1.7903161264505802e-05, + "loss": 0.562, + "step": 16056 + }, + { + "epoch": 20.55296, + "grad_norm": 1.1256171464920044, + "learning_rate": 1.7901160464185678e-05, + "loss": 0.4714, + "step": 16057 + }, + { + "epoch": 20.55424, + "grad_norm": 1.1551443338394165, + "learning_rate": 1.7899159663865546e-05, + "loss": 0.5156, + "step": 16058 + }, + { + "epoch": 20.55552, + "grad_norm": 1.0866180658340454, + "learning_rate": 1.7897158863545418e-05, + "loss": 0.4726, + "step": 16059 + }, + { + "epoch": 20.5568, + "grad_norm": 1.1982043981552124, + "learning_rate": 1.789515806322529e-05, + "loss": 0.5541, + "step": 16060 + }, + { + "epoch": 20.55808, + "grad_norm": 1.107646107673645, + "learning_rate": 1.7893157262905165e-05, + "loss": 0.5068, + "step": 16061 + }, + { + "epoch": 20.55936, + "grad_norm": 1.1011683940887451, + "learning_rate": 1.7891156462585034e-05, + "loss": 0.5374, + "step": 16062 + }, + { + "epoch": 20.56064, + "grad_norm": 1.1166385412216187, + "learning_rate": 1.7889155662264905e-05, + "loss": 0.4849, + "step": 16063 + }, + { + "epoch": 20.56192, + "grad_norm": 1.1044502258300781, + "learning_rate": 1.788715486194478e-05, + "loss": 0.4947, + "step": 16064 + }, + { + "epoch": 20.5632, + "grad_norm": 1.126716136932373, + "learning_rate": 1.7885154061624652e-05, + "loss": 0.5107, + "step": 16065 + }, + { + "epoch": 20.56448, + "grad_norm": 1.119673252105713, + "learning_rate": 1.788315326130452e-05, 
+ "loss": 0.5369, + "step": 16066 + }, + { + "epoch": 20.56576, + "grad_norm": 1.074339747428894, + "learning_rate": 1.7881152460984393e-05, + "loss": 0.4935, + "step": 16067 + }, + { + "epoch": 20.56704, + "grad_norm": 1.1229872703552246, + "learning_rate": 1.7879151660664268e-05, + "loss": 0.5122, + "step": 16068 + }, + { + "epoch": 20.56832, + "grad_norm": 1.1117804050445557, + "learning_rate": 1.787715086034414e-05, + "loss": 0.4864, + "step": 16069 + }, + { + "epoch": 20.5696, + "grad_norm": 1.0915405750274658, + "learning_rate": 1.787515006002401e-05, + "loss": 0.5351, + "step": 16070 + }, + { + "epoch": 20.57088, + "grad_norm": 1.1257494688034058, + "learning_rate": 1.7873149259703884e-05, + "loss": 0.4874, + "step": 16071 + }, + { + "epoch": 20.57216, + "grad_norm": 1.1443331241607666, + "learning_rate": 1.7871148459383755e-05, + "loss": 0.4771, + "step": 16072 + }, + { + "epoch": 20.57344, + "grad_norm": 1.1253244876861572, + "learning_rate": 1.7869147659063627e-05, + "loss": 0.492, + "step": 16073 + }, + { + "epoch": 20.57472, + "grad_norm": 1.1300221681594849, + "learning_rate": 1.7867146858743496e-05, + "loss": 0.5306, + "step": 16074 + }, + { + "epoch": 20.576, + "grad_norm": 1.1364995241165161, + "learning_rate": 1.786514605842337e-05, + "loss": 0.5069, + "step": 16075 + }, + { + "epoch": 20.577280000000002, + "grad_norm": 1.0387669801712036, + "learning_rate": 1.7863145258103243e-05, + "loss": 0.4726, + "step": 16076 + }, + { + "epoch": 20.57856, + "grad_norm": 1.049291968345642, + "learning_rate": 1.7861144457783115e-05, + "loss": 0.4852, + "step": 16077 + }, + { + "epoch": 20.57984, + "grad_norm": 1.0444282293319702, + "learning_rate": 1.7859143657462987e-05, + "loss": 0.4718, + "step": 16078 + }, + { + "epoch": 20.58112, + "grad_norm": 1.095975399017334, + "learning_rate": 1.785714285714286e-05, + "loss": 0.5015, + "step": 16079 + }, + { + "epoch": 20.5824, + "grad_norm": 1.104600191116333, + "learning_rate": 1.785514205682273e-05, + "loss": 
0.5515, + "step": 16080 + }, + { + "epoch": 20.58368, + "grad_norm": 1.0863333940505981, + "learning_rate": 1.7853141256502602e-05, + "loss": 0.453, + "step": 16081 + }, + { + "epoch": 20.58496, + "grad_norm": 1.1413263082504272, + "learning_rate": 1.7851140456182474e-05, + "loss": 0.5646, + "step": 16082 + }, + { + "epoch": 20.58624, + "grad_norm": 1.149473786354065, + "learning_rate": 1.7849139655862346e-05, + "loss": 0.4853, + "step": 16083 + }, + { + "epoch": 20.58752, + "grad_norm": 1.069388747215271, + "learning_rate": 1.7847138855542218e-05, + "loss": 0.4969, + "step": 16084 + }, + { + "epoch": 20.5888, + "grad_norm": 1.156481385231018, + "learning_rate": 1.784513805522209e-05, + "loss": 0.5106, + "step": 16085 + }, + { + "epoch": 20.59008, + "grad_norm": 1.0707303285598755, + "learning_rate": 1.784313725490196e-05, + "loss": 0.467, + "step": 16086 + }, + { + "epoch": 20.59136, + "grad_norm": 1.1706613302230835, + "learning_rate": 1.7841136454581833e-05, + "loss": 0.5354, + "step": 16087 + }, + { + "epoch": 20.59264, + "grad_norm": 1.0919955968856812, + "learning_rate": 1.7839135654261705e-05, + "loss": 0.4906, + "step": 16088 + }, + { + "epoch": 20.59392, + "grad_norm": 1.0173168182373047, + "learning_rate": 1.7837134853941577e-05, + "loss": 0.4586, + "step": 16089 + }, + { + "epoch": 20.5952, + "grad_norm": 1.1111178398132324, + "learning_rate": 1.783513405362145e-05, + "loss": 0.5209, + "step": 16090 + }, + { + "epoch": 20.59648, + "grad_norm": 1.0708633661270142, + "learning_rate": 1.783313325330132e-05, + "loss": 0.4752, + "step": 16091 + }, + { + "epoch": 20.59776, + "grad_norm": 1.0484986305236816, + "learning_rate": 1.7831132452981196e-05, + "loss": 0.491, + "step": 16092 + }, + { + "epoch": 20.59904, + "grad_norm": 1.205859899520874, + "learning_rate": 1.7829131652661064e-05, + "loss": 0.4858, + "step": 16093 + }, + { + "epoch": 20.60032, + "grad_norm": 1.1603915691375732, + "learning_rate": 1.7827130852340936e-05, + "loss": 0.4818, + "step": 16094 
+ }, + { + "epoch": 20.6016, + "grad_norm": 1.0573688745498657, + "learning_rate": 1.7825130052020808e-05, + "loss": 0.4992, + "step": 16095 + }, + { + "epoch": 20.60288, + "grad_norm": 1.1221954822540283, + "learning_rate": 1.7823129251700683e-05, + "loss": 0.4625, + "step": 16096 + }, + { + "epoch": 20.60416, + "grad_norm": 1.1069375276565552, + "learning_rate": 1.7821128451380552e-05, + "loss": 0.502, + "step": 16097 + }, + { + "epoch": 20.60544, + "grad_norm": 1.0597623586654663, + "learning_rate": 1.7819127651060424e-05, + "loss": 0.4757, + "step": 16098 + }, + { + "epoch": 20.60672, + "grad_norm": 1.1148048639297485, + "learning_rate": 1.78171268507403e-05, + "loss": 0.521, + "step": 16099 + }, + { + "epoch": 20.608, + "grad_norm": 1.1164958477020264, + "learning_rate": 1.781512605042017e-05, + "loss": 0.4964, + "step": 16100 + }, + { + "epoch": 20.60928, + "grad_norm": 1.063112735748291, + "learning_rate": 1.781312525010004e-05, + "loss": 0.51, + "step": 16101 + }, + { + "epoch": 20.61056, + "grad_norm": 1.1305928230285645, + "learning_rate": 1.781112444977991e-05, + "loss": 0.5106, + "step": 16102 + }, + { + "epoch": 20.61184, + "grad_norm": 1.0931042432785034, + "learning_rate": 1.7809123649459786e-05, + "loss": 0.482, + "step": 16103 + }, + { + "epoch": 20.61312, + "grad_norm": 1.0625600814819336, + "learning_rate": 1.7807122849139658e-05, + "loss": 0.4919, + "step": 16104 + }, + { + "epoch": 20.6144, + "grad_norm": 1.0991266965866089, + "learning_rate": 1.7805122048819527e-05, + "loss": 0.481, + "step": 16105 + }, + { + "epoch": 20.61568, + "grad_norm": 1.0887682437896729, + "learning_rate": 1.7803121248499402e-05, + "loss": 0.5264, + "step": 16106 + }, + { + "epoch": 20.61696, + "grad_norm": 1.150978446006775, + "learning_rate": 1.7801120448179274e-05, + "loss": 0.5575, + "step": 16107 + }, + { + "epoch": 20.61824, + "grad_norm": 1.1687663793563843, + "learning_rate": 1.7799119647859146e-05, + "loss": 0.4825, + "step": 16108 + }, + { + "epoch": 
20.61952, + "grad_norm": 1.1047236919403076, + "learning_rate": 1.7797118847539014e-05, + "loss": 0.513, + "step": 16109 + }, + { + "epoch": 20.6208, + "grad_norm": 1.1841906309127808, + "learning_rate": 1.779511804721889e-05, + "loss": 0.448, + "step": 16110 + }, + { + "epoch": 20.62208, + "grad_norm": 1.0986437797546387, + "learning_rate": 1.779311724689876e-05, + "loss": 0.5195, + "step": 16111 + }, + { + "epoch": 20.62336, + "grad_norm": 1.110182762145996, + "learning_rate": 1.7791116446578633e-05, + "loss": 0.5159, + "step": 16112 + }, + { + "epoch": 20.62464, + "grad_norm": 1.0949082374572754, + "learning_rate": 1.77891156462585e-05, + "loss": 0.5092, + "step": 16113 + }, + { + "epoch": 20.62592, + "grad_norm": 1.1522455215454102, + "learning_rate": 1.7787114845938377e-05, + "loss": 0.5514, + "step": 16114 + }, + { + "epoch": 20.6272, + "grad_norm": 1.0845671892166138, + "learning_rate": 1.778511404561825e-05, + "loss": 0.4713, + "step": 16115 + }, + { + "epoch": 20.62848, + "grad_norm": 1.1189041137695312, + "learning_rate": 1.778311324529812e-05, + "loss": 0.5218, + "step": 16116 + }, + { + "epoch": 20.62976, + "grad_norm": 1.1408662796020508, + "learning_rate": 1.7781112444977992e-05, + "loss": 0.531, + "step": 16117 + }, + { + "epoch": 20.63104, + "grad_norm": 1.071902871131897, + "learning_rate": 1.7779111644657864e-05, + "loss": 0.47, + "step": 16118 + }, + { + "epoch": 20.63232, + "grad_norm": 1.0264390707015991, + "learning_rate": 1.7777110844337736e-05, + "loss": 0.4506, + "step": 16119 + }, + { + "epoch": 20.6336, + "grad_norm": 1.073315143585205, + "learning_rate": 1.7775110044017608e-05, + "loss": 0.4636, + "step": 16120 + }, + { + "epoch": 20.63488, + "grad_norm": 1.1272789239883423, + "learning_rate": 1.777310924369748e-05, + "loss": 0.5256, + "step": 16121 + }, + { + "epoch": 20.63616, + "grad_norm": 1.1192529201507568, + "learning_rate": 1.7771108443377352e-05, + "loss": 0.5349, + "step": 16122 + }, + { + "epoch": 20.63744, + "grad_norm": 
1.0640686750411987, + "learning_rate": 1.7769107643057224e-05, + "loss": 0.4645, + "step": 16123 + }, + { + "epoch": 20.63872, + "grad_norm": 1.095218539237976, + "learning_rate": 1.7767106842737095e-05, + "loss": 0.5333, + "step": 16124 + }, + { + "epoch": 20.64, + "grad_norm": 1.1425113677978516, + "learning_rate": 1.7765106042416967e-05, + "loss": 0.5289, + "step": 16125 + }, + { + "epoch": 20.64128, + "grad_norm": 1.095801830291748, + "learning_rate": 1.776310524209684e-05, + "loss": 0.496, + "step": 16126 + }, + { + "epoch": 20.64256, + "grad_norm": 1.1665611267089844, + "learning_rate": 1.776110444177671e-05, + "loss": 0.5176, + "step": 16127 + }, + { + "epoch": 20.64384, + "grad_norm": 1.109303593635559, + "learning_rate": 1.7759103641456583e-05, + "loss": 0.5234, + "step": 16128 + }, + { + "epoch": 20.64512, + "grad_norm": 1.0539172887802124, + "learning_rate": 1.7757102841136455e-05, + "loss": 0.4982, + "step": 16129 + }, + { + "epoch": 20.6464, + "grad_norm": 1.068564772605896, + "learning_rate": 1.7755102040816327e-05, + "loss": 0.5223, + "step": 16130 + }, + { + "epoch": 20.64768, + "grad_norm": 1.0728812217712402, + "learning_rate": 1.7753101240496202e-05, + "loss": 0.4921, + "step": 16131 + }, + { + "epoch": 20.64896, + "grad_norm": 1.1256548166275024, + "learning_rate": 1.775110044017607e-05, + "loss": 0.5624, + "step": 16132 + }, + { + "epoch": 20.65024, + "grad_norm": 1.0447193384170532, + "learning_rate": 1.7749099639855942e-05, + "loss": 0.5047, + "step": 16133 + }, + { + "epoch": 20.65152, + "grad_norm": 1.0693058967590332, + "learning_rate": 1.7747098839535814e-05, + "loss": 0.4991, + "step": 16134 + }, + { + "epoch": 20.6528, + "grad_norm": 1.0980228185653687, + "learning_rate": 1.774509803921569e-05, + "loss": 0.5151, + "step": 16135 + }, + { + "epoch": 20.65408, + "grad_norm": 1.066780924797058, + "learning_rate": 1.7743097238895558e-05, + "loss": 0.4867, + "step": 16136 + }, + { + "epoch": 20.65536, + "grad_norm": 1.225781798362732, + 
"learning_rate": 1.774109643857543e-05, + "loss": 0.5333, + "step": 16137 + }, + { + "epoch": 20.65664, + "grad_norm": 1.0326478481292725, + "learning_rate": 1.7739095638255305e-05, + "loss": 0.4637, + "step": 16138 + }, + { + "epoch": 20.65792, + "grad_norm": 1.1340293884277344, + "learning_rate": 1.7737094837935177e-05, + "loss": 0.5045, + "step": 16139 + }, + { + "epoch": 20.6592, + "grad_norm": 1.1338059902191162, + "learning_rate": 1.7735094037615045e-05, + "loss": 0.5238, + "step": 16140 + }, + { + "epoch": 20.66048, + "grad_norm": 1.1640844345092773, + "learning_rate": 1.7733093237294917e-05, + "loss": 0.5039, + "step": 16141 + }, + { + "epoch": 20.66176, + "grad_norm": 1.0547772645950317, + "learning_rate": 1.7731092436974792e-05, + "loss": 0.4398, + "step": 16142 + }, + { + "epoch": 20.66304, + "grad_norm": 1.1220600605010986, + "learning_rate": 1.7729091636654664e-05, + "loss": 0.5248, + "step": 16143 + }, + { + "epoch": 20.66432, + "grad_norm": 1.0587539672851562, + "learning_rate": 1.7727090836334533e-05, + "loss": 0.4715, + "step": 16144 + }, + { + "epoch": 20.6656, + "grad_norm": 1.1665568351745605, + "learning_rate": 1.7725090036014408e-05, + "loss": 0.5386, + "step": 16145 + }, + { + "epoch": 20.66688, + "grad_norm": 1.0843427181243896, + "learning_rate": 1.772308923569428e-05, + "loss": 0.5055, + "step": 16146 + }, + { + "epoch": 20.66816, + "grad_norm": 1.0697423219680786, + "learning_rate": 1.772108843537415e-05, + "loss": 0.5219, + "step": 16147 + }, + { + "epoch": 20.66944, + "grad_norm": 1.1367213726043701, + "learning_rate": 1.771908763505402e-05, + "loss": 0.5201, + "step": 16148 + }, + { + "epoch": 20.67072, + "grad_norm": 1.1785510778427124, + "learning_rate": 1.7717086834733895e-05, + "loss": 0.4896, + "step": 16149 + }, + { + "epoch": 20.672, + "grad_norm": 1.1276311874389648, + "learning_rate": 1.7715086034413767e-05, + "loss": 0.4954, + "step": 16150 + }, + { + "epoch": 20.67328, + "grad_norm": 1.1062501668930054, + "learning_rate": 
1.771308523409364e-05, + "loss": 0.4945, + "step": 16151 + }, + { + "epoch": 20.67456, + "grad_norm": 1.0566152334213257, + "learning_rate": 1.771108443377351e-05, + "loss": 0.471, + "step": 16152 + }, + { + "epoch": 20.67584, + "grad_norm": 1.0588206052780151, + "learning_rate": 1.7709083633453383e-05, + "loss": 0.5359, + "step": 16153 + }, + { + "epoch": 20.67712, + "grad_norm": 1.1501905918121338, + "learning_rate": 1.7707082833133255e-05, + "loss": 0.4935, + "step": 16154 + }, + { + "epoch": 20.6784, + "grad_norm": 1.0157536268234253, + "learning_rate": 1.7705082032813126e-05, + "loss": 0.4498, + "step": 16155 + }, + { + "epoch": 20.67968, + "grad_norm": 1.1221078634262085, + "learning_rate": 1.7703081232492998e-05, + "loss": 0.4926, + "step": 16156 + }, + { + "epoch": 20.68096, + "grad_norm": 1.0495266914367676, + "learning_rate": 1.770108043217287e-05, + "loss": 0.471, + "step": 16157 + }, + { + "epoch": 20.68224, + "grad_norm": 1.1005525588989258, + "learning_rate": 1.7699079631852742e-05, + "loss": 0.499, + "step": 16158 + }, + { + "epoch": 20.68352, + "grad_norm": 1.0935386419296265, + "learning_rate": 1.7697078831532614e-05, + "loss": 0.4897, + "step": 16159 + }, + { + "epoch": 20.6848, + "grad_norm": 1.0938061475753784, + "learning_rate": 1.7695078031212486e-05, + "loss": 0.4632, + "step": 16160 + }, + { + "epoch": 20.68608, + "grad_norm": 1.104549765586853, + "learning_rate": 1.7693077230892358e-05, + "loss": 0.4732, + "step": 16161 + }, + { + "epoch": 20.687359999999998, + "grad_norm": 1.0983625650405884, + "learning_rate": 1.769107643057223e-05, + "loss": 0.4978, + "step": 16162 + }, + { + "epoch": 20.68864, + "grad_norm": 1.1382209062576294, + "learning_rate": 1.76890756302521e-05, + "loss": 0.5318, + "step": 16163 + }, + { + "epoch": 20.68992, + "grad_norm": 1.1396292448043823, + "learning_rate": 1.7687074829931973e-05, + "loss": 0.5042, + "step": 16164 + }, + { + "epoch": 20.6912, + "grad_norm": 1.1234673261642456, + "learning_rate": 
1.7685074029611845e-05, + "loss": 0.5071, + "step": 16165 + }, + { + "epoch": 20.69248, + "grad_norm": 1.0681381225585938, + "learning_rate": 1.7683073229291717e-05, + "loss": 0.5042, + "step": 16166 + }, + { + "epoch": 20.69376, + "grad_norm": 1.0953648090362549, + "learning_rate": 1.768107242897159e-05, + "loss": 0.4874, + "step": 16167 + }, + { + "epoch": 20.69504, + "grad_norm": 1.1817059516906738, + "learning_rate": 1.767907162865146e-05, + "loss": 0.5278, + "step": 16168 + }, + { + "epoch": 20.69632, + "grad_norm": 1.130669116973877, + "learning_rate": 1.7677070828331332e-05, + "loss": 0.5081, + "step": 16169 + }, + { + "epoch": 20.6976, + "grad_norm": 1.0833494663238525, + "learning_rate": 1.7675070028011208e-05, + "loss": 0.4866, + "step": 16170 + }, + { + "epoch": 20.69888, + "grad_norm": 1.0703330039978027, + "learning_rate": 1.7673069227691076e-05, + "loss": 0.4909, + "step": 16171 + }, + { + "epoch": 20.70016, + "grad_norm": 1.13280189037323, + "learning_rate": 1.7671068427370948e-05, + "loss": 0.5435, + "step": 16172 + }, + { + "epoch": 20.70144, + "grad_norm": 1.023118495941162, + "learning_rate": 1.766906762705082e-05, + "loss": 0.4848, + "step": 16173 + }, + { + "epoch": 20.70272, + "grad_norm": 1.0794446468353271, + "learning_rate": 1.7667066826730695e-05, + "loss": 0.5403, + "step": 16174 + }, + { + "epoch": 20.704, + "grad_norm": 1.007285237312317, + "learning_rate": 1.7665066026410564e-05, + "loss": 0.4658, + "step": 16175 + }, + { + "epoch": 20.70528, + "grad_norm": 1.0927997827529907, + "learning_rate": 1.7663065226090435e-05, + "loss": 0.5081, + "step": 16176 + }, + { + "epoch": 20.70656, + "grad_norm": 1.104650855064392, + "learning_rate": 1.766106442577031e-05, + "loss": 0.4623, + "step": 16177 + }, + { + "epoch": 20.70784, + "grad_norm": 1.0667095184326172, + "learning_rate": 1.7659063625450182e-05, + "loss": 0.4841, + "step": 16178 + }, + { + "epoch": 20.70912, + "grad_norm": 1.0377103090286255, + "learning_rate": 1.765706282513005e-05, + 
"loss": 0.4809, + "step": 16179 + }, + { + "epoch": 20.7104, + "grad_norm": 1.1214765310287476, + "learning_rate": 1.7655062024809923e-05, + "loss": 0.4913, + "step": 16180 + }, + { + "epoch": 20.71168, + "grad_norm": 1.079932689666748, + "learning_rate": 1.7653061224489798e-05, + "loss": 0.4694, + "step": 16181 + }, + { + "epoch": 20.71296, + "grad_norm": 1.179184913635254, + "learning_rate": 1.765106042416967e-05, + "loss": 0.4864, + "step": 16182 + }, + { + "epoch": 20.71424, + "grad_norm": 1.0123862028121948, + "learning_rate": 1.764905962384954e-05, + "loss": 0.4597, + "step": 16183 + }, + { + "epoch": 20.71552, + "grad_norm": 1.1281754970550537, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.4937, + "step": 16184 + }, + { + "epoch": 20.7168, + "grad_norm": 1.0788217782974243, + "learning_rate": 1.7645058023209285e-05, + "loss": 0.4898, + "step": 16185 + }, + { + "epoch": 20.71808, + "grad_norm": 1.1888701915740967, + "learning_rate": 1.7643057222889157e-05, + "loss": 0.4891, + "step": 16186 + }, + { + "epoch": 20.71936, + "grad_norm": 1.1022487878799438, + "learning_rate": 1.7641056422569026e-05, + "loss": 0.5248, + "step": 16187 + }, + { + "epoch": 20.72064, + "grad_norm": 1.0947836637496948, + "learning_rate": 1.76390556222489e-05, + "loss": 0.4841, + "step": 16188 + }, + { + "epoch": 20.72192, + "grad_norm": 1.101966142654419, + "learning_rate": 1.7637054821928773e-05, + "loss": 0.5178, + "step": 16189 + }, + { + "epoch": 20.7232, + "grad_norm": 1.065540075302124, + "learning_rate": 1.7635054021608645e-05, + "loss": 0.452, + "step": 16190 + }, + { + "epoch": 20.72448, + "grad_norm": 1.024093508720398, + "learning_rate": 1.7633053221288517e-05, + "loss": 0.4519, + "step": 16191 + }, + { + "epoch": 20.72576, + "grad_norm": 1.0961694717407227, + "learning_rate": 1.763105242096839e-05, + "loss": 0.5014, + "step": 16192 + }, + { + "epoch": 20.72704, + "grad_norm": 1.0572614669799805, + "learning_rate": 1.762905162064826e-05, + "loss": 0.4712, + "step": 
16193 + }, + { + "epoch": 20.72832, + "grad_norm": 1.2695916891098022, + "learning_rate": 1.7627050820328132e-05, + "loss": 0.5001, + "step": 16194 + }, + { + "epoch": 20.7296, + "grad_norm": 1.0590322017669678, + "learning_rate": 1.7625050020008004e-05, + "loss": 0.4618, + "step": 16195 + }, + { + "epoch": 20.73088, + "grad_norm": 1.0804553031921387, + "learning_rate": 1.7623049219687876e-05, + "loss": 0.4603, + "step": 16196 + }, + { + "epoch": 20.73216, + "grad_norm": 1.037230372428894, + "learning_rate": 1.7621048419367748e-05, + "loss": 0.4547, + "step": 16197 + }, + { + "epoch": 20.73344, + "grad_norm": 1.1001992225646973, + "learning_rate": 1.761904761904762e-05, + "loss": 0.4539, + "step": 16198 + }, + { + "epoch": 20.73472, + "grad_norm": 1.0365502834320068, + "learning_rate": 1.761704681872749e-05, + "loss": 0.4636, + "step": 16199 + }, + { + "epoch": 20.736, + "grad_norm": 1.0428606271743774, + "learning_rate": 1.7615046018407363e-05, + "loss": 0.4721, + "step": 16200 + }, + { + "epoch": 20.73728, + "grad_norm": 1.1415473222732544, + "learning_rate": 1.7613045218087235e-05, + "loss": 0.4861, + "step": 16201 + }, + { + "epoch": 20.73856, + "grad_norm": 1.120957612991333, + "learning_rate": 1.7611044417767107e-05, + "loss": 0.4992, + "step": 16202 + }, + { + "epoch": 20.73984, + "grad_norm": 1.0169925689697266, + "learning_rate": 1.760904361744698e-05, + "loss": 0.4415, + "step": 16203 + }, + { + "epoch": 20.74112, + "grad_norm": 1.1539958715438843, + "learning_rate": 1.760704281712685e-05, + "loss": 0.539, + "step": 16204 + }, + { + "epoch": 20.7424, + "grad_norm": 1.0903569459915161, + "learning_rate": 1.7605042016806726e-05, + "loss": 0.4769, + "step": 16205 + }, + { + "epoch": 20.74368, + "grad_norm": 1.1013376712799072, + "learning_rate": 1.7603041216486594e-05, + "loss": 0.4809, + "step": 16206 + }, + { + "epoch": 20.74496, + "grad_norm": 1.1020969152450562, + "learning_rate": 1.7601040416166466e-05, + "loss": 0.4534, + "step": 16207 + }, + { + 
"epoch": 20.74624, + "grad_norm": 1.0721830129623413, + "learning_rate": 1.7599039615846338e-05, + "loss": 0.4651, + "step": 16208 + }, + { + "epoch": 20.74752, + "grad_norm": 1.0913660526275635, + "learning_rate": 1.7597038815526213e-05, + "loss": 0.482, + "step": 16209 + }, + { + "epoch": 20.7488, + "grad_norm": 1.0793043375015259, + "learning_rate": 1.7595038015206082e-05, + "loss": 0.4536, + "step": 16210 + }, + { + "epoch": 20.75008, + "grad_norm": 1.0972751379013062, + "learning_rate": 1.7593037214885954e-05, + "loss": 0.5117, + "step": 16211 + }, + { + "epoch": 20.75136, + "grad_norm": 1.1126095056533813, + "learning_rate": 1.759103641456583e-05, + "loss": 0.5348, + "step": 16212 + }, + { + "epoch": 20.75264, + "grad_norm": 1.1272165775299072, + "learning_rate": 1.75890356142457e-05, + "loss": 0.515, + "step": 16213 + }, + { + "epoch": 20.75392, + "grad_norm": 1.1290925741195679, + "learning_rate": 1.758703481392557e-05, + "loss": 0.5332, + "step": 16214 + }, + { + "epoch": 20.7552, + "grad_norm": 1.0821077823638916, + "learning_rate": 1.758503401360544e-05, + "loss": 0.5224, + "step": 16215 + }, + { + "epoch": 20.75648, + "grad_norm": 1.0680049657821655, + "learning_rate": 1.7583033213285316e-05, + "loss": 0.5149, + "step": 16216 + }, + { + "epoch": 20.75776, + "grad_norm": 1.059745192527771, + "learning_rate": 1.7581032412965188e-05, + "loss": 0.5058, + "step": 16217 + }, + { + "epoch": 20.75904, + "grad_norm": 1.0059226751327515, + "learning_rate": 1.7579031612645057e-05, + "loss": 0.5011, + "step": 16218 + }, + { + "epoch": 20.76032, + "grad_norm": 1.1066895723342896, + "learning_rate": 1.757703081232493e-05, + "loss": 0.5139, + "step": 16219 + }, + { + "epoch": 20.7616, + "grad_norm": 1.0385751724243164, + "learning_rate": 1.7575030012004804e-05, + "loss": 0.4992, + "step": 16220 + }, + { + "epoch": 20.76288, + "grad_norm": 1.0588712692260742, + "learning_rate": 1.7573029211684676e-05, + "loss": 0.4881, + "step": 16221 + }, + { + "epoch": 20.76416, + 
"grad_norm": 1.1335150003433228, + "learning_rate": 1.7571028411364544e-05, + "loss": 0.5135, + "step": 16222 + }, + { + "epoch": 20.76544, + "grad_norm": 1.06964111328125, + "learning_rate": 1.756902761104442e-05, + "loss": 0.474, + "step": 16223 + }, + { + "epoch": 20.76672, + "grad_norm": 1.0755014419555664, + "learning_rate": 1.756702681072429e-05, + "loss": 0.5125, + "step": 16224 + }, + { + "epoch": 20.768, + "grad_norm": 1.129836082458496, + "learning_rate": 1.7565026010404163e-05, + "loss": 0.4916, + "step": 16225 + }, + { + "epoch": 20.76928, + "grad_norm": 1.106661319732666, + "learning_rate": 1.756302521008403e-05, + "loss": 0.5316, + "step": 16226 + }, + { + "epoch": 20.77056, + "grad_norm": 1.123422384262085, + "learning_rate": 1.7561024409763907e-05, + "loss": 0.5241, + "step": 16227 + }, + { + "epoch": 20.77184, + "grad_norm": 1.0029109716415405, + "learning_rate": 1.755902360944378e-05, + "loss": 0.4284, + "step": 16228 + }, + { + "epoch": 20.77312, + "grad_norm": 1.0901793241500854, + "learning_rate": 1.755702280912365e-05, + "loss": 0.4894, + "step": 16229 + }, + { + "epoch": 20.7744, + "grad_norm": 1.003308653831482, + "learning_rate": 1.7555022008803522e-05, + "loss": 0.4551, + "step": 16230 + }, + { + "epoch": 20.77568, + "grad_norm": 1.1484678983688354, + "learning_rate": 1.7553021208483394e-05, + "loss": 0.4894, + "step": 16231 + }, + { + "epoch": 20.77696, + "grad_norm": 1.0901198387145996, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.4932, + "step": 16232 + }, + { + "epoch": 20.77824, + "grad_norm": 1.1623092889785767, + "learning_rate": 1.7549019607843138e-05, + "loss": 0.5136, + "step": 16233 + }, + { + "epoch": 20.77952, + "grad_norm": 1.0638892650604248, + "learning_rate": 1.754701880752301e-05, + "loss": 0.4726, + "step": 16234 + }, + { + "epoch": 20.7808, + "grad_norm": 1.0341891050338745, + "learning_rate": 1.754501800720288e-05, + "loss": 0.4597, + "step": 16235 + }, + { + "epoch": 20.78208, + "grad_norm": 
1.0782972574234009, + "learning_rate": 1.7543017206882754e-05, + "loss": 0.5077, + "step": 16236 + }, + { + "epoch": 20.78336, + "grad_norm": 1.1353981494903564, + "learning_rate": 1.7541016406562625e-05, + "loss": 0.518, + "step": 16237 + }, + { + "epoch": 20.78464, + "grad_norm": 1.0707870721817017, + "learning_rate": 1.7539015606242497e-05, + "loss": 0.5071, + "step": 16238 + }, + { + "epoch": 20.78592, + "grad_norm": 1.0623449087142944, + "learning_rate": 1.753701480592237e-05, + "loss": 0.4881, + "step": 16239 + }, + { + "epoch": 20.7872, + "grad_norm": 1.0724384784698486, + "learning_rate": 1.753501400560224e-05, + "loss": 0.5197, + "step": 16240 + }, + { + "epoch": 20.78848, + "grad_norm": 1.1770350933074951, + "learning_rate": 1.7533013205282113e-05, + "loss": 0.5326, + "step": 16241 + }, + { + "epoch": 20.78976, + "grad_norm": 1.1036503314971924, + "learning_rate": 1.7531012404961985e-05, + "loss": 0.5241, + "step": 16242 + }, + { + "epoch": 20.79104, + "grad_norm": 1.0659743547439575, + "learning_rate": 1.7529011604641857e-05, + "loss": 0.5047, + "step": 16243 + }, + { + "epoch": 20.79232, + "grad_norm": 1.0868451595306396, + "learning_rate": 1.7527010804321732e-05, + "loss": 0.4819, + "step": 16244 + }, + { + "epoch": 20.7936, + "grad_norm": 1.1210837364196777, + "learning_rate": 1.75250100040016e-05, + "loss": 0.5214, + "step": 16245 + }, + { + "epoch": 20.79488, + "grad_norm": 1.065202236175537, + "learning_rate": 1.7523009203681472e-05, + "loss": 0.4913, + "step": 16246 + }, + { + "epoch": 20.79616, + "grad_norm": 1.1045325994491577, + "learning_rate": 1.7521008403361344e-05, + "loss": 0.493, + "step": 16247 + }, + { + "epoch": 20.79744, + "grad_norm": 1.029994249343872, + "learning_rate": 1.751900760304122e-05, + "loss": 0.4934, + "step": 16248 + }, + { + "epoch": 20.79872, + "grad_norm": 1.1114221811294556, + "learning_rate": 1.7517006802721088e-05, + "loss": 0.5183, + "step": 16249 + }, + { + "epoch": 20.8, + "grad_norm": 1.0240123271942139, + 
"learning_rate": 1.751500600240096e-05, + "loss": 0.4572, + "step": 16250 + }, + { + "epoch": 20.80128, + "grad_norm": 1.0750398635864258, + "learning_rate": 1.7513005202080835e-05, + "loss": 0.4603, + "step": 16251 + }, + { + "epoch": 20.80256, + "grad_norm": 1.1186068058013916, + "learning_rate": 1.7511004401760707e-05, + "loss": 0.4698, + "step": 16252 + }, + { + "epoch": 20.80384, + "grad_norm": 1.1130450963974, + "learning_rate": 1.7509003601440575e-05, + "loss": 0.5152, + "step": 16253 + }, + { + "epoch": 20.80512, + "grad_norm": 1.104473352432251, + "learning_rate": 1.7507002801120447e-05, + "loss": 0.5139, + "step": 16254 + }, + { + "epoch": 20.8064, + "grad_norm": 1.1033751964569092, + "learning_rate": 1.7505002000800322e-05, + "loss": 0.4785, + "step": 16255 + }, + { + "epoch": 20.80768, + "grad_norm": 1.0060745477676392, + "learning_rate": 1.7503001200480194e-05, + "loss": 0.4259, + "step": 16256 + }, + { + "epoch": 20.80896, + "grad_norm": 1.1276196241378784, + "learning_rate": 1.7501000400160063e-05, + "loss": 0.5602, + "step": 16257 + }, + { + "epoch": 20.81024, + "grad_norm": 1.1201550960540771, + "learning_rate": 1.7498999599839938e-05, + "loss": 0.5034, + "step": 16258 + }, + { + "epoch": 20.81152, + "grad_norm": 1.0943266153335571, + "learning_rate": 1.749699879951981e-05, + "loss": 0.509, + "step": 16259 + }, + { + "epoch": 20.8128, + "grad_norm": 1.0522860288619995, + "learning_rate": 1.749499799919968e-05, + "loss": 0.4813, + "step": 16260 + }, + { + "epoch": 20.81408, + "grad_norm": 1.0484342575073242, + "learning_rate": 1.749299719887955e-05, + "loss": 0.4743, + "step": 16261 + }, + { + "epoch": 20.81536, + "grad_norm": 1.0669806003570557, + "learning_rate": 1.7490996398559425e-05, + "loss": 0.4943, + "step": 16262 + }, + { + "epoch": 20.81664, + "grad_norm": 1.063136339187622, + "learning_rate": 1.7488995598239297e-05, + "loss": 0.4856, + "step": 16263 + }, + { + "epoch": 20.81792, + "grad_norm": 1.091721534729004, + "learning_rate": 
1.748699479791917e-05, + "loss": 0.481, + "step": 16264 + }, + { + "epoch": 20.8192, + "grad_norm": 1.0925869941711426, + "learning_rate": 1.748499399759904e-05, + "loss": 0.5033, + "step": 16265 + }, + { + "epoch": 20.82048, + "grad_norm": 1.1635428667068481, + "learning_rate": 1.7482993197278913e-05, + "loss": 0.5144, + "step": 16266 + }, + { + "epoch": 20.82176, + "grad_norm": 1.0611239671707153, + "learning_rate": 1.7480992396958784e-05, + "loss": 0.4716, + "step": 16267 + }, + { + "epoch": 20.82304, + "grad_norm": 1.1461703777313232, + "learning_rate": 1.7478991596638656e-05, + "loss": 0.5118, + "step": 16268 + }, + { + "epoch": 20.82432, + "grad_norm": 1.0631418228149414, + "learning_rate": 1.7476990796318528e-05, + "loss": 0.4622, + "step": 16269 + }, + { + "epoch": 20.8256, + "grad_norm": 1.0580074787139893, + "learning_rate": 1.74749899959984e-05, + "loss": 0.5003, + "step": 16270 + }, + { + "epoch": 20.82688, + "grad_norm": 1.1611815690994263, + "learning_rate": 1.7472989195678272e-05, + "loss": 0.5646, + "step": 16271 + }, + { + "epoch": 20.82816, + "grad_norm": 1.1409611701965332, + "learning_rate": 1.7470988395358144e-05, + "loss": 0.4652, + "step": 16272 + }, + { + "epoch": 20.829439999999998, + "grad_norm": 1.1084599494934082, + "learning_rate": 1.7468987595038016e-05, + "loss": 0.5027, + "step": 16273 + }, + { + "epoch": 20.83072, + "grad_norm": 1.1274982690811157, + "learning_rate": 1.7466986794717887e-05, + "loss": 0.486, + "step": 16274 + }, + { + "epoch": 20.832, + "grad_norm": 1.1236320734024048, + "learning_rate": 1.746498599439776e-05, + "loss": 0.5091, + "step": 16275 + }, + { + "epoch": 20.83328, + "grad_norm": 1.1362968683242798, + "learning_rate": 1.746298519407763e-05, + "loss": 0.4902, + "step": 16276 + }, + { + "epoch": 20.83456, + "grad_norm": 1.092063069343567, + "learning_rate": 1.7460984393757503e-05, + "loss": 0.5271, + "step": 16277 + }, + { + "epoch": 20.83584, + "grad_norm": 1.1088082790374756, + "learning_rate": 
1.7458983593437375e-05, + "loss": 0.4834, + "step": 16278 + }, + { + "epoch": 20.83712, + "grad_norm": 1.081045150756836, + "learning_rate": 1.7456982793117247e-05, + "loss": 0.4928, + "step": 16279 + }, + { + "epoch": 20.8384, + "grad_norm": 1.1172006130218506, + "learning_rate": 1.745498199279712e-05, + "loss": 0.5346, + "step": 16280 + }, + { + "epoch": 20.83968, + "grad_norm": 1.0738575458526611, + "learning_rate": 1.745298119247699e-05, + "loss": 0.4861, + "step": 16281 + }, + { + "epoch": 20.84096, + "grad_norm": 1.0271522998809814, + "learning_rate": 1.7450980392156862e-05, + "loss": 0.5072, + "step": 16282 + }, + { + "epoch": 20.84224, + "grad_norm": 1.1056361198425293, + "learning_rate": 1.7448979591836738e-05, + "loss": 0.5387, + "step": 16283 + }, + { + "epoch": 20.84352, + "grad_norm": 1.07204270362854, + "learning_rate": 1.7446978791516606e-05, + "loss": 0.4802, + "step": 16284 + }, + { + "epoch": 20.8448, + "grad_norm": 1.0948388576507568, + "learning_rate": 1.7444977991196478e-05, + "loss": 0.4963, + "step": 16285 + }, + { + "epoch": 20.84608, + "grad_norm": 1.1299151182174683, + "learning_rate": 1.744297719087635e-05, + "loss": 0.5061, + "step": 16286 + }, + { + "epoch": 20.84736, + "grad_norm": 1.0689219236373901, + "learning_rate": 1.7440976390556225e-05, + "loss": 0.4855, + "step": 16287 + }, + { + "epoch": 20.84864, + "grad_norm": 1.1455039978027344, + "learning_rate": 1.7438975590236093e-05, + "loss": 0.5598, + "step": 16288 + }, + { + "epoch": 20.84992, + "grad_norm": 1.1063296794891357, + "learning_rate": 1.7436974789915965e-05, + "loss": 0.4986, + "step": 16289 + }, + { + "epoch": 20.8512, + "grad_norm": 1.1937898397445679, + "learning_rate": 1.743497398959584e-05, + "loss": 0.513, + "step": 16290 + }, + { + "epoch": 20.85248, + "grad_norm": 1.005609154701233, + "learning_rate": 1.7432973189275712e-05, + "loss": 0.4549, + "step": 16291 + }, + { + "epoch": 20.85376, + "grad_norm": 1.0839899778366089, + "learning_rate": 1.743097238895558e-05, 
+ "loss": 0.4952, + "step": 16292 + }, + { + "epoch": 20.85504, + "grad_norm": 1.1058770418167114, + "learning_rate": 1.7428971588635453e-05, + "loss": 0.4729, + "step": 16293 + }, + { + "epoch": 20.85632, + "grad_norm": 1.0722662210464478, + "learning_rate": 1.7426970788315328e-05, + "loss": 0.4774, + "step": 16294 + }, + { + "epoch": 20.8576, + "grad_norm": 1.0828754901885986, + "learning_rate": 1.74249699879952e-05, + "loss": 0.5036, + "step": 16295 + }, + { + "epoch": 20.85888, + "grad_norm": 1.0864346027374268, + "learning_rate": 1.742296918767507e-05, + "loss": 0.5187, + "step": 16296 + }, + { + "epoch": 20.86016, + "grad_norm": 1.0293843746185303, + "learning_rate": 1.7420968387354944e-05, + "loss": 0.4526, + "step": 16297 + }, + { + "epoch": 20.86144, + "grad_norm": 1.0849385261535645, + "learning_rate": 1.7418967587034815e-05, + "loss": 0.4959, + "step": 16298 + }, + { + "epoch": 20.86272, + "grad_norm": 1.0619242191314697, + "learning_rate": 1.7416966786714687e-05, + "loss": 0.504, + "step": 16299 + }, + { + "epoch": 20.864, + "grad_norm": 1.0861916542053223, + "learning_rate": 1.7414965986394556e-05, + "loss": 0.4619, + "step": 16300 + }, + { + "epoch": 20.86528, + "grad_norm": 1.0871127843856812, + "learning_rate": 1.741296518607443e-05, + "loss": 0.4972, + "step": 16301 + }, + { + "epoch": 20.86656, + "grad_norm": 1.1219013929367065, + "learning_rate": 1.7410964385754303e-05, + "loss": 0.4946, + "step": 16302 + }, + { + "epoch": 20.86784, + "grad_norm": 1.0501264333724976, + "learning_rate": 1.7408963585434175e-05, + "loss": 0.5039, + "step": 16303 + }, + { + "epoch": 20.86912, + "grad_norm": 1.1329530477523804, + "learning_rate": 1.7406962785114047e-05, + "loss": 0.5356, + "step": 16304 + }, + { + "epoch": 20.8704, + "grad_norm": 1.0762802362442017, + "learning_rate": 1.740496198479392e-05, + "loss": 0.5171, + "step": 16305 + }, + { + "epoch": 20.87168, + "grad_norm": 1.0678480863571167, + "learning_rate": 1.740296118447379e-05, + "loss": 0.4828, + 
"step": 16306 + }, + { + "epoch": 20.87296, + "grad_norm": 1.1031467914581299, + "learning_rate": 1.7400960384153662e-05, + "loss": 0.4672, + "step": 16307 + }, + { + "epoch": 20.87424, + "grad_norm": 1.0675780773162842, + "learning_rate": 1.7398959583833534e-05, + "loss": 0.5382, + "step": 16308 + }, + { + "epoch": 20.87552, + "grad_norm": 1.0954948663711548, + "learning_rate": 1.7396958783513406e-05, + "loss": 0.5027, + "step": 16309 + }, + { + "epoch": 20.8768, + "grad_norm": 1.0903716087341309, + "learning_rate": 1.7394957983193278e-05, + "loss": 0.4796, + "step": 16310 + }, + { + "epoch": 20.87808, + "grad_norm": 1.147133708000183, + "learning_rate": 1.7392957182873153e-05, + "loss": 0.5004, + "step": 16311 + }, + { + "epoch": 20.87936, + "grad_norm": 1.1145248413085938, + "learning_rate": 1.739095638255302e-05, + "loss": 0.5235, + "step": 16312 + }, + { + "epoch": 20.88064, + "grad_norm": 1.0918245315551758, + "learning_rate": 1.7388955582232893e-05, + "loss": 0.4825, + "step": 16313 + }, + { + "epoch": 20.88192, + "grad_norm": 1.1098552942276, + "learning_rate": 1.7386954781912765e-05, + "loss": 0.4949, + "step": 16314 + }, + { + "epoch": 20.8832, + "grad_norm": 1.1097396612167358, + "learning_rate": 1.738495398159264e-05, + "loss": 0.49, + "step": 16315 + }, + { + "epoch": 20.88448, + "grad_norm": 1.1164050102233887, + "learning_rate": 1.738295318127251e-05, + "loss": 0.5204, + "step": 16316 + }, + { + "epoch": 20.88576, + "grad_norm": 1.1462451219558716, + "learning_rate": 1.738095238095238e-05, + "loss": 0.5449, + "step": 16317 + }, + { + "epoch": 20.88704, + "grad_norm": 1.052255392074585, + "learning_rate": 1.7378951580632256e-05, + "loss": 0.481, + "step": 16318 + }, + { + "epoch": 20.88832, + "grad_norm": 1.0906624794006348, + "learning_rate": 1.7376950780312128e-05, + "loss": 0.4848, + "step": 16319 + }, + { + "epoch": 20.8896, + "grad_norm": 1.0164730548858643, + "learning_rate": 1.7374949979991996e-05, + "loss": 0.4646, + "step": 16320 + }, + { + 
"epoch": 20.89088, + "grad_norm": 1.1174423694610596, + "learning_rate": 1.7372949179671868e-05, + "loss": 0.5213, + "step": 16321 + }, + { + "epoch": 20.89216, + "grad_norm": 1.0460309982299805, + "learning_rate": 1.7370948379351743e-05, + "loss": 0.5021, + "step": 16322 + }, + { + "epoch": 20.89344, + "grad_norm": 1.0785523653030396, + "learning_rate": 1.7368947579031615e-05, + "loss": 0.4775, + "step": 16323 + }, + { + "epoch": 20.89472, + "grad_norm": 1.173263430595398, + "learning_rate": 1.7366946778711484e-05, + "loss": 0.5126, + "step": 16324 + }, + { + "epoch": 20.896, + "grad_norm": 1.0594722032546997, + "learning_rate": 1.736494597839136e-05, + "loss": 0.4804, + "step": 16325 + }, + { + "epoch": 20.89728, + "grad_norm": 1.1119956970214844, + "learning_rate": 1.736294517807123e-05, + "loss": 0.4952, + "step": 16326 + }, + { + "epoch": 20.89856, + "grad_norm": 1.0069000720977783, + "learning_rate": 1.7360944377751103e-05, + "loss": 0.4821, + "step": 16327 + }, + { + "epoch": 20.89984, + "grad_norm": 1.118168830871582, + "learning_rate": 1.735894357743097e-05, + "loss": 0.513, + "step": 16328 + }, + { + "epoch": 20.90112, + "grad_norm": 1.0356378555297852, + "learning_rate": 1.7356942777110846e-05, + "loss": 0.435, + "step": 16329 + }, + { + "epoch": 20.9024, + "grad_norm": 1.0259177684783936, + "learning_rate": 1.7354941976790718e-05, + "loss": 0.4688, + "step": 16330 + }, + { + "epoch": 20.90368, + "grad_norm": 1.057941198348999, + "learning_rate": 1.735294117647059e-05, + "loss": 0.4662, + "step": 16331 + }, + { + "epoch": 20.90496, + "grad_norm": 1.1319576501846313, + "learning_rate": 1.735094037615046e-05, + "loss": 0.5064, + "step": 16332 + }, + { + "epoch": 20.90624, + "grad_norm": 1.1116087436676025, + "learning_rate": 1.7348939575830334e-05, + "loss": 0.5034, + "step": 16333 + }, + { + "epoch": 20.90752, + "grad_norm": 1.1020821332931519, + "learning_rate": 1.7346938775510206e-05, + "loss": 0.509, + "step": 16334 + }, + { + "epoch": 20.9088, + 
"grad_norm": 1.024755835533142, + "learning_rate": 1.7344937975190078e-05, + "loss": 0.4732, + "step": 16335 + }, + { + "epoch": 20.91008, + "grad_norm": 1.1245591640472412, + "learning_rate": 1.734293717486995e-05, + "loss": 0.5083, + "step": 16336 + }, + { + "epoch": 20.91136, + "grad_norm": 1.0561660528182983, + "learning_rate": 1.734093637454982e-05, + "loss": 0.4551, + "step": 16337 + }, + { + "epoch": 20.91264, + "grad_norm": 1.017407774925232, + "learning_rate": 1.7338935574229693e-05, + "loss": 0.4703, + "step": 16338 + }, + { + "epoch": 20.91392, + "grad_norm": 1.145080804824829, + "learning_rate": 1.7336934773909565e-05, + "loss": 0.5039, + "step": 16339 + }, + { + "epoch": 20.9152, + "grad_norm": 1.074860692024231, + "learning_rate": 1.7334933973589437e-05, + "loss": 0.4906, + "step": 16340 + }, + { + "epoch": 20.91648, + "grad_norm": 1.1369560956954956, + "learning_rate": 1.733293317326931e-05, + "loss": 0.5045, + "step": 16341 + }, + { + "epoch": 20.91776, + "grad_norm": 1.0858222246170044, + "learning_rate": 1.733093237294918e-05, + "loss": 0.4916, + "step": 16342 + }, + { + "epoch": 20.91904, + "grad_norm": 1.1016911268234253, + "learning_rate": 1.7328931572629052e-05, + "loss": 0.521, + "step": 16343 + }, + { + "epoch": 20.92032, + "grad_norm": 1.0700989961624146, + "learning_rate": 1.7326930772308924e-05, + "loss": 0.4737, + "step": 16344 + }, + { + "epoch": 20.9216, + "grad_norm": 1.066236972808838, + "learning_rate": 1.7324929971988796e-05, + "loss": 0.4873, + "step": 16345 + }, + { + "epoch": 20.92288, + "grad_norm": 1.1762385368347168, + "learning_rate": 1.7322929171668668e-05, + "loss": 0.5469, + "step": 16346 + }, + { + "epoch": 20.92416, + "grad_norm": 1.086787462234497, + "learning_rate": 1.732092837134854e-05, + "loss": 0.5013, + "step": 16347 + }, + { + "epoch": 20.925440000000002, + "grad_norm": 1.0452648401260376, + "learning_rate": 1.731892757102841e-05, + "loss": 0.4687, + "step": 16348 + }, + { + "epoch": 20.92672, + "grad_norm": 
1.1296664476394653, + "learning_rate": 1.7316926770708284e-05, + "loss": 0.5306, + "step": 16349 + }, + { + "epoch": 20.928, + "grad_norm": 1.0765169858932495, + "learning_rate": 1.731492597038816e-05, + "loss": 0.4865, + "step": 16350 + }, + { + "epoch": 20.92928, + "grad_norm": 1.122309684753418, + "learning_rate": 1.7312925170068027e-05, + "loss": 0.4692, + "step": 16351 + }, + { + "epoch": 20.93056, + "grad_norm": 1.1600003242492676, + "learning_rate": 1.73109243697479e-05, + "loss": 0.545, + "step": 16352 + }, + { + "epoch": 20.93184, + "grad_norm": 1.06452214717865, + "learning_rate": 1.730892356942777e-05, + "loss": 0.4528, + "step": 16353 + }, + { + "epoch": 20.93312, + "grad_norm": 1.0116653442382812, + "learning_rate": 1.7306922769107646e-05, + "loss": 0.444, + "step": 16354 + }, + { + "epoch": 20.9344, + "grad_norm": 1.0115692615509033, + "learning_rate": 1.7304921968787515e-05, + "loss": 0.4983, + "step": 16355 + }, + { + "epoch": 20.93568, + "grad_norm": 1.1115832328796387, + "learning_rate": 1.7302921168467387e-05, + "loss": 0.4955, + "step": 16356 + }, + { + "epoch": 20.93696, + "grad_norm": 1.0773097276687622, + "learning_rate": 1.7300920368147262e-05, + "loss": 0.4978, + "step": 16357 + }, + { + "epoch": 20.93824, + "grad_norm": 1.102440595626831, + "learning_rate": 1.7298919567827134e-05, + "loss": 0.5104, + "step": 16358 + }, + { + "epoch": 20.93952, + "grad_norm": 1.0837904214859009, + "learning_rate": 1.7296918767507002e-05, + "loss": 0.4988, + "step": 16359 + }, + { + "epoch": 20.9408, + "grad_norm": 1.091955542564392, + "learning_rate": 1.7294917967186874e-05, + "loss": 0.5182, + "step": 16360 + }, + { + "epoch": 20.94208, + "grad_norm": 1.13055419921875, + "learning_rate": 1.729291716686675e-05, + "loss": 0.5187, + "step": 16361 + }, + { + "epoch": 20.94336, + "grad_norm": 1.1152286529541016, + "learning_rate": 1.729091636654662e-05, + "loss": 0.498, + "step": 16362 + }, + { + "epoch": 20.94464, + "grad_norm": 1.0456947088241577, + 
"learning_rate": 1.728891556622649e-05, + "loss": 0.5033, + "step": 16363 + }, + { + "epoch": 20.94592, + "grad_norm": 1.0851963758468628, + "learning_rate": 1.7286914765906365e-05, + "loss": 0.5229, + "step": 16364 + }, + { + "epoch": 20.9472, + "grad_norm": 1.03732168674469, + "learning_rate": 1.7284913965586237e-05, + "loss": 0.4809, + "step": 16365 + }, + { + "epoch": 20.94848, + "grad_norm": 1.0764819383621216, + "learning_rate": 1.728291316526611e-05, + "loss": 0.4754, + "step": 16366 + }, + { + "epoch": 20.94976, + "grad_norm": 1.1544440984725952, + "learning_rate": 1.7280912364945977e-05, + "loss": 0.5008, + "step": 16367 + }, + { + "epoch": 20.95104, + "grad_norm": 1.1239932775497437, + "learning_rate": 1.7278911564625852e-05, + "loss": 0.4808, + "step": 16368 + }, + { + "epoch": 20.95232, + "grad_norm": 1.1171079874038696, + "learning_rate": 1.7276910764305724e-05, + "loss": 0.4859, + "step": 16369 + }, + { + "epoch": 20.9536, + "grad_norm": 1.1345586776733398, + "learning_rate": 1.7274909963985596e-05, + "loss": 0.4935, + "step": 16370 + }, + { + "epoch": 20.95488, + "grad_norm": 1.1230216026306152, + "learning_rate": 1.7272909163665468e-05, + "loss": 0.5274, + "step": 16371 + }, + { + "epoch": 20.95616, + "grad_norm": 1.2072840929031372, + "learning_rate": 1.727090836334534e-05, + "loss": 0.5538, + "step": 16372 + }, + { + "epoch": 20.95744, + "grad_norm": 1.1466865539550781, + "learning_rate": 1.726890756302521e-05, + "loss": 0.5492, + "step": 16373 + }, + { + "epoch": 20.95872, + "grad_norm": 1.017502784729004, + "learning_rate": 1.7266906762705083e-05, + "loss": 0.4899, + "step": 16374 + }, + { + "epoch": 20.96, + "grad_norm": 1.1090508699417114, + "learning_rate": 1.7264905962384955e-05, + "loss": 0.4935, + "step": 16375 + }, + { + "epoch": 20.96128, + "grad_norm": 1.078791618347168, + "learning_rate": 1.7262905162064827e-05, + "loss": 0.4829, + "step": 16376 + }, + { + "epoch": 20.96256, + "grad_norm": 1.1684792041778564, + "learning_rate": 
1.72609043617447e-05, + "loss": 0.5098, + "step": 16377 + }, + { + "epoch": 20.96384, + "grad_norm": 1.1406118869781494, + "learning_rate": 1.725890356142457e-05, + "loss": 0.5024, + "step": 16378 + }, + { + "epoch": 20.96512, + "grad_norm": 1.0845208168029785, + "learning_rate": 1.7256902761104443e-05, + "loss": 0.4965, + "step": 16379 + }, + { + "epoch": 20.9664, + "grad_norm": 1.04744291305542, + "learning_rate": 1.7254901960784314e-05, + "loss": 0.4817, + "step": 16380 + }, + { + "epoch": 20.96768, + "grad_norm": 1.1002355813980103, + "learning_rate": 1.7252901160464186e-05, + "loss": 0.4886, + "step": 16381 + }, + { + "epoch": 20.96896, + "grad_norm": 1.0374670028686523, + "learning_rate": 1.7250900360144058e-05, + "loss": 0.4639, + "step": 16382 + }, + { + "epoch": 20.97024, + "grad_norm": 1.0553470849990845, + "learning_rate": 1.724889955982393e-05, + "loss": 0.4705, + "step": 16383 + }, + { + "epoch": 20.97152, + "grad_norm": 1.1559804677963257, + "learning_rate": 1.7246898759503802e-05, + "loss": 0.5046, + "step": 16384 + }, + { + "epoch": 20.9728, + "grad_norm": 1.1546366214752197, + "learning_rate": 1.7244897959183677e-05, + "loss": 0.5199, + "step": 16385 + }, + { + "epoch": 20.97408, + "grad_norm": 1.0644662380218506, + "learning_rate": 1.7242897158863546e-05, + "loss": 0.4589, + "step": 16386 + }, + { + "epoch": 20.97536, + "grad_norm": 1.0808895826339722, + "learning_rate": 1.7240896358543417e-05, + "loss": 0.5159, + "step": 16387 + }, + { + "epoch": 20.97664, + "grad_norm": 1.1637095212936401, + "learning_rate": 1.723889555822329e-05, + "loss": 0.5412, + "step": 16388 + }, + { + "epoch": 20.97792, + "grad_norm": 1.0618444681167603, + "learning_rate": 1.7236894757903165e-05, + "loss": 0.5048, + "step": 16389 + }, + { + "epoch": 20.9792, + "grad_norm": 1.1485847234725952, + "learning_rate": 1.7234893957583033e-05, + "loss": 0.4985, + "step": 16390 + }, + { + "epoch": 20.98048, + "grad_norm": 1.1214594841003418, + "learning_rate": 
1.7232893157262905e-05, + "loss": 0.4636, + "step": 16391 + }, + { + "epoch": 20.98176, + "grad_norm": 1.1146374940872192, + "learning_rate": 1.7230892356942777e-05, + "loss": 0.5329, + "step": 16392 + }, + { + "epoch": 20.98304, + "grad_norm": 1.092591404914856, + "learning_rate": 1.7228891556622652e-05, + "loss": 0.5354, + "step": 16393 + }, + { + "epoch": 20.98432, + "grad_norm": 1.1593587398529053, + "learning_rate": 1.722689075630252e-05, + "loss": 0.4776, + "step": 16394 + }, + { + "epoch": 20.9856, + "grad_norm": 1.0387076139450073, + "learning_rate": 1.7224889955982392e-05, + "loss": 0.4753, + "step": 16395 + }, + { + "epoch": 20.98688, + "grad_norm": 1.0778547525405884, + "learning_rate": 1.7222889155662268e-05, + "loss": 0.4997, + "step": 16396 + }, + { + "epoch": 20.98816, + "grad_norm": 1.0431793928146362, + "learning_rate": 1.722088835534214e-05, + "loss": 0.4463, + "step": 16397 + }, + { + "epoch": 20.98944, + "grad_norm": 1.0882987976074219, + "learning_rate": 1.7218887555022008e-05, + "loss": 0.4593, + "step": 16398 + }, + { + "epoch": 20.99072, + "grad_norm": 1.1702048778533936, + "learning_rate": 1.721688675470188e-05, + "loss": 0.5429, + "step": 16399 + }, + { + "epoch": 20.992, + "grad_norm": 1.0901694297790527, + "learning_rate": 1.7214885954381755e-05, + "loss": 0.5139, + "step": 16400 + }, + { + "epoch": 20.99328, + "grad_norm": 1.1674569845199585, + "learning_rate": 1.7212885154061627e-05, + "loss": 0.5385, + "step": 16401 + }, + { + "epoch": 20.99456, + "grad_norm": 1.1179885864257812, + "learning_rate": 1.7210884353741495e-05, + "loss": 0.4636, + "step": 16402 + }, + { + "epoch": 20.99584, + "grad_norm": 1.1010475158691406, + "learning_rate": 1.720888355342137e-05, + "loss": 0.5093, + "step": 16403 + }, + { + "epoch": 20.99712, + "grad_norm": 1.157423734664917, + "learning_rate": 1.7206882753101242e-05, + "loss": 0.5069, + "step": 16404 + }, + { + "epoch": 20.9984, + "grad_norm": 1.0208972692489624, + "learning_rate": 
1.7204881952781114e-05, + "loss": 0.4783, + "step": 16405 + }, + { + "epoch": 20.99968, + "grad_norm": 1.0992622375488281, + "learning_rate": 1.7202881152460983e-05, + "loss": 0.4992, + "step": 16406 + }, + { + "epoch": 21.00096, + "grad_norm": 2.2934412956237793, + "learning_rate": 1.7200880352140858e-05, + "loss": 0.8021, + "step": 16407 + }, + { + "epoch": 21.00224, + "grad_norm": 1.0393415689468384, + "learning_rate": 1.719887955182073e-05, + "loss": 0.4699, + "step": 16408 + }, + { + "epoch": 21.00352, + "grad_norm": 1.0242365598678589, + "learning_rate": 1.7196878751500602e-05, + "loss": 0.4762, + "step": 16409 + }, + { + "epoch": 21.0048, + "grad_norm": 1.0307321548461914, + "learning_rate": 1.7194877951180474e-05, + "loss": 0.4812, + "step": 16410 + }, + { + "epoch": 21.00608, + "grad_norm": 1.07740318775177, + "learning_rate": 1.7192877150860345e-05, + "loss": 0.4985, + "step": 16411 + }, + { + "epoch": 21.00736, + "grad_norm": 1.0476981401443481, + "learning_rate": 1.7190876350540217e-05, + "loss": 0.4478, + "step": 16412 + }, + { + "epoch": 21.00864, + "grad_norm": 1.041973352432251, + "learning_rate": 1.718887555022009e-05, + "loss": 0.4452, + "step": 16413 + }, + { + "epoch": 21.00992, + "grad_norm": 1.1095020771026611, + "learning_rate": 1.718687474989996e-05, + "loss": 0.5099, + "step": 16414 + }, + { + "epoch": 21.0112, + "grad_norm": 1.123745083808899, + "learning_rate": 1.7184873949579833e-05, + "loss": 0.4954, + "step": 16415 + }, + { + "epoch": 21.01248, + "grad_norm": 1.1033985614776611, + "learning_rate": 1.7182873149259705e-05, + "loss": 0.4863, + "step": 16416 + }, + { + "epoch": 21.01376, + "grad_norm": 1.1041131019592285, + "learning_rate": 1.7180872348939577e-05, + "loss": 0.4556, + "step": 16417 + }, + { + "epoch": 21.01504, + "grad_norm": 1.0607541799545288, + "learning_rate": 1.717887154861945e-05, + "loss": 0.4738, + "step": 16418 + }, + { + "epoch": 21.01632, + "grad_norm": 1.0398081541061401, + "learning_rate": 
1.717687074829932e-05, + "loss": 0.4566, + "step": 16419 + }, + { + "epoch": 21.0176, + "grad_norm": 1.0724793672561646, + "learning_rate": 1.7174869947979192e-05, + "loss": 0.4586, + "step": 16420 + }, + { + "epoch": 21.01888, + "grad_norm": 1.109832525253296, + "learning_rate": 1.7172869147659064e-05, + "loss": 0.4661, + "step": 16421 + }, + { + "epoch": 21.02016, + "grad_norm": 1.0665154457092285, + "learning_rate": 1.7170868347338936e-05, + "loss": 0.4561, + "step": 16422 + }, + { + "epoch": 21.02144, + "grad_norm": 1.094376564025879, + "learning_rate": 1.7168867547018808e-05, + "loss": 0.4649, + "step": 16423 + }, + { + "epoch": 21.02272, + "grad_norm": 1.061568021774292, + "learning_rate": 1.7166866746698683e-05, + "loss": 0.4897, + "step": 16424 + }, + { + "epoch": 21.024, + "grad_norm": 1.0756134986877441, + "learning_rate": 1.716486594637855e-05, + "loss": 0.4575, + "step": 16425 + }, + { + "epoch": 21.02528, + "grad_norm": 1.0889450311660767, + "learning_rate": 1.7162865146058423e-05, + "loss": 0.5075, + "step": 16426 + }, + { + "epoch": 21.02656, + "grad_norm": 1.0604745149612427, + "learning_rate": 1.7160864345738295e-05, + "loss": 0.4671, + "step": 16427 + }, + { + "epoch": 21.02784, + "grad_norm": 1.0580859184265137, + "learning_rate": 1.715886354541817e-05, + "loss": 0.4691, + "step": 16428 + }, + { + "epoch": 21.02912, + "grad_norm": 1.1256431341171265, + "learning_rate": 1.715686274509804e-05, + "loss": 0.5068, + "step": 16429 + }, + { + "epoch": 21.0304, + "grad_norm": 1.1371206045150757, + "learning_rate": 1.715486194477791e-05, + "loss": 0.484, + "step": 16430 + }, + { + "epoch": 21.03168, + "grad_norm": 1.045066237449646, + "learning_rate": 1.7152861144457786e-05, + "loss": 0.4497, + "step": 16431 + }, + { + "epoch": 21.03296, + "grad_norm": 1.1001946926116943, + "learning_rate": 1.7150860344137658e-05, + "loss": 0.4488, + "step": 16432 + }, + { + "epoch": 21.03424, + "grad_norm": 1.087947964668274, + "learning_rate": 1.7148859543817526e-05, + 
"loss": 0.475, + "step": 16433 + }, + { + "epoch": 21.03552, + "grad_norm": 1.0743104219436646, + "learning_rate": 1.7146858743497398e-05, + "loss": 0.4484, + "step": 16434 + }, + { + "epoch": 21.0368, + "grad_norm": 1.0668704509735107, + "learning_rate": 1.7144857943177273e-05, + "loss": 0.4431, + "step": 16435 + }, + { + "epoch": 21.03808, + "grad_norm": 1.1153063774108887, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.4967, + "step": 16436 + }, + { + "epoch": 21.03936, + "grad_norm": 1.122804045677185, + "learning_rate": 1.7140856342537014e-05, + "loss": 0.4632, + "step": 16437 + }, + { + "epoch": 21.04064, + "grad_norm": 1.1385273933410645, + "learning_rate": 1.713885554221689e-05, + "loss": 0.4943, + "step": 16438 + }, + { + "epoch": 21.04192, + "grad_norm": 1.0751020908355713, + "learning_rate": 1.713685474189676e-05, + "loss": 0.4541, + "step": 16439 + }, + { + "epoch": 21.0432, + "grad_norm": 1.0615102052688599, + "learning_rate": 1.7134853941576633e-05, + "loss": 0.48, + "step": 16440 + }, + { + "epoch": 21.04448, + "grad_norm": 1.1338032484054565, + "learning_rate": 1.71328531412565e-05, + "loss": 0.4953, + "step": 16441 + }, + { + "epoch": 21.04576, + "grad_norm": 1.0364360809326172, + "learning_rate": 1.7130852340936376e-05, + "loss": 0.4364, + "step": 16442 + }, + { + "epoch": 21.04704, + "grad_norm": 1.0834779739379883, + "learning_rate": 1.7128851540616248e-05, + "loss": 0.5213, + "step": 16443 + }, + { + "epoch": 21.04832, + "grad_norm": 1.0926162004470825, + "learning_rate": 1.712685074029612e-05, + "loss": 0.498, + "step": 16444 + }, + { + "epoch": 21.0496, + "grad_norm": 1.0313501358032227, + "learning_rate": 1.712484993997599e-05, + "loss": 0.4867, + "step": 16445 + }, + { + "epoch": 21.05088, + "grad_norm": 1.0741840600967407, + "learning_rate": 1.7122849139655864e-05, + "loss": 0.5016, + "step": 16446 + }, + { + "epoch": 21.05216, + "grad_norm": 1.1211599111557007, + "learning_rate": 1.7120848339335736e-05, + "loss": 0.5157, + "step": 
16447 + }, + { + "epoch": 21.05344, + "grad_norm": 1.0967553853988647, + "learning_rate": 1.7118847539015608e-05, + "loss": 0.4737, + "step": 16448 + }, + { + "epoch": 21.05472, + "grad_norm": 1.204904317855835, + "learning_rate": 1.711684673869548e-05, + "loss": 0.5274, + "step": 16449 + }, + { + "epoch": 21.056, + "grad_norm": 1.0651218891143799, + "learning_rate": 1.711484593837535e-05, + "loss": 0.4312, + "step": 16450 + }, + { + "epoch": 21.05728, + "grad_norm": 1.0928947925567627, + "learning_rate": 1.7112845138055223e-05, + "loss": 0.4495, + "step": 16451 + }, + { + "epoch": 21.05856, + "grad_norm": 1.0684500932693481, + "learning_rate": 1.7110844337735095e-05, + "loss": 0.4851, + "step": 16452 + }, + { + "epoch": 21.05984, + "grad_norm": 1.095968246459961, + "learning_rate": 1.7108843537414967e-05, + "loss": 0.452, + "step": 16453 + }, + { + "epoch": 21.06112, + "grad_norm": 1.140784502029419, + "learning_rate": 1.710684273709484e-05, + "loss": 0.5047, + "step": 16454 + }, + { + "epoch": 21.0624, + "grad_norm": 1.1561800241470337, + "learning_rate": 1.710484193677471e-05, + "loss": 0.5386, + "step": 16455 + }, + { + "epoch": 21.06368, + "grad_norm": 1.110007643699646, + "learning_rate": 1.7102841136454582e-05, + "loss": 0.4753, + "step": 16456 + }, + { + "epoch": 21.06496, + "grad_norm": 1.0784857273101807, + "learning_rate": 1.7100840336134454e-05, + "loss": 0.4374, + "step": 16457 + }, + { + "epoch": 21.06624, + "grad_norm": 1.0909050703048706, + "learning_rate": 1.7098839535814326e-05, + "loss": 0.4375, + "step": 16458 + }, + { + "epoch": 21.06752, + "grad_norm": 1.12933349609375, + "learning_rate": 1.7096838735494198e-05, + "loss": 0.4876, + "step": 16459 + }, + { + "epoch": 21.0688, + "grad_norm": 1.1255989074707031, + "learning_rate": 1.709483793517407e-05, + "loss": 0.4721, + "step": 16460 + }, + { + "epoch": 21.07008, + "grad_norm": 1.059699535369873, + "learning_rate": 1.709283713485394e-05, + "loss": 0.4565, + "step": 16461 + }, + { + "epoch": 
21.07136, + "grad_norm": 1.0751200914382935, + "learning_rate": 1.7090836334533814e-05, + "loss": 0.464, + "step": 16462 + }, + { + "epoch": 21.07264, + "grad_norm": 1.1244473457336426, + "learning_rate": 1.708883553421369e-05, + "loss": 0.4842, + "step": 16463 + }, + { + "epoch": 21.07392, + "grad_norm": 1.1796867847442627, + "learning_rate": 1.7086834733893557e-05, + "loss": 0.5435, + "step": 16464 + }, + { + "epoch": 21.0752, + "grad_norm": 1.0969107151031494, + "learning_rate": 1.708483393357343e-05, + "loss": 0.479, + "step": 16465 + }, + { + "epoch": 21.07648, + "grad_norm": 1.1163842678070068, + "learning_rate": 1.70828331332533e-05, + "loss": 0.4498, + "step": 16466 + }, + { + "epoch": 21.07776, + "grad_norm": 1.1781340837478638, + "learning_rate": 1.7080832332933176e-05, + "loss": 0.5003, + "step": 16467 + }, + { + "epoch": 21.07904, + "grad_norm": 1.1196157932281494, + "learning_rate": 1.7078831532613045e-05, + "loss": 0.5118, + "step": 16468 + }, + { + "epoch": 21.08032, + "grad_norm": 1.1168984174728394, + "learning_rate": 1.7076830732292917e-05, + "loss": 0.4505, + "step": 16469 + }, + { + "epoch": 21.0816, + "grad_norm": 1.1065702438354492, + "learning_rate": 1.7074829931972792e-05, + "loss": 0.5232, + "step": 16470 + }, + { + "epoch": 21.08288, + "grad_norm": 1.0598688125610352, + "learning_rate": 1.7072829131652664e-05, + "loss": 0.4615, + "step": 16471 + }, + { + "epoch": 21.08416, + "grad_norm": 1.0742090940475464, + "learning_rate": 1.7070828331332532e-05, + "loss": 0.4675, + "step": 16472 + }, + { + "epoch": 21.08544, + "grad_norm": 1.0973572731018066, + "learning_rate": 1.7068827531012404e-05, + "loss": 0.4869, + "step": 16473 + }, + { + "epoch": 21.08672, + "grad_norm": 1.0886359214782715, + "learning_rate": 1.706682673069228e-05, + "loss": 0.4684, + "step": 16474 + }, + { + "epoch": 21.088, + "grad_norm": 1.0763150453567505, + "learning_rate": 1.706482593037215e-05, + "loss": 0.4558, + "step": 16475 + }, + { + "epoch": 21.08928, + 
"grad_norm": 1.0928971767425537, + "learning_rate": 1.706282513005202e-05, + "loss": 0.4765, + "step": 16476 + }, + { + "epoch": 21.09056, + "grad_norm": 1.1339545249938965, + "learning_rate": 1.7060824329731895e-05, + "loss": 0.5047, + "step": 16477 + }, + { + "epoch": 21.09184, + "grad_norm": 1.0725475549697876, + "learning_rate": 1.7058823529411767e-05, + "loss": 0.4662, + "step": 16478 + }, + { + "epoch": 21.09312, + "grad_norm": 1.1563411951065063, + "learning_rate": 1.705682272909164e-05, + "loss": 0.4984, + "step": 16479 + }, + { + "epoch": 21.0944, + "grad_norm": 1.0735126733779907, + "learning_rate": 1.7054821928771507e-05, + "loss": 0.4125, + "step": 16480 + }, + { + "epoch": 21.09568, + "grad_norm": 1.1152347326278687, + "learning_rate": 1.7052821128451382e-05, + "loss": 0.505, + "step": 16481 + }, + { + "epoch": 21.09696, + "grad_norm": 1.0870592594146729, + "learning_rate": 1.7050820328131254e-05, + "loss": 0.5002, + "step": 16482 + }, + { + "epoch": 21.09824, + "grad_norm": 1.116709589958191, + "learning_rate": 1.7048819527811126e-05, + "loss": 0.462, + "step": 16483 + }, + { + "epoch": 21.09952, + "grad_norm": 1.1316735744476318, + "learning_rate": 1.7046818727490998e-05, + "loss": 0.5289, + "step": 16484 + }, + { + "epoch": 21.1008, + "grad_norm": 1.127074122428894, + "learning_rate": 1.704481792717087e-05, + "loss": 0.4713, + "step": 16485 + }, + { + "epoch": 21.10208, + "grad_norm": 1.1193740367889404, + "learning_rate": 1.704281712685074e-05, + "loss": 0.4759, + "step": 16486 + }, + { + "epoch": 21.10336, + "grad_norm": 1.0757756233215332, + "learning_rate": 1.7040816326530613e-05, + "loss": 0.5199, + "step": 16487 + }, + { + "epoch": 21.10464, + "grad_norm": 1.0991090536117554, + "learning_rate": 1.7038815526210485e-05, + "loss": 0.523, + "step": 16488 + }, + { + "epoch": 21.10592, + "grad_norm": 1.0792702436447144, + "learning_rate": 1.7036814725890357e-05, + "loss": 0.4515, + "step": 16489 + }, + { + "epoch": 21.1072, + "grad_norm": 
1.1250958442687988, + "learning_rate": 1.703481392557023e-05, + "loss": 0.4843, + "step": 16490 + }, + { + "epoch": 21.10848, + "grad_norm": 1.1384851932525635, + "learning_rate": 1.70328131252501e-05, + "loss": 0.523, + "step": 16491 + }, + { + "epoch": 21.10976, + "grad_norm": 1.0551738739013672, + "learning_rate": 1.7030812324929973e-05, + "loss": 0.454, + "step": 16492 + }, + { + "epoch": 21.11104, + "grad_norm": 1.0307825803756714, + "learning_rate": 1.7028811524609844e-05, + "loss": 0.4157, + "step": 16493 + }, + { + "epoch": 21.11232, + "grad_norm": 1.1250662803649902, + "learning_rate": 1.7026810724289716e-05, + "loss": 0.5015, + "step": 16494 + }, + { + "epoch": 21.1136, + "grad_norm": 1.0940858125686646, + "learning_rate": 1.7024809923969588e-05, + "loss": 0.4605, + "step": 16495 + }, + { + "epoch": 21.11488, + "grad_norm": 1.0869758129119873, + "learning_rate": 1.702280912364946e-05, + "loss": 0.544, + "step": 16496 + }, + { + "epoch": 21.11616, + "grad_norm": 1.0667223930358887, + "learning_rate": 1.7020808323329332e-05, + "loss": 0.4369, + "step": 16497 + }, + { + "epoch": 21.11744, + "grad_norm": 1.0950666666030884, + "learning_rate": 1.7018807523009204e-05, + "loss": 0.5017, + "step": 16498 + }, + { + "epoch": 21.11872, + "grad_norm": 1.0515073537826538, + "learning_rate": 1.7016806722689076e-05, + "loss": 0.4371, + "step": 16499 + }, + { + "epoch": 21.12, + "grad_norm": 1.133811354637146, + "learning_rate": 1.7014805922368947e-05, + "loss": 0.5154, + "step": 16500 + }, + { + "epoch": 21.12128, + "grad_norm": 1.1043317317962646, + "learning_rate": 1.701280512204882e-05, + "loss": 0.4841, + "step": 16501 + }, + { + "epoch": 21.12256, + "grad_norm": 1.1179126501083374, + "learning_rate": 1.7010804321728695e-05, + "loss": 0.5082, + "step": 16502 + }, + { + "epoch": 21.12384, + "grad_norm": 1.0235763788223267, + "learning_rate": 1.7008803521408563e-05, + "loss": 0.4659, + "step": 16503 + }, + { + "epoch": 21.12512, + "grad_norm": 1.1011422872543335, + 
"learning_rate": 1.7006802721088435e-05, + "loss": 0.4946, + "step": 16504 + }, + { + "epoch": 21.1264, + "grad_norm": 1.1294410228729248, + "learning_rate": 1.7004801920768307e-05, + "loss": 0.4528, + "step": 16505 + }, + { + "epoch": 21.12768, + "grad_norm": 1.088690996170044, + "learning_rate": 1.7002801120448182e-05, + "loss": 0.4638, + "step": 16506 + }, + { + "epoch": 21.12896, + "grad_norm": 1.1901614665985107, + "learning_rate": 1.700080032012805e-05, + "loss": 0.4745, + "step": 16507 + }, + { + "epoch": 21.13024, + "grad_norm": 1.177269458770752, + "learning_rate": 1.6998799519807922e-05, + "loss": 0.4941, + "step": 16508 + }, + { + "epoch": 21.13152, + "grad_norm": 1.117848515510559, + "learning_rate": 1.6996798719487798e-05, + "loss": 0.4575, + "step": 16509 + }, + { + "epoch": 21.1328, + "grad_norm": 1.1764754056930542, + "learning_rate": 1.699479791916767e-05, + "loss": 0.4967, + "step": 16510 + }, + { + "epoch": 21.13408, + "grad_norm": 1.094519853591919, + "learning_rate": 1.6992797118847538e-05, + "loss": 0.4677, + "step": 16511 + }, + { + "epoch": 21.13536, + "grad_norm": 1.1102967262268066, + "learning_rate": 1.699079631852741e-05, + "loss": 0.4894, + "step": 16512 + }, + { + "epoch": 21.13664, + "grad_norm": 1.0887529850006104, + "learning_rate": 1.6988795518207285e-05, + "loss": 0.4896, + "step": 16513 + }, + { + "epoch": 21.13792, + "grad_norm": 1.1254242658615112, + "learning_rate": 1.6986794717887157e-05, + "loss": 0.4719, + "step": 16514 + }, + { + "epoch": 21.1392, + "grad_norm": 1.1468827724456787, + "learning_rate": 1.6984793917567025e-05, + "loss": 0.4629, + "step": 16515 + }, + { + "epoch": 21.14048, + "grad_norm": 1.1172226667404175, + "learning_rate": 1.69827931172469e-05, + "loss": 0.4621, + "step": 16516 + }, + { + "epoch": 21.14176, + "grad_norm": 1.1248224973678589, + "learning_rate": 1.6980792316926772e-05, + "loss": 0.4801, + "step": 16517 + }, + { + "epoch": 21.14304, + "grad_norm": 1.124162197113037, + "learning_rate": 
1.6978791516606644e-05, + "loss": 0.4939, + "step": 16518 + }, + { + "epoch": 21.14432, + "grad_norm": 1.0810459852218628, + "learning_rate": 1.6976790716286513e-05, + "loss": 0.48, + "step": 16519 + }, + { + "epoch": 21.1456, + "grad_norm": 1.0889114141464233, + "learning_rate": 1.6974789915966388e-05, + "loss": 0.4854, + "step": 16520 + }, + { + "epoch": 21.14688, + "grad_norm": 1.0843100547790527, + "learning_rate": 1.697278911564626e-05, + "loss": 0.4766, + "step": 16521 + }, + { + "epoch": 21.14816, + "grad_norm": 1.0566636323928833, + "learning_rate": 1.697078831532613e-05, + "loss": 0.4388, + "step": 16522 + }, + { + "epoch": 21.14944, + "grad_norm": 1.102766513824463, + "learning_rate": 1.6968787515006004e-05, + "loss": 0.4835, + "step": 16523 + }, + { + "epoch": 21.15072, + "grad_norm": 1.0825378894805908, + "learning_rate": 1.6966786714685875e-05, + "loss": 0.4682, + "step": 16524 + }, + { + "epoch": 21.152, + "grad_norm": 1.1190485954284668, + "learning_rate": 1.6964785914365747e-05, + "loss": 0.4853, + "step": 16525 + }, + { + "epoch": 21.15328, + "grad_norm": 1.0945488214492798, + "learning_rate": 1.696278511404562e-05, + "loss": 0.5039, + "step": 16526 + }, + { + "epoch": 21.15456, + "grad_norm": 1.0824321508407593, + "learning_rate": 1.696078431372549e-05, + "loss": 0.4791, + "step": 16527 + }, + { + "epoch": 21.15584, + "grad_norm": 1.0835765600204468, + "learning_rate": 1.6958783513405363e-05, + "loss": 0.4846, + "step": 16528 + }, + { + "epoch": 21.15712, + "grad_norm": 1.0822176933288574, + "learning_rate": 1.6956782713085235e-05, + "loss": 0.4914, + "step": 16529 + }, + { + "epoch": 21.1584, + "grad_norm": 1.0641273260116577, + "learning_rate": 1.6954781912765107e-05, + "loss": 0.4497, + "step": 16530 + }, + { + "epoch": 21.15968, + "grad_norm": 1.1108111143112183, + "learning_rate": 1.695278111244498e-05, + "loss": 0.4962, + "step": 16531 + }, + { + "epoch": 21.16096, + "grad_norm": 1.0912580490112305, + "learning_rate": 1.695078031212485e-05, 
+ "loss": 0.4981, + "step": 16532 + }, + { + "epoch": 21.16224, + "grad_norm": 1.1157331466674805, + "learning_rate": 1.6948779511804722e-05, + "loss": 0.4682, + "step": 16533 + }, + { + "epoch": 21.16352, + "grad_norm": 1.1047836542129517, + "learning_rate": 1.6946778711484594e-05, + "loss": 0.479, + "step": 16534 + }, + { + "epoch": 21.1648, + "grad_norm": 1.1271506547927856, + "learning_rate": 1.6944777911164466e-05, + "loss": 0.5429, + "step": 16535 + }, + { + "epoch": 21.16608, + "grad_norm": 1.075492024421692, + "learning_rate": 1.6942777110844338e-05, + "loss": 0.4732, + "step": 16536 + }, + { + "epoch": 21.16736, + "grad_norm": 1.1787910461425781, + "learning_rate": 1.6940776310524213e-05, + "loss": 0.5362, + "step": 16537 + }, + { + "epoch": 21.16864, + "grad_norm": 1.0927817821502686, + "learning_rate": 1.693877551020408e-05, + "loss": 0.4621, + "step": 16538 + }, + { + "epoch": 21.16992, + "grad_norm": 1.1615148782730103, + "learning_rate": 1.6936774709883953e-05, + "loss": 0.5075, + "step": 16539 + }, + { + "epoch": 21.1712, + "grad_norm": 1.115074634552002, + "learning_rate": 1.6934773909563825e-05, + "loss": 0.4464, + "step": 16540 + }, + { + "epoch": 21.17248, + "grad_norm": 1.1260422468185425, + "learning_rate": 1.69327731092437e-05, + "loss": 0.4683, + "step": 16541 + }, + { + "epoch": 21.17376, + "grad_norm": 1.0940788984298706, + "learning_rate": 1.693077230892357e-05, + "loss": 0.5185, + "step": 16542 + }, + { + "epoch": 21.17504, + "grad_norm": 1.095339059829712, + "learning_rate": 1.692877150860344e-05, + "loss": 0.4741, + "step": 16543 + }, + { + "epoch": 21.17632, + "grad_norm": 1.0709000825881958, + "learning_rate": 1.6926770708283316e-05, + "loss": 0.4933, + "step": 16544 + }, + { + "epoch": 21.1776, + "grad_norm": 1.1118566989898682, + "learning_rate": 1.6924769907963188e-05, + "loss": 0.5025, + "step": 16545 + }, + { + "epoch": 21.17888, + "grad_norm": 1.1093112230300903, + "learning_rate": 1.6922769107643056e-05, + "loss": 0.5049, + 
"step": 16546 + }, + { + "epoch": 21.18016, + "grad_norm": 1.0397313833236694, + "learning_rate": 1.6920768307322928e-05, + "loss": 0.4659, + "step": 16547 + }, + { + "epoch": 21.18144, + "grad_norm": 1.1054112911224365, + "learning_rate": 1.6918767507002803e-05, + "loss": 0.4411, + "step": 16548 + }, + { + "epoch": 21.18272, + "grad_norm": 1.1328340768814087, + "learning_rate": 1.6916766706682675e-05, + "loss": 0.4802, + "step": 16549 + }, + { + "epoch": 21.184, + "grad_norm": 1.0890549421310425, + "learning_rate": 1.6914765906362544e-05, + "loss": 0.4922, + "step": 16550 + }, + { + "epoch": 21.18528, + "grad_norm": 1.1533173322677612, + "learning_rate": 1.691276510604242e-05, + "loss": 0.515, + "step": 16551 + }, + { + "epoch": 21.18656, + "grad_norm": 1.0926942825317383, + "learning_rate": 1.691076430572229e-05, + "loss": 0.4696, + "step": 16552 + }, + { + "epoch": 21.18784, + "grad_norm": 1.0669362545013428, + "learning_rate": 1.6908763505402163e-05, + "loss": 0.4982, + "step": 16553 + }, + { + "epoch": 21.18912, + "grad_norm": 1.0951491594314575, + "learning_rate": 1.690676270508203e-05, + "loss": 0.4715, + "step": 16554 + }, + { + "epoch": 21.1904, + "grad_norm": 1.072893500328064, + "learning_rate": 1.6904761904761906e-05, + "loss": 0.4721, + "step": 16555 + }, + { + "epoch": 21.19168, + "grad_norm": 1.0877991914749146, + "learning_rate": 1.6902761104441778e-05, + "loss": 0.4584, + "step": 16556 + }, + { + "epoch": 21.19296, + "grad_norm": 1.12388014793396, + "learning_rate": 1.690076030412165e-05, + "loss": 0.4523, + "step": 16557 + }, + { + "epoch": 21.19424, + "grad_norm": 1.1295223236083984, + "learning_rate": 1.689875950380152e-05, + "loss": 0.4936, + "step": 16558 + }, + { + "epoch": 21.19552, + "grad_norm": 1.1079462766647339, + "learning_rate": 1.6896758703481394e-05, + "loss": 0.4463, + "step": 16559 + }, + { + "epoch": 21.1968, + "grad_norm": 1.1547305583953857, + "learning_rate": 1.6894757903161266e-05, + "loss": 0.4691, + "step": 16560 + }, + { + 
"epoch": 21.19808, + "grad_norm": 1.1566081047058105, + "learning_rate": 1.6892757102841137e-05, + "loss": 0.5163, + "step": 16561 + }, + { + "epoch": 21.19936, + "grad_norm": 1.1385910511016846, + "learning_rate": 1.689075630252101e-05, + "loss": 0.4941, + "step": 16562 + }, + { + "epoch": 21.20064, + "grad_norm": 1.1214505434036255, + "learning_rate": 1.688875550220088e-05, + "loss": 0.5138, + "step": 16563 + }, + { + "epoch": 21.20192, + "grad_norm": 1.089640498161316, + "learning_rate": 1.6886754701880753e-05, + "loss": 0.4491, + "step": 16564 + }, + { + "epoch": 21.2032, + "grad_norm": 1.0873479843139648, + "learning_rate": 1.6884753901560625e-05, + "loss": 0.4711, + "step": 16565 + }, + { + "epoch": 21.20448, + "grad_norm": 1.0695767402648926, + "learning_rate": 1.6882753101240497e-05, + "loss": 0.4575, + "step": 16566 + }, + { + "epoch": 21.20576, + "grad_norm": 1.1109639406204224, + "learning_rate": 1.688075230092037e-05, + "loss": 0.5093, + "step": 16567 + }, + { + "epoch": 21.20704, + "grad_norm": 1.10574471950531, + "learning_rate": 1.687875150060024e-05, + "loss": 0.5033, + "step": 16568 + }, + { + "epoch": 21.20832, + "grad_norm": 1.0415626764297485, + "learning_rate": 1.6876750700280112e-05, + "loss": 0.4781, + "step": 16569 + }, + { + "epoch": 21.209600000000002, + "grad_norm": 1.1042823791503906, + "learning_rate": 1.6874749899959984e-05, + "loss": 0.4813, + "step": 16570 + }, + { + "epoch": 21.21088, + "grad_norm": 1.1033847332000732, + "learning_rate": 1.6872749099639856e-05, + "loss": 0.4547, + "step": 16571 + }, + { + "epoch": 21.21216, + "grad_norm": 1.0810890197753906, + "learning_rate": 1.6870748299319728e-05, + "loss": 0.5008, + "step": 16572 + }, + { + "epoch": 21.21344, + "grad_norm": 1.111965537071228, + "learning_rate": 1.68687474989996e-05, + "loss": 0.4542, + "step": 16573 + }, + { + "epoch": 21.21472, + "grad_norm": 1.168083906173706, + "learning_rate": 1.686674669867947e-05, + "loss": 0.4931, + "step": 16574 + }, + { + "epoch": 
21.216, + "grad_norm": 1.1118204593658447, + "learning_rate": 1.6864745898359343e-05, + "loss": 0.4682, + "step": 16575 + }, + { + "epoch": 21.21728, + "grad_norm": 1.1220096349716187, + "learning_rate": 1.686274509803922e-05, + "loss": 0.5033, + "step": 16576 + }, + { + "epoch": 21.21856, + "grad_norm": 1.061000943183899, + "learning_rate": 1.6860744297719087e-05, + "loss": 0.5032, + "step": 16577 + }, + { + "epoch": 21.21984, + "grad_norm": 1.1358115673065186, + "learning_rate": 1.685874349739896e-05, + "loss": 0.4291, + "step": 16578 + }, + { + "epoch": 21.22112, + "grad_norm": 1.0819488763809204, + "learning_rate": 1.685674269707883e-05, + "loss": 0.4413, + "step": 16579 + }, + { + "epoch": 21.2224, + "grad_norm": 1.0310840606689453, + "learning_rate": 1.6854741896758706e-05, + "loss": 0.436, + "step": 16580 + }, + { + "epoch": 21.22368, + "grad_norm": 1.0708487033843994, + "learning_rate": 1.6852741096438575e-05, + "loss": 0.4718, + "step": 16581 + }, + { + "epoch": 21.22496, + "grad_norm": 1.1317819356918335, + "learning_rate": 1.6850740296118446e-05, + "loss": 0.528, + "step": 16582 + }, + { + "epoch": 21.22624, + "grad_norm": 1.1294492483139038, + "learning_rate": 1.6848739495798322e-05, + "loss": 0.4881, + "step": 16583 + }, + { + "epoch": 21.22752, + "grad_norm": 1.1252028942108154, + "learning_rate": 1.6846738695478194e-05, + "loss": 0.4865, + "step": 16584 + }, + { + "epoch": 21.2288, + "grad_norm": 1.187008261680603, + "learning_rate": 1.6844737895158062e-05, + "loss": 0.5291, + "step": 16585 + }, + { + "epoch": 21.23008, + "grad_norm": 1.1691592931747437, + "learning_rate": 1.6842737094837934e-05, + "loss": 0.5374, + "step": 16586 + }, + { + "epoch": 21.23136, + "grad_norm": 1.116336464881897, + "learning_rate": 1.684073629451781e-05, + "loss": 0.5126, + "step": 16587 + }, + { + "epoch": 21.23264, + "grad_norm": 1.038357138633728, + "learning_rate": 1.683873549419768e-05, + "loss": 0.4979, + "step": 16588 + }, + { + "epoch": 21.23392, + "grad_norm": 
1.0832995176315308, + "learning_rate": 1.683673469387755e-05, + "loss": 0.4649, + "step": 16589 + }, + { + "epoch": 21.2352, + "grad_norm": 1.0173653364181519, + "learning_rate": 1.6834733893557425e-05, + "loss": 0.4228, + "step": 16590 + }, + { + "epoch": 21.23648, + "grad_norm": 1.0729964971542358, + "learning_rate": 1.6832733093237297e-05, + "loss": 0.4692, + "step": 16591 + }, + { + "epoch": 21.23776, + "grad_norm": 1.036226749420166, + "learning_rate": 1.683073229291717e-05, + "loss": 0.4212, + "step": 16592 + }, + { + "epoch": 21.23904, + "grad_norm": 1.1111071109771729, + "learning_rate": 1.6828731492597037e-05, + "loss": 0.5403, + "step": 16593 + }, + { + "epoch": 21.24032, + "grad_norm": 1.0485975742340088, + "learning_rate": 1.6826730692276912e-05, + "loss": 0.4712, + "step": 16594 + }, + { + "epoch": 21.2416, + "grad_norm": 0.9971339106559753, + "learning_rate": 1.6824729891956784e-05, + "loss": 0.3999, + "step": 16595 + }, + { + "epoch": 21.24288, + "grad_norm": 1.1354920864105225, + "learning_rate": 1.6822729091636656e-05, + "loss": 0.533, + "step": 16596 + }, + { + "epoch": 21.24416, + "grad_norm": 1.1078267097473145, + "learning_rate": 1.6820728291316528e-05, + "loss": 0.4979, + "step": 16597 + }, + { + "epoch": 21.24544, + "grad_norm": 1.1035330295562744, + "learning_rate": 1.68187274909964e-05, + "loss": 0.4989, + "step": 16598 + }, + { + "epoch": 21.24672, + "grad_norm": 1.091853380203247, + "learning_rate": 1.681672669067627e-05, + "loss": 0.4654, + "step": 16599 + }, + { + "epoch": 21.248, + "grad_norm": 1.0949825048446655, + "learning_rate": 1.6814725890356143e-05, + "loss": 0.4446, + "step": 16600 + }, + { + "epoch": 21.24928, + "grad_norm": 1.165686011314392, + "learning_rate": 1.6812725090036015e-05, + "loss": 0.5106, + "step": 16601 + }, + { + "epoch": 21.25056, + "grad_norm": 1.0916712284088135, + "learning_rate": 1.6810724289715887e-05, + "loss": 0.4743, + "step": 16602 + }, + { + "epoch": 21.25184, + "grad_norm": 1.0554473400115967, + 
"learning_rate": 1.680872348939576e-05, + "loss": 0.459, + "step": 16603 + }, + { + "epoch": 21.25312, + "grad_norm": 1.1290833950042725, + "learning_rate": 1.6806722689075634e-05, + "loss": 0.5159, + "step": 16604 + }, + { + "epoch": 21.2544, + "grad_norm": 1.066178560256958, + "learning_rate": 1.6804721888755503e-05, + "loss": 0.4695, + "step": 16605 + }, + { + "epoch": 21.25568, + "grad_norm": 1.0555775165557861, + "learning_rate": 1.6802721088435374e-05, + "loss": 0.4638, + "step": 16606 + }, + { + "epoch": 21.25696, + "grad_norm": 1.1663188934326172, + "learning_rate": 1.6800720288115246e-05, + "loss": 0.5057, + "step": 16607 + }, + { + "epoch": 21.25824, + "grad_norm": 1.0777236223220825, + "learning_rate": 1.679871948779512e-05, + "loss": 0.457, + "step": 16608 + }, + { + "epoch": 21.25952, + "grad_norm": 1.185142993927002, + "learning_rate": 1.679671868747499e-05, + "loss": 0.4742, + "step": 16609 + }, + { + "epoch": 21.2608, + "grad_norm": 1.0730408430099487, + "learning_rate": 1.6794717887154862e-05, + "loss": 0.5001, + "step": 16610 + }, + { + "epoch": 21.26208, + "grad_norm": 1.0346087217330933, + "learning_rate": 1.6792717086834734e-05, + "loss": 0.4242, + "step": 16611 + }, + { + "epoch": 21.26336, + "grad_norm": 1.0942175388336182, + "learning_rate": 1.6790716286514606e-05, + "loss": 0.4677, + "step": 16612 + }, + { + "epoch": 21.26464, + "grad_norm": 1.15269935131073, + "learning_rate": 1.6788715486194477e-05, + "loss": 0.5306, + "step": 16613 + }, + { + "epoch": 21.26592, + "grad_norm": 1.0454847812652588, + "learning_rate": 1.678671468587435e-05, + "loss": 0.4561, + "step": 16614 + }, + { + "epoch": 21.2672, + "grad_norm": 1.16476309299469, + "learning_rate": 1.6784713885554225e-05, + "loss": 0.53, + "step": 16615 + }, + { + "epoch": 21.26848, + "grad_norm": 1.286267638206482, + "learning_rate": 1.6782713085234093e-05, + "loss": 0.5394, + "step": 16616 + }, + { + "epoch": 21.26976, + "grad_norm": 1.114345908164978, + "learning_rate": 
1.6780712284913965e-05, + "loss": 0.4361, + "step": 16617 + }, + { + "epoch": 21.27104, + "grad_norm": 1.149522304534912, + "learning_rate": 1.6778711484593837e-05, + "loss": 0.4684, + "step": 16618 + }, + { + "epoch": 21.27232, + "grad_norm": 1.143465280532837, + "learning_rate": 1.6776710684273712e-05, + "loss": 0.5119, + "step": 16619 + }, + { + "epoch": 21.2736, + "grad_norm": 1.0972861051559448, + "learning_rate": 1.677470988395358e-05, + "loss": 0.482, + "step": 16620 + }, + { + "epoch": 21.27488, + "grad_norm": 1.0499613285064697, + "learning_rate": 1.6772709083633452e-05, + "loss": 0.465, + "step": 16621 + }, + { + "epoch": 21.27616, + "grad_norm": 1.0492812395095825, + "learning_rate": 1.6770708283313328e-05, + "loss": 0.4625, + "step": 16622 + }, + { + "epoch": 21.27744, + "grad_norm": 1.0873419046401978, + "learning_rate": 1.67687074829932e-05, + "loss": 0.4993, + "step": 16623 + }, + { + "epoch": 21.27872, + "grad_norm": 1.1490446329116821, + "learning_rate": 1.6766706682673068e-05, + "loss": 0.4968, + "step": 16624 + }, + { + "epoch": 21.28, + "grad_norm": 1.1244101524353027, + "learning_rate": 1.676470588235294e-05, + "loss": 0.5022, + "step": 16625 + }, + { + "epoch": 21.28128, + "grad_norm": 1.0936533212661743, + "learning_rate": 1.6762705082032815e-05, + "loss": 0.4537, + "step": 16626 + }, + { + "epoch": 21.28256, + "grad_norm": 1.1399465799331665, + "learning_rate": 1.6760704281712687e-05, + "loss": 0.4794, + "step": 16627 + }, + { + "epoch": 21.28384, + "grad_norm": 1.14096999168396, + "learning_rate": 1.6758703481392555e-05, + "loss": 0.4995, + "step": 16628 + }, + { + "epoch": 21.28512, + "grad_norm": 1.1549474000930786, + "learning_rate": 1.675670268107243e-05, + "loss": 0.4909, + "step": 16629 + }, + { + "epoch": 21.2864, + "grad_norm": 1.088848352432251, + "learning_rate": 1.6754701880752302e-05, + "loss": 0.4626, + "step": 16630 + }, + { + "epoch": 21.28768, + "grad_norm": 1.114532709121704, + "learning_rate": 1.6752701080432174e-05, + 
"loss": 0.4492, + "step": 16631 + }, + { + "epoch": 21.28896, + "grad_norm": 1.1168134212493896, + "learning_rate": 1.6750700280112043e-05, + "loss": 0.505, + "step": 16632 + }, + { + "epoch": 21.29024, + "grad_norm": 1.1612701416015625, + "learning_rate": 1.6748699479791918e-05, + "loss": 0.519, + "step": 16633 + }, + { + "epoch": 21.29152, + "grad_norm": 1.0742192268371582, + "learning_rate": 1.674669867947179e-05, + "loss": 0.4603, + "step": 16634 + }, + { + "epoch": 21.2928, + "grad_norm": 1.1029329299926758, + "learning_rate": 1.674469787915166e-05, + "loss": 0.4851, + "step": 16635 + }, + { + "epoch": 21.29408, + "grad_norm": 1.106626033782959, + "learning_rate": 1.6742697078831534e-05, + "loss": 0.5193, + "step": 16636 + }, + { + "epoch": 21.29536, + "grad_norm": 1.117716908454895, + "learning_rate": 1.6740696278511405e-05, + "loss": 0.5412, + "step": 16637 + }, + { + "epoch": 21.29664, + "grad_norm": 1.0421884059906006, + "learning_rate": 1.6738695478191277e-05, + "loss": 0.446, + "step": 16638 + }, + { + "epoch": 21.29792, + "grad_norm": 1.0915087461471558, + "learning_rate": 1.673669467787115e-05, + "loss": 0.5246, + "step": 16639 + }, + { + "epoch": 21.2992, + "grad_norm": 1.1888045072555542, + "learning_rate": 1.673469387755102e-05, + "loss": 0.4996, + "step": 16640 + }, + { + "epoch": 21.30048, + "grad_norm": 1.0700676441192627, + "learning_rate": 1.6732693077230893e-05, + "loss": 0.4903, + "step": 16641 + }, + { + "epoch": 21.30176, + "grad_norm": 1.1567354202270508, + "learning_rate": 1.6730692276910765e-05, + "loss": 0.525, + "step": 16642 + }, + { + "epoch": 21.30304, + "grad_norm": 1.1310677528381348, + "learning_rate": 1.672869147659064e-05, + "loss": 0.5157, + "step": 16643 + }, + { + "epoch": 21.30432, + "grad_norm": 1.101754903793335, + "learning_rate": 1.672669067627051e-05, + "loss": 0.4957, + "step": 16644 + }, + { + "epoch": 21.3056, + "grad_norm": 1.1254335641860962, + "learning_rate": 1.672468987595038e-05, + "loss": 0.4955, + "step": 
16645 + }, + { + "epoch": 21.30688, + "grad_norm": 1.1134346723556519, + "learning_rate": 1.6722689075630252e-05, + "loss": 0.4501, + "step": 16646 + }, + { + "epoch": 21.30816, + "grad_norm": 1.1738286018371582, + "learning_rate": 1.6720688275310127e-05, + "loss": 0.4818, + "step": 16647 + }, + { + "epoch": 21.30944, + "grad_norm": 1.0866729021072388, + "learning_rate": 1.6718687474989996e-05, + "loss": 0.4542, + "step": 16648 + }, + { + "epoch": 21.31072, + "grad_norm": 1.1546677350997925, + "learning_rate": 1.6716686674669868e-05, + "loss": 0.4684, + "step": 16649 + }, + { + "epoch": 21.312, + "grad_norm": 1.077710747718811, + "learning_rate": 1.6714685874349743e-05, + "loss": 0.4649, + "step": 16650 + }, + { + "epoch": 21.31328, + "grad_norm": 1.1036674976348877, + "learning_rate": 1.6712685074029615e-05, + "loss": 0.498, + "step": 16651 + }, + { + "epoch": 21.31456, + "grad_norm": 1.1394011974334717, + "learning_rate": 1.6710684273709483e-05, + "loss": 0.4669, + "step": 16652 + }, + { + "epoch": 21.31584, + "grad_norm": 1.1064188480377197, + "learning_rate": 1.6708683473389355e-05, + "loss": 0.4772, + "step": 16653 + }, + { + "epoch": 21.31712, + "grad_norm": 1.1202365159988403, + "learning_rate": 1.670668267306923e-05, + "loss": 0.4958, + "step": 16654 + }, + { + "epoch": 21.3184, + "grad_norm": 1.156451940536499, + "learning_rate": 1.6704681872749102e-05, + "loss": 0.5121, + "step": 16655 + }, + { + "epoch": 21.31968, + "grad_norm": 1.1051130294799805, + "learning_rate": 1.670268107242897e-05, + "loss": 0.5077, + "step": 16656 + }, + { + "epoch": 21.32096, + "grad_norm": 1.0539714097976685, + "learning_rate": 1.6700680272108846e-05, + "loss": 0.4521, + "step": 16657 + }, + { + "epoch": 21.32224, + "grad_norm": 1.1256710290908813, + "learning_rate": 1.6698679471788718e-05, + "loss": 0.4814, + "step": 16658 + }, + { + "epoch": 21.32352, + "grad_norm": 1.1350125074386597, + "learning_rate": 1.669667867146859e-05, + "loss": 0.509, + "step": 16659 + }, + { + 
"epoch": 21.3248, + "grad_norm": 0.9891694784164429, + "learning_rate": 1.6694677871148458e-05, + "loss": 0.4096, + "step": 16660 + }, + { + "epoch": 21.32608, + "grad_norm": 1.1129761934280396, + "learning_rate": 1.6692677070828333e-05, + "loss": 0.4794, + "step": 16661 + }, + { + "epoch": 21.32736, + "grad_norm": 1.0066285133361816, + "learning_rate": 1.6690676270508205e-05, + "loss": 0.4442, + "step": 16662 + }, + { + "epoch": 21.32864, + "grad_norm": 1.067672848701477, + "learning_rate": 1.6688675470188077e-05, + "loss": 0.4584, + "step": 16663 + }, + { + "epoch": 21.32992, + "grad_norm": 1.1201221942901611, + "learning_rate": 1.668667466986795e-05, + "loss": 0.4726, + "step": 16664 + }, + { + "epoch": 21.3312, + "grad_norm": 1.104931116104126, + "learning_rate": 1.668467386954782e-05, + "loss": 0.4799, + "step": 16665 + }, + { + "epoch": 21.33248, + "grad_norm": 1.0619887113571167, + "learning_rate": 1.6682673069227693e-05, + "loss": 0.4639, + "step": 16666 + }, + { + "epoch": 21.33376, + "grad_norm": 1.1233429908752441, + "learning_rate": 1.6680672268907564e-05, + "loss": 0.4904, + "step": 16667 + }, + { + "epoch": 21.33504, + "grad_norm": 1.137324333190918, + "learning_rate": 1.6678671468587436e-05, + "loss": 0.4927, + "step": 16668 + }, + { + "epoch": 21.33632, + "grad_norm": 1.0907856225967407, + "learning_rate": 1.6676670668267308e-05, + "loss": 0.4698, + "step": 16669 + }, + { + "epoch": 21.3376, + "grad_norm": 1.1330994367599487, + "learning_rate": 1.667466986794718e-05, + "loss": 0.5419, + "step": 16670 + }, + { + "epoch": 21.33888, + "grad_norm": 1.1534572839736938, + "learning_rate": 1.6672669067627052e-05, + "loss": 0.4913, + "step": 16671 + }, + { + "epoch": 21.34016, + "grad_norm": 1.070062279701233, + "learning_rate": 1.6670668267306924e-05, + "loss": 0.4565, + "step": 16672 + }, + { + "epoch": 21.34144, + "grad_norm": 1.0346976518630981, + "learning_rate": 1.6668667466986796e-05, + "loss": 0.4802, + "step": 16673 + }, + { + "epoch": 21.34272, + 
"grad_norm": 1.0610109567642212, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.4705, + "step": 16674 + }, + { + "epoch": 21.344, + "grad_norm": 1.103748083114624, + "learning_rate": 1.666466586634654e-05, + "loss": 0.4516, + "step": 16675 + }, + { + "epoch": 21.34528, + "grad_norm": 1.1500375270843506, + "learning_rate": 1.666266506602641e-05, + "loss": 0.5045, + "step": 16676 + }, + { + "epoch": 21.34656, + "grad_norm": 1.0690054893493652, + "learning_rate": 1.6660664265706283e-05, + "loss": 0.4846, + "step": 16677 + }, + { + "epoch": 21.34784, + "grad_norm": 1.1877983808517456, + "learning_rate": 1.6658663465386155e-05, + "loss": 0.5534, + "step": 16678 + }, + { + "epoch": 21.34912, + "grad_norm": 1.092886209487915, + "learning_rate": 1.6656662665066027e-05, + "loss": 0.4751, + "step": 16679 + }, + { + "epoch": 21.3504, + "grad_norm": 1.1173007488250732, + "learning_rate": 1.66546618647459e-05, + "loss": 0.5033, + "step": 16680 + }, + { + "epoch": 21.35168, + "grad_norm": 1.1595876216888428, + "learning_rate": 1.665266106442577e-05, + "loss": 0.5458, + "step": 16681 + }, + { + "epoch": 21.35296, + "grad_norm": 1.059984803199768, + "learning_rate": 1.6650660264105646e-05, + "loss": 0.4982, + "step": 16682 + }, + { + "epoch": 21.35424, + "grad_norm": 1.0998493432998657, + "learning_rate": 1.6648659463785514e-05, + "loss": 0.5241, + "step": 16683 + }, + { + "epoch": 21.35552, + "grad_norm": 1.1519036293029785, + "learning_rate": 1.6646658663465386e-05, + "loss": 0.5199, + "step": 16684 + }, + { + "epoch": 21.3568, + "grad_norm": 1.141817331314087, + "learning_rate": 1.6644657863145258e-05, + "loss": 0.5133, + "step": 16685 + }, + { + "epoch": 21.35808, + "grad_norm": 1.0783271789550781, + "learning_rate": 1.6642657062825133e-05, + "loss": 0.4728, + "step": 16686 + }, + { + "epoch": 21.35936, + "grad_norm": 1.1302711963653564, + "learning_rate": 1.6640656262505e-05, + "loss": 0.5232, + "step": 16687 + }, + { + "epoch": 21.36064, + "grad_norm": 
1.1916934251785278, + "learning_rate": 1.6638655462184873e-05, + "loss": 0.5078, + "step": 16688 + }, + { + "epoch": 21.36192, + "grad_norm": 1.1410053968429565, + "learning_rate": 1.663665466186475e-05, + "loss": 0.4956, + "step": 16689 + }, + { + "epoch": 21.3632, + "grad_norm": 1.1169198751449585, + "learning_rate": 1.663465386154462e-05, + "loss": 0.4981, + "step": 16690 + }, + { + "epoch": 21.36448, + "grad_norm": 1.0799909830093384, + "learning_rate": 1.663265306122449e-05, + "loss": 0.4978, + "step": 16691 + }, + { + "epoch": 21.36576, + "grad_norm": 1.1033358573913574, + "learning_rate": 1.663065226090436e-05, + "loss": 0.4631, + "step": 16692 + }, + { + "epoch": 21.36704, + "grad_norm": 1.146831750869751, + "learning_rate": 1.6628651460584236e-05, + "loss": 0.4868, + "step": 16693 + }, + { + "epoch": 21.36832, + "grad_norm": 1.1381210088729858, + "learning_rate": 1.6626650660264108e-05, + "loss": 0.4916, + "step": 16694 + }, + { + "epoch": 21.3696, + "grad_norm": 1.1156655550003052, + "learning_rate": 1.6624649859943976e-05, + "loss": 0.4718, + "step": 16695 + }, + { + "epoch": 21.37088, + "grad_norm": 1.1188489198684692, + "learning_rate": 1.6622649059623852e-05, + "loss": 0.481, + "step": 16696 + }, + { + "epoch": 21.37216, + "grad_norm": 1.0974246263504028, + "learning_rate": 1.6620648259303724e-05, + "loss": 0.4989, + "step": 16697 + }, + { + "epoch": 21.37344, + "grad_norm": 1.2472598552703857, + "learning_rate": 1.6618647458983595e-05, + "loss": 0.621, + "step": 16698 + }, + { + "epoch": 21.37472, + "grad_norm": 1.074589490890503, + "learning_rate": 1.6616646658663464e-05, + "loss": 0.4802, + "step": 16699 + }, + { + "epoch": 21.376, + "grad_norm": 1.0904700756072998, + "learning_rate": 1.661464585834334e-05, + "loss": 0.4855, + "step": 16700 + }, + { + "epoch": 21.37728, + "grad_norm": 1.093536376953125, + "learning_rate": 1.661264505802321e-05, + "loss": 0.4616, + "step": 16701 + }, + { + "epoch": 21.37856, + "grad_norm": 1.0414737462997437, + 
"learning_rate": 1.6610644257703083e-05, + "loss": 0.4223, + "step": 16702 + }, + { + "epoch": 21.37984, + "grad_norm": 1.1210976839065552, + "learning_rate": 1.6608643457382955e-05, + "loss": 0.5263, + "step": 16703 + }, + { + "epoch": 21.38112, + "grad_norm": 1.0790395736694336, + "learning_rate": 1.6606642657062827e-05, + "loss": 0.4368, + "step": 16704 + }, + { + "epoch": 21.3824, + "grad_norm": 1.146754503250122, + "learning_rate": 1.66046418567427e-05, + "loss": 0.501, + "step": 16705 + }, + { + "epoch": 21.38368, + "grad_norm": 1.1394240856170654, + "learning_rate": 1.660264105642257e-05, + "loss": 0.53, + "step": 16706 + }, + { + "epoch": 21.38496, + "grad_norm": 1.1505063772201538, + "learning_rate": 1.6600640256102442e-05, + "loss": 0.4901, + "step": 16707 + }, + { + "epoch": 21.38624, + "grad_norm": 1.1096645593643188, + "learning_rate": 1.6598639455782314e-05, + "loss": 0.4851, + "step": 16708 + }, + { + "epoch": 21.38752, + "grad_norm": 1.1275689601898193, + "learning_rate": 1.6596638655462186e-05, + "loss": 0.5176, + "step": 16709 + }, + { + "epoch": 21.3888, + "grad_norm": 1.1523791551589966, + "learning_rate": 1.6594637855142058e-05, + "loss": 0.5758, + "step": 16710 + }, + { + "epoch": 21.39008, + "grad_norm": 1.1662170886993408, + "learning_rate": 1.659263705482193e-05, + "loss": 0.5227, + "step": 16711 + }, + { + "epoch": 21.39136, + "grad_norm": 1.178915023803711, + "learning_rate": 1.65906362545018e-05, + "loss": 0.4966, + "step": 16712 + }, + { + "epoch": 21.39264, + "grad_norm": 1.153525471687317, + "learning_rate": 1.6588635454181673e-05, + "loss": 0.509, + "step": 16713 + }, + { + "epoch": 21.39392, + "grad_norm": 1.0840404033660889, + "learning_rate": 1.6586634653861545e-05, + "loss": 0.45, + "step": 16714 + }, + { + "epoch": 21.3952, + "grad_norm": 1.0108500719070435, + "learning_rate": 1.6584633853541417e-05, + "loss": 0.4983, + "step": 16715 + }, + { + "epoch": 21.39648, + "grad_norm": 1.0287790298461914, + "learning_rate": 
1.658263305322129e-05, + "loss": 0.447, + "step": 16716 + }, + { + "epoch": 21.39776, + "grad_norm": 1.1262785196304321, + "learning_rate": 1.6580632252901164e-05, + "loss": 0.5199, + "step": 16717 + }, + { + "epoch": 21.39904, + "grad_norm": 1.1663216352462769, + "learning_rate": 1.6578631452581033e-05, + "loss": 0.5389, + "step": 16718 + }, + { + "epoch": 21.40032, + "grad_norm": 1.050490379333496, + "learning_rate": 1.6576630652260904e-05, + "loss": 0.4721, + "step": 16719 + }, + { + "epoch": 21.4016, + "grad_norm": 1.0672640800476074, + "learning_rate": 1.6574629851940776e-05, + "loss": 0.4858, + "step": 16720 + }, + { + "epoch": 21.40288, + "grad_norm": 1.0625916719436646, + "learning_rate": 1.657262905162065e-05, + "loss": 0.4438, + "step": 16721 + }, + { + "epoch": 21.40416, + "grad_norm": 1.169717788696289, + "learning_rate": 1.657062825130052e-05, + "loss": 0.4998, + "step": 16722 + }, + { + "epoch": 21.40544, + "grad_norm": 1.1702299118041992, + "learning_rate": 1.6568627450980392e-05, + "loss": 0.4903, + "step": 16723 + }, + { + "epoch": 21.40672, + "grad_norm": 1.062159538269043, + "learning_rate": 1.6566626650660264e-05, + "loss": 0.4464, + "step": 16724 + }, + { + "epoch": 21.408, + "grad_norm": 1.1465506553649902, + "learning_rate": 1.656462585034014e-05, + "loss": 0.511, + "step": 16725 + }, + { + "epoch": 21.40928, + "grad_norm": 1.079226016998291, + "learning_rate": 1.6562625050020007e-05, + "loss": 0.4868, + "step": 16726 + }, + { + "epoch": 21.41056, + "grad_norm": 1.0724486112594604, + "learning_rate": 1.656062424969988e-05, + "loss": 0.4862, + "step": 16727 + }, + { + "epoch": 21.41184, + "grad_norm": 1.142055869102478, + "learning_rate": 1.6558623449379755e-05, + "loss": 0.5397, + "step": 16728 + }, + { + "epoch": 21.41312, + "grad_norm": 1.0549299716949463, + "learning_rate": 1.6556622649059626e-05, + "loss": 0.44, + "step": 16729 + }, + { + "epoch": 21.4144, + "grad_norm": 1.177176833152771, + "learning_rate": 1.6554621848739495e-05, + 
"loss": 0.5188, + "step": 16730 + }, + { + "epoch": 21.41568, + "grad_norm": 1.174822449684143, + "learning_rate": 1.6552621048419367e-05, + "loss": 0.502, + "step": 16731 + }, + { + "epoch": 21.41696, + "grad_norm": 1.1037957668304443, + "learning_rate": 1.6550620248099242e-05, + "loss": 0.486, + "step": 16732 + }, + { + "epoch": 21.41824, + "grad_norm": 1.1217528581619263, + "learning_rate": 1.6548619447779114e-05, + "loss": 0.4969, + "step": 16733 + }, + { + "epoch": 21.41952, + "grad_norm": 1.146602749824524, + "learning_rate": 1.6546618647458982e-05, + "loss": 0.4549, + "step": 16734 + }, + { + "epoch": 21.4208, + "grad_norm": 1.062936782836914, + "learning_rate": 1.6544617847138858e-05, + "loss": 0.467, + "step": 16735 + }, + { + "epoch": 21.42208, + "grad_norm": 1.1040713787078857, + "learning_rate": 1.654261704681873e-05, + "loss": 0.4766, + "step": 16736 + }, + { + "epoch": 21.42336, + "grad_norm": 1.055647373199463, + "learning_rate": 1.65406162464986e-05, + "loss": 0.4662, + "step": 16737 + }, + { + "epoch": 21.42464, + "grad_norm": 1.1249991655349731, + "learning_rate": 1.653861544617847e-05, + "loss": 0.5188, + "step": 16738 + }, + { + "epoch": 21.42592, + "grad_norm": 1.0827454328536987, + "learning_rate": 1.6536614645858345e-05, + "loss": 0.4846, + "step": 16739 + }, + { + "epoch": 21.4272, + "grad_norm": 1.1406104564666748, + "learning_rate": 1.6534613845538217e-05, + "loss": 0.5539, + "step": 16740 + }, + { + "epoch": 21.42848, + "grad_norm": 1.1139311790466309, + "learning_rate": 1.653261304521809e-05, + "loss": 0.4687, + "step": 16741 + }, + { + "epoch": 21.42976, + "grad_norm": 1.1412529945373535, + "learning_rate": 1.653061224489796e-05, + "loss": 0.5136, + "step": 16742 + }, + { + "epoch": 21.43104, + "grad_norm": 1.1076371669769287, + "learning_rate": 1.6528611444577832e-05, + "loss": 0.4687, + "step": 16743 + }, + { + "epoch": 21.43232, + "grad_norm": 1.0816609859466553, + "learning_rate": 1.6526610644257704e-05, + "loss": 0.4853, + "step": 
16744 + }, + { + "epoch": 21.4336, + "grad_norm": 1.0708239078521729, + "learning_rate": 1.6524609843937576e-05, + "loss": 0.4779, + "step": 16745 + }, + { + "epoch": 21.43488, + "grad_norm": 1.1649770736694336, + "learning_rate": 1.6522609043617448e-05, + "loss": 0.498, + "step": 16746 + }, + { + "epoch": 21.43616, + "grad_norm": 1.1269488334655762, + "learning_rate": 1.652060824329732e-05, + "loss": 0.4976, + "step": 16747 + }, + { + "epoch": 21.43744, + "grad_norm": 1.0875318050384521, + "learning_rate": 1.651860744297719e-05, + "loss": 0.514, + "step": 16748 + }, + { + "epoch": 21.43872, + "grad_norm": 1.0458475351333618, + "learning_rate": 1.6516606642657063e-05, + "loss": 0.4691, + "step": 16749 + }, + { + "epoch": 21.44, + "grad_norm": 1.1326485872268677, + "learning_rate": 1.6514605842336935e-05, + "loss": 0.4746, + "step": 16750 + }, + { + "epoch": 21.44128, + "grad_norm": 1.032772183418274, + "learning_rate": 1.6512605042016807e-05, + "loss": 0.4662, + "step": 16751 + }, + { + "epoch": 21.44256, + "grad_norm": 1.1388492584228516, + "learning_rate": 1.651060424169668e-05, + "loss": 0.49, + "step": 16752 + }, + { + "epoch": 21.44384, + "grad_norm": 1.1491458415985107, + "learning_rate": 1.650860344137655e-05, + "loss": 0.486, + "step": 16753 + }, + { + "epoch": 21.44512, + "grad_norm": 1.0875811576843262, + "learning_rate": 1.6506602641056423e-05, + "loss": 0.4755, + "step": 16754 + }, + { + "epoch": 21.4464, + "grad_norm": 1.056416630744934, + "learning_rate": 1.6504601840736295e-05, + "loss": 0.4494, + "step": 16755 + }, + { + "epoch": 21.44768, + "grad_norm": 1.0903964042663574, + "learning_rate": 1.650260104041617e-05, + "loss": 0.5011, + "step": 16756 + }, + { + "epoch": 21.44896, + "grad_norm": 1.0801723003387451, + "learning_rate": 1.650060024009604e-05, + "loss": 0.492, + "step": 16757 + }, + { + "epoch": 21.45024, + "grad_norm": 1.1389845609664917, + "learning_rate": 1.649859943977591e-05, + "loss": 0.51, + "step": 16758 + }, + { + "epoch": 
21.45152, + "grad_norm": 1.1262261867523193, + "learning_rate": 1.6496598639455782e-05, + "loss": 0.5022, + "step": 16759 + }, + { + "epoch": 21.4528, + "grad_norm": 1.123065710067749, + "learning_rate": 1.6494597839135657e-05, + "loss": 0.5005, + "step": 16760 + }, + { + "epoch": 21.45408, + "grad_norm": 1.099016785621643, + "learning_rate": 1.6492597038815526e-05, + "loss": 0.5023, + "step": 16761 + }, + { + "epoch": 21.45536, + "grad_norm": 1.1234606504440308, + "learning_rate": 1.6490596238495398e-05, + "loss": 0.4718, + "step": 16762 + }, + { + "epoch": 21.45664, + "grad_norm": 1.1033679246902466, + "learning_rate": 1.6488595438175273e-05, + "loss": 0.4908, + "step": 16763 + }, + { + "epoch": 21.45792, + "grad_norm": 1.0978657007217407, + "learning_rate": 1.6486594637855145e-05, + "loss": 0.5028, + "step": 16764 + }, + { + "epoch": 21.4592, + "grad_norm": 1.0921522378921509, + "learning_rate": 1.6484593837535013e-05, + "loss": 0.4639, + "step": 16765 + }, + { + "epoch": 21.46048, + "grad_norm": 1.0589871406555176, + "learning_rate": 1.6482593037214885e-05, + "loss": 0.4662, + "step": 16766 + }, + { + "epoch": 21.46176, + "grad_norm": 1.120967149734497, + "learning_rate": 1.648059223689476e-05, + "loss": 0.4901, + "step": 16767 + }, + { + "epoch": 21.46304, + "grad_norm": 1.1276214122772217, + "learning_rate": 1.6478591436574632e-05, + "loss": 0.4807, + "step": 16768 + }, + { + "epoch": 21.46432, + "grad_norm": 1.0669405460357666, + "learning_rate": 1.64765906362545e-05, + "loss": 0.4683, + "step": 16769 + }, + { + "epoch": 21.4656, + "grad_norm": 1.0860059261322021, + "learning_rate": 1.6474589835934376e-05, + "loss": 0.5079, + "step": 16770 + }, + { + "epoch": 21.46688, + "grad_norm": 1.073190689086914, + "learning_rate": 1.6472589035614248e-05, + "loss": 0.4723, + "step": 16771 + }, + { + "epoch": 21.46816, + "grad_norm": 1.1541026830673218, + "learning_rate": 1.647058823529412e-05, + "loss": 0.5063, + "step": 16772 + }, + { + "epoch": 21.46944, + 
"grad_norm": 1.0719937086105347, + "learning_rate": 1.6468587434973988e-05, + "loss": 0.4436, + "step": 16773 + }, + { + "epoch": 21.47072, + "grad_norm": 1.048323631286621, + "learning_rate": 1.6466586634653863e-05, + "loss": 0.439, + "step": 16774 + }, + { + "epoch": 21.472, + "grad_norm": 1.0535809993743896, + "learning_rate": 1.6464585834333735e-05, + "loss": 0.4863, + "step": 16775 + }, + { + "epoch": 21.47328, + "grad_norm": 1.0624300241470337, + "learning_rate": 1.6462585034013607e-05, + "loss": 0.5091, + "step": 16776 + }, + { + "epoch": 21.47456, + "grad_norm": 1.1307200193405151, + "learning_rate": 1.6460584233693475e-05, + "loss": 0.4733, + "step": 16777 + }, + { + "epoch": 21.47584, + "grad_norm": 1.1632894277572632, + "learning_rate": 1.645858343337335e-05, + "loss": 0.4962, + "step": 16778 + }, + { + "epoch": 21.47712, + "grad_norm": 1.1236180067062378, + "learning_rate": 1.6456582633053223e-05, + "loss": 0.4846, + "step": 16779 + }, + { + "epoch": 21.4784, + "grad_norm": 1.083125114440918, + "learning_rate": 1.6454581832733094e-05, + "loss": 0.4867, + "step": 16780 + }, + { + "epoch": 21.47968, + "grad_norm": 1.0312892198562622, + "learning_rate": 1.6452581032412966e-05, + "loss": 0.4648, + "step": 16781 + }, + { + "epoch": 21.48096, + "grad_norm": 1.0965737104415894, + "learning_rate": 1.6450580232092838e-05, + "loss": 0.4893, + "step": 16782 + }, + { + "epoch": 21.48224, + "grad_norm": 1.157957673072815, + "learning_rate": 1.644857943177271e-05, + "loss": 0.5055, + "step": 16783 + }, + { + "epoch": 21.48352, + "grad_norm": 1.0796724557876587, + "learning_rate": 1.6446578631452582e-05, + "loss": 0.4762, + "step": 16784 + }, + { + "epoch": 21.4848, + "grad_norm": 1.1314095258712769, + "learning_rate": 1.6444577831132454e-05, + "loss": 0.49, + "step": 16785 + }, + { + "epoch": 21.48608, + "grad_norm": 1.070644736289978, + "learning_rate": 1.6442577030812326e-05, + "loss": 0.4916, + "step": 16786 + }, + { + "epoch": 21.48736, + "grad_norm": 
1.101872205734253, + "learning_rate": 1.6440576230492197e-05, + "loss": 0.4969, + "step": 16787 + }, + { + "epoch": 21.48864, + "grad_norm": 1.0998544692993164, + "learning_rate": 1.643857543017207e-05, + "loss": 0.4433, + "step": 16788 + }, + { + "epoch": 21.48992, + "grad_norm": 1.1133356094360352, + "learning_rate": 1.643657462985194e-05, + "loss": 0.4838, + "step": 16789 + }, + { + "epoch": 21.4912, + "grad_norm": 1.0400280952453613, + "learning_rate": 1.6434573829531813e-05, + "loss": 0.4781, + "step": 16790 + }, + { + "epoch": 21.49248, + "grad_norm": 1.1187825202941895, + "learning_rate": 1.6432573029211685e-05, + "loss": 0.4502, + "step": 16791 + }, + { + "epoch": 21.49376, + "grad_norm": 1.09933340549469, + "learning_rate": 1.6430572228891557e-05, + "loss": 0.4945, + "step": 16792 + }, + { + "epoch": 21.49504, + "grad_norm": 1.0932108163833618, + "learning_rate": 1.642857142857143e-05, + "loss": 0.4643, + "step": 16793 + }, + { + "epoch": 21.49632, + "grad_norm": 1.1463568210601807, + "learning_rate": 1.64265706282513e-05, + "loss": 0.5326, + "step": 16794 + }, + { + "epoch": 21.4976, + "grad_norm": 1.0978707075119019, + "learning_rate": 1.6424569827931176e-05, + "loss": 0.5083, + "step": 16795 + }, + { + "epoch": 21.49888, + "grad_norm": 1.1023026704788208, + "learning_rate": 1.6422569027611044e-05, + "loss": 0.4388, + "step": 16796 + }, + { + "epoch": 21.50016, + "grad_norm": 1.119019865989685, + "learning_rate": 1.6420568227290916e-05, + "loss": 0.4728, + "step": 16797 + }, + { + "epoch": 21.50144, + "grad_norm": 1.1367695331573486, + "learning_rate": 1.6418567426970788e-05, + "loss": 0.4819, + "step": 16798 + }, + { + "epoch": 21.50272, + "grad_norm": 1.096197485923767, + "learning_rate": 1.6416566626650663e-05, + "loss": 0.459, + "step": 16799 + }, + { + "epoch": 21.504, + "grad_norm": 1.1106514930725098, + "learning_rate": 1.641456582633053e-05, + "loss": 0.4894, + "step": 16800 + }, + { + "epoch": 21.50528, + "grad_norm": 1.0969781875610352, + 
"learning_rate": 1.6412565026010403e-05, + "loss": 0.5059, + "step": 16801 + }, + { + "epoch": 21.50656, + "grad_norm": 1.158473014831543, + "learning_rate": 1.641056422569028e-05, + "loss": 0.4895, + "step": 16802 + }, + { + "epoch": 21.50784, + "grad_norm": 1.147180199623108, + "learning_rate": 1.640856342537015e-05, + "loss": 0.5291, + "step": 16803 + }, + { + "epoch": 21.50912, + "grad_norm": 1.0704094171524048, + "learning_rate": 1.640656262505002e-05, + "loss": 0.4578, + "step": 16804 + }, + { + "epoch": 21.5104, + "grad_norm": 1.1559474468231201, + "learning_rate": 1.640456182472989e-05, + "loss": 0.5084, + "step": 16805 + }, + { + "epoch": 21.51168, + "grad_norm": 1.14545738697052, + "learning_rate": 1.6402561024409766e-05, + "loss": 0.5353, + "step": 16806 + }, + { + "epoch": 21.51296, + "grad_norm": 1.1325570344924927, + "learning_rate": 1.6400560224089638e-05, + "loss": 0.4731, + "step": 16807 + }, + { + "epoch": 21.51424, + "grad_norm": 1.1258713006973267, + "learning_rate": 1.6398559423769506e-05, + "loss": 0.4792, + "step": 16808 + }, + { + "epoch": 21.51552, + "grad_norm": 1.1287716627120972, + "learning_rate": 1.639655862344938e-05, + "loss": 0.4661, + "step": 16809 + }, + { + "epoch": 21.5168, + "grad_norm": 1.1289795637130737, + "learning_rate": 1.6394557823129254e-05, + "loss": 0.4774, + "step": 16810 + }, + { + "epoch": 21.51808, + "grad_norm": 1.1608408689498901, + "learning_rate": 1.6392557022809125e-05, + "loss": 0.4932, + "step": 16811 + }, + { + "epoch": 21.51936, + "grad_norm": 1.1531096696853638, + "learning_rate": 1.6390556222488994e-05, + "loss": 0.5028, + "step": 16812 + }, + { + "epoch": 21.52064, + "grad_norm": 1.1289916038513184, + "learning_rate": 1.638855542216887e-05, + "loss": 0.4894, + "step": 16813 + }, + { + "epoch": 21.52192, + "grad_norm": 1.131188988685608, + "learning_rate": 1.638655462184874e-05, + "loss": 0.5029, + "step": 16814 + }, + { + "epoch": 21.5232, + "grad_norm": 1.0534837245941162, + "learning_rate": 
1.6384553821528613e-05, + "loss": 0.4684, + "step": 16815 + }, + { + "epoch": 21.52448, + "grad_norm": 1.1278927326202393, + "learning_rate": 1.6382553021208485e-05, + "loss": 0.5104, + "step": 16816 + }, + { + "epoch": 21.52576, + "grad_norm": 1.177605152130127, + "learning_rate": 1.6380552220888357e-05, + "loss": 0.4751, + "step": 16817 + }, + { + "epoch": 21.52704, + "grad_norm": 1.1548734903335571, + "learning_rate": 1.637855142056823e-05, + "loss": 0.502, + "step": 16818 + }, + { + "epoch": 21.52832, + "grad_norm": 1.145601749420166, + "learning_rate": 1.63765506202481e-05, + "loss": 0.5197, + "step": 16819 + }, + { + "epoch": 21.5296, + "grad_norm": 1.058839201927185, + "learning_rate": 1.6374549819927972e-05, + "loss": 0.4424, + "step": 16820 + }, + { + "epoch": 21.53088, + "grad_norm": 1.1241230964660645, + "learning_rate": 1.6372549019607844e-05, + "loss": 0.4831, + "step": 16821 + }, + { + "epoch": 21.53216, + "grad_norm": 1.130538821220398, + "learning_rate": 1.6370548219287716e-05, + "loss": 0.4936, + "step": 16822 + }, + { + "epoch": 21.53344, + "grad_norm": 1.1460503339767456, + "learning_rate": 1.6368547418967588e-05, + "loss": 0.4879, + "step": 16823 + }, + { + "epoch": 21.53472, + "grad_norm": 1.1377558708190918, + "learning_rate": 1.636654661864746e-05, + "loss": 0.5494, + "step": 16824 + }, + { + "epoch": 21.536, + "grad_norm": 1.1145014762878418, + "learning_rate": 1.636454581832733e-05, + "loss": 0.5045, + "step": 16825 + }, + { + "epoch": 21.53728, + "grad_norm": 1.0829366445541382, + "learning_rate": 1.6362545018007203e-05, + "loss": 0.4957, + "step": 16826 + }, + { + "epoch": 21.53856, + "grad_norm": 1.0551543235778809, + "learning_rate": 1.6360544217687075e-05, + "loss": 0.4979, + "step": 16827 + }, + { + "epoch": 21.53984, + "grad_norm": 1.0742313861846924, + "learning_rate": 1.6358543417366947e-05, + "loss": 0.5055, + "step": 16828 + }, + { + "epoch": 21.54112, + "grad_norm": 1.1106947660446167, + "learning_rate": 1.635654261704682e-05, + 
"loss": 0.5044, + "step": 16829 + }, + { + "epoch": 21.5424, + "grad_norm": 1.0783088207244873, + "learning_rate": 1.6354541816726694e-05, + "loss": 0.5078, + "step": 16830 + }, + { + "epoch": 21.54368, + "grad_norm": 1.1662565469741821, + "learning_rate": 1.6352541016406563e-05, + "loss": 0.5252, + "step": 16831 + }, + { + "epoch": 21.54496, + "grad_norm": 1.0940940380096436, + "learning_rate": 1.6350540216086434e-05, + "loss": 0.4984, + "step": 16832 + }, + { + "epoch": 21.54624, + "grad_norm": 1.0466233491897583, + "learning_rate": 1.6348539415766306e-05, + "loss": 0.498, + "step": 16833 + }, + { + "epoch": 21.54752, + "grad_norm": 1.097116470336914, + "learning_rate": 1.634653861544618e-05, + "loss": 0.4793, + "step": 16834 + }, + { + "epoch": 21.5488, + "grad_norm": 1.19340980052948, + "learning_rate": 1.634453781512605e-05, + "loss": 0.5476, + "step": 16835 + }, + { + "epoch": 21.55008, + "grad_norm": 1.0786470174789429, + "learning_rate": 1.6342537014805922e-05, + "loss": 0.4835, + "step": 16836 + }, + { + "epoch": 21.55136, + "grad_norm": 1.1250004768371582, + "learning_rate": 1.6340536214485794e-05, + "loss": 0.4834, + "step": 16837 + }, + { + "epoch": 21.55264, + "grad_norm": 1.1461224555969238, + "learning_rate": 1.633853541416567e-05, + "loss": 0.5397, + "step": 16838 + }, + { + "epoch": 21.55392, + "grad_norm": 1.153669834136963, + "learning_rate": 1.6336534613845537e-05, + "loss": 0.5246, + "step": 16839 + }, + { + "epoch": 21.5552, + "grad_norm": 1.1360396146774292, + "learning_rate": 1.633453381352541e-05, + "loss": 0.4887, + "step": 16840 + }, + { + "epoch": 21.55648, + "grad_norm": 1.1669892072677612, + "learning_rate": 1.6332533013205284e-05, + "loss": 0.5377, + "step": 16841 + }, + { + "epoch": 21.557760000000002, + "grad_norm": 1.094748854637146, + "learning_rate": 1.6330532212885156e-05, + "loss": 0.4918, + "step": 16842 + }, + { + "epoch": 21.55904, + "grad_norm": 1.1243646144866943, + "learning_rate": 1.6328531412565025e-05, + "loss": 
0.4567, + "step": 16843 + }, + { + "epoch": 21.56032, + "grad_norm": 1.0712041854858398, + "learning_rate": 1.6326530612244897e-05, + "loss": 0.4915, + "step": 16844 + }, + { + "epoch": 21.5616, + "grad_norm": 1.2099251747131348, + "learning_rate": 1.6324529811924772e-05, + "loss": 0.5528, + "step": 16845 + }, + { + "epoch": 21.56288, + "grad_norm": 1.102573037147522, + "learning_rate": 1.6322529011604644e-05, + "loss": 0.4806, + "step": 16846 + }, + { + "epoch": 21.56416, + "grad_norm": 1.1252408027648926, + "learning_rate": 1.6320528211284512e-05, + "loss": 0.48, + "step": 16847 + }, + { + "epoch": 21.56544, + "grad_norm": 1.112750768661499, + "learning_rate": 1.6318527410964387e-05, + "loss": 0.5078, + "step": 16848 + }, + { + "epoch": 21.56672, + "grad_norm": 1.0940823554992676, + "learning_rate": 1.631652661064426e-05, + "loss": 0.499, + "step": 16849 + }, + { + "epoch": 21.568, + "grad_norm": 1.04874849319458, + "learning_rate": 1.631452581032413e-05, + "loss": 0.4649, + "step": 16850 + }, + { + "epoch": 21.56928, + "grad_norm": 1.0783095359802246, + "learning_rate": 1.6312525010004e-05, + "loss": 0.5148, + "step": 16851 + }, + { + "epoch": 21.57056, + "grad_norm": 1.1552876234054565, + "learning_rate": 1.6310524209683875e-05, + "loss": 0.5087, + "step": 16852 + }, + { + "epoch": 21.57184, + "grad_norm": 1.1094108819961548, + "learning_rate": 1.6308523409363747e-05, + "loss": 0.4625, + "step": 16853 + }, + { + "epoch": 21.57312, + "grad_norm": 1.1274285316467285, + "learning_rate": 1.630652260904362e-05, + "loss": 0.5366, + "step": 16854 + }, + { + "epoch": 21.5744, + "grad_norm": 1.1100611686706543, + "learning_rate": 1.630452180872349e-05, + "loss": 0.5105, + "step": 16855 + }, + { + "epoch": 21.57568, + "grad_norm": 1.108077883720398, + "learning_rate": 1.6302521008403362e-05, + "loss": 0.4801, + "step": 16856 + }, + { + "epoch": 21.57696, + "grad_norm": 1.1188905239105225, + "learning_rate": 1.6300520208083234e-05, + "loss": 0.4432, + "step": 16857 + }, + 
{ + "epoch": 21.57824, + "grad_norm": 1.2001028060913086, + "learning_rate": 1.6298519407763106e-05, + "loss": 0.5044, + "step": 16858 + }, + { + "epoch": 21.57952, + "grad_norm": 1.151809573173523, + "learning_rate": 1.6296518607442978e-05, + "loss": 0.4696, + "step": 16859 + }, + { + "epoch": 21.5808, + "grad_norm": 1.1924430131912231, + "learning_rate": 1.629451780712285e-05, + "loss": 0.5198, + "step": 16860 + }, + { + "epoch": 21.58208, + "grad_norm": 1.1216545104980469, + "learning_rate": 1.629251700680272e-05, + "loss": 0.4543, + "step": 16861 + }, + { + "epoch": 21.58336, + "grad_norm": 1.150597333908081, + "learning_rate": 1.6290516206482593e-05, + "loss": 0.5262, + "step": 16862 + }, + { + "epoch": 21.58464, + "grad_norm": 1.1352801322937012, + "learning_rate": 1.6288515406162465e-05, + "loss": 0.4916, + "step": 16863 + }, + { + "epoch": 21.58592, + "grad_norm": 1.0616695880889893, + "learning_rate": 1.6286514605842337e-05, + "loss": 0.4861, + "step": 16864 + }, + { + "epoch": 21.5872, + "grad_norm": 1.079972743988037, + "learning_rate": 1.628451380552221e-05, + "loss": 0.4737, + "step": 16865 + }, + { + "epoch": 21.58848, + "grad_norm": 1.0440356731414795, + "learning_rate": 1.628251300520208e-05, + "loss": 0.499, + "step": 16866 + }, + { + "epoch": 21.58976, + "grad_norm": 1.1359690427780151, + "learning_rate": 1.6280512204881953e-05, + "loss": 0.5054, + "step": 16867 + }, + { + "epoch": 21.59104, + "grad_norm": 1.1668392419815063, + "learning_rate": 1.6278511404561825e-05, + "loss": 0.5426, + "step": 16868 + }, + { + "epoch": 21.59232, + "grad_norm": 1.2051527500152588, + "learning_rate": 1.62765106042417e-05, + "loss": 0.4973, + "step": 16869 + }, + { + "epoch": 21.5936, + "grad_norm": 1.0257412195205688, + "learning_rate": 1.627450980392157e-05, + "loss": 0.4355, + "step": 16870 + }, + { + "epoch": 21.59488, + "grad_norm": 1.0660890340805054, + "learning_rate": 1.627250900360144e-05, + "loss": 0.4721, + "step": 16871 + }, + { + "epoch": 21.59616, + 
"grad_norm": 1.1143311262130737, + "learning_rate": 1.6270508203281312e-05, + "loss": 0.4979, + "step": 16872 + }, + { + "epoch": 21.59744, + "grad_norm": 1.1307166814804077, + "learning_rate": 1.6268507402961187e-05, + "loss": 0.516, + "step": 16873 + }, + { + "epoch": 21.59872, + "grad_norm": 1.0710972547531128, + "learning_rate": 1.6266506602641056e-05, + "loss": 0.4866, + "step": 16874 + }, + { + "epoch": 21.6, + "grad_norm": 1.1194899082183838, + "learning_rate": 1.6264505802320928e-05, + "loss": 0.4978, + "step": 16875 + }, + { + "epoch": 21.60128, + "grad_norm": 1.1724941730499268, + "learning_rate": 1.6262505002000803e-05, + "loss": 0.5422, + "step": 16876 + }, + { + "epoch": 21.60256, + "grad_norm": 1.1926066875457764, + "learning_rate": 1.6260504201680675e-05, + "loss": 0.5132, + "step": 16877 + }, + { + "epoch": 21.60384, + "grad_norm": 1.1342614889144897, + "learning_rate": 1.6258503401360543e-05, + "loss": 0.4642, + "step": 16878 + }, + { + "epoch": 21.60512, + "grad_norm": 1.0815118551254272, + "learning_rate": 1.6256502601040415e-05, + "loss": 0.4648, + "step": 16879 + }, + { + "epoch": 21.6064, + "grad_norm": 1.073204517364502, + "learning_rate": 1.625450180072029e-05, + "loss": 0.4942, + "step": 16880 + }, + { + "epoch": 21.60768, + "grad_norm": 1.1120681762695312, + "learning_rate": 1.6252501000400162e-05, + "loss": 0.5087, + "step": 16881 + }, + { + "epoch": 21.60896, + "grad_norm": 1.085679292678833, + "learning_rate": 1.625050020008003e-05, + "loss": 0.4773, + "step": 16882 + }, + { + "epoch": 21.61024, + "grad_norm": 1.0857542753219604, + "learning_rate": 1.6248499399759906e-05, + "loss": 0.4978, + "step": 16883 + }, + { + "epoch": 21.61152, + "grad_norm": 1.0734546184539795, + "learning_rate": 1.6246498599439778e-05, + "loss": 0.4487, + "step": 16884 + }, + { + "epoch": 21.6128, + "grad_norm": 1.1066250801086426, + "learning_rate": 1.624449779911965e-05, + "loss": 0.5063, + "step": 16885 + }, + { + "epoch": 21.61408, + "grad_norm": 
1.0745172500610352, + "learning_rate": 1.6242496998799518e-05, + "loss": 0.4654, + "step": 16886 + }, + { + "epoch": 21.61536, + "grad_norm": 1.1120097637176514, + "learning_rate": 1.6240496198479393e-05, + "loss": 0.4833, + "step": 16887 + }, + { + "epoch": 21.61664, + "grad_norm": 1.0820595026016235, + "learning_rate": 1.6238495398159265e-05, + "loss": 0.4938, + "step": 16888 + }, + { + "epoch": 21.61792, + "grad_norm": 1.0889438390731812, + "learning_rate": 1.6236494597839137e-05, + "loss": 0.4847, + "step": 16889 + }, + { + "epoch": 21.6192, + "grad_norm": 1.1894570589065552, + "learning_rate": 1.6234493797519005e-05, + "loss": 0.5127, + "step": 16890 + }, + { + "epoch": 21.62048, + "grad_norm": 1.123768925666809, + "learning_rate": 1.623249299719888e-05, + "loss": 0.4727, + "step": 16891 + }, + { + "epoch": 21.62176, + "grad_norm": 1.0649784803390503, + "learning_rate": 1.6230492196878753e-05, + "loss": 0.4568, + "step": 16892 + }, + { + "epoch": 21.62304, + "grad_norm": 1.1826279163360596, + "learning_rate": 1.6228491396558624e-05, + "loss": 0.4925, + "step": 16893 + }, + { + "epoch": 21.62432, + "grad_norm": 1.1905529499053955, + "learning_rate": 1.6226490596238496e-05, + "loss": 0.495, + "step": 16894 + }, + { + "epoch": 21.6256, + "grad_norm": 1.206997275352478, + "learning_rate": 1.6224489795918368e-05, + "loss": 0.4867, + "step": 16895 + }, + { + "epoch": 21.62688, + "grad_norm": 1.1951205730438232, + "learning_rate": 1.622248899559824e-05, + "loss": 0.5134, + "step": 16896 + }, + { + "epoch": 21.62816, + "grad_norm": 1.1452414989471436, + "learning_rate": 1.6220488195278112e-05, + "loss": 0.4927, + "step": 16897 + }, + { + "epoch": 21.62944, + "grad_norm": 1.1808103322982788, + "learning_rate": 1.6218487394957984e-05, + "loss": 0.5282, + "step": 16898 + }, + { + "epoch": 21.63072, + "grad_norm": 1.0460494756698608, + "learning_rate": 1.6216486594637856e-05, + "loss": 0.4496, + "step": 16899 + }, + { + "epoch": 21.632, + "grad_norm": 1.1484510898590088, 
+ "learning_rate": 1.6214485794317727e-05, + "loss": 0.4895, + "step": 16900 + }, + { + "epoch": 21.63328, + "grad_norm": 1.153800129890442, + "learning_rate": 1.62124849939976e-05, + "loss": 0.5, + "step": 16901 + }, + { + "epoch": 21.63456, + "grad_norm": 1.1488093137741089, + "learning_rate": 1.621048419367747e-05, + "loss": 0.5194, + "step": 16902 + }, + { + "epoch": 21.63584, + "grad_norm": 1.0916675329208374, + "learning_rate": 1.6208483393357343e-05, + "loss": 0.4648, + "step": 16903 + }, + { + "epoch": 21.63712, + "grad_norm": 1.0833429098129272, + "learning_rate": 1.6206482593037215e-05, + "loss": 0.4548, + "step": 16904 + }, + { + "epoch": 21.6384, + "grad_norm": 1.1267287731170654, + "learning_rate": 1.6204481792717087e-05, + "loss": 0.5134, + "step": 16905 + }, + { + "epoch": 21.63968, + "grad_norm": 1.2103078365325928, + "learning_rate": 1.620248099239696e-05, + "loss": 0.5598, + "step": 16906 + }, + { + "epoch": 21.64096, + "grad_norm": 1.0767333507537842, + "learning_rate": 1.620048019207683e-05, + "loss": 0.4599, + "step": 16907 + }, + { + "epoch": 21.64224, + "grad_norm": 1.12871515750885, + "learning_rate": 1.6198479391756706e-05, + "loss": 0.5094, + "step": 16908 + }, + { + "epoch": 21.64352, + "grad_norm": 1.0893502235412598, + "learning_rate": 1.6196478591436574e-05, + "loss": 0.4617, + "step": 16909 + }, + { + "epoch": 21.6448, + "grad_norm": 1.1467949151992798, + "learning_rate": 1.6194477791116446e-05, + "loss": 0.4717, + "step": 16910 + }, + { + "epoch": 21.64608, + "grad_norm": 1.151532530784607, + "learning_rate": 1.6192476990796318e-05, + "loss": 0.4958, + "step": 16911 + }, + { + "epoch": 21.64736, + "grad_norm": 1.151410698890686, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.5203, + "step": 16912 + }, + { + "epoch": 21.64864, + "grad_norm": 1.044360876083374, + "learning_rate": 1.618847539015606e-05, + "loss": 0.4547, + "step": 16913 + }, + { + "epoch": 21.64992, + "grad_norm": 1.1051994562149048, + "learning_rate": 
1.6186474589835933e-05, + "loss": 0.5285, + "step": 16914 + }, + { + "epoch": 21.6512, + "grad_norm": 1.0989066362380981, + "learning_rate": 1.618447378951581e-05, + "loss": 0.4682, + "step": 16915 + }, + { + "epoch": 21.65248, + "grad_norm": 1.1165852546691895, + "learning_rate": 1.618247298919568e-05, + "loss": 0.5312, + "step": 16916 + }, + { + "epoch": 21.65376, + "grad_norm": 1.1108812093734741, + "learning_rate": 1.618047218887555e-05, + "loss": 0.5191, + "step": 16917 + }, + { + "epoch": 21.65504, + "grad_norm": 1.1192561388015747, + "learning_rate": 1.617847138855542e-05, + "loss": 0.4527, + "step": 16918 + }, + { + "epoch": 21.65632, + "grad_norm": 1.137689232826233, + "learning_rate": 1.6176470588235296e-05, + "loss": 0.4527, + "step": 16919 + }, + { + "epoch": 21.6576, + "grad_norm": 1.1603665351867676, + "learning_rate": 1.6174469787915168e-05, + "loss": 0.5427, + "step": 16920 + }, + { + "epoch": 21.65888, + "grad_norm": 1.0487416982650757, + "learning_rate": 1.6172468987595036e-05, + "loss": 0.4543, + "step": 16921 + }, + { + "epoch": 21.66016, + "grad_norm": 1.0956952571868896, + "learning_rate": 1.617046818727491e-05, + "loss": 0.4952, + "step": 16922 + }, + { + "epoch": 21.66144, + "grad_norm": 1.0673739910125732, + "learning_rate": 1.6168467386954784e-05, + "loss": 0.4378, + "step": 16923 + }, + { + "epoch": 21.66272, + "grad_norm": 1.1492853164672852, + "learning_rate": 1.6166466586634655e-05, + "loss": 0.5302, + "step": 16924 + }, + { + "epoch": 21.664, + "grad_norm": 1.058276653289795, + "learning_rate": 1.6164465786314524e-05, + "loss": 0.4846, + "step": 16925 + }, + { + "epoch": 21.66528, + "grad_norm": 1.1349074840545654, + "learning_rate": 1.61624649859944e-05, + "loss": 0.5074, + "step": 16926 + }, + { + "epoch": 21.66656, + "grad_norm": 1.1578072309494019, + "learning_rate": 1.616046418567427e-05, + "loss": 0.5537, + "step": 16927 + }, + { + "epoch": 21.667839999999998, + "grad_norm": 1.1258023977279663, + "learning_rate": 
1.6158463385354143e-05, + "loss": 0.5015, + "step": 16928 + }, + { + "epoch": 21.66912, + "grad_norm": 1.1199811697006226, + "learning_rate": 1.6156462585034015e-05, + "loss": 0.5215, + "step": 16929 + }, + { + "epoch": 21.6704, + "grad_norm": 1.0602338314056396, + "learning_rate": 1.6154461784713887e-05, + "loss": 0.4738, + "step": 16930 + }, + { + "epoch": 21.67168, + "grad_norm": 1.1654423475265503, + "learning_rate": 1.615246098439376e-05, + "loss": 0.5589, + "step": 16931 + }, + { + "epoch": 21.67296, + "grad_norm": 1.081931710243225, + "learning_rate": 1.615046018407363e-05, + "loss": 0.4466, + "step": 16932 + }, + { + "epoch": 21.67424, + "grad_norm": 1.1227881908416748, + "learning_rate": 1.6148459383753502e-05, + "loss": 0.5191, + "step": 16933 + }, + { + "epoch": 21.67552, + "grad_norm": 1.1291007995605469, + "learning_rate": 1.6146458583433374e-05, + "loss": 0.5003, + "step": 16934 + }, + { + "epoch": 21.6768, + "grad_norm": 1.0886553525924683, + "learning_rate": 1.6144457783113246e-05, + "loss": 0.4575, + "step": 16935 + }, + { + "epoch": 21.67808, + "grad_norm": 1.2142690420150757, + "learning_rate": 1.614245698279312e-05, + "loss": 0.5728, + "step": 16936 + }, + { + "epoch": 21.67936, + "grad_norm": 1.0875095129013062, + "learning_rate": 1.614045618247299e-05, + "loss": 0.4926, + "step": 16937 + }, + { + "epoch": 21.68064, + "grad_norm": 1.092563509941101, + "learning_rate": 1.613845538215286e-05, + "loss": 0.4254, + "step": 16938 + }, + { + "epoch": 21.68192, + "grad_norm": 1.0964988470077515, + "learning_rate": 1.6136454581832733e-05, + "loss": 0.4502, + "step": 16939 + }, + { + "epoch": 21.6832, + "grad_norm": 1.0522211790084839, + "learning_rate": 1.613445378151261e-05, + "loss": 0.4743, + "step": 16940 + }, + { + "epoch": 21.68448, + "grad_norm": 1.0800302028656006, + "learning_rate": 1.6132452981192477e-05, + "loss": 0.4934, + "step": 16941 + }, + { + "epoch": 21.68576, + "grad_norm": 1.1527082920074463, + "learning_rate": 1.613045218087235e-05, 
+ "loss": 0.5126, + "step": 16942 + }, + { + "epoch": 21.68704, + "grad_norm": 1.0669933557510376, + "learning_rate": 1.6128451380552224e-05, + "loss": 0.4885, + "step": 16943 + }, + { + "epoch": 21.68832, + "grad_norm": 1.1759413480758667, + "learning_rate": 1.6126450580232096e-05, + "loss": 0.5232, + "step": 16944 + }, + { + "epoch": 21.6896, + "grad_norm": 1.2183908224105835, + "learning_rate": 1.6124449779911964e-05, + "loss": 0.5372, + "step": 16945 + }, + { + "epoch": 21.69088, + "grad_norm": 1.1375067234039307, + "learning_rate": 1.6122448979591836e-05, + "loss": 0.5206, + "step": 16946 + }, + { + "epoch": 21.69216, + "grad_norm": 1.2400755882263184, + "learning_rate": 1.612044817927171e-05, + "loss": 0.5103, + "step": 16947 + }, + { + "epoch": 21.69344, + "grad_norm": 1.172790288925171, + "learning_rate": 1.6118447378951583e-05, + "loss": 0.4674, + "step": 16948 + }, + { + "epoch": 21.69472, + "grad_norm": 1.0851976871490479, + "learning_rate": 1.6116446578631452e-05, + "loss": 0.4606, + "step": 16949 + }, + { + "epoch": 21.696, + "grad_norm": 1.1035497188568115, + "learning_rate": 1.6114445778311324e-05, + "loss": 0.4504, + "step": 16950 + }, + { + "epoch": 21.69728, + "grad_norm": 1.1531620025634766, + "learning_rate": 1.61124449779912e-05, + "loss": 0.5273, + "step": 16951 + }, + { + "epoch": 21.69856, + "grad_norm": 1.0832749605178833, + "learning_rate": 1.611044417767107e-05, + "loss": 0.4824, + "step": 16952 + }, + { + "epoch": 21.699840000000002, + "grad_norm": 1.1482528448104858, + "learning_rate": 1.610844337735094e-05, + "loss": 0.5408, + "step": 16953 + }, + { + "epoch": 21.70112, + "grad_norm": 1.0921155214309692, + "learning_rate": 1.6106442577030814e-05, + "loss": 0.4788, + "step": 16954 + }, + { + "epoch": 21.7024, + "grad_norm": 1.0282621383666992, + "learning_rate": 1.6104441776710686e-05, + "loss": 0.422, + "step": 16955 + }, + { + "epoch": 21.70368, + "grad_norm": 1.1054280996322632, + "learning_rate": 1.6102440976390558e-05, + "loss": 
0.4924, + "step": 16956 + }, + { + "epoch": 21.70496, + "grad_norm": 1.1694878339767456, + "learning_rate": 1.6100440176070427e-05, + "loss": 0.5017, + "step": 16957 + }, + { + "epoch": 21.70624, + "grad_norm": 1.0920751094818115, + "learning_rate": 1.6098439375750302e-05, + "loss": 0.5053, + "step": 16958 + }, + { + "epoch": 21.70752, + "grad_norm": 1.0558879375457764, + "learning_rate": 1.6096438575430174e-05, + "loss": 0.4535, + "step": 16959 + }, + { + "epoch": 21.7088, + "grad_norm": 1.084283709526062, + "learning_rate": 1.6094437775110046e-05, + "loss": 0.4792, + "step": 16960 + }, + { + "epoch": 21.71008, + "grad_norm": 1.0939773321151733, + "learning_rate": 1.6092436974789917e-05, + "loss": 0.466, + "step": 16961 + }, + { + "epoch": 21.71136, + "grad_norm": 1.151667594909668, + "learning_rate": 1.609043617446979e-05, + "loss": 0.5377, + "step": 16962 + }, + { + "epoch": 21.71264, + "grad_norm": 1.173499345779419, + "learning_rate": 1.608843537414966e-05, + "loss": 0.5031, + "step": 16963 + }, + { + "epoch": 21.71392, + "grad_norm": 1.102555751800537, + "learning_rate": 1.6086434573829533e-05, + "loss": 0.457, + "step": 16964 + }, + { + "epoch": 21.7152, + "grad_norm": 1.1618655920028687, + "learning_rate": 1.6084433773509405e-05, + "loss": 0.513, + "step": 16965 + }, + { + "epoch": 21.71648, + "grad_norm": 1.0982638597488403, + "learning_rate": 1.6082432973189277e-05, + "loss": 0.4711, + "step": 16966 + }, + { + "epoch": 21.71776, + "grad_norm": 1.179809808731079, + "learning_rate": 1.608043217286915e-05, + "loss": 0.5369, + "step": 16967 + }, + { + "epoch": 21.71904, + "grad_norm": 1.1830394268035889, + "learning_rate": 1.607843137254902e-05, + "loss": 0.5041, + "step": 16968 + }, + { + "epoch": 21.72032, + "grad_norm": 1.1552484035491943, + "learning_rate": 1.6076430572228892e-05, + "loss": 0.5089, + "step": 16969 + }, + { + "epoch": 21.7216, + "grad_norm": 1.0561984777450562, + "learning_rate": 1.6074429771908764e-05, + "loss": 0.4571, + "step": 16970 + 
}, + { + "epoch": 21.72288, + "grad_norm": 1.2352055311203003, + "learning_rate": 1.6072428971588636e-05, + "loss": 0.5468, + "step": 16971 + }, + { + "epoch": 21.72416, + "grad_norm": 1.0787138938903809, + "learning_rate": 1.6070428171268508e-05, + "loss": 0.4551, + "step": 16972 + }, + { + "epoch": 21.72544, + "grad_norm": 1.0198187828063965, + "learning_rate": 1.606842737094838e-05, + "loss": 0.4421, + "step": 16973 + }, + { + "epoch": 21.72672, + "grad_norm": 1.1738202571868896, + "learning_rate": 1.606642657062825e-05, + "loss": 0.5602, + "step": 16974 + }, + { + "epoch": 21.728, + "grad_norm": 1.08427095413208, + "learning_rate": 1.6064425770308127e-05, + "loss": 0.5051, + "step": 16975 + }, + { + "epoch": 21.72928, + "grad_norm": 1.0601693391799927, + "learning_rate": 1.6062424969987995e-05, + "loss": 0.4876, + "step": 16976 + }, + { + "epoch": 21.73056, + "grad_norm": 1.016387701034546, + "learning_rate": 1.6060424169667867e-05, + "loss": 0.4689, + "step": 16977 + }, + { + "epoch": 21.73184, + "grad_norm": 1.1087530851364136, + "learning_rate": 1.605842336934774e-05, + "loss": 0.5092, + "step": 16978 + }, + { + "epoch": 21.73312, + "grad_norm": 1.0978829860687256, + "learning_rate": 1.6056422569027614e-05, + "loss": 0.4665, + "step": 16979 + }, + { + "epoch": 21.7344, + "grad_norm": 1.1613261699676514, + "learning_rate": 1.6054421768707483e-05, + "loss": 0.5061, + "step": 16980 + }, + { + "epoch": 21.73568, + "grad_norm": 1.1227811574935913, + "learning_rate": 1.6052420968387355e-05, + "loss": 0.5022, + "step": 16981 + }, + { + "epoch": 21.73696, + "grad_norm": 1.1268730163574219, + "learning_rate": 1.605042016806723e-05, + "loss": 0.4788, + "step": 16982 + }, + { + "epoch": 21.73824, + "grad_norm": 1.1465482711791992, + "learning_rate": 1.6048419367747102e-05, + "loss": 0.4893, + "step": 16983 + }, + { + "epoch": 21.73952, + "grad_norm": 1.1241455078125, + "learning_rate": 1.604641856742697e-05, + "loss": 0.5033, + "step": 16984 + }, + { + "epoch": 
21.7408, + "grad_norm": 1.1142101287841797, + "learning_rate": 1.6044417767106842e-05, + "loss": 0.5004, + "step": 16985 + }, + { + "epoch": 21.74208, + "grad_norm": 1.125550389289856, + "learning_rate": 1.6042416966786717e-05, + "loss": 0.5053, + "step": 16986 + }, + { + "epoch": 21.74336, + "grad_norm": 1.0979461669921875, + "learning_rate": 1.604041616646659e-05, + "loss": 0.514, + "step": 16987 + }, + { + "epoch": 21.74464, + "grad_norm": 1.0835514068603516, + "learning_rate": 1.6038415366146458e-05, + "loss": 0.4502, + "step": 16988 + }, + { + "epoch": 21.74592, + "grad_norm": 1.1370912790298462, + "learning_rate": 1.6036414565826333e-05, + "loss": 0.5205, + "step": 16989 + }, + { + "epoch": 21.7472, + "grad_norm": 1.076702356338501, + "learning_rate": 1.6034413765506205e-05, + "loss": 0.4769, + "step": 16990 + }, + { + "epoch": 21.74848, + "grad_norm": 1.1314976215362549, + "learning_rate": 1.6032412965186077e-05, + "loss": 0.473, + "step": 16991 + }, + { + "epoch": 21.74976, + "grad_norm": 1.1105777025222778, + "learning_rate": 1.6030412164865945e-05, + "loss": 0.5232, + "step": 16992 + }, + { + "epoch": 21.75104, + "grad_norm": 1.0831820964813232, + "learning_rate": 1.602841136454582e-05, + "loss": 0.4784, + "step": 16993 + }, + { + "epoch": 21.75232, + "grad_norm": 1.2100034952163696, + "learning_rate": 1.6026410564225692e-05, + "loss": 0.4977, + "step": 16994 + }, + { + "epoch": 21.7536, + "grad_norm": 1.1598615646362305, + "learning_rate": 1.6024409763905564e-05, + "loss": 0.5115, + "step": 16995 + }, + { + "epoch": 21.75488, + "grad_norm": 1.0910362005233765, + "learning_rate": 1.6022408963585436e-05, + "loss": 0.5047, + "step": 16996 + }, + { + "epoch": 21.75616, + "grad_norm": 1.12237548828125, + "learning_rate": 1.6020408163265308e-05, + "loss": 0.4748, + "step": 16997 + }, + { + "epoch": 21.75744, + "grad_norm": 1.175667643547058, + "learning_rate": 1.601840736294518e-05, + "loss": 0.5035, + "step": 16998 + }, + { + "epoch": 21.75872, + "grad_norm": 
1.1076536178588867, + "learning_rate": 1.601640656262505e-05, + "loss": 0.5181, + "step": 16999 + }, + { + "epoch": 21.76, + "grad_norm": 1.105377197265625, + "learning_rate": 1.6014405762304923e-05, + "loss": 0.4939, + "step": 17000 + }, + { + "epoch": 21.76128, + "grad_norm": 1.052820086479187, + "learning_rate": 1.6012404961984795e-05, + "loss": 0.4644, + "step": 17001 + }, + { + "epoch": 21.76256, + "grad_norm": 1.0277769565582275, + "learning_rate": 1.6010404161664667e-05, + "loss": 0.4438, + "step": 17002 + }, + { + "epoch": 21.76384, + "grad_norm": 1.1701537370681763, + "learning_rate": 1.600840336134454e-05, + "loss": 0.5511, + "step": 17003 + }, + { + "epoch": 21.76512, + "grad_norm": 1.1238141059875488, + "learning_rate": 1.600640256102441e-05, + "loss": 0.5343, + "step": 17004 + }, + { + "epoch": 21.7664, + "grad_norm": 1.1096354722976685, + "learning_rate": 1.6004401760704283e-05, + "loss": 0.576, + "step": 17005 + }, + { + "epoch": 21.76768, + "grad_norm": 1.1454472541809082, + "learning_rate": 1.6002400960384154e-05, + "loss": 0.5276, + "step": 17006 + }, + { + "epoch": 21.76896, + "grad_norm": 1.095155954360962, + "learning_rate": 1.6000400160064026e-05, + "loss": 0.4968, + "step": 17007 + }, + { + "epoch": 21.77024, + "grad_norm": 1.0994304418563843, + "learning_rate": 1.5998399359743898e-05, + "loss": 0.4959, + "step": 17008 + }, + { + "epoch": 21.77152, + "grad_norm": 1.117372989654541, + "learning_rate": 1.599639855942377e-05, + "loss": 0.5205, + "step": 17009 + }, + { + "epoch": 21.7728, + "grad_norm": 1.0571292638778687, + "learning_rate": 1.5994397759103642e-05, + "loss": 0.4693, + "step": 17010 + }, + { + "epoch": 21.77408, + "grad_norm": 1.0867185592651367, + "learning_rate": 1.5992396958783514e-05, + "loss": 0.4867, + "step": 17011 + }, + { + "epoch": 21.77536, + "grad_norm": 1.0914740562438965, + "learning_rate": 1.5990396158463386e-05, + "loss": 0.4593, + "step": 17012 + }, + { + "epoch": 21.77664, + "grad_norm": 1.1249064207077026, + 
"learning_rate": 1.5988395358143257e-05, + "loss": 0.4779, + "step": 17013 + }, + { + "epoch": 21.77792, + "grad_norm": 1.1130808591842651, + "learning_rate": 1.5986394557823133e-05, + "loss": 0.4954, + "step": 17014 + }, + { + "epoch": 21.7792, + "grad_norm": 1.0818630456924438, + "learning_rate": 1.5984393757503e-05, + "loss": 0.4842, + "step": 17015 + }, + { + "epoch": 21.78048, + "grad_norm": 1.1350693702697754, + "learning_rate": 1.5982392957182873e-05, + "loss": 0.4944, + "step": 17016 + }, + { + "epoch": 21.78176, + "grad_norm": 1.1261863708496094, + "learning_rate": 1.5980392156862745e-05, + "loss": 0.4952, + "step": 17017 + }, + { + "epoch": 21.78304, + "grad_norm": 1.1032289266586304, + "learning_rate": 1.597839135654262e-05, + "loss": 0.505, + "step": 17018 + }, + { + "epoch": 21.78432, + "grad_norm": 1.1199618577957153, + "learning_rate": 1.597639055622249e-05, + "loss": 0.4496, + "step": 17019 + }, + { + "epoch": 21.7856, + "grad_norm": 1.105373740196228, + "learning_rate": 1.597438975590236e-05, + "loss": 0.5072, + "step": 17020 + }, + { + "epoch": 21.78688, + "grad_norm": 1.0840733051300049, + "learning_rate": 1.5972388955582236e-05, + "loss": 0.5159, + "step": 17021 + }, + { + "epoch": 21.78816, + "grad_norm": 1.0092965364456177, + "learning_rate": 1.5970388155262107e-05, + "loss": 0.4299, + "step": 17022 + }, + { + "epoch": 21.78944, + "grad_norm": 1.0409196615219116, + "learning_rate": 1.5968387354941976e-05, + "loss": 0.4441, + "step": 17023 + }, + { + "epoch": 21.79072, + "grad_norm": 1.1237512826919556, + "learning_rate": 1.5966386554621848e-05, + "loss": 0.5225, + "step": 17024 + }, + { + "epoch": 21.792, + "grad_norm": 1.1367634534835815, + "learning_rate": 1.5964385754301723e-05, + "loss": 0.4826, + "step": 17025 + }, + { + "epoch": 21.79328, + "grad_norm": 1.146195650100708, + "learning_rate": 1.5962384953981595e-05, + "loss": 0.5016, + "step": 17026 + }, + { + "epoch": 21.79456, + "grad_norm": 1.0935816764831543, + "learning_rate": 
1.5960384153661463e-05, + "loss": 0.4626, + "step": 17027 + }, + { + "epoch": 21.79584, + "grad_norm": 1.169556975364685, + "learning_rate": 1.595838335334134e-05, + "loss": 0.5184, + "step": 17028 + }, + { + "epoch": 21.79712, + "grad_norm": 1.1683391332626343, + "learning_rate": 1.595638255302121e-05, + "loss": 0.5, + "step": 17029 + }, + { + "epoch": 21.7984, + "grad_norm": 1.1834429502487183, + "learning_rate": 1.5954381752701082e-05, + "loss": 0.5633, + "step": 17030 + }, + { + "epoch": 21.79968, + "grad_norm": 1.1966886520385742, + "learning_rate": 1.595238095238095e-05, + "loss": 0.5166, + "step": 17031 + }, + { + "epoch": 21.80096, + "grad_norm": 1.1592307090759277, + "learning_rate": 1.5950380152060826e-05, + "loss": 0.5007, + "step": 17032 + }, + { + "epoch": 21.80224, + "grad_norm": 1.0894125699996948, + "learning_rate": 1.5948379351740698e-05, + "loss": 0.4405, + "step": 17033 + }, + { + "epoch": 21.80352, + "grad_norm": 1.1028647422790527, + "learning_rate": 1.594637855142057e-05, + "loss": 0.4736, + "step": 17034 + }, + { + "epoch": 21.8048, + "grad_norm": 1.01607084274292, + "learning_rate": 1.594437775110044e-05, + "loss": 0.4253, + "step": 17035 + }, + { + "epoch": 21.80608, + "grad_norm": 1.1347633600234985, + "learning_rate": 1.5942376950780313e-05, + "loss": 0.5113, + "step": 17036 + }, + { + "epoch": 21.80736, + "grad_norm": 1.1249445676803589, + "learning_rate": 1.5940376150460185e-05, + "loss": 0.4994, + "step": 17037 + }, + { + "epoch": 21.80864, + "grad_norm": 1.192696452140808, + "learning_rate": 1.5938375350140057e-05, + "loss": 0.5374, + "step": 17038 + }, + { + "epoch": 21.809919999999998, + "grad_norm": 1.092063307762146, + "learning_rate": 1.593637454981993e-05, + "loss": 0.4376, + "step": 17039 + }, + { + "epoch": 21.8112, + "grad_norm": 1.0697550773620605, + "learning_rate": 1.59343737494998e-05, + "loss": 0.4639, + "step": 17040 + }, + { + "epoch": 21.81248, + "grad_norm": 1.066964864730835, + "learning_rate": 
1.5932372949179673e-05, + "loss": 0.4451, + "step": 17041 + }, + { + "epoch": 21.81376, + "grad_norm": 1.099763035774231, + "learning_rate": 1.5930372148859545e-05, + "loss": 0.4918, + "step": 17042 + }, + { + "epoch": 21.81504, + "grad_norm": 1.1521753072738647, + "learning_rate": 1.5928371348539416e-05, + "loss": 0.5071, + "step": 17043 + }, + { + "epoch": 21.81632, + "grad_norm": 1.0961254835128784, + "learning_rate": 1.592637054821929e-05, + "loss": 0.496, + "step": 17044 + }, + { + "epoch": 21.8176, + "grad_norm": 1.0640747547149658, + "learning_rate": 1.592436974789916e-05, + "loss": 0.4542, + "step": 17045 + }, + { + "epoch": 21.81888, + "grad_norm": 1.2090388536453247, + "learning_rate": 1.5922368947579032e-05, + "loss": 0.5355, + "step": 17046 + }, + { + "epoch": 21.82016, + "grad_norm": 1.0608830451965332, + "learning_rate": 1.5920368147258904e-05, + "loss": 0.4686, + "step": 17047 + }, + { + "epoch": 21.82144, + "grad_norm": 1.1006875038146973, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.5012, + "step": 17048 + }, + { + "epoch": 21.82272, + "grad_norm": 1.132010817527771, + "learning_rate": 1.591636654661865e-05, + "loss": 0.5102, + "step": 17049 + }, + { + "epoch": 21.824, + "grad_norm": 1.0770601034164429, + "learning_rate": 1.591436574629852e-05, + "loss": 0.4736, + "step": 17050 + }, + { + "epoch": 21.82528, + "grad_norm": 1.1132420301437378, + "learning_rate": 1.591236494597839e-05, + "loss": 0.512, + "step": 17051 + }, + { + "epoch": 21.82656, + "grad_norm": 1.1608811616897583, + "learning_rate": 1.5910364145658263e-05, + "loss": 0.4529, + "step": 17052 + }, + { + "epoch": 21.82784, + "grad_norm": 1.1335759162902832, + "learning_rate": 1.590836334533814e-05, + "loss": 0.501, + "step": 17053 + }, + { + "epoch": 21.82912, + "grad_norm": 1.0787115097045898, + "learning_rate": 1.5906362545018007e-05, + "loss": 0.478, + "step": 17054 + }, + { + "epoch": 21.8304, + "grad_norm": 1.0791386365890503, + "learning_rate": 1.590436174469788e-05, + 
"loss": 0.4745, + "step": 17055 + }, + { + "epoch": 21.83168, + "grad_norm": 1.0508241653442383, + "learning_rate": 1.590236094437775e-05, + "loss": 0.4423, + "step": 17056 + }, + { + "epoch": 21.83296, + "grad_norm": 1.061378836631775, + "learning_rate": 1.5900360144057626e-05, + "loss": 0.4637, + "step": 17057 + }, + { + "epoch": 21.83424, + "grad_norm": 1.1394723653793335, + "learning_rate": 1.5898359343737494e-05, + "loss": 0.4634, + "step": 17058 + }, + { + "epoch": 21.83552, + "grad_norm": 1.1414905786514282, + "learning_rate": 1.5896358543417366e-05, + "loss": 0.4838, + "step": 17059 + }, + { + "epoch": 21.8368, + "grad_norm": 1.127100944519043, + "learning_rate": 1.589435774309724e-05, + "loss": 0.5292, + "step": 17060 + }, + { + "epoch": 21.83808, + "grad_norm": 1.1181409358978271, + "learning_rate": 1.5892356942777113e-05, + "loss": 0.5023, + "step": 17061 + }, + { + "epoch": 21.83936, + "grad_norm": 1.1021265983581543, + "learning_rate": 1.5890356142456982e-05, + "loss": 0.4538, + "step": 17062 + }, + { + "epoch": 21.84064, + "grad_norm": 1.042770504951477, + "learning_rate": 1.5888355342136854e-05, + "loss": 0.4804, + "step": 17063 + }, + { + "epoch": 21.841920000000002, + "grad_norm": 1.1434727907180786, + "learning_rate": 1.588635454181673e-05, + "loss": 0.5127, + "step": 17064 + }, + { + "epoch": 21.8432, + "grad_norm": 1.111856460571289, + "learning_rate": 1.58843537414966e-05, + "loss": 0.5108, + "step": 17065 + }, + { + "epoch": 21.84448, + "grad_norm": 1.0827891826629639, + "learning_rate": 1.588235294117647e-05, + "loss": 0.4725, + "step": 17066 + }, + { + "epoch": 21.84576, + "grad_norm": 1.1117992401123047, + "learning_rate": 1.5880352140856344e-05, + "loss": 0.474, + "step": 17067 + }, + { + "epoch": 21.84704, + "grad_norm": 1.1064029932022095, + "learning_rate": 1.5878351340536216e-05, + "loss": 0.4948, + "step": 17068 + }, + { + "epoch": 21.84832, + "grad_norm": 1.0790382623672485, + "learning_rate": 1.5876350540216088e-05, + "loss": 
0.5079, + "step": 17069 + }, + { + "epoch": 21.8496, + "grad_norm": 1.1318869590759277, + "learning_rate": 1.5874349739895957e-05, + "loss": 0.5018, + "step": 17070 + }, + { + "epoch": 21.85088, + "grad_norm": 1.1790701150894165, + "learning_rate": 1.5872348939575832e-05, + "loss": 0.5168, + "step": 17071 + }, + { + "epoch": 21.85216, + "grad_norm": 1.1890596151351929, + "learning_rate": 1.5870348139255704e-05, + "loss": 0.5259, + "step": 17072 + }, + { + "epoch": 21.85344, + "grad_norm": 1.1023216247558594, + "learning_rate": 1.5868347338935576e-05, + "loss": 0.5027, + "step": 17073 + }, + { + "epoch": 21.85472, + "grad_norm": 1.2085516452789307, + "learning_rate": 1.5866346538615447e-05, + "loss": 0.5627, + "step": 17074 + }, + { + "epoch": 21.856, + "grad_norm": 1.0791484117507935, + "learning_rate": 1.586434573829532e-05, + "loss": 0.5077, + "step": 17075 + }, + { + "epoch": 21.85728, + "grad_norm": 1.1096769571304321, + "learning_rate": 1.586234493797519e-05, + "loss": 0.4913, + "step": 17076 + }, + { + "epoch": 21.85856, + "grad_norm": 1.1315524578094482, + "learning_rate": 1.5860344137655063e-05, + "loss": 0.4864, + "step": 17077 + }, + { + "epoch": 21.85984, + "grad_norm": 1.126545786857605, + "learning_rate": 1.5858343337334935e-05, + "loss": 0.4946, + "step": 17078 + }, + { + "epoch": 21.86112, + "grad_norm": 1.0645451545715332, + "learning_rate": 1.5856342537014807e-05, + "loss": 0.4946, + "step": 17079 + }, + { + "epoch": 21.8624, + "grad_norm": 1.1454330682754517, + "learning_rate": 1.585434173669468e-05, + "loss": 0.4943, + "step": 17080 + }, + { + "epoch": 21.86368, + "grad_norm": 1.156087040901184, + "learning_rate": 1.585234093637455e-05, + "loss": 0.493, + "step": 17081 + }, + { + "epoch": 21.86496, + "grad_norm": 1.093726634979248, + "learning_rate": 1.5850340136054422e-05, + "loss": 0.4829, + "step": 17082 + }, + { + "epoch": 21.86624, + "grad_norm": 1.1023703813552856, + "learning_rate": 1.5848339335734294e-05, + "loss": 0.5139, + "step": 17083 
+ }, + { + "epoch": 21.86752, + "grad_norm": 1.0602704286575317, + "learning_rate": 1.5846338535414166e-05, + "loss": 0.4675, + "step": 17084 + }, + { + "epoch": 21.8688, + "grad_norm": 1.1112686395645142, + "learning_rate": 1.5844337735094038e-05, + "loss": 0.4874, + "step": 17085 + }, + { + "epoch": 21.87008, + "grad_norm": 1.07467520236969, + "learning_rate": 1.584233693477391e-05, + "loss": 0.4669, + "step": 17086 + }, + { + "epoch": 21.87136, + "grad_norm": 1.1608061790466309, + "learning_rate": 1.584033613445378e-05, + "loss": 0.5248, + "step": 17087 + }, + { + "epoch": 21.87264, + "grad_norm": 1.1059867143630981, + "learning_rate": 1.5838335334133657e-05, + "loss": 0.5052, + "step": 17088 + }, + { + "epoch": 21.87392, + "grad_norm": 1.120044231414795, + "learning_rate": 1.5836334533813525e-05, + "loss": 0.4999, + "step": 17089 + }, + { + "epoch": 21.8752, + "grad_norm": 1.1224958896636963, + "learning_rate": 1.5834333733493397e-05, + "loss": 0.5023, + "step": 17090 + }, + { + "epoch": 21.87648, + "grad_norm": 1.1289787292480469, + "learning_rate": 1.583233293317327e-05, + "loss": 0.5192, + "step": 17091 + }, + { + "epoch": 21.87776, + "grad_norm": 1.0791914463043213, + "learning_rate": 1.5830332132853144e-05, + "loss": 0.4934, + "step": 17092 + }, + { + "epoch": 21.87904, + "grad_norm": 1.1275081634521484, + "learning_rate": 1.5828331332533013e-05, + "loss": 0.5088, + "step": 17093 + }, + { + "epoch": 21.88032, + "grad_norm": 1.1475367546081543, + "learning_rate": 1.5826330532212885e-05, + "loss": 0.5101, + "step": 17094 + }, + { + "epoch": 21.8816, + "grad_norm": 1.1276719570159912, + "learning_rate": 1.582432973189276e-05, + "loss": 0.5062, + "step": 17095 + }, + { + "epoch": 21.88288, + "grad_norm": 1.0605647563934326, + "learning_rate": 1.582232893157263e-05, + "loss": 0.465, + "step": 17096 + }, + { + "epoch": 21.88416, + "grad_norm": 1.1123402118682861, + "learning_rate": 1.58203281312525e-05, + "loss": 0.5084, + "step": 17097 + }, + { + "epoch": 
21.88544, + "grad_norm": 1.1372113227844238, + "learning_rate": 1.5818327330932372e-05, + "loss": 0.5107, + "step": 17098 + }, + { + "epoch": 21.88672, + "grad_norm": 1.104132890701294, + "learning_rate": 1.5816326530612247e-05, + "loss": 0.4793, + "step": 17099 + }, + { + "epoch": 21.888, + "grad_norm": 1.0645356178283691, + "learning_rate": 1.581432573029212e-05, + "loss": 0.4723, + "step": 17100 + }, + { + "epoch": 21.88928, + "grad_norm": 1.1151045560836792, + "learning_rate": 1.5812324929971988e-05, + "loss": 0.4642, + "step": 17101 + }, + { + "epoch": 21.89056, + "grad_norm": 1.1927440166473389, + "learning_rate": 1.5810324129651863e-05, + "loss": 0.5306, + "step": 17102 + }, + { + "epoch": 21.89184, + "grad_norm": 1.148725986480713, + "learning_rate": 1.5808323329331735e-05, + "loss": 0.5302, + "step": 17103 + }, + { + "epoch": 21.89312, + "grad_norm": 1.1260452270507812, + "learning_rate": 1.5806322529011607e-05, + "loss": 0.5168, + "step": 17104 + }, + { + "epoch": 21.8944, + "grad_norm": 1.1221392154693604, + "learning_rate": 1.5804321728691475e-05, + "loss": 0.5141, + "step": 17105 + }, + { + "epoch": 21.89568, + "grad_norm": 1.172025442123413, + "learning_rate": 1.580232092837135e-05, + "loss": 0.5186, + "step": 17106 + }, + { + "epoch": 21.89696, + "grad_norm": 1.1051148176193237, + "learning_rate": 1.5800320128051222e-05, + "loss": 0.4797, + "step": 17107 + }, + { + "epoch": 21.89824, + "grad_norm": 1.0419796705245972, + "learning_rate": 1.5798319327731094e-05, + "loss": 0.4483, + "step": 17108 + }, + { + "epoch": 21.89952, + "grad_norm": 1.0614566802978516, + "learning_rate": 1.5796318527410966e-05, + "loss": 0.4695, + "step": 17109 + }, + { + "epoch": 21.9008, + "grad_norm": 1.111011266708374, + "learning_rate": 1.5794317727090838e-05, + "loss": 0.4986, + "step": 17110 + }, + { + "epoch": 21.90208, + "grad_norm": 1.1020454168319702, + "learning_rate": 1.579231692677071e-05, + "loss": 0.5104, + "step": 17111 + }, + { + "epoch": 21.90336, + 
"grad_norm": 1.1246579885482788, + "learning_rate": 1.579031612645058e-05, + "loss": 0.5063, + "step": 17112 + }, + { + "epoch": 21.90464, + "grad_norm": 1.0644440650939941, + "learning_rate": 1.5788315326130453e-05, + "loss": 0.4876, + "step": 17113 + }, + { + "epoch": 21.90592, + "grad_norm": 1.0820109844207764, + "learning_rate": 1.5786314525810325e-05, + "loss": 0.5087, + "step": 17114 + }, + { + "epoch": 21.9072, + "grad_norm": 1.0998029708862305, + "learning_rate": 1.5784313725490197e-05, + "loss": 0.4956, + "step": 17115 + }, + { + "epoch": 21.90848, + "grad_norm": 1.1597471237182617, + "learning_rate": 1.578231292517007e-05, + "loss": 0.5432, + "step": 17116 + }, + { + "epoch": 21.90976, + "grad_norm": 1.1102598905563354, + "learning_rate": 1.578031212484994e-05, + "loss": 0.528, + "step": 17117 + }, + { + "epoch": 21.91104, + "grad_norm": 1.1272238492965698, + "learning_rate": 1.5778311324529813e-05, + "loss": 0.4563, + "step": 17118 + }, + { + "epoch": 21.91232, + "grad_norm": 1.0742859840393066, + "learning_rate": 1.5776310524209684e-05, + "loss": 0.4609, + "step": 17119 + }, + { + "epoch": 21.9136, + "grad_norm": 1.1245981454849243, + "learning_rate": 1.5774309723889556e-05, + "loss": 0.5284, + "step": 17120 + }, + { + "epoch": 21.91488, + "grad_norm": 1.1140938997268677, + "learning_rate": 1.5772308923569428e-05, + "loss": 0.4731, + "step": 17121 + }, + { + "epoch": 21.91616, + "grad_norm": 1.1234639883041382, + "learning_rate": 1.57703081232493e-05, + "loss": 0.4937, + "step": 17122 + }, + { + "epoch": 21.91744, + "grad_norm": 1.1589488983154297, + "learning_rate": 1.5768307322929172e-05, + "loss": 0.5186, + "step": 17123 + }, + { + "epoch": 21.91872, + "grad_norm": 1.1331297159194946, + "learning_rate": 1.5766306522609044e-05, + "loss": 0.4893, + "step": 17124 + }, + { + "epoch": 21.92, + "grad_norm": 1.1356956958770752, + "learning_rate": 1.5764305722288916e-05, + "loss": 0.4674, + "step": 17125 + }, + { + "epoch": 21.92128, + "grad_norm": 
1.0896925926208496, + "learning_rate": 1.5762304921968787e-05, + "loss": 0.462, + "step": 17126 + }, + { + "epoch": 21.92256, + "grad_norm": 1.1289901733398438, + "learning_rate": 1.5760304121648663e-05, + "loss": 0.5332, + "step": 17127 + }, + { + "epoch": 21.92384, + "grad_norm": 1.0392855405807495, + "learning_rate": 1.575830332132853e-05, + "loss": 0.4433, + "step": 17128 + }, + { + "epoch": 21.92512, + "grad_norm": 1.1507699489593506, + "learning_rate": 1.5756302521008403e-05, + "loss": 0.5408, + "step": 17129 + }, + { + "epoch": 21.9264, + "grad_norm": 1.0881316661834717, + "learning_rate": 1.5754301720688275e-05, + "loss": 0.5205, + "step": 17130 + }, + { + "epoch": 21.92768, + "grad_norm": 1.1345676183700562, + "learning_rate": 1.575230092036815e-05, + "loss": 0.4812, + "step": 17131 + }, + { + "epoch": 21.92896, + "grad_norm": 1.1051517724990845, + "learning_rate": 1.575030012004802e-05, + "loss": 0.5129, + "step": 17132 + }, + { + "epoch": 21.93024, + "grad_norm": 1.1058467626571655, + "learning_rate": 1.574829931972789e-05, + "loss": 0.4401, + "step": 17133 + }, + { + "epoch": 21.93152, + "grad_norm": 1.1275383234024048, + "learning_rate": 1.5746298519407766e-05, + "loss": 0.5036, + "step": 17134 + }, + { + "epoch": 21.9328, + "grad_norm": 1.1110889911651611, + "learning_rate": 1.5744297719087637e-05, + "loss": 0.4844, + "step": 17135 + }, + { + "epoch": 21.93408, + "grad_norm": 1.1405858993530273, + "learning_rate": 1.5742296918767506e-05, + "loss": 0.5322, + "step": 17136 + }, + { + "epoch": 21.93536, + "grad_norm": 1.1357598304748535, + "learning_rate": 1.5740296118447378e-05, + "loss": 0.4834, + "step": 17137 + }, + { + "epoch": 21.93664, + "grad_norm": 1.1561405658721924, + "learning_rate": 1.5738295318127253e-05, + "loss": 0.5107, + "step": 17138 + }, + { + "epoch": 21.93792, + "grad_norm": 1.1202070713043213, + "learning_rate": 1.5736294517807125e-05, + "loss": 0.5176, + "step": 17139 + }, + { + "epoch": 21.9392, + "grad_norm": 1.141701579093933, 
+ "learning_rate": 1.5734293717486993e-05, + "loss": 0.5304, + "step": 17140 + }, + { + "epoch": 21.94048, + "grad_norm": 1.0925021171569824, + "learning_rate": 1.573229291716687e-05, + "loss": 0.4811, + "step": 17141 + }, + { + "epoch": 21.94176, + "grad_norm": 1.147238850593567, + "learning_rate": 1.573029211684674e-05, + "loss": 0.5093, + "step": 17142 + }, + { + "epoch": 21.94304, + "grad_norm": 1.1315157413482666, + "learning_rate": 1.5728291316526612e-05, + "loss": 0.4744, + "step": 17143 + }, + { + "epoch": 21.94432, + "grad_norm": 1.1635738611221313, + "learning_rate": 1.572629051620648e-05, + "loss": 0.5085, + "step": 17144 + }, + { + "epoch": 21.9456, + "grad_norm": 1.1979985237121582, + "learning_rate": 1.5724289715886356e-05, + "loss": 0.5047, + "step": 17145 + }, + { + "epoch": 21.94688, + "grad_norm": 1.1460307836532593, + "learning_rate": 1.5722288915566228e-05, + "loss": 0.487, + "step": 17146 + }, + { + "epoch": 21.94816, + "grad_norm": 1.087184190750122, + "learning_rate": 1.57202881152461e-05, + "loss": 0.524, + "step": 17147 + }, + { + "epoch": 21.94944, + "grad_norm": 1.1512465476989746, + "learning_rate": 1.571828731492597e-05, + "loss": 0.4737, + "step": 17148 + }, + { + "epoch": 21.95072, + "grad_norm": 1.1650768518447876, + "learning_rate": 1.5716286514605843e-05, + "loss": 0.5084, + "step": 17149 + }, + { + "epoch": 21.951999999999998, + "grad_norm": 1.0953246355056763, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.5255, + "step": 17150 + }, + { + "epoch": 21.95328, + "grad_norm": 1.0524414777755737, + "learning_rate": 1.5712284913965587e-05, + "loss": 0.485, + "step": 17151 + }, + { + "epoch": 21.95456, + "grad_norm": 1.0316898822784424, + "learning_rate": 1.571028411364546e-05, + "loss": 0.4774, + "step": 17152 + }, + { + "epoch": 21.95584, + "grad_norm": 1.0071184635162354, + "learning_rate": 1.570828331332533e-05, + "loss": 0.4582, + "step": 17153 + }, + { + "epoch": 21.95712, + "grad_norm": 1.1721525192260742, + 
"learning_rate": 1.5706282513005203e-05, + "loss": 0.5407, + "step": 17154 + }, + { + "epoch": 21.9584, + "grad_norm": 1.0725557804107666, + "learning_rate": 1.5704281712685075e-05, + "loss": 0.4728, + "step": 17155 + }, + { + "epoch": 21.95968, + "grad_norm": 1.0535809993743896, + "learning_rate": 1.5702280912364946e-05, + "loss": 0.455, + "step": 17156 + }, + { + "epoch": 21.96096, + "grad_norm": 1.1127283573150635, + "learning_rate": 1.570028011204482e-05, + "loss": 0.5051, + "step": 17157 + }, + { + "epoch": 21.96224, + "grad_norm": 1.1096875667572021, + "learning_rate": 1.569827931172469e-05, + "loss": 0.4694, + "step": 17158 + }, + { + "epoch": 21.96352, + "grad_norm": 1.105588436126709, + "learning_rate": 1.5696278511404562e-05, + "loss": 0.4654, + "step": 17159 + }, + { + "epoch": 21.9648, + "grad_norm": 1.1018520593643188, + "learning_rate": 1.5694277711084434e-05, + "loss": 0.4964, + "step": 17160 + }, + { + "epoch": 21.96608, + "grad_norm": 1.1417279243469238, + "learning_rate": 1.5692276910764306e-05, + "loss": 0.5166, + "step": 17161 + }, + { + "epoch": 21.96736, + "grad_norm": 1.0698052644729614, + "learning_rate": 1.569027611044418e-05, + "loss": 0.5072, + "step": 17162 + }, + { + "epoch": 21.96864, + "grad_norm": 1.1088814735412598, + "learning_rate": 1.568827531012405e-05, + "loss": 0.5034, + "step": 17163 + }, + { + "epoch": 21.96992, + "grad_norm": 1.0650029182434082, + "learning_rate": 1.568627450980392e-05, + "loss": 0.4262, + "step": 17164 + }, + { + "epoch": 21.9712, + "grad_norm": 1.0559251308441162, + "learning_rate": 1.5684273709483793e-05, + "loss": 0.4375, + "step": 17165 + }, + { + "epoch": 21.97248, + "grad_norm": 1.107661485671997, + "learning_rate": 1.568227290916367e-05, + "loss": 0.4978, + "step": 17166 + }, + { + "epoch": 21.97376, + "grad_norm": 1.109234094619751, + "learning_rate": 1.5680272108843537e-05, + "loss": 0.5112, + "step": 17167 + }, + { + "epoch": 21.97504, + "grad_norm": 1.1099008321762085, + "learning_rate": 
1.567827130852341e-05, + "loss": 0.4588, + "step": 17168 + }, + { + "epoch": 21.97632, + "grad_norm": 1.1135979890823364, + "learning_rate": 1.567627050820328e-05, + "loss": 0.4801, + "step": 17169 + }, + { + "epoch": 21.9776, + "grad_norm": 1.1482584476470947, + "learning_rate": 1.5674269707883156e-05, + "loss": 0.4637, + "step": 17170 + }, + { + "epoch": 21.97888, + "grad_norm": 1.0247658491134644, + "learning_rate": 1.5672268907563024e-05, + "loss": 0.4485, + "step": 17171 + }, + { + "epoch": 21.98016, + "grad_norm": 1.0352832078933716, + "learning_rate": 1.5670268107242896e-05, + "loss": 0.4452, + "step": 17172 + }, + { + "epoch": 21.98144, + "grad_norm": 1.1640503406524658, + "learning_rate": 1.566826730692277e-05, + "loss": 0.5008, + "step": 17173 + }, + { + "epoch": 21.98272, + "grad_norm": 1.086647629737854, + "learning_rate": 1.5666266506602643e-05, + "loss": 0.4714, + "step": 17174 + }, + { + "epoch": 21.984, + "grad_norm": 1.1195262670516968, + "learning_rate": 1.5664265706282512e-05, + "loss": 0.5151, + "step": 17175 + }, + { + "epoch": 21.98528, + "grad_norm": 1.1734910011291504, + "learning_rate": 1.5662264905962384e-05, + "loss": 0.5262, + "step": 17176 + }, + { + "epoch": 21.98656, + "grad_norm": 1.1472947597503662, + "learning_rate": 1.566026410564226e-05, + "loss": 0.4849, + "step": 17177 + }, + { + "epoch": 21.98784, + "grad_norm": 1.114270806312561, + "learning_rate": 1.565826330532213e-05, + "loss": 0.5082, + "step": 17178 + }, + { + "epoch": 21.98912, + "grad_norm": 1.0618884563446045, + "learning_rate": 1.5656262505002e-05, + "loss": 0.4387, + "step": 17179 + }, + { + "epoch": 21.9904, + "grad_norm": 1.098699927330017, + "learning_rate": 1.5654261704681874e-05, + "loss": 0.5168, + "step": 17180 + }, + { + "epoch": 21.99168, + "grad_norm": 1.1803953647613525, + "learning_rate": 1.5652260904361746e-05, + "loss": 0.5424, + "step": 17181 + }, + { + "epoch": 21.99296, + "grad_norm": 1.1238080263137817, + "learning_rate": 1.5650260104041618e-05, + 
"loss": 0.4951, + "step": 17182 + }, + { + "epoch": 21.99424, + "grad_norm": 1.0754061937332153, + "learning_rate": 1.5648259303721487e-05, + "loss": 0.4725, + "step": 17183 + }, + { + "epoch": 21.99552, + "grad_norm": 1.0654562711715698, + "learning_rate": 1.5646258503401362e-05, + "loss": 0.486, + "step": 17184 + }, + { + "epoch": 21.9968, + "grad_norm": 1.1009260416030884, + "learning_rate": 1.5644257703081234e-05, + "loss": 0.5057, + "step": 17185 + }, + { + "epoch": 21.99808, + "grad_norm": 1.091380000114441, + "learning_rate": 1.5642256902761106e-05, + "loss": 0.4666, + "step": 17186 + }, + { + "epoch": 21.99936, + "grad_norm": 1.0999908447265625, + "learning_rate": 1.5640256102440977e-05, + "loss": 0.4635, + "step": 17187 + }, + { + "epoch": 22.00064, + "grad_norm": Infinity, + "learning_rate": 1.5640256102440977e-05, + "loss": 0.957, + "step": 17188 + }, + { + "epoch": 22.00192, + "grad_norm": 1.0367313623428345, + "learning_rate": 1.563825530212085e-05, + "loss": 0.4574, + "step": 17189 + }, + { + "epoch": 22.0032, + "grad_norm": 1.0084816217422485, + "learning_rate": 1.563625450180072e-05, + "loss": 0.4657, + "step": 17190 + }, + { + "epoch": 22.00448, + "grad_norm": 1.096815824508667, + "learning_rate": 1.5634253701480593e-05, + "loss": 0.4837, + "step": 17191 + }, + { + "epoch": 22.00576, + "grad_norm": 1.0867575407028198, + "learning_rate": 1.5632252901160465e-05, + "loss": 0.4679, + "step": 17192 + }, + { + "epoch": 22.00704, + "grad_norm": 1.0765024423599243, + "learning_rate": 1.5630252100840337e-05, + "loss": 0.4796, + "step": 17193 + }, + { + "epoch": 22.00832, + "grad_norm": 1.0676796436309814, + "learning_rate": 1.562825130052021e-05, + "loss": 0.4513, + "step": 17194 + }, + { + "epoch": 22.0096, + "grad_norm": 1.0853677988052368, + "learning_rate": 1.562625050020008e-05, + "loss": 0.4669, + "step": 17195 + }, + { + "epoch": 22.01088, + "grad_norm": 1.1179803609848022, + "learning_rate": 1.5624249699879952e-05, + "loss": 0.4737, + "step": 17196 
+ }, + { + "epoch": 22.01216, + "grad_norm": 1.1370209455490112, + "learning_rate": 1.5622248899559824e-05, + "loss": 0.4873, + "step": 17197 + }, + { + "epoch": 22.01344, + "grad_norm": 1.0936213731765747, + "learning_rate": 1.5620248099239696e-05, + "loss": 0.4791, + "step": 17198 + }, + { + "epoch": 22.01472, + "grad_norm": 1.1142305135726929, + "learning_rate": 1.5618247298919568e-05, + "loss": 0.4956, + "step": 17199 + }, + { + "epoch": 22.016, + "grad_norm": 1.0316848754882812, + "learning_rate": 1.561624649859944e-05, + "loss": 0.3966, + "step": 17200 + }, + { + "epoch": 22.01728, + "grad_norm": 1.1402348279953003, + "learning_rate": 1.561424569827931e-05, + "loss": 0.4791, + "step": 17201 + }, + { + "epoch": 22.01856, + "grad_norm": 1.1665503978729248, + "learning_rate": 1.5612244897959187e-05, + "loss": 0.5207, + "step": 17202 + }, + { + "epoch": 22.01984, + "grad_norm": 1.1318738460540771, + "learning_rate": 1.5610244097639055e-05, + "loss": 0.4978, + "step": 17203 + }, + { + "epoch": 22.02112, + "grad_norm": 1.0995848178863525, + "learning_rate": 1.5608243297318927e-05, + "loss": 0.4897, + "step": 17204 + }, + { + "epoch": 22.0224, + "grad_norm": 1.0910522937774658, + "learning_rate": 1.56062424969988e-05, + "loss": 0.4392, + "step": 17205 + }, + { + "epoch": 22.02368, + "grad_norm": 1.0444594621658325, + "learning_rate": 1.5604241696678674e-05, + "loss": 0.428, + "step": 17206 + }, + { + "epoch": 22.02496, + "grad_norm": 1.1223368644714355, + "learning_rate": 1.5602240896358543e-05, + "loss": 0.4473, + "step": 17207 + }, + { + "epoch": 22.02624, + "grad_norm": 1.035774827003479, + "learning_rate": 1.5600240096038415e-05, + "loss": 0.4201, + "step": 17208 + }, + { + "epoch": 22.02752, + "grad_norm": 1.155383586883545, + "learning_rate": 1.559823929571829e-05, + "loss": 0.4817, + "step": 17209 + }, + { + "epoch": 22.0288, + "grad_norm": 1.180153727531433, + "learning_rate": 1.559623849539816e-05, + "loss": 0.5142, + "step": 17210 + }, + { + "epoch": 
22.03008, + "grad_norm": 1.0396239757537842, + "learning_rate": 1.559423769507803e-05, + "loss": 0.4348, + "step": 17211 + }, + { + "epoch": 22.03136, + "grad_norm": 1.1152554750442505, + "learning_rate": 1.5592236894757902e-05, + "loss": 0.5252, + "step": 17212 + }, + { + "epoch": 22.03264, + "grad_norm": 1.0972347259521484, + "learning_rate": 1.5590236094437777e-05, + "loss": 0.4627, + "step": 17213 + }, + { + "epoch": 22.03392, + "grad_norm": 1.1150223016738892, + "learning_rate": 1.558823529411765e-05, + "loss": 0.4778, + "step": 17214 + }, + { + "epoch": 22.0352, + "grad_norm": 1.1251050233840942, + "learning_rate": 1.5586234493797518e-05, + "loss": 0.5056, + "step": 17215 + }, + { + "epoch": 22.03648, + "grad_norm": 1.1292921304702759, + "learning_rate": 1.5584233693477393e-05, + "loss": 0.4896, + "step": 17216 + }, + { + "epoch": 22.03776, + "grad_norm": 1.169345736503601, + "learning_rate": 1.5582232893157265e-05, + "loss": 0.4869, + "step": 17217 + }, + { + "epoch": 22.03904, + "grad_norm": 1.10861074924469, + "learning_rate": 1.5580232092837137e-05, + "loss": 0.4686, + "step": 17218 + }, + { + "epoch": 22.04032, + "grad_norm": 1.080115556716919, + "learning_rate": 1.5578231292517005e-05, + "loss": 0.4709, + "step": 17219 + }, + { + "epoch": 22.0416, + "grad_norm": 1.0956518650054932, + "learning_rate": 1.557623049219688e-05, + "loss": 0.4785, + "step": 17220 + }, + { + "epoch": 22.04288, + "grad_norm": 1.1570404767990112, + "learning_rate": 1.5574229691876752e-05, + "loss": 0.539, + "step": 17221 + }, + { + "epoch": 22.04416, + "grad_norm": 1.1641499996185303, + "learning_rate": 1.5572228891556624e-05, + "loss": 0.5051, + "step": 17222 + }, + { + "epoch": 22.04544, + "grad_norm": 1.096850872039795, + "learning_rate": 1.5570228091236496e-05, + "loss": 0.4491, + "step": 17223 + }, + { + "epoch": 22.04672, + "grad_norm": 1.103550910949707, + "learning_rate": 1.5568227290916368e-05, + "loss": 0.5001, + "step": 17224 + }, + { + "epoch": 22.048, + "grad_norm": 
1.1057217121124268, + "learning_rate": 1.556622649059624e-05, + "loss": 0.4451, + "step": 17225 + }, + { + "epoch": 22.04928, + "grad_norm": 1.1657497882843018, + "learning_rate": 1.556422569027611e-05, + "loss": 0.5181, + "step": 17226 + }, + { + "epoch": 22.05056, + "grad_norm": 1.1382044553756714, + "learning_rate": 1.5562224889955983e-05, + "loss": 0.4842, + "step": 17227 + }, + { + "epoch": 22.05184, + "grad_norm": 1.1846833229064941, + "learning_rate": 1.5560224089635855e-05, + "loss": 0.5218, + "step": 17228 + }, + { + "epoch": 22.05312, + "grad_norm": 1.137743592262268, + "learning_rate": 1.5558223289315727e-05, + "loss": 0.4667, + "step": 17229 + }, + { + "epoch": 22.0544, + "grad_norm": 1.1043981313705444, + "learning_rate": 1.55562224889956e-05, + "loss": 0.4647, + "step": 17230 + }, + { + "epoch": 22.05568, + "grad_norm": 1.121191143989563, + "learning_rate": 1.555422168867547e-05, + "loss": 0.4709, + "step": 17231 + }, + { + "epoch": 22.05696, + "grad_norm": 1.0988284349441528, + "learning_rate": 1.5552220888355343e-05, + "loss": 0.4861, + "step": 17232 + }, + { + "epoch": 22.05824, + "grad_norm": 1.153282880783081, + "learning_rate": 1.5550220088035214e-05, + "loss": 0.4873, + "step": 17233 + }, + { + "epoch": 22.05952, + "grad_norm": 1.1652368307113647, + "learning_rate": 1.5548219287715086e-05, + "loss": 0.4896, + "step": 17234 + }, + { + "epoch": 22.0608, + "grad_norm": 1.1135512590408325, + "learning_rate": 1.5546218487394958e-05, + "loss": 0.4406, + "step": 17235 + }, + { + "epoch": 22.06208, + "grad_norm": 1.166584849357605, + "learning_rate": 1.554421768707483e-05, + "loss": 0.507, + "step": 17236 + }, + { + "epoch": 22.06336, + "grad_norm": 1.1483736038208008, + "learning_rate": 1.5542216886754702e-05, + "loss": 0.5168, + "step": 17237 + }, + { + "epoch": 22.06464, + "grad_norm": 1.1366267204284668, + "learning_rate": 1.5540216086434574e-05, + "loss": 0.5365, + "step": 17238 + }, + { + "epoch": 22.06592, + "grad_norm": 1.1409132480621338, + 
"learning_rate": 1.5538215286114446e-05, + "loss": 0.4238, + "step": 17239 + }, + { + "epoch": 22.0672, + "grad_norm": 1.1683720350265503, + "learning_rate": 1.5536214485794317e-05, + "loss": 0.5112, + "step": 17240 + }, + { + "epoch": 22.06848, + "grad_norm": 1.0807946920394897, + "learning_rate": 1.5534213685474193e-05, + "loss": 0.4384, + "step": 17241 + }, + { + "epoch": 22.06976, + "grad_norm": 1.2099087238311768, + "learning_rate": 1.553221288515406e-05, + "loss": 0.523, + "step": 17242 + }, + { + "epoch": 22.07104, + "grad_norm": 1.1151604652404785, + "learning_rate": 1.5530212084833933e-05, + "loss": 0.4653, + "step": 17243 + }, + { + "epoch": 22.07232, + "grad_norm": 1.0875353813171387, + "learning_rate": 1.5528211284513805e-05, + "loss": 0.4273, + "step": 17244 + }, + { + "epoch": 22.0736, + "grad_norm": 1.2180532217025757, + "learning_rate": 1.552621048419368e-05, + "loss": 0.5473, + "step": 17245 + }, + { + "epoch": 22.07488, + "grad_norm": 1.1339551210403442, + "learning_rate": 1.552420968387355e-05, + "loss": 0.4595, + "step": 17246 + }, + { + "epoch": 22.07616, + "grad_norm": 1.1073286533355713, + "learning_rate": 1.552220888355342e-05, + "loss": 0.4464, + "step": 17247 + }, + { + "epoch": 22.07744, + "grad_norm": 1.136877417564392, + "learning_rate": 1.5520208083233296e-05, + "loss": 0.494, + "step": 17248 + }, + { + "epoch": 22.07872, + "grad_norm": 1.1355657577514648, + "learning_rate": 1.5518207282913167e-05, + "loss": 0.4883, + "step": 17249 + }, + { + "epoch": 22.08, + "grad_norm": 1.1080483198165894, + "learning_rate": 1.5516206482593036e-05, + "loss": 0.4671, + "step": 17250 + }, + { + "epoch": 22.08128, + "grad_norm": 1.1329385042190552, + "learning_rate": 1.5514205682272908e-05, + "loss": 0.536, + "step": 17251 + }, + { + "epoch": 22.08256, + "grad_norm": 1.0859447717666626, + "learning_rate": 1.5512204881952783e-05, + "loss": 0.4559, + "step": 17252 + }, + { + "epoch": 22.08384, + "grad_norm": 1.134740948677063, + "learning_rate": 
1.5510204081632655e-05, + "loss": 0.4539, + "step": 17253 + }, + { + "epoch": 22.08512, + "grad_norm": 1.1605035066604614, + "learning_rate": 1.5508203281312523e-05, + "loss": 0.4898, + "step": 17254 + }, + { + "epoch": 22.0864, + "grad_norm": 1.0824528932571411, + "learning_rate": 1.55062024809924e-05, + "loss": 0.4827, + "step": 17255 + }, + { + "epoch": 22.08768, + "grad_norm": 1.0842760801315308, + "learning_rate": 1.550420168067227e-05, + "loss": 0.4643, + "step": 17256 + }, + { + "epoch": 22.08896, + "grad_norm": 1.152551531791687, + "learning_rate": 1.5502200880352142e-05, + "loss": 0.5292, + "step": 17257 + }, + { + "epoch": 22.09024, + "grad_norm": 1.086955189704895, + "learning_rate": 1.550020008003201e-05, + "loss": 0.4563, + "step": 17258 + }, + { + "epoch": 22.09152, + "grad_norm": 1.1958043575286865, + "learning_rate": 1.5498199279711886e-05, + "loss": 0.5174, + "step": 17259 + }, + { + "epoch": 22.0928, + "grad_norm": 1.111706256866455, + "learning_rate": 1.5496198479391758e-05, + "loss": 0.4607, + "step": 17260 + }, + { + "epoch": 22.09408, + "grad_norm": 1.1034555435180664, + "learning_rate": 1.549419767907163e-05, + "loss": 0.4702, + "step": 17261 + }, + { + "epoch": 22.09536, + "grad_norm": 1.1045020818710327, + "learning_rate": 1.54921968787515e-05, + "loss": 0.4442, + "step": 17262 + }, + { + "epoch": 22.09664, + "grad_norm": 1.1338021755218506, + "learning_rate": 1.5490196078431373e-05, + "loss": 0.4941, + "step": 17263 + }, + { + "epoch": 22.09792, + "grad_norm": 1.1852363348007202, + "learning_rate": 1.5488195278111245e-05, + "loss": 0.5476, + "step": 17264 + }, + { + "epoch": 22.0992, + "grad_norm": 1.109511137008667, + "learning_rate": 1.5486194477791117e-05, + "loss": 0.4669, + "step": 17265 + }, + { + "epoch": 22.10048, + "grad_norm": 1.1228883266448975, + "learning_rate": 1.548419367747099e-05, + "loss": 0.4729, + "step": 17266 + }, + { + "epoch": 22.10176, + "grad_norm": 1.1240496635437012, + "learning_rate": 1.548219287715086e-05, + 
"loss": 0.5041, + "step": 17267 + }, + { + "epoch": 22.10304, + "grad_norm": 1.1427851915359497, + "learning_rate": 1.5480192076830733e-05, + "loss": 0.5104, + "step": 17268 + }, + { + "epoch": 22.10432, + "grad_norm": 1.1009935140609741, + "learning_rate": 1.5478191276510608e-05, + "loss": 0.4835, + "step": 17269 + }, + { + "epoch": 22.1056, + "grad_norm": 1.2154606580734253, + "learning_rate": 1.5476190476190476e-05, + "loss": 0.5254, + "step": 17270 + }, + { + "epoch": 22.10688, + "grad_norm": 1.1332104206085205, + "learning_rate": 1.5474189675870348e-05, + "loss": 0.475, + "step": 17271 + }, + { + "epoch": 22.10816, + "grad_norm": 1.156253457069397, + "learning_rate": 1.547218887555022e-05, + "loss": 0.4589, + "step": 17272 + }, + { + "epoch": 22.10944, + "grad_norm": 1.0867305994033813, + "learning_rate": 1.5470188075230095e-05, + "loss": 0.4505, + "step": 17273 + }, + { + "epoch": 22.11072, + "grad_norm": 1.1919612884521484, + "learning_rate": 1.5468187274909964e-05, + "loss": 0.4891, + "step": 17274 + }, + { + "epoch": 22.112, + "grad_norm": 1.1696399450302124, + "learning_rate": 1.5466186474589836e-05, + "loss": 0.5135, + "step": 17275 + }, + { + "epoch": 22.11328, + "grad_norm": 1.1965242624282837, + "learning_rate": 1.546418567426971e-05, + "loss": 0.5167, + "step": 17276 + }, + { + "epoch": 22.11456, + "grad_norm": 1.1492869853973389, + "learning_rate": 1.5462184873949583e-05, + "loss": 0.5027, + "step": 17277 + }, + { + "epoch": 22.11584, + "grad_norm": 1.1493687629699707, + "learning_rate": 1.546018407362945e-05, + "loss": 0.4764, + "step": 17278 + }, + { + "epoch": 22.11712, + "grad_norm": 1.1883156299591064, + "learning_rate": 1.5458183273309323e-05, + "loss": 0.4979, + "step": 17279 + }, + { + "epoch": 22.1184, + "grad_norm": 1.198495864868164, + "learning_rate": 1.54561824729892e-05, + "loss": 0.5273, + "step": 17280 + }, + { + "epoch": 22.11968, + "grad_norm": 1.1346237659454346, + "learning_rate": 1.545418167266907e-05, + "loss": 0.4574, + 
"step": 17281 + }, + { + "epoch": 22.12096, + "grad_norm": 1.1258900165557861, + "learning_rate": 1.545218087234894e-05, + "loss": 0.4931, + "step": 17282 + }, + { + "epoch": 22.12224, + "grad_norm": 1.1091734170913696, + "learning_rate": 1.545018007202881e-05, + "loss": 0.4793, + "step": 17283 + }, + { + "epoch": 22.12352, + "grad_norm": 1.0940433740615845, + "learning_rate": 1.5448179271708686e-05, + "loss": 0.4683, + "step": 17284 + }, + { + "epoch": 22.1248, + "grad_norm": 1.120511531829834, + "learning_rate": 1.5446178471388558e-05, + "loss": 0.4579, + "step": 17285 + }, + { + "epoch": 22.12608, + "grad_norm": 1.1699022054672241, + "learning_rate": 1.5444177671068426e-05, + "loss": 0.4941, + "step": 17286 + }, + { + "epoch": 22.12736, + "grad_norm": 1.0578786134719849, + "learning_rate": 1.54421768707483e-05, + "loss": 0.4419, + "step": 17287 + }, + { + "epoch": 22.12864, + "grad_norm": 1.104691982269287, + "learning_rate": 1.5440176070428173e-05, + "loss": 0.4954, + "step": 17288 + }, + { + "epoch": 22.12992, + "grad_norm": 1.1554813385009766, + "learning_rate": 1.5438175270108045e-05, + "loss": 0.5006, + "step": 17289 + }, + { + "epoch": 22.1312, + "grad_norm": 1.1532070636749268, + "learning_rate": 1.5436174469787914e-05, + "loss": 0.4952, + "step": 17290 + }, + { + "epoch": 22.13248, + "grad_norm": 1.0986944437026978, + "learning_rate": 1.543417366946779e-05, + "loss": 0.4291, + "step": 17291 + }, + { + "epoch": 22.13376, + "grad_norm": 1.1358639001846313, + "learning_rate": 1.543217286914766e-05, + "loss": 0.4807, + "step": 17292 + }, + { + "epoch": 22.13504, + "grad_norm": 1.113168716430664, + "learning_rate": 1.5430172068827533e-05, + "loss": 0.4816, + "step": 17293 + }, + { + "epoch": 22.13632, + "grad_norm": 1.1050689220428467, + "learning_rate": 1.5428171268507404e-05, + "loss": 0.4404, + "step": 17294 + }, + { + "epoch": 22.1376, + "grad_norm": 1.091439962387085, + "learning_rate": 1.5426170468187276e-05, + "loss": 0.5036, + "step": 17295 + }, + { + 
"epoch": 22.13888, + "grad_norm": 1.1166634559631348, + "learning_rate": 1.5424169667867148e-05, + "loss": 0.5347, + "step": 17296 + }, + { + "epoch": 22.14016, + "grad_norm": 1.1514959335327148, + "learning_rate": 1.542216886754702e-05, + "loss": 0.494, + "step": 17297 + }, + { + "epoch": 22.14144, + "grad_norm": 1.070124864578247, + "learning_rate": 1.5420168067226892e-05, + "loss": 0.4533, + "step": 17298 + }, + { + "epoch": 22.14272, + "grad_norm": 1.0914841890335083, + "learning_rate": 1.5418167266906764e-05, + "loss": 0.509, + "step": 17299 + }, + { + "epoch": 22.144, + "grad_norm": 1.1017416715621948, + "learning_rate": 1.5416166466586636e-05, + "loss": 0.4744, + "step": 17300 + }, + { + "epoch": 22.14528, + "grad_norm": 1.1203486919403076, + "learning_rate": 1.5414165666266507e-05, + "loss": 0.4563, + "step": 17301 + }, + { + "epoch": 22.14656, + "grad_norm": 1.1463284492492676, + "learning_rate": 1.541216486594638e-05, + "loss": 0.4892, + "step": 17302 + }, + { + "epoch": 22.14784, + "grad_norm": 1.1561495065689087, + "learning_rate": 1.541016406562625e-05, + "loss": 0.4916, + "step": 17303 + }, + { + "epoch": 22.14912, + "grad_norm": 1.1047677993774414, + "learning_rate": 1.5408163265306123e-05, + "loss": 0.4706, + "step": 17304 + }, + { + "epoch": 22.1504, + "grad_norm": 1.1294938325881958, + "learning_rate": 1.5406162464985995e-05, + "loss": 0.4652, + "step": 17305 + }, + { + "epoch": 22.15168, + "grad_norm": 1.166673183441162, + "learning_rate": 1.5404161664665867e-05, + "loss": 0.478, + "step": 17306 + }, + { + "epoch": 22.15296, + "grad_norm": 1.1008706092834473, + "learning_rate": 1.540216086434574e-05, + "loss": 0.4525, + "step": 17307 + }, + { + "epoch": 22.15424, + "grad_norm": 1.1509610414505005, + "learning_rate": 1.5400160064025614e-05, + "loss": 0.5268, + "step": 17308 + }, + { + "epoch": 22.15552, + "grad_norm": 1.1664763689041138, + "learning_rate": 1.5398159263705482e-05, + "loss": 0.5145, + "step": 17309 + }, + { + "epoch": 22.1568, + 
"grad_norm": 1.1705671548843384, + "learning_rate": 1.5396158463385354e-05, + "loss": 0.4866, + "step": 17310 + }, + { + "epoch": 22.158079999999998, + "grad_norm": 1.0866199731826782, + "learning_rate": 1.5394157663065226e-05, + "loss": 0.4587, + "step": 17311 + }, + { + "epoch": 22.15936, + "grad_norm": 1.0654592514038086, + "learning_rate": 1.53921568627451e-05, + "loss": 0.4489, + "step": 17312 + }, + { + "epoch": 22.16064, + "grad_norm": 1.1411796808242798, + "learning_rate": 1.539015606242497e-05, + "loss": 0.5293, + "step": 17313 + }, + { + "epoch": 22.16192, + "grad_norm": 1.1217213869094849, + "learning_rate": 1.538815526210484e-05, + "loss": 0.4792, + "step": 17314 + }, + { + "epoch": 22.1632, + "grad_norm": 1.133726954460144, + "learning_rate": 1.5386154461784717e-05, + "loss": 0.4706, + "step": 17315 + }, + { + "epoch": 22.16448, + "grad_norm": 1.0698765516281128, + "learning_rate": 1.538415366146459e-05, + "loss": 0.4544, + "step": 17316 + }, + { + "epoch": 22.16576, + "grad_norm": 1.112635850906372, + "learning_rate": 1.5382152861144457e-05, + "loss": 0.4675, + "step": 17317 + }, + { + "epoch": 22.16704, + "grad_norm": 1.0733355283737183, + "learning_rate": 1.538015206082433e-05, + "loss": 0.4737, + "step": 17318 + }, + { + "epoch": 22.16832, + "grad_norm": 1.1422197818756104, + "learning_rate": 1.5378151260504204e-05, + "loss": 0.4691, + "step": 17319 + }, + { + "epoch": 22.1696, + "grad_norm": 1.1713340282440186, + "learning_rate": 1.5376150460184076e-05, + "loss": 0.4922, + "step": 17320 + }, + { + "epoch": 22.17088, + "grad_norm": 1.114890217781067, + "learning_rate": 1.5374149659863945e-05, + "loss": 0.4539, + "step": 17321 + }, + { + "epoch": 22.17216, + "grad_norm": 1.1028982400894165, + "learning_rate": 1.537214885954382e-05, + "loss": 0.4329, + "step": 17322 + }, + { + "epoch": 22.17344, + "grad_norm": 1.1502256393432617, + "learning_rate": 1.537014805922369e-05, + "loss": 0.486, + "step": 17323 + }, + { + "epoch": 22.17472, + "grad_norm": 
1.1716766357421875, + "learning_rate": 1.5368147258903563e-05, + "loss": 0.5172, + "step": 17324 + }, + { + "epoch": 22.176, + "grad_norm": 1.1179568767547607, + "learning_rate": 1.5366146458583432e-05, + "loss": 0.481, + "step": 17325 + }, + { + "epoch": 22.17728, + "grad_norm": 1.0973176956176758, + "learning_rate": 1.5364145658263307e-05, + "loss": 0.4588, + "step": 17326 + }, + { + "epoch": 22.17856, + "grad_norm": 1.1206368207931519, + "learning_rate": 1.536214485794318e-05, + "loss": 0.4673, + "step": 17327 + }, + { + "epoch": 22.17984, + "grad_norm": 1.128866195678711, + "learning_rate": 1.536014405762305e-05, + "loss": 0.4766, + "step": 17328 + }, + { + "epoch": 22.18112, + "grad_norm": 1.0996205806732178, + "learning_rate": 1.5358143257302923e-05, + "loss": 0.4679, + "step": 17329 + }, + { + "epoch": 22.1824, + "grad_norm": 1.087931752204895, + "learning_rate": 1.5356142456982795e-05, + "loss": 0.4724, + "step": 17330 + }, + { + "epoch": 22.18368, + "grad_norm": 1.1762522459030151, + "learning_rate": 1.5354141656662666e-05, + "loss": 0.5389, + "step": 17331 + }, + { + "epoch": 22.18496, + "grad_norm": 1.0654993057250977, + "learning_rate": 1.535214085634254e-05, + "loss": 0.4851, + "step": 17332 + }, + { + "epoch": 22.18624, + "grad_norm": 1.0894544124603271, + "learning_rate": 1.535014005602241e-05, + "loss": 0.5082, + "step": 17333 + }, + { + "epoch": 22.18752, + "grad_norm": 1.0549466609954834, + "learning_rate": 1.5348139255702282e-05, + "loss": 0.4361, + "step": 17334 + }, + { + "epoch": 22.1888, + "grad_norm": 1.16023588180542, + "learning_rate": 1.5346138455382154e-05, + "loss": 0.5082, + "step": 17335 + }, + { + "epoch": 22.19008, + "grad_norm": 1.1312403678894043, + "learning_rate": 1.5344137655062026e-05, + "loss": 0.4884, + "step": 17336 + }, + { + "epoch": 22.19136, + "grad_norm": 1.1789931058883667, + "learning_rate": 1.5342136854741898e-05, + "loss": 0.4722, + "step": 17337 + }, + { + "epoch": 22.19264, + "grad_norm": 1.1645036935806274, + 
"learning_rate": 1.534013605442177e-05, + "loss": 0.5172, + "step": 17338 + }, + { + "epoch": 22.19392, + "grad_norm": 1.106508731842041, + "learning_rate": 1.533813525410164e-05, + "loss": 0.466, + "step": 17339 + }, + { + "epoch": 22.1952, + "grad_norm": 1.1362861394882202, + "learning_rate": 1.5336134453781513e-05, + "loss": 0.4865, + "step": 17340 + }, + { + "epoch": 22.19648, + "grad_norm": 1.10444974899292, + "learning_rate": 1.5334133653461385e-05, + "loss": 0.4501, + "step": 17341 + }, + { + "epoch": 22.19776, + "grad_norm": 1.1265275478363037, + "learning_rate": 1.5332132853141257e-05, + "loss": 0.4673, + "step": 17342 + }, + { + "epoch": 22.19904, + "grad_norm": 1.1572017669677734, + "learning_rate": 1.533013205282113e-05, + "loss": 0.4611, + "step": 17343 + }, + { + "epoch": 22.20032, + "grad_norm": 1.2215920686721802, + "learning_rate": 1.5328131252501e-05, + "loss": 0.5035, + "step": 17344 + }, + { + "epoch": 22.2016, + "grad_norm": 1.2130608558654785, + "learning_rate": 1.5326130452180872e-05, + "loss": 0.4942, + "step": 17345 + }, + { + "epoch": 22.20288, + "grad_norm": 1.133594036102295, + "learning_rate": 1.5324129651860744e-05, + "loss": 0.4936, + "step": 17346 + }, + { + "epoch": 22.20416, + "grad_norm": 1.1092051267623901, + "learning_rate": 1.532212885154062e-05, + "loss": 0.5127, + "step": 17347 + }, + { + "epoch": 22.20544, + "grad_norm": 1.145732045173645, + "learning_rate": 1.5320128051220488e-05, + "loss": 0.4747, + "step": 17348 + }, + { + "epoch": 22.20672, + "grad_norm": 1.0972932577133179, + "learning_rate": 1.531812725090036e-05, + "loss": 0.4997, + "step": 17349 + }, + { + "epoch": 22.208, + "grad_norm": 1.0953388214111328, + "learning_rate": 1.5316126450580232e-05, + "loss": 0.4583, + "step": 17350 + }, + { + "epoch": 22.20928, + "grad_norm": 1.086438775062561, + "learning_rate": 1.5314125650260107e-05, + "loss": 0.4578, + "step": 17351 + }, + { + "epoch": 22.21056, + "grad_norm": 1.1954177618026733, + "learning_rate": 
1.5312124849939975e-05, + "loss": 0.5342, + "step": 17352 + }, + { + "epoch": 22.21184, + "grad_norm": 1.1370768547058105, + "learning_rate": 1.5310124049619847e-05, + "loss": 0.5075, + "step": 17353 + }, + { + "epoch": 22.21312, + "grad_norm": 1.1684788465499878, + "learning_rate": 1.5308123249299723e-05, + "loss": 0.4789, + "step": 17354 + }, + { + "epoch": 22.2144, + "grad_norm": 1.108345866203308, + "learning_rate": 1.5306122448979594e-05, + "loss": 0.4686, + "step": 17355 + }, + { + "epoch": 22.21568, + "grad_norm": 1.0191106796264648, + "learning_rate": 1.5304121648659463e-05, + "loss": 0.4258, + "step": 17356 + }, + { + "epoch": 22.21696, + "grad_norm": 1.1205805540084839, + "learning_rate": 1.5302120848339335e-05, + "loss": 0.4757, + "step": 17357 + }, + { + "epoch": 22.21824, + "grad_norm": 1.1553397178649902, + "learning_rate": 1.530012004801921e-05, + "loss": 0.4878, + "step": 17358 + }, + { + "epoch": 22.21952, + "grad_norm": 1.1630548238754272, + "learning_rate": 1.5298119247699082e-05, + "loss": 0.4906, + "step": 17359 + }, + { + "epoch": 22.2208, + "grad_norm": 1.0727124214172363, + "learning_rate": 1.529611844737895e-05, + "loss": 0.4877, + "step": 17360 + }, + { + "epoch": 22.22208, + "grad_norm": 1.0958082675933838, + "learning_rate": 1.5294117647058826e-05, + "loss": 0.4236, + "step": 17361 + }, + { + "epoch": 22.22336, + "grad_norm": 1.1463395357131958, + "learning_rate": 1.5292116846738697e-05, + "loss": 0.4354, + "step": 17362 + }, + { + "epoch": 22.22464, + "grad_norm": 1.125491976737976, + "learning_rate": 1.529011604641857e-05, + "loss": 0.4664, + "step": 17363 + }, + { + "epoch": 22.22592, + "grad_norm": 1.1401169300079346, + "learning_rate": 1.5288115246098438e-05, + "loss": 0.4903, + "step": 17364 + }, + { + "epoch": 22.2272, + "grad_norm": 1.1379910707473755, + "learning_rate": 1.5286114445778313e-05, + "loss": 0.4734, + "step": 17365 + }, + { + "epoch": 22.22848, + "grad_norm": 1.0853389501571655, + "learning_rate": 
1.5284113645458185e-05, + "loss": 0.4546, + "step": 17366 + }, + { + "epoch": 22.22976, + "grad_norm": 1.176604151725769, + "learning_rate": 1.5282112845138057e-05, + "loss": 0.5138, + "step": 17367 + }, + { + "epoch": 22.23104, + "grad_norm": 1.2261618375778198, + "learning_rate": 1.528011204481793e-05, + "loss": 0.5195, + "step": 17368 + }, + { + "epoch": 22.23232, + "grad_norm": 1.1438370943069458, + "learning_rate": 1.52781112444978e-05, + "loss": 0.5111, + "step": 17369 + }, + { + "epoch": 22.2336, + "grad_norm": 1.0962119102478027, + "learning_rate": 1.5276110444177672e-05, + "loss": 0.4471, + "step": 17370 + }, + { + "epoch": 22.23488, + "grad_norm": 1.1545382738113403, + "learning_rate": 1.5274109643857544e-05, + "loss": 0.4741, + "step": 17371 + }, + { + "epoch": 22.23616, + "grad_norm": 1.122843861579895, + "learning_rate": 1.5272108843537416e-05, + "loss": 0.4445, + "step": 17372 + }, + { + "epoch": 22.23744, + "grad_norm": 1.114371418952942, + "learning_rate": 1.5270108043217288e-05, + "loss": 0.4685, + "step": 17373 + }, + { + "epoch": 22.23872, + "grad_norm": 1.1114271879196167, + "learning_rate": 1.526810724289716e-05, + "loss": 0.5048, + "step": 17374 + }, + { + "epoch": 22.24, + "grad_norm": 1.1175719499588013, + "learning_rate": 1.526610644257703e-05, + "loss": 0.4901, + "step": 17375 + }, + { + "epoch": 22.24128, + "grad_norm": 1.1345455646514893, + "learning_rate": 1.5264105642256903e-05, + "loss": 0.4525, + "step": 17376 + }, + { + "epoch": 22.24256, + "grad_norm": 1.138181447982788, + "learning_rate": 1.5262104841936775e-05, + "loss": 0.4898, + "step": 17377 + }, + { + "epoch": 22.24384, + "grad_norm": 1.1441845893859863, + "learning_rate": 1.5260104041616647e-05, + "loss": 0.4646, + "step": 17378 + }, + { + "epoch": 22.24512, + "grad_norm": 1.102541446685791, + "learning_rate": 1.525810324129652e-05, + "loss": 0.4854, + "step": 17379 + }, + { + "epoch": 22.2464, + "grad_norm": 1.0756680965423584, + "learning_rate": 1.5256102440976391e-05, + 
"loss": 0.4716, + "step": 17380 + }, + { + "epoch": 22.24768, + "grad_norm": 1.1094943284988403, + "learning_rate": 1.5254101640656263e-05, + "loss": 0.4975, + "step": 17381 + }, + { + "epoch": 22.24896, + "grad_norm": 1.1272087097167969, + "learning_rate": 1.5252100840336136e-05, + "loss": 0.4882, + "step": 17382 + }, + { + "epoch": 22.25024, + "grad_norm": 1.134270191192627, + "learning_rate": 1.5250100040016008e-05, + "loss": 0.4806, + "step": 17383 + }, + { + "epoch": 22.25152, + "grad_norm": 1.1215940713882446, + "learning_rate": 1.5248099239695878e-05, + "loss": 0.4555, + "step": 17384 + }, + { + "epoch": 22.2528, + "grad_norm": 1.0712652206420898, + "learning_rate": 1.524609843937575e-05, + "loss": 0.4371, + "step": 17385 + }, + { + "epoch": 22.25408, + "grad_norm": 1.0350494384765625, + "learning_rate": 1.5244097639055624e-05, + "loss": 0.4513, + "step": 17386 + }, + { + "epoch": 22.25536, + "grad_norm": 1.1774425506591797, + "learning_rate": 1.5242096838735496e-05, + "loss": 0.5087, + "step": 17387 + }, + { + "epoch": 22.25664, + "grad_norm": 1.137933611869812, + "learning_rate": 1.5240096038415366e-05, + "loss": 0.5026, + "step": 17388 + }, + { + "epoch": 22.25792, + "grad_norm": 1.1481642723083496, + "learning_rate": 1.5238095238095241e-05, + "loss": 0.474, + "step": 17389 + }, + { + "epoch": 22.2592, + "grad_norm": 1.067002296447754, + "learning_rate": 1.5236094437775111e-05, + "loss": 0.4291, + "step": 17390 + }, + { + "epoch": 22.26048, + "grad_norm": 1.1963670253753662, + "learning_rate": 1.5234093637454983e-05, + "loss": 0.5206, + "step": 17391 + }, + { + "epoch": 22.26176, + "grad_norm": 1.1314671039581299, + "learning_rate": 1.5232092837134853e-05, + "loss": 0.4769, + "step": 17392 + }, + { + "epoch": 22.26304, + "grad_norm": 1.0662577152252197, + "learning_rate": 1.5230092036814728e-05, + "loss": 0.4752, + "step": 17393 + }, + { + "epoch": 22.26432, + "grad_norm": 1.1250959634780884, + "learning_rate": 1.5228091236494599e-05, + "loss": 0.5012, + 
"step": 17394 + }, + { + "epoch": 22.2656, + "grad_norm": 1.1448113918304443, + "learning_rate": 1.522609043617447e-05, + "loss": 0.5028, + "step": 17395 + }, + { + "epoch": 22.26688, + "grad_norm": 1.1659822463989258, + "learning_rate": 1.522408963585434e-05, + "loss": 0.4969, + "step": 17396 + }, + { + "epoch": 22.26816, + "grad_norm": 1.1037496328353882, + "learning_rate": 1.5222088835534216e-05, + "loss": 0.501, + "step": 17397 + }, + { + "epoch": 22.26944, + "grad_norm": 1.1459519863128662, + "learning_rate": 1.5220088035214086e-05, + "loss": 0.4836, + "step": 17398 + }, + { + "epoch": 22.27072, + "grad_norm": 1.0809226036071777, + "learning_rate": 1.5218087234893958e-05, + "loss": 0.4574, + "step": 17399 + }, + { + "epoch": 22.272, + "grad_norm": 1.1643927097320557, + "learning_rate": 1.5216086434573831e-05, + "loss": 0.5028, + "step": 17400 + }, + { + "epoch": 22.27328, + "grad_norm": 1.097196102142334, + "learning_rate": 1.5214085634253703e-05, + "loss": 0.4355, + "step": 17401 + }, + { + "epoch": 22.27456, + "grad_norm": 1.1606805324554443, + "learning_rate": 1.5212084833933573e-05, + "loss": 0.4937, + "step": 17402 + }, + { + "epoch": 22.27584, + "grad_norm": 1.1250141859054565, + "learning_rate": 1.5210084033613445e-05, + "loss": 0.4682, + "step": 17403 + }, + { + "epoch": 22.27712, + "grad_norm": 1.1419126987457275, + "learning_rate": 1.5208083233293319e-05, + "loss": 0.4867, + "step": 17404 + }, + { + "epoch": 22.2784, + "grad_norm": 1.1214076280593872, + "learning_rate": 1.520608243297319e-05, + "loss": 0.4733, + "step": 17405 + }, + { + "epoch": 22.27968, + "grad_norm": 1.1369352340698242, + "learning_rate": 1.520408163265306e-05, + "loss": 0.4595, + "step": 17406 + }, + { + "epoch": 22.28096, + "grad_norm": 1.0881363153457642, + "learning_rate": 1.5202080832332934e-05, + "loss": 0.479, + "step": 17407 + }, + { + "epoch": 22.28224, + "grad_norm": 1.1420772075653076, + "learning_rate": 1.5200080032012806e-05, + "loss": 0.4535, + "step": 17408 + }, + { 
+ "epoch": 22.28352, + "grad_norm": 1.1348367929458618, + "learning_rate": 1.5198079231692678e-05, + "loss": 0.4644, + "step": 17409 + }, + { + "epoch": 22.2848, + "grad_norm": 1.0882881879806519, + "learning_rate": 1.5196078431372548e-05, + "loss": 0.4883, + "step": 17410 + }, + { + "epoch": 22.28608, + "grad_norm": 1.1225090026855469, + "learning_rate": 1.5194077631052422e-05, + "loss": 0.4732, + "step": 17411 + }, + { + "epoch": 22.28736, + "grad_norm": 1.195477843284607, + "learning_rate": 1.5192076830732294e-05, + "loss": 0.5217, + "step": 17412 + }, + { + "epoch": 22.28864, + "grad_norm": 1.1594408750534058, + "learning_rate": 1.5190076030412166e-05, + "loss": 0.5314, + "step": 17413 + }, + { + "epoch": 22.28992, + "grad_norm": 1.1369751691818237, + "learning_rate": 1.5188075230092039e-05, + "loss": 0.4771, + "step": 17414 + }, + { + "epoch": 22.2912, + "grad_norm": 1.1328139305114746, + "learning_rate": 1.518607442977191e-05, + "loss": 0.5051, + "step": 17415 + }, + { + "epoch": 22.29248, + "grad_norm": 1.1395426988601685, + "learning_rate": 1.5184073629451781e-05, + "loss": 0.4516, + "step": 17416 + }, + { + "epoch": 22.29376, + "grad_norm": 1.1057137250900269, + "learning_rate": 1.5182072829131653e-05, + "loss": 0.5055, + "step": 17417 + }, + { + "epoch": 22.29504, + "grad_norm": 1.0750411748886108, + "learning_rate": 1.5180072028811526e-05, + "loss": 0.4643, + "step": 17418 + }, + { + "epoch": 22.29632, + "grad_norm": 1.134002685546875, + "learning_rate": 1.5178071228491397e-05, + "loss": 0.4959, + "step": 17419 + }, + { + "epoch": 22.2976, + "grad_norm": 1.1353780031204224, + "learning_rate": 1.5176070428171269e-05, + "loss": 0.4992, + "step": 17420 + }, + { + "epoch": 22.29888, + "grad_norm": 1.131705641746521, + "learning_rate": 1.5174069627851142e-05, + "loss": 0.4595, + "step": 17421 + }, + { + "epoch": 22.300159999999998, + "grad_norm": 1.1513729095458984, + "learning_rate": 1.5172068827531014e-05, + "loss": 0.4984, + "step": 17422 + }, + { + 
"epoch": 22.30144, + "grad_norm": 1.126017689704895, + "learning_rate": 1.5170068027210884e-05, + "loss": 0.4743, + "step": 17423 + }, + { + "epoch": 22.30272, + "grad_norm": 1.056380271911621, + "learning_rate": 1.5168067226890756e-05, + "loss": 0.4702, + "step": 17424 + }, + { + "epoch": 22.304, + "grad_norm": 1.079280972480774, + "learning_rate": 1.516606642657063e-05, + "loss": 0.4556, + "step": 17425 + }, + { + "epoch": 22.30528, + "grad_norm": 1.1339120864868164, + "learning_rate": 1.5164065626250501e-05, + "loss": 0.4703, + "step": 17426 + }, + { + "epoch": 22.30656, + "grad_norm": 1.1718413829803467, + "learning_rate": 1.5162064825930372e-05, + "loss": 0.4925, + "step": 17427 + }, + { + "epoch": 22.30784, + "grad_norm": 1.1251111030578613, + "learning_rate": 1.5160064025610247e-05, + "loss": 0.4401, + "step": 17428 + }, + { + "epoch": 22.30912, + "grad_norm": 1.1180959939956665, + "learning_rate": 1.5158063225290117e-05, + "loss": 0.5187, + "step": 17429 + }, + { + "epoch": 22.3104, + "grad_norm": 1.1028566360473633, + "learning_rate": 1.5156062424969989e-05, + "loss": 0.4702, + "step": 17430 + }, + { + "epoch": 22.31168, + "grad_norm": 1.1383028030395508, + "learning_rate": 1.5154061624649859e-05, + "loss": 0.4931, + "step": 17431 + }, + { + "epoch": 22.31296, + "grad_norm": 1.0866107940673828, + "learning_rate": 1.5152060824329734e-05, + "loss": 0.5136, + "step": 17432 + }, + { + "epoch": 22.31424, + "grad_norm": 1.1053640842437744, + "learning_rate": 1.5150060024009604e-05, + "loss": 0.5046, + "step": 17433 + }, + { + "epoch": 22.31552, + "grad_norm": 1.0949057340621948, + "learning_rate": 1.5148059223689476e-05, + "loss": 0.524, + "step": 17434 + }, + { + "epoch": 22.3168, + "grad_norm": 1.101889729499817, + "learning_rate": 1.514605842336935e-05, + "loss": 0.4857, + "step": 17435 + }, + { + "epoch": 22.31808, + "grad_norm": 1.0940123796463013, + "learning_rate": 1.5144057623049222e-05, + "loss": 0.4603, + "step": 17436 + }, + { + "epoch": 22.31936, + 
"grad_norm": 1.0854915380477905, + "learning_rate": 1.5142056822729092e-05, + "loss": 0.506, + "step": 17437 + }, + { + "epoch": 22.32064, + "grad_norm": 1.1346979141235352, + "learning_rate": 1.5140056022408964e-05, + "loss": 0.5258, + "step": 17438 + }, + { + "epoch": 22.32192, + "grad_norm": 1.0600947141647339, + "learning_rate": 1.5138055222088837e-05, + "loss": 0.4525, + "step": 17439 + }, + { + "epoch": 22.3232, + "grad_norm": 1.082271695137024, + "learning_rate": 1.5136054421768709e-05, + "loss": 0.4732, + "step": 17440 + }, + { + "epoch": 22.32448, + "grad_norm": 1.145754098892212, + "learning_rate": 1.513405362144858e-05, + "loss": 0.4547, + "step": 17441 + }, + { + "epoch": 22.32576, + "grad_norm": 1.162156581878662, + "learning_rate": 1.5132052821128453e-05, + "loss": 0.4697, + "step": 17442 + }, + { + "epoch": 22.32704, + "grad_norm": 1.1728843450546265, + "learning_rate": 1.5130052020808325e-05, + "loss": 0.4654, + "step": 17443 + }, + { + "epoch": 22.32832, + "grad_norm": 1.1649385690689087, + "learning_rate": 1.5128051220488196e-05, + "loss": 0.5223, + "step": 17444 + }, + { + "epoch": 22.3296, + "grad_norm": 1.1600626707077026, + "learning_rate": 1.5126050420168067e-05, + "loss": 0.4684, + "step": 17445 + }, + { + "epoch": 22.33088, + "grad_norm": 1.1548811197280884, + "learning_rate": 1.512404961984794e-05, + "loss": 0.4642, + "step": 17446 + }, + { + "epoch": 22.332160000000002, + "grad_norm": 1.2105642557144165, + "learning_rate": 1.5122048819527812e-05, + "loss": 0.4687, + "step": 17447 + }, + { + "epoch": 22.33344, + "grad_norm": 1.204110860824585, + "learning_rate": 1.5120048019207684e-05, + "loss": 0.4745, + "step": 17448 + }, + { + "epoch": 22.33472, + "grad_norm": 1.189799189567566, + "learning_rate": 1.5118047218887554e-05, + "loss": 0.4882, + "step": 17449 + }, + { + "epoch": 22.336, + "grad_norm": 1.0634477138519287, + "learning_rate": 1.5116046418567428e-05, + "loss": 0.4457, + "step": 17450 + }, + { + "epoch": 22.33728, + "grad_norm": 
1.1369130611419678, + "learning_rate": 1.51140456182473e-05, + "loss": 0.5071, + "step": 17451 + }, + { + "epoch": 22.33856, + "grad_norm": 1.1142311096191406, + "learning_rate": 1.5112044817927171e-05, + "loss": 0.4664, + "step": 17452 + }, + { + "epoch": 22.33984, + "grad_norm": 1.1078466176986694, + "learning_rate": 1.5110044017607045e-05, + "loss": 0.4767, + "step": 17453 + }, + { + "epoch": 22.34112, + "grad_norm": 1.1485053300857544, + "learning_rate": 1.5108043217286915e-05, + "loss": 0.4765, + "step": 17454 + }, + { + "epoch": 22.3424, + "grad_norm": 1.105195164680481, + "learning_rate": 1.5106042416966787e-05, + "loss": 0.4429, + "step": 17455 + }, + { + "epoch": 22.34368, + "grad_norm": 1.1941795349121094, + "learning_rate": 1.5104041616646659e-05, + "loss": 0.5371, + "step": 17456 + }, + { + "epoch": 22.34496, + "grad_norm": 1.0616157054901123, + "learning_rate": 1.5102040816326532e-05, + "loss": 0.421, + "step": 17457 + }, + { + "epoch": 22.34624, + "grad_norm": 1.1069750785827637, + "learning_rate": 1.5100040016006402e-05, + "loss": 0.4859, + "step": 17458 + }, + { + "epoch": 22.34752, + "grad_norm": 1.1655443906784058, + "learning_rate": 1.5098039215686274e-05, + "loss": 0.519, + "step": 17459 + }, + { + "epoch": 22.3488, + "grad_norm": 1.1244604587554932, + "learning_rate": 1.5096038415366148e-05, + "loss": 0.4697, + "step": 17460 + }, + { + "epoch": 22.35008, + "grad_norm": 1.1628854274749756, + "learning_rate": 1.509403761504602e-05, + "loss": 0.5347, + "step": 17461 + }, + { + "epoch": 22.35136, + "grad_norm": 1.0745189189910889, + "learning_rate": 1.509203681472589e-05, + "loss": 0.4332, + "step": 17462 + }, + { + "epoch": 22.35264, + "grad_norm": 1.117801308631897, + "learning_rate": 1.5090036014405762e-05, + "loss": 0.439, + "step": 17463 + }, + { + "epoch": 22.35392, + "grad_norm": 1.1143686771392822, + "learning_rate": 1.5088035214085635e-05, + "loss": 0.4714, + "step": 17464 + }, + { + "epoch": 22.3552, + "grad_norm": 1.1027582883834839, + 
"learning_rate": 1.5086034413765507e-05, + "loss": 0.4566, + "step": 17465 + }, + { + "epoch": 22.35648, + "grad_norm": 1.114611268043518, + "learning_rate": 1.5084033613445377e-05, + "loss": 0.5007, + "step": 17466 + }, + { + "epoch": 22.35776, + "grad_norm": 1.12099027633667, + "learning_rate": 1.5082032813125253e-05, + "loss": 0.4968, + "step": 17467 + }, + { + "epoch": 22.35904, + "grad_norm": 1.1713063716888428, + "learning_rate": 1.5080032012805123e-05, + "loss": 0.5194, + "step": 17468 + }, + { + "epoch": 22.36032, + "grad_norm": 1.1638789176940918, + "learning_rate": 1.5078031212484995e-05, + "loss": 0.5699, + "step": 17469 + }, + { + "epoch": 22.3616, + "grad_norm": 1.118815541267395, + "learning_rate": 1.5076030412164865e-05, + "loss": 0.4611, + "step": 17470 + }, + { + "epoch": 22.36288, + "grad_norm": 1.087066650390625, + "learning_rate": 1.507402961184474e-05, + "loss": 0.4478, + "step": 17471 + }, + { + "epoch": 22.36416, + "grad_norm": 1.1488027572631836, + "learning_rate": 1.507202881152461e-05, + "loss": 0.4589, + "step": 17472 + }, + { + "epoch": 22.36544, + "grad_norm": 1.1603463888168335, + "learning_rate": 1.5070028011204482e-05, + "loss": 0.5202, + "step": 17473 + }, + { + "epoch": 22.36672, + "grad_norm": 1.1120957136154175, + "learning_rate": 1.5068027210884356e-05, + "loss": 0.4665, + "step": 17474 + }, + { + "epoch": 22.368, + "grad_norm": 1.0825188159942627, + "learning_rate": 1.5066026410564227e-05, + "loss": 0.4775, + "step": 17475 + }, + { + "epoch": 22.36928, + "grad_norm": 1.1528738737106323, + "learning_rate": 1.5064025610244098e-05, + "loss": 0.529, + "step": 17476 + }, + { + "epoch": 22.37056, + "grad_norm": 1.176680326461792, + "learning_rate": 1.506202480992397e-05, + "loss": 0.4932, + "step": 17477 + }, + { + "epoch": 22.37184, + "grad_norm": 1.1375823020935059, + "learning_rate": 1.5060024009603843e-05, + "loss": 0.484, + "step": 17478 + }, + { + "epoch": 22.37312, + "grad_norm": 1.109999179840088, + "learning_rate": 
1.5058023209283715e-05, + "loss": 0.4834, + "step": 17479 + }, + { + "epoch": 22.3744, + "grad_norm": 1.1392533779144287, + "learning_rate": 1.5056022408963585e-05, + "loss": 0.4511, + "step": 17480 + }, + { + "epoch": 22.37568, + "grad_norm": 1.074696660041809, + "learning_rate": 1.5054021608643459e-05, + "loss": 0.4497, + "step": 17481 + }, + { + "epoch": 22.37696, + "grad_norm": 1.1152911186218262, + "learning_rate": 1.505202080832333e-05, + "loss": 0.456, + "step": 17482 + }, + { + "epoch": 22.37824, + "grad_norm": 1.1230454444885254, + "learning_rate": 1.5050020008003202e-05, + "loss": 0.4766, + "step": 17483 + }, + { + "epoch": 22.37952, + "grad_norm": 1.068898320198059, + "learning_rate": 1.5048019207683072e-05, + "loss": 0.425, + "step": 17484 + }, + { + "epoch": 22.3808, + "grad_norm": 1.1210873126983643, + "learning_rate": 1.5046018407362946e-05, + "loss": 0.5016, + "step": 17485 + }, + { + "epoch": 22.38208, + "grad_norm": 1.0686876773834229, + "learning_rate": 1.5044017607042818e-05, + "loss": 0.4212, + "step": 17486 + }, + { + "epoch": 22.38336, + "grad_norm": 1.0619049072265625, + "learning_rate": 1.504201680672269e-05, + "loss": 0.45, + "step": 17487 + }, + { + "epoch": 22.38464, + "grad_norm": 1.1653971672058105, + "learning_rate": 1.5040016006402563e-05, + "loss": 0.4943, + "step": 17488 + }, + { + "epoch": 22.38592, + "grad_norm": 1.1745988130569458, + "learning_rate": 1.5038015206082433e-05, + "loss": 0.4886, + "step": 17489 + }, + { + "epoch": 22.3872, + "grad_norm": 1.1317180395126343, + "learning_rate": 1.5036014405762305e-05, + "loss": 0.4911, + "step": 17490 + }, + { + "epoch": 22.38848, + "grad_norm": 1.1715854406356812, + "learning_rate": 1.5034013605442177e-05, + "loss": 0.4806, + "step": 17491 + }, + { + "epoch": 22.38976, + "grad_norm": 1.1153987646102905, + "learning_rate": 1.503201280512205e-05, + "loss": 0.5112, + "step": 17492 + }, + { + "epoch": 22.39104, + "grad_norm": 1.1248096227645874, + "learning_rate": 1.503001200480192e-05, 
+ "loss": 0.4911, + "step": 17493 + }, + { + "epoch": 22.39232, + "grad_norm": 1.1459614038467407, + "learning_rate": 1.5028011204481793e-05, + "loss": 0.4958, + "step": 17494 + }, + { + "epoch": 22.3936, + "grad_norm": 1.153074860572815, + "learning_rate": 1.5026010404161666e-05, + "loss": 0.4822, + "step": 17495 + }, + { + "epoch": 22.39488, + "grad_norm": 1.1298890113830566, + "learning_rate": 1.5024009603841538e-05, + "loss": 0.4773, + "step": 17496 + }, + { + "epoch": 22.39616, + "grad_norm": 1.125186800956726, + "learning_rate": 1.5022008803521408e-05, + "loss": 0.4854, + "step": 17497 + }, + { + "epoch": 22.39744, + "grad_norm": 1.1420212984085083, + "learning_rate": 1.502000800320128e-05, + "loss": 0.5225, + "step": 17498 + }, + { + "epoch": 22.39872, + "grad_norm": 1.0702089071273804, + "learning_rate": 1.5018007202881154e-05, + "loss": 0.4647, + "step": 17499 + }, + { + "epoch": 22.4, + "grad_norm": 1.172632098197937, + "learning_rate": 1.5016006402561026e-05, + "loss": 0.5238, + "step": 17500 + }, + { + "epoch": 22.40128, + "grad_norm": 1.1643844842910767, + "learning_rate": 1.5014005602240896e-05, + "loss": 0.5163, + "step": 17501 + }, + { + "epoch": 22.40256, + "grad_norm": 1.044021725654602, + "learning_rate": 1.5012004801920771e-05, + "loss": 0.4328, + "step": 17502 + }, + { + "epoch": 22.40384, + "grad_norm": 1.0835877656936646, + "learning_rate": 1.5010004001600641e-05, + "loss": 0.4673, + "step": 17503 + }, + { + "epoch": 22.40512, + "grad_norm": 1.095995306968689, + "learning_rate": 1.5008003201280513e-05, + "loss": 0.5019, + "step": 17504 + }, + { + "epoch": 22.4064, + "grad_norm": 1.1427990198135376, + "learning_rate": 1.5006002400960383e-05, + "loss": 0.4884, + "step": 17505 + }, + { + "epoch": 22.40768, + "grad_norm": 1.1321765184402466, + "learning_rate": 1.5004001600640258e-05, + "loss": 0.4775, + "step": 17506 + }, + { + "epoch": 22.40896, + "grad_norm": 1.107414722442627, + "learning_rate": 1.5002000800320129e-05, + "loss": 0.4403, + 
"step": 17507 + }, + { + "epoch": 22.41024, + "grad_norm": 1.1043665409088135, + "learning_rate": 1.5e-05, + "loss": 0.4972, + "step": 17508 + }, + { + "epoch": 22.41152, + "grad_norm": 1.120118260383606, + "learning_rate": 1.499799919967987e-05, + "loss": 0.4897, + "step": 17509 + }, + { + "epoch": 22.4128, + "grad_norm": 1.11980140209198, + "learning_rate": 1.4995998399359746e-05, + "loss": 0.4861, + "step": 17510 + }, + { + "epoch": 22.41408, + "grad_norm": 1.1767655611038208, + "learning_rate": 1.4993997599039616e-05, + "loss": 0.518, + "step": 17511 + }, + { + "epoch": 22.41536, + "grad_norm": 1.1098543405532837, + "learning_rate": 1.4991996798719488e-05, + "loss": 0.472, + "step": 17512 + }, + { + "epoch": 22.41664, + "grad_norm": 1.2055127620697021, + "learning_rate": 1.4989995998399361e-05, + "loss": 0.5145, + "step": 17513 + }, + { + "epoch": 22.41792, + "grad_norm": 1.160184621810913, + "learning_rate": 1.4987995198079233e-05, + "loss": 0.5199, + "step": 17514 + }, + { + "epoch": 22.4192, + "grad_norm": 1.1769219636917114, + "learning_rate": 1.4985994397759103e-05, + "loss": 0.5057, + "step": 17515 + }, + { + "epoch": 22.42048, + "grad_norm": 1.1221567392349243, + "learning_rate": 1.4983993597438975e-05, + "loss": 0.4786, + "step": 17516 + }, + { + "epoch": 22.42176, + "grad_norm": 1.167410135269165, + "learning_rate": 1.4981992797118849e-05, + "loss": 0.4793, + "step": 17517 + }, + { + "epoch": 22.42304, + "grad_norm": 1.1696871519088745, + "learning_rate": 1.497999199679872e-05, + "loss": 0.4783, + "step": 17518 + }, + { + "epoch": 22.42432, + "grad_norm": 1.1420179605484009, + "learning_rate": 1.497799119647859e-05, + "loss": 0.47, + "step": 17519 + }, + { + "epoch": 22.4256, + "grad_norm": 1.1280018091201782, + "learning_rate": 1.4975990396158466e-05, + "loss": 0.5005, + "step": 17520 + }, + { + "epoch": 22.42688, + "grad_norm": 1.126597285270691, + "learning_rate": 1.4973989595838336e-05, + "loss": 0.4808, + "step": 17521 + }, + { + "epoch": 
22.42816, + "grad_norm": 1.1016881465911865, + "learning_rate": 1.4971988795518208e-05, + "loss": 0.4692, + "step": 17522 + }, + { + "epoch": 22.42944, + "grad_norm": 1.1398890018463135, + "learning_rate": 1.4969987995198078e-05, + "loss": 0.5017, + "step": 17523 + }, + { + "epoch": 22.43072, + "grad_norm": 1.1369463205337524, + "learning_rate": 1.4967987194877953e-05, + "loss": 0.4744, + "step": 17524 + }, + { + "epoch": 22.432, + "grad_norm": 1.1468337774276733, + "learning_rate": 1.4965986394557824e-05, + "loss": 0.4802, + "step": 17525 + }, + { + "epoch": 22.43328, + "grad_norm": 1.184463620185852, + "learning_rate": 1.4963985594237695e-05, + "loss": 0.5436, + "step": 17526 + }, + { + "epoch": 22.43456, + "grad_norm": 1.0913583040237427, + "learning_rate": 1.4961984793917569e-05, + "loss": 0.4606, + "step": 17527 + }, + { + "epoch": 22.43584, + "grad_norm": 1.1201003789901733, + "learning_rate": 1.4959983993597441e-05, + "loss": 0.4782, + "step": 17528 + }, + { + "epoch": 22.43712, + "grad_norm": 1.1170686483383179, + "learning_rate": 1.4957983193277311e-05, + "loss": 0.4602, + "step": 17529 + }, + { + "epoch": 22.4384, + "grad_norm": 1.1606577634811401, + "learning_rate": 1.4955982392957183e-05, + "loss": 0.501, + "step": 17530 + }, + { + "epoch": 22.43968, + "grad_norm": 1.1875215768814087, + "learning_rate": 1.4953981592637056e-05, + "loss": 0.4986, + "step": 17531 + }, + { + "epoch": 22.44096, + "grad_norm": 1.1248286962509155, + "learning_rate": 1.4951980792316928e-05, + "loss": 0.51, + "step": 17532 + }, + { + "epoch": 22.44224, + "grad_norm": 1.106138825416565, + "learning_rate": 1.4949979991996798e-05, + "loss": 0.4864, + "step": 17533 + }, + { + "epoch": 22.44352, + "grad_norm": 1.1583833694458008, + "learning_rate": 1.4947979191676672e-05, + "loss": 0.4824, + "step": 17534 + }, + { + "epoch": 22.4448, + "grad_norm": 1.1970548629760742, + "learning_rate": 1.4945978391356544e-05, + "loss": 0.5043, + "step": 17535 + }, + { + "epoch": 22.44608, + 
"grad_norm": 1.1255377531051636, + "learning_rate": 1.4943977591036416e-05, + "loss": 0.4703, + "step": 17536 + }, + { + "epoch": 22.44736, + "grad_norm": 1.137693166732788, + "learning_rate": 1.4941976790716286e-05, + "loss": 0.5076, + "step": 17537 + }, + { + "epoch": 22.44864, + "grad_norm": 1.099455714225769, + "learning_rate": 1.493997599039616e-05, + "loss": 0.4662, + "step": 17538 + }, + { + "epoch": 22.44992, + "grad_norm": 1.1596556901931763, + "learning_rate": 1.4937975190076031e-05, + "loss": 0.4969, + "step": 17539 + }, + { + "epoch": 22.4512, + "grad_norm": 1.1630852222442627, + "learning_rate": 1.4935974389755903e-05, + "loss": 0.5131, + "step": 17540 + }, + { + "epoch": 22.45248, + "grad_norm": 1.1254620552062988, + "learning_rate": 1.4933973589435777e-05, + "loss": 0.5012, + "step": 17541 + }, + { + "epoch": 22.45376, + "grad_norm": 1.1461201906204224, + "learning_rate": 1.4931972789115647e-05, + "loss": 0.4934, + "step": 17542 + }, + { + "epoch": 22.45504, + "grad_norm": 1.1740776300430298, + "learning_rate": 1.4929971988795519e-05, + "loss": 0.4746, + "step": 17543 + }, + { + "epoch": 22.45632, + "grad_norm": 1.1553022861480713, + "learning_rate": 1.492797118847539e-05, + "loss": 0.485, + "step": 17544 + }, + { + "epoch": 22.4576, + "grad_norm": 1.1020857095718384, + "learning_rate": 1.4925970388155264e-05, + "loss": 0.4326, + "step": 17545 + }, + { + "epoch": 22.45888, + "grad_norm": 1.125576376914978, + "learning_rate": 1.4923969587835134e-05, + "loss": 0.4976, + "step": 17546 + }, + { + "epoch": 22.46016, + "grad_norm": 1.1211215257644653, + "learning_rate": 1.4921968787515006e-05, + "loss": 0.4824, + "step": 17547 + }, + { + "epoch": 22.46144, + "grad_norm": 1.1188104152679443, + "learning_rate": 1.491996798719488e-05, + "loss": 0.4772, + "step": 17548 + }, + { + "epoch": 22.46272, + "grad_norm": 1.074813723564148, + "learning_rate": 1.4917967186874752e-05, + "loss": 0.4494, + "step": 17549 + }, + { + "epoch": 22.464, + "grad_norm": 
1.0762443542480469, + "learning_rate": 1.4915966386554622e-05, + "loss": 0.4373, + "step": 17550 + }, + { + "epoch": 22.46528, + "grad_norm": 1.1819483041763306, + "learning_rate": 1.4913965586234494e-05, + "loss": 0.5661, + "step": 17551 + }, + { + "epoch": 22.46656, + "grad_norm": 1.104282259941101, + "learning_rate": 1.4911964785914367e-05, + "loss": 0.4914, + "step": 17552 + }, + { + "epoch": 22.46784, + "grad_norm": 1.1156835556030273, + "learning_rate": 1.4909963985594239e-05, + "loss": 0.4791, + "step": 17553 + }, + { + "epoch": 22.46912, + "grad_norm": 1.198472023010254, + "learning_rate": 1.490796318527411e-05, + "loss": 0.4995, + "step": 17554 + }, + { + "epoch": 22.4704, + "grad_norm": 1.1563735008239746, + "learning_rate": 1.4905962384953984e-05, + "loss": 0.4815, + "step": 17555 + }, + { + "epoch": 22.47168, + "grad_norm": 1.1188043355941772, + "learning_rate": 1.4903961584633855e-05, + "loss": 0.4759, + "step": 17556 + }, + { + "epoch": 22.47296, + "grad_norm": 1.130955457687378, + "learning_rate": 1.4901960784313726e-05, + "loss": 0.4884, + "step": 17557 + }, + { + "epoch": 22.47424, + "grad_norm": 1.1355321407318115, + "learning_rate": 1.4899959983993597e-05, + "loss": 0.4986, + "step": 17558 + }, + { + "epoch": 22.47552, + "grad_norm": 1.1446267366409302, + "learning_rate": 1.4897959183673472e-05, + "loss": 0.4902, + "step": 17559 + }, + { + "epoch": 22.4768, + "grad_norm": 1.1643836498260498, + "learning_rate": 1.4895958383353342e-05, + "loss": 0.4702, + "step": 17560 + }, + { + "epoch": 22.47808, + "grad_norm": 1.1795804500579834, + "learning_rate": 1.4893957583033214e-05, + "loss": 0.4978, + "step": 17561 + }, + { + "epoch": 22.47936, + "grad_norm": 1.0704967975616455, + "learning_rate": 1.4891956782713084e-05, + "loss": 0.4907, + "step": 17562 + }, + { + "epoch": 22.48064, + "grad_norm": 1.0626331567764282, + "learning_rate": 1.488995598239296e-05, + "loss": 0.4522, + "step": 17563 + }, + { + "epoch": 22.48192, + "grad_norm": 
1.0941236019134521, + "learning_rate": 1.488795518207283e-05, + "loss": 0.4381, + "step": 17564 + }, + { + "epoch": 22.4832, + "grad_norm": 1.100813388824463, + "learning_rate": 1.4885954381752701e-05, + "loss": 0.5071, + "step": 17565 + }, + { + "epoch": 22.48448, + "grad_norm": 1.2816932201385498, + "learning_rate": 1.4883953581432575e-05, + "loss": 0.4937, + "step": 17566 + }, + { + "epoch": 22.48576, + "grad_norm": 1.2014461755752563, + "learning_rate": 1.4881952781112447e-05, + "loss": 0.4908, + "step": 17567 + }, + { + "epoch": 22.48704, + "grad_norm": 1.0948677062988281, + "learning_rate": 1.4879951980792317e-05, + "loss": 0.4935, + "step": 17568 + }, + { + "epoch": 22.48832, + "grad_norm": 1.114828109741211, + "learning_rate": 1.4877951180472189e-05, + "loss": 0.4639, + "step": 17569 + }, + { + "epoch": 22.4896, + "grad_norm": 1.118323802947998, + "learning_rate": 1.4875950380152062e-05, + "loss": 0.5029, + "step": 17570 + }, + { + "epoch": 22.49088, + "grad_norm": 1.11355721950531, + "learning_rate": 1.4873949579831934e-05, + "loss": 0.4878, + "step": 17571 + }, + { + "epoch": 22.49216, + "grad_norm": 1.1527410745620728, + "learning_rate": 1.4871948779511804e-05, + "loss": 0.5242, + "step": 17572 + }, + { + "epoch": 22.49344, + "grad_norm": 1.1299474239349365, + "learning_rate": 1.4869947979191678e-05, + "loss": 0.4836, + "step": 17573 + }, + { + "epoch": 22.49472, + "grad_norm": 1.0945117473602295, + "learning_rate": 1.486794717887155e-05, + "loss": 0.4503, + "step": 17574 + }, + { + "epoch": 22.496, + "grad_norm": 1.0921000242233276, + "learning_rate": 1.4865946378551422e-05, + "loss": 0.5056, + "step": 17575 + }, + { + "epoch": 22.49728, + "grad_norm": 1.1385952234268188, + "learning_rate": 1.4863945578231292e-05, + "loss": 0.5017, + "step": 17576 + }, + { + "epoch": 22.49856, + "grad_norm": 1.1188937425613403, + "learning_rate": 1.4861944777911165e-05, + "loss": 0.4629, + "step": 17577 + }, + { + "epoch": 22.49984, + "grad_norm": 1.1394176483154297, + 
"learning_rate": 1.4859943977591037e-05, + "loss": 0.483, + "step": 17578 + }, + { + "epoch": 22.50112, + "grad_norm": 1.0991370677947998, + "learning_rate": 1.4857943177270909e-05, + "loss": 0.49, + "step": 17579 + }, + { + "epoch": 22.5024, + "grad_norm": 1.1252307891845703, + "learning_rate": 1.4855942376950783e-05, + "loss": 0.4859, + "step": 17580 + }, + { + "epoch": 22.50368, + "grad_norm": 1.1384406089782715, + "learning_rate": 1.4853941576630653e-05, + "loss": 0.4985, + "step": 17581 + }, + { + "epoch": 22.50496, + "grad_norm": 1.1817059516906738, + "learning_rate": 1.4851940776310525e-05, + "loss": 0.4953, + "step": 17582 + }, + { + "epoch": 22.50624, + "grad_norm": 1.1219290494918823, + "learning_rate": 1.4849939975990396e-05, + "loss": 0.4729, + "step": 17583 + }, + { + "epoch": 22.50752, + "grad_norm": 1.1384577751159668, + "learning_rate": 1.484793917567027e-05, + "loss": 0.5415, + "step": 17584 + }, + { + "epoch": 22.5088, + "grad_norm": 1.1150089502334595, + "learning_rate": 1.484593837535014e-05, + "loss": 0.4845, + "step": 17585 + }, + { + "epoch": 22.51008, + "grad_norm": 1.1471636295318604, + "learning_rate": 1.4843937575030012e-05, + "loss": 0.4882, + "step": 17586 + }, + { + "epoch": 22.51136, + "grad_norm": 1.1583935022354126, + "learning_rate": 1.4841936774709886e-05, + "loss": 0.5159, + "step": 17587 + }, + { + "epoch": 22.51264, + "grad_norm": 1.1290314197540283, + "learning_rate": 1.4839935974389757e-05, + "loss": 0.4779, + "step": 17588 + }, + { + "epoch": 22.51392, + "grad_norm": 1.116462230682373, + "learning_rate": 1.4837935174069628e-05, + "loss": 0.4735, + "step": 17589 + }, + { + "epoch": 22.5152, + "grad_norm": 1.1295406818389893, + "learning_rate": 1.48359343737495e-05, + "loss": 0.4985, + "step": 17590 + }, + { + "epoch": 22.51648, + "grad_norm": 1.0936286449432373, + "learning_rate": 1.4833933573429373e-05, + "loss": 0.4902, + "step": 17591 + }, + { + "epoch": 22.51776, + "grad_norm": 1.1139616966247559, + "learning_rate": 
1.4831932773109245e-05, + "loss": 0.5094, + "step": 17592 + }, + { + "epoch": 22.51904, + "grad_norm": 1.0469192266464233, + "learning_rate": 1.4829931972789115e-05, + "loss": 0.4364, + "step": 17593 + }, + { + "epoch": 22.52032, + "grad_norm": 1.0828360319137573, + "learning_rate": 1.482793117246899e-05, + "loss": 0.483, + "step": 17594 + }, + { + "epoch": 22.5216, + "grad_norm": 1.1085882186889648, + "learning_rate": 1.482593037214886e-05, + "loss": 0.5008, + "step": 17595 + }, + { + "epoch": 22.52288, + "grad_norm": 1.1540615558624268, + "learning_rate": 1.4823929571828732e-05, + "loss": 0.4861, + "step": 17596 + }, + { + "epoch": 22.52416, + "grad_norm": 1.106768012046814, + "learning_rate": 1.4821928771508602e-05, + "loss": 0.4946, + "step": 17597 + }, + { + "epoch": 22.52544, + "grad_norm": 1.1145246028900146, + "learning_rate": 1.4819927971188478e-05, + "loss": 0.4877, + "step": 17598 + }, + { + "epoch": 22.52672, + "grad_norm": 1.1328237056732178, + "learning_rate": 1.4817927170868348e-05, + "loss": 0.4733, + "step": 17599 + }, + { + "epoch": 22.528, + "grad_norm": 1.1732732057571411, + "learning_rate": 1.481592637054822e-05, + "loss": 0.4773, + "step": 17600 + }, + { + "epoch": 22.52928, + "grad_norm": 1.093930721282959, + "learning_rate": 1.4813925570228093e-05, + "loss": 0.4651, + "step": 17601 + }, + { + "epoch": 22.53056, + "grad_norm": 1.0944005250930786, + "learning_rate": 1.4811924769907965e-05, + "loss": 0.4874, + "step": 17602 + }, + { + "epoch": 22.53184, + "grad_norm": 1.1620426177978516, + "learning_rate": 1.4809923969587835e-05, + "loss": 0.4759, + "step": 17603 + }, + { + "epoch": 22.53312, + "grad_norm": 1.131779670715332, + "learning_rate": 1.4807923169267707e-05, + "loss": 0.4777, + "step": 17604 + }, + { + "epoch": 22.5344, + "grad_norm": 1.1112638711929321, + "learning_rate": 1.480592236894758e-05, + "loss": 0.4639, + "step": 17605 + }, + { + "epoch": 22.53568, + "grad_norm": 1.039440393447876, + "learning_rate": 1.4803921568627453e-05, 
+ "loss": 0.4314, + "step": 17606 + }, + { + "epoch": 22.53696, + "grad_norm": 1.0729459524154663, + "learning_rate": 1.4801920768307323e-05, + "loss": 0.4708, + "step": 17607 + }, + { + "epoch": 22.538240000000002, + "grad_norm": 1.1430833339691162, + "learning_rate": 1.4799919967987196e-05, + "loss": 0.4659, + "step": 17608 + }, + { + "epoch": 22.53952, + "grad_norm": 1.1749787330627441, + "learning_rate": 1.4797919167667068e-05, + "loss": 0.5147, + "step": 17609 + }, + { + "epoch": 22.5408, + "grad_norm": 1.1035168170928955, + "learning_rate": 1.479591836734694e-05, + "loss": 0.4272, + "step": 17610 + }, + { + "epoch": 22.54208, + "grad_norm": 1.131881594657898, + "learning_rate": 1.479391756702681e-05, + "loss": 0.4769, + "step": 17611 + }, + { + "epoch": 22.54336, + "grad_norm": 1.0516685247421265, + "learning_rate": 1.4791916766706684e-05, + "loss": 0.4525, + "step": 17612 + }, + { + "epoch": 22.54464, + "grad_norm": 1.0958613157272339, + "learning_rate": 1.4789915966386556e-05, + "loss": 0.4753, + "step": 17613 + }, + { + "epoch": 22.54592, + "grad_norm": 1.1349154710769653, + "learning_rate": 1.4787915166066427e-05, + "loss": 0.4946, + "step": 17614 + }, + { + "epoch": 22.5472, + "grad_norm": 1.055051565170288, + "learning_rate": 1.4785914365746298e-05, + "loss": 0.4331, + "step": 17615 + }, + { + "epoch": 22.54848, + "grad_norm": 1.1691598892211914, + "learning_rate": 1.4783913565426171e-05, + "loss": 0.5193, + "step": 17616 + }, + { + "epoch": 22.54976, + "grad_norm": 1.0969427824020386, + "learning_rate": 1.4781912765106043e-05, + "loss": 0.4621, + "step": 17617 + }, + { + "epoch": 22.55104, + "grad_norm": 1.1640121936798096, + "learning_rate": 1.4779911964785915e-05, + "loss": 0.4855, + "step": 17618 + }, + { + "epoch": 22.55232, + "grad_norm": 1.2234057188034058, + "learning_rate": 1.4777911164465788e-05, + "loss": 0.4956, + "step": 17619 + }, + { + "epoch": 22.5536, + "grad_norm": 1.1525243520736694, + "learning_rate": 1.4775910364145659e-05, + 
"loss": 0.4738, + "step": 17620 + }, + { + "epoch": 22.55488, + "grad_norm": 1.1218520402908325, + "learning_rate": 1.477390956382553e-05, + "loss": 0.4954, + "step": 17621 + }, + { + "epoch": 22.55616, + "grad_norm": 1.1429941654205322, + "learning_rate": 1.4771908763505402e-05, + "loss": 0.4926, + "step": 17622 + }, + { + "epoch": 22.55744, + "grad_norm": 1.072890043258667, + "learning_rate": 1.4769907963185276e-05, + "loss": 0.4441, + "step": 17623 + }, + { + "epoch": 22.55872, + "grad_norm": 1.085209608078003, + "learning_rate": 1.4767907162865146e-05, + "loss": 0.4751, + "step": 17624 + }, + { + "epoch": 22.56, + "grad_norm": 1.085693597793579, + "learning_rate": 1.4765906362545018e-05, + "loss": 0.4678, + "step": 17625 + }, + { + "epoch": 22.56128, + "grad_norm": 1.1705639362335205, + "learning_rate": 1.4763905562224891e-05, + "loss": 0.4686, + "step": 17626 + }, + { + "epoch": 22.56256, + "grad_norm": 1.130325198173523, + "learning_rate": 1.4761904761904763e-05, + "loss": 0.5101, + "step": 17627 + }, + { + "epoch": 22.56384, + "grad_norm": 1.1805752515792847, + "learning_rate": 1.4759903961584633e-05, + "loss": 0.5458, + "step": 17628 + }, + { + "epoch": 22.56512, + "grad_norm": 1.1645885705947876, + "learning_rate": 1.4757903161264505e-05, + "loss": 0.5055, + "step": 17629 + }, + { + "epoch": 22.5664, + "grad_norm": 1.2138627767562866, + "learning_rate": 1.4755902360944379e-05, + "loss": 0.5324, + "step": 17630 + }, + { + "epoch": 22.56768, + "grad_norm": 1.119138240814209, + "learning_rate": 1.475390156062425e-05, + "loss": 0.4681, + "step": 17631 + }, + { + "epoch": 22.56896, + "grad_norm": 1.09098482131958, + "learning_rate": 1.475190076030412e-05, + "loss": 0.4839, + "step": 17632 + }, + { + "epoch": 22.57024, + "grad_norm": 1.1301281452178955, + "learning_rate": 1.4749899959983996e-05, + "loss": 0.4629, + "step": 17633 + }, + { + "epoch": 22.57152, + "grad_norm": 1.1431349515914917, + "learning_rate": 1.4747899159663866e-05, + "loss": 0.5064, + "step": 
17634 + }, + { + "epoch": 22.5728, + "grad_norm": 1.023766279220581, + "learning_rate": 1.4745898359343738e-05, + "loss": 0.4402, + "step": 17635 + }, + { + "epoch": 22.57408, + "grad_norm": 1.1107232570648193, + "learning_rate": 1.4743897559023608e-05, + "loss": 0.4486, + "step": 17636 + }, + { + "epoch": 22.57536, + "grad_norm": 1.0915488004684448, + "learning_rate": 1.4741896758703483e-05, + "loss": 0.4598, + "step": 17637 + }, + { + "epoch": 22.57664, + "grad_norm": 1.1454777717590332, + "learning_rate": 1.4739895958383354e-05, + "loss": 0.5209, + "step": 17638 + }, + { + "epoch": 22.57792, + "grad_norm": 1.0833492279052734, + "learning_rate": 1.4737895158063225e-05, + "loss": 0.5036, + "step": 17639 + }, + { + "epoch": 22.5792, + "grad_norm": 1.115124225616455, + "learning_rate": 1.4735894357743099e-05, + "loss": 0.5157, + "step": 17640 + }, + { + "epoch": 22.58048, + "grad_norm": 1.121090292930603, + "learning_rate": 1.4733893557422971e-05, + "loss": 0.5237, + "step": 17641 + }, + { + "epoch": 22.58176, + "grad_norm": 1.0920593738555908, + "learning_rate": 1.4731892757102841e-05, + "loss": 0.4775, + "step": 17642 + }, + { + "epoch": 22.58304, + "grad_norm": 1.0497217178344727, + "learning_rate": 1.4729891956782713e-05, + "loss": 0.4622, + "step": 17643 + }, + { + "epoch": 22.584319999999998, + "grad_norm": 1.1190520524978638, + "learning_rate": 1.4727891156462586e-05, + "loss": 0.4555, + "step": 17644 + }, + { + "epoch": 22.5856, + "grad_norm": 1.1475340127944946, + "learning_rate": 1.4725890356142458e-05, + "loss": 0.5076, + "step": 17645 + }, + { + "epoch": 22.58688, + "grad_norm": 1.2062084674835205, + "learning_rate": 1.4723889555822328e-05, + "loss": 0.4723, + "step": 17646 + }, + { + "epoch": 22.58816, + "grad_norm": 1.1473878622055054, + "learning_rate": 1.4721888755502202e-05, + "loss": 0.522, + "step": 17647 + }, + { + "epoch": 22.58944, + "grad_norm": 1.0924144983291626, + "learning_rate": 1.4719887955182074e-05, + "loss": 0.4567, + "step": 17648 + 
}, + { + "epoch": 22.59072, + "grad_norm": 1.141308069229126, + "learning_rate": 1.4717887154861946e-05, + "loss": 0.4708, + "step": 17649 + }, + { + "epoch": 22.592, + "grad_norm": 1.0392951965332031, + "learning_rate": 1.4715886354541816e-05, + "loss": 0.4772, + "step": 17650 + }, + { + "epoch": 22.59328, + "grad_norm": 1.0937608480453491, + "learning_rate": 1.471388555422169e-05, + "loss": 0.4681, + "step": 17651 + }, + { + "epoch": 22.59456, + "grad_norm": 1.1153881549835205, + "learning_rate": 1.4711884753901561e-05, + "loss": 0.5083, + "step": 17652 + }, + { + "epoch": 22.59584, + "grad_norm": 1.1007659435272217, + "learning_rate": 1.4709883953581433e-05, + "loss": 0.4735, + "step": 17653 + }, + { + "epoch": 22.59712, + "grad_norm": 1.126693844795227, + "learning_rate": 1.4707883153261307e-05, + "loss": 0.4602, + "step": 17654 + }, + { + "epoch": 22.5984, + "grad_norm": 1.103529453277588, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.4932, + "step": 17655 + }, + { + "epoch": 22.59968, + "grad_norm": 1.045106053352356, + "learning_rate": 1.4703881552621049e-05, + "loss": 0.4576, + "step": 17656 + }, + { + "epoch": 22.60096, + "grad_norm": 1.0801410675048828, + "learning_rate": 1.470188075230092e-05, + "loss": 0.4534, + "step": 17657 + }, + { + "epoch": 22.60224, + "grad_norm": 1.1122349500656128, + "learning_rate": 1.4699879951980794e-05, + "loss": 0.4549, + "step": 17658 + }, + { + "epoch": 22.60352, + "grad_norm": 1.1067173480987549, + "learning_rate": 1.4697879151660664e-05, + "loss": 0.478, + "step": 17659 + }, + { + "epoch": 22.6048, + "grad_norm": 1.1408361196517944, + "learning_rate": 1.4695878351340536e-05, + "loss": 0.4798, + "step": 17660 + }, + { + "epoch": 22.60608, + "grad_norm": 1.1362731456756592, + "learning_rate": 1.469387755102041e-05, + "loss": 0.4567, + "step": 17661 + }, + { + "epoch": 22.60736, + "grad_norm": 1.2117960453033447, + "learning_rate": 1.4691876750700282e-05, + "loss": 0.5187, + "step": 17662 + }, + { + "epoch": 
22.60864, + "grad_norm": 1.155263066291809, + "learning_rate": 1.4689875950380152e-05, + "loss": 0.531, + "step": 17663 + }, + { + "epoch": 22.60992, + "grad_norm": 1.1107439994812012, + "learning_rate": 1.4687875150060024e-05, + "loss": 0.471, + "step": 17664 + }, + { + "epoch": 22.6112, + "grad_norm": 1.1272735595703125, + "learning_rate": 1.4685874349739897e-05, + "loss": 0.4807, + "step": 17665 + }, + { + "epoch": 22.61248, + "grad_norm": 1.142755150794983, + "learning_rate": 1.4683873549419769e-05, + "loss": 0.4807, + "step": 17666 + }, + { + "epoch": 22.61376, + "grad_norm": 1.160695195198059, + "learning_rate": 1.468187274909964e-05, + "loss": 0.5241, + "step": 17667 + }, + { + "epoch": 22.61504, + "grad_norm": 1.1584571599960327, + "learning_rate": 1.4679871948779514e-05, + "loss": 0.4977, + "step": 17668 + }, + { + "epoch": 22.61632, + "grad_norm": 1.08279287815094, + "learning_rate": 1.4677871148459385e-05, + "loss": 0.4404, + "step": 17669 + }, + { + "epoch": 22.6176, + "grad_norm": 1.0838192701339722, + "learning_rate": 1.4675870348139256e-05, + "loss": 0.4565, + "step": 17670 + }, + { + "epoch": 22.61888, + "grad_norm": 1.1284055709838867, + "learning_rate": 1.4673869547819127e-05, + "loss": 0.5155, + "step": 17671 + }, + { + "epoch": 22.62016, + "grad_norm": 1.0866196155548096, + "learning_rate": 1.4671868747499002e-05, + "loss": 0.4731, + "step": 17672 + }, + { + "epoch": 22.62144, + "grad_norm": 1.1500025987625122, + "learning_rate": 1.4669867947178872e-05, + "loss": 0.5287, + "step": 17673 + }, + { + "epoch": 22.62272, + "grad_norm": 1.1272227764129639, + "learning_rate": 1.4667867146858744e-05, + "loss": 0.4799, + "step": 17674 + }, + { + "epoch": 22.624, + "grad_norm": 1.1563876867294312, + "learning_rate": 1.4665866346538614e-05, + "loss": 0.4674, + "step": 17675 + }, + { + "epoch": 22.62528, + "grad_norm": 1.1425659656524658, + "learning_rate": 1.466386554621849e-05, + "loss": 0.4739, + "step": 17676 + }, + { + "epoch": 22.62656, + "grad_norm": 
1.1223922967910767, + "learning_rate": 1.466186474589836e-05, + "loss": 0.4803, + "step": 17677 + }, + { + "epoch": 22.62784, + "grad_norm": 1.1119972467422485, + "learning_rate": 1.4659863945578231e-05, + "loss": 0.5031, + "step": 17678 + }, + { + "epoch": 22.62912, + "grad_norm": 1.1872590780258179, + "learning_rate": 1.4657863145258105e-05, + "loss": 0.5111, + "step": 17679 + }, + { + "epoch": 22.6304, + "grad_norm": 1.173486351966858, + "learning_rate": 1.4655862344937977e-05, + "loss": 0.5107, + "step": 17680 + }, + { + "epoch": 22.63168, + "grad_norm": 1.1515588760375977, + "learning_rate": 1.4653861544617847e-05, + "loss": 0.4845, + "step": 17681 + }, + { + "epoch": 22.63296, + "grad_norm": 1.1229499578475952, + "learning_rate": 1.4651860744297719e-05, + "loss": 0.4891, + "step": 17682 + }, + { + "epoch": 22.63424, + "grad_norm": 1.1395021677017212, + "learning_rate": 1.4649859943977592e-05, + "loss": 0.4833, + "step": 17683 + }, + { + "epoch": 22.63552, + "grad_norm": 1.1398979425430298, + "learning_rate": 1.4647859143657464e-05, + "loss": 0.4845, + "step": 17684 + }, + { + "epoch": 22.6368, + "grad_norm": 1.1859970092773438, + "learning_rate": 1.4645858343337334e-05, + "loss": 0.5003, + "step": 17685 + }, + { + "epoch": 22.63808, + "grad_norm": 1.1473771333694458, + "learning_rate": 1.464385754301721e-05, + "loss": 0.4632, + "step": 17686 + }, + { + "epoch": 22.63936, + "grad_norm": 1.087823748588562, + "learning_rate": 1.464185674269708e-05, + "loss": 0.4444, + "step": 17687 + }, + { + "epoch": 22.64064, + "grad_norm": 1.0971521139144897, + "learning_rate": 1.4639855942376952e-05, + "loss": 0.4446, + "step": 17688 + }, + { + "epoch": 22.64192, + "grad_norm": 1.1230698823928833, + "learning_rate": 1.4637855142056822e-05, + "loss": 0.45, + "step": 17689 + }, + { + "epoch": 22.6432, + "grad_norm": 1.176695704460144, + "learning_rate": 1.4635854341736697e-05, + "loss": 0.5211, + "step": 17690 + }, + { + "epoch": 22.64448, + "grad_norm": 1.1348381042480469, + 
"learning_rate": 1.4633853541416567e-05, + "loss": 0.487, + "step": 17691 + }, + { + "epoch": 22.64576, + "grad_norm": 1.1356598138809204, + "learning_rate": 1.4631852741096439e-05, + "loss": 0.5139, + "step": 17692 + }, + { + "epoch": 22.64704, + "grad_norm": 1.204661250114441, + "learning_rate": 1.4629851940776313e-05, + "loss": 0.5212, + "step": 17693 + }, + { + "epoch": 22.64832, + "grad_norm": 1.1288728713989258, + "learning_rate": 1.4627851140456184e-05, + "loss": 0.5026, + "step": 17694 + }, + { + "epoch": 22.6496, + "grad_norm": 1.1189838647842407, + "learning_rate": 1.4625850340136055e-05, + "loss": 0.4806, + "step": 17695 + }, + { + "epoch": 22.65088, + "grad_norm": 1.099488615989685, + "learning_rate": 1.4623849539815926e-05, + "loss": 0.4657, + "step": 17696 + }, + { + "epoch": 22.65216, + "grad_norm": 1.1469444036483765, + "learning_rate": 1.46218487394958e-05, + "loss": 0.5038, + "step": 17697 + }, + { + "epoch": 22.65344, + "grad_norm": 1.1008341312408447, + "learning_rate": 1.4619847939175672e-05, + "loss": 0.4696, + "step": 17698 + }, + { + "epoch": 22.65472, + "grad_norm": 1.1267304420471191, + "learning_rate": 1.4617847138855542e-05, + "loss": 0.4798, + "step": 17699 + }, + { + "epoch": 22.656, + "grad_norm": 1.0712333917617798, + "learning_rate": 1.4615846338535416e-05, + "loss": 0.4875, + "step": 17700 + }, + { + "epoch": 22.65728, + "grad_norm": 1.090133547782898, + "learning_rate": 1.4613845538215287e-05, + "loss": 0.5221, + "step": 17701 + }, + { + "epoch": 22.65856, + "grad_norm": 1.0911118984222412, + "learning_rate": 1.461184473789516e-05, + "loss": 0.4545, + "step": 17702 + }, + { + "epoch": 22.65984, + "grad_norm": 1.1019871234893799, + "learning_rate": 1.460984393757503e-05, + "loss": 0.5021, + "step": 17703 + }, + { + "epoch": 22.66112, + "grad_norm": 1.1478427648544312, + "learning_rate": 1.4607843137254903e-05, + "loss": 0.4791, + "step": 17704 + }, + { + "epoch": 22.6624, + "grad_norm": 1.038714051246643, + "learning_rate": 
1.4605842336934775e-05, + "loss": 0.4635, + "step": 17705 + }, + { + "epoch": 22.66368, + "grad_norm": 1.1068960428237915, + "learning_rate": 1.4603841536614647e-05, + "loss": 0.4915, + "step": 17706 + }, + { + "epoch": 22.66496, + "grad_norm": 1.1241267919540405, + "learning_rate": 1.460184073629452e-05, + "loss": 0.4938, + "step": 17707 + }, + { + "epoch": 22.66624, + "grad_norm": 1.1513806581497192, + "learning_rate": 1.459983993597439e-05, + "loss": 0.4996, + "step": 17708 + }, + { + "epoch": 22.66752, + "grad_norm": 1.0984781980514526, + "learning_rate": 1.4597839135654262e-05, + "loss": 0.4556, + "step": 17709 + }, + { + "epoch": 22.6688, + "grad_norm": 1.1513564586639404, + "learning_rate": 1.4595838335334134e-05, + "loss": 0.4871, + "step": 17710 + }, + { + "epoch": 22.67008, + "grad_norm": 1.0893858671188354, + "learning_rate": 1.4593837535014008e-05, + "loss": 0.418, + "step": 17711 + }, + { + "epoch": 22.67136, + "grad_norm": 1.1374493837356567, + "learning_rate": 1.4591836734693878e-05, + "loss": 0.4628, + "step": 17712 + }, + { + "epoch": 22.67264, + "grad_norm": 1.066206932067871, + "learning_rate": 1.458983593437375e-05, + "loss": 0.4407, + "step": 17713 + }, + { + "epoch": 22.67392, + "grad_norm": 1.120012640953064, + "learning_rate": 1.4587835134053623e-05, + "loss": 0.4844, + "step": 17714 + }, + { + "epoch": 22.6752, + "grad_norm": 1.1568934917449951, + "learning_rate": 1.4585834333733495e-05, + "loss": 0.4558, + "step": 17715 + }, + { + "epoch": 22.67648, + "grad_norm": 1.1683546304702759, + "learning_rate": 1.4583833533413365e-05, + "loss": 0.5141, + "step": 17716 + }, + { + "epoch": 22.67776, + "grad_norm": 1.1002466678619385, + "learning_rate": 1.4581832733093237e-05, + "loss": 0.483, + "step": 17717 + }, + { + "epoch": 22.67904, + "grad_norm": 1.1254520416259766, + "learning_rate": 1.457983193277311e-05, + "loss": 0.4755, + "step": 17718 + }, + { + "epoch": 22.680320000000002, + "grad_norm": 1.0948314666748047, + "learning_rate": 
1.4577831132452982e-05, + "loss": 0.4998, + "step": 17719 + }, + { + "epoch": 22.6816, + "grad_norm": 1.126795768737793, + "learning_rate": 1.4575830332132853e-05, + "loss": 0.4927, + "step": 17720 + }, + { + "epoch": 22.68288, + "grad_norm": 1.096273422241211, + "learning_rate": 1.4573829531812728e-05, + "loss": 0.4373, + "step": 17721 + }, + { + "epoch": 22.68416, + "grad_norm": 1.1614435911178589, + "learning_rate": 1.4571828731492598e-05, + "loss": 0.5023, + "step": 17722 + }, + { + "epoch": 22.68544, + "grad_norm": 1.181004524230957, + "learning_rate": 1.456982793117247e-05, + "loss": 0.5358, + "step": 17723 + }, + { + "epoch": 22.68672, + "grad_norm": 1.0640144348144531, + "learning_rate": 1.456782713085234e-05, + "loss": 0.4442, + "step": 17724 + }, + { + "epoch": 22.688, + "grad_norm": 1.1106035709381104, + "learning_rate": 1.4565826330532215e-05, + "loss": 0.4776, + "step": 17725 + }, + { + "epoch": 22.68928, + "grad_norm": 1.104239583015442, + "learning_rate": 1.4563825530212085e-05, + "loss": 0.4939, + "step": 17726 + }, + { + "epoch": 22.69056, + "grad_norm": 1.165892481803894, + "learning_rate": 1.4561824729891957e-05, + "loss": 0.5086, + "step": 17727 + }, + { + "epoch": 22.69184, + "grad_norm": 1.1454297304153442, + "learning_rate": 1.4559823929571828e-05, + "loss": 0.4816, + "step": 17728 + }, + { + "epoch": 22.69312, + "grad_norm": 1.103267788887024, + "learning_rate": 1.4557823129251703e-05, + "loss": 0.4574, + "step": 17729 + }, + { + "epoch": 22.6944, + "grad_norm": 1.1371041536331177, + "learning_rate": 1.4555822328931573e-05, + "loss": 0.4844, + "step": 17730 + }, + { + "epoch": 22.69568, + "grad_norm": 1.0860483646392822, + "learning_rate": 1.4553821528611445e-05, + "loss": 0.4337, + "step": 17731 + }, + { + "epoch": 22.69696, + "grad_norm": 1.1786482334136963, + "learning_rate": 1.4551820728291318e-05, + "loss": 0.5068, + "step": 17732 + }, + { + "epoch": 22.69824, + "grad_norm": 1.1752735376358032, + "learning_rate": 1.454981992797119e-05, 
+ "loss": 0.5256, + "step": 17733 + }, + { + "epoch": 22.69952, + "grad_norm": 1.1000226736068726, + "learning_rate": 1.454781912765106e-05, + "loss": 0.4766, + "step": 17734 + }, + { + "epoch": 22.7008, + "grad_norm": 1.137929081916809, + "learning_rate": 1.4545818327330932e-05, + "loss": 0.4936, + "step": 17735 + }, + { + "epoch": 22.70208, + "grad_norm": 1.1989151239395142, + "learning_rate": 1.4543817527010806e-05, + "loss": 0.5027, + "step": 17736 + }, + { + "epoch": 22.70336, + "grad_norm": 1.180206537246704, + "learning_rate": 1.4541816726690678e-05, + "loss": 0.5085, + "step": 17737 + }, + { + "epoch": 22.70464, + "grad_norm": 1.0755740404129028, + "learning_rate": 1.4539815926370548e-05, + "loss": 0.4829, + "step": 17738 + }, + { + "epoch": 22.70592, + "grad_norm": 1.0281373262405396, + "learning_rate": 1.4537815126050421e-05, + "loss": 0.4234, + "step": 17739 + }, + { + "epoch": 22.7072, + "grad_norm": 1.0628975629806519, + "learning_rate": 1.4535814325730293e-05, + "loss": 0.4498, + "step": 17740 + }, + { + "epoch": 22.70848, + "grad_norm": 1.0640840530395508, + "learning_rate": 1.4533813525410165e-05, + "loss": 0.4457, + "step": 17741 + }, + { + "epoch": 22.70976, + "grad_norm": 1.1275110244750977, + "learning_rate": 1.4531812725090035e-05, + "loss": 0.5072, + "step": 17742 + }, + { + "epoch": 22.71104, + "grad_norm": 1.1140217781066895, + "learning_rate": 1.4529811924769909e-05, + "loss": 0.4612, + "step": 17743 + }, + { + "epoch": 22.71232, + "grad_norm": 1.083038568496704, + "learning_rate": 1.452781112444978e-05, + "loss": 0.4893, + "step": 17744 + }, + { + "epoch": 22.7136, + "grad_norm": 1.105229377746582, + "learning_rate": 1.4525810324129652e-05, + "loss": 0.461, + "step": 17745 + }, + { + "epoch": 22.71488, + "grad_norm": 1.2414989471435547, + "learning_rate": 1.4523809523809526e-05, + "loss": 0.5307, + "step": 17746 + }, + { + "epoch": 22.71616, + "grad_norm": 1.2143381834030151, + "learning_rate": 1.4521808723489396e-05, + "loss": 0.5498, + 
"step": 17747 + }, + { + "epoch": 22.71744, + "grad_norm": 1.171492099761963, + "learning_rate": 1.4519807923169268e-05, + "loss": 0.4874, + "step": 17748 + }, + { + "epoch": 22.71872, + "grad_norm": 1.1219180822372437, + "learning_rate": 1.451780712284914e-05, + "loss": 0.4637, + "step": 17749 + }, + { + "epoch": 22.72, + "grad_norm": 1.1720901727676392, + "learning_rate": 1.4515806322529013e-05, + "loss": 0.5322, + "step": 17750 + }, + { + "epoch": 22.72128, + "grad_norm": 1.137251615524292, + "learning_rate": 1.4513805522208884e-05, + "loss": 0.5004, + "step": 17751 + }, + { + "epoch": 22.72256, + "grad_norm": 1.147051215171814, + "learning_rate": 1.4511804721888755e-05, + "loss": 0.4769, + "step": 17752 + }, + { + "epoch": 22.72384, + "grad_norm": 1.1876957416534424, + "learning_rate": 1.4509803921568629e-05, + "loss": 0.4828, + "step": 17753 + }, + { + "epoch": 22.72512, + "grad_norm": 1.1537209749221802, + "learning_rate": 1.4507803121248501e-05, + "loss": 0.5377, + "step": 17754 + }, + { + "epoch": 22.7264, + "grad_norm": 1.0867969989776611, + "learning_rate": 1.4505802320928371e-05, + "loss": 0.4288, + "step": 17755 + }, + { + "epoch": 22.72768, + "grad_norm": 1.1193979978561401, + "learning_rate": 1.4503801520608243e-05, + "loss": 0.4479, + "step": 17756 + }, + { + "epoch": 22.72896, + "grad_norm": 1.113378882408142, + "learning_rate": 1.4501800720288116e-05, + "loss": 0.4729, + "step": 17757 + }, + { + "epoch": 22.73024, + "grad_norm": 1.0698660612106323, + "learning_rate": 1.4499799919967988e-05, + "loss": 0.454, + "step": 17758 + }, + { + "epoch": 22.73152, + "grad_norm": 1.0980463027954102, + "learning_rate": 1.4497799119647858e-05, + "loss": 0.512, + "step": 17759 + }, + { + "epoch": 22.7328, + "grad_norm": 1.0734245777130127, + "learning_rate": 1.4495798319327734e-05, + "loss": 0.4737, + "step": 17760 + }, + { + "epoch": 22.73408, + "grad_norm": 1.1146806478500366, + "learning_rate": 1.4493797519007604e-05, + "loss": 0.4867, + "step": 17761 + }, + { 
+ "epoch": 22.73536, + "grad_norm": 1.1204739809036255, + "learning_rate": 1.4491796718687476e-05, + "loss": 0.4844, + "step": 17762 + }, + { + "epoch": 22.73664, + "grad_norm": 1.0988640785217285, + "learning_rate": 1.4489795918367346e-05, + "loss": 0.4449, + "step": 17763 + }, + { + "epoch": 22.73792, + "grad_norm": 1.1634745597839355, + "learning_rate": 1.4487795118047221e-05, + "loss": 0.5171, + "step": 17764 + }, + { + "epoch": 22.7392, + "grad_norm": 1.177647352218628, + "learning_rate": 1.4485794317727091e-05, + "loss": 0.5184, + "step": 17765 + }, + { + "epoch": 22.74048, + "grad_norm": 1.12690007686615, + "learning_rate": 1.4483793517406963e-05, + "loss": 0.4881, + "step": 17766 + }, + { + "epoch": 22.74176, + "grad_norm": 1.1667804718017578, + "learning_rate": 1.4481792717086837e-05, + "loss": 0.4587, + "step": 17767 + }, + { + "epoch": 22.74304, + "grad_norm": 1.1464014053344727, + "learning_rate": 1.4479791916766709e-05, + "loss": 0.4792, + "step": 17768 + }, + { + "epoch": 22.74432, + "grad_norm": 1.0807876586914062, + "learning_rate": 1.4477791116446579e-05, + "loss": 0.4861, + "step": 17769 + }, + { + "epoch": 22.7456, + "grad_norm": 1.1696687936782837, + "learning_rate": 1.447579031612645e-05, + "loss": 0.5222, + "step": 17770 + }, + { + "epoch": 22.74688, + "grad_norm": 1.1905460357666016, + "learning_rate": 1.4473789515806324e-05, + "loss": 0.5166, + "step": 17771 + }, + { + "epoch": 22.74816, + "grad_norm": 1.1877039670944214, + "learning_rate": 1.4471788715486196e-05, + "loss": 0.4591, + "step": 17772 + }, + { + "epoch": 22.74944, + "grad_norm": 1.1958463191986084, + "learning_rate": 1.4469787915166066e-05, + "loss": 0.4868, + "step": 17773 + }, + { + "epoch": 22.75072, + "grad_norm": 1.1075440645217896, + "learning_rate": 1.446778711484594e-05, + "loss": 0.4828, + "step": 17774 + }, + { + "epoch": 22.752, + "grad_norm": 1.0892101526260376, + "learning_rate": 1.4465786314525812e-05, + "loss": 0.4635, + "step": 17775 + }, + { + "epoch": 22.75328, 
+ "grad_norm": 1.1559746265411377, + "learning_rate": 1.4463785514205683e-05, + "loss": 0.5233, + "step": 17776 + }, + { + "epoch": 22.75456, + "grad_norm": 1.0793570280075073, + "learning_rate": 1.4461784713885554e-05, + "loss": 0.4376, + "step": 17777 + }, + { + "epoch": 22.75584, + "grad_norm": 1.12493896484375, + "learning_rate": 1.4459783913565427e-05, + "loss": 0.5247, + "step": 17778 + }, + { + "epoch": 22.75712, + "grad_norm": 1.1282217502593994, + "learning_rate": 1.4457783113245299e-05, + "loss": 0.4513, + "step": 17779 + }, + { + "epoch": 22.7584, + "grad_norm": 1.1162893772125244, + "learning_rate": 1.445578231292517e-05, + "loss": 0.4873, + "step": 17780 + }, + { + "epoch": 22.75968, + "grad_norm": 1.1070144176483154, + "learning_rate": 1.4453781512605044e-05, + "loss": 0.4985, + "step": 17781 + }, + { + "epoch": 22.76096, + "grad_norm": 1.0941122770309448, + "learning_rate": 1.4451780712284915e-05, + "loss": 0.4902, + "step": 17782 + }, + { + "epoch": 22.76224, + "grad_norm": 1.1269056797027588, + "learning_rate": 1.4449779911964786e-05, + "loss": 0.469, + "step": 17783 + }, + { + "epoch": 22.76352, + "grad_norm": 1.0950514078140259, + "learning_rate": 1.4447779111644658e-05, + "loss": 0.4394, + "step": 17784 + }, + { + "epoch": 22.7648, + "grad_norm": 1.1223610639572144, + "learning_rate": 1.4445778311324532e-05, + "loss": 0.4732, + "step": 17785 + }, + { + "epoch": 22.76608, + "grad_norm": 1.0896393060684204, + "learning_rate": 1.4443777511004402e-05, + "loss": 0.4444, + "step": 17786 + }, + { + "epoch": 22.76736, + "grad_norm": 1.1810468435287476, + "learning_rate": 1.4441776710684274e-05, + "loss": 0.5342, + "step": 17787 + }, + { + "epoch": 22.76864, + "grad_norm": 1.1601206064224243, + "learning_rate": 1.4439775910364146e-05, + "loss": 0.5354, + "step": 17788 + }, + { + "epoch": 22.76992, + "grad_norm": 1.1780500411987305, + "learning_rate": 1.443777511004402e-05, + "loss": 0.4826, + "step": 17789 + }, + { + "epoch": 22.7712, + "grad_norm": 
1.1130772829055786, + "learning_rate": 1.443577430972389e-05, + "loss": 0.4948, + "step": 17790 + }, + { + "epoch": 22.77248, + "grad_norm": 1.1274909973144531, + "learning_rate": 1.4433773509403761e-05, + "loss": 0.4979, + "step": 17791 + }, + { + "epoch": 22.77376, + "grad_norm": 1.1447163820266724, + "learning_rate": 1.4431772709083635e-05, + "loss": 0.4735, + "step": 17792 + }, + { + "epoch": 22.77504, + "grad_norm": 1.0660210847854614, + "learning_rate": 1.4429771908763507e-05, + "loss": 0.4555, + "step": 17793 + }, + { + "epoch": 22.77632, + "grad_norm": 1.0781277418136597, + "learning_rate": 1.4427771108443377e-05, + "loss": 0.4619, + "step": 17794 + }, + { + "epoch": 22.7776, + "grad_norm": 1.1187148094177246, + "learning_rate": 1.4425770308123249e-05, + "loss": 0.4552, + "step": 17795 + }, + { + "epoch": 22.77888, + "grad_norm": 1.1533479690551758, + "learning_rate": 1.4423769507803122e-05, + "loss": 0.4782, + "step": 17796 + }, + { + "epoch": 22.78016, + "grad_norm": 1.143250584602356, + "learning_rate": 1.4421768707482994e-05, + "loss": 0.5023, + "step": 17797 + }, + { + "epoch": 22.78144, + "grad_norm": 1.1441926956176758, + "learning_rate": 1.4419767907162864e-05, + "loss": 0.5012, + "step": 17798 + }, + { + "epoch": 22.78272, + "grad_norm": 1.1615264415740967, + "learning_rate": 1.441776710684274e-05, + "loss": 0.5388, + "step": 17799 + }, + { + "epoch": 22.784, + "grad_norm": 1.1589363813400269, + "learning_rate": 1.441576630652261e-05, + "loss": 0.505, + "step": 17800 + }, + { + "epoch": 22.78528, + "grad_norm": 1.1797236204147339, + "learning_rate": 1.4413765506202482e-05, + "loss": 0.502, + "step": 17801 + }, + { + "epoch": 22.78656, + "grad_norm": 1.2213630676269531, + "learning_rate": 1.4411764705882352e-05, + "loss": 0.5167, + "step": 17802 + }, + { + "epoch": 22.78784, + "grad_norm": 1.155617117881775, + "learning_rate": 1.4409763905562227e-05, + "loss": 0.4613, + "step": 17803 + }, + { + "epoch": 22.78912, + "grad_norm": 1.1112602949142456, + 
"learning_rate": 1.4407763105242097e-05, + "loss": 0.4594, + "step": 17804 + }, + { + "epoch": 22.790399999999998, + "grad_norm": 1.1475427150726318, + "learning_rate": 1.4405762304921969e-05, + "loss": 0.457, + "step": 17805 + }, + { + "epoch": 22.79168, + "grad_norm": 1.1044026613235474, + "learning_rate": 1.4403761504601842e-05, + "loss": 0.48, + "step": 17806 + }, + { + "epoch": 22.79296, + "grad_norm": 1.119400978088379, + "learning_rate": 1.4401760704281714e-05, + "loss": 0.481, + "step": 17807 + }, + { + "epoch": 22.79424, + "grad_norm": 1.1394966840744019, + "learning_rate": 1.4399759903961585e-05, + "loss": 0.5108, + "step": 17808 + }, + { + "epoch": 22.79552, + "grad_norm": 1.170421838760376, + "learning_rate": 1.4397759103641456e-05, + "loss": 0.4999, + "step": 17809 + }, + { + "epoch": 22.7968, + "grad_norm": 1.134114384651184, + "learning_rate": 1.439575830332133e-05, + "loss": 0.4723, + "step": 17810 + }, + { + "epoch": 22.79808, + "grad_norm": 1.180664300918579, + "learning_rate": 1.4393757503001202e-05, + "loss": 0.4983, + "step": 17811 + }, + { + "epoch": 22.79936, + "grad_norm": 1.2017977237701416, + "learning_rate": 1.4391756702681072e-05, + "loss": 0.5217, + "step": 17812 + }, + { + "epoch": 22.80064, + "grad_norm": 1.0838621854782104, + "learning_rate": 1.4389755902360945e-05, + "loss": 0.4477, + "step": 17813 + }, + { + "epoch": 22.80192, + "grad_norm": 1.1182233095169067, + "learning_rate": 1.4387755102040817e-05, + "loss": 0.4882, + "step": 17814 + }, + { + "epoch": 22.8032, + "grad_norm": 1.1456414461135864, + "learning_rate": 1.438575430172069e-05, + "loss": 0.4561, + "step": 17815 + }, + { + "epoch": 22.80448, + "grad_norm": 1.0864543914794922, + "learning_rate": 1.438375350140056e-05, + "loss": 0.4516, + "step": 17816 + }, + { + "epoch": 22.80576, + "grad_norm": 1.145043134689331, + "learning_rate": 1.4381752701080433e-05, + "loss": 0.4764, + "step": 17817 + }, + { + "epoch": 22.80704, + "grad_norm": 1.1899402141571045, + 
"learning_rate": 1.4379751900760305e-05, + "loss": 0.564, + "step": 17818 + }, + { + "epoch": 22.80832, + "grad_norm": 1.0811036825180054, + "learning_rate": 1.4377751100440177e-05, + "loss": 0.453, + "step": 17819 + }, + { + "epoch": 22.8096, + "grad_norm": 1.1460031270980835, + "learning_rate": 1.437575030012005e-05, + "loss": 0.4835, + "step": 17820 + }, + { + "epoch": 22.81088, + "grad_norm": 1.1830692291259766, + "learning_rate": 1.437374949979992e-05, + "loss": 0.4641, + "step": 17821 + }, + { + "epoch": 22.81216, + "grad_norm": 1.204338550567627, + "learning_rate": 1.4371748699479792e-05, + "loss": 0.4938, + "step": 17822 + }, + { + "epoch": 22.81344, + "grad_norm": 1.107739806175232, + "learning_rate": 1.4369747899159664e-05, + "loss": 0.459, + "step": 17823 + }, + { + "epoch": 22.81472, + "grad_norm": 1.113206386566162, + "learning_rate": 1.4367747098839538e-05, + "loss": 0.459, + "step": 17824 + }, + { + "epoch": 22.816, + "grad_norm": 1.164965033531189, + "learning_rate": 1.4365746298519408e-05, + "loss": 0.533, + "step": 17825 + }, + { + "epoch": 22.81728, + "grad_norm": 1.1349233388900757, + "learning_rate": 1.436374549819928e-05, + "loss": 0.4582, + "step": 17826 + }, + { + "epoch": 22.81856, + "grad_norm": 1.1366132497787476, + "learning_rate": 1.4361744697879153e-05, + "loss": 0.518, + "step": 17827 + }, + { + "epoch": 22.81984, + "grad_norm": 1.064651370048523, + "learning_rate": 1.4359743897559025e-05, + "loss": 0.4514, + "step": 17828 + }, + { + "epoch": 22.82112, + "grad_norm": 1.0854195356369019, + "learning_rate": 1.4357743097238895e-05, + "loss": 0.4654, + "step": 17829 + }, + { + "epoch": 22.822400000000002, + "grad_norm": 1.1058958768844604, + "learning_rate": 1.4355742296918767e-05, + "loss": 0.4712, + "step": 17830 + }, + { + "epoch": 22.82368, + "grad_norm": 1.1246731281280518, + "learning_rate": 1.435374149659864e-05, + "loss": 0.4796, + "step": 17831 + }, + { + "epoch": 22.82496, + "grad_norm": 1.123199224472046, + "learning_rate": 
1.4351740696278512e-05, + "loss": 0.521, + "step": 17832 + }, + { + "epoch": 22.82624, + "grad_norm": 1.1568586826324463, + "learning_rate": 1.4349739895958383e-05, + "loss": 0.5103, + "step": 17833 + }, + { + "epoch": 22.82752, + "grad_norm": 1.0084097385406494, + "learning_rate": 1.4347739095638258e-05, + "loss": 0.4231, + "step": 17834 + }, + { + "epoch": 22.8288, + "grad_norm": 1.0467265844345093, + "learning_rate": 1.4345738295318128e-05, + "loss": 0.4596, + "step": 17835 + }, + { + "epoch": 22.83008, + "grad_norm": 1.1111983060836792, + "learning_rate": 1.4343737494998e-05, + "loss": 0.4774, + "step": 17836 + }, + { + "epoch": 22.83136, + "grad_norm": 1.1236525774002075, + "learning_rate": 1.434173669467787e-05, + "loss": 0.4637, + "step": 17837 + }, + { + "epoch": 22.83264, + "grad_norm": 1.144333839416504, + "learning_rate": 1.4339735894357745e-05, + "loss": 0.5001, + "step": 17838 + }, + { + "epoch": 22.83392, + "grad_norm": 1.1298198699951172, + "learning_rate": 1.4337735094037615e-05, + "loss": 0.4897, + "step": 17839 + }, + { + "epoch": 22.8352, + "grad_norm": 1.0742158889770508, + "learning_rate": 1.4335734293717487e-05, + "loss": 0.4371, + "step": 17840 + }, + { + "epoch": 22.83648, + "grad_norm": 1.1449413299560547, + "learning_rate": 1.4333733493397357e-05, + "loss": 0.4818, + "step": 17841 + }, + { + "epoch": 22.83776, + "grad_norm": 1.1488326787948608, + "learning_rate": 1.4331732693077233e-05, + "loss": 0.4926, + "step": 17842 + }, + { + "epoch": 22.83904, + "grad_norm": 1.1773475408554077, + "learning_rate": 1.4329731892757103e-05, + "loss": 0.5046, + "step": 17843 + }, + { + "epoch": 22.84032, + "grad_norm": 1.1405889987945557, + "learning_rate": 1.4327731092436975e-05, + "loss": 0.4876, + "step": 17844 + }, + { + "epoch": 22.8416, + "grad_norm": 1.0573958158493042, + "learning_rate": 1.4325730292116848e-05, + "loss": 0.4591, + "step": 17845 + }, + { + "epoch": 22.84288, + "grad_norm": 1.1423496007919312, + "learning_rate": 
1.432372949179672e-05, + "loss": 0.5063, + "step": 17846 + }, + { + "epoch": 22.84416, + "grad_norm": 1.1753638982772827, + "learning_rate": 1.432172869147659e-05, + "loss": 0.5118, + "step": 17847 + }, + { + "epoch": 22.84544, + "grad_norm": 1.114268183708191, + "learning_rate": 1.4319727891156462e-05, + "loss": 0.4528, + "step": 17848 + }, + { + "epoch": 22.84672, + "grad_norm": 1.1355352401733398, + "learning_rate": 1.4317727090836336e-05, + "loss": 0.4828, + "step": 17849 + }, + { + "epoch": 22.848, + "grad_norm": 1.199616551399231, + "learning_rate": 1.4315726290516208e-05, + "loss": 0.4811, + "step": 17850 + }, + { + "epoch": 22.84928, + "grad_norm": 1.173221468925476, + "learning_rate": 1.4313725490196078e-05, + "loss": 0.5268, + "step": 17851 + }, + { + "epoch": 22.85056, + "grad_norm": 1.1589332818984985, + "learning_rate": 1.4311724689875953e-05, + "loss": 0.4963, + "step": 17852 + }, + { + "epoch": 22.85184, + "grad_norm": 1.137489676475525, + "learning_rate": 1.4309723889555823e-05, + "loss": 0.4951, + "step": 17853 + }, + { + "epoch": 22.85312, + "grad_norm": 1.1129984855651855, + "learning_rate": 1.4307723089235695e-05, + "loss": 0.4814, + "step": 17854 + }, + { + "epoch": 22.8544, + "grad_norm": 1.0853337049484253, + "learning_rate": 1.4305722288915565e-05, + "loss": 0.4432, + "step": 17855 + }, + { + "epoch": 22.85568, + "grad_norm": 1.1107585430145264, + "learning_rate": 1.430372148859544e-05, + "loss": 0.5175, + "step": 17856 + }, + { + "epoch": 22.85696, + "grad_norm": 1.143998146057129, + "learning_rate": 1.430172068827531e-05, + "loss": 0.4882, + "step": 17857 + }, + { + "epoch": 22.85824, + "grad_norm": 1.164641261100769, + "learning_rate": 1.4299719887955182e-05, + "loss": 0.53, + "step": 17858 + }, + { + "epoch": 22.85952, + "grad_norm": 1.1436692476272583, + "learning_rate": 1.4297719087635056e-05, + "loss": 0.4866, + "step": 17859 + }, + { + "epoch": 22.8608, + "grad_norm": 1.1730378866195679, + "learning_rate": 1.4295718287314928e-05, + 
"loss": 0.5107, + "step": 17860 + }, + { + "epoch": 22.86208, + "grad_norm": 1.122793436050415, + "learning_rate": 1.4293717486994798e-05, + "loss": 0.5111, + "step": 17861 + }, + { + "epoch": 22.86336, + "grad_norm": 1.0745701789855957, + "learning_rate": 1.429171668667467e-05, + "loss": 0.4485, + "step": 17862 + }, + { + "epoch": 22.86464, + "grad_norm": 1.1484006643295288, + "learning_rate": 1.4289715886354543e-05, + "loss": 0.5085, + "step": 17863 + }, + { + "epoch": 22.86592, + "grad_norm": 1.1889216899871826, + "learning_rate": 1.4287715086034415e-05, + "loss": 0.5249, + "step": 17864 + }, + { + "epoch": 22.8672, + "grad_norm": 1.1078394651412964, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.4893, + "step": 17865 + }, + { + "epoch": 22.86848, + "grad_norm": 1.0795047283172607, + "learning_rate": 1.4283713485394159e-05, + "loss": 0.4388, + "step": 17866 + }, + { + "epoch": 22.86976, + "grad_norm": 1.0773197412490845, + "learning_rate": 1.428171268507403e-05, + "loss": 0.4857, + "step": 17867 + }, + { + "epoch": 22.87104, + "grad_norm": 1.1797206401824951, + "learning_rate": 1.4279711884753903e-05, + "loss": 0.5151, + "step": 17868 + }, + { + "epoch": 22.87232, + "grad_norm": 1.1339863538742065, + "learning_rate": 1.4277711084433773e-05, + "loss": 0.457, + "step": 17869 + }, + { + "epoch": 22.8736, + "grad_norm": 1.1581915616989136, + "learning_rate": 1.4275710284113646e-05, + "loss": 0.5057, + "step": 17870 + }, + { + "epoch": 22.87488, + "grad_norm": 1.1262785196304321, + "learning_rate": 1.4273709483793518e-05, + "loss": 0.5133, + "step": 17871 + }, + { + "epoch": 22.87616, + "grad_norm": 1.1205724477767944, + "learning_rate": 1.427170868347339e-05, + "loss": 0.5048, + "step": 17872 + }, + { + "epoch": 22.87744, + "grad_norm": 1.0860368013381958, + "learning_rate": 1.4269707883153264e-05, + "loss": 0.4702, + "step": 17873 + }, + { + "epoch": 22.87872, + "grad_norm": 1.1230723857879639, + "learning_rate": 1.4267707082833134e-05, + "loss": 0.4754, + 
"step": 17874 + }, + { + "epoch": 22.88, + "grad_norm": 1.1550164222717285, + "learning_rate": 1.4265706282513006e-05, + "loss": 0.5318, + "step": 17875 + }, + { + "epoch": 22.88128, + "grad_norm": 1.0607014894485474, + "learning_rate": 1.4263705482192878e-05, + "loss": 0.4266, + "step": 17876 + }, + { + "epoch": 22.88256, + "grad_norm": 1.1427565813064575, + "learning_rate": 1.4261704681872751e-05, + "loss": 0.5016, + "step": 17877 + }, + { + "epoch": 22.88384, + "grad_norm": 1.082594394683838, + "learning_rate": 1.4259703881552621e-05, + "loss": 0.4745, + "step": 17878 + }, + { + "epoch": 22.88512, + "grad_norm": 1.1618205308914185, + "learning_rate": 1.4257703081232493e-05, + "loss": 0.5358, + "step": 17879 + }, + { + "epoch": 22.8864, + "grad_norm": 1.0929317474365234, + "learning_rate": 1.4255702280912367e-05, + "loss": 0.4662, + "step": 17880 + }, + { + "epoch": 22.88768, + "grad_norm": 1.1140695810317993, + "learning_rate": 1.4253701480592239e-05, + "loss": 0.4729, + "step": 17881 + }, + { + "epoch": 22.88896, + "grad_norm": 1.0910569429397583, + "learning_rate": 1.4251700680272109e-05, + "loss": 0.4473, + "step": 17882 + }, + { + "epoch": 22.89024, + "grad_norm": 1.193246841430664, + "learning_rate": 1.424969987995198e-05, + "loss": 0.541, + "step": 17883 + }, + { + "epoch": 22.89152, + "grad_norm": 1.0496735572814941, + "learning_rate": 1.4247699079631854e-05, + "loss": 0.494, + "step": 17884 + }, + { + "epoch": 22.8928, + "grad_norm": 1.1013808250427246, + "learning_rate": 1.4245698279311726e-05, + "loss": 0.4842, + "step": 17885 + }, + { + "epoch": 22.89408, + "grad_norm": 1.1200693845748901, + "learning_rate": 1.4243697478991596e-05, + "loss": 0.5158, + "step": 17886 + }, + { + "epoch": 22.89536, + "grad_norm": 1.1038991212844849, + "learning_rate": 1.4241696678671471e-05, + "loss": 0.4855, + "step": 17887 + }, + { + "epoch": 22.89664, + "grad_norm": 1.119197964668274, + "learning_rate": 1.4239695878351342e-05, + "loss": 0.5312, + "step": 17888 + }, + { 
+ "epoch": 22.89792, + "grad_norm": 1.0970196723937988, + "learning_rate": 1.4237695078031213e-05, + "loss": 0.481, + "step": 17889 + }, + { + "epoch": 22.8992, + "grad_norm": 1.1380757093429565, + "learning_rate": 1.4235694277711084e-05, + "loss": 0.5077, + "step": 17890 + }, + { + "epoch": 22.90048, + "grad_norm": 1.0854060649871826, + "learning_rate": 1.4233693477390959e-05, + "loss": 0.4607, + "step": 17891 + }, + { + "epoch": 22.90176, + "grad_norm": 1.0660706758499146, + "learning_rate": 1.4231692677070829e-05, + "loss": 0.4662, + "step": 17892 + }, + { + "epoch": 22.90304, + "grad_norm": 1.1607842445373535, + "learning_rate": 1.42296918767507e-05, + "loss": 0.5338, + "step": 17893 + }, + { + "epoch": 22.90432, + "grad_norm": 1.1080985069274902, + "learning_rate": 1.4227691076430571e-05, + "loss": 0.4802, + "step": 17894 + }, + { + "epoch": 22.9056, + "grad_norm": 1.1644606590270996, + "learning_rate": 1.4225690276110446e-05, + "loss": 0.5258, + "step": 17895 + }, + { + "epoch": 22.90688, + "grad_norm": 1.1852686405181885, + "learning_rate": 1.4223689475790316e-05, + "loss": 0.4759, + "step": 17896 + }, + { + "epoch": 22.90816, + "grad_norm": 1.1916351318359375, + "learning_rate": 1.4221688675470188e-05, + "loss": 0.4743, + "step": 17897 + }, + { + "epoch": 22.90944, + "grad_norm": 1.1184988021850586, + "learning_rate": 1.4219687875150062e-05, + "loss": 0.5025, + "step": 17898 + }, + { + "epoch": 22.91072, + "grad_norm": 1.147409439086914, + "learning_rate": 1.4217687074829934e-05, + "loss": 0.526, + "step": 17899 + }, + { + "epoch": 22.912, + "grad_norm": 1.1477159261703491, + "learning_rate": 1.4215686274509804e-05, + "loss": 0.4882, + "step": 17900 + }, + { + "epoch": 22.91328, + "grad_norm": 1.0916913747787476, + "learning_rate": 1.4213685474189676e-05, + "loss": 0.508, + "step": 17901 + }, + { + "epoch": 22.91456, + "grad_norm": 1.0843031406402588, + "learning_rate": 1.421168467386955e-05, + "loss": 0.5053, + "step": 17902 + }, + { + "epoch": 22.91584, + 
"grad_norm": 1.1080353260040283, + "learning_rate": 1.4209683873549421e-05, + "loss": 0.5002, + "step": 17903 + }, + { + "epoch": 22.91712, + "grad_norm": 1.1214649677276611, + "learning_rate": 1.4207683073229291e-05, + "loss": 0.5296, + "step": 17904 + }, + { + "epoch": 22.9184, + "grad_norm": 1.0759986639022827, + "learning_rate": 1.4205682272909165e-05, + "loss": 0.4737, + "step": 17905 + }, + { + "epoch": 22.91968, + "grad_norm": 1.1020501852035522, + "learning_rate": 1.4203681472589037e-05, + "loss": 0.4915, + "step": 17906 + }, + { + "epoch": 22.92096, + "grad_norm": 1.1096168756484985, + "learning_rate": 1.4201680672268908e-05, + "loss": 0.4827, + "step": 17907 + }, + { + "epoch": 22.92224, + "grad_norm": 1.0973726511001587, + "learning_rate": 1.4199679871948779e-05, + "loss": 0.4771, + "step": 17908 + }, + { + "epoch": 22.92352, + "grad_norm": 1.143187403678894, + "learning_rate": 1.4197679071628652e-05, + "loss": 0.5155, + "step": 17909 + }, + { + "epoch": 22.9248, + "grad_norm": 1.1421252489089966, + "learning_rate": 1.4195678271308524e-05, + "loss": 0.4643, + "step": 17910 + }, + { + "epoch": 22.92608, + "grad_norm": 1.1452699899673462, + "learning_rate": 1.4193677470988396e-05, + "loss": 0.5185, + "step": 17911 + }, + { + "epoch": 22.92736, + "grad_norm": 1.0854843854904175, + "learning_rate": 1.419167667066827e-05, + "loss": 0.4865, + "step": 17912 + }, + { + "epoch": 22.92864, + "grad_norm": 1.117672324180603, + "learning_rate": 1.418967587034814e-05, + "loss": 0.4689, + "step": 17913 + }, + { + "epoch": 22.92992, + "grad_norm": 1.135145902633667, + "learning_rate": 1.4187675070028011e-05, + "loss": 0.4971, + "step": 17914 + }, + { + "epoch": 22.9312, + "grad_norm": 1.0750057697296143, + "learning_rate": 1.4185674269707883e-05, + "loss": 0.4357, + "step": 17915 + }, + { + "epoch": 22.932479999999998, + "grad_norm": 1.1267346143722534, + "learning_rate": 1.4183673469387757e-05, + "loss": 0.4957, + "step": 17916 + }, + { + "epoch": 22.93376, + 
"grad_norm": 1.1975816488265991, + "learning_rate": 1.4181672669067627e-05, + "loss": 0.5446, + "step": 17917 + }, + { + "epoch": 22.93504, + "grad_norm": 1.12879478931427, + "learning_rate": 1.4179671868747499e-05, + "loss": 0.4855, + "step": 17918 + }, + { + "epoch": 22.93632, + "grad_norm": 1.1235387325286865, + "learning_rate": 1.4177671068427372e-05, + "loss": 0.5129, + "step": 17919 + }, + { + "epoch": 22.9376, + "grad_norm": 1.1320890188217163, + "learning_rate": 1.4175670268107244e-05, + "loss": 0.4815, + "step": 17920 + }, + { + "epoch": 22.93888, + "grad_norm": 1.1158738136291504, + "learning_rate": 1.4173669467787114e-05, + "loss": 0.4736, + "step": 17921 + }, + { + "epoch": 22.94016, + "grad_norm": 1.0395013093948364, + "learning_rate": 1.4171668667466986e-05, + "loss": 0.4211, + "step": 17922 + }, + { + "epoch": 22.94144, + "grad_norm": 1.0959734916687012, + "learning_rate": 1.416966786714686e-05, + "loss": 0.4874, + "step": 17923 + }, + { + "epoch": 22.94272, + "grad_norm": 1.0824655294418335, + "learning_rate": 1.4167667066826732e-05, + "loss": 0.494, + "step": 17924 + }, + { + "epoch": 22.944, + "grad_norm": 1.1469032764434814, + "learning_rate": 1.4165666266506602e-05, + "loss": 0.4917, + "step": 17925 + }, + { + "epoch": 22.94528, + "grad_norm": 1.2326279878616333, + "learning_rate": 1.4163665466186477e-05, + "loss": 0.5198, + "step": 17926 + }, + { + "epoch": 22.94656, + "grad_norm": 1.1711357831954956, + "learning_rate": 1.4161664665866347e-05, + "loss": 0.5073, + "step": 17927 + }, + { + "epoch": 22.94784, + "grad_norm": 1.1785284280776978, + "learning_rate": 1.415966386554622e-05, + "loss": 0.4986, + "step": 17928 + }, + { + "epoch": 22.94912, + "grad_norm": 1.1128637790679932, + "learning_rate": 1.415766306522609e-05, + "loss": 0.4626, + "step": 17929 + }, + { + "epoch": 22.9504, + "grad_norm": 1.1575325727462769, + "learning_rate": 1.4155662264905965e-05, + "loss": 0.5185, + "step": 17930 + }, + { + "epoch": 22.95168, + "grad_norm": 
1.11526620388031, + "learning_rate": 1.4153661464585835e-05, + "loss": 0.47, + "step": 17931 + }, + { + "epoch": 22.95296, + "grad_norm": 1.1603832244873047, + "learning_rate": 1.4151660664265707e-05, + "loss": 0.479, + "step": 17932 + }, + { + "epoch": 22.95424, + "grad_norm": 1.1286890506744385, + "learning_rate": 1.414965986394558e-05, + "loss": 0.4544, + "step": 17933 + }, + { + "epoch": 22.95552, + "grad_norm": 1.0917226076126099, + "learning_rate": 1.4147659063625452e-05, + "loss": 0.4819, + "step": 17934 + }, + { + "epoch": 22.9568, + "grad_norm": 1.1569414138793945, + "learning_rate": 1.4145658263305322e-05, + "loss": 0.5174, + "step": 17935 + }, + { + "epoch": 22.95808, + "grad_norm": 1.1322275400161743, + "learning_rate": 1.4143657462985194e-05, + "loss": 0.5149, + "step": 17936 + }, + { + "epoch": 22.95936, + "grad_norm": 1.1222246885299683, + "learning_rate": 1.4141656662665068e-05, + "loss": 0.5159, + "step": 17937 + }, + { + "epoch": 22.96064, + "grad_norm": 1.0865634679794312, + "learning_rate": 1.413965586234494e-05, + "loss": 0.4598, + "step": 17938 + }, + { + "epoch": 22.96192, + "grad_norm": 1.2040810585021973, + "learning_rate": 1.413765506202481e-05, + "loss": 0.5407, + "step": 17939 + }, + { + "epoch": 22.9632, + "grad_norm": 1.1161025762557983, + "learning_rate": 1.4135654261704683e-05, + "loss": 0.52, + "step": 17940 + }, + { + "epoch": 22.964480000000002, + "grad_norm": 1.1312873363494873, + "learning_rate": 1.4133653461384555e-05, + "loss": 0.5067, + "step": 17941 + }, + { + "epoch": 22.96576, + "grad_norm": 1.0572233200073242, + "learning_rate": 1.4131652661064427e-05, + "loss": 0.4639, + "step": 17942 + }, + { + "epoch": 22.96704, + "grad_norm": 1.0853049755096436, + "learning_rate": 1.4129651860744297e-05, + "loss": 0.4694, + "step": 17943 + }, + { + "epoch": 22.96832, + "grad_norm": 1.133817434310913, + "learning_rate": 1.412765106042417e-05, + "loss": 0.5031, + "step": 17944 + }, + { + "epoch": 22.9696, + "grad_norm": 
1.1215367317199707, + "learning_rate": 1.4125650260104042e-05, + "loss": 0.4852, + "step": 17945 + }, + { + "epoch": 22.97088, + "grad_norm": 1.1265122890472412, + "learning_rate": 1.4123649459783914e-05, + "loss": 0.4838, + "step": 17946 + }, + { + "epoch": 22.97216, + "grad_norm": 1.1352272033691406, + "learning_rate": 1.4121648659463788e-05, + "loss": 0.4701, + "step": 17947 + }, + { + "epoch": 22.97344, + "grad_norm": 1.0938243865966797, + "learning_rate": 1.4119647859143658e-05, + "loss": 0.4614, + "step": 17948 + }, + { + "epoch": 22.97472, + "grad_norm": 1.147600769996643, + "learning_rate": 1.411764705882353e-05, + "loss": 0.4799, + "step": 17949 + }, + { + "epoch": 22.976, + "grad_norm": 1.1563359498977661, + "learning_rate": 1.4115646258503402e-05, + "loss": 0.4585, + "step": 17950 + }, + { + "epoch": 22.97728, + "grad_norm": 1.1627904176712036, + "learning_rate": 1.4113645458183275e-05, + "loss": 0.56, + "step": 17951 + }, + { + "epoch": 22.97856, + "grad_norm": 1.1048862934112549, + "learning_rate": 1.4111644657863145e-05, + "loss": 0.5129, + "step": 17952 + }, + { + "epoch": 22.97984, + "grad_norm": 1.1043338775634766, + "learning_rate": 1.4109643857543017e-05, + "loss": 0.4792, + "step": 17953 + }, + { + "epoch": 22.98112, + "grad_norm": 1.151166319847107, + "learning_rate": 1.4107643057222889e-05, + "loss": 0.5038, + "step": 17954 + }, + { + "epoch": 22.9824, + "grad_norm": 1.1335054636001587, + "learning_rate": 1.4105642256902763e-05, + "loss": 0.5076, + "step": 17955 + }, + { + "epoch": 22.98368, + "grad_norm": 1.0865764617919922, + "learning_rate": 1.4103641456582633e-05, + "loss": 0.4719, + "step": 17956 + }, + { + "epoch": 22.98496, + "grad_norm": 1.1240063905715942, + "learning_rate": 1.4101640656262505e-05, + "loss": 0.5044, + "step": 17957 + }, + { + "epoch": 22.98624, + "grad_norm": 1.142352819442749, + "learning_rate": 1.4099639855942378e-05, + "loss": 0.5157, + "step": 17958 + }, + { + "epoch": 22.98752, + "grad_norm": 1.0918866395950317, 
+ "learning_rate": 1.409763905562225e-05, + "loss": 0.4706, + "step": 17959 + }, + { + "epoch": 22.9888, + "grad_norm": 1.0810490846633911, + "learning_rate": 1.409563825530212e-05, + "loss": 0.4711, + "step": 17960 + }, + { + "epoch": 22.99008, + "grad_norm": 1.1601754426956177, + "learning_rate": 1.4093637454981992e-05, + "loss": 0.5184, + "step": 17961 + }, + { + "epoch": 22.99136, + "grad_norm": 1.089206576347351, + "learning_rate": 1.4091636654661866e-05, + "loss": 0.5103, + "step": 17962 + }, + { + "epoch": 22.99264, + "grad_norm": 1.1088589429855347, + "learning_rate": 1.4089635854341738e-05, + "loss": 0.4609, + "step": 17963 + }, + { + "epoch": 22.99392, + "grad_norm": 1.1909695863723755, + "learning_rate": 1.4087635054021608e-05, + "loss": 0.5124, + "step": 17964 + }, + { + "epoch": 22.9952, + "grad_norm": 1.0958809852600098, + "learning_rate": 1.4085634253701483e-05, + "loss": 0.4415, + "step": 17965 + }, + { + "epoch": 22.99648, + "grad_norm": 1.120891809463501, + "learning_rate": 1.4083633453381353e-05, + "loss": 0.4856, + "step": 17966 + }, + { + "epoch": 22.99776, + "grad_norm": 1.111769437789917, + "learning_rate": 1.4081632653061225e-05, + "loss": 0.5141, + "step": 17967 + }, + { + "epoch": 22.99904, + "grad_norm": 1.1565723419189453, + "learning_rate": 1.4079631852741095e-05, + "loss": 0.5062, + "step": 17968 + }, + { + "epoch": 23.00032, + "grad_norm": 2.3065271377563477, + "learning_rate": 1.407763105242097e-05, + "loss": 0.7523, + "step": 17969 + }, + { + "epoch": 23.0016, + "grad_norm": 1.0669747591018677, + "learning_rate": 1.407563025210084e-05, + "loss": 0.4847, + "step": 17970 + }, + { + "epoch": 23.00288, + "grad_norm": 1.0902814865112305, + "learning_rate": 1.4073629451780712e-05, + "loss": 0.4821, + "step": 17971 + }, + { + "epoch": 23.00416, + "grad_norm": 1.1460057497024536, + "learning_rate": 1.4071628651460586e-05, + "loss": 0.4837, + "step": 17972 + }, + { + "epoch": 23.00544, + "grad_norm": 1.0994794368743896, + "learning_rate": 
1.4069627851140458e-05, + "loss": 0.4203, + "step": 17973 + }, + { + "epoch": 23.00672, + "grad_norm": 1.093055009841919, + "learning_rate": 1.4067627050820328e-05, + "loss": 0.4731, + "step": 17974 + }, + { + "epoch": 23.008, + "grad_norm": 1.139220952987671, + "learning_rate": 1.40656262505002e-05, + "loss": 0.4932, + "step": 17975 + }, + { + "epoch": 23.00928, + "grad_norm": 1.0311874151229858, + "learning_rate": 1.4063625450180073e-05, + "loss": 0.4708, + "step": 17976 + }, + { + "epoch": 23.01056, + "grad_norm": 1.0706690549850464, + "learning_rate": 1.4061624649859945e-05, + "loss": 0.4173, + "step": 17977 + }, + { + "epoch": 23.01184, + "grad_norm": 1.192206621170044, + "learning_rate": 1.4059623849539815e-05, + "loss": 0.4742, + "step": 17978 + }, + { + "epoch": 23.01312, + "grad_norm": 1.13650381565094, + "learning_rate": 1.4057623049219689e-05, + "loss": 0.4875, + "step": 17979 + }, + { + "epoch": 23.0144, + "grad_norm": 1.1168575286865234, + "learning_rate": 1.405562224889956e-05, + "loss": 0.4488, + "step": 17980 + }, + { + "epoch": 23.01568, + "grad_norm": 1.1627042293548584, + "learning_rate": 1.4053621448579433e-05, + "loss": 0.4947, + "step": 17981 + }, + { + "epoch": 23.01696, + "grad_norm": 1.2101789712905884, + "learning_rate": 1.4051620648259303e-05, + "loss": 0.5105, + "step": 17982 + }, + { + "epoch": 23.01824, + "grad_norm": 1.080032467842102, + "learning_rate": 1.4049619847939176e-05, + "loss": 0.4658, + "step": 17983 + }, + { + "epoch": 23.01952, + "grad_norm": 1.169611930847168, + "learning_rate": 1.4047619047619048e-05, + "loss": 0.4832, + "step": 17984 + }, + { + "epoch": 23.0208, + "grad_norm": 1.147308588027954, + "learning_rate": 1.404561824729892e-05, + "loss": 0.4709, + "step": 17985 + }, + { + "epoch": 23.02208, + "grad_norm": 1.1938717365264893, + "learning_rate": 1.4043617446978794e-05, + "loss": 0.5101, + "step": 17986 + }, + { + "epoch": 23.02336, + "grad_norm": 1.174412488937378, + "learning_rate": 1.4041616646658664e-05, + 
"loss": 0.4884, + "step": 17987 + }, + { + "epoch": 23.02464, + "grad_norm": 1.1288552284240723, + "learning_rate": 1.4039615846338536e-05, + "loss": 0.4617, + "step": 17988 + }, + { + "epoch": 23.02592, + "grad_norm": 1.100049614906311, + "learning_rate": 1.4037615046018408e-05, + "loss": 0.4318, + "step": 17989 + }, + { + "epoch": 23.0272, + "grad_norm": 1.1527884006500244, + "learning_rate": 1.4035614245698281e-05, + "loss": 0.4885, + "step": 17990 + }, + { + "epoch": 23.02848, + "grad_norm": 1.0793335437774658, + "learning_rate": 1.4033613445378151e-05, + "loss": 0.4429, + "step": 17991 + }, + { + "epoch": 23.02976, + "grad_norm": 1.1420992612838745, + "learning_rate": 1.4031612645058023e-05, + "loss": 0.4655, + "step": 17992 + }, + { + "epoch": 23.03104, + "grad_norm": 1.115891695022583, + "learning_rate": 1.4029611844737897e-05, + "loss": 0.5128, + "step": 17993 + }, + { + "epoch": 23.03232, + "grad_norm": 1.1061939001083374, + "learning_rate": 1.4027611044417769e-05, + "loss": 0.4666, + "step": 17994 + }, + { + "epoch": 23.0336, + "grad_norm": 1.1033188104629517, + "learning_rate": 1.4025610244097639e-05, + "loss": 0.4384, + "step": 17995 + }, + { + "epoch": 23.03488, + "grad_norm": 1.1487460136413574, + "learning_rate": 1.402360944377751e-05, + "loss": 0.4253, + "step": 17996 + }, + { + "epoch": 23.03616, + "grad_norm": 1.210303544998169, + "learning_rate": 1.4021608643457384e-05, + "loss": 0.4954, + "step": 17997 + }, + { + "epoch": 23.03744, + "grad_norm": 1.119410753250122, + "learning_rate": 1.4019607843137256e-05, + "loss": 0.4598, + "step": 17998 + }, + { + "epoch": 23.03872, + "grad_norm": 1.1100172996520996, + "learning_rate": 1.4017607042817126e-05, + "loss": 0.4887, + "step": 17999 + }, + { + "epoch": 23.04, + "grad_norm": 1.1196397542953491, + "learning_rate": 1.4015606242497001e-05, + "loss": 0.4729, + "step": 18000 + }, + { + "epoch": 23.04128, + "grad_norm": 1.1445229053497314, + "learning_rate": 1.4013605442176872e-05, + "loss": 0.4818, + 
"step": 18001 + }, + { + "epoch": 23.04256, + "grad_norm": 1.0524245500564575, + "learning_rate": 1.4011604641856743e-05, + "loss": 0.45, + "step": 18002 + }, + { + "epoch": 23.04384, + "grad_norm": 1.1181271076202393, + "learning_rate": 1.4009603841536614e-05, + "loss": 0.4498, + "step": 18003 + }, + { + "epoch": 23.04512, + "grad_norm": 1.0702544450759888, + "learning_rate": 1.4007603041216489e-05, + "loss": 0.4782, + "step": 18004 + }, + { + "epoch": 23.0464, + "grad_norm": 1.1783769130706787, + "learning_rate": 1.4005602240896359e-05, + "loss": 0.5008, + "step": 18005 + }, + { + "epoch": 23.04768, + "grad_norm": 1.1337677240371704, + "learning_rate": 1.400360144057623e-05, + "loss": 0.465, + "step": 18006 + }, + { + "epoch": 23.04896, + "grad_norm": 1.0862562656402588, + "learning_rate": 1.4001600640256101e-05, + "loss": 0.4675, + "step": 18007 + }, + { + "epoch": 23.05024, + "grad_norm": 1.122215747833252, + "learning_rate": 1.3999599839935976e-05, + "loss": 0.4764, + "step": 18008 + }, + { + "epoch": 23.05152, + "grad_norm": 1.1443709135055542, + "learning_rate": 1.3997599039615846e-05, + "loss": 0.496, + "step": 18009 + }, + { + "epoch": 23.0528, + "grad_norm": 1.1884753704071045, + "learning_rate": 1.3995598239295718e-05, + "loss": 0.5125, + "step": 18010 + }, + { + "epoch": 23.05408, + "grad_norm": 1.1133694648742676, + "learning_rate": 1.3993597438975592e-05, + "loss": 0.4876, + "step": 18011 + }, + { + "epoch": 23.05536, + "grad_norm": 1.0614551305770874, + "learning_rate": 1.3991596638655464e-05, + "loss": 0.4339, + "step": 18012 + }, + { + "epoch": 23.05664, + "grad_norm": 1.1484719514846802, + "learning_rate": 1.3989595838335334e-05, + "loss": 0.4655, + "step": 18013 + }, + { + "epoch": 23.05792, + "grad_norm": 1.1289807558059692, + "learning_rate": 1.3987595038015206e-05, + "loss": 0.4593, + "step": 18014 + }, + { + "epoch": 23.0592, + "grad_norm": 1.1590949296951294, + "learning_rate": 1.398559423769508e-05, + "loss": 0.4633, + "step": 18015 + }, + 
{ + "epoch": 23.06048, + "grad_norm": 1.1496042013168335, + "learning_rate": 1.3983593437374951e-05, + "loss": 0.4818, + "step": 18016 + }, + { + "epoch": 23.06176, + "grad_norm": 1.1318520307540894, + "learning_rate": 1.3981592637054821e-05, + "loss": 0.4942, + "step": 18017 + }, + { + "epoch": 23.06304, + "grad_norm": 1.1314492225646973, + "learning_rate": 1.3979591836734696e-05, + "loss": 0.4815, + "step": 18018 + }, + { + "epoch": 23.06432, + "grad_norm": 1.1428439617156982, + "learning_rate": 1.3977591036414567e-05, + "loss": 0.4794, + "step": 18019 + }, + { + "epoch": 23.0656, + "grad_norm": 1.1494139432907104, + "learning_rate": 1.3975590236094438e-05, + "loss": 0.4554, + "step": 18020 + }, + { + "epoch": 23.06688, + "grad_norm": 1.0700560808181763, + "learning_rate": 1.3973589435774309e-05, + "loss": 0.4345, + "step": 18021 + }, + { + "epoch": 23.06816, + "grad_norm": 1.1106326580047607, + "learning_rate": 1.3971588635454184e-05, + "loss": 0.4747, + "step": 18022 + }, + { + "epoch": 23.06944, + "grad_norm": 1.1512048244476318, + "learning_rate": 1.3969587835134054e-05, + "loss": 0.478, + "step": 18023 + }, + { + "epoch": 23.07072, + "grad_norm": 1.0818241834640503, + "learning_rate": 1.3967587034813926e-05, + "loss": 0.4671, + "step": 18024 + }, + { + "epoch": 23.072, + "grad_norm": 1.1178078651428223, + "learning_rate": 1.39655862344938e-05, + "loss": 0.4329, + "step": 18025 + }, + { + "epoch": 23.07328, + "grad_norm": 1.1609102487564087, + "learning_rate": 1.3963585434173671e-05, + "loss": 0.4793, + "step": 18026 + }, + { + "epoch": 23.07456, + "grad_norm": 1.1655328273773193, + "learning_rate": 1.3961584633853541e-05, + "loss": 0.4715, + "step": 18027 + }, + { + "epoch": 23.07584, + "grad_norm": 1.150871753692627, + "learning_rate": 1.3959583833533413e-05, + "loss": 0.4812, + "step": 18028 + }, + { + "epoch": 23.07712, + "grad_norm": 1.2036709785461426, + "learning_rate": 1.3957583033213287e-05, + "loss": 0.5125, + "step": 18029 + }, + { + "epoch": 
23.0784, + "grad_norm": 1.169793725013733, + "learning_rate": 1.3955582232893159e-05, + "loss": 0.4995, + "step": 18030 + }, + { + "epoch": 23.07968, + "grad_norm": 1.1694694757461548, + "learning_rate": 1.3953581432573029e-05, + "loss": 0.5015, + "step": 18031 + }, + { + "epoch": 23.08096, + "grad_norm": 1.1842191219329834, + "learning_rate": 1.3951580632252902e-05, + "loss": 0.5023, + "step": 18032 + }, + { + "epoch": 23.08224, + "grad_norm": 1.1319433450698853, + "learning_rate": 1.3949579831932774e-05, + "loss": 0.4759, + "step": 18033 + }, + { + "epoch": 23.08352, + "grad_norm": 1.0593634843826294, + "learning_rate": 1.3947579031612646e-05, + "loss": 0.4504, + "step": 18034 + }, + { + "epoch": 23.0848, + "grad_norm": 1.1120679378509521, + "learning_rate": 1.3945578231292516e-05, + "loss": 0.5023, + "step": 18035 + }, + { + "epoch": 23.08608, + "grad_norm": 1.1178866624832153, + "learning_rate": 1.394357743097239e-05, + "loss": 0.4568, + "step": 18036 + }, + { + "epoch": 23.08736, + "grad_norm": 1.2306301593780518, + "learning_rate": 1.3941576630652262e-05, + "loss": 0.5722, + "step": 18037 + }, + { + "epoch": 23.08864, + "grad_norm": 1.0901416540145874, + "learning_rate": 1.3939575830332134e-05, + "loss": 0.4703, + "step": 18038 + }, + { + "epoch": 23.08992, + "grad_norm": 1.0788695812225342, + "learning_rate": 1.3937575030012007e-05, + "loss": 0.4225, + "step": 18039 + }, + { + "epoch": 23.0912, + "grad_norm": 1.1180025339126587, + "learning_rate": 1.3935574229691877e-05, + "loss": 0.4538, + "step": 18040 + }, + { + "epoch": 23.09248, + "grad_norm": 1.1626075506210327, + "learning_rate": 1.393357342937175e-05, + "loss": 0.4787, + "step": 18041 + }, + { + "epoch": 23.09376, + "grad_norm": 1.1562151908874512, + "learning_rate": 1.3931572629051621e-05, + "loss": 0.4508, + "step": 18042 + }, + { + "epoch": 23.09504, + "grad_norm": 1.0841056108474731, + "learning_rate": 1.3929571828731495e-05, + "loss": 0.4401, + "step": 18043 + }, + { + "epoch": 23.09632, + 
"grad_norm": 1.1172791719436646, + "learning_rate": 1.3927571028411365e-05, + "loss": 0.4876, + "step": 18044 + }, + { + "epoch": 23.0976, + "grad_norm": 1.1132711172103882, + "learning_rate": 1.3925570228091237e-05, + "loss": 0.483, + "step": 18045 + }, + { + "epoch": 23.09888, + "grad_norm": 1.1011528968811035, + "learning_rate": 1.392356942777111e-05, + "loss": 0.4601, + "step": 18046 + }, + { + "epoch": 23.10016, + "grad_norm": 1.2015907764434814, + "learning_rate": 1.3921568627450982e-05, + "loss": 0.4793, + "step": 18047 + }, + { + "epoch": 23.10144, + "grad_norm": 1.1870005130767822, + "learning_rate": 1.3919567827130852e-05, + "loss": 0.4781, + "step": 18048 + }, + { + "epoch": 23.10272, + "grad_norm": 1.1651169061660767, + "learning_rate": 1.3917567026810724e-05, + "loss": 0.4782, + "step": 18049 + }, + { + "epoch": 23.104, + "grad_norm": 1.1237787008285522, + "learning_rate": 1.3915566226490598e-05, + "loss": 0.4722, + "step": 18050 + }, + { + "epoch": 23.10528, + "grad_norm": 1.1419413089752197, + "learning_rate": 1.391356542617047e-05, + "loss": 0.4772, + "step": 18051 + }, + { + "epoch": 23.10656, + "grad_norm": 1.1415618658065796, + "learning_rate": 1.391156462585034e-05, + "loss": 0.4407, + "step": 18052 + }, + { + "epoch": 23.10784, + "grad_norm": 1.1670557260513306, + "learning_rate": 1.3909563825530215e-05, + "loss": 0.5378, + "step": 18053 + }, + { + "epoch": 23.10912, + "grad_norm": 1.1420172452926636, + "learning_rate": 1.3907563025210085e-05, + "loss": 0.4717, + "step": 18054 + }, + { + "epoch": 23.1104, + "grad_norm": 1.1076371669769287, + "learning_rate": 1.3905562224889957e-05, + "loss": 0.4571, + "step": 18055 + }, + { + "epoch": 23.11168, + "grad_norm": 1.1110548973083496, + "learning_rate": 1.3903561424569827e-05, + "loss": 0.4484, + "step": 18056 + }, + { + "epoch": 23.11296, + "grad_norm": 1.1428343057632446, + "learning_rate": 1.3901560624249702e-05, + "loss": 0.5226, + "step": 18057 + }, + { + "epoch": 23.11424, + "grad_norm": 
1.1347376108169556, + "learning_rate": 1.3899559823929572e-05, + "loss": 0.4633, + "step": 18058 + }, + { + "epoch": 23.11552, + "grad_norm": 1.1556316614151, + "learning_rate": 1.3897559023609444e-05, + "loss": 0.4751, + "step": 18059 + }, + { + "epoch": 23.1168, + "grad_norm": 1.1056498289108276, + "learning_rate": 1.3895558223289318e-05, + "loss": 0.4446, + "step": 18060 + }, + { + "epoch": 23.11808, + "grad_norm": 1.194538950920105, + "learning_rate": 1.389355742296919e-05, + "loss": 0.5121, + "step": 18061 + }, + { + "epoch": 23.11936, + "grad_norm": 1.0786786079406738, + "learning_rate": 1.389155662264906e-05, + "loss": 0.482, + "step": 18062 + }, + { + "epoch": 23.12064, + "grad_norm": 1.1537976264953613, + "learning_rate": 1.3889555822328932e-05, + "loss": 0.5048, + "step": 18063 + }, + { + "epoch": 23.12192, + "grad_norm": 1.1101106405258179, + "learning_rate": 1.3887555022008805e-05, + "loss": 0.5427, + "step": 18064 + }, + { + "epoch": 23.1232, + "grad_norm": 1.0096379518508911, + "learning_rate": 1.3885554221688677e-05, + "loss": 0.4264, + "step": 18065 + }, + { + "epoch": 23.12448, + "grad_norm": 1.1329681873321533, + "learning_rate": 1.3883553421368547e-05, + "loss": 0.5005, + "step": 18066 + }, + { + "epoch": 23.12576, + "grad_norm": 1.1219654083251953, + "learning_rate": 1.3881552621048419e-05, + "loss": 0.4677, + "step": 18067 + }, + { + "epoch": 23.12704, + "grad_norm": 1.1316763162612915, + "learning_rate": 1.3879551820728293e-05, + "loss": 0.4857, + "step": 18068 + }, + { + "epoch": 23.12832, + "grad_norm": 1.1240242719650269, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.481, + "step": 18069 + }, + { + "epoch": 23.1296, + "grad_norm": 1.0853866338729858, + "learning_rate": 1.3875550220088035e-05, + "loss": 0.4481, + "step": 18070 + }, + { + "epoch": 23.13088, + "grad_norm": 1.1638892889022827, + "learning_rate": 1.3873549419767908e-05, + "loss": 0.5032, + "step": 18071 + }, + { + "epoch": 23.13216, + "grad_norm": 1.1282097101211548, + 
"learning_rate": 1.387154861944778e-05, + "loss": 0.4717, + "step": 18072 + }, + { + "epoch": 23.13344, + "grad_norm": 1.1175307035446167, + "learning_rate": 1.3869547819127652e-05, + "loss": 0.4375, + "step": 18073 + }, + { + "epoch": 23.13472, + "grad_norm": 1.0587260723114014, + "learning_rate": 1.3867547018807522e-05, + "loss": 0.4765, + "step": 18074 + }, + { + "epoch": 23.136, + "grad_norm": 1.179371953010559, + "learning_rate": 1.3865546218487396e-05, + "loss": 0.5337, + "step": 18075 + }, + { + "epoch": 23.13728, + "grad_norm": 1.153063178062439, + "learning_rate": 1.3863545418167268e-05, + "loss": 0.4959, + "step": 18076 + }, + { + "epoch": 23.13856, + "grad_norm": 1.1271103620529175, + "learning_rate": 1.386154461784714e-05, + "loss": 0.478, + "step": 18077 + }, + { + "epoch": 23.13984, + "grad_norm": 1.0915660858154297, + "learning_rate": 1.3859543817527013e-05, + "loss": 0.4598, + "step": 18078 + }, + { + "epoch": 23.14112, + "grad_norm": 1.1732710599899292, + "learning_rate": 1.3857543017206883e-05, + "loss": 0.4857, + "step": 18079 + }, + { + "epoch": 23.1424, + "grad_norm": 1.1222920417785645, + "learning_rate": 1.3855542216886755e-05, + "loss": 0.4344, + "step": 18080 + }, + { + "epoch": 23.14368, + "grad_norm": 1.1178438663482666, + "learning_rate": 1.3853541416566627e-05, + "loss": 0.4794, + "step": 18081 + }, + { + "epoch": 23.14496, + "grad_norm": 1.1883807182312012, + "learning_rate": 1.38515406162465e-05, + "loss": 0.4837, + "step": 18082 + }, + { + "epoch": 23.14624, + "grad_norm": 1.1413466930389404, + "learning_rate": 1.384953981592637e-05, + "loss": 0.4826, + "step": 18083 + }, + { + "epoch": 23.14752, + "grad_norm": 1.0779472589492798, + "learning_rate": 1.3847539015606242e-05, + "loss": 0.4544, + "step": 18084 + }, + { + "epoch": 23.1488, + "grad_norm": 1.0848368406295776, + "learning_rate": 1.3845538215286116e-05, + "loss": 0.4832, + "step": 18085 + }, + { + "epoch": 23.15008, + "grad_norm": 1.117047667503357, + "learning_rate": 
1.3843537414965988e-05, + "loss": 0.4593, + "step": 18086 + }, + { + "epoch": 23.15136, + "grad_norm": 1.113674283027649, + "learning_rate": 1.3841536614645858e-05, + "loss": 0.4669, + "step": 18087 + }, + { + "epoch": 23.15264, + "grad_norm": 1.186633825302124, + "learning_rate": 1.383953581432573e-05, + "loss": 0.4913, + "step": 18088 + }, + { + "epoch": 23.15392, + "grad_norm": 1.1024609804153442, + "learning_rate": 1.3837535014005603e-05, + "loss": 0.4779, + "step": 18089 + }, + { + "epoch": 23.1552, + "grad_norm": 1.0634430646896362, + "learning_rate": 1.3835534213685475e-05, + "loss": 0.4301, + "step": 18090 + }, + { + "epoch": 23.15648, + "grad_norm": 1.10919988155365, + "learning_rate": 1.3833533413365345e-05, + "loss": 0.4733, + "step": 18091 + }, + { + "epoch": 23.15776, + "grad_norm": 1.1104084253311157, + "learning_rate": 1.383153261304522e-05, + "loss": 0.4612, + "step": 18092 + }, + { + "epoch": 23.15904, + "grad_norm": 1.1497199535369873, + "learning_rate": 1.382953181272509e-05, + "loss": 0.4389, + "step": 18093 + }, + { + "epoch": 23.16032, + "grad_norm": 1.177474856376648, + "learning_rate": 1.3827531012404963e-05, + "loss": 0.4916, + "step": 18094 + }, + { + "epoch": 23.1616, + "grad_norm": 1.1319663524627686, + "learning_rate": 1.3825530212084833e-05, + "loss": 0.4834, + "step": 18095 + }, + { + "epoch": 23.16288, + "grad_norm": 1.1392722129821777, + "learning_rate": 1.3823529411764708e-05, + "loss": 0.4928, + "step": 18096 + }, + { + "epoch": 23.16416, + "grad_norm": 1.1550222635269165, + "learning_rate": 1.3821528611444578e-05, + "loss": 0.4575, + "step": 18097 + }, + { + "epoch": 23.16544, + "grad_norm": 1.17167067527771, + "learning_rate": 1.381952781112445e-05, + "loss": 0.5154, + "step": 18098 + }, + { + "epoch": 23.16672, + "grad_norm": 1.167103886604309, + "learning_rate": 1.3817527010804324e-05, + "loss": 0.4935, + "step": 18099 + }, + { + "epoch": 23.168, + "grad_norm": 1.1817306280136108, + "learning_rate": 1.3815526210484195e-05, + 
"loss": 0.4877, + "step": 18100 + }, + { + "epoch": 23.16928, + "grad_norm": 1.1134272813796997, + "learning_rate": 1.3813525410164066e-05, + "loss": 0.4528, + "step": 18101 + }, + { + "epoch": 23.17056, + "grad_norm": 1.1140645742416382, + "learning_rate": 1.3811524609843938e-05, + "loss": 0.4576, + "step": 18102 + }, + { + "epoch": 23.17184, + "grad_norm": 1.140651822090149, + "learning_rate": 1.3809523809523811e-05, + "loss": 0.4729, + "step": 18103 + }, + { + "epoch": 23.17312, + "grad_norm": 1.1449460983276367, + "learning_rate": 1.3807523009203683e-05, + "loss": 0.4636, + "step": 18104 + }, + { + "epoch": 23.1744, + "grad_norm": 1.1074920892715454, + "learning_rate": 1.3805522208883553e-05, + "loss": 0.4774, + "step": 18105 + }, + { + "epoch": 23.17568, + "grad_norm": 1.1994163990020752, + "learning_rate": 1.3803521408563427e-05, + "loss": 0.4314, + "step": 18106 + }, + { + "epoch": 23.17696, + "grad_norm": 1.1801729202270508, + "learning_rate": 1.3801520608243298e-05, + "loss": 0.4872, + "step": 18107 + }, + { + "epoch": 23.17824, + "grad_norm": 1.1620689630508423, + "learning_rate": 1.379951980792317e-05, + "loss": 0.473, + "step": 18108 + }, + { + "epoch": 23.17952, + "grad_norm": 1.1328171491622925, + "learning_rate": 1.379751900760304e-05, + "loss": 0.4785, + "step": 18109 + }, + { + "epoch": 23.1808, + "grad_norm": 1.1725462675094604, + "learning_rate": 1.3795518207282914e-05, + "loss": 0.4587, + "step": 18110 + }, + { + "epoch": 23.18208, + "grad_norm": 1.1395995616912842, + "learning_rate": 1.3793517406962786e-05, + "loss": 0.4554, + "step": 18111 + }, + { + "epoch": 23.18336, + "grad_norm": 1.0973244905471802, + "learning_rate": 1.3791516606642658e-05, + "loss": 0.4621, + "step": 18112 + }, + { + "epoch": 23.18464, + "grad_norm": 1.1823533773422241, + "learning_rate": 1.3789515806322531e-05, + "loss": 0.5013, + "step": 18113 + }, + { + "epoch": 23.18592, + "grad_norm": 1.124189019203186, + "learning_rate": 1.3787515006002401e-05, + "loss": 0.4375, + 
"step": 18114 + }, + { + "epoch": 23.1872, + "grad_norm": 1.0450643301010132, + "learning_rate": 1.3785514205682273e-05, + "loss": 0.4448, + "step": 18115 + }, + { + "epoch": 23.18848, + "grad_norm": 1.1765981912612915, + "learning_rate": 1.3783513405362145e-05, + "loss": 0.5302, + "step": 18116 + }, + { + "epoch": 23.18976, + "grad_norm": 1.1082278490066528, + "learning_rate": 1.3781512605042019e-05, + "loss": 0.4354, + "step": 18117 + }, + { + "epoch": 23.19104, + "grad_norm": 1.123656153678894, + "learning_rate": 1.3779511804721889e-05, + "loss": 0.4896, + "step": 18118 + }, + { + "epoch": 23.19232, + "grad_norm": 1.1027441024780273, + "learning_rate": 1.377751100440176e-05, + "loss": 0.4553, + "step": 18119 + }, + { + "epoch": 23.1936, + "grad_norm": 1.1921277046203613, + "learning_rate": 1.3775510204081633e-05, + "loss": 0.5235, + "step": 18120 + }, + { + "epoch": 23.19488, + "grad_norm": 1.1635897159576416, + "learning_rate": 1.3773509403761506e-05, + "loss": 0.4774, + "step": 18121 + }, + { + "epoch": 23.19616, + "grad_norm": 1.1405202150344849, + "learning_rate": 1.3771508603441376e-05, + "loss": 0.4663, + "step": 18122 + }, + { + "epoch": 23.19744, + "grad_norm": 1.0812584161758423, + "learning_rate": 1.3769507803121248e-05, + "loss": 0.4246, + "step": 18123 + }, + { + "epoch": 23.19872, + "grad_norm": 1.1046404838562012, + "learning_rate": 1.3767507002801122e-05, + "loss": 0.444, + "step": 18124 + }, + { + "epoch": 23.2, + "grad_norm": 1.1270830631256104, + "learning_rate": 1.3765506202480994e-05, + "loss": 0.4681, + "step": 18125 + }, + { + "epoch": 23.20128, + "grad_norm": 1.092661738395691, + "learning_rate": 1.3763505402160864e-05, + "loss": 0.4537, + "step": 18126 + }, + { + "epoch": 23.20256, + "grad_norm": 1.1461021900177002, + "learning_rate": 1.3761504601840736e-05, + "loss": 0.4697, + "step": 18127 + }, + { + "epoch": 23.20384, + "grad_norm": 1.1606507301330566, + "learning_rate": 1.375950380152061e-05, + "loss": 0.5199, + "step": 18128 + }, + { 
+ "epoch": 23.20512, + "grad_norm": 1.1296125650405884, + "learning_rate": 1.3757503001200481e-05, + "loss": 0.4839, + "step": 18129 + }, + { + "epoch": 23.2064, + "grad_norm": 1.2692511081695557, + "learning_rate": 1.3755502200880351e-05, + "loss": 0.4903, + "step": 18130 + }, + { + "epoch": 23.20768, + "grad_norm": 1.1228126287460327, + "learning_rate": 1.3753501400560226e-05, + "loss": 0.4585, + "step": 18131 + }, + { + "epoch": 23.20896, + "grad_norm": 1.1252394914627075, + "learning_rate": 1.3751500600240097e-05, + "loss": 0.5162, + "step": 18132 + }, + { + "epoch": 23.21024, + "grad_norm": 1.128462791442871, + "learning_rate": 1.3749499799919968e-05, + "loss": 0.4823, + "step": 18133 + }, + { + "epoch": 23.21152, + "grad_norm": 1.157885193824768, + "learning_rate": 1.3747498999599839e-05, + "loss": 0.5073, + "step": 18134 + }, + { + "epoch": 23.2128, + "grad_norm": 1.0952425003051758, + "learning_rate": 1.3745498199279714e-05, + "loss": 0.5003, + "step": 18135 + }, + { + "epoch": 23.21408, + "grad_norm": 1.1897740364074707, + "learning_rate": 1.3743497398959584e-05, + "loss": 0.5091, + "step": 18136 + }, + { + "epoch": 23.21536, + "grad_norm": 1.1394944190979004, + "learning_rate": 1.3741496598639456e-05, + "loss": 0.4115, + "step": 18137 + }, + { + "epoch": 23.21664, + "grad_norm": 1.1047616004943848, + "learning_rate": 1.373949579831933e-05, + "loss": 0.4888, + "step": 18138 + }, + { + "epoch": 23.21792, + "grad_norm": 1.1248316764831543, + "learning_rate": 1.3737494997999201e-05, + "loss": 0.4505, + "step": 18139 + }, + { + "epoch": 23.2192, + "grad_norm": 1.1958407163619995, + "learning_rate": 1.3735494197679071e-05, + "loss": 0.5051, + "step": 18140 + }, + { + "epoch": 23.22048, + "grad_norm": 1.186632513999939, + "learning_rate": 1.3733493397358943e-05, + "loss": 0.4692, + "step": 18141 + }, + { + "epoch": 23.22176, + "grad_norm": 1.1667543649673462, + "learning_rate": 1.3731492597038817e-05, + "loss": 0.5314, + "step": 18142 + }, + { + "epoch": 
23.22304, + "grad_norm": 1.1657756567001343, + "learning_rate": 1.3729491796718689e-05, + "loss": 0.5432, + "step": 18143 + }, + { + "epoch": 23.22432, + "grad_norm": 1.1313673257827759, + "learning_rate": 1.3727490996398559e-05, + "loss": 0.4813, + "step": 18144 + }, + { + "epoch": 23.2256, + "grad_norm": 1.1224007606506348, + "learning_rate": 1.3725490196078432e-05, + "loss": 0.4471, + "step": 18145 + }, + { + "epoch": 23.22688, + "grad_norm": 1.171020746231079, + "learning_rate": 1.3723489395758304e-05, + "loss": 0.5398, + "step": 18146 + }, + { + "epoch": 23.22816, + "grad_norm": 1.0536117553710938, + "learning_rate": 1.3721488595438176e-05, + "loss": 0.4308, + "step": 18147 + }, + { + "epoch": 23.22944, + "grad_norm": 1.146742820739746, + "learning_rate": 1.3719487795118046e-05, + "loss": 0.4901, + "step": 18148 + }, + { + "epoch": 23.23072, + "grad_norm": 1.09756338596344, + "learning_rate": 1.371748699479792e-05, + "loss": 0.4469, + "step": 18149 + }, + { + "epoch": 23.232, + "grad_norm": 1.1287214756011963, + "learning_rate": 1.3715486194477792e-05, + "loss": 0.4648, + "step": 18150 + }, + { + "epoch": 23.23328, + "grad_norm": 1.138611078262329, + "learning_rate": 1.3713485394157664e-05, + "loss": 0.4304, + "step": 18151 + }, + { + "epoch": 23.23456, + "grad_norm": 1.092214822769165, + "learning_rate": 1.3711484593837537e-05, + "loss": 0.4395, + "step": 18152 + }, + { + "epoch": 23.23584, + "grad_norm": 1.1681923866271973, + "learning_rate": 1.3709483793517407e-05, + "loss": 0.5203, + "step": 18153 + }, + { + "epoch": 23.23712, + "grad_norm": 1.1423531770706177, + "learning_rate": 1.3707482993197279e-05, + "loss": 0.455, + "step": 18154 + }, + { + "epoch": 23.2384, + "grad_norm": 1.1326360702514648, + "learning_rate": 1.3705482192877151e-05, + "loss": 0.4795, + "step": 18155 + }, + { + "epoch": 23.23968, + "grad_norm": 1.1082360744476318, + "learning_rate": 1.3703481392557025e-05, + "loss": 0.4743, + "step": 18156 + }, + { + "epoch": 23.24096, + 
"grad_norm": 1.1555867195129395, + "learning_rate": 1.3701480592236895e-05, + "loss": 0.4985, + "step": 18157 + }, + { + "epoch": 23.24224, + "grad_norm": 1.1038802862167358, + "learning_rate": 1.3699479791916767e-05, + "loss": 0.4486, + "step": 18158 + }, + { + "epoch": 23.24352, + "grad_norm": 1.1623843908309937, + "learning_rate": 1.369747899159664e-05, + "loss": 0.4817, + "step": 18159 + }, + { + "epoch": 23.2448, + "grad_norm": 1.2444136142730713, + "learning_rate": 1.3695478191276512e-05, + "loss": 0.5373, + "step": 18160 + }, + { + "epoch": 23.24608, + "grad_norm": 1.107471227645874, + "learning_rate": 1.3693477390956382e-05, + "loss": 0.463, + "step": 18161 + }, + { + "epoch": 23.24736, + "grad_norm": 1.1224188804626465, + "learning_rate": 1.3691476590636254e-05, + "loss": 0.4517, + "step": 18162 + }, + { + "epoch": 23.24864, + "grad_norm": 1.119936466217041, + "learning_rate": 1.3689475790316128e-05, + "loss": 0.4567, + "step": 18163 + }, + { + "epoch": 23.24992, + "grad_norm": 1.1780643463134766, + "learning_rate": 1.3687474989996e-05, + "loss": 0.4465, + "step": 18164 + }, + { + "epoch": 23.2512, + "grad_norm": 1.0980896949768066, + "learning_rate": 1.368547418967587e-05, + "loss": 0.4619, + "step": 18165 + }, + { + "epoch": 23.25248, + "grad_norm": 1.1995065212249756, + "learning_rate": 1.3683473389355745e-05, + "loss": 0.5128, + "step": 18166 + }, + { + "epoch": 23.25376, + "grad_norm": 1.0773276090621948, + "learning_rate": 1.3681472589035615e-05, + "loss": 0.4548, + "step": 18167 + }, + { + "epoch": 23.25504, + "grad_norm": 1.1148123741149902, + "learning_rate": 1.3679471788715487e-05, + "loss": 0.4754, + "step": 18168 + }, + { + "epoch": 23.25632, + "grad_norm": 1.1616092920303345, + "learning_rate": 1.3677470988395357e-05, + "loss": 0.4963, + "step": 18169 + }, + { + "epoch": 23.2576, + "grad_norm": 1.1255162954330444, + "learning_rate": 1.3675470188075232e-05, + "loss": 0.4957, + "step": 18170 + }, + { + "epoch": 23.25888, + "grad_norm": 
1.1024315357208252, + "learning_rate": 1.3673469387755102e-05, + "loss": 0.4764, + "step": 18171 + }, + { + "epoch": 23.26016, + "grad_norm": 1.1717227697372437, + "learning_rate": 1.3671468587434974e-05, + "loss": 0.4885, + "step": 18172 + }, + { + "epoch": 23.26144, + "grad_norm": 1.1702935695648193, + "learning_rate": 1.3669467787114844e-05, + "loss": 0.4821, + "step": 18173 + }, + { + "epoch": 23.26272, + "grad_norm": 1.051859974861145, + "learning_rate": 1.366746698679472e-05, + "loss": 0.4485, + "step": 18174 + }, + { + "epoch": 23.264, + "grad_norm": 1.042208194732666, + "learning_rate": 1.366546618647459e-05, + "loss": 0.4337, + "step": 18175 + }, + { + "epoch": 23.26528, + "grad_norm": 1.1617704629898071, + "learning_rate": 1.3663465386154462e-05, + "loss": 0.5218, + "step": 18176 + }, + { + "epoch": 23.26656, + "grad_norm": 1.151158332824707, + "learning_rate": 1.3661464585834335e-05, + "loss": 0.4289, + "step": 18177 + }, + { + "epoch": 23.26784, + "grad_norm": 1.1818608045578003, + "learning_rate": 1.3659463785514207e-05, + "loss": 0.4554, + "step": 18178 + }, + { + "epoch": 23.26912, + "grad_norm": 1.1779991388320923, + "learning_rate": 1.3657462985194077e-05, + "loss": 0.5049, + "step": 18179 + }, + { + "epoch": 23.2704, + "grad_norm": 1.209188461303711, + "learning_rate": 1.3655462184873949e-05, + "loss": 0.539, + "step": 18180 + }, + { + "epoch": 23.27168, + "grad_norm": 1.1615291833877563, + "learning_rate": 1.3653461384553823e-05, + "loss": 0.5058, + "step": 18181 + }, + { + "epoch": 23.27296, + "grad_norm": 1.1432435512542725, + "learning_rate": 1.3651460584233695e-05, + "loss": 0.4513, + "step": 18182 + }, + { + "epoch": 23.27424, + "grad_norm": 1.2107926607131958, + "learning_rate": 1.3649459783913565e-05, + "loss": 0.5399, + "step": 18183 + }, + { + "epoch": 23.27552, + "grad_norm": 1.2299079895019531, + "learning_rate": 1.364745898359344e-05, + "loss": 0.5395, + "step": 18184 + }, + { + "epoch": 23.2768, + "grad_norm": 1.1999679803848267, + 
"learning_rate": 1.364545818327331e-05, + "loss": 0.4976, + "step": 18185 + }, + { + "epoch": 23.27808, + "grad_norm": 1.1381257772445679, + "learning_rate": 1.3643457382953182e-05, + "loss": 0.507, + "step": 18186 + }, + { + "epoch": 23.27936, + "grad_norm": 1.1756349802017212, + "learning_rate": 1.3641456582633052e-05, + "loss": 0.5124, + "step": 18187 + }, + { + "epoch": 23.28064, + "grad_norm": 1.1597191095352173, + "learning_rate": 1.3639455782312927e-05, + "loss": 0.4758, + "step": 18188 + }, + { + "epoch": 23.28192, + "grad_norm": 1.1272660493850708, + "learning_rate": 1.3637454981992798e-05, + "loss": 0.4734, + "step": 18189 + }, + { + "epoch": 23.2832, + "grad_norm": 1.1996667385101318, + "learning_rate": 1.363545418167267e-05, + "loss": 0.4675, + "step": 18190 + }, + { + "epoch": 23.28448, + "grad_norm": 1.1502827405929565, + "learning_rate": 1.3633453381352543e-05, + "loss": 0.5163, + "step": 18191 + }, + { + "epoch": 23.28576, + "grad_norm": 1.1988962888717651, + "learning_rate": 1.3631452581032415e-05, + "loss": 0.4729, + "step": 18192 + }, + { + "epoch": 23.28704, + "grad_norm": 1.0956463813781738, + "learning_rate": 1.3629451780712285e-05, + "loss": 0.4646, + "step": 18193 + }, + { + "epoch": 23.28832, + "grad_norm": 1.0955307483673096, + "learning_rate": 1.3627450980392157e-05, + "loss": 0.4796, + "step": 18194 + }, + { + "epoch": 23.2896, + "grad_norm": 1.2165777683258057, + "learning_rate": 1.362545018007203e-05, + "loss": 0.5016, + "step": 18195 + }, + { + "epoch": 23.29088, + "grad_norm": 1.2035049200057983, + "learning_rate": 1.3623449379751902e-05, + "loss": 0.4793, + "step": 18196 + }, + { + "epoch": 23.29216, + "grad_norm": 1.1386168003082275, + "learning_rate": 1.3621448579431772e-05, + "loss": 0.4938, + "step": 18197 + }, + { + "epoch": 23.29344, + "grad_norm": 1.1671661138534546, + "learning_rate": 1.3619447779111646e-05, + "loss": 0.523, + "step": 18198 + }, + { + "epoch": 23.29472, + "grad_norm": 1.096612811088562, + "learning_rate": 
1.3617446978791518e-05, + "loss": 0.4371, + "step": 18199 + }, + { + "epoch": 23.296, + "grad_norm": 1.1467458009719849, + "learning_rate": 1.361544617847139e-05, + "loss": 0.4933, + "step": 18200 + }, + { + "epoch": 23.29728, + "grad_norm": 1.131923794746399, + "learning_rate": 1.361344537815126e-05, + "loss": 0.4217, + "step": 18201 + }, + { + "epoch": 23.29856, + "grad_norm": 1.149901032447815, + "learning_rate": 1.3611444577831133e-05, + "loss": 0.4558, + "step": 18202 + }, + { + "epoch": 23.29984, + "grad_norm": 1.162813663482666, + "learning_rate": 1.3609443777511005e-05, + "loss": 0.4712, + "step": 18203 + }, + { + "epoch": 23.30112, + "grad_norm": 1.11738920211792, + "learning_rate": 1.3607442977190877e-05, + "loss": 0.4764, + "step": 18204 + }, + { + "epoch": 23.3024, + "grad_norm": 1.2097214460372925, + "learning_rate": 1.360544217687075e-05, + "loss": 0.4811, + "step": 18205 + }, + { + "epoch": 23.30368, + "grad_norm": 1.095917820930481, + "learning_rate": 1.360344137655062e-05, + "loss": 0.4281, + "step": 18206 + }, + { + "epoch": 23.30496, + "grad_norm": 1.1422666311264038, + "learning_rate": 1.3601440576230493e-05, + "loss": 0.4902, + "step": 18207 + }, + { + "epoch": 23.30624, + "grad_norm": 1.118310809135437, + "learning_rate": 1.3599439775910364e-05, + "loss": 0.4364, + "step": 18208 + }, + { + "epoch": 23.30752, + "grad_norm": 1.0865938663482666, + "learning_rate": 1.3597438975590238e-05, + "loss": 0.4709, + "step": 18209 + }, + { + "epoch": 23.3088, + "grad_norm": 1.0885086059570312, + "learning_rate": 1.3595438175270108e-05, + "loss": 0.489, + "step": 18210 + }, + { + "epoch": 23.31008, + "grad_norm": 1.1577430963516235, + "learning_rate": 1.359343737494998e-05, + "loss": 0.5023, + "step": 18211 + }, + { + "epoch": 23.31136, + "grad_norm": 1.162643551826477, + "learning_rate": 1.3591436574629854e-05, + "loss": 0.4846, + "step": 18212 + }, + { + "epoch": 23.31264, + "grad_norm": 1.1046801805496216, + "learning_rate": 1.3589435774309725e-05, + 
"loss": 0.4425, + "step": 18213 + }, + { + "epoch": 23.31392, + "grad_norm": 1.1700000762939453, + "learning_rate": 1.3587434973989596e-05, + "loss": 0.4799, + "step": 18214 + }, + { + "epoch": 23.3152, + "grad_norm": 1.0989649295806885, + "learning_rate": 1.3585434173669467e-05, + "loss": 0.4446, + "step": 18215 + }, + { + "epoch": 23.31648, + "grad_norm": 1.168190598487854, + "learning_rate": 1.3583433373349341e-05, + "loss": 0.5183, + "step": 18216 + }, + { + "epoch": 23.31776, + "grad_norm": 1.081773281097412, + "learning_rate": 1.3581432573029213e-05, + "loss": 0.4795, + "step": 18217 + }, + { + "epoch": 23.31904, + "grad_norm": 1.1346309185028076, + "learning_rate": 1.3579431772709083e-05, + "loss": 0.4431, + "step": 18218 + }, + { + "epoch": 23.32032, + "grad_norm": 1.150037169456482, + "learning_rate": 1.3577430972388958e-05, + "loss": 0.4687, + "step": 18219 + }, + { + "epoch": 23.3216, + "grad_norm": 1.1243027448654175, + "learning_rate": 1.3575430172068828e-05, + "loss": 0.4564, + "step": 18220 + }, + { + "epoch": 23.32288, + "grad_norm": 1.1913601160049438, + "learning_rate": 1.35734293717487e-05, + "loss": 0.5253, + "step": 18221 + }, + { + "epoch": 23.32416, + "grad_norm": 1.142009973526001, + "learning_rate": 1.357142857142857e-05, + "loss": 0.4782, + "step": 18222 + }, + { + "epoch": 23.32544, + "grad_norm": 1.1879467964172363, + "learning_rate": 1.3569427771108446e-05, + "loss": 0.4572, + "step": 18223 + }, + { + "epoch": 23.32672, + "grad_norm": 1.1096800565719604, + "learning_rate": 1.3567426970788316e-05, + "loss": 0.4476, + "step": 18224 + }, + { + "epoch": 23.328, + "grad_norm": 1.2172260284423828, + "learning_rate": 1.3565426170468188e-05, + "loss": 0.4931, + "step": 18225 + }, + { + "epoch": 23.32928, + "grad_norm": 1.1828625202178955, + "learning_rate": 1.3563425370148061e-05, + "loss": 0.4673, + "step": 18226 + }, + { + "epoch": 23.33056, + "grad_norm": 1.162265419960022, + "learning_rate": 1.3561424569827933e-05, + "loss": 0.4658, + 
"step": 18227 + }, + { + "epoch": 23.33184, + "grad_norm": 1.1894097328186035, + "learning_rate": 1.3559423769507803e-05, + "loss": 0.5361, + "step": 18228 + }, + { + "epoch": 23.33312, + "grad_norm": 1.0926059484481812, + "learning_rate": 1.3557422969187675e-05, + "loss": 0.4379, + "step": 18229 + }, + { + "epoch": 23.3344, + "grad_norm": 1.1234076023101807, + "learning_rate": 1.3555422168867549e-05, + "loss": 0.4882, + "step": 18230 + }, + { + "epoch": 23.33568, + "grad_norm": 1.1318891048431396, + "learning_rate": 1.355342136854742e-05, + "loss": 0.4784, + "step": 18231 + }, + { + "epoch": 23.33696, + "grad_norm": 1.2062749862670898, + "learning_rate": 1.355142056822729e-05, + "loss": 0.5137, + "step": 18232 + }, + { + "epoch": 23.33824, + "grad_norm": 1.1662986278533936, + "learning_rate": 1.3549419767907163e-05, + "loss": 0.4596, + "step": 18233 + }, + { + "epoch": 23.33952, + "grad_norm": 1.1790847778320312, + "learning_rate": 1.3547418967587036e-05, + "loss": 0.482, + "step": 18234 + }, + { + "epoch": 23.3408, + "grad_norm": 1.079903244972229, + "learning_rate": 1.3545418167266908e-05, + "loss": 0.4931, + "step": 18235 + }, + { + "epoch": 23.34208, + "grad_norm": 1.0966075658798218, + "learning_rate": 1.3543417366946778e-05, + "loss": 0.4332, + "step": 18236 + }, + { + "epoch": 23.34336, + "grad_norm": 1.104612946510315, + "learning_rate": 1.3541416566626652e-05, + "loss": 0.4597, + "step": 18237 + }, + { + "epoch": 23.34464, + "grad_norm": 1.1291465759277344, + "learning_rate": 1.3539415766306524e-05, + "loss": 0.4864, + "step": 18238 + }, + { + "epoch": 23.34592, + "grad_norm": 1.205235242843628, + "learning_rate": 1.3537414965986395e-05, + "loss": 0.5488, + "step": 18239 + }, + { + "epoch": 23.3472, + "grad_norm": 1.1906328201293945, + "learning_rate": 1.3535414165666266e-05, + "loss": 0.4874, + "step": 18240 + }, + { + "epoch": 23.34848, + "grad_norm": 1.1368838548660278, + "learning_rate": 1.3533413365346139e-05, + "loss": 0.4336, + "step": 18241 + }, + 
{ + "epoch": 23.34976, + "grad_norm": 1.2331318855285645, + "learning_rate": 1.3531412565026011e-05, + "loss": 0.5252, + "step": 18242 + }, + { + "epoch": 23.35104, + "grad_norm": 1.147335410118103, + "learning_rate": 1.3529411764705883e-05, + "loss": 0.4733, + "step": 18243 + }, + { + "epoch": 23.35232, + "grad_norm": 1.1000919342041016, + "learning_rate": 1.3527410964385756e-05, + "loss": 0.4743, + "step": 18244 + }, + { + "epoch": 23.3536, + "grad_norm": 1.0781689882278442, + "learning_rate": 1.3525410164065627e-05, + "loss": 0.4291, + "step": 18245 + }, + { + "epoch": 23.35488, + "grad_norm": 1.218489170074463, + "learning_rate": 1.3523409363745498e-05, + "loss": 0.4633, + "step": 18246 + }, + { + "epoch": 23.35616, + "grad_norm": 1.2166152000427246, + "learning_rate": 1.352140856342537e-05, + "loss": 0.5128, + "step": 18247 + }, + { + "epoch": 23.35744, + "grad_norm": 1.1179983615875244, + "learning_rate": 1.3519407763105244e-05, + "loss": 0.4939, + "step": 18248 + }, + { + "epoch": 23.35872, + "grad_norm": 1.1847084760665894, + "learning_rate": 1.3517406962785114e-05, + "loss": 0.5327, + "step": 18249 + }, + { + "epoch": 23.36, + "grad_norm": 1.1466864347457886, + "learning_rate": 1.3515406162464986e-05, + "loss": 0.4945, + "step": 18250 + }, + { + "epoch": 23.36128, + "grad_norm": 1.1324851512908936, + "learning_rate": 1.351340536214486e-05, + "loss": 0.4895, + "step": 18251 + }, + { + "epoch": 23.36256, + "grad_norm": 1.1512887477874756, + "learning_rate": 1.3511404561824731e-05, + "loss": 0.4707, + "step": 18252 + }, + { + "epoch": 23.36384, + "grad_norm": 1.1378182172775269, + "learning_rate": 1.3509403761504601e-05, + "loss": 0.5104, + "step": 18253 + }, + { + "epoch": 23.36512, + "grad_norm": 1.1355574131011963, + "learning_rate": 1.3507402961184473e-05, + "loss": 0.4774, + "step": 18254 + }, + { + "epoch": 23.3664, + "grad_norm": 1.095694899559021, + "learning_rate": 1.3505402160864347e-05, + "loss": 0.4631, + "step": 18255 + }, + { + "epoch": 
23.36768, + "grad_norm": 1.0908712148666382, + "learning_rate": 1.3503401360544219e-05, + "loss": 0.5036, + "step": 18256 + }, + { + "epoch": 23.36896, + "grad_norm": 1.1985548734664917, + "learning_rate": 1.3501400560224089e-05, + "loss": 0.5001, + "step": 18257 + }, + { + "epoch": 23.37024, + "grad_norm": 1.1489444971084595, + "learning_rate": 1.3499399759903964e-05, + "loss": 0.4526, + "step": 18258 + }, + { + "epoch": 23.37152, + "grad_norm": 1.183395266532898, + "learning_rate": 1.3497398959583834e-05, + "loss": 0.5232, + "step": 18259 + }, + { + "epoch": 23.3728, + "grad_norm": 1.1368277072906494, + "learning_rate": 1.3495398159263706e-05, + "loss": 0.4497, + "step": 18260 + }, + { + "epoch": 23.37408, + "grad_norm": 1.1291542053222656, + "learning_rate": 1.3493397358943576e-05, + "loss": 0.4875, + "step": 18261 + }, + { + "epoch": 23.37536, + "grad_norm": 1.132755160331726, + "learning_rate": 1.3491396558623452e-05, + "loss": 0.4763, + "step": 18262 + }, + { + "epoch": 23.37664, + "grad_norm": 1.107372522354126, + "learning_rate": 1.3489395758303322e-05, + "loss": 0.4821, + "step": 18263 + }, + { + "epoch": 23.37792, + "grad_norm": 1.1369836330413818, + "learning_rate": 1.3487394957983194e-05, + "loss": 0.4666, + "step": 18264 + }, + { + "epoch": 23.3792, + "grad_norm": 1.1723092794418335, + "learning_rate": 1.3485394157663067e-05, + "loss": 0.5537, + "step": 18265 + }, + { + "epoch": 23.38048, + "grad_norm": 1.1863315105438232, + "learning_rate": 1.3483393357342939e-05, + "loss": 0.5479, + "step": 18266 + }, + { + "epoch": 23.38176, + "grad_norm": 1.1658673286437988, + "learning_rate": 1.3481392557022809e-05, + "loss": 0.5388, + "step": 18267 + }, + { + "epoch": 23.38304, + "grad_norm": 1.1437945365905762, + "learning_rate": 1.3479391756702681e-05, + "loss": 0.4794, + "step": 18268 + }, + { + "epoch": 23.38432, + "grad_norm": 1.149343729019165, + "learning_rate": 1.3477390956382555e-05, + "loss": 0.4792, + "step": 18269 + }, + { + "epoch": 23.3856, + 
"grad_norm": 1.0931353569030762, + "learning_rate": 1.3475390156062426e-05, + "loss": 0.4807, + "step": 18270 + }, + { + "epoch": 23.38688, + "grad_norm": 1.122345209121704, + "learning_rate": 1.3473389355742297e-05, + "loss": 0.4718, + "step": 18271 + }, + { + "epoch": 23.38816, + "grad_norm": 1.0672874450683594, + "learning_rate": 1.347138855542217e-05, + "loss": 0.4547, + "step": 18272 + }, + { + "epoch": 23.38944, + "grad_norm": 1.1067068576812744, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.4553, + "step": 18273 + }, + { + "epoch": 23.39072, + "grad_norm": 1.193006992340088, + "learning_rate": 1.3467386954781914e-05, + "loss": 0.5064, + "step": 18274 + }, + { + "epoch": 23.392, + "grad_norm": 1.1643834114074707, + "learning_rate": 1.3465386154461784e-05, + "loss": 0.4489, + "step": 18275 + }, + { + "epoch": 23.39328, + "grad_norm": 1.0767834186553955, + "learning_rate": 1.3463385354141658e-05, + "loss": 0.4177, + "step": 18276 + }, + { + "epoch": 23.39456, + "grad_norm": 1.1161274909973145, + "learning_rate": 1.346138455382153e-05, + "loss": 0.4918, + "step": 18277 + }, + { + "epoch": 23.39584, + "grad_norm": 1.2007757425308228, + "learning_rate": 1.3459383753501401e-05, + "loss": 0.5226, + "step": 18278 + }, + { + "epoch": 23.39712, + "grad_norm": 1.1643115282058716, + "learning_rate": 1.3457382953181275e-05, + "loss": 0.4366, + "step": 18279 + }, + { + "epoch": 23.3984, + "grad_norm": 1.183032512664795, + "learning_rate": 1.3455382152861145e-05, + "loss": 0.4606, + "step": 18280 + }, + { + "epoch": 23.39968, + "grad_norm": 1.0798239707946777, + "learning_rate": 1.3453381352541017e-05, + "loss": 0.4546, + "step": 18281 + }, + { + "epoch": 23.40096, + "grad_norm": 1.117011547088623, + "learning_rate": 1.3451380552220889e-05, + "loss": 0.4951, + "step": 18282 + }, + { + "epoch": 23.40224, + "grad_norm": 1.2203407287597656, + "learning_rate": 1.3449379751900762e-05, + "loss": 0.4415, + "step": 18283 + }, + { + "epoch": 23.40352, + "grad_norm": 
1.1237221956253052, + "learning_rate": 1.3447378951580632e-05, + "loss": 0.4756, + "step": 18284 + }, + { + "epoch": 23.4048, + "grad_norm": 1.107589840888977, + "learning_rate": 1.3445378151260504e-05, + "loss": 0.4927, + "step": 18285 + }, + { + "epoch": 23.40608, + "grad_norm": 1.1575672626495361, + "learning_rate": 1.3443377350940376e-05, + "loss": 0.4813, + "step": 18286 + }, + { + "epoch": 23.40736, + "grad_norm": 1.074341893196106, + "learning_rate": 1.344137655062025e-05, + "loss": 0.4872, + "step": 18287 + }, + { + "epoch": 23.40864, + "grad_norm": 1.0716972351074219, + "learning_rate": 1.343937575030012e-05, + "loss": 0.4199, + "step": 18288 + }, + { + "epoch": 23.40992, + "grad_norm": 1.1021279096603394, + "learning_rate": 1.3437374949979992e-05, + "loss": 0.5047, + "step": 18289 + }, + { + "epoch": 23.4112, + "grad_norm": 1.1311216354370117, + "learning_rate": 1.3435374149659865e-05, + "loss": 0.5052, + "step": 18290 + }, + { + "epoch": 23.41248, + "grad_norm": 1.1251327991485596, + "learning_rate": 1.3433373349339737e-05, + "loss": 0.4459, + "step": 18291 + }, + { + "epoch": 23.41376, + "grad_norm": 1.1681146621704102, + "learning_rate": 1.3431372549019607e-05, + "loss": 0.4603, + "step": 18292 + }, + { + "epoch": 23.41504, + "grad_norm": 1.1599884033203125, + "learning_rate": 1.3429371748699479e-05, + "loss": 0.4869, + "step": 18293 + }, + { + "epoch": 23.41632, + "grad_norm": 1.170707106590271, + "learning_rate": 1.3427370948379353e-05, + "loss": 0.4833, + "step": 18294 + }, + { + "epoch": 23.4176, + "grad_norm": 1.1624923944473267, + "learning_rate": 1.3425370148059224e-05, + "loss": 0.472, + "step": 18295 + }, + { + "epoch": 23.41888, + "grad_norm": 1.1000187397003174, + "learning_rate": 1.3423369347739095e-05, + "loss": 0.4675, + "step": 18296 + }, + { + "epoch": 23.42016, + "grad_norm": 1.2124167680740356, + "learning_rate": 1.342136854741897e-05, + "loss": 0.5053, + "step": 18297 + }, + { + "epoch": 23.42144, + "grad_norm": 1.0960698127746582, + 
"learning_rate": 1.341936774709884e-05, + "loss": 0.4545, + "step": 18298 + }, + { + "epoch": 23.422719999999998, + "grad_norm": 1.1535124778747559, + "learning_rate": 1.3417366946778712e-05, + "loss": 0.4919, + "step": 18299 + }, + { + "epoch": 23.424, + "grad_norm": 1.0561548471450806, + "learning_rate": 1.3415366146458582e-05, + "loss": 0.4315, + "step": 18300 + }, + { + "epoch": 23.42528, + "grad_norm": 1.136558175086975, + "learning_rate": 1.3413365346138457e-05, + "loss": 0.4742, + "step": 18301 + }, + { + "epoch": 23.42656, + "grad_norm": 1.1573458909988403, + "learning_rate": 1.3411364545818327e-05, + "loss": 0.4753, + "step": 18302 + }, + { + "epoch": 23.42784, + "grad_norm": 1.1722750663757324, + "learning_rate": 1.34093637454982e-05, + "loss": 0.4666, + "step": 18303 + }, + { + "epoch": 23.42912, + "grad_norm": 1.1860383749008179, + "learning_rate": 1.3407362945178073e-05, + "loss": 0.4841, + "step": 18304 + }, + { + "epoch": 23.4304, + "grad_norm": 1.1146502494812012, + "learning_rate": 1.3405362144857945e-05, + "loss": 0.4435, + "step": 18305 + }, + { + "epoch": 23.43168, + "grad_norm": 1.1159223318099976, + "learning_rate": 1.3403361344537815e-05, + "loss": 0.4846, + "step": 18306 + }, + { + "epoch": 23.43296, + "grad_norm": 1.122043490409851, + "learning_rate": 1.3401360544217687e-05, + "loss": 0.4639, + "step": 18307 + }, + { + "epoch": 23.43424, + "grad_norm": 1.105041742324829, + "learning_rate": 1.339935974389756e-05, + "loss": 0.4886, + "step": 18308 + }, + { + "epoch": 23.43552, + "grad_norm": 1.1351252794265747, + "learning_rate": 1.3397358943577432e-05, + "loss": 0.4366, + "step": 18309 + }, + { + "epoch": 23.4368, + "grad_norm": 1.1929394006729126, + "learning_rate": 1.3395358143257302e-05, + "loss": 0.4917, + "step": 18310 + }, + { + "epoch": 23.43808, + "grad_norm": 1.1288291215896606, + "learning_rate": 1.3393357342937176e-05, + "loss": 0.4498, + "step": 18311 + }, + { + "epoch": 23.43936, + "grad_norm": 1.1538920402526855, + 
"learning_rate": 1.3391356542617048e-05, + "loss": 0.4655, + "step": 18312 + }, + { + "epoch": 23.44064, + "grad_norm": 1.0849840641021729, + "learning_rate": 1.338935574229692e-05, + "loss": 0.472, + "step": 18313 + }, + { + "epoch": 23.44192, + "grad_norm": 1.0807019472122192, + "learning_rate": 1.338735494197679e-05, + "loss": 0.4488, + "step": 18314 + }, + { + "epoch": 23.4432, + "grad_norm": 1.1522600650787354, + "learning_rate": 1.3385354141656663e-05, + "loss": 0.5164, + "step": 18315 + }, + { + "epoch": 23.44448, + "grad_norm": 1.2187221050262451, + "learning_rate": 1.3383353341336535e-05, + "loss": 0.5471, + "step": 18316 + }, + { + "epoch": 23.44576, + "grad_norm": 1.1273307800292969, + "learning_rate": 1.3381352541016407e-05, + "loss": 0.4396, + "step": 18317 + }, + { + "epoch": 23.44704, + "grad_norm": 1.0828683376312256, + "learning_rate": 1.337935174069628e-05, + "loss": 0.4686, + "step": 18318 + }, + { + "epoch": 23.44832, + "grad_norm": 1.0796676874160767, + "learning_rate": 1.337735094037615e-05, + "loss": 0.4329, + "step": 18319 + }, + { + "epoch": 23.4496, + "grad_norm": 1.2199808359146118, + "learning_rate": 1.3375350140056023e-05, + "loss": 0.5222, + "step": 18320 + }, + { + "epoch": 23.45088, + "grad_norm": 1.096962571144104, + "learning_rate": 1.3373349339735894e-05, + "loss": 0.4696, + "step": 18321 + }, + { + "epoch": 23.45216, + "grad_norm": 1.1377143859863281, + "learning_rate": 1.3371348539415768e-05, + "loss": 0.5229, + "step": 18322 + }, + { + "epoch": 23.45344, + "grad_norm": 1.1810131072998047, + "learning_rate": 1.3369347739095638e-05, + "loss": 0.476, + "step": 18323 + }, + { + "epoch": 23.454720000000002, + "grad_norm": 1.1354953050613403, + "learning_rate": 1.336734693877551e-05, + "loss": 0.4791, + "step": 18324 + }, + { + "epoch": 23.456, + "grad_norm": 1.0756906270980835, + "learning_rate": 1.3365346138455384e-05, + "loss": 0.44, + "step": 18325 + }, + { + "epoch": 23.45728, + "grad_norm": 1.1659843921661377, + 
"learning_rate": 1.3363345338135255e-05, + "loss": 0.4829, + "step": 18326 + }, + { + "epoch": 23.45856, + "grad_norm": 1.0880708694458008, + "learning_rate": 1.3361344537815126e-05, + "loss": 0.4278, + "step": 18327 + }, + { + "epoch": 23.45984, + "grad_norm": 1.161241888999939, + "learning_rate": 1.3359343737494997e-05, + "loss": 0.4762, + "step": 18328 + }, + { + "epoch": 23.46112, + "grad_norm": 1.1863901615142822, + "learning_rate": 1.3357342937174871e-05, + "loss": 0.4841, + "step": 18329 + }, + { + "epoch": 23.4624, + "grad_norm": 1.1466909646987915, + "learning_rate": 1.3355342136854743e-05, + "loss": 0.5037, + "step": 18330 + }, + { + "epoch": 23.46368, + "grad_norm": 1.2050119638442993, + "learning_rate": 1.3353341336534613e-05, + "loss": 0.4718, + "step": 18331 + }, + { + "epoch": 23.46496, + "grad_norm": 1.173220157623291, + "learning_rate": 1.3351340536214488e-05, + "loss": 0.4654, + "step": 18332 + }, + { + "epoch": 23.46624, + "grad_norm": 1.1541483402252197, + "learning_rate": 1.3349339735894358e-05, + "loss": 0.455, + "step": 18333 + }, + { + "epoch": 23.46752, + "grad_norm": 1.1780107021331787, + "learning_rate": 1.334733893557423e-05, + "loss": 0.5024, + "step": 18334 + }, + { + "epoch": 23.4688, + "grad_norm": 1.1591427326202393, + "learning_rate": 1.33453381352541e-05, + "loss": 0.4447, + "step": 18335 + }, + { + "epoch": 23.47008, + "grad_norm": 1.058050274848938, + "learning_rate": 1.3343337334933976e-05, + "loss": 0.4141, + "step": 18336 + }, + { + "epoch": 23.47136, + "grad_norm": 1.1635574102401733, + "learning_rate": 1.3341336534613846e-05, + "loss": 0.4718, + "step": 18337 + }, + { + "epoch": 23.47264, + "grad_norm": 1.1959178447723389, + "learning_rate": 1.3339335734293718e-05, + "loss": 0.5097, + "step": 18338 + }, + { + "epoch": 23.47392, + "grad_norm": 1.1117192506790161, + "learning_rate": 1.3337334933973591e-05, + "loss": 0.4703, + "step": 18339 + }, + { + "epoch": 23.4752, + "grad_norm": 1.1207165718078613, + "learning_rate": 
1.3335334133653463e-05, + "loss": 0.4286, + "step": 18340 + }, + { + "epoch": 23.47648, + "grad_norm": 1.1271799802780151, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.4686, + "step": 18341 + }, + { + "epoch": 23.47776, + "grad_norm": 1.1951183080673218, + "learning_rate": 1.3331332533013205e-05, + "loss": 0.5247, + "step": 18342 + }, + { + "epoch": 23.47904, + "grad_norm": 1.1584644317626953, + "learning_rate": 1.3329331732693079e-05, + "loss": 0.4629, + "step": 18343 + }, + { + "epoch": 23.48032, + "grad_norm": 1.1701948642730713, + "learning_rate": 1.332733093237295e-05, + "loss": 0.4776, + "step": 18344 + }, + { + "epoch": 23.4816, + "grad_norm": 1.1780883073806763, + "learning_rate": 1.332533013205282e-05, + "loss": 0.4967, + "step": 18345 + }, + { + "epoch": 23.48288, + "grad_norm": 1.11570143699646, + "learning_rate": 1.3323329331732693e-05, + "loss": 0.486, + "step": 18346 + }, + { + "epoch": 23.48416, + "grad_norm": 1.060181975364685, + "learning_rate": 1.3321328531412566e-05, + "loss": 0.4462, + "step": 18347 + }, + { + "epoch": 23.48544, + "grad_norm": 1.0786004066467285, + "learning_rate": 1.3319327731092438e-05, + "loss": 0.4475, + "step": 18348 + }, + { + "epoch": 23.48672, + "grad_norm": 1.0986952781677246, + "learning_rate": 1.3317326930772308e-05, + "loss": 0.4567, + "step": 18349 + }, + { + "epoch": 23.488, + "grad_norm": 1.1361345052719116, + "learning_rate": 1.3315326130452183e-05, + "loss": 0.4504, + "step": 18350 + }, + { + "epoch": 23.48928, + "grad_norm": 1.145846962928772, + "learning_rate": 1.3313325330132054e-05, + "loss": 0.4997, + "step": 18351 + }, + { + "epoch": 23.49056, + "grad_norm": 1.1790693998336792, + "learning_rate": 1.3311324529811925e-05, + "loss": 0.504, + "step": 18352 + }, + { + "epoch": 23.49184, + "grad_norm": 1.1805227994918823, + "learning_rate": 1.3309323729491796e-05, + "loss": 0.4856, + "step": 18353 + }, + { + "epoch": 23.49312, + "grad_norm": 1.1480375528335571, + "learning_rate": 1.330732292917167e-05, 
+ "loss": 0.4626, + "step": 18354 + }, + { + "epoch": 23.4944, + "grad_norm": 1.1622354984283447, + "learning_rate": 1.3305322128851541e-05, + "loss": 0.5071, + "step": 18355 + }, + { + "epoch": 23.49568, + "grad_norm": 1.1230262517929077, + "learning_rate": 1.3303321328531413e-05, + "loss": 0.4853, + "step": 18356 + }, + { + "epoch": 23.49696, + "grad_norm": 1.181242823600769, + "learning_rate": 1.3301320528211286e-05, + "loss": 0.4681, + "step": 18357 + }, + { + "epoch": 23.49824, + "grad_norm": 1.1128627061843872, + "learning_rate": 1.3299319727891158e-05, + "loss": 0.5194, + "step": 18358 + }, + { + "epoch": 23.49952, + "grad_norm": 1.192732572555542, + "learning_rate": 1.3297318927571028e-05, + "loss": 0.5138, + "step": 18359 + }, + { + "epoch": 23.5008, + "grad_norm": 1.1375622749328613, + "learning_rate": 1.32953181272509e-05, + "loss": 0.5004, + "step": 18360 + }, + { + "epoch": 23.50208, + "grad_norm": 1.13736093044281, + "learning_rate": 1.3293317326930774e-05, + "loss": 0.5064, + "step": 18361 + }, + { + "epoch": 23.50336, + "grad_norm": 1.1759809255599976, + "learning_rate": 1.3291316526610646e-05, + "loss": 0.4812, + "step": 18362 + }, + { + "epoch": 23.50464, + "grad_norm": 1.115228295326233, + "learning_rate": 1.3289315726290516e-05, + "loss": 0.4502, + "step": 18363 + }, + { + "epoch": 23.50592, + "grad_norm": 1.1482048034667969, + "learning_rate": 1.328731492597039e-05, + "loss": 0.4791, + "step": 18364 + }, + { + "epoch": 23.5072, + "grad_norm": 1.1504039764404297, + "learning_rate": 1.3285314125650261e-05, + "loss": 0.4734, + "step": 18365 + }, + { + "epoch": 23.50848, + "grad_norm": 1.0897183418273926, + "learning_rate": 1.3283313325330133e-05, + "loss": 0.4615, + "step": 18366 + }, + { + "epoch": 23.50976, + "grad_norm": 1.1377613544464111, + "learning_rate": 1.3281312525010003e-05, + "loss": 0.482, + "step": 18367 + }, + { + "epoch": 23.51104, + "grad_norm": 1.1299495697021484, + "learning_rate": 1.3279311724689877e-05, + "loss": 0.4337, + 
"step": 18368 + }, + { + "epoch": 23.51232, + "grad_norm": 1.1434617042541504, + "learning_rate": 1.3277310924369749e-05, + "loss": 0.5078, + "step": 18369 + }, + { + "epoch": 23.5136, + "grad_norm": 1.1693207025527954, + "learning_rate": 1.327531012404962e-05, + "loss": 0.4764, + "step": 18370 + }, + { + "epoch": 23.51488, + "grad_norm": 1.1742783784866333, + "learning_rate": 1.3273309323729494e-05, + "loss": 0.4831, + "step": 18371 + }, + { + "epoch": 23.51616, + "grad_norm": 1.1278483867645264, + "learning_rate": 1.3271308523409364e-05, + "loss": 0.4247, + "step": 18372 + }, + { + "epoch": 23.51744, + "grad_norm": 1.1402909755706787, + "learning_rate": 1.3269307723089236e-05, + "loss": 0.4449, + "step": 18373 + }, + { + "epoch": 23.51872, + "grad_norm": 1.1170767545700073, + "learning_rate": 1.3267306922769108e-05, + "loss": 0.4955, + "step": 18374 + }, + { + "epoch": 23.52, + "grad_norm": 1.194703221321106, + "learning_rate": 1.3265306122448982e-05, + "loss": 0.5139, + "step": 18375 + }, + { + "epoch": 23.52128, + "grad_norm": 1.1340879201889038, + "learning_rate": 1.3263305322128852e-05, + "loss": 0.4611, + "step": 18376 + }, + { + "epoch": 23.52256, + "grad_norm": 1.1601988077163696, + "learning_rate": 1.3261304521808724e-05, + "loss": 0.4484, + "step": 18377 + }, + { + "epoch": 23.52384, + "grad_norm": 1.1433758735656738, + "learning_rate": 1.3259303721488597e-05, + "loss": 0.4465, + "step": 18378 + }, + { + "epoch": 23.52512, + "grad_norm": 1.2454715967178345, + "learning_rate": 1.3257302921168469e-05, + "loss": 0.528, + "step": 18379 + }, + { + "epoch": 23.5264, + "grad_norm": 1.1691670417785645, + "learning_rate": 1.3255302120848339e-05, + "loss": 0.4814, + "step": 18380 + }, + { + "epoch": 23.52768, + "grad_norm": 1.140670657157898, + "learning_rate": 1.3253301320528211e-05, + "loss": 0.4821, + "step": 18381 + }, + { + "epoch": 23.52896, + "grad_norm": 1.1309688091278076, + "learning_rate": 1.3251300520208085e-05, + "loss": 0.4691, + "step": 18382 + }, + 
{ + "epoch": 23.53024, + "grad_norm": 1.1939188241958618, + "learning_rate": 1.3249299719887956e-05, + "loss": 0.5132, + "step": 18383 + }, + { + "epoch": 23.53152, + "grad_norm": 1.214198112487793, + "learning_rate": 1.3247298919567827e-05, + "loss": 0.5292, + "step": 18384 + }, + { + "epoch": 23.5328, + "grad_norm": 1.1498759984970093, + "learning_rate": 1.3245298119247702e-05, + "loss": 0.4803, + "step": 18385 + }, + { + "epoch": 23.53408, + "grad_norm": 1.1071444749832153, + "learning_rate": 1.3243297318927572e-05, + "loss": 0.466, + "step": 18386 + }, + { + "epoch": 23.53536, + "grad_norm": 1.1929941177368164, + "learning_rate": 1.3241296518607444e-05, + "loss": 0.4975, + "step": 18387 + }, + { + "epoch": 23.53664, + "grad_norm": 1.1311265230178833, + "learning_rate": 1.3239295718287314e-05, + "loss": 0.4715, + "step": 18388 + }, + { + "epoch": 23.53792, + "grad_norm": 1.0508058071136475, + "learning_rate": 1.323729491796719e-05, + "loss": 0.4312, + "step": 18389 + }, + { + "epoch": 23.5392, + "grad_norm": 1.1475504636764526, + "learning_rate": 1.323529411764706e-05, + "loss": 0.5074, + "step": 18390 + }, + { + "epoch": 23.54048, + "grad_norm": 1.1296319961547852, + "learning_rate": 1.3233293317326931e-05, + "loss": 0.5169, + "step": 18391 + }, + { + "epoch": 23.54176, + "grad_norm": 1.1883400678634644, + "learning_rate": 1.3231292517006805e-05, + "loss": 0.4944, + "step": 18392 + }, + { + "epoch": 23.54304, + "grad_norm": 1.1157552003860474, + "learning_rate": 1.3229291716686677e-05, + "loss": 0.4619, + "step": 18393 + }, + { + "epoch": 23.54432, + "grad_norm": 1.1468461751937866, + "learning_rate": 1.3227290916366547e-05, + "loss": 0.5149, + "step": 18394 + }, + { + "epoch": 23.5456, + "grad_norm": 1.0889732837677002, + "learning_rate": 1.3225290116046419e-05, + "loss": 0.4268, + "step": 18395 + }, + { + "epoch": 23.54688, + "grad_norm": 1.142313838005066, + "learning_rate": 1.3223289315726292e-05, + "loss": 0.5009, + "step": 18396 + }, + { + "epoch": 
23.54816, + "grad_norm": 1.1910713911056519, + "learning_rate": 1.3221288515406164e-05, + "loss": 0.5089, + "step": 18397 + }, + { + "epoch": 23.54944, + "grad_norm": 1.1199548244476318, + "learning_rate": 1.3219287715086034e-05, + "loss": 0.4684, + "step": 18398 + }, + { + "epoch": 23.55072, + "grad_norm": 1.0918095111846924, + "learning_rate": 1.3217286914765906e-05, + "loss": 0.462, + "step": 18399 + }, + { + "epoch": 23.552, + "grad_norm": 1.1307764053344727, + "learning_rate": 1.321528611444578e-05, + "loss": 0.4839, + "step": 18400 + }, + { + "epoch": 23.55328, + "grad_norm": 1.1751610040664673, + "learning_rate": 1.3213285314125651e-05, + "loss": 0.4891, + "step": 18401 + }, + { + "epoch": 23.55456, + "grad_norm": 1.1532877683639526, + "learning_rate": 1.3211284513805522e-05, + "loss": 0.4607, + "step": 18402 + }, + { + "epoch": 23.55584, + "grad_norm": 1.1767176389694214, + "learning_rate": 1.3209283713485395e-05, + "loss": 0.5076, + "step": 18403 + }, + { + "epoch": 23.55712, + "grad_norm": 1.2673394680023193, + "learning_rate": 1.3207282913165267e-05, + "loss": 0.5038, + "step": 18404 + }, + { + "epoch": 23.5584, + "grad_norm": 1.1632843017578125, + "learning_rate": 1.3205282112845139e-05, + "loss": 0.5004, + "step": 18405 + }, + { + "epoch": 23.55968, + "grad_norm": 1.1043983697891235, + "learning_rate": 1.3203281312525009e-05, + "loss": 0.4669, + "step": 18406 + }, + { + "epoch": 23.56096, + "grad_norm": 1.1622287034988403, + "learning_rate": 1.3201280512204883e-05, + "loss": 0.4538, + "step": 18407 + }, + { + "epoch": 23.56224, + "grad_norm": 1.1268742084503174, + "learning_rate": 1.3199279711884754e-05, + "loss": 0.4563, + "step": 18408 + }, + { + "epoch": 23.56352, + "grad_norm": 1.1703455448150635, + "learning_rate": 1.3197278911564626e-05, + "loss": 0.5168, + "step": 18409 + }, + { + "epoch": 23.564799999999998, + "grad_norm": 1.1342973709106445, + "learning_rate": 1.31952781112445e-05, + "loss": 0.505, + "step": 18410 + }, + { + "epoch": 23.56608, 
+ "grad_norm": 1.1047439575195312, + "learning_rate": 1.319327731092437e-05, + "loss": 0.4839, + "step": 18411 + }, + { + "epoch": 23.56736, + "grad_norm": 1.1439480781555176, + "learning_rate": 1.3191276510604242e-05, + "loss": 0.462, + "step": 18412 + }, + { + "epoch": 23.56864, + "grad_norm": 1.1610522270202637, + "learning_rate": 1.3189275710284114e-05, + "loss": 0.498, + "step": 18413 + }, + { + "epoch": 23.56992, + "grad_norm": 1.1288456916809082, + "learning_rate": 1.3187274909963987e-05, + "loss": 0.4751, + "step": 18414 + }, + { + "epoch": 23.5712, + "grad_norm": 1.204026460647583, + "learning_rate": 1.3185274109643857e-05, + "loss": 0.517, + "step": 18415 + }, + { + "epoch": 23.57248, + "grad_norm": 1.1533409357070923, + "learning_rate": 1.318327330932373e-05, + "loss": 0.4879, + "step": 18416 + }, + { + "epoch": 23.57376, + "grad_norm": 1.1280791759490967, + "learning_rate": 1.3181272509003603e-05, + "loss": 0.4578, + "step": 18417 + }, + { + "epoch": 23.57504, + "grad_norm": 1.1419196128845215, + "learning_rate": 1.3179271708683475e-05, + "loss": 0.4827, + "step": 18418 + }, + { + "epoch": 23.57632, + "grad_norm": 1.1568984985351562, + "learning_rate": 1.3177270908363345e-05, + "loss": 0.4784, + "step": 18419 + }, + { + "epoch": 23.5776, + "grad_norm": 1.1242620944976807, + "learning_rate": 1.3175270108043217e-05, + "loss": 0.4755, + "step": 18420 + }, + { + "epoch": 23.57888, + "grad_norm": 1.1650310754776, + "learning_rate": 1.317326930772309e-05, + "loss": 0.5058, + "step": 18421 + }, + { + "epoch": 23.58016, + "grad_norm": 1.2178586721420288, + "learning_rate": 1.3171268507402962e-05, + "loss": 0.457, + "step": 18422 + }, + { + "epoch": 23.58144, + "grad_norm": 1.105704665184021, + "learning_rate": 1.3169267707082832e-05, + "loss": 0.4333, + "step": 18423 + }, + { + "epoch": 23.58272, + "grad_norm": 1.1392953395843506, + "learning_rate": 1.3167266906762708e-05, + "loss": 0.517, + "step": 18424 + }, + { + "epoch": 23.584, + "grad_norm": 
1.1252771615982056, + "learning_rate": 1.3165266106442578e-05, + "loss": 0.4966, + "step": 18425 + }, + { + "epoch": 23.58528, + "grad_norm": 1.2022629976272583, + "learning_rate": 1.316326530612245e-05, + "loss": 0.4968, + "step": 18426 + }, + { + "epoch": 23.58656, + "grad_norm": 1.1347490549087524, + "learning_rate": 1.316126450580232e-05, + "loss": 0.4637, + "step": 18427 + }, + { + "epoch": 23.58784, + "grad_norm": 1.175969123840332, + "learning_rate": 1.3159263705482195e-05, + "loss": 0.4954, + "step": 18428 + }, + { + "epoch": 23.58912, + "grad_norm": 1.1564298868179321, + "learning_rate": 1.3157262905162065e-05, + "loss": 0.5057, + "step": 18429 + }, + { + "epoch": 23.5904, + "grad_norm": 1.1712132692337036, + "learning_rate": 1.3155262104841937e-05, + "loss": 0.49, + "step": 18430 + }, + { + "epoch": 23.59168, + "grad_norm": 1.1701792478561401, + "learning_rate": 1.315326130452181e-05, + "loss": 0.485, + "step": 18431 + }, + { + "epoch": 23.59296, + "grad_norm": 1.1181670427322388, + "learning_rate": 1.3151260504201682e-05, + "loss": 0.447, + "step": 18432 + }, + { + "epoch": 23.59424, + "grad_norm": 1.1028802394866943, + "learning_rate": 1.3149259703881553e-05, + "loss": 0.4823, + "step": 18433 + }, + { + "epoch": 23.59552, + "grad_norm": 1.191518783569336, + "learning_rate": 1.3147258903561424e-05, + "loss": 0.4561, + "step": 18434 + }, + { + "epoch": 23.5968, + "grad_norm": 1.1290074586868286, + "learning_rate": 1.3145258103241298e-05, + "loss": 0.4843, + "step": 18435 + }, + { + "epoch": 23.59808, + "grad_norm": 1.1779063940048218, + "learning_rate": 1.314325730292117e-05, + "loss": 0.4708, + "step": 18436 + }, + { + "epoch": 23.59936, + "grad_norm": 1.1097599267959595, + "learning_rate": 1.314125650260104e-05, + "loss": 0.4734, + "step": 18437 + }, + { + "epoch": 23.60064, + "grad_norm": 1.1385622024536133, + "learning_rate": 1.3139255702280914e-05, + "loss": 0.4822, + "step": 18438 + }, + { + "epoch": 23.60192, + "grad_norm": 1.0946968793869019, + 
"learning_rate": 1.3137254901960785e-05, + "loss": 0.4713, + "step": 18439 + }, + { + "epoch": 23.6032, + "grad_norm": 1.1536378860473633, + "learning_rate": 1.3135254101640657e-05, + "loss": 0.5073, + "step": 18440 + }, + { + "epoch": 23.60448, + "grad_norm": 1.1673475503921509, + "learning_rate": 1.3133253301320527e-05, + "loss": 0.4681, + "step": 18441 + }, + { + "epoch": 23.60576, + "grad_norm": 1.163110375404358, + "learning_rate": 1.3131252501000401e-05, + "loss": 0.5031, + "step": 18442 + }, + { + "epoch": 23.60704, + "grad_norm": 1.131412386894226, + "learning_rate": 1.3129251700680273e-05, + "loss": 0.4597, + "step": 18443 + }, + { + "epoch": 23.60832, + "grad_norm": 1.1555569171905518, + "learning_rate": 1.3127250900360145e-05, + "loss": 0.5379, + "step": 18444 + }, + { + "epoch": 23.6096, + "grad_norm": 1.1479874849319458, + "learning_rate": 1.3125250100040018e-05, + "loss": 0.4431, + "step": 18445 + }, + { + "epoch": 23.61088, + "grad_norm": 1.140120029449463, + "learning_rate": 1.3123249299719888e-05, + "loss": 0.4916, + "step": 18446 + }, + { + "epoch": 23.61216, + "grad_norm": 1.1265473365783691, + "learning_rate": 1.312124849939976e-05, + "loss": 0.5152, + "step": 18447 + }, + { + "epoch": 23.61344, + "grad_norm": 1.043741226196289, + "learning_rate": 1.3119247699079632e-05, + "loss": 0.4195, + "step": 18448 + }, + { + "epoch": 23.61472, + "grad_norm": 1.1078736782073975, + "learning_rate": 1.3117246898759506e-05, + "loss": 0.4793, + "step": 18449 + }, + { + "epoch": 23.616, + "grad_norm": 1.1717230081558228, + "learning_rate": 1.3115246098439376e-05, + "loss": 0.4681, + "step": 18450 + }, + { + "epoch": 23.61728, + "grad_norm": 1.174607753753662, + "learning_rate": 1.3113245298119248e-05, + "loss": 0.4738, + "step": 18451 + }, + { + "epoch": 23.61856, + "grad_norm": 1.1562086343765259, + "learning_rate": 1.311124449779912e-05, + "loss": 0.4716, + "step": 18452 + }, + { + "epoch": 23.61984, + "grad_norm": 1.1882222890853882, + "learning_rate": 
1.3109243697478993e-05, + "loss": 0.5002, + "step": 18453 + }, + { + "epoch": 23.62112, + "grad_norm": 1.1770943403244019, + "learning_rate": 1.3107242897158863e-05, + "loss": 0.4787, + "step": 18454 + }, + { + "epoch": 23.6224, + "grad_norm": 1.16677987575531, + "learning_rate": 1.3105242096838735e-05, + "loss": 0.494, + "step": 18455 + }, + { + "epoch": 23.62368, + "grad_norm": 1.190652847290039, + "learning_rate": 1.3103241296518609e-05, + "loss": 0.4809, + "step": 18456 + }, + { + "epoch": 23.62496, + "grad_norm": 1.1706172227859497, + "learning_rate": 1.310124049619848e-05, + "loss": 0.4633, + "step": 18457 + }, + { + "epoch": 23.62624, + "grad_norm": 1.0963094234466553, + "learning_rate": 1.309923969587835e-05, + "loss": 0.4892, + "step": 18458 + }, + { + "epoch": 23.62752, + "grad_norm": 1.1064554452896118, + "learning_rate": 1.3097238895558223e-05, + "loss": 0.4748, + "step": 18459 + }, + { + "epoch": 23.6288, + "grad_norm": 1.1423083543777466, + "learning_rate": 1.3095238095238096e-05, + "loss": 0.4798, + "step": 18460 + }, + { + "epoch": 23.63008, + "grad_norm": 1.1238229274749756, + "learning_rate": 1.3093237294917968e-05, + "loss": 0.4674, + "step": 18461 + }, + { + "epoch": 23.63136, + "grad_norm": 1.1107577085494995, + "learning_rate": 1.3091236494597838e-05, + "loss": 0.438, + "step": 18462 + }, + { + "epoch": 23.63264, + "grad_norm": 1.1175166368484497, + "learning_rate": 1.3089235694277713e-05, + "loss": 0.4601, + "step": 18463 + }, + { + "epoch": 23.63392, + "grad_norm": 1.1670434474945068, + "learning_rate": 1.3087234893957584e-05, + "loss": 0.4494, + "step": 18464 + }, + { + "epoch": 23.6352, + "grad_norm": 1.2020723819732666, + "learning_rate": 1.3085234093637455e-05, + "loss": 0.4967, + "step": 18465 + }, + { + "epoch": 23.63648, + "grad_norm": 1.1246027946472168, + "learning_rate": 1.3083233293317326e-05, + "loss": 0.4723, + "step": 18466 + }, + { + "epoch": 23.63776, + "grad_norm": 1.1464457511901855, + "learning_rate": 1.30812324929972e-05, 
+ "loss": 0.5051, + "step": 18467 + }, + { + "epoch": 23.63904, + "grad_norm": 1.152032732963562, + "learning_rate": 1.3079231692677071e-05, + "loss": 0.4749, + "step": 18468 + }, + { + "epoch": 23.64032, + "grad_norm": 1.150923490524292, + "learning_rate": 1.3077230892356943e-05, + "loss": 0.4738, + "step": 18469 + }, + { + "epoch": 23.6416, + "grad_norm": 1.1413908004760742, + "learning_rate": 1.3075230092036816e-05, + "loss": 0.4809, + "step": 18470 + }, + { + "epoch": 23.64288, + "grad_norm": 1.130707025527954, + "learning_rate": 1.3073229291716688e-05, + "loss": 0.4826, + "step": 18471 + }, + { + "epoch": 23.64416, + "grad_norm": 1.115024209022522, + "learning_rate": 1.3071228491396558e-05, + "loss": 0.4714, + "step": 18472 + }, + { + "epoch": 23.64544, + "grad_norm": 1.1373069286346436, + "learning_rate": 1.306922769107643e-05, + "loss": 0.4832, + "step": 18473 + }, + { + "epoch": 23.64672, + "grad_norm": 1.1076675653457642, + "learning_rate": 1.3067226890756304e-05, + "loss": 0.466, + "step": 18474 + }, + { + "epoch": 23.648, + "grad_norm": 1.190407156944275, + "learning_rate": 1.3065226090436176e-05, + "loss": 0.5365, + "step": 18475 + }, + { + "epoch": 23.64928, + "grad_norm": 1.0926904678344727, + "learning_rate": 1.3063225290116046e-05, + "loss": 0.5145, + "step": 18476 + }, + { + "epoch": 23.65056, + "grad_norm": 1.066358208656311, + "learning_rate": 1.306122448979592e-05, + "loss": 0.4459, + "step": 18477 + }, + { + "epoch": 23.65184, + "grad_norm": 1.129427194595337, + "learning_rate": 1.3059223689475791e-05, + "loss": 0.4508, + "step": 18478 + }, + { + "epoch": 23.65312, + "grad_norm": 1.1428444385528564, + "learning_rate": 1.3057222889155663e-05, + "loss": 0.4887, + "step": 18479 + }, + { + "epoch": 23.6544, + "grad_norm": 1.1522812843322754, + "learning_rate": 1.3055222088835533e-05, + "loss": 0.4857, + "step": 18480 + }, + { + "epoch": 23.65568, + "grad_norm": 1.1594774723052979, + "learning_rate": 1.3053221288515407e-05, + "loss": 0.483, + 
"step": 18481 + }, + { + "epoch": 23.65696, + "grad_norm": 1.1202539205551147, + "learning_rate": 1.3051220488195279e-05, + "loss": 0.4884, + "step": 18482 + }, + { + "epoch": 23.65824, + "grad_norm": 1.184862732887268, + "learning_rate": 1.304921968787515e-05, + "loss": 0.5249, + "step": 18483 + }, + { + "epoch": 23.65952, + "grad_norm": 1.2333064079284668, + "learning_rate": 1.3047218887555024e-05, + "loss": 0.5485, + "step": 18484 + }, + { + "epoch": 23.660800000000002, + "grad_norm": 1.1727824211120605, + "learning_rate": 1.3045218087234894e-05, + "loss": 0.4687, + "step": 18485 + }, + { + "epoch": 23.66208, + "grad_norm": 1.1470680236816406, + "learning_rate": 1.3043217286914766e-05, + "loss": 0.5179, + "step": 18486 + }, + { + "epoch": 23.66336, + "grad_norm": 1.1859335899353027, + "learning_rate": 1.3041216486594638e-05, + "loss": 0.4971, + "step": 18487 + }, + { + "epoch": 23.66464, + "grad_norm": 1.2277741432189941, + "learning_rate": 1.3039215686274511e-05, + "loss": 0.4887, + "step": 18488 + }, + { + "epoch": 23.66592, + "grad_norm": 1.1842122077941895, + "learning_rate": 1.3037214885954382e-05, + "loss": 0.4908, + "step": 18489 + }, + { + "epoch": 23.6672, + "grad_norm": 1.0600404739379883, + "learning_rate": 1.3035214085634254e-05, + "loss": 0.4354, + "step": 18490 + }, + { + "epoch": 23.66848, + "grad_norm": 1.13839590549469, + "learning_rate": 1.3033213285314127e-05, + "loss": 0.4953, + "step": 18491 + }, + { + "epoch": 23.66976, + "grad_norm": 1.1359751224517822, + "learning_rate": 1.3031212484993999e-05, + "loss": 0.4667, + "step": 18492 + }, + { + "epoch": 23.67104, + "grad_norm": 1.168168067932129, + "learning_rate": 1.3029211684673869e-05, + "loss": 0.4623, + "step": 18493 + }, + { + "epoch": 23.67232, + "grad_norm": 1.1284360885620117, + "learning_rate": 1.3027210884353741e-05, + "loss": 0.4694, + "step": 18494 + }, + { + "epoch": 23.6736, + "grad_norm": 1.1159340143203735, + "learning_rate": 1.3025210084033614e-05, + "loss": 0.4493, + "step": 
18495 + }, + { + "epoch": 23.67488, + "grad_norm": 1.236001968383789, + "learning_rate": 1.3023209283713486e-05, + "loss": 0.508, + "step": 18496 + }, + { + "epoch": 23.67616, + "grad_norm": 1.2399630546569824, + "learning_rate": 1.3021208483393357e-05, + "loss": 0.494, + "step": 18497 + }, + { + "epoch": 23.67744, + "grad_norm": 1.181467890739441, + "learning_rate": 1.3019207683073232e-05, + "loss": 0.4886, + "step": 18498 + }, + { + "epoch": 23.67872, + "grad_norm": 1.2008730173110962, + "learning_rate": 1.3017206882753102e-05, + "loss": 0.5143, + "step": 18499 + }, + { + "epoch": 23.68, + "grad_norm": 1.1191836595535278, + "learning_rate": 1.3015206082432974e-05, + "loss": 0.4378, + "step": 18500 + }, + { + "epoch": 23.68128, + "grad_norm": 1.0821129083633423, + "learning_rate": 1.3013205282112844e-05, + "loss": 0.4534, + "step": 18501 + }, + { + "epoch": 23.68256, + "grad_norm": 1.1685720682144165, + "learning_rate": 1.301120448179272e-05, + "loss": 0.5053, + "step": 18502 + }, + { + "epoch": 23.68384, + "grad_norm": 1.1220587491989136, + "learning_rate": 1.300920368147259e-05, + "loss": 0.5203, + "step": 18503 + }, + { + "epoch": 23.68512, + "grad_norm": 1.1965205669403076, + "learning_rate": 1.3007202881152461e-05, + "loss": 0.535, + "step": 18504 + }, + { + "epoch": 23.6864, + "grad_norm": 1.161190152168274, + "learning_rate": 1.3005202080832335e-05, + "loss": 0.4718, + "step": 18505 + }, + { + "epoch": 23.68768, + "grad_norm": 1.1077537536621094, + "learning_rate": 1.3003201280512207e-05, + "loss": 0.4594, + "step": 18506 + }, + { + "epoch": 23.68896, + "grad_norm": 1.136878490447998, + "learning_rate": 1.3001200480192077e-05, + "loss": 0.5092, + "step": 18507 + }, + { + "epoch": 23.69024, + "grad_norm": 1.1334853172302246, + "learning_rate": 1.2999199679871949e-05, + "loss": 0.4772, + "step": 18508 + }, + { + "epoch": 23.69152, + "grad_norm": 1.1531234979629517, + "learning_rate": 1.2997198879551822e-05, + "loss": 0.4892, + "step": 18509 + }, + { + 
"epoch": 23.6928, + "grad_norm": 1.186574101448059, + "learning_rate": 1.2995198079231694e-05, + "loss": 0.5056, + "step": 18510 + }, + { + "epoch": 23.69408, + "grad_norm": 1.0875732898712158, + "learning_rate": 1.2993197278911564e-05, + "loss": 0.458, + "step": 18511 + }, + { + "epoch": 23.69536, + "grad_norm": 1.11167311668396, + "learning_rate": 1.2991196478591436e-05, + "loss": 0.4613, + "step": 18512 + }, + { + "epoch": 23.69664, + "grad_norm": 1.0831806659698486, + "learning_rate": 1.298919567827131e-05, + "loss": 0.4726, + "step": 18513 + }, + { + "epoch": 23.69792, + "grad_norm": 1.1323453187942505, + "learning_rate": 1.2987194877951181e-05, + "loss": 0.449, + "step": 18514 + }, + { + "epoch": 23.6992, + "grad_norm": 1.2333152294158936, + "learning_rate": 1.2985194077631052e-05, + "loss": 0.5372, + "step": 18515 + }, + { + "epoch": 23.70048, + "grad_norm": 1.1051863431930542, + "learning_rate": 1.2983193277310927e-05, + "loss": 0.4527, + "step": 18516 + }, + { + "epoch": 23.70176, + "grad_norm": 1.1067372560501099, + "learning_rate": 1.2981192476990797e-05, + "loss": 0.4401, + "step": 18517 + }, + { + "epoch": 23.70304, + "grad_norm": 1.1563383340835571, + "learning_rate": 1.2979191676670669e-05, + "loss": 0.4822, + "step": 18518 + }, + { + "epoch": 23.70432, + "grad_norm": 1.1573394536972046, + "learning_rate": 1.2977190876350539e-05, + "loss": 0.4711, + "step": 18519 + }, + { + "epoch": 23.7056, + "grad_norm": 1.1465578079223633, + "learning_rate": 1.2975190076030414e-05, + "loss": 0.4794, + "step": 18520 + }, + { + "epoch": 23.706879999999998, + "grad_norm": 1.093738317489624, + "learning_rate": 1.2973189275710284e-05, + "loss": 0.4742, + "step": 18521 + }, + { + "epoch": 23.70816, + "grad_norm": 1.1131172180175781, + "learning_rate": 1.2971188475390156e-05, + "loss": 0.5315, + "step": 18522 + }, + { + "epoch": 23.70944, + "grad_norm": 1.1380599737167358, + "learning_rate": 1.296918767507003e-05, + "loss": 0.4672, + "step": 18523 + }, + { + "epoch": 
23.71072, + "grad_norm": 1.1121565103530884, + "learning_rate": 1.2967186874749902e-05, + "loss": 0.4578, + "step": 18524 + }, + { + "epoch": 23.712, + "grad_norm": 1.1249797344207764, + "learning_rate": 1.2965186074429772e-05, + "loss": 0.4734, + "step": 18525 + }, + { + "epoch": 23.71328, + "grad_norm": 1.141965389251709, + "learning_rate": 1.2963185274109644e-05, + "loss": 0.4883, + "step": 18526 + }, + { + "epoch": 23.71456, + "grad_norm": 1.1544233560562134, + "learning_rate": 1.2961184473789517e-05, + "loss": 0.4956, + "step": 18527 + }, + { + "epoch": 23.71584, + "grad_norm": 1.1456880569458008, + "learning_rate": 1.2959183673469389e-05, + "loss": 0.4913, + "step": 18528 + }, + { + "epoch": 23.71712, + "grad_norm": 1.1444575786590576, + "learning_rate": 1.295718287314926e-05, + "loss": 0.4707, + "step": 18529 + }, + { + "epoch": 23.7184, + "grad_norm": 1.050844669342041, + "learning_rate": 1.2955182072829133e-05, + "loss": 0.4341, + "step": 18530 + }, + { + "epoch": 23.71968, + "grad_norm": 1.1426994800567627, + "learning_rate": 1.2953181272509005e-05, + "loss": 0.4679, + "step": 18531 + }, + { + "epoch": 23.72096, + "grad_norm": 1.1515729427337646, + "learning_rate": 1.2951180472188877e-05, + "loss": 0.4746, + "step": 18532 + }, + { + "epoch": 23.72224, + "grad_norm": 1.2546969652175903, + "learning_rate": 1.2949179671868747e-05, + "loss": 0.4886, + "step": 18533 + }, + { + "epoch": 23.72352, + "grad_norm": 1.2079882621765137, + "learning_rate": 1.294717887154862e-05, + "loss": 0.4708, + "step": 18534 + }, + { + "epoch": 23.7248, + "grad_norm": 1.2106804847717285, + "learning_rate": 1.2945178071228492e-05, + "loss": 0.5217, + "step": 18535 + }, + { + "epoch": 23.72608, + "grad_norm": 1.2002469301223755, + "learning_rate": 1.2943177270908364e-05, + "loss": 0.4444, + "step": 18536 + }, + { + "epoch": 23.72736, + "grad_norm": 1.2250186204910278, + "learning_rate": 1.2941176470588238e-05, + "loss": 0.5409, + "step": 18537 + }, + { + "epoch": 23.72864, + 
"grad_norm": 1.1869670152664185, + "learning_rate": 1.2939175670268108e-05, + "loss": 0.4845, + "step": 18538 + }, + { + "epoch": 23.72992, + "grad_norm": 1.2370437383651733, + "learning_rate": 1.293717486994798e-05, + "loss": 0.4883, + "step": 18539 + }, + { + "epoch": 23.7312, + "grad_norm": 1.15911066532135, + "learning_rate": 1.2935174069627851e-05, + "loss": 0.4697, + "step": 18540 + }, + { + "epoch": 23.73248, + "grad_norm": 1.1930956840515137, + "learning_rate": 1.2933173269307725e-05, + "loss": 0.4824, + "step": 18541 + }, + { + "epoch": 23.73376, + "grad_norm": 1.1702487468719482, + "learning_rate": 1.2931172468987595e-05, + "loss": 0.4525, + "step": 18542 + }, + { + "epoch": 23.73504, + "grad_norm": 1.1612924337387085, + "learning_rate": 1.2929171668667467e-05, + "loss": 0.5488, + "step": 18543 + }, + { + "epoch": 23.73632, + "grad_norm": 1.1951484680175781, + "learning_rate": 1.292717086834734e-05, + "loss": 0.4683, + "step": 18544 + }, + { + "epoch": 23.7376, + "grad_norm": 1.1368860006332397, + "learning_rate": 1.2925170068027212e-05, + "loss": 0.4735, + "step": 18545 + }, + { + "epoch": 23.73888, + "grad_norm": 1.100229263305664, + "learning_rate": 1.2923169267707083e-05, + "loss": 0.4805, + "step": 18546 + }, + { + "epoch": 23.74016, + "grad_norm": 1.1962560415267944, + "learning_rate": 1.2921168467386954e-05, + "loss": 0.488, + "step": 18547 + }, + { + "epoch": 23.74144, + "grad_norm": 1.1619665622711182, + "learning_rate": 1.2919167667066828e-05, + "loss": 0.4893, + "step": 18548 + }, + { + "epoch": 23.74272, + "grad_norm": 1.1612014770507812, + "learning_rate": 1.29171668667467e-05, + "loss": 0.4735, + "step": 18549 + }, + { + "epoch": 23.744, + "grad_norm": 1.1319211721420288, + "learning_rate": 1.291516606642657e-05, + "loss": 0.4683, + "step": 18550 + }, + { + "epoch": 23.74528, + "grad_norm": 1.1409507989883423, + "learning_rate": 1.2913165266106445e-05, + "loss": 0.4702, + "step": 18551 + }, + { + "epoch": 23.74656, + "grad_norm": 
1.1675480604171753, + "learning_rate": 1.2911164465786315e-05, + "loss": 0.4782, + "step": 18552 + }, + { + "epoch": 23.74784, + "grad_norm": 1.1573346853256226, + "learning_rate": 1.2909163665466187e-05, + "loss": 0.5024, + "step": 18553 + }, + { + "epoch": 23.74912, + "grad_norm": 1.1228611469268799, + "learning_rate": 1.2907162865146057e-05, + "loss": 0.4761, + "step": 18554 + }, + { + "epoch": 23.7504, + "grad_norm": 1.0913078784942627, + "learning_rate": 1.2905162064825933e-05, + "loss": 0.4681, + "step": 18555 + }, + { + "epoch": 23.75168, + "grad_norm": 1.1557594537734985, + "learning_rate": 1.2903161264505803e-05, + "loss": 0.4563, + "step": 18556 + }, + { + "epoch": 23.75296, + "grad_norm": 1.1187933683395386, + "learning_rate": 1.2901160464185675e-05, + "loss": 0.4953, + "step": 18557 + }, + { + "epoch": 23.75424, + "grad_norm": 1.18485426902771, + "learning_rate": 1.2899159663865548e-05, + "loss": 0.5024, + "step": 18558 + }, + { + "epoch": 23.75552, + "grad_norm": 1.158681035041809, + "learning_rate": 1.289715886354542e-05, + "loss": 0.471, + "step": 18559 + }, + { + "epoch": 23.7568, + "grad_norm": 1.1719639301300049, + "learning_rate": 1.289515806322529e-05, + "loss": 0.4878, + "step": 18560 + }, + { + "epoch": 23.75808, + "grad_norm": 1.1346675157546997, + "learning_rate": 1.2893157262905162e-05, + "loss": 0.4737, + "step": 18561 + }, + { + "epoch": 23.75936, + "grad_norm": 1.1762566566467285, + "learning_rate": 1.2891156462585036e-05, + "loss": 0.4824, + "step": 18562 + }, + { + "epoch": 23.76064, + "grad_norm": 1.190079689025879, + "learning_rate": 1.2889155662264908e-05, + "loss": 0.4976, + "step": 18563 + }, + { + "epoch": 23.76192, + "grad_norm": 1.1472417116165161, + "learning_rate": 1.2887154861944778e-05, + "loss": 0.4598, + "step": 18564 + }, + { + "epoch": 23.7632, + "grad_norm": 1.1223433017730713, + "learning_rate": 1.288515406162465e-05, + "loss": 0.4866, + "step": 18565 + }, + { + "epoch": 23.76448, + "grad_norm": 1.154817819595337, + 
"learning_rate": 1.2883153261304523e-05, + "loss": 0.4892, + "step": 18566 + }, + { + "epoch": 23.76576, + "grad_norm": 1.1164178848266602, + "learning_rate": 1.2881152460984395e-05, + "loss": 0.5027, + "step": 18567 + }, + { + "epoch": 23.76704, + "grad_norm": 1.171064019203186, + "learning_rate": 1.2879151660664265e-05, + "loss": 0.4635, + "step": 18568 + }, + { + "epoch": 23.76832, + "grad_norm": 1.1405155658721924, + "learning_rate": 1.2877150860344139e-05, + "loss": 0.4721, + "step": 18569 + }, + { + "epoch": 23.7696, + "grad_norm": 1.0519278049468994, + "learning_rate": 1.287515006002401e-05, + "loss": 0.4368, + "step": 18570 + }, + { + "epoch": 23.77088, + "grad_norm": 1.1556754112243652, + "learning_rate": 1.2873149259703882e-05, + "loss": 0.5245, + "step": 18571 + }, + { + "epoch": 23.77216, + "grad_norm": 1.1321344375610352, + "learning_rate": 1.2871148459383753e-05, + "loss": 0.4937, + "step": 18572 + }, + { + "epoch": 23.77344, + "grad_norm": 1.14230477809906, + "learning_rate": 1.2869147659063626e-05, + "loss": 0.4789, + "step": 18573 + }, + { + "epoch": 23.77472, + "grad_norm": 1.082296371459961, + "learning_rate": 1.2867146858743498e-05, + "loss": 0.4855, + "step": 18574 + }, + { + "epoch": 23.776, + "grad_norm": 1.1363306045532227, + "learning_rate": 1.286514605842337e-05, + "loss": 0.4706, + "step": 18575 + }, + { + "epoch": 23.77728, + "grad_norm": 1.180460810661316, + "learning_rate": 1.2863145258103243e-05, + "loss": 0.503, + "step": 18576 + }, + { + "epoch": 23.77856, + "grad_norm": 1.1838196516036987, + "learning_rate": 1.2861144457783114e-05, + "loss": 0.5241, + "step": 18577 + }, + { + "epoch": 23.77984, + "grad_norm": 1.1658860445022583, + "learning_rate": 1.2859143657462985e-05, + "loss": 0.4915, + "step": 18578 + }, + { + "epoch": 23.78112, + "grad_norm": 1.1486824750900269, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.4803, + "step": 18579 + }, + { + "epoch": 23.7824, + "grad_norm": 1.0898503065109253, + "learning_rate": 
1.285514205682273e-05, + "loss": 0.4444, + "step": 18580 + }, + { + "epoch": 23.78368, + "grad_norm": 1.1599661111831665, + "learning_rate": 1.2853141256502601e-05, + "loss": 0.539, + "step": 18581 + }, + { + "epoch": 23.78496, + "grad_norm": 1.151951551437378, + "learning_rate": 1.2851140456182473e-05, + "loss": 0.4741, + "step": 18582 + }, + { + "epoch": 23.78624, + "grad_norm": 1.2248902320861816, + "learning_rate": 1.2849139655862346e-05, + "loss": 0.4911, + "step": 18583 + }, + { + "epoch": 23.78752, + "grad_norm": 1.1011284589767456, + "learning_rate": 1.2847138855542218e-05, + "loss": 0.491, + "step": 18584 + }, + { + "epoch": 23.7888, + "grad_norm": 1.1119115352630615, + "learning_rate": 1.2845138055222088e-05, + "loss": 0.47, + "step": 18585 + }, + { + "epoch": 23.79008, + "grad_norm": 1.1536645889282227, + "learning_rate": 1.284313725490196e-05, + "loss": 0.4986, + "step": 18586 + }, + { + "epoch": 23.79136, + "grad_norm": 1.1889305114746094, + "learning_rate": 1.2841136454581834e-05, + "loss": 0.4693, + "step": 18587 + }, + { + "epoch": 23.79264, + "grad_norm": 1.0987606048583984, + "learning_rate": 1.2839135654261706e-05, + "loss": 0.4626, + "step": 18588 + }, + { + "epoch": 23.79392, + "grad_norm": 1.1800764799118042, + "learning_rate": 1.2837134853941576e-05, + "loss": 0.5212, + "step": 18589 + }, + { + "epoch": 23.7952, + "grad_norm": 1.1225427389144897, + "learning_rate": 1.2835134053621451e-05, + "loss": 0.4598, + "step": 18590 + }, + { + "epoch": 23.79648, + "grad_norm": 1.1654258966445923, + "learning_rate": 1.2833133253301321e-05, + "loss": 0.491, + "step": 18591 + }, + { + "epoch": 23.79776, + "grad_norm": 1.114014983177185, + "learning_rate": 1.2831132452981193e-05, + "loss": 0.4837, + "step": 18592 + }, + { + "epoch": 23.79904, + "grad_norm": 1.117835521697998, + "learning_rate": 1.2829131652661063e-05, + "loss": 0.4643, + "step": 18593 + }, + { + "epoch": 23.80032, + "grad_norm": 1.181115746498108, + "learning_rate": 1.2827130852340938e-05, 
+ "loss": 0.5055, + "step": 18594 + }, + { + "epoch": 23.8016, + "grad_norm": 1.112195611000061, + "learning_rate": 1.2825130052020809e-05, + "loss": 0.4422, + "step": 18595 + }, + { + "epoch": 23.802880000000002, + "grad_norm": 1.1325654983520508, + "learning_rate": 1.282312925170068e-05, + "loss": 0.4486, + "step": 18596 + }, + { + "epoch": 23.80416, + "grad_norm": 1.103108286857605, + "learning_rate": 1.2821128451380554e-05, + "loss": 0.4953, + "step": 18597 + }, + { + "epoch": 23.80544, + "grad_norm": 1.0995981693267822, + "learning_rate": 1.2819127651060426e-05, + "loss": 0.4517, + "step": 18598 + }, + { + "epoch": 23.80672, + "grad_norm": 1.0977530479431152, + "learning_rate": 1.2817126850740296e-05, + "loss": 0.4939, + "step": 18599 + }, + { + "epoch": 23.808, + "grad_norm": 1.1470292806625366, + "learning_rate": 1.2815126050420168e-05, + "loss": 0.4998, + "step": 18600 + }, + { + "epoch": 23.80928, + "grad_norm": 1.1226685047149658, + "learning_rate": 1.2813125250100041e-05, + "loss": 0.4792, + "step": 18601 + }, + { + "epoch": 23.81056, + "grad_norm": 1.1221965551376343, + "learning_rate": 1.2811124449779913e-05, + "loss": 0.4889, + "step": 18602 + }, + { + "epoch": 23.81184, + "grad_norm": 1.1775237321853638, + "learning_rate": 1.2809123649459783e-05, + "loss": 0.4841, + "step": 18603 + }, + { + "epoch": 23.81312, + "grad_norm": 1.0999358892440796, + "learning_rate": 1.2807122849139657e-05, + "loss": 0.4482, + "step": 18604 + }, + { + "epoch": 23.8144, + "grad_norm": 1.074433445930481, + "learning_rate": 1.2805122048819529e-05, + "loss": 0.4656, + "step": 18605 + }, + { + "epoch": 23.81568, + "grad_norm": 1.0768699645996094, + "learning_rate": 1.28031212484994e-05, + "loss": 0.4976, + "step": 18606 + }, + { + "epoch": 23.81696, + "grad_norm": 1.0591034889221191, + "learning_rate": 1.2801120448179271e-05, + "loss": 0.4466, + "step": 18607 + }, + { + "epoch": 23.81824, + "grad_norm": 1.2163914442062378, + "learning_rate": 1.2799119647859144e-05, + "loss": 
0.5735, + "step": 18608 + }, + { + "epoch": 23.81952, + "grad_norm": 1.1105905771255493, + "learning_rate": 1.2797118847539016e-05, + "loss": 0.4582, + "step": 18609 + }, + { + "epoch": 23.8208, + "grad_norm": 1.1055454015731812, + "learning_rate": 1.2795118047218888e-05, + "loss": 0.4706, + "step": 18610 + }, + { + "epoch": 23.82208, + "grad_norm": 1.1523383855819702, + "learning_rate": 1.2793117246898762e-05, + "loss": 0.4749, + "step": 18611 + }, + { + "epoch": 23.82336, + "grad_norm": 1.1723835468292236, + "learning_rate": 1.2791116446578632e-05, + "loss": 0.4854, + "step": 18612 + }, + { + "epoch": 23.82464, + "grad_norm": 1.1401787996292114, + "learning_rate": 1.2789115646258504e-05, + "loss": 0.5144, + "step": 18613 + }, + { + "epoch": 23.82592, + "grad_norm": 1.1540292501449585, + "learning_rate": 1.2787114845938376e-05, + "loss": 0.4866, + "step": 18614 + }, + { + "epoch": 23.8272, + "grad_norm": 1.1869522333145142, + "learning_rate": 1.2785114045618249e-05, + "loss": 0.4653, + "step": 18615 + }, + { + "epoch": 23.82848, + "grad_norm": 1.1412826776504517, + "learning_rate": 1.278311324529812e-05, + "loss": 0.4968, + "step": 18616 + }, + { + "epoch": 23.82976, + "grad_norm": 1.15664803981781, + "learning_rate": 1.2781112444977991e-05, + "loss": 0.4903, + "step": 18617 + }, + { + "epoch": 23.83104, + "grad_norm": 1.1142537593841553, + "learning_rate": 1.2779111644657865e-05, + "loss": 0.4709, + "step": 18618 + }, + { + "epoch": 23.83232, + "grad_norm": 1.0486506223678589, + "learning_rate": 1.2777110844337737e-05, + "loss": 0.4785, + "step": 18619 + }, + { + "epoch": 23.8336, + "grad_norm": 1.0911049842834473, + "learning_rate": 1.2775110044017607e-05, + "loss": 0.4762, + "step": 18620 + }, + { + "epoch": 23.83488, + "grad_norm": 1.146394968032837, + "learning_rate": 1.2773109243697479e-05, + "loss": 0.521, + "step": 18621 + }, + { + "epoch": 23.83616, + "grad_norm": 1.053382396697998, + "learning_rate": 1.2771108443377352e-05, + "loss": 0.4642, + "step": 
18622 + }, + { + "epoch": 23.83744, + "grad_norm": 1.1303876638412476, + "learning_rate": 1.2769107643057224e-05, + "loss": 0.4674, + "step": 18623 + }, + { + "epoch": 23.83872, + "grad_norm": 1.1761562824249268, + "learning_rate": 1.2767106842737094e-05, + "loss": 0.5115, + "step": 18624 + }, + { + "epoch": 23.84, + "grad_norm": 1.120778203010559, + "learning_rate": 1.2765106042416966e-05, + "loss": 0.4848, + "step": 18625 + }, + { + "epoch": 23.84128, + "grad_norm": 1.110851764678955, + "learning_rate": 1.276310524209684e-05, + "loss": 0.4846, + "step": 18626 + }, + { + "epoch": 23.84256, + "grad_norm": 1.1596050262451172, + "learning_rate": 1.2761104441776711e-05, + "loss": 0.4922, + "step": 18627 + }, + { + "epoch": 23.84384, + "grad_norm": 1.1609506607055664, + "learning_rate": 1.2759103641456582e-05, + "loss": 0.5059, + "step": 18628 + }, + { + "epoch": 23.84512, + "grad_norm": 1.1542946100234985, + "learning_rate": 1.2757102841136457e-05, + "loss": 0.5106, + "step": 18629 + }, + { + "epoch": 23.8464, + "grad_norm": 1.1738747358322144, + "learning_rate": 1.2755102040816327e-05, + "loss": 0.4977, + "step": 18630 + }, + { + "epoch": 23.84768, + "grad_norm": 1.1270424127578735, + "learning_rate": 1.2753101240496199e-05, + "loss": 0.4735, + "step": 18631 + }, + { + "epoch": 23.84896, + "grad_norm": 1.1536674499511719, + "learning_rate": 1.2751100440176069e-05, + "loss": 0.506, + "step": 18632 + }, + { + "epoch": 23.85024, + "grad_norm": 1.1659085750579834, + "learning_rate": 1.2749099639855944e-05, + "loss": 0.5192, + "step": 18633 + }, + { + "epoch": 23.85152, + "grad_norm": 1.1134508848190308, + "learning_rate": 1.2747098839535814e-05, + "loss": 0.4403, + "step": 18634 + }, + { + "epoch": 23.8528, + "grad_norm": 1.1781237125396729, + "learning_rate": 1.2745098039215686e-05, + "loss": 0.4842, + "step": 18635 + }, + { + "epoch": 23.85408, + "grad_norm": 1.1390173435211182, + "learning_rate": 1.274309723889556e-05, + "loss": 0.4649, + "step": 18636 + }, + { + 
"epoch": 23.85536, + "grad_norm": 1.1824352741241455, + "learning_rate": 1.2741096438575432e-05, + "loss": 0.4635, + "step": 18637 + }, + { + "epoch": 23.85664, + "grad_norm": 1.1931419372558594, + "learning_rate": 1.2739095638255302e-05, + "loss": 0.5115, + "step": 18638 + }, + { + "epoch": 23.85792, + "grad_norm": 1.082920789718628, + "learning_rate": 1.2737094837935174e-05, + "loss": 0.4455, + "step": 18639 + }, + { + "epoch": 23.8592, + "grad_norm": 1.1064939498901367, + "learning_rate": 1.2735094037615047e-05, + "loss": 0.4162, + "step": 18640 + }, + { + "epoch": 23.86048, + "grad_norm": 1.1889894008636475, + "learning_rate": 1.2733093237294919e-05, + "loss": 0.4835, + "step": 18641 + }, + { + "epoch": 23.86176, + "grad_norm": 1.2531040906906128, + "learning_rate": 1.273109243697479e-05, + "loss": 0.5775, + "step": 18642 + }, + { + "epoch": 23.86304, + "grad_norm": 1.176379919052124, + "learning_rate": 1.2729091636654663e-05, + "loss": 0.5473, + "step": 18643 + }, + { + "epoch": 23.86432, + "grad_norm": 1.1142444610595703, + "learning_rate": 1.2727090836334535e-05, + "loss": 0.4867, + "step": 18644 + }, + { + "epoch": 23.8656, + "grad_norm": 1.1500768661499023, + "learning_rate": 1.2725090036014407e-05, + "loss": 0.4787, + "step": 18645 + }, + { + "epoch": 23.86688, + "grad_norm": 1.1695806980133057, + "learning_rate": 1.2723089235694277e-05, + "loss": 0.4926, + "step": 18646 + }, + { + "epoch": 23.86816, + "grad_norm": 1.1269073486328125, + "learning_rate": 1.272108843537415e-05, + "loss": 0.4831, + "step": 18647 + }, + { + "epoch": 23.86944, + "grad_norm": 1.213214635848999, + "learning_rate": 1.2719087635054022e-05, + "loss": 0.5235, + "step": 18648 + }, + { + "epoch": 23.87072, + "grad_norm": 1.134529709815979, + "learning_rate": 1.2717086834733894e-05, + "loss": 0.5038, + "step": 18649 + }, + { + "epoch": 23.872, + "grad_norm": 1.142578363418579, + "learning_rate": 1.2715086034413768e-05, + "loss": 0.4529, + "step": 18650 + }, + { + "epoch": 23.87328, + 
"grad_norm": 1.2363348007202148, + "learning_rate": 1.2713085234093638e-05, + "loss": 0.5431, + "step": 18651 + }, + { + "epoch": 23.87456, + "grad_norm": 1.0975717306137085, + "learning_rate": 1.271108443377351e-05, + "loss": 0.4614, + "step": 18652 + }, + { + "epoch": 23.87584, + "grad_norm": 1.1565914154052734, + "learning_rate": 1.2709083633453381e-05, + "loss": 0.5314, + "step": 18653 + }, + { + "epoch": 23.87712, + "grad_norm": 1.1390055418014526, + "learning_rate": 1.2707082833133255e-05, + "loss": 0.4763, + "step": 18654 + }, + { + "epoch": 23.8784, + "grad_norm": 1.120939016342163, + "learning_rate": 1.2705082032813125e-05, + "loss": 0.4826, + "step": 18655 + }, + { + "epoch": 23.87968, + "grad_norm": 1.1649311780929565, + "learning_rate": 1.2703081232492997e-05, + "loss": 0.5242, + "step": 18656 + }, + { + "epoch": 23.88096, + "grad_norm": 1.1487033367156982, + "learning_rate": 1.270108043217287e-05, + "loss": 0.4823, + "step": 18657 + }, + { + "epoch": 23.88224, + "grad_norm": 1.161218523979187, + "learning_rate": 1.2699079631852742e-05, + "loss": 0.4973, + "step": 18658 + }, + { + "epoch": 23.88352, + "grad_norm": 1.1504417657852173, + "learning_rate": 1.2697078831532613e-05, + "loss": 0.4817, + "step": 18659 + }, + { + "epoch": 23.8848, + "grad_norm": 1.204056978225708, + "learning_rate": 1.2695078031212484e-05, + "loss": 0.531, + "step": 18660 + }, + { + "epoch": 23.88608, + "grad_norm": 1.180108666419983, + "learning_rate": 1.2693077230892358e-05, + "loss": 0.4767, + "step": 18661 + }, + { + "epoch": 23.88736, + "grad_norm": 1.0854483842849731, + "learning_rate": 1.269107643057223e-05, + "loss": 0.4293, + "step": 18662 + }, + { + "epoch": 23.88864, + "grad_norm": 1.11992609500885, + "learning_rate": 1.26890756302521e-05, + "loss": 0.4731, + "step": 18663 + }, + { + "epoch": 23.88992, + "grad_norm": 1.1562358140945435, + "learning_rate": 1.2687074829931975e-05, + "loss": 0.5066, + "step": 18664 + }, + { + "epoch": 23.8912, + "grad_norm": 
1.1416101455688477, + "learning_rate": 1.2685074029611845e-05, + "loss": 0.517, + "step": 18665 + }, + { + "epoch": 23.89248, + "grad_norm": 1.144882082939148, + "learning_rate": 1.2683073229291717e-05, + "loss": 0.4406, + "step": 18666 + }, + { + "epoch": 23.89376, + "grad_norm": 1.1081516742706299, + "learning_rate": 1.2681072428971587e-05, + "loss": 0.4549, + "step": 18667 + }, + { + "epoch": 23.89504, + "grad_norm": 1.1630297899246216, + "learning_rate": 1.2679071628651463e-05, + "loss": 0.4835, + "step": 18668 + }, + { + "epoch": 23.89632, + "grad_norm": 1.1396640539169312, + "learning_rate": 1.2677070828331333e-05, + "loss": 0.4788, + "step": 18669 + }, + { + "epoch": 23.8976, + "grad_norm": 1.2122644186019897, + "learning_rate": 1.2675070028011205e-05, + "loss": 0.5103, + "step": 18670 + }, + { + "epoch": 23.89888, + "grad_norm": 1.224592924118042, + "learning_rate": 1.2673069227691078e-05, + "loss": 0.5471, + "step": 18671 + }, + { + "epoch": 23.90016, + "grad_norm": 1.184139370918274, + "learning_rate": 1.267106842737095e-05, + "loss": 0.5388, + "step": 18672 + }, + { + "epoch": 23.90144, + "grad_norm": 1.2130299806594849, + "learning_rate": 1.266906762705082e-05, + "loss": 0.5117, + "step": 18673 + }, + { + "epoch": 23.90272, + "grad_norm": 1.1781811714172363, + "learning_rate": 1.2667066826730692e-05, + "loss": 0.4856, + "step": 18674 + }, + { + "epoch": 23.904, + "grad_norm": 1.1690351963043213, + "learning_rate": 1.2665066026410566e-05, + "loss": 0.4863, + "step": 18675 + }, + { + "epoch": 23.90528, + "grad_norm": 1.0583910942077637, + "learning_rate": 1.2663065226090437e-05, + "loss": 0.4572, + "step": 18676 + }, + { + "epoch": 23.90656, + "grad_norm": 1.0969510078430176, + "learning_rate": 1.2661064425770308e-05, + "loss": 0.4255, + "step": 18677 + }, + { + "epoch": 23.90784, + "grad_norm": 1.1258211135864258, + "learning_rate": 1.265906362545018e-05, + "loss": 0.4855, + "step": 18678 + }, + { + "epoch": 23.90912, + "grad_norm": 1.175527572631836, + 
"learning_rate": 1.2657062825130053e-05, + "loss": 0.5177, + "step": 18679 + }, + { + "epoch": 23.9104, + "grad_norm": 1.1303937435150146, + "learning_rate": 1.2655062024809925e-05, + "loss": 0.4979, + "step": 18680 + }, + { + "epoch": 23.91168, + "grad_norm": 1.0970633029937744, + "learning_rate": 1.2653061224489795e-05, + "loss": 0.4648, + "step": 18681 + }, + { + "epoch": 23.912959999999998, + "grad_norm": 1.12120521068573, + "learning_rate": 1.265106042416967e-05, + "loss": 0.5126, + "step": 18682 + }, + { + "epoch": 23.91424, + "grad_norm": 1.1222971677780151, + "learning_rate": 1.264905962384954e-05, + "loss": 0.4831, + "step": 18683 + }, + { + "epoch": 23.91552, + "grad_norm": 1.146567702293396, + "learning_rate": 1.2647058823529412e-05, + "loss": 0.4479, + "step": 18684 + }, + { + "epoch": 23.9168, + "grad_norm": 1.0587197542190552, + "learning_rate": 1.2645058023209283e-05, + "loss": 0.4675, + "step": 18685 + }, + { + "epoch": 23.91808, + "grad_norm": 1.0969828367233276, + "learning_rate": 1.2643057222889158e-05, + "loss": 0.5139, + "step": 18686 + }, + { + "epoch": 23.91936, + "grad_norm": 1.1403480768203735, + "learning_rate": 1.2641056422569028e-05, + "loss": 0.5023, + "step": 18687 + }, + { + "epoch": 23.92064, + "grad_norm": 1.137335181236267, + "learning_rate": 1.26390556222489e-05, + "loss": 0.4405, + "step": 18688 + }, + { + "epoch": 23.92192, + "grad_norm": 1.1394611597061157, + "learning_rate": 1.2637054821928773e-05, + "loss": 0.5089, + "step": 18689 + }, + { + "epoch": 23.9232, + "grad_norm": 1.1698044538497925, + "learning_rate": 1.2635054021608645e-05, + "loss": 0.436, + "step": 18690 + }, + { + "epoch": 23.92448, + "grad_norm": 1.1128085851669312, + "learning_rate": 1.2633053221288515e-05, + "loss": 0.4677, + "step": 18691 + }, + { + "epoch": 23.92576, + "grad_norm": 1.1220570802688599, + "learning_rate": 1.2631052420968387e-05, + "loss": 0.4621, + "step": 18692 + }, + { + "epoch": 23.92704, + "grad_norm": 1.0818321704864502, + 
"learning_rate": 1.262905162064826e-05, + "loss": 0.4531, + "step": 18693 + }, + { + "epoch": 23.92832, + "grad_norm": 1.1508997678756714, + "learning_rate": 1.2627050820328133e-05, + "loss": 0.4856, + "step": 18694 + }, + { + "epoch": 23.9296, + "grad_norm": 1.146302580833435, + "learning_rate": 1.2625050020008003e-05, + "loss": 0.4544, + "step": 18695 + }, + { + "epoch": 23.93088, + "grad_norm": 1.1530104875564575, + "learning_rate": 1.2623049219687876e-05, + "loss": 0.4572, + "step": 18696 + }, + { + "epoch": 23.93216, + "grad_norm": 1.2042316198349, + "learning_rate": 1.2621048419367748e-05, + "loss": 0.521, + "step": 18697 + }, + { + "epoch": 23.93344, + "grad_norm": 1.1437196731567383, + "learning_rate": 1.261904761904762e-05, + "loss": 0.4874, + "step": 18698 + }, + { + "epoch": 23.93472, + "grad_norm": 1.0899137258529663, + "learning_rate": 1.261704681872749e-05, + "loss": 0.4506, + "step": 18699 + }, + { + "epoch": 23.936, + "grad_norm": 1.153517723083496, + "learning_rate": 1.2615046018407364e-05, + "loss": 0.4816, + "step": 18700 + }, + { + "epoch": 23.93728, + "grad_norm": 1.1165791749954224, + "learning_rate": 1.2613045218087236e-05, + "loss": 0.4971, + "step": 18701 + }, + { + "epoch": 23.93856, + "grad_norm": 1.0416091680526733, + "learning_rate": 1.2611044417767107e-05, + "loss": 0.4636, + "step": 18702 + }, + { + "epoch": 23.93984, + "grad_norm": 1.1278401613235474, + "learning_rate": 1.2609043617446981e-05, + "loss": 0.4784, + "step": 18703 + }, + { + "epoch": 23.94112, + "grad_norm": 1.0824394226074219, + "learning_rate": 1.2607042817126851e-05, + "loss": 0.4231, + "step": 18704 + }, + { + "epoch": 23.9424, + "grad_norm": 1.1062698364257812, + "learning_rate": 1.2605042016806723e-05, + "loss": 0.4636, + "step": 18705 + }, + { + "epoch": 23.94368, + "grad_norm": 1.1416337490081787, + "learning_rate": 1.2603041216486595e-05, + "loss": 0.4995, + "step": 18706 + }, + { + "epoch": 23.944960000000002, + "grad_norm": 1.0663871765136719, + 
"learning_rate": 1.2601040416166468e-05, + "loss": 0.4747, + "step": 18707 + }, + { + "epoch": 23.94624, + "grad_norm": 1.2235000133514404, + "learning_rate": 1.2599039615846339e-05, + "loss": 0.4992, + "step": 18708 + }, + { + "epoch": 23.94752, + "grad_norm": 1.0971728563308716, + "learning_rate": 1.259703881552621e-05, + "loss": 0.4431, + "step": 18709 + }, + { + "epoch": 23.9488, + "grad_norm": 1.1560825109481812, + "learning_rate": 1.2595038015206084e-05, + "loss": 0.4653, + "step": 18710 + }, + { + "epoch": 23.95008, + "grad_norm": 1.0700130462646484, + "learning_rate": 1.2593037214885956e-05, + "loss": 0.4259, + "step": 18711 + }, + { + "epoch": 23.95136, + "grad_norm": 1.120007038116455, + "learning_rate": 1.2591036414565826e-05, + "loss": 0.5081, + "step": 18712 + }, + { + "epoch": 23.95264, + "grad_norm": 1.1066792011260986, + "learning_rate": 1.2589035614245698e-05, + "loss": 0.4736, + "step": 18713 + }, + { + "epoch": 23.95392, + "grad_norm": 1.1956183910369873, + "learning_rate": 1.2587034813925571e-05, + "loss": 0.5158, + "step": 18714 + }, + { + "epoch": 23.9552, + "grad_norm": 1.1309118270874023, + "learning_rate": 1.2585034013605443e-05, + "loss": 0.459, + "step": 18715 + }, + { + "epoch": 23.95648, + "grad_norm": 1.1367197036743164, + "learning_rate": 1.2583033213285313e-05, + "loss": 0.5102, + "step": 18716 + }, + { + "epoch": 23.95776, + "grad_norm": 1.1677851676940918, + "learning_rate": 1.2581032412965189e-05, + "loss": 0.4909, + "step": 18717 + }, + { + "epoch": 23.95904, + "grad_norm": 1.1207268238067627, + "learning_rate": 1.2579031612645059e-05, + "loss": 0.5059, + "step": 18718 + }, + { + "epoch": 23.96032, + "grad_norm": 1.18141770362854, + "learning_rate": 1.257703081232493e-05, + "loss": 0.4715, + "step": 18719 + }, + { + "epoch": 23.9616, + "grad_norm": 1.203596591949463, + "learning_rate": 1.2575030012004801e-05, + "loss": 0.5205, + "step": 18720 + }, + { + "epoch": 23.96288, + "grad_norm": 1.124710202217102, + "learning_rate": 
1.2573029211684676e-05, + "loss": 0.478, + "step": 18721 + }, + { + "epoch": 23.96416, + "grad_norm": 1.1432015895843506, + "learning_rate": 1.2571028411364546e-05, + "loss": 0.4859, + "step": 18722 + }, + { + "epoch": 23.96544, + "grad_norm": 1.1251646280288696, + "learning_rate": 1.2569027611044418e-05, + "loss": 0.4917, + "step": 18723 + }, + { + "epoch": 23.96672, + "grad_norm": 1.1594064235687256, + "learning_rate": 1.2567026810724292e-05, + "loss": 0.4919, + "step": 18724 + }, + { + "epoch": 23.968, + "grad_norm": 1.253121256828308, + "learning_rate": 1.2565026010404164e-05, + "loss": 0.467, + "step": 18725 + }, + { + "epoch": 23.96928, + "grad_norm": 1.188087821006775, + "learning_rate": 1.2563025210084034e-05, + "loss": 0.5015, + "step": 18726 + }, + { + "epoch": 23.97056, + "grad_norm": 1.1531556844711304, + "learning_rate": 1.2561024409763906e-05, + "loss": 0.4871, + "step": 18727 + }, + { + "epoch": 23.97184, + "grad_norm": 1.1422969102859497, + "learning_rate": 1.2559023609443779e-05, + "loss": 0.4797, + "step": 18728 + }, + { + "epoch": 23.97312, + "grad_norm": 1.1510505676269531, + "learning_rate": 1.2557022809123651e-05, + "loss": 0.4865, + "step": 18729 + }, + { + "epoch": 23.9744, + "grad_norm": 1.1790677309036255, + "learning_rate": 1.2555022008803521e-05, + "loss": 0.5234, + "step": 18730 + }, + { + "epoch": 23.97568, + "grad_norm": 1.141440987586975, + "learning_rate": 1.2553021208483393e-05, + "loss": 0.4965, + "step": 18731 + }, + { + "epoch": 23.97696, + "grad_norm": 1.1835546493530273, + "learning_rate": 1.2551020408163267e-05, + "loss": 0.4611, + "step": 18732 + }, + { + "epoch": 23.97824, + "grad_norm": 1.1586164236068726, + "learning_rate": 1.2549019607843138e-05, + "loss": 0.5029, + "step": 18733 + }, + { + "epoch": 23.97952, + "grad_norm": 1.1706522703170776, + "learning_rate": 1.2547018807523009e-05, + "loss": 0.4507, + "step": 18734 + }, + { + "epoch": 23.9808, + "grad_norm": 1.1484572887420654, + "learning_rate": 
1.2545018007202882e-05, + "loss": 0.5009, + "step": 18735 + }, + { + "epoch": 23.98208, + "grad_norm": 1.1552718877792358, + "learning_rate": 1.2543017206882754e-05, + "loss": 0.4575, + "step": 18736 + }, + { + "epoch": 23.98336, + "grad_norm": 1.15525221824646, + "learning_rate": 1.2541016406562626e-05, + "loss": 0.5026, + "step": 18737 + }, + { + "epoch": 23.98464, + "grad_norm": 1.107552170753479, + "learning_rate": 1.2539015606242496e-05, + "loss": 0.4756, + "step": 18738 + }, + { + "epoch": 23.98592, + "grad_norm": 1.2044583559036255, + "learning_rate": 1.253701480592237e-05, + "loss": 0.4688, + "step": 18739 + }, + { + "epoch": 23.9872, + "grad_norm": 1.1863468885421753, + "learning_rate": 1.2535014005602241e-05, + "loss": 0.4841, + "step": 18740 + }, + { + "epoch": 23.98848, + "grad_norm": 1.1284416913986206, + "learning_rate": 1.2533013205282113e-05, + "loss": 0.4416, + "step": 18741 + }, + { + "epoch": 23.98976, + "grad_norm": 1.1152154207229614, + "learning_rate": 1.2531012404961987e-05, + "loss": 0.4756, + "step": 18742 + }, + { + "epoch": 23.99104, + "grad_norm": 1.1176108121871948, + "learning_rate": 1.2529011604641857e-05, + "loss": 0.449, + "step": 18743 + }, + { + "epoch": 23.99232, + "grad_norm": 1.2353702783584595, + "learning_rate": 1.2527010804321729e-05, + "loss": 0.513, + "step": 18744 + }, + { + "epoch": 23.9936, + "grad_norm": 1.1160780191421509, + "learning_rate": 1.25250100040016e-05, + "loss": 0.4815, + "step": 18745 + }, + { + "epoch": 23.99488, + "grad_norm": 1.2114685773849487, + "learning_rate": 1.2523009203681474e-05, + "loss": 0.5117, + "step": 18746 + }, + { + "epoch": 23.99616, + "grad_norm": 1.226145625114441, + "learning_rate": 1.2521008403361344e-05, + "loss": 0.5135, + "step": 18747 + }, + { + "epoch": 23.99744, + "grad_norm": 1.1571670770645142, + "learning_rate": 1.2519007603041216e-05, + "loss": 0.4824, + "step": 18748 + }, + { + "epoch": 23.99872, + "grad_norm": 1.0984230041503906, + "learning_rate": 1.251700680272109e-05, 
+ "loss": 0.4727, + "step": 18749 + }, + { + "epoch": 24.0, + "grad_norm": 2.419872283935547, + "learning_rate": 1.2515006002400962e-05, + "loss": 0.8476, + "step": 18750 + }, + { + "epoch": 24.00128, + "grad_norm": 1.1103284358978271, + "learning_rate": 1.2513005202080832e-05, + "loss": 0.5034, + "step": 18751 + }, + { + "epoch": 24.00256, + "grad_norm": 1.077684760093689, + "learning_rate": 1.2511004401760704e-05, + "loss": 0.4662, + "step": 18752 + }, + { + "epoch": 24.00384, + "grad_norm": 1.0543673038482666, + "learning_rate": 1.2509003601440577e-05, + "loss": 0.4556, + "step": 18753 + }, + { + "epoch": 24.00512, + "grad_norm": 1.18900728225708, + "learning_rate": 1.2507002801120449e-05, + "loss": 0.5031, + "step": 18754 + }, + { + "epoch": 24.0064, + "grad_norm": 1.0953742265701294, + "learning_rate": 1.250500200080032e-05, + "loss": 0.4672, + "step": 18755 + }, + { + "epoch": 24.00768, + "grad_norm": 1.1897867918014526, + "learning_rate": 1.2503001200480195e-05, + "loss": 0.4988, + "step": 18756 + }, + { + "epoch": 24.00896, + "grad_norm": 1.134912371635437, + "learning_rate": 1.2501000400160065e-05, + "loss": 0.5086, + "step": 18757 + }, + { + "epoch": 24.01024, + "grad_norm": 1.1343998908996582, + "learning_rate": 1.2498999599839937e-05, + "loss": 0.4737, + "step": 18758 + }, + { + "epoch": 24.01152, + "grad_norm": 1.0810625553131104, + "learning_rate": 1.2496998799519808e-05, + "loss": 0.4678, + "step": 18759 + }, + { + "epoch": 24.0128, + "grad_norm": 1.1055142879486084, + "learning_rate": 1.249499799919968e-05, + "loss": 0.459, + "step": 18760 + }, + { + "epoch": 24.01408, + "grad_norm": 1.1549721956253052, + "learning_rate": 1.2492997198879552e-05, + "loss": 0.4943, + "step": 18761 + }, + { + "epoch": 24.01536, + "grad_norm": 1.121549129486084, + "learning_rate": 1.2490996398559426e-05, + "loss": 0.4437, + "step": 18762 + }, + { + "epoch": 24.01664, + "grad_norm": 1.0666358470916748, + "learning_rate": 1.2488995598239296e-05, + "loss": 0.4201, + 
"step": 18763 + }, + { + "epoch": 24.01792, + "grad_norm": 1.134900689125061, + "learning_rate": 1.248699479791917e-05, + "loss": 0.4853, + "step": 18764 + }, + { + "epoch": 24.0192, + "grad_norm": 1.1466809511184692, + "learning_rate": 1.248499399759904e-05, + "loss": 0.4803, + "step": 18765 + }, + { + "epoch": 24.02048, + "grad_norm": 1.1177642345428467, + "learning_rate": 1.2482993197278913e-05, + "loss": 0.4439, + "step": 18766 + }, + { + "epoch": 24.02176, + "grad_norm": 1.1064014434814453, + "learning_rate": 1.2480992396958783e-05, + "loss": 0.4562, + "step": 18767 + }, + { + "epoch": 24.02304, + "grad_norm": 1.103027105331421, + "learning_rate": 1.2478991596638657e-05, + "loss": 0.4495, + "step": 18768 + }, + { + "epoch": 24.02432, + "grad_norm": 1.165516972541809, + "learning_rate": 1.2476990796318529e-05, + "loss": 0.4858, + "step": 18769 + }, + { + "epoch": 24.0256, + "grad_norm": 1.037825107574463, + "learning_rate": 1.24749899959984e-05, + "loss": 0.4153, + "step": 18770 + }, + { + "epoch": 24.02688, + "grad_norm": 1.0688804388046265, + "learning_rate": 1.2472989195678272e-05, + "loss": 0.4368, + "step": 18771 + }, + { + "epoch": 24.02816, + "grad_norm": 1.1697540283203125, + "learning_rate": 1.2470988395358144e-05, + "loss": 0.4791, + "step": 18772 + }, + { + "epoch": 24.02944, + "grad_norm": 1.1564234495162964, + "learning_rate": 1.2468987595038016e-05, + "loss": 0.4714, + "step": 18773 + }, + { + "epoch": 24.03072, + "grad_norm": 1.1580915451049805, + "learning_rate": 1.2466986794717888e-05, + "loss": 0.4786, + "step": 18774 + }, + { + "epoch": 24.032, + "grad_norm": 1.172911286354065, + "learning_rate": 1.246498599439776e-05, + "loss": 0.4536, + "step": 18775 + }, + { + "epoch": 24.03328, + "grad_norm": 1.1058646440505981, + "learning_rate": 1.2462985194077632e-05, + "loss": 0.436, + "step": 18776 + }, + { + "epoch": 24.03456, + "grad_norm": 1.1083683967590332, + "learning_rate": 1.2460984393757503e-05, + "loss": 0.4947, + "step": 18777 + }, + { + 
"epoch": 24.03584, + "grad_norm": 1.1440143585205078, + "learning_rate": 1.2458983593437375e-05, + "loss": 0.4808, + "step": 18778 + }, + { + "epoch": 24.03712, + "grad_norm": 1.11277174949646, + "learning_rate": 1.2456982793117247e-05, + "loss": 0.4342, + "step": 18779 + }, + { + "epoch": 24.0384, + "grad_norm": 1.054457426071167, + "learning_rate": 1.2454981992797119e-05, + "loss": 0.4136, + "step": 18780 + }, + { + "epoch": 24.03968, + "grad_norm": 1.1549383401870728, + "learning_rate": 1.2452981192476991e-05, + "loss": 0.4787, + "step": 18781 + }, + { + "epoch": 24.04096, + "grad_norm": 1.158250093460083, + "learning_rate": 1.2450980392156863e-05, + "loss": 0.4401, + "step": 18782 + }, + { + "epoch": 24.04224, + "grad_norm": 1.170989990234375, + "learning_rate": 1.2448979591836735e-05, + "loss": 0.4973, + "step": 18783 + }, + { + "epoch": 24.04352, + "grad_norm": 1.1259979009628296, + "learning_rate": 1.2446978791516606e-05, + "loss": 0.4472, + "step": 18784 + }, + { + "epoch": 24.0448, + "grad_norm": 1.147057056427002, + "learning_rate": 1.244497799119648e-05, + "loss": 0.5124, + "step": 18785 + }, + { + "epoch": 24.04608, + "grad_norm": 1.2855243682861328, + "learning_rate": 1.244297719087635e-05, + "loss": 0.543, + "step": 18786 + }, + { + "epoch": 24.04736, + "grad_norm": 1.1440112590789795, + "learning_rate": 1.2440976390556224e-05, + "loss": 0.4572, + "step": 18787 + }, + { + "epoch": 24.04864, + "grad_norm": 1.1109403371810913, + "learning_rate": 1.2438975590236094e-05, + "loss": 0.4864, + "step": 18788 + }, + { + "epoch": 24.04992, + "grad_norm": 1.129292368888855, + "learning_rate": 1.2436974789915967e-05, + "loss": 0.4461, + "step": 18789 + }, + { + "epoch": 24.0512, + "grad_norm": 1.091012716293335, + "learning_rate": 1.2434973989595838e-05, + "loss": 0.4387, + "step": 18790 + }, + { + "epoch": 24.05248, + "grad_norm": 1.1717545986175537, + "learning_rate": 1.2432973189275711e-05, + "loss": 0.461, + "step": 18791 + }, + { + "epoch": 24.05376, + 
"grad_norm": 1.1226803064346313, + "learning_rate": 1.2430972388955583e-05, + "loss": 0.4271, + "step": 18792 + }, + { + "epoch": 24.05504, + "grad_norm": 1.1514590978622437, + "learning_rate": 1.2428971588635455e-05, + "loss": 0.4739, + "step": 18793 + }, + { + "epoch": 24.05632, + "grad_norm": 1.157591700553894, + "learning_rate": 1.2426970788315327e-05, + "loss": 0.4825, + "step": 18794 + }, + { + "epoch": 24.0576, + "grad_norm": 1.17417311668396, + "learning_rate": 1.2424969987995199e-05, + "loss": 0.5034, + "step": 18795 + }, + { + "epoch": 24.05888, + "grad_norm": 1.0604926347732544, + "learning_rate": 1.242296918767507e-05, + "loss": 0.451, + "step": 18796 + }, + { + "epoch": 24.06016, + "grad_norm": 1.0973352193832397, + "learning_rate": 1.2420968387354942e-05, + "loss": 0.4632, + "step": 18797 + }, + { + "epoch": 24.06144, + "grad_norm": 1.1098352670669556, + "learning_rate": 1.2418967587034814e-05, + "loss": 0.4198, + "step": 18798 + }, + { + "epoch": 24.06272, + "grad_norm": 1.117893934249878, + "learning_rate": 1.2416966786714688e-05, + "loss": 0.4585, + "step": 18799 + }, + { + "epoch": 24.064, + "grad_norm": 1.1006295680999756, + "learning_rate": 1.2414965986394558e-05, + "loss": 0.474, + "step": 18800 + }, + { + "epoch": 24.06528, + "grad_norm": 1.1561734676361084, + "learning_rate": 1.2412965186074431e-05, + "loss": 0.4768, + "step": 18801 + }, + { + "epoch": 24.06656, + "grad_norm": 1.1429786682128906, + "learning_rate": 1.2410964385754302e-05, + "loss": 0.4457, + "step": 18802 + }, + { + "epoch": 24.06784, + "grad_norm": 1.1339454650878906, + "learning_rate": 1.2408963585434175e-05, + "loss": 0.4879, + "step": 18803 + }, + { + "epoch": 24.06912, + "grad_norm": 1.198805332183838, + "learning_rate": 1.2406962785114045e-05, + "loss": 0.5238, + "step": 18804 + }, + { + "epoch": 24.0704, + "grad_norm": 1.101431131362915, + "learning_rate": 1.2404961984793919e-05, + "loss": 0.4727, + "step": 18805 + }, + { + "epoch": 24.07168, + "grad_norm": 
1.0786443948745728, + "learning_rate": 1.2402961184473789e-05, + "loss": 0.4213, + "step": 18806 + }, + { + "epoch": 24.07296, + "grad_norm": 1.1256358623504639, + "learning_rate": 1.2400960384153663e-05, + "loss": 0.4796, + "step": 18807 + }, + { + "epoch": 24.07424, + "grad_norm": 1.1662980318069458, + "learning_rate": 1.2398959583833534e-05, + "loss": 0.4701, + "step": 18808 + }, + { + "epoch": 24.07552, + "grad_norm": 1.2283040285110474, + "learning_rate": 1.2396958783513406e-05, + "loss": 0.5541, + "step": 18809 + }, + { + "epoch": 24.0768, + "grad_norm": 1.189424753189087, + "learning_rate": 1.2394957983193278e-05, + "loss": 0.4847, + "step": 18810 + }, + { + "epoch": 24.07808, + "grad_norm": 1.106939673423767, + "learning_rate": 1.239295718287315e-05, + "loss": 0.4491, + "step": 18811 + }, + { + "epoch": 24.07936, + "grad_norm": 1.059758186340332, + "learning_rate": 1.2390956382553022e-05, + "loss": 0.4485, + "step": 18812 + }, + { + "epoch": 24.08064, + "grad_norm": 1.2013357877731323, + "learning_rate": 1.2388955582232894e-05, + "loss": 0.502, + "step": 18813 + }, + { + "epoch": 24.08192, + "grad_norm": 1.1451191902160645, + "learning_rate": 1.2386954781912766e-05, + "loss": 0.4661, + "step": 18814 + }, + { + "epoch": 24.0832, + "grad_norm": 1.110666275024414, + "learning_rate": 1.2384953981592637e-05, + "loss": 0.453, + "step": 18815 + }, + { + "epoch": 24.08448, + "grad_norm": 1.1258022785186768, + "learning_rate": 1.238295318127251e-05, + "loss": 0.4725, + "step": 18816 + }, + { + "epoch": 24.08576, + "grad_norm": 1.1745010614395142, + "learning_rate": 1.2380952380952381e-05, + "loss": 0.4724, + "step": 18817 + }, + { + "epoch": 24.087040000000002, + "grad_norm": 1.1408393383026123, + "learning_rate": 1.2378951580632253e-05, + "loss": 0.485, + "step": 18818 + }, + { + "epoch": 24.08832, + "grad_norm": 1.2080360651016235, + "learning_rate": 1.2376950780312125e-05, + "loss": 0.5133, + "step": 18819 + }, + { + "epoch": 24.0896, + "grad_norm": 
1.2680752277374268, + "learning_rate": 1.2374949979991997e-05, + "loss": 0.4442, + "step": 18820 + }, + { + "epoch": 24.09088, + "grad_norm": 1.0254223346710205, + "learning_rate": 1.2372949179671869e-05, + "loss": 0.4307, + "step": 18821 + }, + { + "epoch": 24.09216, + "grad_norm": 1.1265034675598145, + "learning_rate": 1.2370948379351742e-05, + "loss": 0.468, + "step": 18822 + }, + { + "epoch": 24.09344, + "grad_norm": 1.1478646993637085, + "learning_rate": 1.2368947579031612e-05, + "loss": 0.456, + "step": 18823 + }, + { + "epoch": 24.09472, + "grad_norm": 1.2165275812149048, + "learning_rate": 1.2366946778711486e-05, + "loss": 0.4712, + "step": 18824 + }, + { + "epoch": 24.096, + "grad_norm": 1.1693812608718872, + "learning_rate": 1.2364945978391356e-05, + "loss": 0.4956, + "step": 18825 + }, + { + "epoch": 24.09728, + "grad_norm": 1.1482276916503906, + "learning_rate": 1.236294517807123e-05, + "loss": 0.4687, + "step": 18826 + }, + { + "epoch": 24.09856, + "grad_norm": 1.1738667488098145, + "learning_rate": 1.23609443777511e-05, + "loss": 0.4096, + "step": 18827 + }, + { + "epoch": 24.09984, + "grad_norm": 1.1686372756958008, + "learning_rate": 1.2358943577430973e-05, + "loss": 0.4708, + "step": 18828 + }, + { + "epoch": 24.10112, + "grad_norm": 1.1600852012634277, + "learning_rate": 1.2356942777110843e-05, + "loss": 0.5038, + "step": 18829 + }, + { + "epoch": 24.1024, + "grad_norm": 1.1481022834777832, + "learning_rate": 1.2354941976790717e-05, + "loss": 0.4626, + "step": 18830 + }, + { + "epoch": 24.10368, + "grad_norm": 1.158562421798706, + "learning_rate": 1.2352941176470589e-05, + "loss": 0.4819, + "step": 18831 + }, + { + "epoch": 24.10496, + "grad_norm": 1.2289602756500244, + "learning_rate": 1.235094037615046e-05, + "loss": 0.4865, + "step": 18832 + }, + { + "epoch": 24.10624, + "grad_norm": 1.2404357194900513, + "learning_rate": 1.2348939575830333e-05, + "loss": 0.4778, + "step": 18833 + }, + { + "epoch": 24.10752, + "grad_norm": 1.0343823432922363, + 
"learning_rate": 1.2346938775510204e-05, + "loss": 0.4364, + "step": 18834 + }, + { + "epoch": 24.1088, + "grad_norm": 1.1348611116409302, + "learning_rate": 1.2344937975190076e-05, + "loss": 0.4825, + "step": 18835 + }, + { + "epoch": 24.11008, + "grad_norm": 1.1395764350891113, + "learning_rate": 1.2342937174869948e-05, + "loss": 0.4656, + "step": 18836 + }, + { + "epoch": 24.11136, + "grad_norm": 1.1095606088638306, + "learning_rate": 1.234093637454982e-05, + "loss": 0.4568, + "step": 18837 + }, + { + "epoch": 24.11264, + "grad_norm": 1.184478521347046, + "learning_rate": 1.2338935574229694e-05, + "loss": 0.4887, + "step": 18838 + }, + { + "epoch": 24.11392, + "grad_norm": 1.1172800064086914, + "learning_rate": 1.2336934773909564e-05, + "loss": 0.4558, + "step": 18839 + }, + { + "epoch": 24.1152, + "grad_norm": 1.2466896772384644, + "learning_rate": 1.2334933973589437e-05, + "loss": 0.5224, + "step": 18840 + }, + { + "epoch": 24.11648, + "grad_norm": 1.1181126832962036, + "learning_rate": 1.2332933173269307e-05, + "loss": 0.4622, + "step": 18841 + }, + { + "epoch": 24.11776, + "grad_norm": 1.1256959438323975, + "learning_rate": 1.2330932372949181e-05, + "loss": 0.4646, + "step": 18842 + }, + { + "epoch": 24.11904, + "grad_norm": 1.1790109872817993, + "learning_rate": 1.2328931572629051e-05, + "loss": 0.4708, + "step": 18843 + }, + { + "epoch": 24.12032, + "grad_norm": 1.1937501430511475, + "learning_rate": 1.2326930772308925e-05, + "loss": 0.4791, + "step": 18844 + }, + { + "epoch": 24.1216, + "grad_norm": 1.2300196886062622, + "learning_rate": 1.2324929971988797e-05, + "loss": 0.4995, + "step": 18845 + }, + { + "epoch": 24.12288, + "grad_norm": 1.1216380596160889, + "learning_rate": 1.2322929171668668e-05, + "loss": 0.4832, + "step": 18846 + }, + { + "epoch": 24.12416, + "grad_norm": 1.1680253744125366, + "learning_rate": 1.232092837134854e-05, + "loss": 0.4863, + "step": 18847 + }, + { + "epoch": 24.12544, + "grad_norm": 1.191275715827942, + "learning_rate": 
1.2318927571028412e-05, + "loss": 0.4561, + "step": 18848 + }, + { + "epoch": 24.12672, + "grad_norm": 1.1473134756088257, + "learning_rate": 1.2316926770708284e-05, + "loss": 0.466, + "step": 18849 + }, + { + "epoch": 24.128, + "grad_norm": 1.181347370147705, + "learning_rate": 1.2314925970388156e-05, + "loss": 0.478, + "step": 18850 + }, + { + "epoch": 24.12928, + "grad_norm": 1.1565207242965698, + "learning_rate": 1.2312925170068028e-05, + "loss": 0.5205, + "step": 18851 + }, + { + "epoch": 24.13056, + "grad_norm": 1.136231541633606, + "learning_rate": 1.2310924369747901e-05, + "loss": 0.4984, + "step": 18852 + }, + { + "epoch": 24.13184, + "grad_norm": 1.1483358144760132, + "learning_rate": 1.2308923569427771e-05, + "loss": 0.4857, + "step": 18853 + }, + { + "epoch": 24.13312, + "grad_norm": 1.1635894775390625, + "learning_rate": 1.2306922769107645e-05, + "loss": 0.4728, + "step": 18854 + }, + { + "epoch": 24.1344, + "grad_norm": 1.1853755712509155, + "learning_rate": 1.2304921968787515e-05, + "loss": 0.4908, + "step": 18855 + }, + { + "epoch": 24.13568, + "grad_norm": 1.1830283403396606, + "learning_rate": 1.2302921168467389e-05, + "loss": 0.4935, + "step": 18856 + }, + { + "epoch": 24.13696, + "grad_norm": 1.1618894338607788, + "learning_rate": 1.2300920368147259e-05, + "loss": 0.4388, + "step": 18857 + }, + { + "epoch": 24.13824, + "grad_norm": 1.1691558361053467, + "learning_rate": 1.2298919567827132e-05, + "loss": 0.459, + "step": 18858 + }, + { + "epoch": 24.13952, + "grad_norm": 1.1906380653381348, + "learning_rate": 1.2296918767507003e-05, + "loss": 0.4695, + "step": 18859 + }, + { + "epoch": 24.1408, + "grad_norm": 1.0874199867248535, + "learning_rate": 1.2294917967186876e-05, + "loss": 0.4549, + "step": 18860 + }, + { + "epoch": 24.14208, + "grad_norm": 1.1504427194595337, + "learning_rate": 1.2292917166866748e-05, + "loss": 0.4616, + "step": 18861 + }, + { + "epoch": 24.14336, + "grad_norm": 1.1930842399597168, + "learning_rate": 
1.229091636654662e-05, + "loss": 0.5141, + "step": 18862 + }, + { + "epoch": 24.14464, + "grad_norm": 1.1488827466964722, + "learning_rate": 1.2288915566226492e-05, + "loss": 0.4666, + "step": 18863 + }, + { + "epoch": 24.14592, + "grad_norm": 1.1404608488082886, + "learning_rate": 1.2286914765906364e-05, + "loss": 0.4479, + "step": 18864 + }, + { + "epoch": 24.1472, + "grad_norm": 1.1836938858032227, + "learning_rate": 1.2284913965586235e-05, + "loss": 0.5085, + "step": 18865 + }, + { + "epoch": 24.14848, + "grad_norm": 1.1096590757369995, + "learning_rate": 1.2282913165266107e-05, + "loss": 0.4589, + "step": 18866 + }, + { + "epoch": 24.14976, + "grad_norm": 1.1787697076797485, + "learning_rate": 1.2280912364945979e-05, + "loss": 0.4746, + "step": 18867 + }, + { + "epoch": 24.15104, + "grad_norm": 1.0832207202911377, + "learning_rate": 1.2278911564625851e-05, + "loss": 0.4693, + "step": 18868 + }, + { + "epoch": 24.15232, + "grad_norm": 1.1375176906585693, + "learning_rate": 1.2276910764305723e-05, + "loss": 0.4311, + "step": 18869 + }, + { + "epoch": 24.1536, + "grad_norm": 1.1903811693191528, + "learning_rate": 1.2274909963985595e-05, + "loss": 0.4851, + "step": 18870 + }, + { + "epoch": 24.15488, + "grad_norm": 1.169784665107727, + "learning_rate": 1.2272909163665467e-05, + "loss": 0.495, + "step": 18871 + }, + { + "epoch": 24.15616, + "grad_norm": 1.154515027999878, + "learning_rate": 1.2270908363345338e-05, + "loss": 0.4999, + "step": 18872 + }, + { + "epoch": 24.15744, + "grad_norm": 1.141190528869629, + "learning_rate": 1.226890756302521e-05, + "loss": 0.4566, + "step": 18873 + }, + { + "epoch": 24.15872, + "grad_norm": 1.1099512577056885, + "learning_rate": 1.2266906762705082e-05, + "loss": 0.3997, + "step": 18874 + }, + { + "epoch": 24.16, + "grad_norm": 1.129675030708313, + "learning_rate": 1.2264905962384956e-05, + "loss": 0.4474, + "step": 18875 + }, + { + "epoch": 24.16128, + "grad_norm": 1.2040431499481201, + "learning_rate": 1.2262905162064826e-05, 
+ "loss": 0.4952, + "step": 18876 + }, + { + "epoch": 24.16256, + "grad_norm": 1.1369742155075073, + "learning_rate": 1.22609043617447e-05, + "loss": 0.4785, + "step": 18877 + }, + { + "epoch": 24.16384, + "grad_norm": 1.1882468461990356, + "learning_rate": 1.225890356142457e-05, + "loss": 0.4668, + "step": 18878 + }, + { + "epoch": 24.16512, + "grad_norm": 1.1586542129516602, + "learning_rate": 1.2256902761104443e-05, + "loss": 0.4616, + "step": 18879 + }, + { + "epoch": 24.1664, + "grad_norm": 1.1391338109970093, + "learning_rate": 1.2254901960784313e-05, + "loss": 0.4558, + "step": 18880 + }, + { + "epoch": 24.16768, + "grad_norm": 1.2286574840545654, + "learning_rate": 1.2252901160464187e-05, + "loss": 0.5128, + "step": 18881 + }, + { + "epoch": 24.16896, + "grad_norm": 1.1499093770980835, + "learning_rate": 1.2250900360144059e-05, + "loss": 0.4686, + "step": 18882 + }, + { + "epoch": 24.17024, + "grad_norm": 1.1170378923416138, + "learning_rate": 1.224889955982393e-05, + "loss": 0.5008, + "step": 18883 + }, + { + "epoch": 24.17152, + "grad_norm": 1.1456178426742554, + "learning_rate": 1.2246898759503802e-05, + "loss": 0.4888, + "step": 18884 + }, + { + "epoch": 24.1728, + "grad_norm": 1.1081149578094482, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.4932, + "step": 18885 + }, + { + "epoch": 24.17408, + "grad_norm": 1.250626564025879, + "learning_rate": 1.2242897158863546e-05, + "loss": 0.4693, + "step": 18886 + }, + { + "epoch": 24.17536, + "grad_norm": 1.2053821086883545, + "learning_rate": 1.2240896358543418e-05, + "loss": 0.5102, + "step": 18887 + }, + { + "epoch": 24.17664, + "grad_norm": 1.1828999519348145, + "learning_rate": 1.223889555822329e-05, + "loss": 0.4694, + "step": 18888 + }, + { + "epoch": 24.17792, + "grad_norm": 1.1308258771896362, + "learning_rate": 1.2236894757903162e-05, + "loss": 0.4473, + "step": 18889 + }, + { + "epoch": 24.1792, + "grad_norm": 1.1865490674972534, + "learning_rate": 1.2234893957583033e-05, + "loss": 0.5123, + 
"step": 18890 + }, + { + "epoch": 24.18048, + "grad_norm": 1.1763098239898682, + "learning_rate": 1.2232893157262907e-05, + "loss": 0.5011, + "step": 18891 + }, + { + "epoch": 24.18176, + "grad_norm": 1.1184165477752686, + "learning_rate": 1.2230892356942777e-05, + "loss": 0.4935, + "step": 18892 + }, + { + "epoch": 24.18304, + "grad_norm": 1.2041736841201782, + "learning_rate": 1.222889155662265e-05, + "loss": 0.5107, + "step": 18893 + }, + { + "epoch": 24.18432, + "grad_norm": 1.1142528057098389, + "learning_rate": 1.2226890756302521e-05, + "loss": 0.4647, + "step": 18894 + }, + { + "epoch": 24.1856, + "grad_norm": 1.1789337396621704, + "learning_rate": 1.2224889955982394e-05, + "loss": 0.4476, + "step": 18895 + }, + { + "epoch": 24.18688, + "grad_norm": 1.0955803394317627, + "learning_rate": 1.2222889155662265e-05, + "loss": 0.4594, + "step": 18896 + }, + { + "epoch": 24.18816, + "grad_norm": 1.1010346412658691, + "learning_rate": 1.2220888355342138e-05, + "loss": 0.4668, + "step": 18897 + }, + { + "epoch": 24.18944, + "grad_norm": 1.0665899515151978, + "learning_rate": 1.221888755502201e-05, + "loss": 0.4563, + "step": 18898 + }, + { + "epoch": 24.19072, + "grad_norm": 1.1239908933639526, + "learning_rate": 1.2216886754701882e-05, + "loss": 0.4785, + "step": 18899 + }, + { + "epoch": 24.192, + "grad_norm": 1.1581279039382935, + "learning_rate": 1.2214885954381754e-05, + "loss": 0.501, + "step": 18900 + }, + { + "epoch": 24.19328, + "grad_norm": 1.1129977703094482, + "learning_rate": 1.2212885154061626e-05, + "loss": 0.4571, + "step": 18901 + }, + { + "epoch": 24.19456, + "grad_norm": 1.1460938453674316, + "learning_rate": 1.2210884353741497e-05, + "loss": 0.4706, + "step": 18902 + }, + { + "epoch": 24.19584, + "grad_norm": 1.2374320030212402, + "learning_rate": 1.220888355342137e-05, + "loss": 0.5082, + "step": 18903 + }, + { + "epoch": 24.19712, + "grad_norm": 1.1086463928222656, + "learning_rate": 1.2206882753101241e-05, + "loss": 0.4465, + "step": 18904 + }, 
+ { + "epoch": 24.1984, + "grad_norm": 1.147619605064392, + "learning_rate": 1.2204881952781113e-05, + "loss": 0.4718, + "step": 18905 + }, + { + "epoch": 24.19968, + "grad_norm": 1.117663025856018, + "learning_rate": 1.2202881152460985e-05, + "loss": 0.5058, + "step": 18906 + }, + { + "epoch": 24.20096, + "grad_norm": 1.1890056133270264, + "learning_rate": 1.2200880352140857e-05, + "loss": 0.4962, + "step": 18907 + }, + { + "epoch": 24.20224, + "grad_norm": 1.0809297561645508, + "learning_rate": 1.2198879551820729e-05, + "loss": 0.4143, + "step": 18908 + }, + { + "epoch": 24.20352, + "grad_norm": 1.146008014678955, + "learning_rate": 1.21968787515006e-05, + "loss": 0.4672, + "step": 18909 + }, + { + "epoch": 24.2048, + "grad_norm": 1.1988301277160645, + "learning_rate": 1.2194877951180472e-05, + "loss": 0.4645, + "step": 18910 + }, + { + "epoch": 24.20608, + "grad_norm": 1.2274937629699707, + "learning_rate": 1.2192877150860344e-05, + "loss": 0.5402, + "step": 18911 + }, + { + "epoch": 24.20736, + "grad_norm": 1.1610702276229858, + "learning_rate": 1.2190876350540216e-05, + "loss": 0.5225, + "step": 18912 + }, + { + "epoch": 24.20864, + "grad_norm": 1.2057294845581055, + "learning_rate": 1.2188875550220088e-05, + "loss": 0.4622, + "step": 18913 + }, + { + "epoch": 24.20992, + "grad_norm": 1.1343713998794556, + "learning_rate": 1.2186874749899961e-05, + "loss": 0.4997, + "step": 18914 + }, + { + "epoch": 24.2112, + "grad_norm": 1.0980840921401978, + "learning_rate": 1.2184873949579832e-05, + "loss": 0.4442, + "step": 18915 + }, + { + "epoch": 24.21248, + "grad_norm": 1.099935531616211, + "learning_rate": 1.2182873149259705e-05, + "loss": 0.4833, + "step": 18916 + }, + { + "epoch": 24.21376, + "grad_norm": 1.2026798725128174, + "learning_rate": 1.2180872348939575e-05, + "loss": 0.4885, + "step": 18917 + }, + { + "epoch": 24.21504, + "grad_norm": 1.138269066810608, + "learning_rate": 1.2178871548619449e-05, + "loss": 0.4539, + "step": 18918 + }, + { + "epoch": 
24.21632, + "grad_norm": 1.233717441558838, + "learning_rate": 1.2176870748299319e-05, + "loss": 0.5163, + "step": 18919 + }, + { + "epoch": 24.2176, + "grad_norm": 1.2078992128372192, + "learning_rate": 1.2174869947979193e-05, + "loss": 0.4496, + "step": 18920 + }, + { + "epoch": 24.21888, + "grad_norm": 1.226100206375122, + "learning_rate": 1.2172869147659064e-05, + "loss": 0.511, + "step": 18921 + }, + { + "epoch": 24.22016, + "grad_norm": 1.183384895324707, + "learning_rate": 1.2170868347338936e-05, + "loss": 0.4633, + "step": 18922 + }, + { + "epoch": 24.22144, + "grad_norm": 1.2039517164230347, + "learning_rate": 1.2168867547018808e-05, + "loss": 0.5063, + "step": 18923 + }, + { + "epoch": 24.22272, + "grad_norm": 1.2347691059112549, + "learning_rate": 1.216686674669868e-05, + "loss": 0.4843, + "step": 18924 + }, + { + "epoch": 24.224, + "grad_norm": 1.2100071907043457, + "learning_rate": 1.2164865946378552e-05, + "loss": 0.4907, + "step": 18925 + }, + { + "epoch": 24.22528, + "grad_norm": 1.1934117078781128, + "learning_rate": 1.2162865146058424e-05, + "loss": 0.4824, + "step": 18926 + }, + { + "epoch": 24.22656, + "grad_norm": 1.1818937063217163, + "learning_rate": 1.2160864345738296e-05, + "loss": 0.4746, + "step": 18927 + }, + { + "epoch": 24.22784, + "grad_norm": 1.0990982055664062, + "learning_rate": 1.2158863545418169e-05, + "loss": 0.4461, + "step": 18928 + }, + { + "epoch": 24.22912, + "grad_norm": 1.1781893968582153, + "learning_rate": 1.215686274509804e-05, + "loss": 0.5018, + "step": 18929 + }, + { + "epoch": 24.2304, + "grad_norm": 1.147173523902893, + "learning_rate": 1.2154861944777913e-05, + "loss": 0.4664, + "step": 18930 + }, + { + "epoch": 24.23168, + "grad_norm": 1.0861092805862427, + "learning_rate": 1.2152861144457783e-05, + "loss": 0.4398, + "step": 18931 + }, + { + "epoch": 24.23296, + "grad_norm": 1.103131651878357, + "learning_rate": 1.2150860344137657e-05, + "loss": 0.4301, + "step": 18932 + }, + { + "epoch": 24.23424, + 
"grad_norm": 1.0709394216537476, + "learning_rate": 1.2148859543817527e-05, + "loss": 0.4424, + "step": 18933 + }, + { + "epoch": 24.23552, + "grad_norm": 1.1708987951278687, + "learning_rate": 1.21468587434974e-05, + "loss": 0.4815, + "step": 18934 + }, + { + "epoch": 24.2368, + "grad_norm": 1.1951818466186523, + "learning_rate": 1.2144857943177272e-05, + "loss": 0.4715, + "step": 18935 + }, + { + "epoch": 24.23808, + "grad_norm": 1.169890284538269, + "learning_rate": 1.2142857142857144e-05, + "loss": 0.4762, + "step": 18936 + }, + { + "epoch": 24.23936, + "grad_norm": 1.164069652557373, + "learning_rate": 1.2140856342537016e-05, + "loss": 0.4739, + "step": 18937 + }, + { + "epoch": 24.24064, + "grad_norm": 1.2312345504760742, + "learning_rate": 1.2138855542216888e-05, + "loss": 0.5088, + "step": 18938 + }, + { + "epoch": 24.24192, + "grad_norm": 1.0831817388534546, + "learning_rate": 1.213685474189676e-05, + "loss": 0.4128, + "step": 18939 + }, + { + "epoch": 24.2432, + "grad_norm": 1.1156847476959229, + "learning_rate": 1.2134853941576631e-05, + "loss": 0.4483, + "step": 18940 + }, + { + "epoch": 24.24448, + "grad_norm": 1.2278788089752197, + "learning_rate": 1.2132853141256503e-05, + "loss": 0.5139, + "step": 18941 + }, + { + "epoch": 24.24576, + "grad_norm": 1.1557207107543945, + "learning_rate": 1.2130852340936375e-05, + "loss": 0.5051, + "step": 18942 + }, + { + "epoch": 24.24704, + "grad_norm": 1.1289258003234863, + "learning_rate": 1.2128851540616247e-05, + "loss": 0.473, + "step": 18943 + }, + { + "epoch": 24.24832, + "grad_norm": 1.245375633239746, + "learning_rate": 1.2126850740296119e-05, + "loss": 0.4962, + "step": 18944 + }, + { + "epoch": 24.2496, + "grad_norm": 1.1361275911331177, + "learning_rate": 1.212484993997599e-05, + "loss": 0.4961, + "step": 18945 + }, + { + "epoch": 24.25088, + "grad_norm": 1.0928107500076294, + "learning_rate": 1.2122849139655863e-05, + "loss": 0.4831, + "step": 18946 + }, + { + "epoch": 24.25216, + "grad_norm": 
1.0881211757659912, + "learning_rate": 1.2120848339335734e-05, + "loss": 0.4388, + "step": 18947 + }, + { + "epoch": 24.25344, + "grad_norm": 1.1172457933425903, + "learning_rate": 1.2118847539015606e-05, + "loss": 0.4584, + "step": 18948 + }, + { + "epoch": 24.25472, + "grad_norm": 1.1269158124923706, + "learning_rate": 1.2116846738695478e-05, + "loss": 0.4362, + "step": 18949 + }, + { + "epoch": 24.256, + "grad_norm": 1.0757176876068115, + "learning_rate": 1.211484593837535e-05, + "loss": 0.4515, + "step": 18950 + }, + { + "epoch": 24.25728, + "grad_norm": 1.0964628458023071, + "learning_rate": 1.2112845138055224e-05, + "loss": 0.4624, + "step": 18951 + }, + { + "epoch": 24.25856, + "grad_norm": 1.1697242259979248, + "learning_rate": 1.2110844337735094e-05, + "loss": 0.4753, + "step": 18952 + }, + { + "epoch": 24.25984, + "grad_norm": 1.098563313484192, + "learning_rate": 1.2108843537414967e-05, + "loss": 0.4689, + "step": 18953 + }, + { + "epoch": 24.26112, + "grad_norm": 1.081687092781067, + "learning_rate": 1.2106842737094837e-05, + "loss": 0.467, + "step": 18954 + }, + { + "epoch": 24.2624, + "grad_norm": 1.130462884902954, + "learning_rate": 1.2104841936774711e-05, + "loss": 0.4772, + "step": 18955 + }, + { + "epoch": 24.26368, + "grad_norm": 1.1619678735733032, + "learning_rate": 1.2102841136454581e-05, + "loss": 0.4805, + "step": 18956 + }, + { + "epoch": 24.26496, + "grad_norm": 1.2183215618133545, + "learning_rate": 1.2100840336134455e-05, + "loss": 0.4988, + "step": 18957 + }, + { + "epoch": 24.26624, + "grad_norm": 1.1362768411636353, + "learning_rate": 1.2098839535814327e-05, + "loss": 0.4678, + "step": 18958 + }, + { + "epoch": 24.26752, + "grad_norm": 1.1488263607025146, + "learning_rate": 1.2096838735494198e-05, + "loss": 0.4232, + "step": 18959 + }, + { + "epoch": 24.2688, + "grad_norm": 1.1357872486114502, + "learning_rate": 1.209483793517407e-05, + "loss": 0.4901, + "step": 18960 + }, + { + "epoch": 24.27008, + "grad_norm": 1.1027874946594238, + 
"learning_rate": 1.2092837134853942e-05, + "loss": 0.4982, + "step": 18961 + }, + { + "epoch": 24.27136, + "grad_norm": 1.157282829284668, + "learning_rate": 1.2090836334533814e-05, + "loss": 0.4774, + "step": 18962 + }, + { + "epoch": 24.27264, + "grad_norm": 1.1591331958770752, + "learning_rate": 1.2088835534213686e-05, + "loss": 0.4684, + "step": 18963 + }, + { + "epoch": 24.27392, + "grad_norm": 1.0676262378692627, + "learning_rate": 1.2086834733893558e-05, + "loss": 0.4187, + "step": 18964 + }, + { + "epoch": 24.2752, + "grad_norm": 1.143548846244812, + "learning_rate": 1.2084833933573431e-05, + "loss": 0.4558, + "step": 18965 + }, + { + "epoch": 24.27648, + "grad_norm": 1.1962380409240723, + "learning_rate": 1.2082833133253301e-05, + "loss": 0.5233, + "step": 18966 + }, + { + "epoch": 24.27776, + "grad_norm": 1.1192182302474976, + "learning_rate": 1.2080832332933175e-05, + "loss": 0.4541, + "step": 18967 + }, + { + "epoch": 24.27904, + "grad_norm": 1.1944539546966553, + "learning_rate": 1.2078831532613045e-05, + "loss": 0.4669, + "step": 18968 + }, + { + "epoch": 24.28032, + "grad_norm": 1.0787324905395508, + "learning_rate": 1.2076830732292919e-05, + "loss": 0.4497, + "step": 18969 + }, + { + "epoch": 24.2816, + "grad_norm": 1.12637197971344, + "learning_rate": 1.2074829931972789e-05, + "loss": 0.4599, + "step": 18970 + }, + { + "epoch": 24.28288, + "grad_norm": 1.1529687643051147, + "learning_rate": 1.2072829131652662e-05, + "loss": 0.5066, + "step": 18971 + }, + { + "epoch": 24.28416, + "grad_norm": 1.202721357345581, + "learning_rate": 1.2070828331332533e-05, + "loss": 0.494, + "step": 18972 + }, + { + "epoch": 24.28544, + "grad_norm": 1.1301331520080566, + "learning_rate": 1.2068827531012406e-05, + "loss": 0.4652, + "step": 18973 + }, + { + "epoch": 24.28672, + "grad_norm": 1.128435492515564, + "learning_rate": 1.2066826730692278e-05, + "loss": 0.4296, + "step": 18974 + }, + { + "epoch": 24.288, + "grad_norm": 1.1200505495071411, + "learning_rate": 
1.206482593037215e-05, + "loss": 0.474, + "step": 18975 + }, + { + "epoch": 24.28928, + "grad_norm": 1.2111964225769043, + "learning_rate": 1.2062825130052022e-05, + "loss": 0.4765, + "step": 18976 + }, + { + "epoch": 24.29056, + "grad_norm": 1.1505967378616333, + "learning_rate": 1.2060824329731893e-05, + "loss": 0.4658, + "step": 18977 + }, + { + "epoch": 24.29184, + "grad_norm": 1.0870449542999268, + "learning_rate": 1.2058823529411765e-05, + "loss": 0.4378, + "step": 18978 + }, + { + "epoch": 24.29312, + "grad_norm": 1.1326916217803955, + "learning_rate": 1.2056822729091637e-05, + "loss": 0.4744, + "step": 18979 + }, + { + "epoch": 24.2944, + "grad_norm": 1.1708118915557861, + "learning_rate": 1.2054821928771509e-05, + "loss": 0.4676, + "step": 18980 + }, + { + "epoch": 24.29568, + "grad_norm": 1.1142549514770508, + "learning_rate": 1.2052821128451381e-05, + "loss": 0.4682, + "step": 18981 + }, + { + "epoch": 24.29696, + "grad_norm": 1.2015085220336914, + "learning_rate": 1.2050820328131253e-05, + "loss": 0.5163, + "step": 18982 + }, + { + "epoch": 24.29824, + "grad_norm": 1.0846655368804932, + "learning_rate": 1.2048819527811125e-05, + "loss": 0.411, + "step": 18983 + }, + { + "epoch": 24.29952, + "grad_norm": 1.1354156732559204, + "learning_rate": 1.2046818727490996e-05, + "loss": 0.4529, + "step": 18984 + }, + { + "epoch": 24.3008, + "grad_norm": 1.1821706295013428, + "learning_rate": 1.2044817927170868e-05, + "loss": 0.4717, + "step": 18985 + }, + { + "epoch": 24.30208, + "grad_norm": 1.1429351568222046, + "learning_rate": 1.204281712685074e-05, + "loss": 0.4679, + "step": 18986 + }, + { + "epoch": 24.30336, + "grad_norm": 1.1987007856369019, + "learning_rate": 1.2040816326530612e-05, + "loss": 0.479, + "step": 18987 + }, + { + "epoch": 24.30464, + "grad_norm": 1.1858272552490234, + "learning_rate": 1.2038815526210486e-05, + "loss": 0.4528, + "step": 18988 + }, + { + "epoch": 24.30592, + "grad_norm": 1.1735626459121704, + "learning_rate": 
1.2036814725890356e-05, + "loss": 0.4459, + "step": 18989 + }, + { + "epoch": 24.3072, + "grad_norm": 1.2804443836212158, + "learning_rate": 1.203481392557023e-05, + "loss": 0.5244, + "step": 18990 + }, + { + "epoch": 24.30848, + "grad_norm": 1.2020955085754395, + "learning_rate": 1.20328131252501e-05, + "loss": 0.4598, + "step": 18991 + }, + { + "epoch": 24.30976, + "grad_norm": 1.2442394495010376, + "learning_rate": 1.2030812324929973e-05, + "loss": 0.4843, + "step": 18992 + }, + { + "epoch": 24.31104, + "grad_norm": 1.137221097946167, + "learning_rate": 1.2028811524609843e-05, + "loss": 0.4527, + "step": 18993 + }, + { + "epoch": 24.31232, + "grad_norm": 1.1395703554153442, + "learning_rate": 1.2026810724289717e-05, + "loss": 0.5018, + "step": 18994 + }, + { + "epoch": 24.3136, + "grad_norm": 1.1961019039154053, + "learning_rate": 1.2024809923969587e-05, + "loss": 0.5364, + "step": 18995 + }, + { + "epoch": 24.31488, + "grad_norm": 1.1549445390701294, + "learning_rate": 1.202280912364946e-05, + "loss": 0.4855, + "step": 18996 + }, + { + "epoch": 24.31616, + "grad_norm": 1.1084400415420532, + "learning_rate": 1.2020808323329332e-05, + "loss": 0.4317, + "step": 18997 + }, + { + "epoch": 24.31744, + "grad_norm": 1.1791542768478394, + "learning_rate": 1.2018807523009204e-05, + "loss": 0.5045, + "step": 18998 + }, + { + "epoch": 24.31872, + "grad_norm": 1.1057560443878174, + "learning_rate": 1.2016806722689076e-05, + "loss": 0.4459, + "step": 18999 + }, + { + "epoch": 24.32, + "grad_norm": 1.1355136632919312, + "learning_rate": 1.2014805922368948e-05, + "loss": 0.4693, + "step": 19000 + }, + { + "epoch": 24.32128, + "grad_norm": 1.204648494720459, + "learning_rate": 1.201280512204882e-05, + "loss": 0.5096, + "step": 19001 + }, + { + "epoch": 24.32256, + "grad_norm": 1.1683499813079834, + "learning_rate": 1.2010804321728692e-05, + "loss": 0.4805, + "step": 19002 + }, + { + "epoch": 24.32384, + "grad_norm": 1.0788731575012207, + "learning_rate": 1.2008803521408563e-05, 
+ "loss": 0.4542, + "step": 19003 + }, + { + "epoch": 24.32512, + "grad_norm": 1.1736633777618408, + "learning_rate": 1.2006802721088437e-05, + "loss": 0.4771, + "step": 19004 + }, + { + "epoch": 24.3264, + "grad_norm": 1.1012755632400513, + "learning_rate": 1.2004801920768307e-05, + "loss": 0.4554, + "step": 19005 + }, + { + "epoch": 24.32768, + "grad_norm": 1.0851147174835205, + "learning_rate": 1.200280112044818e-05, + "loss": 0.4286, + "step": 19006 + }, + { + "epoch": 24.32896, + "grad_norm": 1.1383137702941895, + "learning_rate": 1.2000800320128051e-05, + "loss": 0.4664, + "step": 19007 + }, + { + "epoch": 24.33024, + "grad_norm": 1.1844596862792969, + "learning_rate": 1.1998799519807924e-05, + "loss": 0.5101, + "step": 19008 + }, + { + "epoch": 24.33152, + "grad_norm": 1.212876319885254, + "learning_rate": 1.1996798719487795e-05, + "loss": 0.524, + "step": 19009 + }, + { + "epoch": 24.3328, + "grad_norm": 1.1917188167572021, + "learning_rate": 1.1994797919167668e-05, + "loss": 0.5007, + "step": 19010 + }, + { + "epoch": 24.33408, + "grad_norm": 1.0979135036468506, + "learning_rate": 1.199279711884754e-05, + "loss": 0.4251, + "step": 19011 + }, + { + "epoch": 24.33536, + "grad_norm": 1.197851300239563, + "learning_rate": 1.1990796318527412e-05, + "loss": 0.4987, + "step": 19012 + }, + { + "epoch": 24.33664, + "grad_norm": 1.1661128997802734, + "learning_rate": 1.1988795518207284e-05, + "loss": 0.4864, + "step": 19013 + }, + { + "epoch": 24.33792, + "grad_norm": 1.1655899286270142, + "learning_rate": 1.1986794717887156e-05, + "loss": 0.4645, + "step": 19014 + }, + { + "epoch": 24.3392, + "grad_norm": 1.1260086297988892, + "learning_rate": 1.1984793917567027e-05, + "loss": 0.4571, + "step": 19015 + }, + { + "epoch": 24.34048, + "grad_norm": 1.1551868915557861, + "learning_rate": 1.19827931172469e-05, + "loss": 0.454, + "step": 19016 + }, + { + "epoch": 24.34176, + "grad_norm": 1.09392249584198, + "learning_rate": 1.1980792316926771e-05, + "loss": 0.4698, + 
"step": 19017 + }, + { + "epoch": 24.34304, + "grad_norm": 1.2250076532363892, + "learning_rate": 1.1978791516606645e-05, + "loss": 0.4851, + "step": 19018 + }, + { + "epoch": 24.34432, + "grad_norm": 1.1403043270111084, + "learning_rate": 1.1976790716286515e-05, + "loss": 0.4895, + "step": 19019 + }, + { + "epoch": 24.3456, + "grad_norm": 1.113724946975708, + "learning_rate": 1.1974789915966388e-05, + "loss": 0.4718, + "step": 19020 + }, + { + "epoch": 24.34688, + "grad_norm": 1.1426780223846436, + "learning_rate": 1.1972789115646259e-05, + "loss": 0.4889, + "step": 19021 + }, + { + "epoch": 24.34816, + "grad_norm": 1.1902352571487427, + "learning_rate": 1.1970788315326132e-05, + "loss": 0.4702, + "step": 19022 + }, + { + "epoch": 24.34944, + "grad_norm": 1.1023898124694824, + "learning_rate": 1.1968787515006002e-05, + "loss": 0.4402, + "step": 19023 + }, + { + "epoch": 24.35072, + "grad_norm": 1.1217379570007324, + "learning_rate": 1.1966786714685876e-05, + "loss": 0.453, + "step": 19024 + }, + { + "epoch": 24.352, + "grad_norm": 1.115411639213562, + "learning_rate": 1.1964785914365746e-05, + "loss": 0.4485, + "step": 19025 + }, + { + "epoch": 24.35328, + "grad_norm": 1.209621787071228, + "learning_rate": 1.196278511404562e-05, + "loss": 0.5006, + "step": 19026 + }, + { + "epoch": 24.35456, + "grad_norm": 1.1467214822769165, + "learning_rate": 1.1960784313725491e-05, + "loss": 0.4591, + "step": 19027 + }, + { + "epoch": 24.35584, + "grad_norm": 1.1208250522613525, + "learning_rate": 1.1958783513405363e-05, + "loss": 0.4738, + "step": 19028 + }, + { + "epoch": 24.35712, + "grad_norm": 1.1195433139801025, + "learning_rate": 1.1956782713085235e-05, + "loss": 0.4478, + "step": 19029 + }, + { + "epoch": 24.3584, + "grad_norm": 1.1476173400878906, + "learning_rate": 1.1954781912765107e-05, + "loss": 0.4901, + "step": 19030 + }, + { + "epoch": 24.35968, + "grad_norm": 1.2079092264175415, + "learning_rate": 1.1952781112444979e-05, + "loss": 0.4439, + "step": 19031 + }, + 
{ + "epoch": 24.36096, + "grad_norm": 1.1161270141601562, + "learning_rate": 1.195078031212485e-05, + "loss": 0.4548, + "step": 19032 + }, + { + "epoch": 24.36224, + "grad_norm": 1.1574740409851074, + "learning_rate": 1.1948779511804723e-05, + "loss": 0.4633, + "step": 19033 + }, + { + "epoch": 24.36352, + "grad_norm": 1.1372309923171997, + "learning_rate": 1.1946778711484594e-05, + "loss": 0.4352, + "step": 19034 + }, + { + "epoch": 24.3648, + "grad_norm": 1.1243098974227905, + "learning_rate": 1.1944777911164466e-05, + "loss": 0.4584, + "step": 19035 + }, + { + "epoch": 24.36608, + "grad_norm": 1.1075938940048218, + "learning_rate": 1.1942777110844338e-05, + "loss": 0.4726, + "step": 19036 + }, + { + "epoch": 24.36736, + "grad_norm": 1.1360459327697754, + "learning_rate": 1.194077631052421e-05, + "loss": 0.423, + "step": 19037 + }, + { + "epoch": 24.36864, + "grad_norm": 1.1323789358139038, + "learning_rate": 1.1938775510204082e-05, + "loss": 0.4819, + "step": 19038 + }, + { + "epoch": 24.36992, + "grad_norm": 1.1539483070373535, + "learning_rate": 1.1936774709883954e-05, + "loss": 0.4689, + "step": 19039 + }, + { + "epoch": 24.3712, + "grad_norm": 1.1773465871810913, + "learning_rate": 1.1934773909563826e-05, + "loss": 0.5437, + "step": 19040 + }, + { + "epoch": 24.37248, + "grad_norm": 1.0829496383666992, + "learning_rate": 1.1932773109243699e-05, + "loss": 0.4514, + "step": 19041 + }, + { + "epoch": 24.37376, + "grad_norm": 1.1402915716171265, + "learning_rate": 1.193077230892357e-05, + "loss": 0.4494, + "step": 19042 + }, + { + "epoch": 24.37504, + "grad_norm": 1.1688295602798462, + "learning_rate": 1.1928771508603443e-05, + "loss": 0.4596, + "step": 19043 + }, + { + "epoch": 24.37632, + "grad_norm": 1.1912732124328613, + "learning_rate": 1.1926770708283313e-05, + "loss": 0.4838, + "step": 19044 + }, + { + "epoch": 24.3776, + "grad_norm": 1.1036256551742554, + "learning_rate": 1.1924769907963187e-05, + "loss": 0.4478, + "step": 19045 + }, + { + "epoch": 
24.37888, + "grad_norm": 1.1427876949310303, + "learning_rate": 1.1922769107643057e-05, + "loss": 0.4632, + "step": 19046 + }, + { + "epoch": 24.38016, + "grad_norm": 1.2275477647781372, + "learning_rate": 1.192076830732293e-05, + "loss": 0.5451, + "step": 19047 + }, + { + "epoch": 24.38144, + "grad_norm": 1.0969351530075073, + "learning_rate": 1.1918767507002802e-05, + "loss": 0.4578, + "step": 19048 + }, + { + "epoch": 24.38272, + "grad_norm": 1.117645502090454, + "learning_rate": 1.1916766706682674e-05, + "loss": 0.4327, + "step": 19049 + }, + { + "epoch": 24.384, + "grad_norm": 1.157266616821289, + "learning_rate": 1.1914765906362546e-05, + "loss": 0.478, + "step": 19050 + }, + { + "epoch": 24.38528, + "grad_norm": 1.1453224420547485, + "learning_rate": 1.1912765106042418e-05, + "loss": 0.4923, + "step": 19051 + }, + { + "epoch": 24.38656, + "grad_norm": 1.1940408945083618, + "learning_rate": 1.191076430572229e-05, + "loss": 0.5081, + "step": 19052 + }, + { + "epoch": 24.38784, + "grad_norm": 1.0976282358169556, + "learning_rate": 1.1908763505402161e-05, + "loss": 0.4342, + "step": 19053 + }, + { + "epoch": 24.38912, + "grad_norm": 1.211582899093628, + "learning_rate": 1.1906762705082033e-05, + "loss": 0.4941, + "step": 19054 + }, + { + "epoch": 24.3904, + "grad_norm": 1.1101171970367432, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.4526, + "step": 19055 + }, + { + "epoch": 24.39168, + "grad_norm": 1.213545560836792, + "learning_rate": 1.1902761104441777e-05, + "loss": 0.4828, + "step": 19056 + }, + { + "epoch": 24.39296, + "grad_norm": 1.1441152095794678, + "learning_rate": 1.190076030412165e-05, + "loss": 0.4691, + "step": 19057 + }, + { + "epoch": 24.39424, + "grad_norm": 1.1789923906326294, + "learning_rate": 1.189875950380152e-05, + "loss": 0.4947, + "step": 19058 + }, + { + "epoch": 24.39552, + "grad_norm": 1.031014323234558, + "learning_rate": 1.1896758703481394e-05, + "loss": 0.3981, + "step": 19059 + }, + { + "epoch": 24.3968, + "grad_norm": 
1.2153488397598267, + "learning_rate": 1.1894757903161264e-05, + "loss": 0.5261, + "step": 19060 + }, + { + "epoch": 24.39808, + "grad_norm": 1.2073795795440674, + "learning_rate": 1.1892757102841138e-05, + "loss": 0.49, + "step": 19061 + }, + { + "epoch": 24.39936, + "grad_norm": 1.1584200859069824, + "learning_rate": 1.1890756302521008e-05, + "loss": 0.4965, + "step": 19062 + }, + { + "epoch": 24.40064, + "grad_norm": 1.1565824747085571, + "learning_rate": 1.1888755502200882e-05, + "loss": 0.4811, + "step": 19063 + }, + { + "epoch": 24.40192, + "grad_norm": 1.1859803199768066, + "learning_rate": 1.1886754701880753e-05, + "loss": 0.4927, + "step": 19064 + }, + { + "epoch": 24.4032, + "grad_norm": 1.1612244844436646, + "learning_rate": 1.1884753901560625e-05, + "loss": 0.5152, + "step": 19065 + }, + { + "epoch": 24.40448, + "grad_norm": 1.1404790878295898, + "learning_rate": 1.1882753101240497e-05, + "loss": 0.4933, + "step": 19066 + }, + { + "epoch": 24.40576, + "grad_norm": 1.1987690925598145, + "learning_rate": 1.1880752300920369e-05, + "loss": 0.4743, + "step": 19067 + }, + { + "epoch": 24.40704, + "grad_norm": 1.145716905593872, + "learning_rate": 1.1878751500600241e-05, + "loss": 0.4787, + "step": 19068 + }, + { + "epoch": 24.40832, + "grad_norm": 1.1686224937438965, + "learning_rate": 1.1876750700280113e-05, + "loss": 0.485, + "step": 19069 + }, + { + "epoch": 24.4096, + "grad_norm": 1.1903284788131714, + "learning_rate": 1.1874749899959985e-05, + "loss": 0.4924, + "step": 19070 + }, + { + "epoch": 24.41088, + "grad_norm": 1.1217108964920044, + "learning_rate": 1.1872749099639856e-05, + "loss": 0.4669, + "step": 19071 + }, + { + "epoch": 24.41216, + "grad_norm": 1.1774252653121948, + "learning_rate": 1.1870748299319728e-05, + "loss": 0.4977, + "step": 19072 + }, + { + "epoch": 24.41344, + "grad_norm": 1.2087725400924683, + "learning_rate": 1.18687474989996e-05, + "loss": 0.5188, + "step": 19073 + }, + { + "epoch": 24.41472, + "grad_norm": 1.1201173067092896, 
+ "learning_rate": 1.1866746698679472e-05, + "loss": 0.4849, + "step": 19074 + }, + { + "epoch": 24.416, + "grad_norm": 1.0546908378601074, + "learning_rate": 1.1864745898359344e-05, + "loss": 0.4321, + "step": 19075 + }, + { + "epoch": 24.41728, + "grad_norm": 1.2529699802398682, + "learning_rate": 1.1862745098039216e-05, + "loss": 0.5136, + "step": 19076 + }, + { + "epoch": 24.41856, + "grad_norm": 1.12168550491333, + "learning_rate": 1.1860744297719088e-05, + "loss": 0.484, + "step": 19077 + }, + { + "epoch": 24.41984, + "grad_norm": 1.2384254932403564, + "learning_rate": 1.1858743497398961e-05, + "loss": 0.4956, + "step": 19078 + }, + { + "epoch": 24.42112, + "grad_norm": 1.1256715059280396, + "learning_rate": 1.1856742697078831e-05, + "loss": 0.512, + "step": 19079 + }, + { + "epoch": 24.4224, + "grad_norm": 1.2130337953567505, + "learning_rate": 1.1854741896758705e-05, + "loss": 0.4678, + "step": 19080 + }, + { + "epoch": 24.42368, + "grad_norm": 1.1287142038345337, + "learning_rate": 1.1852741096438575e-05, + "loss": 0.4757, + "step": 19081 + }, + { + "epoch": 24.42496, + "grad_norm": 1.1295257806777954, + "learning_rate": 1.1850740296118449e-05, + "loss": 0.4708, + "step": 19082 + }, + { + "epoch": 24.42624, + "grad_norm": 1.1850224733352661, + "learning_rate": 1.1848739495798319e-05, + "loss": 0.493, + "step": 19083 + }, + { + "epoch": 24.42752, + "grad_norm": 1.1218152046203613, + "learning_rate": 1.1846738695478192e-05, + "loss": 0.4252, + "step": 19084 + }, + { + "epoch": 24.4288, + "grad_norm": 1.257108449935913, + "learning_rate": 1.1844737895158062e-05, + "loss": 0.5102, + "step": 19085 + }, + { + "epoch": 24.43008, + "grad_norm": 1.2210159301757812, + "learning_rate": 1.1842737094837936e-05, + "loss": 0.4865, + "step": 19086 + }, + { + "epoch": 24.43136, + "grad_norm": 1.188494086265564, + "learning_rate": 1.1840736294517808e-05, + "loss": 0.5042, + "step": 19087 + }, + { + "epoch": 24.43264, + "grad_norm": 1.1470962762832642, + "learning_rate": 
1.183873549419768e-05, + "loss": 0.4539, + "step": 19088 + }, + { + "epoch": 24.43392, + "grad_norm": 1.181655764579773, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.4374, + "step": 19089 + }, + { + "epoch": 24.4352, + "grad_norm": 1.1574394702911377, + "learning_rate": 1.1834733893557423e-05, + "loss": 0.4477, + "step": 19090 + }, + { + "epoch": 24.43648, + "grad_norm": 1.2077171802520752, + "learning_rate": 1.1832733093237295e-05, + "loss": 0.4863, + "step": 19091 + }, + { + "epoch": 24.43776, + "grad_norm": 1.1384170055389404, + "learning_rate": 1.1830732292917167e-05, + "loss": 0.4579, + "step": 19092 + }, + { + "epoch": 24.43904, + "grad_norm": 1.1912693977355957, + "learning_rate": 1.1828731492597039e-05, + "loss": 0.5061, + "step": 19093 + }, + { + "epoch": 24.44032, + "grad_norm": 1.2225050926208496, + "learning_rate": 1.1826730692276913e-05, + "loss": 0.5158, + "step": 19094 + }, + { + "epoch": 24.4416, + "grad_norm": 1.105109691619873, + "learning_rate": 1.1824729891956783e-05, + "loss": 0.4698, + "step": 19095 + }, + { + "epoch": 24.44288, + "grad_norm": 1.2286189794540405, + "learning_rate": 1.1822729091636656e-05, + "loss": 0.5637, + "step": 19096 + }, + { + "epoch": 24.44416, + "grad_norm": 1.2093946933746338, + "learning_rate": 1.1820728291316526e-05, + "loss": 0.5066, + "step": 19097 + }, + { + "epoch": 24.44544, + "grad_norm": 1.1525238752365112, + "learning_rate": 1.18187274909964e-05, + "loss": 0.4576, + "step": 19098 + }, + { + "epoch": 24.44672, + "grad_norm": 1.128960371017456, + "learning_rate": 1.181672669067627e-05, + "loss": 0.4811, + "step": 19099 + }, + { + "epoch": 24.448, + "grad_norm": 1.11859130859375, + "learning_rate": 1.1814725890356144e-05, + "loss": 0.4477, + "step": 19100 + }, + { + "epoch": 24.44928, + "grad_norm": 1.2186490297317505, + "learning_rate": 1.1812725090036016e-05, + "loss": 0.492, + "step": 19101 + }, + { + "epoch": 24.45056, + "grad_norm": 1.1721785068511963, + "learning_rate": 1.1810724289715887e-05, + 
"loss": 0.4775, + "step": 19102 + }, + { + "epoch": 24.45184, + "grad_norm": 1.1152302026748657, + "learning_rate": 1.180872348939576e-05, + "loss": 0.445, + "step": 19103 + }, + { + "epoch": 24.45312, + "grad_norm": 1.2016420364379883, + "learning_rate": 1.1806722689075631e-05, + "loss": 0.4517, + "step": 19104 + }, + { + "epoch": 24.4544, + "grad_norm": 1.1429190635681152, + "learning_rate": 1.1804721888755503e-05, + "loss": 0.4761, + "step": 19105 + }, + { + "epoch": 24.45568, + "grad_norm": 1.1792242527008057, + "learning_rate": 1.1802721088435375e-05, + "loss": 0.4446, + "step": 19106 + }, + { + "epoch": 24.45696, + "grad_norm": 1.086751103401184, + "learning_rate": 1.1800720288115247e-05, + "loss": 0.4721, + "step": 19107 + }, + { + "epoch": 24.45824, + "grad_norm": 1.1734704971313477, + "learning_rate": 1.1798719487795119e-05, + "loss": 0.4855, + "step": 19108 + }, + { + "epoch": 24.45952, + "grad_norm": 1.1557416915893555, + "learning_rate": 1.179671868747499e-05, + "loss": 0.4758, + "step": 19109 + }, + { + "epoch": 24.4608, + "grad_norm": 1.1353665590286255, + "learning_rate": 1.1794717887154862e-05, + "loss": 0.5025, + "step": 19110 + }, + { + "epoch": 24.46208, + "grad_norm": 1.1943861246109009, + "learning_rate": 1.1792717086834734e-05, + "loss": 0.4488, + "step": 19111 + }, + { + "epoch": 24.46336, + "grad_norm": 1.1474003791809082, + "learning_rate": 1.1790716286514606e-05, + "loss": 0.4272, + "step": 19112 + }, + { + "epoch": 24.46464, + "grad_norm": 1.1481927633285522, + "learning_rate": 1.1788715486194478e-05, + "loss": 0.4674, + "step": 19113 + }, + { + "epoch": 24.46592, + "grad_norm": 1.2008520364761353, + "learning_rate": 1.178671468587435e-05, + "loss": 0.47, + "step": 19114 + }, + { + "epoch": 24.4672, + "grad_norm": 1.1071596145629883, + "learning_rate": 1.1784713885554222e-05, + "loss": 0.4489, + "step": 19115 + }, + { + "epoch": 24.46848, + "grad_norm": 1.1181803941726685, + "learning_rate": 1.1782713085234093e-05, + "loss": 0.4641, + 
"step": 19116 + }, + { + "epoch": 24.46976, + "grad_norm": 1.163570523262024, + "learning_rate": 1.1780712284913967e-05, + "loss": 0.492, + "step": 19117 + }, + { + "epoch": 24.47104, + "grad_norm": 1.1192409992218018, + "learning_rate": 1.1778711484593837e-05, + "loss": 0.4299, + "step": 19118 + }, + { + "epoch": 24.47232, + "grad_norm": 1.1589858531951904, + "learning_rate": 1.177671068427371e-05, + "loss": 0.4381, + "step": 19119 + }, + { + "epoch": 24.4736, + "grad_norm": 1.183703064918518, + "learning_rate": 1.1774709883953581e-05, + "loss": 0.4732, + "step": 19120 + }, + { + "epoch": 24.47488, + "grad_norm": 1.0970691442489624, + "learning_rate": 1.1772709083633454e-05, + "loss": 0.4542, + "step": 19121 + }, + { + "epoch": 24.47616, + "grad_norm": 1.1885040998458862, + "learning_rate": 1.1770708283313325e-05, + "loss": 0.5307, + "step": 19122 + }, + { + "epoch": 24.47744, + "grad_norm": 1.1278455257415771, + "learning_rate": 1.1768707482993198e-05, + "loss": 0.4681, + "step": 19123 + }, + { + "epoch": 24.47872, + "grad_norm": 1.0988664627075195, + "learning_rate": 1.176670668267307e-05, + "loss": 0.4867, + "step": 19124 + }, + { + "epoch": 24.48, + "grad_norm": 1.0938818454742432, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.4929, + "step": 19125 + }, + { + "epoch": 24.48128, + "grad_norm": 1.1701966524124146, + "learning_rate": 1.1762705082032814e-05, + "loss": 0.4634, + "step": 19126 + }, + { + "epoch": 24.48256, + "grad_norm": 1.2021418809890747, + "learning_rate": 1.1760704281712686e-05, + "loss": 0.4588, + "step": 19127 + }, + { + "epoch": 24.48384, + "grad_norm": 1.1727592945098877, + "learning_rate": 1.1758703481392557e-05, + "loss": 0.4352, + "step": 19128 + }, + { + "epoch": 24.48512, + "grad_norm": 1.174090027809143, + "learning_rate": 1.175670268107243e-05, + "loss": 0.4726, + "step": 19129 + }, + { + "epoch": 24.4864, + "grad_norm": 1.1474714279174805, + "learning_rate": 1.1754701880752301e-05, + "loss": 0.4448, + "step": 19130 + }, + { 
+ "epoch": 24.48768, + "grad_norm": 1.2793716192245483, + "learning_rate": 1.1752701080432175e-05, + "loss": 0.5175, + "step": 19131 + }, + { + "epoch": 24.48896, + "grad_norm": 1.201866626739502, + "learning_rate": 1.1750700280112045e-05, + "loss": 0.4494, + "step": 19132 + }, + { + "epoch": 24.49024, + "grad_norm": 1.1244724988937378, + "learning_rate": 1.1748699479791918e-05, + "loss": 0.4488, + "step": 19133 + }, + { + "epoch": 24.49152, + "grad_norm": 1.1583161354064941, + "learning_rate": 1.1746698679471789e-05, + "loss": 0.4516, + "step": 19134 + }, + { + "epoch": 24.4928, + "grad_norm": 1.1337168216705322, + "learning_rate": 1.1744697879151662e-05, + "loss": 0.4695, + "step": 19135 + }, + { + "epoch": 24.49408, + "grad_norm": 1.064746618270874, + "learning_rate": 1.1742697078831532e-05, + "loss": 0.4032, + "step": 19136 + }, + { + "epoch": 24.49536, + "grad_norm": 1.1812145709991455, + "learning_rate": 1.1740696278511406e-05, + "loss": 0.5035, + "step": 19137 + }, + { + "epoch": 24.49664, + "grad_norm": 1.156087875366211, + "learning_rate": 1.1738695478191276e-05, + "loss": 0.4852, + "step": 19138 + }, + { + "epoch": 24.49792, + "grad_norm": 1.1552976369857788, + "learning_rate": 1.173669467787115e-05, + "loss": 0.4727, + "step": 19139 + }, + { + "epoch": 24.4992, + "grad_norm": 1.160605788230896, + "learning_rate": 1.1734693877551021e-05, + "loss": 0.4935, + "step": 19140 + }, + { + "epoch": 24.50048, + "grad_norm": 1.1020570993423462, + "learning_rate": 1.1732693077230893e-05, + "loss": 0.453, + "step": 19141 + }, + { + "epoch": 24.50176, + "grad_norm": 1.234426498413086, + "learning_rate": 1.1730692276910765e-05, + "loss": 0.5184, + "step": 19142 + }, + { + "epoch": 24.50304, + "grad_norm": 1.2169140577316284, + "learning_rate": 1.1728691476590637e-05, + "loss": 0.509, + "step": 19143 + }, + { + "epoch": 24.50432, + "grad_norm": 1.199015736579895, + "learning_rate": 1.1726690676270509e-05, + "loss": 0.4727, + "step": 19144 + }, + { + "epoch": 24.5056, + 
"grad_norm": 1.1079201698303223, + "learning_rate": 1.172468987595038e-05, + "loss": 0.4845, + "step": 19145 + }, + { + "epoch": 24.50688, + "grad_norm": 1.146178126335144, + "learning_rate": 1.1722689075630253e-05, + "loss": 0.4743, + "step": 19146 + }, + { + "epoch": 24.50816, + "grad_norm": 1.0882790088653564, + "learning_rate": 1.1720688275310124e-05, + "loss": 0.4622, + "step": 19147 + }, + { + "epoch": 24.50944, + "grad_norm": 1.2238930463790894, + "learning_rate": 1.1718687474989996e-05, + "loss": 0.4771, + "step": 19148 + }, + { + "epoch": 24.51072, + "grad_norm": 1.1269352436065674, + "learning_rate": 1.1716686674669868e-05, + "loss": 0.5029, + "step": 19149 + }, + { + "epoch": 24.512, + "grad_norm": 1.1483230590820312, + "learning_rate": 1.171468587434974e-05, + "loss": 0.4726, + "step": 19150 + }, + { + "epoch": 24.51328, + "grad_norm": 1.1776012182235718, + "learning_rate": 1.1712685074029612e-05, + "loss": 0.497, + "step": 19151 + }, + { + "epoch": 24.51456, + "grad_norm": 1.1069426536560059, + "learning_rate": 1.1710684273709484e-05, + "loss": 0.4583, + "step": 19152 + }, + { + "epoch": 24.51584, + "grad_norm": 1.1324975490570068, + "learning_rate": 1.1708683473389356e-05, + "loss": 0.4128, + "step": 19153 + }, + { + "epoch": 24.51712, + "grad_norm": 1.190555214881897, + "learning_rate": 1.1706682673069229e-05, + "loss": 0.4871, + "step": 19154 + }, + { + "epoch": 24.5184, + "grad_norm": 1.1613634824752808, + "learning_rate": 1.17046818727491e-05, + "loss": 0.4866, + "step": 19155 + }, + { + "epoch": 24.51968, + "grad_norm": 1.1718133687973022, + "learning_rate": 1.1702681072428973e-05, + "loss": 0.4818, + "step": 19156 + }, + { + "epoch": 24.52096, + "grad_norm": 1.146981954574585, + "learning_rate": 1.1700680272108843e-05, + "loss": 0.5019, + "step": 19157 + }, + { + "epoch": 24.52224, + "grad_norm": 1.192657709121704, + "learning_rate": 1.1698679471788717e-05, + "loss": 0.4981, + "step": 19158 + }, + { + "epoch": 24.52352, + "grad_norm": 
1.0763636827468872, + "learning_rate": 1.1696678671468587e-05, + "loss": 0.4314, + "step": 19159 + }, + { + "epoch": 24.5248, + "grad_norm": 1.1718692779541016, + "learning_rate": 1.169467787114846e-05, + "loss": 0.4831, + "step": 19160 + }, + { + "epoch": 24.52608, + "grad_norm": 1.0933730602264404, + "learning_rate": 1.1692677070828332e-05, + "loss": 0.4976, + "step": 19161 + }, + { + "epoch": 24.52736, + "grad_norm": 1.0974316596984863, + "learning_rate": 1.1690676270508204e-05, + "loss": 0.4816, + "step": 19162 + }, + { + "epoch": 24.52864, + "grad_norm": 1.2042176723480225, + "learning_rate": 1.1688675470188076e-05, + "loss": 0.4928, + "step": 19163 + }, + { + "epoch": 24.52992, + "grad_norm": 1.1524473428726196, + "learning_rate": 1.1686674669867948e-05, + "loss": 0.4771, + "step": 19164 + }, + { + "epoch": 24.5312, + "grad_norm": 1.2503443956375122, + "learning_rate": 1.168467386954782e-05, + "loss": 0.4702, + "step": 19165 + }, + { + "epoch": 24.53248, + "grad_norm": 1.1078400611877441, + "learning_rate": 1.1682673069227691e-05, + "loss": 0.4361, + "step": 19166 + }, + { + "epoch": 24.53376, + "grad_norm": 1.1431939601898193, + "learning_rate": 1.1680672268907563e-05, + "loss": 0.4693, + "step": 19167 + }, + { + "epoch": 24.53504, + "grad_norm": 1.201981782913208, + "learning_rate": 1.1678671468587435e-05, + "loss": 0.4996, + "step": 19168 + }, + { + "epoch": 24.53632, + "grad_norm": 1.2216880321502686, + "learning_rate": 1.1676670668267307e-05, + "loss": 0.4964, + "step": 19169 + }, + { + "epoch": 24.5376, + "grad_norm": 1.199825644493103, + "learning_rate": 1.167466986794718e-05, + "loss": 0.484, + "step": 19170 + }, + { + "epoch": 24.53888, + "grad_norm": 1.2285641431808472, + "learning_rate": 1.167266906762705e-05, + "loss": 0.509, + "step": 19171 + }, + { + "epoch": 24.54016, + "grad_norm": 1.1986514329910278, + "learning_rate": 1.1670668267306924e-05, + "loss": 0.49, + "step": 19172 + }, + { + "epoch": 24.54144, + "grad_norm": 1.1682507991790771, + 
"learning_rate": 1.1668667466986794e-05, + "loss": 0.4695, + "step": 19173 + }, + { + "epoch": 24.54272, + "grad_norm": 1.216668963432312, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.508, + "step": 19174 + }, + { + "epoch": 24.544, + "grad_norm": 1.08146071434021, + "learning_rate": 1.1664665866346538e-05, + "loss": 0.4698, + "step": 19175 + }, + { + "epoch": 24.545279999999998, + "grad_norm": 1.1765329837799072, + "learning_rate": 1.1662665066026412e-05, + "loss": 0.4394, + "step": 19176 + }, + { + "epoch": 24.54656, + "grad_norm": 1.1449719667434692, + "learning_rate": 1.1660664265706283e-05, + "loss": 0.4574, + "step": 19177 + }, + { + "epoch": 24.54784, + "grad_norm": 1.0752500295639038, + "learning_rate": 1.1658663465386155e-05, + "loss": 0.4515, + "step": 19178 + }, + { + "epoch": 24.54912, + "grad_norm": 1.214896559715271, + "learning_rate": 1.1656662665066027e-05, + "loss": 0.5097, + "step": 19179 + }, + { + "epoch": 24.5504, + "grad_norm": 1.113995909690857, + "learning_rate": 1.1654661864745899e-05, + "loss": 0.4797, + "step": 19180 + }, + { + "epoch": 24.55168, + "grad_norm": 1.1679980754852295, + "learning_rate": 1.1652661064425771e-05, + "loss": 0.4592, + "step": 19181 + }, + { + "epoch": 24.55296, + "grad_norm": 1.154405117034912, + "learning_rate": 1.1650660264105643e-05, + "loss": 0.4735, + "step": 19182 + }, + { + "epoch": 24.55424, + "grad_norm": 1.2269680500030518, + "learning_rate": 1.1648659463785515e-05, + "loss": 0.499, + "step": 19183 + }, + { + "epoch": 24.55552, + "grad_norm": 1.146951675415039, + "learning_rate": 1.1646658663465388e-05, + "loss": 0.4796, + "step": 19184 + }, + { + "epoch": 24.5568, + "grad_norm": 1.2074452638626099, + "learning_rate": 1.1644657863145258e-05, + "loss": 0.4796, + "step": 19185 + }, + { + "epoch": 24.55808, + "grad_norm": 1.1281073093414307, + "learning_rate": 1.1642657062825132e-05, + "loss": 0.4737, + "step": 19186 + }, + { + "epoch": 24.55936, + "grad_norm": 1.1057974100112915, + 
"learning_rate": 1.1640656262505002e-05, + "loss": 0.4699, + "step": 19187 + }, + { + "epoch": 24.56064, + "grad_norm": 1.1083946228027344, + "learning_rate": 1.1638655462184876e-05, + "loss": 0.4705, + "step": 19188 + }, + { + "epoch": 24.56192, + "grad_norm": 1.1685590744018555, + "learning_rate": 1.1636654661864746e-05, + "loss": 0.4922, + "step": 19189 + }, + { + "epoch": 24.5632, + "grad_norm": 1.1086533069610596, + "learning_rate": 1.163465386154462e-05, + "loss": 0.4473, + "step": 19190 + }, + { + "epoch": 24.56448, + "grad_norm": 1.156948447227478, + "learning_rate": 1.163265306122449e-05, + "loss": 0.4685, + "step": 19191 + }, + { + "epoch": 24.56576, + "grad_norm": 1.1937439441680908, + "learning_rate": 1.1630652260904363e-05, + "loss": 0.4974, + "step": 19192 + }, + { + "epoch": 24.56704, + "grad_norm": 1.1433395147323608, + "learning_rate": 1.1628651460584235e-05, + "loss": 0.4823, + "step": 19193 + }, + { + "epoch": 24.56832, + "grad_norm": 1.1796252727508545, + "learning_rate": 1.1626650660264107e-05, + "loss": 0.4591, + "step": 19194 + }, + { + "epoch": 24.5696, + "grad_norm": 1.1520229578018188, + "learning_rate": 1.1624649859943979e-05, + "loss": 0.4805, + "step": 19195 + }, + { + "epoch": 24.57088, + "grad_norm": 1.0709677934646606, + "learning_rate": 1.162264905962385e-05, + "loss": 0.4397, + "step": 19196 + }, + { + "epoch": 24.57216, + "grad_norm": 1.1727806329727173, + "learning_rate": 1.1620648259303722e-05, + "loss": 0.5408, + "step": 19197 + }, + { + "epoch": 24.57344, + "grad_norm": 1.1703429222106934, + "learning_rate": 1.1618647458983594e-05, + "loss": 0.5012, + "step": 19198 + }, + { + "epoch": 24.57472, + "grad_norm": 1.1146478652954102, + "learning_rate": 1.1616646658663466e-05, + "loss": 0.4456, + "step": 19199 + }, + { + "epoch": 24.576, + "grad_norm": 1.1429121494293213, + "learning_rate": 1.1614645858343338e-05, + "loss": 0.4617, + "step": 19200 + }, + { + "epoch": 24.577280000000002, + "grad_norm": 1.0925348997116089, + 
"learning_rate": 1.161264505802321e-05, + "loss": 0.487, + "step": 19201 + }, + { + "epoch": 24.57856, + "grad_norm": 1.1159874200820923, + "learning_rate": 1.1610644257703082e-05, + "loss": 0.4982, + "step": 19202 + }, + { + "epoch": 24.57984, + "grad_norm": 1.1333866119384766, + "learning_rate": 1.1608643457382953e-05, + "loss": 0.4537, + "step": 19203 + }, + { + "epoch": 24.58112, + "grad_norm": 1.134494662284851, + "learning_rate": 1.1606642657062825e-05, + "loss": 0.4516, + "step": 19204 + }, + { + "epoch": 24.5824, + "grad_norm": 1.1905516386032104, + "learning_rate": 1.1604641856742697e-05, + "loss": 0.4714, + "step": 19205 + }, + { + "epoch": 24.58368, + "grad_norm": 1.208512544631958, + "learning_rate": 1.1602641056422569e-05, + "loss": 0.4834, + "step": 19206 + }, + { + "epoch": 24.58496, + "grad_norm": 1.2601969242095947, + "learning_rate": 1.1600640256102443e-05, + "loss": 0.501, + "step": 19207 + }, + { + "epoch": 24.58624, + "grad_norm": 1.1832962036132812, + "learning_rate": 1.1598639455782313e-05, + "loss": 0.459, + "step": 19208 + }, + { + "epoch": 24.58752, + "grad_norm": 1.2461732625961304, + "learning_rate": 1.1596638655462186e-05, + "loss": 0.5219, + "step": 19209 + }, + { + "epoch": 24.5888, + "grad_norm": 1.1089426279067993, + "learning_rate": 1.1594637855142056e-05, + "loss": 0.5035, + "step": 19210 + }, + { + "epoch": 24.59008, + "grad_norm": 1.2491692304611206, + "learning_rate": 1.159263705482193e-05, + "loss": 0.5015, + "step": 19211 + }, + { + "epoch": 24.59136, + "grad_norm": 1.1578104496002197, + "learning_rate": 1.15906362545018e-05, + "loss": 0.4683, + "step": 19212 + }, + { + "epoch": 24.59264, + "grad_norm": 1.1508525609970093, + "learning_rate": 1.1588635454181674e-05, + "loss": 0.4839, + "step": 19213 + }, + { + "epoch": 24.59392, + "grad_norm": 1.249266266822815, + "learning_rate": 1.1586634653861546e-05, + "loss": 0.4757, + "step": 19214 + }, + { + "epoch": 24.5952, + "grad_norm": 1.13655686378479, + "learning_rate": 
1.1584633853541417e-05, + "loss": 0.4996, + "step": 19215 + }, + { + "epoch": 24.59648, + "grad_norm": 1.1786251068115234, + "learning_rate": 1.158263305322129e-05, + "loss": 0.4798, + "step": 19216 + }, + { + "epoch": 24.59776, + "grad_norm": 1.1763968467712402, + "learning_rate": 1.1580632252901161e-05, + "loss": 0.4782, + "step": 19217 + }, + { + "epoch": 24.59904, + "grad_norm": 1.097705602645874, + "learning_rate": 1.1578631452581033e-05, + "loss": 0.4706, + "step": 19218 + }, + { + "epoch": 24.60032, + "grad_norm": 1.1707555055618286, + "learning_rate": 1.1576630652260905e-05, + "loss": 0.4738, + "step": 19219 + }, + { + "epoch": 24.6016, + "grad_norm": 1.1376193761825562, + "learning_rate": 1.1574629851940777e-05, + "loss": 0.505, + "step": 19220 + }, + { + "epoch": 24.60288, + "grad_norm": 1.26047945022583, + "learning_rate": 1.1572629051620649e-05, + "loss": 0.5244, + "step": 19221 + }, + { + "epoch": 24.60416, + "grad_norm": 1.1913353204727173, + "learning_rate": 1.157062825130052e-05, + "loss": 0.4879, + "step": 19222 + }, + { + "epoch": 24.60544, + "grad_norm": 1.1769354343414307, + "learning_rate": 1.1568627450980394e-05, + "loss": 0.5038, + "step": 19223 + }, + { + "epoch": 24.60672, + "grad_norm": 1.16858971118927, + "learning_rate": 1.1566626650660264e-05, + "loss": 0.4796, + "step": 19224 + }, + { + "epoch": 24.608, + "grad_norm": 1.1839061975479126, + "learning_rate": 1.1564625850340138e-05, + "loss": 0.5132, + "step": 19225 + }, + { + "epoch": 24.60928, + "grad_norm": 1.2122026681900024, + "learning_rate": 1.1562625050020008e-05, + "loss": 0.5023, + "step": 19226 + }, + { + "epoch": 24.61056, + "grad_norm": 1.1488457918167114, + "learning_rate": 1.1560624249699881e-05, + "loss": 0.4586, + "step": 19227 + }, + { + "epoch": 24.61184, + "grad_norm": 1.1649909019470215, + "learning_rate": 1.1558623449379752e-05, + "loss": 0.4974, + "step": 19228 + }, + { + "epoch": 24.61312, + "grad_norm": 1.180234670639038, + "learning_rate": 1.1556622649059625e-05, 
+ "loss": 0.4725, + "step": 19229 + }, + { + "epoch": 24.6144, + "grad_norm": 1.1081424951553345, + "learning_rate": 1.1554621848739497e-05, + "loss": 0.47, + "step": 19230 + }, + { + "epoch": 24.61568, + "grad_norm": 1.1423239707946777, + "learning_rate": 1.1552621048419369e-05, + "loss": 0.4692, + "step": 19231 + }, + { + "epoch": 24.61696, + "grad_norm": 1.1524152755737305, + "learning_rate": 1.155062024809924e-05, + "loss": 0.4718, + "step": 19232 + }, + { + "epoch": 24.61824, + "grad_norm": 1.2075049877166748, + "learning_rate": 1.1548619447779113e-05, + "loss": 0.4985, + "step": 19233 + }, + { + "epoch": 24.61952, + "grad_norm": 1.1373603343963623, + "learning_rate": 1.1546618647458984e-05, + "loss": 0.5022, + "step": 19234 + }, + { + "epoch": 24.6208, + "grad_norm": 1.1606000661849976, + "learning_rate": 1.1544617847138856e-05, + "loss": 0.4829, + "step": 19235 + }, + { + "epoch": 24.62208, + "grad_norm": 1.1605327129364014, + "learning_rate": 1.1542617046818728e-05, + "loss": 0.4924, + "step": 19236 + }, + { + "epoch": 24.62336, + "grad_norm": 1.228710651397705, + "learning_rate": 1.15406162464986e-05, + "loss": 0.5122, + "step": 19237 + }, + { + "epoch": 24.62464, + "grad_norm": 1.0705116987228394, + "learning_rate": 1.1538615446178472e-05, + "loss": 0.4644, + "step": 19238 + }, + { + "epoch": 24.62592, + "grad_norm": 1.2077350616455078, + "learning_rate": 1.1536614645858344e-05, + "loss": 0.4765, + "step": 19239 + }, + { + "epoch": 24.6272, + "grad_norm": 1.1825919151306152, + "learning_rate": 1.1534613845538216e-05, + "loss": 0.47, + "step": 19240 + }, + { + "epoch": 24.62848, + "grad_norm": 1.1296800374984741, + "learning_rate": 1.1532613045218087e-05, + "loss": 0.4958, + "step": 19241 + }, + { + "epoch": 24.62976, + "grad_norm": 1.184819221496582, + "learning_rate": 1.153061224489796e-05, + "loss": 0.4932, + "step": 19242 + }, + { + "epoch": 24.63104, + "grad_norm": 1.2108218669891357, + "learning_rate": 1.1528611444577831e-05, + "loss": 0.4847, + 
"step": 19243 + }, + { + "epoch": 24.63232, + "grad_norm": 1.1210412979125977, + "learning_rate": 1.1526610644257705e-05, + "loss": 0.4636, + "step": 19244 + }, + { + "epoch": 24.6336, + "grad_norm": 1.1325141191482544, + "learning_rate": 1.1524609843937575e-05, + "loss": 0.4885, + "step": 19245 + }, + { + "epoch": 24.63488, + "grad_norm": 1.198911428451538, + "learning_rate": 1.1522609043617448e-05, + "loss": 0.4902, + "step": 19246 + }, + { + "epoch": 24.63616, + "grad_norm": 1.1128379106521606, + "learning_rate": 1.1520608243297319e-05, + "loss": 0.4737, + "step": 19247 + }, + { + "epoch": 24.63744, + "grad_norm": 1.147925853729248, + "learning_rate": 1.1518607442977192e-05, + "loss": 0.4618, + "step": 19248 + }, + { + "epoch": 24.63872, + "grad_norm": 1.10917329788208, + "learning_rate": 1.1516606642657062e-05, + "loss": 0.437, + "step": 19249 + }, + { + "epoch": 24.64, + "grad_norm": 1.1810179948806763, + "learning_rate": 1.1514605842336936e-05, + "loss": 0.4697, + "step": 19250 + }, + { + "epoch": 24.64128, + "grad_norm": 1.15571928024292, + "learning_rate": 1.1512605042016806e-05, + "loss": 0.4482, + "step": 19251 + }, + { + "epoch": 24.64256, + "grad_norm": 1.2140380144119263, + "learning_rate": 1.151060424169668e-05, + "loss": 0.4722, + "step": 19252 + }, + { + "epoch": 24.64384, + "grad_norm": 1.1459300518035889, + "learning_rate": 1.1508603441376551e-05, + "loss": 0.4415, + "step": 19253 + }, + { + "epoch": 24.64512, + "grad_norm": 1.2092753648757935, + "learning_rate": 1.1506602641056423e-05, + "loss": 0.4927, + "step": 19254 + }, + { + "epoch": 24.6464, + "grad_norm": 1.0979299545288086, + "learning_rate": 1.1504601840736295e-05, + "loss": 0.474, + "step": 19255 + }, + { + "epoch": 24.64768, + "grad_norm": 1.2090137004852295, + "learning_rate": 1.1502601040416167e-05, + "loss": 0.4667, + "step": 19256 + }, + { + "epoch": 24.64896, + "grad_norm": 1.0773359537124634, + "learning_rate": 1.1500600240096039e-05, + "loss": 0.4631, + "step": 19257 + }, + { + 
"epoch": 24.65024, + "grad_norm": 1.1589806079864502, + "learning_rate": 1.149859943977591e-05, + "loss": 0.4615, + "step": 19258 + }, + { + "epoch": 24.65152, + "grad_norm": 1.1646665334701538, + "learning_rate": 1.1496598639455783e-05, + "loss": 0.5144, + "step": 19259 + }, + { + "epoch": 24.6528, + "grad_norm": 1.1850485801696777, + "learning_rate": 1.1494597839135656e-05, + "loss": 0.4873, + "step": 19260 + }, + { + "epoch": 24.65408, + "grad_norm": 1.111702561378479, + "learning_rate": 1.1492597038815526e-05, + "loss": 0.462, + "step": 19261 + }, + { + "epoch": 24.65536, + "grad_norm": 1.0776835680007935, + "learning_rate": 1.14905962384954e-05, + "loss": 0.4624, + "step": 19262 + }, + { + "epoch": 24.65664, + "grad_norm": 1.1940804719924927, + "learning_rate": 1.148859543817527e-05, + "loss": 0.4731, + "step": 19263 + }, + { + "epoch": 24.65792, + "grad_norm": 1.0899372100830078, + "learning_rate": 1.1486594637855143e-05, + "loss": 0.4729, + "step": 19264 + }, + { + "epoch": 24.6592, + "grad_norm": 1.2058807611465454, + "learning_rate": 1.1484593837535014e-05, + "loss": 0.4518, + "step": 19265 + }, + { + "epoch": 24.66048, + "grad_norm": 1.14639413356781, + "learning_rate": 1.1482593037214887e-05, + "loss": 0.4715, + "step": 19266 + }, + { + "epoch": 24.66176, + "grad_norm": 1.0863559246063232, + "learning_rate": 1.1480592236894759e-05, + "loss": 0.4479, + "step": 19267 + }, + { + "epoch": 24.66304, + "grad_norm": 1.1600267887115479, + "learning_rate": 1.1478591436574631e-05, + "loss": 0.5148, + "step": 19268 + }, + { + "epoch": 24.66432, + "grad_norm": 1.1542725563049316, + "learning_rate": 1.1476590636254503e-05, + "loss": 0.4798, + "step": 19269 + }, + { + "epoch": 24.6656, + "grad_norm": 1.1546802520751953, + "learning_rate": 1.1474589835934375e-05, + "loss": 0.4433, + "step": 19270 + }, + { + "epoch": 24.66688, + "grad_norm": 1.123247742652893, + "learning_rate": 1.1472589035614246e-05, + "loss": 0.4422, + "step": 19271 + }, + { + "epoch": 24.66816, + 
"grad_norm": 1.2257593870162964, + "learning_rate": 1.1470588235294118e-05, + "loss": 0.4726, + "step": 19272 + }, + { + "epoch": 24.66944, + "grad_norm": 1.196321725845337, + "learning_rate": 1.146858743497399e-05, + "loss": 0.5249, + "step": 19273 + }, + { + "epoch": 24.67072, + "grad_norm": 1.1580753326416016, + "learning_rate": 1.1466586634653862e-05, + "loss": 0.4793, + "step": 19274 + }, + { + "epoch": 24.672, + "grad_norm": 1.10203218460083, + "learning_rate": 1.1464585834333734e-05, + "loss": 0.416, + "step": 19275 + }, + { + "epoch": 24.67328, + "grad_norm": 1.0155160427093506, + "learning_rate": 1.1462585034013606e-05, + "loss": 0.4385, + "step": 19276 + }, + { + "epoch": 24.67456, + "grad_norm": 1.1510674953460693, + "learning_rate": 1.1460584233693478e-05, + "loss": 0.4783, + "step": 19277 + }, + { + "epoch": 24.67584, + "grad_norm": 1.1276487112045288, + "learning_rate": 1.145858343337335e-05, + "loss": 0.4613, + "step": 19278 + }, + { + "epoch": 24.67712, + "grad_norm": 1.1388295888900757, + "learning_rate": 1.1456582633053221e-05, + "loss": 0.4549, + "step": 19279 + }, + { + "epoch": 24.6784, + "grad_norm": 1.1739778518676758, + "learning_rate": 1.1454581832733093e-05, + "loss": 0.4569, + "step": 19280 + }, + { + "epoch": 24.67968, + "grad_norm": 1.0705592632293701, + "learning_rate": 1.1452581032412965e-05, + "loss": 0.4347, + "step": 19281 + }, + { + "epoch": 24.68096, + "grad_norm": 1.2039315700531006, + "learning_rate": 1.1450580232092837e-05, + "loss": 0.4751, + "step": 19282 + }, + { + "epoch": 24.68224, + "grad_norm": 1.1473381519317627, + "learning_rate": 1.144857943177271e-05, + "loss": 0.4583, + "step": 19283 + }, + { + "epoch": 24.68352, + "grad_norm": 1.2741856575012207, + "learning_rate": 1.144657863145258e-05, + "loss": 0.4695, + "step": 19284 + }, + { + "epoch": 24.6848, + "grad_norm": 1.1236690282821655, + "learning_rate": 1.1444577831132454e-05, + "loss": 0.4626, + "step": 19285 + }, + { + "epoch": 24.68608, + "grad_norm": 
1.2310682535171509, + "learning_rate": 1.1442577030812324e-05, + "loss": 0.4727, + "step": 19286 + }, + { + "epoch": 24.687359999999998, + "grad_norm": 1.1304185390472412, + "learning_rate": 1.1440576230492198e-05, + "loss": 0.4577, + "step": 19287 + }, + { + "epoch": 24.68864, + "grad_norm": 1.2843713760375977, + "learning_rate": 1.1438575430172068e-05, + "loss": 0.4966, + "step": 19288 + }, + { + "epoch": 24.68992, + "grad_norm": 1.1501768827438354, + "learning_rate": 1.1436574629851942e-05, + "loss": 0.4763, + "step": 19289 + }, + { + "epoch": 24.6912, + "grad_norm": 1.2315309047698975, + "learning_rate": 1.1434573829531813e-05, + "loss": 0.4693, + "step": 19290 + }, + { + "epoch": 24.69248, + "grad_norm": 1.1766899824142456, + "learning_rate": 1.1432573029211685e-05, + "loss": 0.4635, + "step": 19291 + }, + { + "epoch": 24.69376, + "grad_norm": 1.2574455738067627, + "learning_rate": 1.1430572228891557e-05, + "loss": 0.5019, + "step": 19292 + }, + { + "epoch": 24.69504, + "grad_norm": 1.1796600818634033, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.4645, + "step": 19293 + }, + { + "epoch": 24.69632, + "grad_norm": 1.1451575756072998, + "learning_rate": 1.1426570628251301e-05, + "loss": 0.4712, + "step": 19294 + }, + { + "epoch": 24.6976, + "grad_norm": 1.1572251319885254, + "learning_rate": 1.1424569827931173e-05, + "loss": 0.5001, + "step": 19295 + }, + { + "epoch": 24.69888, + "grad_norm": 1.1308900117874146, + "learning_rate": 1.1422569027611045e-05, + "loss": 0.4832, + "step": 19296 + }, + { + "epoch": 24.70016, + "grad_norm": 1.1736658811569214, + "learning_rate": 1.1420568227290918e-05, + "loss": 0.4795, + "step": 19297 + }, + { + "epoch": 24.70144, + "grad_norm": 1.182158350944519, + "learning_rate": 1.1418567426970788e-05, + "loss": 0.4893, + "step": 19298 + }, + { + "epoch": 24.70272, + "grad_norm": 1.1963363885879517, + "learning_rate": 1.1416566626650662e-05, + "loss": 0.4335, + "step": 19299 + }, + { + "epoch": 24.704, + "grad_norm": 
1.1854673624038696, + "learning_rate": 1.1414565826330532e-05, + "loss": 0.4765, + "step": 19300 + }, + { + "epoch": 24.70528, + "grad_norm": 1.1632603406906128, + "learning_rate": 1.1412565026010406e-05, + "loss": 0.453, + "step": 19301 + }, + { + "epoch": 24.70656, + "grad_norm": 1.1224347352981567, + "learning_rate": 1.1410564225690276e-05, + "loss": 0.4329, + "step": 19302 + }, + { + "epoch": 24.70784, + "grad_norm": 1.083587408065796, + "learning_rate": 1.140856342537015e-05, + "loss": 0.4908, + "step": 19303 + }, + { + "epoch": 24.70912, + "grad_norm": 1.1031328439712524, + "learning_rate": 1.140656262505002e-05, + "loss": 0.4961, + "step": 19304 + }, + { + "epoch": 24.7104, + "grad_norm": 1.1741230487823486, + "learning_rate": 1.1404561824729893e-05, + "loss": 0.4992, + "step": 19305 + }, + { + "epoch": 24.71168, + "grad_norm": 1.1455093622207642, + "learning_rate": 1.1402561024409765e-05, + "loss": 0.4462, + "step": 19306 + }, + { + "epoch": 24.71296, + "grad_norm": 1.1792043447494507, + "learning_rate": 1.1400560224089637e-05, + "loss": 0.4428, + "step": 19307 + }, + { + "epoch": 24.71424, + "grad_norm": 1.1586357355117798, + "learning_rate": 1.1398559423769509e-05, + "loss": 0.4899, + "step": 19308 + }, + { + "epoch": 24.71552, + "grad_norm": 1.1558829545974731, + "learning_rate": 1.139655862344938e-05, + "loss": 0.4433, + "step": 19309 + }, + { + "epoch": 24.7168, + "grad_norm": 1.1648732423782349, + "learning_rate": 1.1394557823129252e-05, + "loss": 0.4847, + "step": 19310 + }, + { + "epoch": 24.71808, + "grad_norm": 1.1810020208358765, + "learning_rate": 1.1392557022809124e-05, + "loss": 0.524, + "step": 19311 + }, + { + "epoch": 24.71936, + "grad_norm": 1.210523247718811, + "learning_rate": 1.1390556222488996e-05, + "loss": 0.4705, + "step": 19312 + }, + { + "epoch": 24.72064, + "grad_norm": 1.1746503114700317, + "learning_rate": 1.1388555422168868e-05, + "loss": 0.4782, + "step": 19313 + }, + { + "epoch": 24.72192, + "grad_norm": 1.1971492767333984, 
+ "learning_rate": 1.138655462184874e-05, + "loss": 0.5247, + "step": 19314 + }, + { + "epoch": 24.7232, + "grad_norm": 1.219415307044983, + "learning_rate": 1.1384553821528612e-05, + "loss": 0.5261, + "step": 19315 + }, + { + "epoch": 24.72448, + "grad_norm": 1.157728910446167, + "learning_rate": 1.1382553021208483e-05, + "loss": 0.5033, + "step": 19316 + }, + { + "epoch": 24.72576, + "grad_norm": 1.21958327293396, + "learning_rate": 1.1380552220888355e-05, + "loss": 0.5063, + "step": 19317 + }, + { + "epoch": 24.72704, + "grad_norm": 1.183083176612854, + "learning_rate": 1.1378551420568227e-05, + "loss": 0.5095, + "step": 19318 + }, + { + "epoch": 24.72832, + "grad_norm": 1.1700811386108398, + "learning_rate": 1.1376550620248099e-05, + "loss": 0.4605, + "step": 19319 + }, + { + "epoch": 24.7296, + "grad_norm": 1.1539628505706787, + "learning_rate": 1.1374549819927973e-05, + "loss": 0.49, + "step": 19320 + }, + { + "epoch": 24.73088, + "grad_norm": 1.0613603591918945, + "learning_rate": 1.1372549019607843e-05, + "loss": 0.4647, + "step": 19321 + }, + { + "epoch": 24.73216, + "grad_norm": 1.124666690826416, + "learning_rate": 1.1370548219287716e-05, + "loss": 0.4497, + "step": 19322 + }, + { + "epoch": 24.73344, + "grad_norm": 1.1756181716918945, + "learning_rate": 1.1368547418967586e-05, + "loss": 0.4863, + "step": 19323 + }, + { + "epoch": 24.73472, + "grad_norm": 1.0413018465042114, + "learning_rate": 1.136654661864746e-05, + "loss": 0.4702, + "step": 19324 + }, + { + "epoch": 24.736, + "grad_norm": 1.0692871809005737, + "learning_rate": 1.136454581832733e-05, + "loss": 0.4444, + "step": 19325 + }, + { + "epoch": 24.73728, + "grad_norm": 1.1003504991531372, + "learning_rate": 1.1362545018007204e-05, + "loss": 0.4332, + "step": 19326 + }, + { + "epoch": 24.73856, + "grad_norm": 1.126057744026184, + "learning_rate": 1.1360544217687076e-05, + "loss": 0.4527, + "step": 19327 + }, + { + "epoch": 24.73984, + "grad_norm": 1.1299209594726562, + "learning_rate": 
1.1358543417366947e-05, + "loss": 0.4338, + "step": 19328 + }, + { + "epoch": 24.74112, + "grad_norm": 1.0575133562088013, + "learning_rate": 1.135654261704682e-05, + "loss": 0.4591, + "step": 19329 + }, + { + "epoch": 24.7424, + "grad_norm": 1.1829102039337158, + "learning_rate": 1.1354541816726691e-05, + "loss": 0.4912, + "step": 19330 + }, + { + "epoch": 24.74368, + "grad_norm": 1.154536247253418, + "learning_rate": 1.1352541016406563e-05, + "loss": 0.5155, + "step": 19331 + }, + { + "epoch": 24.74496, + "grad_norm": 1.1477676630020142, + "learning_rate": 1.1350540216086435e-05, + "loss": 0.4689, + "step": 19332 + }, + { + "epoch": 24.74624, + "grad_norm": 1.1423364877700806, + "learning_rate": 1.1348539415766307e-05, + "loss": 0.4852, + "step": 19333 + }, + { + "epoch": 24.74752, + "grad_norm": 1.1712870597839355, + "learning_rate": 1.1346538615446179e-05, + "loss": 0.4899, + "step": 19334 + }, + { + "epoch": 24.7488, + "grad_norm": 1.229701042175293, + "learning_rate": 1.134453781512605e-05, + "loss": 0.4972, + "step": 19335 + }, + { + "epoch": 24.75008, + "grad_norm": 1.098663330078125, + "learning_rate": 1.1342537014805924e-05, + "loss": 0.4796, + "step": 19336 + }, + { + "epoch": 24.75136, + "grad_norm": 1.0754587650299072, + "learning_rate": 1.1340536214485794e-05, + "loss": 0.4549, + "step": 19337 + }, + { + "epoch": 24.75264, + "grad_norm": 1.1610110998153687, + "learning_rate": 1.1338535414165668e-05, + "loss": 0.4837, + "step": 19338 + }, + { + "epoch": 24.75392, + "grad_norm": 1.2524155378341675, + "learning_rate": 1.1336534613845538e-05, + "loss": 0.5235, + "step": 19339 + }, + { + "epoch": 24.7552, + "grad_norm": 1.139482021331787, + "learning_rate": 1.1334533813525411e-05, + "loss": 0.5018, + "step": 19340 + }, + { + "epoch": 24.75648, + "grad_norm": 1.2066160440444946, + "learning_rate": 1.1332533013205282e-05, + "loss": 0.5218, + "step": 19341 + }, + { + "epoch": 24.75776, + "grad_norm": 1.0574979782104492, + "learning_rate": 
1.1330532212885155e-05, + "loss": 0.4569, + "step": 19342 + }, + { + "epoch": 24.75904, + "grad_norm": 1.1374906301498413, + "learning_rate": 1.1328531412565027e-05, + "loss": 0.4353, + "step": 19343 + }, + { + "epoch": 24.76032, + "grad_norm": 1.135045051574707, + "learning_rate": 1.1326530612244899e-05, + "loss": 0.4811, + "step": 19344 + }, + { + "epoch": 24.7616, + "grad_norm": 1.1101502180099487, + "learning_rate": 1.132452981192477e-05, + "loss": 0.4816, + "step": 19345 + }, + { + "epoch": 24.76288, + "grad_norm": 1.1571624279022217, + "learning_rate": 1.1322529011604643e-05, + "loss": 0.4853, + "step": 19346 + }, + { + "epoch": 24.76416, + "grad_norm": 1.216991662979126, + "learning_rate": 1.1320528211284514e-05, + "loss": 0.4746, + "step": 19347 + }, + { + "epoch": 24.76544, + "grad_norm": 1.1886634826660156, + "learning_rate": 1.1318527410964386e-05, + "loss": 0.4852, + "step": 19348 + }, + { + "epoch": 24.76672, + "grad_norm": 1.2373932600021362, + "learning_rate": 1.1316526610644258e-05, + "loss": 0.463, + "step": 19349 + }, + { + "epoch": 24.768, + "grad_norm": 1.2170404195785522, + "learning_rate": 1.1314525810324132e-05, + "loss": 0.5278, + "step": 19350 + }, + { + "epoch": 24.76928, + "grad_norm": 1.1271648406982422, + "learning_rate": 1.1312525010004002e-05, + "loss": 0.4666, + "step": 19351 + }, + { + "epoch": 24.77056, + "grad_norm": 1.2704542875289917, + "learning_rate": 1.1310524209683875e-05, + "loss": 0.541, + "step": 19352 + }, + { + "epoch": 24.77184, + "grad_norm": 1.1266765594482422, + "learning_rate": 1.1308523409363746e-05, + "loss": 0.4432, + "step": 19353 + }, + { + "epoch": 24.77312, + "grad_norm": 1.1655040979385376, + "learning_rate": 1.1306522609043619e-05, + "loss": 0.4932, + "step": 19354 + }, + { + "epoch": 24.7744, + "grad_norm": 1.0827305316925049, + "learning_rate": 1.130452180872349e-05, + "loss": 0.4499, + "step": 19355 + }, + { + "epoch": 24.77568, + "grad_norm": 1.1636383533477783, + "learning_rate": 
1.1302521008403363e-05, + "loss": 0.5046, + "step": 19356 + }, + { + "epoch": 24.77696, + "grad_norm": 1.1613502502441406, + "learning_rate": 1.1300520208083235e-05, + "loss": 0.454, + "step": 19357 + }, + { + "epoch": 24.77824, + "grad_norm": 1.1521443128585815, + "learning_rate": 1.1298519407763106e-05, + "loss": 0.4479, + "step": 19358 + }, + { + "epoch": 24.77952, + "grad_norm": 1.129206657409668, + "learning_rate": 1.1296518607442978e-05, + "loss": 0.4722, + "step": 19359 + }, + { + "epoch": 24.7808, + "grad_norm": 1.1790378093719482, + "learning_rate": 1.129451780712285e-05, + "loss": 0.4682, + "step": 19360 + }, + { + "epoch": 24.78208, + "grad_norm": 1.1063263416290283, + "learning_rate": 1.1292517006802722e-05, + "loss": 0.4484, + "step": 19361 + }, + { + "epoch": 24.78336, + "grad_norm": 1.1789052486419678, + "learning_rate": 1.1290516206482594e-05, + "loss": 0.4894, + "step": 19362 + }, + { + "epoch": 24.78464, + "grad_norm": 1.210119605064392, + "learning_rate": 1.1288515406162466e-05, + "loss": 0.533, + "step": 19363 + }, + { + "epoch": 24.78592, + "grad_norm": 1.0903270244598389, + "learning_rate": 1.1286514605842338e-05, + "loss": 0.4602, + "step": 19364 + }, + { + "epoch": 24.7872, + "grad_norm": 1.1570169925689697, + "learning_rate": 1.128451380552221e-05, + "loss": 0.4611, + "step": 19365 + }, + { + "epoch": 24.78848, + "grad_norm": 1.0663702487945557, + "learning_rate": 1.1282513005202081e-05, + "loss": 0.4379, + "step": 19366 + }, + { + "epoch": 24.78976, + "grad_norm": 1.067586898803711, + "learning_rate": 1.1280512204881953e-05, + "loss": 0.4306, + "step": 19367 + }, + { + "epoch": 24.79104, + "grad_norm": 1.1584056615829468, + "learning_rate": 1.1278511404561825e-05, + "loss": 0.4455, + "step": 19368 + }, + { + "epoch": 24.79232, + "grad_norm": 1.188567042350769, + "learning_rate": 1.1276510604241697e-05, + "loss": 0.5147, + "step": 19369 + }, + { + "epoch": 24.7936, + "grad_norm": 1.2147189378738403, + "learning_rate": 
1.1274509803921569e-05, + "loss": 0.4957, + "step": 19370 + }, + { + "epoch": 24.79488, + "grad_norm": 1.1530333757400513, + "learning_rate": 1.127250900360144e-05, + "loss": 0.4771, + "step": 19371 + }, + { + "epoch": 24.79616, + "grad_norm": 1.1799629926681519, + "learning_rate": 1.1270508203281312e-05, + "loss": 0.5172, + "step": 19372 + }, + { + "epoch": 24.79744, + "grad_norm": 1.1808135509490967, + "learning_rate": 1.1268507402961186e-05, + "loss": 0.4899, + "step": 19373 + }, + { + "epoch": 24.79872, + "grad_norm": 1.169081687927246, + "learning_rate": 1.1266506602641056e-05, + "loss": 0.4794, + "step": 19374 + }, + { + "epoch": 24.8, + "grad_norm": 1.2258483171463013, + "learning_rate": 1.126450580232093e-05, + "loss": 0.4697, + "step": 19375 + }, + { + "epoch": 24.80128, + "grad_norm": 1.1741260290145874, + "learning_rate": 1.12625050020008e-05, + "loss": 0.5146, + "step": 19376 + }, + { + "epoch": 24.80256, + "grad_norm": 1.1346204280853271, + "learning_rate": 1.1260504201680673e-05, + "loss": 0.4403, + "step": 19377 + }, + { + "epoch": 24.80384, + "grad_norm": 1.1797999143600464, + "learning_rate": 1.1258503401360544e-05, + "loss": 0.449, + "step": 19378 + }, + { + "epoch": 24.80512, + "grad_norm": 1.2282897233963013, + "learning_rate": 1.1256502601040417e-05, + "loss": 0.476, + "step": 19379 + }, + { + "epoch": 24.8064, + "grad_norm": 1.1009891033172607, + "learning_rate": 1.1254501800720289e-05, + "loss": 0.498, + "step": 19380 + }, + { + "epoch": 24.80768, + "grad_norm": 1.0904237031936646, + "learning_rate": 1.1252501000400161e-05, + "loss": 0.4462, + "step": 19381 + }, + { + "epoch": 24.80896, + "grad_norm": 1.162895679473877, + "learning_rate": 1.1250500200080033e-05, + "loss": 0.4788, + "step": 19382 + }, + { + "epoch": 24.81024, + "grad_norm": 1.1506489515304565, + "learning_rate": 1.1248499399759905e-05, + "loss": 0.4834, + "step": 19383 + }, + { + "epoch": 24.81152, + "grad_norm": 1.1401734352111816, + "learning_rate": 1.1246498599439776e-05, + 
"loss": 0.4822, + "step": 19384 + }, + { + "epoch": 24.8128, + "grad_norm": 1.0999462604522705, + "learning_rate": 1.1244497799119648e-05, + "loss": 0.4681, + "step": 19385 + }, + { + "epoch": 24.81408, + "grad_norm": 1.060306429862976, + "learning_rate": 1.124249699879952e-05, + "loss": 0.4561, + "step": 19386 + }, + { + "epoch": 24.81536, + "grad_norm": 1.2425814867019653, + "learning_rate": 1.1240496198479392e-05, + "loss": 0.5074, + "step": 19387 + }, + { + "epoch": 24.81664, + "grad_norm": 1.209997534751892, + "learning_rate": 1.1238495398159264e-05, + "loss": 0.5151, + "step": 19388 + }, + { + "epoch": 24.81792, + "grad_norm": 1.1693835258483887, + "learning_rate": 1.1236494597839137e-05, + "loss": 0.5006, + "step": 19389 + }, + { + "epoch": 24.8192, + "grad_norm": 1.1733357906341553, + "learning_rate": 1.1234493797519008e-05, + "loss": 0.5059, + "step": 19390 + }, + { + "epoch": 24.82048, + "grad_norm": 1.16240656375885, + "learning_rate": 1.1232492997198881e-05, + "loss": 0.458, + "step": 19391 + }, + { + "epoch": 24.82176, + "grad_norm": 1.0904651880264282, + "learning_rate": 1.1230492196878751e-05, + "loss": 0.4414, + "step": 19392 + }, + { + "epoch": 24.82304, + "grad_norm": 1.1327263116836548, + "learning_rate": 1.1228491396558625e-05, + "loss": 0.4849, + "step": 19393 + }, + { + "epoch": 24.82432, + "grad_norm": 1.2412240505218506, + "learning_rate": 1.1226490596238495e-05, + "loss": 0.5405, + "step": 19394 + }, + { + "epoch": 24.8256, + "grad_norm": 1.1290397644042969, + "learning_rate": 1.1224489795918369e-05, + "loss": 0.4745, + "step": 19395 + }, + { + "epoch": 24.82688, + "grad_norm": 1.1640607118606567, + "learning_rate": 1.122248899559824e-05, + "loss": 0.4839, + "step": 19396 + }, + { + "epoch": 24.82816, + "grad_norm": 1.1949489116668701, + "learning_rate": 1.1220488195278112e-05, + "loss": 0.4592, + "step": 19397 + }, + { + "epoch": 24.829439999999998, + "grad_norm": 1.164475440979004, + "learning_rate": 1.1218487394957984e-05, + "loss": 
0.4737, + "step": 19398 + }, + { + "epoch": 24.83072, + "grad_norm": 1.185512661933899, + "learning_rate": 1.1216486594637856e-05, + "loss": 0.451, + "step": 19399 + }, + { + "epoch": 24.832, + "grad_norm": 1.096878170967102, + "learning_rate": 1.1214485794317728e-05, + "loss": 0.5043, + "step": 19400 + }, + { + "epoch": 24.83328, + "grad_norm": 1.2109886407852173, + "learning_rate": 1.12124849939976e-05, + "loss": 0.569, + "step": 19401 + }, + { + "epoch": 24.83456, + "grad_norm": 1.1665548086166382, + "learning_rate": 1.1210484193677472e-05, + "loss": 0.5026, + "step": 19402 + }, + { + "epoch": 24.83584, + "grad_norm": 1.1184903383255005, + "learning_rate": 1.1208483393357343e-05, + "loss": 0.4835, + "step": 19403 + }, + { + "epoch": 24.83712, + "grad_norm": 1.2306338548660278, + "learning_rate": 1.1206482593037215e-05, + "loss": 0.4726, + "step": 19404 + }, + { + "epoch": 24.8384, + "grad_norm": 1.2000937461853027, + "learning_rate": 1.1204481792717087e-05, + "loss": 0.4827, + "step": 19405 + }, + { + "epoch": 24.83968, + "grad_norm": 1.1614446640014648, + "learning_rate": 1.1202480992396959e-05, + "loss": 0.4524, + "step": 19406 + }, + { + "epoch": 24.84096, + "grad_norm": 1.1615883111953735, + "learning_rate": 1.1200480192076831e-05, + "loss": 0.5126, + "step": 19407 + }, + { + "epoch": 24.84224, + "grad_norm": 1.1704049110412598, + "learning_rate": 1.1198479391756703e-05, + "loss": 0.4715, + "step": 19408 + }, + { + "epoch": 24.84352, + "grad_norm": 1.2115988731384277, + "learning_rate": 1.1196478591436575e-05, + "loss": 0.4983, + "step": 19409 + }, + { + "epoch": 24.8448, + "grad_norm": 1.1601941585540771, + "learning_rate": 1.1194477791116448e-05, + "loss": 0.5095, + "step": 19410 + }, + { + "epoch": 24.84608, + "grad_norm": 1.1305478811264038, + "learning_rate": 1.1192476990796318e-05, + "loss": 0.4968, + "step": 19411 + }, + { + "epoch": 24.84736, + "grad_norm": 1.243231177330017, + "learning_rate": 1.1190476190476192e-05, + "loss": 0.4912, + "step": 
19412 + }, + { + "epoch": 24.84864, + "grad_norm": 1.0542876720428467, + "learning_rate": 1.1188475390156062e-05, + "loss": 0.4489, + "step": 19413 + }, + { + "epoch": 24.84992, + "grad_norm": 1.184417486190796, + "learning_rate": 1.1186474589835936e-05, + "loss": 0.4938, + "step": 19414 + }, + { + "epoch": 24.8512, + "grad_norm": 1.2762627601623535, + "learning_rate": 1.1184473789515806e-05, + "loss": 0.5049, + "step": 19415 + }, + { + "epoch": 24.85248, + "grad_norm": 1.0796477794647217, + "learning_rate": 1.118247298919568e-05, + "loss": 0.4241, + "step": 19416 + }, + { + "epoch": 24.85376, + "grad_norm": 1.2161225080490112, + "learning_rate": 1.118047218887555e-05, + "loss": 0.5025, + "step": 19417 + }, + { + "epoch": 24.85504, + "grad_norm": 1.180956482887268, + "learning_rate": 1.1178471388555423e-05, + "loss": 0.4798, + "step": 19418 + }, + { + "epoch": 24.85632, + "grad_norm": 1.1922006607055664, + "learning_rate": 1.1176470588235295e-05, + "loss": 0.4928, + "step": 19419 + }, + { + "epoch": 24.8576, + "grad_norm": 1.1288496255874634, + "learning_rate": 1.1174469787915167e-05, + "loss": 0.4416, + "step": 19420 + }, + { + "epoch": 24.85888, + "grad_norm": 1.1524951457977295, + "learning_rate": 1.1172468987595039e-05, + "loss": 0.4827, + "step": 19421 + }, + { + "epoch": 24.86016, + "grad_norm": 1.239288330078125, + "learning_rate": 1.117046818727491e-05, + "loss": 0.4972, + "step": 19422 + }, + { + "epoch": 24.86144, + "grad_norm": 1.1284489631652832, + "learning_rate": 1.1168467386954782e-05, + "loss": 0.4293, + "step": 19423 + }, + { + "epoch": 24.86272, + "grad_norm": 1.178391456604004, + "learning_rate": 1.1166466586634654e-05, + "loss": 0.4698, + "step": 19424 + }, + { + "epoch": 24.864, + "grad_norm": 1.1781879663467407, + "learning_rate": 1.1164465786314526e-05, + "loss": 0.4427, + "step": 19425 + }, + { + "epoch": 24.86528, + "grad_norm": 1.1736154556274414, + "learning_rate": 1.11624649859944e-05, + "loss": 0.4899, + "step": 19426 + }, + { + 
"epoch": 24.86656, + "grad_norm": 1.210067868232727, + "learning_rate": 1.116046418567427e-05, + "loss": 0.4746, + "step": 19427 + }, + { + "epoch": 24.86784, + "grad_norm": 1.1524900197982788, + "learning_rate": 1.1158463385354143e-05, + "loss": 0.4755, + "step": 19428 + }, + { + "epoch": 24.86912, + "grad_norm": 1.077675461769104, + "learning_rate": 1.1156462585034013e-05, + "loss": 0.4248, + "step": 19429 + }, + { + "epoch": 24.8704, + "grad_norm": 1.1400989294052124, + "learning_rate": 1.1154461784713887e-05, + "loss": 0.4719, + "step": 19430 + }, + { + "epoch": 24.87168, + "grad_norm": 1.0998234748840332, + "learning_rate": 1.1152460984393757e-05, + "loss": 0.4563, + "step": 19431 + }, + { + "epoch": 24.87296, + "grad_norm": 1.2079157829284668, + "learning_rate": 1.115046018407363e-05, + "loss": 0.508, + "step": 19432 + }, + { + "epoch": 24.87424, + "grad_norm": 1.118981122970581, + "learning_rate": 1.1148459383753503e-05, + "loss": 0.4577, + "step": 19433 + }, + { + "epoch": 24.87552, + "grad_norm": 1.1195893287658691, + "learning_rate": 1.1146458583433374e-05, + "loss": 0.4688, + "step": 19434 + }, + { + "epoch": 24.8768, + "grad_norm": 1.1069542169570923, + "learning_rate": 1.1144457783113246e-05, + "loss": 0.4522, + "step": 19435 + }, + { + "epoch": 24.87808, + "grad_norm": 1.2050132751464844, + "learning_rate": 1.1142456982793118e-05, + "loss": 0.4965, + "step": 19436 + }, + { + "epoch": 24.87936, + "grad_norm": 1.0464030504226685, + "learning_rate": 1.114045618247299e-05, + "loss": 0.4366, + "step": 19437 + }, + { + "epoch": 24.88064, + "grad_norm": 1.1153632402420044, + "learning_rate": 1.1138455382152862e-05, + "loss": 0.454, + "step": 19438 + }, + { + "epoch": 24.88192, + "grad_norm": 1.178065299987793, + "learning_rate": 1.1136454581832734e-05, + "loss": 0.4914, + "step": 19439 + }, + { + "epoch": 24.8832, + "grad_norm": 1.2401835918426514, + "learning_rate": 1.1134453781512606e-05, + "loss": 0.4877, + "step": 19440 + }, + { + "epoch": 24.88448, + 
"grad_norm": 1.2349568605422974, + "learning_rate": 1.1132452981192477e-05, + "loss": 0.5027, + "step": 19441 + }, + { + "epoch": 24.88576, + "grad_norm": 1.188691258430481, + "learning_rate": 1.113045218087235e-05, + "loss": 0.4862, + "step": 19442 + }, + { + "epoch": 24.88704, + "grad_norm": 1.2274821996688843, + "learning_rate": 1.1128451380552221e-05, + "loss": 0.489, + "step": 19443 + }, + { + "epoch": 24.88832, + "grad_norm": 1.2157435417175293, + "learning_rate": 1.1126450580232093e-05, + "loss": 0.518, + "step": 19444 + }, + { + "epoch": 24.8896, + "grad_norm": 1.1647244691848755, + "learning_rate": 1.1124449779911965e-05, + "loss": 0.5051, + "step": 19445 + }, + { + "epoch": 24.89088, + "grad_norm": 1.1653064489364624, + "learning_rate": 1.1122448979591837e-05, + "loss": 0.4646, + "step": 19446 + }, + { + "epoch": 24.89216, + "grad_norm": 1.1139925718307495, + "learning_rate": 1.1120448179271709e-05, + "loss": 0.4621, + "step": 19447 + }, + { + "epoch": 24.89344, + "grad_norm": 1.1649415493011475, + "learning_rate": 1.111844737895158e-05, + "loss": 0.4833, + "step": 19448 + }, + { + "epoch": 24.89472, + "grad_norm": 1.2173422574996948, + "learning_rate": 1.1116446578631454e-05, + "loss": 0.5032, + "step": 19449 + }, + { + "epoch": 24.896, + "grad_norm": 1.1215828657150269, + "learning_rate": 1.1114445778311324e-05, + "loss": 0.4903, + "step": 19450 + }, + { + "epoch": 24.89728, + "grad_norm": 1.179179310798645, + "learning_rate": 1.1112444977991198e-05, + "loss": 0.4854, + "step": 19451 + }, + { + "epoch": 24.89856, + "grad_norm": 1.1343804597854614, + "learning_rate": 1.1110444177671068e-05, + "loss": 0.4477, + "step": 19452 + }, + { + "epoch": 24.89984, + "grad_norm": 1.1283519268035889, + "learning_rate": 1.1108443377350941e-05, + "loss": 0.4814, + "step": 19453 + }, + { + "epoch": 24.90112, + "grad_norm": 1.117431879043579, + "learning_rate": 1.1106442577030812e-05, + "loss": 0.4512, + "step": 19454 + }, + { + "epoch": 24.9024, + "grad_norm": 
1.1315174102783203, + "learning_rate": 1.1104441776710685e-05, + "loss": 0.4423, + "step": 19455 + }, + { + "epoch": 24.90368, + "grad_norm": 1.2112675905227661, + "learning_rate": 1.1102440976390557e-05, + "loss": 0.5169, + "step": 19456 + }, + { + "epoch": 24.90496, + "grad_norm": 1.1385278701782227, + "learning_rate": 1.1100440176070429e-05, + "loss": 0.4717, + "step": 19457 + }, + { + "epoch": 24.90624, + "grad_norm": 1.1794296503067017, + "learning_rate": 1.10984393757503e-05, + "loss": 0.503, + "step": 19458 + }, + { + "epoch": 24.90752, + "grad_norm": 1.1522889137268066, + "learning_rate": 1.1096438575430172e-05, + "loss": 0.4582, + "step": 19459 + }, + { + "epoch": 24.9088, + "grad_norm": 1.233756422996521, + "learning_rate": 1.1094437775110044e-05, + "loss": 0.521, + "step": 19460 + }, + { + "epoch": 24.91008, + "grad_norm": 1.1709715127944946, + "learning_rate": 1.1092436974789916e-05, + "loss": 0.492, + "step": 19461 + }, + { + "epoch": 24.91136, + "grad_norm": 1.193808913230896, + "learning_rate": 1.1090436174469788e-05, + "loss": 0.4568, + "step": 19462 + }, + { + "epoch": 24.91264, + "grad_norm": 1.14645254611969, + "learning_rate": 1.1088435374149662e-05, + "loss": 0.4904, + "step": 19463 + }, + { + "epoch": 24.91392, + "grad_norm": 1.100979208946228, + "learning_rate": 1.1086434573829532e-05, + "loss": 0.4828, + "step": 19464 + }, + { + "epoch": 24.9152, + "grad_norm": 1.1360437870025635, + "learning_rate": 1.1084433773509405e-05, + "loss": 0.4519, + "step": 19465 + }, + { + "epoch": 24.91648, + "grad_norm": 1.0801247358322144, + "learning_rate": 1.1082432973189275e-05, + "loss": 0.4426, + "step": 19466 + }, + { + "epoch": 24.91776, + "grad_norm": 1.2544835805892944, + "learning_rate": 1.1080432172869149e-05, + "loss": 0.5123, + "step": 19467 + }, + { + "epoch": 24.91904, + "grad_norm": 1.1919646263122559, + "learning_rate": 1.107843137254902e-05, + "loss": 0.4854, + "step": 19468 + }, + { + "epoch": 24.92032, + "grad_norm": 1.1021586656570435, + 
"learning_rate": 1.1076430572228893e-05, + "loss": 0.5155, + "step": 19469 + }, + { + "epoch": 24.9216, + "grad_norm": 1.230746865272522, + "learning_rate": 1.1074429771908763e-05, + "loss": 0.5456, + "step": 19470 + }, + { + "epoch": 24.92288, + "grad_norm": 1.0849162340164185, + "learning_rate": 1.1072428971588636e-05, + "loss": 0.4524, + "step": 19471 + }, + { + "epoch": 24.92416, + "grad_norm": 1.1267224550247192, + "learning_rate": 1.1070428171268508e-05, + "loss": 0.4821, + "step": 19472 + }, + { + "epoch": 24.925440000000002, + "grad_norm": 1.134163737297058, + "learning_rate": 1.106842737094838e-05, + "loss": 0.4881, + "step": 19473 + }, + { + "epoch": 24.92672, + "grad_norm": 1.142830729484558, + "learning_rate": 1.1066426570628252e-05, + "loss": 0.5139, + "step": 19474 + }, + { + "epoch": 24.928, + "grad_norm": 1.1527884006500244, + "learning_rate": 1.1064425770308124e-05, + "loss": 0.4652, + "step": 19475 + }, + { + "epoch": 24.92928, + "grad_norm": 1.1849658489227295, + "learning_rate": 1.1062424969987996e-05, + "loss": 0.5047, + "step": 19476 + }, + { + "epoch": 24.93056, + "grad_norm": 1.1960779428482056, + "learning_rate": 1.1060424169667868e-05, + "loss": 0.4927, + "step": 19477 + }, + { + "epoch": 24.93184, + "grad_norm": 1.118734359741211, + "learning_rate": 1.105842336934774e-05, + "loss": 0.463, + "step": 19478 + }, + { + "epoch": 24.93312, + "grad_norm": 1.1411648988723755, + "learning_rate": 1.1056422569027611e-05, + "loss": 0.4535, + "step": 19479 + }, + { + "epoch": 24.9344, + "grad_norm": 1.1188914775848389, + "learning_rate": 1.1054421768707483e-05, + "loss": 0.4437, + "step": 19480 + }, + { + "epoch": 24.93568, + "grad_norm": 1.1471185684204102, + "learning_rate": 1.1052420968387355e-05, + "loss": 0.4782, + "step": 19481 + }, + { + "epoch": 24.93696, + "grad_norm": 1.2301186323165894, + "learning_rate": 1.1050420168067227e-05, + "loss": 0.4759, + "step": 19482 + }, + { + "epoch": 24.93824, + "grad_norm": 1.175561547279358, + 
"learning_rate": 1.1048419367747099e-05, + "loss": 0.4798, + "step": 19483 + }, + { + "epoch": 24.93952, + "grad_norm": 1.172258734703064, + "learning_rate": 1.104641856742697e-05, + "loss": 0.465, + "step": 19484 + }, + { + "epoch": 24.9408, + "grad_norm": 1.1110254526138306, + "learning_rate": 1.1044417767106842e-05, + "loss": 0.4597, + "step": 19485 + }, + { + "epoch": 24.94208, + "grad_norm": 1.0964692831039429, + "learning_rate": 1.1042416966786716e-05, + "loss": 0.4654, + "step": 19486 + }, + { + "epoch": 24.94336, + "grad_norm": 1.187997579574585, + "learning_rate": 1.1040416166466586e-05, + "loss": 0.4786, + "step": 19487 + }, + { + "epoch": 24.94464, + "grad_norm": 1.1386297941207886, + "learning_rate": 1.103841536614646e-05, + "loss": 0.4923, + "step": 19488 + }, + { + "epoch": 24.94592, + "grad_norm": 1.0917891263961792, + "learning_rate": 1.103641456582633e-05, + "loss": 0.4856, + "step": 19489 + }, + { + "epoch": 24.9472, + "grad_norm": 1.1903581619262695, + "learning_rate": 1.1034413765506203e-05, + "loss": 0.5102, + "step": 19490 + }, + { + "epoch": 24.94848, + "grad_norm": 1.118826985359192, + "learning_rate": 1.1032412965186074e-05, + "loss": 0.4864, + "step": 19491 + }, + { + "epoch": 24.94976, + "grad_norm": 1.0741007328033447, + "learning_rate": 1.1030412164865947e-05, + "loss": 0.4401, + "step": 19492 + }, + { + "epoch": 24.95104, + "grad_norm": 1.154763102531433, + "learning_rate": 1.1028411364545819e-05, + "loss": 0.493, + "step": 19493 + }, + { + "epoch": 24.95232, + "grad_norm": 1.167617678642273, + "learning_rate": 1.1026410564225691e-05, + "loss": 0.4935, + "step": 19494 + }, + { + "epoch": 24.9536, + "grad_norm": 1.1910767555236816, + "learning_rate": 1.1024409763905563e-05, + "loss": 0.5077, + "step": 19495 + }, + { + "epoch": 24.95488, + "grad_norm": 1.1297580003738403, + "learning_rate": 1.1022408963585435e-05, + "loss": 0.5074, + "step": 19496 + }, + { + "epoch": 24.95616, + "grad_norm": 1.1995117664337158, + "learning_rate": 
1.1020408163265306e-05, + "loss": 0.4817, + "step": 19497 + }, + { + "epoch": 24.95744, + "grad_norm": 1.1976964473724365, + "learning_rate": 1.1018407362945178e-05, + "loss": 0.5004, + "step": 19498 + }, + { + "epoch": 24.95872, + "grad_norm": 1.1336443424224854, + "learning_rate": 1.101640656262505e-05, + "loss": 0.46, + "step": 19499 + }, + { + "epoch": 24.96, + "grad_norm": 1.1351187229156494, + "learning_rate": 1.1014405762304922e-05, + "loss": 0.4802, + "step": 19500 + }, + { + "epoch": 24.96128, + "grad_norm": 1.1822195053100586, + "learning_rate": 1.1012404961984794e-05, + "loss": 0.5001, + "step": 19501 + }, + { + "epoch": 24.96256, + "grad_norm": 1.1518561840057373, + "learning_rate": 1.1010404161664667e-05, + "loss": 0.4872, + "step": 19502 + }, + { + "epoch": 24.96384, + "grad_norm": 1.2118054628372192, + "learning_rate": 1.1008403361344538e-05, + "loss": 0.4995, + "step": 19503 + }, + { + "epoch": 24.96512, + "grad_norm": 1.1399140357971191, + "learning_rate": 1.1006402561024411e-05, + "loss": 0.4491, + "step": 19504 + }, + { + "epoch": 24.9664, + "grad_norm": 1.1353464126586914, + "learning_rate": 1.1004401760704281e-05, + "loss": 0.4953, + "step": 19505 + }, + { + "epoch": 24.96768, + "grad_norm": 1.1443747282028198, + "learning_rate": 1.1002400960384155e-05, + "loss": 0.4445, + "step": 19506 + }, + { + "epoch": 24.96896, + "grad_norm": 1.1765371561050415, + "learning_rate": 1.1000400160064025e-05, + "loss": 0.5095, + "step": 19507 + }, + { + "epoch": 24.97024, + "grad_norm": 1.1671823263168335, + "learning_rate": 1.0998399359743899e-05, + "loss": 0.5449, + "step": 19508 + }, + { + "epoch": 24.97152, + "grad_norm": 1.2038657665252686, + "learning_rate": 1.099639855942377e-05, + "loss": 0.4931, + "step": 19509 + }, + { + "epoch": 24.9728, + "grad_norm": 1.1690441370010376, + "learning_rate": 1.0994397759103642e-05, + "loss": 0.4586, + "step": 19510 + }, + { + "epoch": 24.97408, + "grad_norm": 1.184372901916504, + "learning_rate": 
1.0992396958783514e-05, + "loss": 0.4714, + "step": 19511 + }, + { + "epoch": 24.97536, + "grad_norm": 1.1089961528778076, + "learning_rate": 1.0990396158463386e-05, + "loss": 0.4356, + "step": 19512 + }, + { + "epoch": 24.97664, + "grad_norm": 1.1306204795837402, + "learning_rate": 1.0988395358143258e-05, + "loss": 0.4918, + "step": 19513 + }, + { + "epoch": 24.97792, + "grad_norm": 1.166355013847351, + "learning_rate": 1.098639455782313e-05, + "loss": 0.5125, + "step": 19514 + }, + { + "epoch": 24.9792, + "grad_norm": 1.1938666105270386, + "learning_rate": 1.0984393757503002e-05, + "loss": 0.5012, + "step": 19515 + }, + { + "epoch": 24.98048, + "grad_norm": 1.2338180541992188, + "learning_rate": 1.0982392957182875e-05, + "loss": 0.5217, + "step": 19516 + }, + { + "epoch": 24.98176, + "grad_norm": 1.154515027999878, + "learning_rate": 1.0980392156862745e-05, + "loss": 0.497, + "step": 19517 + }, + { + "epoch": 24.98304, + "grad_norm": 1.1719876527786255, + "learning_rate": 1.0978391356542619e-05, + "loss": 0.4643, + "step": 19518 + }, + { + "epoch": 24.98432, + "grad_norm": 1.1815974712371826, + "learning_rate": 1.0976390556222489e-05, + "loss": 0.5102, + "step": 19519 + }, + { + "epoch": 24.9856, + "grad_norm": 1.095108985900879, + "learning_rate": 1.0974389755902363e-05, + "loss": 0.4735, + "step": 19520 + }, + { + "epoch": 24.98688, + "grad_norm": 1.1639820337295532, + "learning_rate": 1.0972388955582233e-05, + "loss": 0.4904, + "step": 19521 + }, + { + "epoch": 24.98816, + "grad_norm": 1.1341108083724976, + "learning_rate": 1.0970388155262106e-05, + "loss": 0.4568, + "step": 19522 + }, + { + "epoch": 24.98944, + "grad_norm": 1.2175263166427612, + "learning_rate": 1.0968387354941978e-05, + "loss": 0.4845, + "step": 19523 + }, + { + "epoch": 24.99072, + "grad_norm": 1.1433970928192139, + "learning_rate": 1.096638655462185e-05, + "loss": 0.4664, + "step": 19524 + }, + { + "epoch": 24.992, + "grad_norm": 1.1078377962112427, + "learning_rate": 
1.0964385754301722e-05, + "loss": 0.4772, + "step": 19525 + }, + { + "epoch": 24.99328, + "grad_norm": 1.181159257888794, + "learning_rate": 1.0962384953981594e-05, + "loss": 0.4969, + "step": 19526 + }, + { + "epoch": 24.99456, + "grad_norm": 1.0896167755126953, + "learning_rate": 1.0960384153661466e-05, + "loss": 0.4691, + "step": 19527 + }, + { + "epoch": 24.99584, + "grad_norm": 1.1512247323989868, + "learning_rate": 1.0958383353341337e-05, + "loss": 0.4367, + "step": 19528 + }, + { + "epoch": 24.99712, + "grad_norm": 1.1622196435928345, + "learning_rate": 1.095638255302121e-05, + "loss": 0.4856, + "step": 19529 + }, + { + "epoch": 24.9984, + "grad_norm": 1.1247361898422241, + "learning_rate": 1.0954381752701081e-05, + "loss": 0.4831, + "step": 19530 + }, + { + "epoch": 24.99968, + "grad_norm": 1.0839389562606812, + "learning_rate": 1.0952380952380953e-05, + "loss": 0.4514, + "step": 19531 + }, + { + "epoch": 25.00096, + "grad_norm": null, + "learning_rate": 1.0952380952380953e-05, + "loss": 0.8962, + "step": 19532 + }, + { + "epoch": 25.00224, + "grad_norm": 1.1649938821792603, + "learning_rate": 1.0950380152060825e-05, + "loss": 0.4679, + "step": 19533 + }, + { + "epoch": 25.00352, + "grad_norm": 1.159871220588684, + "learning_rate": 1.0948379351740697e-05, + "loss": 0.4861, + "step": 19534 + }, + { + "epoch": 25.0048, + "grad_norm": 1.1205157041549683, + "learning_rate": 1.0946378551420569e-05, + "loss": 0.4473, + "step": 19535 + }, + { + "epoch": 25.00608, + "grad_norm": 1.156924843788147, + "learning_rate": 1.094437775110044e-05, + "loss": 0.4897, + "step": 19536 + }, + { + "epoch": 25.00736, + "grad_norm": 1.0987516641616821, + "learning_rate": 1.0942376950780312e-05, + "loss": 0.4423, + "step": 19537 + }, + { + "epoch": 25.00864, + "grad_norm": 1.0584564208984375, + "learning_rate": 1.0940376150460184e-05, + "loss": 0.4282, + "step": 19538 + }, + { + "epoch": 25.00992, + "grad_norm": 1.0948853492736816, + "learning_rate": 1.0938375350140056e-05, +
"loss": 0.453, + "step": 19539 + }, + { + "epoch": 25.0112, + "grad_norm": 1.1634764671325684, + "learning_rate": 1.093637454981993e-05, + "loss": 0.4743, + "step": 19540 + }, + { + "epoch": 25.01248, + "grad_norm": 1.087397575378418, + "learning_rate": 1.09343737494998e-05, + "loss": 0.4398, + "step": 19541 + }, + { + "epoch": 25.01376, + "grad_norm": 1.1633117198944092, + "learning_rate": 1.0932372949179673e-05, + "loss": 0.4746, + "step": 19542 + }, + { + "epoch": 25.01504, + "grad_norm": 1.2032607793807983, + "learning_rate": 1.0930372148859543e-05, + "loss": 0.4951, + "step": 19543 + }, + { + "epoch": 25.01632, + "grad_norm": 1.1877236366271973, + "learning_rate": 1.0928371348539417e-05, + "loss": 0.5303, + "step": 19544 + }, + { + "epoch": 25.0176, + "grad_norm": 1.0547975301742554, + "learning_rate": 1.0926370548219287e-05, + "loss": 0.4098, + "step": 19545 + }, + { + "epoch": 25.01888, + "grad_norm": 1.1014307737350464, + "learning_rate": 1.092436974789916e-05, + "loss": 0.4382, + "step": 19546 + }, + { + "epoch": 25.02016, + "grad_norm": 1.15842866897583, + "learning_rate": 1.0922368947579032e-05, + "loss": 0.4791, + "step": 19547 + }, + { + "epoch": 25.02144, + "grad_norm": 1.2171627283096313, + "learning_rate": 1.0920368147258904e-05, + "loss": 0.4589, + "step": 19548 + }, + { + "epoch": 25.02272, + "grad_norm": 1.2597172260284424, + "learning_rate": 1.0918367346938776e-05, + "loss": 0.4976, + "step": 19549 + }, + { + "epoch": 25.024, + "grad_norm": 1.1223315000534058, + "learning_rate": 1.0916366546618648e-05, + "loss": 0.4765, + "step": 19550 + }, + { + "epoch": 25.02528, + "grad_norm": 1.0641728639602661, + "learning_rate": 1.091436574629852e-05, + "loss": 0.4378, + "step": 19551 + }, + { + "epoch": 25.02656, + "grad_norm": 1.0940122604370117, + "learning_rate": 1.0912364945978392e-05, + "loss": 0.4407, + "step": 19552 + }, + { + "epoch": 25.02784, + "grad_norm": 1.1914441585540771, + "learning_rate": 1.0910364145658264e-05, + "loss": 0.4541, + 
"step": 19553 + }, + { + "epoch": 25.02912, + "grad_norm": 1.1707878112792969, + "learning_rate": 1.0908363345338135e-05, + "loss": 0.4925, + "step": 19554 + }, + { + "epoch": 25.0304, + "grad_norm": 1.0737802982330322, + "learning_rate": 1.0906362545018007e-05, + "loss": 0.4441, + "step": 19555 + }, + { + "epoch": 25.03168, + "grad_norm": 1.1500251293182373, + "learning_rate": 1.0904361744697881e-05, + "loss": 0.4752, + "step": 19556 + }, + { + "epoch": 25.03296, + "grad_norm": 1.1668425798416138, + "learning_rate": 1.0902360944377751e-05, + "loss": 0.4919, + "step": 19557 + }, + { + "epoch": 25.03424, + "grad_norm": 1.0954525470733643, + "learning_rate": 1.0900360144057625e-05, + "loss": 0.4363, + "step": 19558 + }, + { + "epoch": 25.03552, + "grad_norm": 1.1638271808624268, + "learning_rate": 1.0898359343737495e-05, + "loss": 0.488, + "step": 19559 + }, + { + "epoch": 25.0368, + "grad_norm": 1.1640788316726685, + "learning_rate": 1.0896358543417368e-05, + "loss": 0.4723, + "step": 19560 + }, + { + "epoch": 25.03808, + "grad_norm": 1.1712232828140259, + "learning_rate": 1.0894357743097238e-05, + "loss": 0.4883, + "step": 19561 + }, + { + "epoch": 25.03936, + "grad_norm": 1.1707978248596191, + "learning_rate": 1.0892356942777112e-05, + "loss": 0.5356, + "step": 19562 + }, + { + "epoch": 25.04064, + "grad_norm": 1.095038890838623, + "learning_rate": 1.0890356142456984e-05, + "loss": 0.4394, + "step": 19563 + }, + { + "epoch": 25.04192, + "grad_norm": 1.108280062675476, + "learning_rate": 1.0888355342136856e-05, + "loss": 0.4608, + "step": 19564 + }, + { + "epoch": 25.0432, + "grad_norm": 1.1268192529678345, + "learning_rate": 1.0886354541816728e-05, + "loss": 0.4504, + "step": 19565 + }, + { + "epoch": 25.04448, + "grad_norm": 1.1444103717803955, + "learning_rate": 1.08843537414966e-05, + "loss": 0.4217, + "step": 19566 + }, + { + "epoch": 25.04576, + "grad_norm": 1.1157702207565308, + "learning_rate": 1.0882352941176471e-05, + "loss": 0.4607, + "step": 19567 + }, 
+ { + "epoch": 25.04704, + "grad_norm": 1.1346677541732788, + "learning_rate": 1.0880352140856343e-05, + "loss": 0.4446, + "step": 19568 + }, + { + "epoch": 25.04832, + "grad_norm": 1.1954710483551025, + "learning_rate": 1.0878351340536215e-05, + "loss": 0.4784, + "step": 19569 + }, + { + "epoch": 25.0496, + "grad_norm": 1.1586507558822632, + "learning_rate": 1.0876350540216087e-05, + "loss": 0.457, + "step": 19570 + }, + { + "epoch": 25.05088, + "grad_norm": 1.112011432647705, + "learning_rate": 1.0874349739895959e-05, + "loss": 0.4386, + "step": 19571 + }, + { + "epoch": 25.05216, + "grad_norm": 1.1521170139312744, + "learning_rate": 1.087234893957583e-05, + "loss": 0.4442, + "step": 19572 + }, + { + "epoch": 25.05344, + "grad_norm": 1.1750129461288452, + "learning_rate": 1.0870348139255702e-05, + "loss": 0.4451, + "step": 19573 + }, + { + "epoch": 25.05472, + "grad_norm": 1.1142715215682983, + "learning_rate": 1.0868347338935574e-05, + "loss": 0.4397, + "step": 19574 + }, + { + "epoch": 25.056, + "grad_norm": 1.1580311059951782, + "learning_rate": 1.0866346538615446e-05, + "loss": 0.5067, + "step": 19575 + }, + { + "epoch": 25.05728, + "grad_norm": 1.1484664678573608, + "learning_rate": 1.0864345738295318e-05, + "loss": 0.461, + "step": 19576 + }, + { + "epoch": 25.05856, + "grad_norm": 1.169391393661499, + "learning_rate": 1.0862344937975192e-05, + "loss": 0.4694, + "step": 19577 + }, + { + "epoch": 25.05984, + "grad_norm": 1.1013165712356567, + "learning_rate": 1.0860344137655062e-05, + "loss": 0.4279, + "step": 19578 + }, + { + "epoch": 25.06112, + "grad_norm": 1.248039960861206, + "learning_rate": 1.0858343337334935e-05, + "loss": 0.4867, + "step": 19579 + }, + { + "epoch": 25.0624, + "grad_norm": 1.119541883468628, + "learning_rate": 1.0856342537014805e-05, + "loss": 0.455, + "step": 19580 + }, + { + "epoch": 25.06368, + "grad_norm": 1.1457856893539429, + "learning_rate": 1.0854341736694679e-05, + "loss": 0.4475, + "step": 19581 + }, + { + "epoch": 
25.06496, + "grad_norm": 1.1471192836761475, + "learning_rate": 1.085234093637455e-05, + "loss": 0.4766, + "step": 19582 + }, + { + "epoch": 25.06624, + "grad_norm": 1.1262675523757935, + "learning_rate": 1.0850340136054423e-05, + "loss": 0.4753, + "step": 19583 + }, + { + "epoch": 25.06752, + "grad_norm": 1.182495355606079, + "learning_rate": 1.0848339335734293e-05, + "loss": 0.4704, + "step": 19584 + }, + { + "epoch": 25.0688, + "grad_norm": 1.0830098390579224, + "learning_rate": 1.0846338535414166e-05, + "loss": 0.4645, + "step": 19585 + }, + { + "epoch": 25.07008, + "grad_norm": 1.1021302938461304, + "learning_rate": 1.0844337735094038e-05, + "loss": 0.4325, + "step": 19586 + }, + { + "epoch": 25.07136, + "grad_norm": 1.192583441734314, + "learning_rate": 1.084233693477391e-05, + "loss": 0.5081, + "step": 19587 + }, + { + "epoch": 25.07264, + "grad_norm": 1.1611496210098267, + "learning_rate": 1.0840336134453782e-05, + "loss": 0.4763, + "step": 19588 + }, + { + "epoch": 25.07392, + "grad_norm": 1.1356542110443115, + "learning_rate": 1.0838335334133654e-05, + "loss": 0.4435, + "step": 19589 + }, + { + "epoch": 25.0752, + "grad_norm": 1.1349308490753174, + "learning_rate": 1.0836334533813526e-05, + "loss": 0.4455, + "step": 19590 + }, + { + "epoch": 25.07648, + "grad_norm": 1.1213163137435913, + "learning_rate": 1.0834333733493398e-05, + "loss": 0.4538, + "step": 19591 + }, + { + "epoch": 25.07776, + "grad_norm": 1.151708960533142, + "learning_rate": 1.083233293317327e-05, + "loss": 0.4627, + "step": 19592 + }, + { + "epoch": 25.07904, + "grad_norm": 1.1179898977279663, + "learning_rate": 1.0830332132853143e-05, + "loss": 0.4877, + "step": 19593 + }, + { + "epoch": 25.08032, + "grad_norm": 1.1163991689682007, + "learning_rate": 1.0828331332533013e-05, + "loss": 0.4423, + "step": 19594 + }, + { + "epoch": 25.0816, + "grad_norm": 1.1171714067459106, + "learning_rate": 1.0826330532212887e-05, + "loss": 0.4341, + "step": 19595 + }, + { + "epoch": 25.08288, + 
"grad_norm": 1.216645359992981, + "learning_rate": 1.0824329731892757e-05, + "loss": 0.4985, + "step": 19596 + }, + { + "epoch": 25.08416, + "grad_norm": 1.1512889862060547, + "learning_rate": 1.082232893157263e-05, + "loss": 0.479, + "step": 19597 + }, + { + "epoch": 25.08544, + "grad_norm": 1.210100531578064, + "learning_rate": 1.08203281312525e-05, + "loss": 0.476, + "step": 19598 + }, + { + "epoch": 25.08672, + "grad_norm": 1.1688969135284424, + "learning_rate": 1.0818327330932374e-05, + "loss": 0.4753, + "step": 19599 + }, + { + "epoch": 25.088, + "grad_norm": 1.1013729572296143, + "learning_rate": 1.0816326530612246e-05, + "loss": 0.4628, + "step": 19600 + }, + { + "epoch": 25.08928, + "grad_norm": 1.096455454826355, + "learning_rate": 1.0814325730292118e-05, + "loss": 0.4244, + "step": 19601 + }, + { + "epoch": 25.09056, + "grad_norm": 1.1417826414108276, + "learning_rate": 1.081232492997199e-05, + "loss": 0.4263, + "step": 19602 + }, + { + "epoch": 25.09184, + "grad_norm": 1.2026113271713257, + "learning_rate": 1.0810324129651862e-05, + "loss": 0.4583, + "step": 19603 + }, + { + "epoch": 25.09312, + "grad_norm": 1.1624215841293335, + "learning_rate": 1.0808323329331733e-05, + "loss": 0.464, + "step": 19604 + }, + { + "epoch": 25.0944, + "grad_norm": 1.2777810096740723, + "learning_rate": 1.0806322529011605e-05, + "loss": 0.486, + "step": 19605 + }, + { + "epoch": 25.09568, + "grad_norm": 1.1731245517730713, + "learning_rate": 1.0804321728691477e-05, + "loss": 0.4465, + "step": 19606 + }, + { + "epoch": 25.09696, + "grad_norm": 1.1680535078048706, + "learning_rate": 1.0802320928371349e-05, + "loss": 0.4408, + "step": 19607 + }, + { + "epoch": 25.09824, + "grad_norm": 1.1202332973480225, + "learning_rate": 1.080032012805122e-05, + "loss": 0.4328, + "step": 19608 + }, + { + "epoch": 25.09952, + "grad_norm": 1.1760560274124146, + "learning_rate": 1.0798319327731093e-05, + "loss": 0.4216, + "step": 19609 + }, + { + "epoch": 25.1008, + "grad_norm": 
1.2153133153915405, + "learning_rate": 1.0796318527410965e-05, + "loss": 0.5195, + "step": 19610 + }, + { + "epoch": 25.10208, + "grad_norm": 1.2274070978164673, + "learning_rate": 1.0794317727090836e-05, + "loss": 0.4786, + "step": 19611 + }, + { + "epoch": 25.10336, + "grad_norm": 1.141127109527588, + "learning_rate": 1.0792316926770708e-05, + "loss": 0.4644, + "step": 19612 + }, + { + "epoch": 25.10464, + "grad_norm": 1.1838977336883545, + "learning_rate": 1.079031612645058e-05, + "loss": 0.494, + "step": 19613 + }, + { + "epoch": 25.10592, + "grad_norm": 1.1999531984329224, + "learning_rate": 1.0788315326130452e-05, + "loss": 0.52, + "step": 19614 + }, + { + "epoch": 25.1072, + "grad_norm": 1.1070973873138428, + "learning_rate": 1.0786314525810324e-05, + "loss": 0.4395, + "step": 19615 + }, + { + "epoch": 25.10848, + "grad_norm": 1.1713796854019165, + "learning_rate": 1.0784313725490197e-05, + "loss": 0.4757, + "step": 19616 + }, + { + "epoch": 25.10976, + "grad_norm": 1.1294423341751099, + "learning_rate": 1.0782312925170068e-05, + "loss": 0.4865, + "step": 19617 + }, + { + "epoch": 25.11104, + "grad_norm": 1.1077033281326294, + "learning_rate": 1.0780312124849941e-05, + "loss": 0.4506, + "step": 19618 + }, + { + "epoch": 25.11232, + "grad_norm": 1.2054367065429688, + "learning_rate": 1.0778311324529811e-05, + "loss": 0.4717, + "step": 19619 + }, + { + "epoch": 25.1136, + "grad_norm": 1.1735788583755493, + "learning_rate": 1.0776310524209685e-05, + "loss": 0.479, + "step": 19620 + }, + { + "epoch": 25.11488, + "grad_norm": 1.128080129623413, + "learning_rate": 1.0774309723889555e-05, + "loss": 0.4248, + "step": 19621 + }, + { + "epoch": 25.11616, + "grad_norm": 1.1419153213500977, + "learning_rate": 1.0772308923569429e-05, + "loss": 0.4552, + "step": 19622 + }, + { + "epoch": 25.11744, + "grad_norm": 1.2297865152359009, + "learning_rate": 1.07703081232493e-05, + "loss": 0.4826, + "step": 19623 + }, + { + "epoch": 25.11872, + "grad_norm": 1.1833217144012451, + 
"learning_rate": 1.0768307322929172e-05, + "loss": 0.521, + "step": 19624 + }, + { + "epoch": 25.12, + "grad_norm": 1.152848243713379, + "learning_rate": 1.0766306522609044e-05, + "loss": 0.5335, + "step": 19625 + }, + { + "epoch": 25.12128, + "grad_norm": 1.1874018907546997, + "learning_rate": 1.0764305722288916e-05, + "loss": 0.4737, + "step": 19626 + }, + { + "epoch": 25.12256, + "grad_norm": 1.2371412515640259, + "learning_rate": 1.0762304921968788e-05, + "loss": 0.4999, + "step": 19627 + }, + { + "epoch": 25.12384, + "grad_norm": 1.167389988899231, + "learning_rate": 1.076030412164866e-05, + "loss": 0.4559, + "step": 19628 + }, + { + "epoch": 25.12512, + "grad_norm": 1.1866403818130493, + "learning_rate": 1.0758303321328532e-05, + "loss": 0.4684, + "step": 19629 + }, + { + "epoch": 25.1264, + "grad_norm": 1.1500517129898071, + "learning_rate": 1.0756302521008405e-05, + "loss": 0.4702, + "step": 19630 + }, + { + "epoch": 25.12768, + "grad_norm": 1.1612963676452637, + "learning_rate": 1.0754301720688275e-05, + "loss": 0.4909, + "step": 19631 + }, + { + "epoch": 25.12896, + "grad_norm": 1.1947745084762573, + "learning_rate": 1.0752300920368149e-05, + "loss": 0.4601, + "step": 19632 + }, + { + "epoch": 25.13024, + "grad_norm": 1.2526357173919678, + "learning_rate": 1.0750300120048019e-05, + "loss": 0.4815, + "step": 19633 + }, + { + "epoch": 25.13152, + "grad_norm": 1.1006672382354736, + "learning_rate": 1.0748299319727893e-05, + "loss": 0.4635, + "step": 19634 + }, + { + "epoch": 25.1328, + "grad_norm": 1.0949301719665527, + "learning_rate": 1.0746298519407763e-05, + "loss": 0.4385, + "step": 19635 + }, + { + "epoch": 25.13408, + "grad_norm": 1.2405381202697754, + "learning_rate": 1.0744297719087636e-05, + "loss": 0.5009, + "step": 19636 + }, + { + "epoch": 25.13536, + "grad_norm": 1.1082509756088257, + "learning_rate": 1.0742296918767508e-05, + "loss": 0.4463, + "step": 19637 + }, + { + "epoch": 25.13664, + "grad_norm": 1.1167351007461548, + "learning_rate": 
1.074029611844738e-05, + "loss": 0.4892, + "step": 19638 + }, + { + "epoch": 25.13792, + "grad_norm": 1.158218264579773, + "learning_rate": 1.0738295318127252e-05, + "loss": 0.4539, + "step": 19639 + }, + { + "epoch": 25.1392, + "grad_norm": 1.2034755945205688, + "learning_rate": 1.0736294517807124e-05, + "loss": 0.4988, + "step": 19640 + }, + { + "epoch": 25.14048, + "grad_norm": 1.1195570230484009, + "learning_rate": 1.0734293717486996e-05, + "loss": 0.4505, + "step": 19641 + }, + { + "epoch": 25.14176, + "grad_norm": 1.1278491020202637, + "learning_rate": 1.0732292917166867e-05, + "loss": 0.4631, + "step": 19642 + }, + { + "epoch": 25.14304, + "grad_norm": 1.1615363359451294, + "learning_rate": 1.073029211684674e-05, + "loss": 0.4845, + "step": 19643 + }, + { + "epoch": 25.14432, + "grad_norm": 1.1552554368972778, + "learning_rate": 1.0728291316526611e-05, + "loss": 0.4543, + "step": 19644 + }, + { + "epoch": 25.1456, + "grad_norm": 1.1733779907226562, + "learning_rate": 1.0726290516206483e-05, + "loss": 0.5251, + "step": 19645 + }, + { + "epoch": 25.14688, + "grad_norm": 1.2017656564712524, + "learning_rate": 1.0724289715886355e-05, + "loss": 0.5293, + "step": 19646 + }, + { + "epoch": 25.14816, + "grad_norm": 1.1338574886322021, + "learning_rate": 1.0722288915566227e-05, + "loss": 0.4493, + "step": 19647 + }, + { + "epoch": 25.14944, + "grad_norm": 1.1379152536392212, + "learning_rate": 1.0720288115246099e-05, + "loss": 0.4381, + "step": 19648 + }, + { + "epoch": 25.15072, + "grad_norm": 1.1826634407043457, + "learning_rate": 1.071828731492597e-05, + "loss": 0.4921, + "step": 19649 + }, + { + "epoch": 25.152, + "grad_norm": 1.1363234519958496, + "learning_rate": 1.0716286514605842e-05, + "loss": 0.4593, + "step": 19650 + }, + { + "epoch": 25.15328, + "grad_norm": 1.157832384109497, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.456, + "step": 19651 + }, + { + "epoch": 25.15456, + "grad_norm": 1.1512408256530762, + "learning_rate": 
1.0712284913965586e-05, + "loss": 0.4946, + "step": 19652 + }, + { + "epoch": 25.15584, + "grad_norm": 1.2528743743896484, + "learning_rate": 1.071028411364546e-05, + "loss": 0.5087, + "step": 19653 + }, + { + "epoch": 25.15712, + "grad_norm": 1.1764575242996216, + "learning_rate": 1.070828331332533e-05, + "loss": 0.4661, + "step": 19654 + }, + { + "epoch": 25.1584, + "grad_norm": 1.1859891414642334, + "learning_rate": 1.0706282513005203e-05, + "loss": 0.4623, + "step": 19655 + }, + { + "epoch": 25.15968, + "grad_norm": 1.1567497253417969, + "learning_rate": 1.0704281712685073e-05, + "loss": 0.4522, + "step": 19656 + }, + { + "epoch": 25.16096, + "grad_norm": 1.191056489944458, + "learning_rate": 1.0702280912364947e-05, + "loss": 0.4735, + "step": 19657 + }, + { + "epoch": 25.16224, + "grad_norm": 1.1542645692825317, + "learning_rate": 1.0700280112044817e-05, + "loss": 0.4693, + "step": 19658 + }, + { + "epoch": 25.16352, + "grad_norm": 1.1220331192016602, + "learning_rate": 1.069827931172469e-05, + "loss": 0.4753, + "step": 19659 + }, + { + "epoch": 25.1648, + "grad_norm": 1.1264070272445679, + "learning_rate": 1.0696278511404562e-05, + "loss": 0.4709, + "step": 19660 + }, + { + "epoch": 25.16608, + "grad_norm": 1.16756272315979, + "learning_rate": 1.0694277711084434e-05, + "loss": 0.4573, + "step": 19661 + }, + { + "epoch": 25.16736, + "grad_norm": 1.1602298021316528, + "learning_rate": 1.0692276910764306e-05, + "loss": 0.4363, + "step": 19662 + }, + { + "epoch": 25.16864, + "grad_norm": 1.1670148372650146, + "learning_rate": 1.0690276110444178e-05, + "loss": 0.4925, + "step": 19663 + }, + { + "epoch": 25.16992, + "grad_norm": 1.2356739044189453, + "learning_rate": 1.068827531012405e-05, + "loss": 0.4923, + "step": 19664 + }, + { + "epoch": 25.1712, + "grad_norm": 1.2018803358078003, + "learning_rate": 1.0686274509803922e-05, + "loss": 0.4857, + "step": 19665 + }, + { + "epoch": 25.17248, + "grad_norm": 1.1678217649459839, + "learning_rate": 
1.0684273709483794e-05, + "loss": 0.4493, + "step": 19666 + }, + { + "epoch": 25.17376, + "grad_norm": 1.1933562755584717, + "learning_rate": 1.0682272909163665e-05, + "loss": 0.4712, + "step": 19667 + }, + { + "epoch": 25.17504, + "grad_norm": 1.1472783088684082, + "learning_rate": 1.0680272108843537e-05, + "loss": 0.4889, + "step": 19668 + }, + { + "epoch": 25.17632, + "grad_norm": 1.113418698310852, + "learning_rate": 1.0678271308523411e-05, + "loss": 0.4455, + "step": 19669 + }, + { + "epoch": 25.1776, + "grad_norm": 1.2086471319198608, + "learning_rate": 1.0676270508203281e-05, + "loss": 0.4709, + "step": 19670 + }, + { + "epoch": 25.17888, + "grad_norm": 1.0935802459716797, + "learning_rate": 1.0674269707883155e-05, + "loss": 0.4329, + "step": 19671 + }, + { + "epoch": 25.18016, + "grad_norm": 1.1935173273086548, + "learning_rate": 1.0672268907563025e-05, + "loss": 0.4988, + "step": 19672 + }, + { + "epoch": 25.18144, + "grad_norm": 1.189258337020874, + "learning_rate": 1.0670268107242898e-05, + "loss": 0.4994, + "step": 19673 + }, + { + "epoch": 25.18272, + "grad_norm": 1.2858033180236816, + "learning_rate": 1.0668267306922768e-05, + "loss": 0.5511, + "step": 19674 + }, + { + "epoch": 25.184, + "grad_norm": 1.2078543901443481, + "learning_rate": 1.0666266506602642e-05, + "loss": 0.4853, + "step": 19675 + }, + { + "epoch": 25.18528, + "grad_norm": 1.1853865385055542, + "learning_rate": 1.0664265706282514e-05, + "loss": 0.475, + "step": 19676 + }, + { + "epoch": 25.18656, + "grad_norm": 1.1798455715179443, + "learning_rate": 1.0662264905962386e-05, + "loss": 0.5103, + "step": 19677 + }, + { + "epoch": 25.18784, + "grad_norm": 1.1571396589279175, + "learning_rate": 1.0660264105642258e-05, + "loss": 0.4582, + "step": 19678 + }, + { + "epoch": 25.18912, + "grad_norm": 1.178780198097229, + "learning_rate": 1.065826330532213e-05, + "loss": 0.5142, + "step": 19679 + }, + { + "epoch": 25.1904, + "grad_norm": 1.1873482465744019, + "learning_rate": 
1.0656262505002001e-05, + "loss": 0.4801, + "step": 19680 + }, + { + "epoch": 25.19168, + "grad_norm": 1.1041178703308105, + "learning_rate": 1.0654261704681873e-05, + "loss": 0.4445, + "step": 19681 + }, + { + "epoch": 25.19296, + "grad_norm": 1.1332768201828003, + "learning_rate": 1.0652260904361745e-05, + "loss": 0.444, + "step": 19682 + }, + { + "epoch": 25.19424, + "grad_norm": 1.1968101263046265, + "learning_rate": 1.0650260104041619e-05, + "loss": 0.5094, + "step": 19683 + }, + { + "epoch": 25.19552, + "grad_norm": 1.158816933631897, + "learning_rate": 1.0648259303721489e-05, + "loss": 0.4494, + "step": 19684 + }, + { + "epoch": 25.1968, + "grad_norm": 1.192723274230957, + "learning_rate": 1.0646258503401362e-05, + "loss": 0.4797, + "step": 19685 + }, + { + "epoch": 25.19808, + "grad_norm": 1.202081561088562, + "learning_rate": 1.0644257703081232e-05, + "loss": 0.441, + "step": 19686 + }, + { + "epoch": 25.19936, + "grad_norm": 1.2185006141662598, + "learning_rate": 1.0642256902761106e-05, + "loss": 0.4857, + "step": 19687 + }, + { + "epoch": 25.20064, + "grad_norm": 1.2291009426116943, + "learning_rate": 1.0640256102440976e-05, + "loss": 0.4885, + "step": 19688 + }, + { + "epoch": 25.20192, + "grad_norm": 1.1647063493728638, + "learning_rate": 1.063825530212085e-05, + "loss": 0.4644, + "step": 19689 + }, + { + "epoch": 25.2032, + "grad_norm": 1.197346806526184, + "learning_rate": 1.0636254501800722e-05, + "loss": 0.5212, + "step": 19690 + }, + { + "epoch": 25.20448, + "grad_norm": 1.0721224546432495, + "learning_rate": 1.0634253701480593e-05, + "loss": 0.4732, + "step": 19691 + }, + { + "epoch": 25.20576, + "grad_norm": 1.1543662548065186, + "learning_rate": 1.0632252901160465e-05, + "loss": 0.4853, + "step": 19692 + }, + { + "epoch": 25.20704, + "grad_norm": 1.0685230493545532, + "learning_rate": 1.0630252100840337e-05, + "loss": 0.4494, + "step": 19693 + }, + { + "epoch": 25.20832, + "grad_norm": 1.1875240802764893, + "learning_rate": 
1.0628251300520209e-05, + "loss": 0.4819, + "step": 19694 + }, + { + "epoch": 25.209600000000002, + "grad_norm": 1.1269207000732422, + "learning_rate": 1.0626250500200081e-05, + "loss": 0.4352, + "step": 19695 + }, + { + "epoch": 25.21088, + "grad_norm": 1.164787769317627, + "learning_rate": 1.0624249699879953e-05, + "loss": 0.4917, + "step": 19696 + }, + { + "epoch": 25.21216, + "grad_norm": 1.168983817100525, + "learning_rate": 1.0622248899559825e-05, + "loss": 0.454, + "step": 19697 + }, + { + "epoch": 25.21344, + "grad_norm": 1.1204594373703003, + "learning_rate": 1.0620248099239696e-05, + "loss": 0.428, + "step": 19698 + }, + { + "epoch": 25.21472, + "grad_norm": 1.1424245834350586, + "learning_rate": 1.0618247298919568e-05, + "loss": 0.441, + "step": 19699 + }, + { + "epoch": 25.216, + "grad_norm": 1.2119210958480835, + "learning_rate": 1.061624649859944e-05, + "loss": 0.4883, + "step": 19700 + }, + { + "epoch": 25.21728, + "grad_norm": 1.1396766901016235, + "learning_rate": 1.0614245698279312e-05, + "loss": 0.4449, + "step": 19701 + }, + { + "epoch": 25.21856, + "grad_norm": 1.164551019668579, + "learning_rate": 1.0612244897959184e-05, + "loss": 0.466, + "step": 19702 + }, + { + "epoch": 25.21984, + "grad_norm": 1.1590672731399536, + "learning_rate": 1.0610244097639056e-05, + "loss": 0.4948, + "step": 19703 + }, + { + "epoch": 25.22112, + "grad_norm": 1.2192983627319336, + "learning_rate": 1.0608243297318928e-05, + "loss": 0.463, + "step": 19704 + }, + { + "epoch": 25.2224, + "grad_norm": 1.1451095342636108, + "learning_rate": 1.06062424969988e-05, + "loss": 0.4541, + "step": 19705 + }, + { + "epoch": 25.22368, + "grad_norm": 1.203220009803772, + "learning_rate": 1.0604241696678673e-05, + "loss": 0.4684, + "step": 19706 + }, + { + "epoch": 25.22496, + "grad_norm": 1.1325691938400269, + "learning_rate": 1.0602240896358543e-05, + "loss": 0.5005, + "step": 19707 + }, + { + "epoch": 25.22624, + "grad_norm": 1.106677532196045, + "learning_rate": 
1.0600240096038417e-05, + "loss": 0.5002, + "step": 19708 + }, + { + "epoch": 25.22752, + "grad_norm": 1.176331639289856, + "learning_rate": 1.0598239295718287e-05, + "loss": 0.4872, + "step": 19709 + }, + { + "epoch": 25.2288, + "grad_norm": 1.1356264352798462, + "learning_rate": 1.059623849539816e-05, + "loss": 0.4681, + "step": 19710 + }, + { + "epoch": 25.23008, + "grad_norm": 1.0752949714660645, + "learning_rate": 1.059423769507803e-05, + "loss": 0.4497, + "step": 19711 + }, + { + "epoch": 25.23136, + "grad_norm": 1.1183545589447021, + "learning_rate": 1.0592236894757904e-05, + "loss": 0.4709, + "step": 19712 + }, + { + "epoch": 25.23264, + "grad_norm": 1.1348707675933838, + "learning_rate": 1.0590236094437776e-05, + "loss": 0.4373, + "step": 19713 + }, + { + "epoch": 25.23392, + "grad_norm": 1.201171636581421, + "learning_rate": 1.0588235294117648e-05, + "loss": 0.4347, + "step": 19714 + }, + { + "epoch": 25.2352, + "grad_norm": 1.2042893171310425, + "learning_rate": 1.058623449379752e-05, + "loss": 0.4988, + "step": 19715 + }, + { + "epoch": 25.23648, + "grad_norm": 1.172882318496704, + "learning_rate": 1.0584233693477392e-05, + "loss": 0.4931, + "step": 19716 + }, + { + "epoch": 25.23776, + "grad_norm": 1.1809163093566895, + "learning_rate": 1.0582232893157263e-05, + "loss": 0.463, + "step": 19717 + }, + { + "epoch": 25.23904, + "grad_norm": 1.2052890062332153, + "learning_rate": 1.0580232092837135e-05, + "loss": 0.487, + "step": 19718 + }, + { + "epoch": 25.24032, + "grad_norm": 1.1632746458053589, + "learning_rate": 1.0578231292517007e-05, + "loss": 0.4584, + "step": 19719 + }, + { + "epoch": 25.2416, + "grad_norm": 1.1408315896987915, + "learning_rate": 1.057623049219688e-05, + "loss": 0.4565, + "step": 19720 + }, + { + "epoch": 25.24288, + "grad_norm": 1.1668243408203125, + "learning_rate": 1.057422969187675e-05, + "loss": 0.4432, + "step": 19721 + }, + { + "epoch": 25.24416, + "grad_norm": 1.1259456872940063, + "learning_rate": 1.0572228891556624e-05, 
+ "loss": 0.456, + "step": 19722 + }, + { + "epoch": 25.24544, + "grad_norm": 1.109616994857788, + "learning_rate": 1.0570228091236495e-05, + "loss": 0.4352, + "step": 19723 + }, + { + "epoch": 25.24672, + "grad_norm": 1.1580768823623657, + "learning_rate": 1.0568227290916368e-05, + "loss": 0.451, + "step": 19724 + }, + { + "epoch": 25.248, + "grad_norm": 1.1380645036697388, + "learning_rate": 1.0566226490596238e-05, + "loss": 0.4576, + "step": 19725 + }, + { + "epoch": 25.24928, + "grad_norm": 1.1373761892318726, + "learning_rate": 1.0564225690276112e-05, + "loss": 0.4766, + "step": 19726 + }, + { + "epoch": 25.25056, + "grad_norm": 1.169333577156067, + "learning_rate": 1.0562224889955982e-05, + "loss": 0.5047, + "step": 19727 + }, + { + "epoch": 25.25184, + "grad_norm": 1.1574784517288208, + "learning_rate": 1.0560224089635856e-05, + "loss": 0.4608, + "step": 19728 + }, + { + "epoch": 25.25312, + "grad_norm": 1.2007426023483276, + "learning_rate": 1.0558223289315727e-05, + "loss": 0.4717, + "step": 19729 + }, + { + "epoch": 25.2544, + "grad_norm": 1.2134323120117188, + "learning_rate": 1.05562224889956e-05, + "loss": 0.4757, + "step": 19730 + }, + { + "epoch": 25.25568, + "grad_norm": 1.1803364753723145, + "learning_rate": 1.0554221688675471e-05, + "loss": 0.462, + "step": 19731 + }, + { + "epoch": 25.25696, + "grad_norm": 1.1004109382629395, + "learning_rate": 1.0552220888355343e-05, + "loss": 0.4594, + "step": 19732 + }, + { + "epoch": 25.25824, + "grad_norm": 1.1959086656570435, + "learning_rate": 1.0550220088035215e-05, + "loss": 0.4424, + "step": 19733 + }, + { + "epoch": 25.25952, + "grad_norm": 1.1749048233032227, + "learning_rate": 1.0548219287715087e-05, + "loss": 0.4687, + "step": 19734 + }, + { + "epoch": 25.2608, + "grad_norm": 1.2056117057800293, + "learning_rate": 1.0546218487394959e-05, + "loss": 0.4574, + "step": 19735 + }, + { + "epoch": 25.26208, + "grad_norm": 1.1588826179504395, + "learning_rate": 1.054421768707483e-05, + "loss": 0.4271, + 
"step": 19736 + }, + { + "epoch": 25.26336, + "grad_norm": 1.1901261806488037, + "learning_rate": 1.0542216886754702e-05, + "loss": 0.4621, + "step": 19737 + }, + { + "epoch": 25.26464, + "grad_norm": 1.1712379455566406, + "learning_rate": 1.0540216086434574e-05, + "loss": 0.4774, + "step": 19738 + }, + { + "epoch": 25.26592, + "grad_norm": 1.1799527406692505, + "learning_rate": 1.0538215286114446e-05, + "loss": 0.4345, + "step": 19739 + }, + { + "epoch": 25.2672, + "grad_norm": 1.1726861000061035, + "learning_rate": 1.0536214485794318e-05, + "loss": 0.5192, + "step": 19740 + }, + { + "epoch": 25.26848, + "grad_norm": 1.1405744552612305, + "learning_rate": 1.053421368547419e-05, + "loss": 0.4844, + "step": 19741 + }, + { + "epoch": 25.26976, + "grad_norm": 1.1705641746520996, + "learning_rate": 1.0532212885154062e-05, + "loss": 0.4587, + "step": 19742 + }, + { + "epoch": 25.27104, + "grad_norm": 1.0993807315826416, + "learning_rate": 1.0530212084833935e-05, + "loss": 0.4864, + "step": 19743 + }, + { + "epoch": 25.27232, + "grad_norm": 1.1683118343353271, + "learning_rate": 1.0528211284513805e-05, + "loss": 0.4339, + "step": 19744 + }, + { + "epoch": 25.2736, + "grad_norm": 1.18573796749115, + "learning_rate": 1.0526210484193679e-05, + "loss": 0.5003, + "step": 19745 + }, + { + "epoch": 25.27488, + "grad_norm": 1.1361162662506104, + "learning_rate": 1.0524209683873549e-05, + "loss": 0.4511, + "step": 19746 + }, + { + "epoch": 25.27616, + "grad_norm": 1.144995927810669, + "learning_rate": 1.0522208883553422e-05, + "loss": 0.4936, + "step": 19747 + }, + { + "epoch": 25.27744, + "grad_norm": 1.0969334840774536, + "learning_rate": 1.0520208083233293e-05, + "loss": 0.4298, + "step": 19748 + }, + { + "epoch": 25.27872, + "grad_norm": 1.1504309177398682, + "learning_rate": 1.0518207282913166e-05, + "loss": 0.4984, + "step": 19749 + }, + { + "epoch": 25.28, + "grad_norm": 1.1646525859832764, + "learning_rate": 1.0516206482593036e-05, + "loss": 0.4784, + "step": 19750 + }, + 
{ + "epoch": 25.28128, + "grad_norm": 1.2210896015167236, + "learning_rate": 1.051420568227291e-05, + "loss": 0.5125, + "step": 19751 + }, + { + "epoch": 25.28256, + "grad_norm": 1.1825578212738037, + "learning_rate": 1.0512204881952782e-05, + "loss": 0.4775, + "step": 19752 + }, + { + "epoch": 25.28384, + "grad_norm": 1.098472237586975, + "learning_rate": 1.0510204081632654e-05, + "loss": 0.4602, + "step": 19753 + }, + { + "epoch": 25.28512, + "grad_norm": 1.095916748046875, + "learning_rate": 1.0508203281312525e-05, + "loss": 0.4521, + "step": 19754 + }, + { + "epoch": 25.2864, + "grad_norm": 1.1654385328292847, + "learning_rate": 1.0506202480992397e-05, + "loss": 0.4875, + "step": 19755 + }, + { + "epoch": 25.28768, + "grad_norm": 1.1669584512710571, + "learning_rate": 1.050420168067227e-05, + "loss": 0.4939, + "step": 19756 + }, + { + "epoch": 25.28896, + "grad_norm": 1.1289222240447998, + "learning_rate": 1.0502200880352141e-05, + "loss": 0.4583, + "step": 19757 + }, + { + "epoch": 25.29024, + "grad_norm": 1.213550090789795, + "learning_rate": 1.0500200080032013e-05, + "loss": 0.4899, + "step": 19758 + }, + { + "epoch": 25.29152, + "grad_norm": 1.2919397354125977, + "learning_rate": 1.0498199279711886e-05, + "loss": 0.5269, + "step": 19759 + }, + { + "epoch": 25.2928, + "grad_norm": 1.1487518548965454, + "learning_rate": 1.0496198479391757e-05, + "loss": 0.4528, + "step": 19760 + }, + { + "epoch": 25.29408, + "grad_norm": 1.0663877725601196, + "learning_rate": 1.049419767907163e-05, + "loss": 0.4368, + "step": 19761 + }, + { + "epoch": 25.29536, + "grad_norm": 1.125251293182373, + "learning_rate": 1.04921968787515e-05, + "loss": 0.4331, + "step": 19762 + }, + { + "epoch": 25.29664, + "grad_norm": 1.1998741626739502, + "learning_rate": 1.0490196078431374e-05, + "loss": 0.4779, + "step": 19763 + }, + { + "epoch": 25.29792, + "grad_norm": 1.1586112976074219, + "learning_rate": 1.0488195278111244e-05, + "loss": 0.4594, + "step": 19764 + }, + { + "epoch": 25.2992, 
+ "grad_norm": 1.1433591842651367, + "learning_rate": 1.0486194477791118e-05, + "loss": 0.4665, + "step": 19765 + }, + { + "epoch": 25.30048, + "grad_norm": 1.2368263006210327, + "learning_rate": 1.048419367747099e-05, + "loss": 0.5092, + "step": 19766 + }, + { + "epoch": 25.30176, + "grad_norm": 1.1645898818969727, + "learning_rate": 1.0482192877150861e-05, + "loss": 0.4237, + "step": 19767 + }, + { + "epoch": 25.30304, + "grad_norm": 1.1961017847061157, + "learning_rate": 1.0480192076830733e-05, + "loss": 0.4801, + "step": 19768 + }, + { + "epoch": 25.30432, + "grad_norm": 1.1524591445922852, + "learning_rate": 1.0478191276510605e-05, + "loss": 0.4472, + "step": 19769 + }, + { + "epoch": 25.3056, + "grad_norm": 1.2043436765670776, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.4915, + "step": 19770 + }, + { + "epoch": 25.30688, + "grad_norm": 1.173971176147461, + "learning_rate": 1.0474189675870349e-05, + "loss": 0.464, + "step": 19771 + }, + { + "epoch": 25.30816, + "grad_norm": 1.1863977909088135, + "learning_rate": 1.047218887555022e-05, + "loss": 0.4996, + "step": 19772 + }, + { + "epoch": 25.30944, + "grad_norm": 1.2210062742233276, + "learning_rate": 1.0470188075230092e-05, + "loss": 0.4825, + "step": 19773 + }, + { + "epoch": 25.31072, + "grad_norm": 1.209662914276123, + "learning_rate": 1.0468187274909964e-05, + "loss": 0.4904, + "step": 19774 + }, + { + "epoch": 25.312, + "grad_norm": 1.1289560794830322, + "learning_rate": 1.0466186474589836e-05, + "loss": 0.4321, + "step": 19775 + }, + { + "epoch": 25.31328, + "grad_norm": 1.1407843828201294, + "learning_rate": 1.0464185674269708e-05, + "loss": 0.4982, + "step": 19776 + }, + { + "epoch": 25.31456, + "grad_norm": 1.0865232944488525, + "learning_rate": 1.046218487394958e-05, + "loss": 0.4361, + "step": 19777 + }, + { + "epoch": 25.31584, + "grad_norm": 1.192724585533142, + "learning_rate": 1.0460184073629452e-05, + "loss": 0.5161, + "step": 19778 + }, + { + "epoch": 25.31712, + "grad_norm": 
1.1117703914642334, + "learning_rate": 1.0458183273309324e-05, + "loss": 0.4656, + "step": 19779 + }, + { + "epoch": 25.3184, + "grad_norm": 1.1453944444656372, + "learning_rate": 1.0456182472989195e-05, + "loss": 0.4729, + "step": 19780 + }, + { + "epoch": 25.31968, + "grad_norm": 1.1950432062149048, + "learning_rate": 1.0454181672669067e-05, + "loss": 0.4684, + "step": 19781 + }, + { + "epoch": 25.32096, + "grad_norm": 1.17280912399292, + "learning_rate": 1.0452180872348941e-05, + "loss": 0.4754, + "step": 19782 + }, + { + "epoch": 25.32224, + "grad_norm": 1.2976539134979248, + "learning_rate": 1.0450180072028811e-05, + "loss": 0.5065, + "step": 19783 + }, + { + "epoch": 25.32352, + "grad_norm": 1.2335752248764038, + "learning_rate": 1.0448179271708685e-05, + "loss": 0.4805, + "step": 19784 + }, + { + "epoch": 25.3248, + "grad_norm": 1.1491906642913818, + "learning_rate": 1.0446178471388555e-05, + "loss": 0.424, + "step": 19785 + }, + { + "epoch": 25.32608, + "grad_norm": 1.1486718654632568, + "learning_rate": 1.0444177671068428e-05, + "loss": 0.4615, + "step": 19786 + }, + { + "epoch": 25.32736, + "grad_norm": 1.197849988937378, + "learning_rate": 1.0442176870748298e-05, + "loss": 0.4458, + "step": 19787 + }, + { + "epoch": 25.32864, + "grad_norm": 1.2207767963409424, + "learning_rate": 1.0440176070428172e-05, + "loss": 0.5007, + "step": 19788 + }, + { + "epoch": 25.32992, + "grad_norm": 1.2052980661392212, + "learning_rate": 1.0438175270108044e-05, + "loss": 0.4814, + "step": 19789 + }, + { + "epoch": 25.3312, + "grad_norm": 1.1656453609466553, + "learning_rate": 1.0436174469787916e-05, + "loss": 0.4519, + "step": 19790 + }, + { + "epoch": 25.33248, + "grad_norm": 1.150758981704712, + "learning_rate": 1.0434173669467788e-05, + "loss": 0.4832, + "step": 19791 + }, + { + "epoch": 25.33376, + "grad_norm": 1.1945995092391968, + "learning_rate": 1.043217286914766e-05, + "loss": 0.469, + "step": 19792 + }, + { + "epoch": 25.33504, + "grad_norm": 1.1622135639190674, + 
"learning_rate": 1.0430172068827531e-05, + "loss": 0.4425, + "step": 19793 + }, + { + "epoch": 25.33632, + "grad_norm": 1.1778172254562378, + "learning_rate": 1.0428171268507403e-05, + "loss": 0.4779, + "step": 19794 + }, + { + "epoch": 25.3376, + "grad_norm": 1.2094917297363281, + "learning_rate": 1.0426170468187275e-05, + "loss": 0.4715, + "step": 19795 + }, + { + "epoch": 25.33888, + "grad_norm": 1.206709861755371, + "learning_rate": 1.0424169667867149e-05, + "loss": 0.5091, + "step": 19796 + }, + { + "epoch": 25.34016, + "grad_norm": 1.2187248468399048, + "learning_rate": 1.0422168867547019e-05, + "loss": 0.5013, + "step": 19797 + }, + { + "epoch": 25.34144, + "grad_norm": 1.3126270771026611, + "learning_rate": 1.0420168067226892e-05, + "loss": 0.4873, + "step": 19798 + }, + { + "epoch": 25.34272, + "grad_norm": 1.1935737133026123, + "learning_rate": 1.0418167266906762e-05, + "loss": 0.4794, + "step": 19799 + }, + { + "epoch": 25.344, + "grad_norm": 1.2357202768325806, + "learning_rate": 1.0416166466586636e-05, + "loss": 0.4987, + "step": 19800 + }, + { + "epoch": 25.34528, + "grad_norm": 1.1685526371002197, + "learning_rate": 1.0414165666266506e-05, + "loss": 0.4617, + "step": 19801 + }, + { + "epoch": 25.34656, + "grad_norm": 1.1791237592697144, + "learning_rate": 1.041216486594638e-05, + "loss": 0.4917, + "step": 19802 + }, + { + "epoch": 25.34784, + "grad_norm": 1.1622064113616943, + "learning_rate": 1.0410164065626252e-05, + "loss": 0.4591, + "step": 19803 + }, + { + "epoch": 25.34912, + "grad_norm": 1.1878505945205688, + "learning_rate": 1.0408163265306123e-05, + "loss": 0.483, + "step": 19804 + }, + { + "epoch": 25.3504, + "grad_norm": 1.2232707738876343, + "learning_rate": 1.0406162464985995e-05, + "loss": 0.4873, + "step": 19805 + }, + { + "epoch": 25.35168, + "grad_norm": 1.1818996667861938, + "learning_rate": 1.0404161664665867e-05, + "loss": 0.4899, + "step": 19806 + }, + { + "epoch": 25.35296, + "grad_norm": 1.1730505228042603, + "learning_rate": 
1.0402160864345739e-05, + "loss": 0.4547, + "step": 19807 + }, + { + "epoch": 25.35424, + "grad_norm": 1.2024863958358765, + "learning_rate": 1.040016006402561e-05, + "loss": 0.4791, + "step": 19808 + }, + { + "epoch": 25.35552, + "grad_norm": 1.1775726079940796, + "learning_rate": 1.0398159263705483e-05, + "loss": 0.4802, + "step": 19809 + }, + { + "epoch": 25.3568, + "grad_norm": 1.1456516981124878, + "learning_rate": 1.0396158463385355e-05, + "loss": 0.4666, + "step": 19810 + }, + { + "epoch": 25.35808, + "grad_norm": 1.1130499839782715, + "learning_rate": 1.0394157663065226e-05, + "loss": 0.4674, + "step": 19811 + }, + { + "epoch": 25.35936, + "grad_norm": 1.2441742420196533, + "learning_rate": 1.0392156862745098e-05, + "loss": 0.5162, + "step": 19812 + }, + { + "epoch": 25.36064, + "grad_norm": 1.139827847480774, + "learning_rate": 1.039015606242497e-05, + "loss": 0.4504, + "step": 19813 + }, + { + "epoch": 25.36192, + "grad_norm": 1.1869721412658691, + "learning_rate": 1.0388155262104842e-05, + "loss": 0.5, + "step": 19814 + }, + { + "epoch": 25.3632, + "grad_norm": 1.1606616973876953, + "learning_rate": 1.0386154461784714e-05, + "loss": 0.5037, + "step": 19815 + }, + { + "epoch": 25.36448, + "grad_norm": 1.1421772241592407, + "learning_rate": 1.0384153661464586e-05, + "loss": 0.4588, + "step": 19816 + }, + { + "epoch": 25.36576, + "grad_norm": 1.224739909172058, + "learning_rate": 1.0382152861144458e-05, + "loss": 0.4998, + "step": 19817 + }, + { + "epoch": 25.36704, + "grad_norm": 1.2049306631088257, + "learning_rate": 1.038015206082433e-05, + "loss": 0.5418, + "step": 19818 + }, + { + "epoch": 25.36832, + "grad_norm": 1.1043879985809326, + "learning_rate": 1.0378151260504203e-05, + "loss": 0.4709, + "step": 19819 + }, + { + "epoch": 25.3696, + "grad_norm": 1.1660434007644653, + "learning_rate": 1.0376150460184073e-05, + "loss": 0.48, + "step": 19820 + }, + { + "epoch": 25.37088, + "grad_norm": 1.2284889221191406, + "learning_rate": 1.0374149659863947e-05, 
+ "loss": 0.5137, + "step": 19821 + }, + { + "epoch": 25.37216, + "grad_norm": 1.163232684135437, + "learning_rate": 1.0372148859543817e-05, + "loss": 0.4841, + "step": 19822 + }, + { + "epoch": 25.37344, + "grad_norm": 1.2111849784851074, + "learning_rate": 1.037014805922369e-05, + "loss": 0.4561, + "step": 19823 + }, + { + "epoch": 25.37472, + "grad_norm": 1.1000887155532837, + "learning_rate": 1.036814725890356e-05, + "loss": 0.4191, + "step": 19824 + }, + { + "epoch": 25.376, + "grad_norm": 1.1611400842666626, + "learning_rate": 1.0366146458583434e-05, + "loss": 0.5186, + "step": 19825 + }, + { + "epoch": 25.37728, + "grad_norm": 1.2061467170715332, + "learning_rate": 1.0364145658263306e-05, + "loss": 0.4728, + "step": 19826 + }, + { + "epoch": 25.37856, + "grad_norm": 1.147557020187378, + "learning_rate": 1.0362144857943178e-05, + "loss": 0.4578, + "step": 19827 + }, + { + "epoch": 25.37984, + "grad_norm": 1.1873794794082642, + "learning_rate": 1.036014405762305e-05, + "loss": 0.4687, + "step": 19828 + }, + { + "epoch": 25.38112, + "grad_norm": 1.1945492029190063, + "learning_rate": 1.0358143257302922e-05, + "loss": 0.4752, + "step": 19829 + }, + { + "epoch": 25.3824, + "grad_norm": 1.1753865480422974, + "learning_rate": 1.0356142456982793e-05, + "loss": 0.4974, + "step": 19830 + }, + { + "epoch": 25.38368, + "grad_norm": 1.1818007230758667, + "learning_rate": 1.0354141656662665e-05, + "loss": 0.4648, + "step": 19831 + }, + { + "epoch": 25.38496, + "grad_norm": 1.2011675834655762, + "learning_rate": 1.0352140856342537e-05, + "loss": 0.4546, + "step": 19832 + }, + { + "epoch": 25.38624, + "grad_norm": 1.203870415687561, + "learning_rate": 1.0350140056022409e-05, + "loss": 0.4703, + "step": 19833 + }, + { + "epoch": 25.38752, + "grad_norm": 1.1994456052780151, + "learning_rate": 1.034813925570228e-05, + "loss": 0.4695, + "step": 19834 + }, + { + "epoch": 25.3888, + "grad_norm": 1.1361202001571655, + "learning_rate": 1.0346138455382154e-05, + "loss": 0.4831, + 
"step": 19835 + }, + { + "epoch": 25.39008, + "grad_norm": 1.1568169593811035, + "learning_rate": 1.0344137655062025e-05, + "loss": 0.4666, + "step": 19836 + }, + { + "epoch": 25.39136, + "grad_norm": 1.1744780540466309, + "learning_rate": 1.0342136854741898e-05, + "loss": 0.4628, + "step": 19837 + }, + { + "epoch": 25.39264, + "grad_norm": 1.1357437372207642, + "learning_rate": 1.0340136054421768e-05, + "loss": 0.5288, + "step": 19838 + }, + { + "epoch": 25.39392, + "grad_norm": 1.2219725847244263, + "learning_rate": 1.0338135254101642e-05, + "loss": 0.4923, + "step": 19839 + }, + { + "epoch": 25.3952, + "grad_norm": 1.1251201629638672, + "learning_rate": 1.0336134453781512e-05, + "loss": 0.4548, + "step": 19840 + }, + { + "epoch": 25.39648, + "grad_norm": 1.1264392137527466, + "learning_rate": 1.0334133653461385e-05, + "loss": 0.4398, + "step": 19841 + }, + { + "epoch": 25.39776, + "grad_norm": 1.1508971452713013, + "learning_rate": 1.0332132853141257e-05, + "loss": 0.4722, + "step": 19842 + }, + { + "epoch": 25.39904, + "grad_norm": 1.1928080320358276, + "learning_rate": 1.033013205282113e-05, + "loss": 0.4667, + "step": 19843 + }, + { + "epoch": 25.40032, + "grad_norm": 1.1580041646957397, + "learning_rate": 1.0328131252501001e-05, + "loss": 0.4179, + "step": 19844 + }, + { + "epoch": 25.4016, + "grad_norm": 1.135171890258789, + "learning_rate": 1.0326130452180873e-05, + "loss": 0.4607, + "step": 19845 + }, + { + "epoch": 25.40288, + "grad_norm": 1.2124780416488647, + "learning_rate": 1.0324129651860745e-05, + "loss": 0.502, + "step": 19846 + }, + { + "epoch": 25.40416, + "grad_norm": 1.1926053762435913, + "learning_rate": 1.0322128851540617e-05, + "loss": 0.4687, + "step": 19847 + }, + { + "epoch": 25.40544, + "grad_norm": 1.2334791421890259, + "learning_rate": 1.0320128051220488e-05, + "loss": 0.5041, + "step": 19848 + }, + { + "epoch": 25.40672, + "grad_norm": 1.1845077276229858, + "learning_rate": 1.0318127250900362e-05, + "loss": 0.4649, + "step": 19849 + 
}, + { + "epoch": 25.408, + "grad_norm": 1.1910468339920044, + "learning_rate": 1.0316126450580232e-05, + "loss": 0.4352, + "step": 19850 + }, + { + "epoch": 25.40928, + "grad_norm": 1.1773654222488403, + "learning_rate": 1.0314125650260106e-05, + "loss": 0.4306, + "step": 19851 + }, + { + "epoch": 25.41056, + "grad_norm": 1.211543083190918, + "learning_rate": 1.0312124849939976e-05, + "loss": 0.4869, + "step": 19852 + }, + { + "epoch": 25.41184, + "grad_norm": 1.2373056411743164, + "learning_rate": 1.031012404961985e-05, + "loss": 0.4914, + "step": 19853 + }, + { + "epoch": 25.41312, + "grad_norm": 1.2094565629959106, + "learning_rate": 1.030812324929972e-05, + "loss": 0.4829, + "step": 19854 + }, + { + "epoch": 25.4144, + "grad_norm": 1.1150031089782715, + "learning_rate": 1.0306122448979593e-05, + "loss": 0.4435, + "step": 19855 + }, + { + "epoch": 25.41568, + "grad_norm": 1.189208984375, + "learning_rate": 1.0304121648659465e-05, + "loss": 0.4767, + "step": 19856 + }, + { + "epoch": 25.41696, + "grad_norm": 1.1635322570800781, + "learning_rate": 1.0302120848339337e-05, + "loss": 0.4754, + "step": 19857 + }, + { + "epoch": 25.41824, + "grad_norm": 1.2085617780685425, + "learning_rate": 1.0300120048019209e-05, + "loss": 0.5219, + "step": 19858 + }, + { + "epoch": 25.41952, + "grad_norm": 1.150395393371582, + "learning_rate": 1.029811924769908e-05, + "loss": 0.4459, + "step": 19859 + }, + { + "epoch": 25.4208, + "grad_norm": 1.1318018436431885, + "learning_rate": 1.0296118447378952e-05, + "loss": 0.453, + "step": 19860 + }, + { + "epoch": 25.42208, + "grad_norm": 1.1467453241348267, + "learning_rate": 1.0294117647058824e-05, + "loss": 0.4603, + "step": 19861 + }, + { + "epoch": 25.42336, + "grad_norm": 1.1553596258163452, + "learning_rate": 1.0292116846738696e-05, + "loss": 0.4754, + "step": 19862 + }, + { + "epoch": 25.42464, + "grad_norm": 1.1549872159957886, + "learning_rate": 1.0290116046418568e-05, + "loss": 0.4961, + "step": 19863 + }, + { + "epoch": 
25.42592, + "grad_norm": 1.1191171407699585, + "learning_rate": 1.028811524609844e-05, + "loss": 0.4749, + "step": 19864 + }, + { + "epoch": 25.4272, + "grad_norm": 1.1042835712432861, + "learning_rate": 1.0286114445778312e-05, + "loss": 0.4217, + "step": 19865 + }, + { + "epoch": 25.42848, + "grad_norm": 1.2307777404785156, + "learning_rate": 1.0284113645458184e-05, + "loss": 0.4629, + "step": 19866 + }, + { + "epoch": 25.42976, + "grad_norm": 1.1764090061187744, + "learning_rate": 1.0282112845138055e-05, + "loss": 0.4898, + "step": 19867 + }, + { + "epoch": 25.43104, + "grad_norm": 1.189038634300232, + "learning_rate": 1.0280112044817927e-05, + "loss": 0.4423, + "step": 19868 + }, + { + "epoch": 25.43232, + "grad_norm": 1.1858000755310059, + "learning_rate": 1.02781112444978e-05, + "loss": 0.4757, + "step": 19869 + }, + { + "epoch": 25.4336, + "grad_norm": 1.191748023033142, + "learning_rate": 1.0276110444177671e-05, + "loss": 0.4882, + "step": 19870 + }, + { + "epoch": 25.43488, + "grad_norm": 1.2290452718734741, + "learning_rate": 1.0274109643857543e-05, + "loss": 0.4923, + "step": 19871 + }, + { + "epoch": 25.43616, + "grad_norm": 1.1514322757720947, + "learning_rate": 1.0272108843537416e-05, + "loss": 0.4417, + "step": 19872 + }, + { + "epoch": 25.43744, + "grad_norm": 1.162440299987793, + "learning_rate": 1.0270108043217287e-05, + "loss": 0.4663, + "step": 19873 + }, + { + "epoch": 25.43872, + "grad_norm": 1.1626965999603271, + "learning_rate": 1.026810724289716e-05, + "loss": 0.4808, + "step": 19874 + }, + { + "epoch": 25.44, + "grad_norm": 1.1720807552337646, + "learning_rate": 1.026610644257703e-05, + "loss": 0.4818, + "step": 19875 + }, + { + "epoch": 25.44128, + "grad_norm": 1.1925668716430664, + "learning_rate": 1.0264105642256904e-05, + "loss": 0.4541, + "step": 19876 + }, + { + "epoch": 25.44256, + "grad_norm": 1.2147929668426514, + "learning_rate": 1.0262104841936774e-05, + "loss": 0.465, + "step": 19877 + }, + { + "epoch": 25.44384, + "grad_norm": 
1.1247954368591309, + "learning_rate": 1.0260104041616648e-05, + "loss": 0.4811, + "step": 19878 + }, + { + "epoch": 25.44512, + "grad_norm": 1.1577873229980469, + "learning_rate": 1.025810324129652e-05, + "loss": 0.4534, + "step": 19879 + }, + { + "epoch": 25.4464, + "grad_norm": 1.1290161609649658, + "learning_rate": 1.0256102440976391e-05, + "loss": 0.4636, + "step": 19880 + }, + { + "epoch": 25.44768, + "grad_norm": 1.1666961908340454, + "learning_rate": 1.0254101640656263e-05, + "loss": 0.4773, + "step": 19881 + }, + { + "epoch": 25.44896, + "grad_norm": 1.210201382637024, + "learning_rate": 1.0252100840336135e-05, + "loss": 0.5152, + "step": 19882 + }, + { + "epoch": 25.45024, + "grad_norm": 1.1907308101654053, + "learning_rate": 1.0250100040016007e-05, + "loss": 0.4904, + "step": 19883 + }, + { + "epoch": 25.45152, + "grad_norm": 1.2196985483169556, + "learning_rate": 1.0248099239695879e-05, + "loss": 0.499, + "step": 19884 + }, + { + "epoch": 25.4528, + "grad_norm": 1.1352413892745972, + "learning_rate": 1.024609843937575e-05, + "loss": 0.4525, + "step": 19885 + }, + { + "epoch": 25.45408, + "grad_norm": 1.1208443641662598, + "learning_rate": 1.0244097639055624e-05, + "loss": 0.4592, + "step": 19886 + }, + { + "epoch": 25.45536, + "grad_norm": 1.152306079864502, + "learning_rate": 1.0242096838735494e-05, + "loss": 0.4724, + "step": 19887 + }, + { + "epoch": 25.45664, + "grad_norm": 1.1952404975891113, + "learning_rate": 1.0240096038415368e-05, + "loss": 0.507, + "step": 19888 + }, + { + "epoch": 25.45792, + "grad_norm": 1.207161545753479, + "learning_rate": 1.0238095238095238e-05, + "loss": 0.4736, + "step": 19889 + }, + { + "epoch": 25.4592, + "grad_norm": 1.2183301448822021, + "learning_rate": 1.0236094437775112e-05, + "loss": 0.4445, + "step": 19890 + }, + { + "epoch": 25.46048, + "grad_norm": 1.1657265424728394, + "learning_rate": 1.0234093637454982e-05, + "loss": 0.4539, + "step": 19891 + }, + { + "epoch": 25.46176, + "grad_norm": 1.19996976852417, + 
"learning_rate": 1.0232092837134855e-05, + "loss": 0.4312, + "step": 19892 + }, + { + "epoch": 25.46304, + "grad_norm": 1.202269434928894, + "learning_rate": 1.0230092036814725e-05, + "loss": 0.4801, + "step": 19893 + }, + { + "epoch": 25.46432, + "grad_norm": 1.1666241884231567, + "learning_rate": 1.0228091236494599e-05, + "loss": 0.4813, + "step": 19894 + }, + { + "epoch": 25.4656, + "grad_norm": 1.2247138023376465, + "learning_rate": 1.022609043617447e-05, + "loss": 0.4749, + "step": 19895 + }, + { + "epoch": 25.46688, + "grad_norm": 1.2047659158706665, + "learning_rate": 1.0224089635854343e-05, + "loss": 0.4895, + "step": 19896 + }, + { + "epoch": 25.46816, + "grad_norm": 1.2424787282943726, + "learning_rate": 1.0222088835534215e-05, + "loss": 0.4762, + "step": 19897 + }, + { + "epoch": 25.46944, + "grad_norm": 1.190301775932312, + "learning_rate": 1.0220088035214086e-05, + "loss": 0.4786, + "step": 19898 + }, + { + "epoch": 25.47072, + "grad_norm": 1.2236806154251099, + "learning_rate": 1.0218087234893958e-05, + "loss": 0.5193, + "step": 19899 + }, + { + "epoch": 25.472, + "grad_norm": 1.1594198942184448, + "learning_rate": 1.021608643457383e-05, + "loss": 0.5058, + "step": 19900 + }, + { + "epoch": 25.47328, + "grad_norm": 1.1569578647613525, + "learning_rate": 1.0214085634253702e-05, + "loss": 0.4701, + "step": 19901 + }, + { + "epoch": 25.47456, + "grad_norm": 1.1034061908721924, + "learning_rate": 1.0212084833933574e-05, + "loss": 0.4354, + "step": 19902 + }, + { + "epoch": 25.47584, + "grad_norm": 1.136178970336914, + "learning_rate": 1.0210084033613446e-05, + "loss": 0.4281, + "step": 19903 + }, + { + "epoch": 25.47712, + "grad_norm": 1.1307159662246704, + "learning_rate": 1.0208083233293318e-05, + "loss": 0.4523, + "step": 19904 + }, + { + "epoch": 25.4784, + "grad_norm": 1.1765360832214355, + "learning_rate": 1.020608243297319e-05, + "loss": 0.4607, + "step": 19905 + }, + { + "epoch": 25.47968, + "grad_norm": 1.0838277339935303, + "learning_rate": 
1.0204081632653061e-05, + "loss": 0.4631, + "step": 19906 + }, + { + "epoch": 25.48096, + "grad_norm": 1.1757216453552246, + "learning_rate": 1.0202080832332933e-05, + "loss": 0.4862, + "step": 19907 + }, + { + "epoch": 25.48224, + "grad_norm": 1.1511602401733398, + "learning_rate": 1.0200080032012805e-05, + "loss": 0.456, + "step": 19908 + }, + { + "epoch": 25.48352, + "grad_norm": 1.1518641710281372, + "learning_rate": 1.0198079231692679e-05, + "loss": 0.5169, + "step": 19909 + }, + { + "epoch": 25.4848, + "grad_norm": 1.1432472467422485, + "learning_rate": 1.0196078431372549e-05, + "loss": 0.4253, + "step": 19910 + }, + { + "epoch": 25.48608, + "grad_norm": 1.1166365146636963, + "learning_rate": 1.0194077631052422e-05, + "loss": 0.464, + "step": 19911 + }, + { + "epoch": 25.48736, + "grad_norm": 1.0852631330490112, + "learning_rate": 1.0192076830732292e-05, + "loss": 0.4376, + "step": 19912 + }, + { + "epoch": 25.48864, + "grad_norm": 1.1255972385406494, + "learning_rate": 1.0190076030412166e-05, + "loss": 0.4564, + "step": 19913 + }, + { + "epoch": 25.48992, + "grad_norm": 1.1327401399612427, + "learning_rate": 1.0188075230092036e-05, + "loss": 0.4365, + "step": 19914 + }, + { + "epoch": 25.4912, + "grad_norm": 1.1716783046722412, + "learning_rate": 1.018607442977191e-05, + "loss": 0.5002, + "step": 19915 + }, + { + "epoch": 25.49248, + "grad_norm": 1.2162928581237793, + "learning_rate": 1.018407362945178e-05, + "loss": 0.5105, + "step": 19916 + }, + { + "epoch": 25.49376, + "grad_norm": 1.23090660572052, + "learning_rate": 1.0182072829131653e-05, + "loss": 0.4842, + "step": 19917 + }, + { + "epoch": 25.49504, + "grad_norm": 1.1792917251586914, + "learning_rate": 1.0180072028811525e-05, + "loss": 0.505, + "step": 19918 + }, + { + "epoch": 25.49632, + "grad_norm": 1.1648669242858887, + "learning_rate": 1.0178071228491397e-05, + "loss": 0.4483, + "step": 19919 + }, + { + "epoch": 25.4976, + "grad_norm": 1.1607261896133423, + "learning_rate": 
1.0176070428171269e-05, + "loss": 0.4921, + "step": 19920 + }, + { + "epoch": 25.49888, + "grad_norm": 1.116874098777771, + "learning_rate": 1.017406962785114e-05, + "loss": 0.4774, + "step": 19921 + }, + { + "epoch": 25.50016, + "grad_norm": 1.107079267501831, + "learning_rate": 1.0172068827531013e-05, + "loss": 0.4342, + "step": 19922 + }, + { + "epoch": 25.50144, + "grad_norm": 1.134993553161621, + "learning_rate": 1.0170068027210885e-05, + "loss": 0.4498, + "step": 19923 + }, + { + "epoch": 25.50272, + "grad_norm": 1.218907356262207, + "learning_rate": 1.0168067226890756e-05, + "loss": 0.4602, + "step": 19924 + }, + { + "epoch": 25.504, + "grad_norm": 1.1606868505477905, + "learning_rate": 1.016606642657063e-05, + "loss": 0.4432, + "step": 19925 + }, + { + "epoch": 25.50528, + "grad_norm": 1.0475679636001587, + "learning_rate": 1.01640656262505e-05, + "loss": 0.4417, + "step": 19926 + }, + { + "epoch": 25.50656, + "grad_norm": 1.0612995624542236, + "learning_rate": 1.0162064825930374e-05, + "loss": 0.4264, + "step": 19927 + }, + { + "epoch": 25.50784, + "grad_norm": 1.1685255765914917, + "learning_rate": 1.0160064025610244e-05, + "loss": 0.4719, + "step": 19928 + }, + { + "epoch": 25.50912, + "grad_norm": 1.1424115896224976, + "learning_rate": 1.0158063225290117e-05, + "loss": 0.4454, + "step": 19929 + }, + { + "epoch": 25.5104, + "grad_norm": 1.1809263229370117, + "learning_rate": 1.0156062424969988e-05, + "loss": 0.4515, + "step": 19930 + }, + { + "epoch": 25.51168, + "grad_norm": 1.25111985206604, + "learning_rate": 1.0154061624649861e-05, + "loss": 0.5329, + "step": 19931 + }, + { + "epoch": 25.51296, + "grad_norm": 1.130300760269165, + "learning_rate": 1.0152060824329733e-05, + "loss": 0.4795, + "step": 19932 + }, + { + "epoch": 25.51424, + "grad_norm": 1.1218140125274658, + "learning_rate": 1.0150060024009605e-05, + "loss": 0.4573, + "step": 19933 + }, + { + "epoch": 25.51552, + "grad_norm": 1.183595895767212, + "learning_rate": 1.0148059223689477e-05, + 
"loss": 0.4736, + "step": 19934 + }, + { + "epoch": 25.5168, + "grad_norm": 1.1084133386611938, + "learning_rate": 1.0146058423369348e-05, + "loss": 0.4453, + "step": 19935 + }, + { + "epoch": 25.51808, + "grad_norm": 1.1661556959152222, + "learning_rate": 1.014405762304922e-05, + "loss": 0.4992, + "step": 19936 + }, + { + "epoch": 25.51936, + "grad_norm": 1.1977107524871826, + "learning_rate": 1.0142056822729092e-05, + "loss": 0.4746, + "step": 19937 + }, + { + "epoch": 25.52064, + "grad_norm": 1.1810569763183594, + "learning_rate": 1.0140056022408964e-05, + "loss": 0.4634, + "step": 19938 + }, + { + "epoch": 25.52192, + "grad_norm": 1.1430659294128418, + "learning_rate": 1.0138055222088836e-05, + "loss": 0.5255, + "step": 19939 + }, + { + "epoch": 25.5232, + "grad_norm": 1.135312795639038, + "learning_rate": 1.0136054421768708e-05, + "loss": 0.4624, + "step": 19940 + }, + { + "epoch": 25.52448, + "grad_norm": 1.0835295915603638, + "learning_rate": 1.013405362144858e-05, + "loss": 0.4393, + "step": 19941 + }, + { + "epoch": 25.52576, + "grad_norm": 1.1327999830245972, + "learning_rate": 1.0132052821128451e-05, + "loss": 0.4165, + "step": 19942 + }, + { + "epoch": 25.52704, + "grad_norm": 1.2144399881362915, + "learning_rate": 1.0130052020808323e-05, + "loss": 0.5114, + "step": 19943 + }, + { + "epoch": 25.52832, + "grad_norm": 1.2352372407913208, + "learning_rate": 1.0128051220488195e-05, + "loss": 0.5182, + "step": 19944 + }, + { + "epoch": 25.5296, + "grad_norm": 1.109834909439087, + "learning_rate": 1.0126050420168067e-05, + "loss": 0.4612, + "step": 19945 + }, + { + "epoch": 25.53088, + "grad_norm": 1.163332462310791, + "learning_rate": 1.0124049619847939e-05, + "loss": 0.4823, + "step": 19946 + }, + { + "epoch": 25.53216, + "grad_norm": 1.219772458076477, + "learning_rate": 1.012204881952781e-05, + "loss": 0.5057, + "step": 19947 + }, + { + "epoch": 25.53344, + "grad_norm": 1.111663818359375, + "learning_rate": 1.0120048019207684e-05, + "loss": 0.4297, + 
"step": 19948 + }, + { + "epoch": 25.53472, + "grad_norm": 1.1989513635635376, + "learning_rate": 1.0118047218887554e-05, + "loss": 0.4661, + "step": 19949 + }, + { + "epoch": 25.536, + "grad_norm": 1.1225396394729614, + "learning_rate": 1.0116046418567428e-05, + "loss": 0.5113, + "step": 19950 + }, + { + "epoch": 25.53728, + "grad_norm": 1.134542465209961, + "learning_rate": 1.0114045618247298e-05, + "loss": 0.4717, + "step": 19951 + }, + { + "epoch": 25.53856, + "grad_norm": 1.1759467124938965, + "learning_rate": 1.0112044817927172e-05, + "loss": 0.4592, + "step": 19952 + }, + { + "epoch": 25.53984, + "grad_norm": 1.1189574003219604, + "learning_rate": 1.0110044017607042e-05, + "loss": 0.4471, + "step": 19953 + }, + { + "epoch": 25.54112, + "grad_norm": 1.2080943584442139, + "learning_rate": 1.0108043217286915e-05, + "loss": 0.5141, + "step": 19954 + }, + { + "epoch": 25.5424, + "grad_norm": 1.1960744857788086, + "learning_rate": 1.0106042416966787e-05, + "loss": 0.4721, + "step": 19955 + }, + { + "epoch": 25.54368, + "grad_norm": 1.1887983083724976, + "learning_rate": 1.010404161664666e-05, + "loss": 0.4531, + "step": 19956 + }, + { + "epoch": 25.54496, + "grad_norm": 1.228134036064148, + "learning_rate": 1.0102040816326531e-05, + "loss": 0.476, + "step": 19957 + }, + { + "epoch": 25.54624, + "grad_norm": 1.1741596460342407, + "learning_rate": 1.0100040016006403e-05, + "loss": 0.4897, + "step": 19958 + }, + { + "epoch": 25.54752, + "grad_norm": 1.1398587226867676, + "learning_rate": 1.0098039215686275e-05, + "loss": 0.4474, + "step": 19959 + }, + { + "epoch": 25.5488, + "grad_norm": 1.1780531406402588, + "learning_rate": 1.0096038415366147e-05, + "loss": 0.4595, + "step": 19960 + }, + { + "epoch": 25.55008, + "grad_norm": 1.1379321813583374, + "learning_rate": 1.0094037615046018e-05, + "loss": 0.4659, + "step": 19961 + }, + { + "epoch": 25.55136, + "grad_norm": 1.0986708402633667, + "learning_rate": 1.0092036814725892e-05, + "loss": 0.4721, + "step": 19962 + }, 
+ { + "epoch": 25.55264, + "grad_norm": 1.1313260793685913, + "learning_rate": 1.0090036014405762e-05, + "loss": 0.4701, + "step": 19963 + }, + { + "epoch": 25.55392, + "grad_norm": 1.0673236846923828, + "learning_rate": 1.0088035214085636e-05, + "loss": 0.4375, + "step": 19964 + }, + { + "epoch": 25.5552, + "grad_norm": 1.1325130462646484, + "learning_rate": 1.0086034413765506e-05, + "loss": 0.4712, + "step": 19965 + }, + { + "epoch": 25.55648, + "grad_norm": 1.095177412033081, + "learning_rate": 1.008403361344538e-05, + "loss": 0.4617, + "step": 19966 + }, + { + "epoch": 25.557760000000002, + "grad_norm": 1.203389048576355, + "learning_rate": 1.008203281312525e-05, + "loss": 0.4962, + "step": 19967 + }, + { + "epoch": 25.55904, + "grad_norm": 1.1400015354156494, + "learning_rate": 1.0080032012805123e-05, + "loss": 0.4451, + "step": 19968 + }, + { + "epoch": 25.56032, + "grad_norm": 1.153382420539856, + "learning_rate": 1.0078031212484995e-05, + "loss": 0.4675, + "step": 19969 + }, + { + "epoch": 25.5616, + "grad_norm": 1.207768440246582, + "learning_rate": 1.0076030412164867e-05, + "loss": 0.5011, + "step": 19970 + }, + { + "epoch": 25.56288, + "grad_norm": 1.1720563173294067, + "learning_rate": 1.0074029611844739e-05, + "loss": 0.4832, + "step": 19971 + }, + { + "epoch": 25.56416, + "grad_norm": 1.2000620365142822, + "learning_rate": 1.007202881152461e-05, + "loss": 0.4259, + "step": 19972 + }, + { + "epoch": 25.56544, + "grad_norm": 1.208329200744629, + "learning_rate": 1.0070028011204482e-05, + "loss": 0.543, + "step": 19973 + }, + { + "epoch": 25.56672, + "grad_norm": 1.165389895439148, + "learning_rate": 1.0068027210884354e-05, + "loss": 0.4622, + "step": 19974 + }, + { + "epoch": 25.568, + "grad_norm": 1.1432007551193237, + "learning_rate": 1.0066026410564226e-05, + "loss": 0.4856, + "step": 19975 + }, + { + "epoch": 25.56928, + "grad_norm": 1.1231637001037598, + "learning_rate": 1.0064025610244098e-05, + "loss": 0.4722, + "step": 19976 + }, + { + "epoch": 
25.57056, + "grad_norm": 1.2253926992416382, + "learning_rate": 1.006202480992397e-05, + "loss": 0.4966, + "step": 19977 + }, + { + "epoch": 25.57184, + "grad_norm": 1.1498738527297974, + "learning_rate": 1.0060024009603842e-05, + "loss": 0.4766, + "step": 19978 + }, + { + "epoch": 25.57312, + "grad_norm": 1.264626383781433, + "learning_rate": 1.0058023209283714e-05, + "loss": 0.547, + "step": 19979 + }, + { + "epoch": 25.5744, + "grad_norm": 1.1682270765304565, + "learning_rate": 1.0056022408963585e-05, + "loss": 0.4642, + "step": 19980 + }, + { + "epoch": 25.57568, + "grad_norm": 1.1158068180084229, + "learning_rate": 1.0054021608643457e-05, + "loss": 0.4469, + "step": 19981 + }, + { + "epoch": 25.57696, + "grad_norm": 1.223067283630371, + "learning_rate": 1.005202080832333e-05, + "loss": 0.4862, + "step": 19982 + }, + { + "epoch": 25.57824, + "grad_norm": 1.2305022478103638, + "learning_rate": 1.0050020008003201e-05, + "loss": 0.5224, + "step": 19983 + }, + { + "epoch": 25.57952, + "grad_norm": 1.176965594291687, + "learning_rate": 1.0048019207683073e-05, + "loss": 0.4521, + "step": 19984 + }, + { + "epoch": 25.5808, + "grad_norm": 1.1142029762268066, + "learning_rate": 1.0046018407362946e-05, + "loss": 0.4352, + "step": 19985 + }, + { + "epoch": 25.58208, + "grad_norm": 1.123673677444458, + "learning_rate": 1.0044017607042817e-05, + "loss": 0.4876, + "step": 19986 + }, + { + "epoch": 25.58336, + "grad_norm": 1.1586241722106934, + "learning_rate": 1.004201680672269e-05, + "loss": 0.4535, + "step": 19987 + }, + { + "epoch": 25.58464, + "grad_norm": 1.125301480293274, + "learning_rate": 1.004001600640256e-05, + "loss": 0.4405, + "step": 19988 + }, + { + "epoch": 25.58592, + "grad_norm": 1.1621462106704712, + "learning_rate": 1.0038015206082434e-05, + "loss": 0.4536, + "step": 19989 + }, + { + "epoch": 25.5872, + "grad_norm": 1.1989449262619019, + "learning_rate": 1.0036014405762304e-05, + "loss": 0.4923, + "step": 19990 + }, + { + "epoch": 25.58848, + "grad_norm": 
1.2146766185760498, + "learning_rate": 1.0034013605442178e-05, + "loss": 0.4667, + "step": 19991 + }, + { + "epoch": 25.58976, + "grad_norm": 1.1815863847732544, + "learning_rate": 1.003201280512205e-05, + "loss": 0.4751, + "step": 19992 + }, + { + "epoch": 25.59104, + "grad_norm": 1.1977012157440186, + "learning_rate": 1.0030012004801921e-05, + "loss": 0.4829, + "step": 19993 + }, + { + "epoch": 25.59232, + "grad_norm": 1.1629470586776733, + "learning_rate": 1.0028011204481793e-05, + "loss": 0.4888, + "step": 19994 + }, + { + "epoch": 25.5936, + "grad_norm": 1.2305023670196533, + "learning_rate": 1.0026010404161665e-05, + "loss": 0.5316, + "step": 19995 + }, + { + "epoch": 25.59488, + "grad_norm": 1.1644953489303589, + "learning_rate": 1.0024009603841537e-05, + "loss": 0.5375, + "step": 19996 + }, + { + "epoch": 25.59616, + "grad_norm": 1.1814461946487427, + "learning_rate": 1.0022008803521409e-05, + "loss": 0.4741, + "step": 19997 + }, + { + "epoch": 25.59744, + "grad_norm": 1.1363074779510498, + "learning_rate": 1.002000800320128e-05, + "loss": 0.4249, + "step": 19998 + }, + { + "epoch": 25.59872, + "grad_norm": 1.2371721267700195, + "learning_rate": 1.0018007202881154e-05, + "loss": 0.4753, + "step": 19999 + }, + { + "epoch": 25.6, + "grad_norm": 1.137081265449524, + "learning_rate": 1.0016006402561024e-05, + "loss": 0.4507, + "step": 20000 + } + ], + "logging_steps": 1, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 33, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.571466040761385e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}