{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1956, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005112474437627812, "grad_norm": 3.3048973083496094, "learning_rate": 1.0000000000000002e-06, "loss": 1.1371, "step": 1 }, { "epoch": 0.0010224948875255625, "grad_norm": 3.735586404800415, "learning_rate": 2.0000000000000003e-06, "loss": 1.1928, "step": 2 }, { "epoch": 0.0015337423312883436, "grad_norm": 3.930663585662842, "learning_rate": 3e-06, "loss": 1.3484, "step": 3 }, { "epoch": 0.002044989775051125, "grad_norm": 4.375387668609619, "learning_rate": 4.000000000000001e-06, "loss": 1.2683, "step": 4 }, { "epoch": 0.002556237218813906, "grad_norm": 5.1499857902526855, "learning_rate": 5e-06, "loss": 1.1936, "step": 5 }, { "epoch": 0.003067484662576687, "grad_norm": 5.033441066741943, "learning_rate": 6e-06, "loss": 1.3051, "step": 6 }, { "epoch": 0.0035787321063394683, "grad_norm": 5.029082775115967, "learning_rate": 7.000000000000001e-06, "loss": 1.1804, "step": 7 }, { "epoch": 0.00408997955010225, "grad_norm": 4.745914459228516, "learning_rate": 8.000000000000001e-06, "loss": 1.2379, "step": 8 }, { "epoch": 0.004601226993865031, "grad_norm": 4.976576805114746, "learning_rate": 9e-06, "loss": 1.282, "step": 9 }, { "epoch": 0.005112474437627812, "grad_norm": 4.350550174713135, "learning_rate": 1e-05, "loss": 1.1822, "step": 10 }, { "epoch": 0.005623721881390593, "grad_norm": 4.9506940841674805, "learning_rate": 1.1000000000000001e-05, "loss": 1.2666, "step": 11 }, { "epoch": 0.006134969325153374, "grad_norm": 5.068627834320068, "learning_rate": 1.2e-05, "loss": 1.2411, "step": 12 }, { "epoch": 0.0066462167689161555, "grad_norm": 5.628175735473633, "learning_rate": 1.3000000000000001e-05, "loss": 1.2263, "step": 13 }, { "epoch": 0.007157464212678937, "grad_norm": 5.16189432144165, "learning_rate": 1.4000000000000001e-05, "loss": 1.2809, "step": 14 }, { "epoch": 0.007668711656441718, "grad_norm": 4.856331825256348, "learning_rate": 1.5e-05, "loss": 1.1944, "step": 15 }, { "epoch": 0.0081799591002045, "grad_norm": 4.980719089508057, "learning_rate": 1.6000000000000003e-05, "loss": 1.2265, "step": 16 }, { "epoch": 0.008691206543967281, "grad_norm": 5.2572832107543945, "learning_rate": 1.7000000000000003e-05, "loss": 1.2408, "step": 17 }, { "epoch": 0.009202453987730062, "grad_norm": 5.128628730773926, "learning_rate": 1.8e-05, "loss": 1.2272, "step": 18 }, { "epoch": 0.009713701431492843, "grad_norm": 5.2293829917907715, "learning_rate": 1.9e-05, "loss": 1.195, "step": 19 }, { "epoch": 0.010224948875255624, "grad_norm": 4.794898986816406, "learning_rate": 2e-05, "loss": 1.2411, "step": 20 }, { "epoch": 0.010736196319018405, "grad_norm": 5.007977485656738, "learning_rate": 2.1e-05, "loss": 1.1821, "step": 21 }, { "epoch": 0.011247443762781187, "grad_norm": 4.727319240570068, "learning_rate": 2.2000000000000003e-05, "loss": 1.188, "step": 22 }, { "epoch": 0.011758691206543968, "grad_norm": 4.606495380401611, "learning_rate": 2.3000000000000003e-05, "loss": 1.1372, "step": 23 }, { "epoch": 0.012269938650306749, "grad_norm": 4.6991753578186035, "learning_rate": 2.4e-05, "loss": 1.1395, "step": 24 }, { "epoch": 0.01278118609406953, "grad_norm": 4.635598659515381, "learning_rate": 2.5e-05, "loss": 1.1782, "step": 25 }, { "epoch": 0.013292433537832311, "grad_norm": 4.882657051086426, "learning_rate": 2.6000000000000002e-05, "loss": 1.1939, "step": 26 }, { "epoch": 0.013803680981595092, "grad_norm": 5.13659143447876, "learning_rate": 2.7000000000000002e-05, "loss": 1.1567, "step": 27 }, { "epoch": 0.014314928425357873, "grad_norm": 5.102069854736328, "learning_rate": 2.8000000000000003e-05, "loss": 1.2254, "step": 28 }, { "epoch": 0.014826175869120654, "grad_norm": 5.0762505531311035, "learning_rate": 2.9e-05, "loss": 1.1643, "step": 29 }, { "epoch": 0.015337423312883436, "grad_norm": 4.819282054901123, "learning_rate": 3e-05, "loss": 1.0718, "step": 30 }, { "epoch": 0.015848670756646217, "grad_norm": 4.949357032775879, "learning_rate": 3.1e-05, "loss": 1.091, "step": 31 }, { "epoch": 0.016359918200409, "grad_norm": 5.103873252868652, "learning_rate": 3.2000000000000005e-05, "loss": 1.0602, "step": 32 }, { "epoch": 0.01687116564417178, "grad_norm": 4.818650722503662, "learning_rate": 3.3e-05, "loss": 0.9318, "step": 33 }, { "epoch": 0.017382413087934562, "grad_norm": 4.793953895568848, "learning_rate": 3.4000000000000007e-05, "loss": 1.0556, "step": 34 }, { "epoch": 0.01789366053169734, "grad_norm": 5.540240287780762, "learning_rate": 3.5e-05, "loss": 1.1009, "step": 35 }, { "epoch": 0.018404907975460124, "grad_norm": 5.293509483337402, "learning_rate": 3.6e-05, "loss": 1.1285, "step": 36 }, { "epoch": 0.018916155419222903, "grad_norm": 5.887397289276123, "learning_rate": 3.7e-05, "loss": 1.0327, "step": 37 }, { "epoch": 0.019427402862985686, "grad_norm": 4.989732265472412, "learning_rate": 3.8e-05, "loss": 0.9327, "step": 38 }, { "epoch": 0.019938650306748466, "grad_norm": 5.251559257507324, "learning_rate": 3.9000000000000006e-05, "loss": 1.114, "step": 39 }, { "epoch": 0.02044989775051125, "grad_norm": 6.304392337799072, "learning_rate": 4e-05, "loss": 1.0943, "step": 40 }, { "epoch": 0.020961145194274028, "grad_norm": 5.590821266174316, "learning_rate": 4.1e-05, "loss": 1.0598, "step": 41 }, { "epoch": 0.02147239263803681, "grad_norm": 5.784496784210205, "learning_rate": 4.2e-05, "loss": 1.021, "step": 42 }, { "epoch": 0.02198364008179959, "grad_norm": 5.714466094970703, "learning_rate": 4.3e-05, "loss": 0.929, "step": 43 }, { "epoch": 0.022494887525562373, "grad_norm": 6.554811000823975, "learning_rate": 4.4000000000000006e-05, "loss": 1.1903, "step": 44 }, { "epoch": 0.023006134969325152, "grad_norm": 6.44047212600708, "learning_rate": 4.5e-05, "loss": 0.8508, "step": 45 }, { "epoch": 0.023517382413087935, "grad_norm": 6.655950546264648, "learning_rate": 4.600000000000001e-05, "loss": 0.8196, "step": 46 }, { "epoch": 0.024028629856850715, "grad_norm": 6.728236198425293, "learning_rate": 4.7e-05, "loss": 0.9124, "step": 47 }, { "epoch": 0.024539877300613498, "grad_norm": 7.260775566101074, "learning_rate": 4.8e-05, "loss": 0.7433, "step": 48 }, { "epoch": 0.025051124744376277, "grad_norm": 8.426919937133789, "learning_rate": 4.9e-05, "loss": 0.902, "step": 49 }, { "epoch": 0.02556237218813906, "grad_norm": 14.32536506652832, "learning_rate": 5e-05, "loss": 0.8194, "step": 50 }, { "epoch": 0.02607361963190184, "grad_norm": 4.535055637359619, "learning_rate": 5.1000000000000006e-05, "loss": 1.0298, "step": 51 }, { "epoch": 0.026584867075664622, "grad_norm": 5.038204193115234, "learning_rate": 5.2000000000000004e-05, "loss": 1.0745, "step": 52 }, { "epoch": 0.0270961145194274, "grad_norm": 6.9380927085876465, "learning_rate": 5.300000000000001e-05, "loss": 1.2208, "step": 53 }, { "epoch": 0.027607361963190184, "grad_norm": 4.157510280609131, "learning_rate": 5.4000000000000005e-05, "loss": 1.135, "step": 54 }, { "epoch": 0.028118609406952964, "grad_norm": 3.202308416366577, "learning_rate": 5.500000000000001e-05, "loss": 1.0679, "step": 55 }, { "epoch": 0.028629856850715747, "grad_norm": 3.3872716426849365, "learning_rate": 5.6000000000000006e-05, "loss": 1.0871, "step": 56 }, { "epoch": 0.029141104294478526, "grad_norm": 3.4161555767059326, "learning_rate": 5.6999999999999996e-05, "loss": 1.149, "step": 57 }, { "epoch": 0.02965235173824131, "grad_norm": 4.0091962814331055, "learning_rate": 5.8e-05, "loss": 1.096, "step": 58 }, { "epoch": 0.03016359918200409, "grad_norm": 3.4014410972595215, "learning_rate": 5.9e-05, "loss": 0.9183, "step": 59 }, { "epoch": 0.03067484662576687, "grad_norm": 3.9149327278137207, "learning_rate": 6e-05, "loss": 1.0704, "step": 60 }, { "epoch": 0.031186094069529654, "grad_norm": 4.084487438201904, "learning_rate": 6.1e-05, "loss": 1.0907, "step": 61 }, { "epoch": 0.03169734151329243, "grad_norm": 3.6023764610290527, "learning_rate": 6.2e-05, "loss": 0.9582, "step": 62 }, { "epoch": 0.032208588957055216, "grad_norm": 3.806703805923462, "learning_rate": 6.3e-05, "loss": 1.0072, "step": 63 }, { "epoch": 0.032719836400818, "grad_norm": 3.710463523864746, "learning_rate": 6.400000000000001e-05, "loss": 0.9751, "step": 64 }, { "epoch": 0.033231083844580775, "grad_norm": 3.8242218494415283, "learning_rate": 6.500000000000001e-05, "loss": 1.0477, "step": 65 }, { "epoch": 0.03374233128834356, "grad_norm": 4.025355815887451, "learning_rate": 6.6e-05, "loss": 1.0147, "step": 66 }, { "epoch": 0.03425357873210634, "grad_norm": 3.9239964485168457, "learning_rate": 6.7e-05, "loss": 1.1412, "step": 67 }, { "epoch": 0.034764826175869123, "grad_norm": 4.182487964630127, "learning_rate": 6.800000000000001e-05, "loss": 1.012, "step": 68 }, { "epoch": 0.0352760736196319, "grad_norm": 4.380066871643066, "learning_rate": 6.9e-05, "loss": 1.1207, "step": 69 }, { "epoch": 0.03578732106339468, "grad_norm": 4.1866230964660645, "learning_rate": 7e-05, "loss": 1.0963, "step": 70 }, { "epoch": 0.036298568507157465, "grad_norm": 4.3079833984375, "learning_rate": 7.1e-05, "loss": 0.9799, "step": 71 }, { "epoch": 0.03680981595092025, "grad_norm": 4.587730407714844, "learning_rate": 7.2e-05, "loss": 1.0129, "step": 72 }, { "epoch": 0.037321063394683024, "grad_norm": 4.465561389923096, "learning_rate": 7.3e-05, "loss": 1.0579, "step": 73 }, { "epoch": 0.03783231083844581, "grad_norm": 4.654197692871094, "learning_rate": 7.4e-05, "loss": 1.0536, "step": 74 }, { "epoch": 0.03834355828220859, "grad_norm": 4.685794353485107, "learning_rate": 7.500000000000001e-05, "loss": 1.0567, "step": 75 }, { "epoch": 0.03885480572597137, "grad_norm": 4.281529426574707, "learning_rate": 7.6e-05, "loss": 0.9689, "step": 76 }, { "epoch": 0.03936605316973415, "grad_norm": 4.096267223358154, "learning_rate": 7.7e-05, "loss": 1.0279, "step": 77 }, { "epoch": 0.03987730061349693, "grad_norm": 4.406935214996338, "learning_rate": 7.800000000000001e-05, "loss": 1.0197, "step": 78 }, { "epoch": 0.040388548057259714, "grad_norm": 4.344164848327637, "learning_rate": 7.900000000000001e-05, "loss": 0.9634, "step": 79 }, { "epoch": 0.0408997955010225, "grad_norm": 4.969428539276123, "learning_rate": 8e-05, "loss": 1.0437, "step": 80 }, { "epoch": 0.04141104294478527, "grad_norm": 4.627592086791992, "learning_rate": 8.1e-05, "loss": 1.0735, "step": 81 }, { "epoch": 0.041922290388548056, "grad_norm": 4.50595760345459, "learning_rate": 8.2e-05, "loss": 0.9614, "step": 82 }, { "epoch": 0.04243353783231084, "grad_norm": 4.57030725479126, "learning_rate": 8.3e-05, "loss": 0.9109, "step": 83 }, { "epoch": 0.04294478527607362, "grad_norm": 5.381258487701416, "learning_rate": 8.4e-05, "loss": 1.1194, "step": 84 }, { "epoch": 0.0434560327198364, "grad_norm": 5.224101543426514, "learning_rate": 8.5e-05, "loss": 1.0022, "step": 85 }, { "epoch": 0.04396728016359918, "grad_norm": 4.670103073120117, "learning_rate": 8.6e-05, "loss": 0.9712, "step": 86 }, { "epoch": 0.04447852760736196, "grad_norm": 5.264283657073975, "learning_rate": 8.7e-05, "loss": 0.9767, "step": 87 }, { "epoch": 0.044989775051124746, "grad_norm": 4.800637245178223, "learning_rate": 8.800000000000001e-05, "loss": 0.8713, "step": 88 }, { "epoch": 0.04550102249488753, "grad_norm": 5.133697986602783, "learning_rate": 8.900000000000001e-05, "loss": 0.9959, "step": 89 }, { "epoch": 0.046012269938650305, "grad_norm": 5.831608295440674, "learning_rate": 9e-05, "loss": 0.9995, "step": 90 }, { "epoch": 0.04652351738241309, "grad_norm": 5.324717998504639, "learning_rate": 9.1e-05, "loss": 0.9143, "step": 91 }, { "epoch": 0.04703476482617587, "grad_norm": 6.4028401374816895, "learning_rate": 9.200000000000001e-05, "loss": 1.1914, "step": 92 }, { "epoch": 0.04754601226993865, "grad_norm": 6.07197904586792, "learning_rate": 9.300000000000001e-05, "loss": 0.9809, "step": 93 }, { "epoch": 0.04805725971370143, "grad_norm": 5.957538604736328, "learning_rate": 9.4e-05, "loss": 1.0473, "step": 94 }, { "epoch": 0.04856850715746421, "grad_norm": 5.8357343673706055, "learning_rate": 9.5e-05, "loss": 0.9465, "step": 95 }, { "epoch": 0.049079754601226995, "grad_norm": 5.610650062561035, "learning_rate": 9.6e-05, "loss": 0.7326, "step": 96 }, { "epoch": 0.04959100204498978, "grad_norm": 6.265199661254883, "learning_rate": 9.7e-05, "loss": 0.9174, "step": 97 }, { "epoch": 0.050102249488752554, "grad_norm": 7.403947353363037, "learning_rate": 9.8e-05, "loss": 0.8325, "step": 98 }, { "epoch": 0.05061349693251534, "grad_norm": 7.57144021987915, "learning_rate": 9.900000000000001e-05, "loss": 0.6444, "step": 99 }, { "epoch": 0.05112474437627812, "grad_norm": 10.456438064575195, "learning_rate": 0.0001, "loss": 0.6351, "step": 100 }, { "epoch": 0.0516359918200409, "grad_norm": 3.450286865234375, "learning_rate": 9.999992837185459e-05, "loss": 1.1081, "step": 101 }, { "epoch": 0.05214723926380368, "grad_norm": 3.6296398639678955, "learning_rate": 9.999971348762359e-05, "loss": 1.1465, "step": 102 }, { "epoch": 0.05265848670756646, "grad_norm": 3.699775457382202, "learning_rate": 9.999935534792264e-05, "loss": 1.19, "step": 103 }, { "epoch": 0.053169734151329244, "grad_norm": 3.4316041469573975, "learning_rate": 9.999885395377788e-05, "loss": 1.1055, "step": 104 }, { "epoch": 0.05368098159509203, "grad_norm": 3.6994450092315674, "learning_rate": 9.999820930662585e-05, "loss": 1.1867, "step": 105 }, { "epoch": 0.0541922290388548, "grad_norm": 3.6903419494628906, "learning_rate": 9.999742140831357e-05, "loss": 1.077, "step": 106 }, { "epoch": 0.054703476482617586, "grad_norm": 4.079493522644043, "learning_rate": 9.999649026109845e-05, "loss": 1.1008, "step": 107 }, { "epoch": 0.05521472392638037, "grad_norm": 3.9423539638519287, "learning_rate": 9.999541586764836e-05, "loss": 1.2268, "step": 108 }, { "epoch": 0.05572597137014315, "grad_norm": 4.003759860992432, "learning_rate": 9.999419823104155e-05, "loss": 1.0735, "step": 109 }, { "epoch": 0.05623721881390593, "grad_norm": 3.989661693572998, "learning_rate": 9.999283735476672e-05, "loss": 1.0735, "step": 110 }, { "epoch": 0.05674846625766871, "grad_norm": 3.9586427211761475, "learning_rate": 9.999133324272294e-05, "loss": 1.0708, "step": 111 }, { "epoch": 0.05725971370143149, "grad_norm": 4.020519733428955, "learning_rate": 9.998968589921969e-05, "loss": 1.0245, "step": 112 }, { "epoch": 0.057770961145194276, "grad_norm": 4.426997184753418, "learning_rate": 9.998789532897681e-05, "loss": 1.0745, "step": 113 }, { "epoch": 0.05828220858895705, "grad_norm": 3.993692636489868, "learning_rate": 9.998596153712451e-05, "loss": 1.0767, "step": 114 }, { "epoch": 0.058793456032719835, "grad_norm": 4.429073333740234, "learning_rate": 9.998388452920334e-05, "loss": 1.017, "step": 115 }, { "epoch": 0.05930470347648262, "grad_norm": 3.983215093612671, "learning_rate": 9.99816643111642e-05, "loss": 1.021, "step": 116 }, { "epoch": 0.0598159509202454, "grad_norm": 4.212836742401123, "learning_rate": 9.997930088936828e-05, "loss": 0.982, "step": 117 }, { "epoch": 0.06032719836400818, "grad_norm": 4.086669921875, "learning_rate": 9.997679427058713e-05, "loss": 1.0085, "step": 118 }, { "epoch": 0.06083844580777096, "grad_norm": 3.8549580574035645, "learning_rate": 9.997414446200246e-05, "loss": 0.9703, "step": 119 }, { "epoch": 0.06134969325153374, "grad_norm": 4.432587623596191, "learning_rate": 9.997135147120633e-05, "loss": 1.0989, "step": 120 }, { "epoch": 0.061860940695296525, "grad_norm": 4.559906959533691, "learning_rate": 9.996841530620103e-05, "loss": 0.9276, "step": 121 }, { "epoch": 0.06237218813905931, "grad_norm": 4.4931721687316895, "learning_rate": 9.996533597539901e-05, "loss": 0.9899, "step": 122 }, { "epoch": 0.06288343558282208, "grad_norm": 4.057201862335205, "learning_rate": 9.996211348762296e-05, "loss": 1.0011, "step": 123 }, { "epoch": 0.06339468302658487, "grad_norm": 4.463088512420654, "learning_rate": 9.995874785210573e-05, "loss": 0.9554, "step": 124 }, { "epoch": 0.06390593047034765, "grad_norm": 4.215217590332031, "learning_rate": 9.995523907849024e-05, "loss": 0.9568, "step": 125 }, { "epoch": 0.06441717791411043, "grad_norm": 4.900539875030518, "learning_rate": 9.995158717682963e-05, "loss": 0.9864, "step": 126 }, { "epoch": 0.06492842535787322, "grad_norm": 4.836815357208252, "learning_rate": 9.9947792157587e-05, "loss": 0.9749, "step": 127 }, { "epoch": 0.065439672801636, "grad_norm": 4.402955055236816, "learning_rate": 9.99438540316356e-05, "loss": 0.9626, "step": 128 }, { "epoch": 0.06595092024539877, "grad_norm": 4.344229698181152, "learning_rate": 9.993977281025862e-05, "loss": 0.9272, "step": 129 }, { "epoch": 0.06646216768916155, "grad_norm": 4.652369976043701, "learning_rate": 9.99355485051493e-05, "loss": 1.0139, "step": 130 }, { "epoch": 0.06697341513292433, "grad_norm": 4.622213840484619, "learning_rate": 9.99311811284108e-05, "loss": 0.9265, "step": 131 }, { "epoch": 0.06748466257668712, "grad_norm": 5.247522830963135, "learning_rate": 9.992667069255619e-05, "loss": 1.0134, "step": 132 }, { "epoch": 0.0679959100204499, "grad_norm": 4.564031600952148, "learning_rate": 9.992201721050847e-05, "loss": 0.9342, "step": 133 }, { "epoch": 0.06850715746421268, "grad_norm": 4.3959245681762695, "learning_rate": 9.991722069560041e-05, "loss": 0.9452, "step": 134 }, { "epoch": 0.06901840490797546, "grad_norm": 4.695987701416016, "learning_rate": 9.991228116157466e-05, "loss": 0.9721, "step": 135 }, { "epoch": 0.06952965235173825, "grad_norm": 4.595483303070068, "learning_rate": 9.990719862258358e-05, "loss": 0.9288, "step": 136 }, { "epoch": 0.07004089979550102, "grad_norm": 4.6917009353637695, "learning_rate": 9.99019730931893e-05, "loss": 0.8785, "step": 137 }, { "epoch": 0.0705521472392638, "grad_norm": 6.002026081085205, "learning_rate": 9.98966045883636e-05, "loss": 0.9841, "step": 138 }, { "epoch": 0.07106339468302658, "grad_norm": 5.35076379776001, "learning_rate": 9.989109312348796e-05, "loss": 1.0745, "step": 139 }, { "epoch": 0.07157464212678936, "grad_norm": 5.220756530761719, "learning_rate": 9.98854387143534e-05, "loss": 0.9308, "step": 140 }, { "epoch": 0.07208588957055215, "grad_norm": 5.255858421325684, "learning_rate": 9.98796413771605e-05, "loss": 0.9596, "step": 141 }, { "epoch": 0.07259713701431493, "grad_norm": 5.472203254699707, "learning_rate": 9.987370112851939e-05, "loss": 0.9245, "step": 142 }, { "epoch": 0.07310838445807771, "grad_norm": 5.4891252517700195, "learning_rate": 9.98676179854496e-05, "loss": 0.9549, "step": 143 }, { "epoch": 0.0736196319018405, "grad_norm": 5.602303504943848, "learning_rate": 9.986139196538011e-05, "loss": 0.9336, "step": 144 }, { "epoch": 0.07413087934560328, "grad_norm": 6.35874080657959, "learning_rate": 9.985502308614927e-05, "loss": 1.0135, "step": 145 }, { "epoch": 0.07464212678936605, "grad_norm": 6.352203369140625, "learning_rate": 9.984851136600469e-05, "loss": 0.8946, "step": 146 }, { "epoch": 0.07515337423312883, "grad_norm": 6.650132179260254, "learning_rate": 9.984185682360327e-05, "loss": 0.943, "step": 147 }, { "epoch": 0.07566462167689161, "grad_norm": 7.255720138549805, "learning_rate": 9.983505947801115e-05, "loss": 0.8747, "step": 148 }, { "epoch": 0.0761758691206544, "grad_norm": 7.385144233703613, "learning_rate": 9.982811934870353e-05, "loss": 0.7153, "step": 149 }, { "epoch": 0.07668711656441718, "grad_norm": 11.432644844055176, "learning_rate": 9.982103645556478e-05, "loss": 1.1386, "step": 150 }, { "epoch": 0.07719836400817996, "grad_norm": 3.1650919914245605, "learning_rate": 9.981381081888828e-05, "loss": 1.0661, "step": 151 }, { "epoch": 0.07770961145194274, "grad_norm": 3.7646584510803223, "learning_rate": 9.980644245937639e-05, "loss": 1.1944, "step": 152 }, { "epoch": 0.07822085889570553, "grad_norm": 3.43169903755188, "learning_rate": 9.979893139814039e-05, "loss": 1.1582, "step": 153 }, { "epoch": 0.0787321063394683, "grad_norm": 3.5647740364074707, "learning_rate": 9.979127765670039e-05, "loss": 1.1398, "step": 154 }, { "epoch": 0.07924335378323108, "grad_norm": 3.8447012901306152, "learning_rate": 9.978348125698534e-05, "loss": 1.1126, "step": 155 }, { "epoch": 0.07975460122699386, "grad_norm": 3.9558143615722656, "learning_rate": 9.977554222133292e-05, "loss": 1.1155, "step": 156 }, { "epoch": 0.08026584867075665, "grad_norm": 3.363036870956421, "learning_rate": 9.976746057248943e-05, "loss": 0.9753, "step": 157 }, { "epoch": 0.08077709611451943, "grad_norm": 4.080865383148193, "learning_rate": 9.975923633360985e-05, "loss": 1.1136, "step": 158 }, { "epoch": 0.08128834355828221, "grad_norm": 3.7571935653686523, "learning_rate": 9.975086952825762e-05, "loss": 1.1015, "step": 159 }, { "epoch": 0.081799591002045, "grad_norm": 3.6946327686309814, "learning_rate": 9.974236018040474e-05, "loss": 1.0684, "step": 160 }, { "epoch": 0.08231083844580778, "grad_norm": 4.085092544555664, "learning_rate": 9.973370831443152e-05, "loss": 1.082, "step": 161 }, { "epoch": 0.08282208588957055, "grad_norm": 4.023382186889648, "learning_rate": 9.972491395512666e-05, "loss": 1.059, "step": 162 }, { "epoch": 0.08333333333333333, "grad_norm": 4.119484901428223, "learning_rate": 9.97159771276871e-05, "loss": 0.996, "step": 163 }, { "epoch": 0.08384458077709611, "grad_norm": 3.6977996826171875, "learning_rate": 9.970689785771798e-05, "loss": 1.0283, "step": 164 }, { "epoch": 0.0843558282208589, "grad_norm": 3.9231314659118652, "learning_rate": 9.969767617123256e-05, "loss": 1.0912, "step": 165 }, { "epoch": 0.08486707566462168, "grad_norm": 3.965254306793213, "learning_rate": 9.968831209465211e-05, "loss": 1.0856, "step": 166 }, { "epoch": 0.08537832310838446, "grad_norm": 3.6064834594726562, "learning_rate": 9.967880565480589e-05, "loss": 0.9545, "step": 167 }, { "epoch": 0.08588957055214724, "grad_norm": 4.335506439208984, "learning_rate": 9.966915687893108e-05, "loss": 1.0064, "step": 168 }, { "epoch": 0.08640081799591003, "grad_norm": 4.090417385101318, "learning_rate": 9.96593657946726e-05, "loss": 0.9032, "step": 169 }, { "epoch": 0.0869120654396728, "grad_norm": 4.244670391082764, "learning_rate": 9.964943243008315e-05, "loss": 0.9544, "step": 170 }, { "epoch": 0.08742331288343558, "grad_norm": 4.027634143829346, "learning_rate": 9.963935681362308e-05, "loss": 1.0019, "step": 171 }, { "epoch": 0.08793456032719836, "grad_norm": 4.118989944458008, "learning_rate": 9.962913897416028e-05, "loss": 0.9132, "step": 172 }, { "epoch": 0.08844580777096114, "grad_norm": 4.275842666625977, "learning_rate": 9.961877894097016e-05, "loss": 0.9875, "step": 173 }, { "epoch": 0.08895705521472393, "grad_norm": 4.40177059173584, "learning_rate": 9.96082767437355e-05, "loss": 1.0284, "step": 174 }, { "epoch": 0.08946830265848671, "grad_norm": 4.55718994140625, "learning_rate": 9.959763241254645e-05, "loss": 1.0491, "step": 175 }, { "epoch": 0.08997955010224949, "grad_norm": 3.950263261795044, "learning_rate": 9.958684597790031e-05, "loss": 0.8668, "step": 176 }, { "epoch": 0.09049079754601227, "grad_norm": 4.344690322875977, "learning_rate": 9.957591747070163e-05, "loss": 0.9561, "step": 177 }, { "epoch": 0.09100204498977506, "grad_norm": 4.7496232986450195, "learning_rate": 9.95648469222619e-05, "loss": 1.2058, "step": 178 }, { "epoch": 0.09151329243353783, "grad_norm": 4.409940242767334, "learning_rate": 9.955363436429968e-05, "loss": 1.0548, "step": 179 }, { "epoch": 0.09202453987730061, "grad_norm": 4.629715919494629, "learning_rate": 9.954227982894034e-05, "loss": 0.9482, "step": 180 }, { "epoch": 0.09253578732106339, "grad_norm": 4.308309078216553, "learning_rate": 9.953078334871606e-05, "loss": 0.9435, "step": 181 }, { "epoch": 0.09304703476482618, "grad_norm": 4.249781131744385, "learning_rate": 9.951914495656569e-05, "loss": 0.9578, "step": 182 }, { "epoch": 0.09355828220858896, "grad_norm": 4.906545162200928, "learning_rate": 9.950736468583468e-05, "loss": 1.0188, "step": 183 }, { "epoch": 0.09406952965235174, "grad_norm": 5.413058280944824, "learning_rate": 9.949544257027502e-05, "loss": 1.1337, "step": 184 }, { "epoch": 0.09458077709611452, "grad_norm": 4.498751640319824, "learning_rate": 9.948337864404504e-05, "loss": 0.9442, "step": 185 }, { "epoch": 0.0950920245398773, "grad_norm": 4.727104187011719, "learning_rate": 9.947117294170944e-05, "loss": 1.0067, "step": 186 }, { "epoch": 0.09560327198364008, "grad_norm": 4.974936008453369, "learning_rate": 9.945882549823906e-05, "loss": 1.0294, "step": 187 }, { "epoch": 0.09611451942740286, "grad_norm": 5.312014102935791, "learning_rate": 9.944633634901088e-05, "loss": 0.9887, "step": 188 }, { "epoch": 0.09662576687116564, "grad_norm": 5.3448405265808105, "learning_rate": 9.943370552980791e-05, "loss": 0.9342, "step": 189 }, { "epoch": 0.09713701431492842, "grad_norm": 5.098217964172363, "learning_rate": 9.942093307681901e-05, "loss": 0.9, "step": 190 }, { "epoch": 0.09764826175869121, "grad_norm": 5.733004570007324, "learning_rate": 9.940801902663889e-05, "loss": 1.0274, "step": 191 }, { "epoch": 0.09815950920245399, "grad_norm": 5.353964328765869, "learning_rate": 9.939496341626791e-05, "loss": 1.0018, "step": 192 }, { "epoch": 0.09867075664621677, "grad_norm": 5.414928913116455, "learning_rate": 9.938176628311204e-05, "loss": 0.8237, "step": 193 }, { "epoch": 0.09918200408997956, "grad_norm": 5.456855297088623, "learning_rate": 9.936842766498273e-05, "loss": 0.9474, "step": 194 }, { "epoch": 0.09969325153374232, "grad_norm": 6.7383856773376465, "learning_rate": 9.935494760009678e-05, "loss": 1.0278, "step": 195 }, { "epoch": 0.10020449897750511, "grad_norm": 6.588730812072754, "learning_rate": 9.934132612707632e-05, "loss": 1.0217, "step": 196 }, { "epoch": 0.10071574642126789, "grad_norm": 7.4416093826293945, "learning_rate": 9.932756328494852e-05, "loss": 0.9317, "step": 197 }, { "epoch": 0.10122699386503067, "grad_norm": 7.225059986114502, "learning_rate": 9.931365911314572e-05, "loss": 0.8769, "step": 198 }, { "epoch": 0.10173824130879346, "grad_norm": 7.279494285583496, "learning_rate": 9.929961365150506e-05, "loss": 0.6817, "step": 199 }, { "epoch": 0.10224948875255624, "grad_norm": 10.894880294799805, "learning_rate": 9.928542694026862e-05, "loss": 0.9749, "step": 200 }, { "epoch": 0.10276073619631902, "grad_norm": 3.4081413745880127, "learning_rate": 9.927109902008303e-05, "loss": 0.9225, "step": 201 }, { "epoch": 0.1032719836400818, "grad_norm": 4.234488010406494, "learning_rate": 9.925662993199967e-05, "loss": 1.1298, "step": 202 }, { "epoch": 0.10378323108384459, "grad_norm": 3.798875093460083, "learning_rate": 9.924201971747423e-05, "loss": 1.0496, "step": 203 }, { "epoch": 0.10429447852760736, "grad_norm": 3.6677727699279785, "learning_rate": 9.922726841836684e-05, "loss": 1.1259, "step": 204 }, { "epoch": 0.10480572597137014, "grad_norm": 3.7715201377868652, "learning_rate": 9.921237607694184e-05, "loss": 1.105, "step": 205 }, { "epoch": 0.10531697341513292, "grad_norm": 3.534221649169922, "learning_rate": 9.919734273586767e-05, "loss": 1.0223, "step": 206 }, { "epoch": 0.1058282208588957, "grad_norm": 3.9099631309509277, "learning_rate": 9.91821684382167e-05, "loss": 1.0874, "step": 207 }, { "epoch": 0.10633946830265849, "grad_norm": 3.318946361541748, "learning_rate": 9.916685322746524e-05, "loss": 0.9967, "step": 208 }, { "epoch": 0.10685071574642127, "grad_norm": 4.089531421661377, "learning_rate": 9.915139714749328e-05, "loss": 0.973, "step": 209 }, { "epoch": 0.10736196319018405, "grad_norm": 3.753154993057251, "learning_rate": 9.913580024258442e-05, "loss": 1.0184, "step": 210 }, { "epoch": 0.10787321063394684, "grad_norm": 4.053317070007324, "learning_rate": 9.912006255742579e-05, "loss": 1.1166, "step": 211 }, { "epoch": 0.1083844580777096, "grad_norm": 3.554300308227539, "learning_rate": 9.91041841371078e-05, "loss": 0.9794, "step": 212 }, { "epoch": 0.10889570552147239, "grad_norm": 3.9923880100250244, "learning_rate": 9.908816502712415e-05, "loss": 1.1504, "step": 213 }, { "epoch": 0.10940695296523517, "grad_norm": 3.900874137878418, "learning_rate": 9.90720052733716e-05, "loss": 1.0554, "step": 214 }, { "epoch": 0.10991820040899795, "grad_norm": 3.914747714996338, "learning_rate": 9.905570492214987e-05, "loss": 1.0801, "step": 215 }, { "epoch": 0.11042944785276074, "grad_norm": 4.028392791748047, "learning_rate": 9.903926402016153e-05, "loss": 1.0332, "step": 216 }, { "epoch": 0.11094069529652352, "grad_norm": 4.589059829711914, "learning_rate": 9.902268261451182e-05, "loss": 1.0179, "step": 217 }, { "epoch": 0.1114519427402863, "grad_norm": 4.278644561767578, "learning_rate": 9.900596075270856e-05, "loss": 1.0743, "step": 218 }, { "epoch": 0.11196319018404909, "grad_norm": 4.3040008544921875, "learning_rate": 9.898909848266196e-05, "loss": 0.9975, "step": 219 }, { "epoch": 0.11247443762781185, "grad_norm": 3.8854451179504395, "learning_rate": 9.897209585268458e-05, "loss": 1.003, "step": 220 }, { "epoch": 0.11298568507157464, "grad_norm": 4.615149021148682, "learning_rate": 9.89549529114911e-05, "loss": 0.9241, "step": 221 }, { "epoch": 0.11349693251533742, "grad_norm": 4.247766494750977, "learning_rate": 9.893766970819819e-05, "loss": 0.9801, "step": 222 }, { "epoch": 0.1140081799591002, "grad_norm": 4.650023460388184, "learning_rate": 9.89202462923244e-05, "loss": 1.0031, "step": 223 }, { "epoch": 0.11451942740286299, "grad_norm": 4.242456912994385, "learning_rate": 9.890268271379e-05, "loss": 1.0527, "step": 224 }, { "epoch": 0.11503067484662577, "grad_norm": 4.012096881866455, "learning_rate": 9.888497902291685e-05, "loss": 0.9761, "step": 225 }, { "epoch": 0.11554192229038855, "grad_norm": 4.403133392333984, "learning_rate": 9.886713527042828e-05, "loss": 0.9944, "step": 226 }, { "epoch": 0.11605316973415133, "grad_norm": 4.339402675628662, "learning_rate": 9.884915150744885e-05, "loss": 1.0336, "step": 227 }, { "epoch": 0.1165644171779141, "grad_norm": 5.434019088745117, "learning_rate": 9.883102778550434e-05, "loss": 1.0191, "step": 228 }, { "epoch": 0.11707566462167689, "grad_norm": 4.682615280151367, "learning_rate": 9.881276415652148e-05, "loss": 0.9405, "step": 229 }, { "epoch": 0.11758691206543967, "grad_norm": 4.088139057159424, "learning_rate": 9.879436067282783e-05, "loss": 0.8859, "step": 230 }, { "epoch": 0.11809815950920245, "grad_norm": 4.25062370300293, "learning_rate": 9.877581738715174e-05, "loss": 0.9826, "step": 231 }, { "epoch": 0.11860940695296524, "grad_norm": 4.560945510864258, "learning_rate": 9.875713435262203e-05, "loss": 0.8834, "step": 232 }, { "epoch": 0.11912065439672802, "grad_norm": 4.465799331665039, "learning_rate": 9.873831162276796e-05, "loss": 0.9343, "step": 233 }, { "epoch": 0.1196319018404908, "grad_norm": 4.841387748718262, "learning_rate": 9.871934925151898e-05, "loss": 0.9551, "step": 234 }, { "epoch": 0.12014314928425358, "grad_norm": 4.5967326164245605, "learning_rate": 9.870024729320471e-05, "loss": 0.9566, "step": 235 }, { "epoch": 0.12065439672801637, "grad_norm": 5.284646987915039, "learning_rate": 9.868100580255466e-05, "loss": 0.9958, "step": 236 }, { "epoch": 0.12116564417177914, "grad_norm": 5.193093299865723, "learning_rate": 9.86616248346981e-05, "loss": 1.0898, "step": 237 }, { "epoch": 0.12167689161554192, "grad_norm": 5.222631931304932, "learning_rate": 9.864210444516395e-05, "loss": 0.9452, "step": 238 }, { "epoch": 0.1221881390593047, "grad_norm": 5.9728569984436035, "learning_rate": 9.86224446898806e-05, "loss": 1.0229, "step": 239 }, { "epoch": 0.12269938650306748, "grad_norm": 4.982256889343262, "learning_rate": 9.86026456251757e-05, "loss": 1.0025, "step": 240 }, { "epoch": 0.12321063394683027, "grad_norm": 5.724108695983887, "learning_rate": 9.858270730777608e-05, "loss": 1.0477, "step": 241 }, { "epoch": 0.12372188139059305, "grad_norm": 5.632500648498535, "learning_rate": 9.85626297948075e-05, "loss": 0.927, "step": 242 }, { "epoch": 0.12423312883435583, "grad_norm": 5.947004795074463, "learning_rate": 9.854241314379457e-05, "loss": 1.0072, "step": 243 }, { "epoch": 0.12474437627811862, "grad_norm": 5.381914138793945, "learning_rate": 9.852205741266058e-05, "loss": 0.8549, "step": 244 }, { "epoch": 0.1252556237218814, "grad_norm": 6.360842227935791, "learning_rate": 9.850156265972721e-05, "loss": 0.911, "step": 245 }, { "epoch": 0.12576687116564417, "grad_norm": 6.210946083068848, "learning_rate": 9.848092894371452e-05, "loss": 0.8747, "step": 246 }, { "epoch": 0.12627811860940696, "grad_norm": 7.40360689163208, "learning_rate": 9.84601563237407e-05, "loss": 1.0297, "step": 247 }, { "epoch": 0.12678936605316973, "grad_norm": 6.7166032791137695, "learning_rate": 9.843924485932194e-05, "loss": 0.791, "step": 248 }, { "epoch": 0.1273006134969325, "grad_norm": 8.442200660705566, "learning_rate": 9.841819461037219e-05, "loss": 0.9056, "step": 249 }, { "epoch": 0.1278118609406953, "grad_norm": 10.764420509338379, "learning_rate": 9.839700563720309e-05, "loss": 0.9064, "step": 250 }, { "epoch": 0.12832310838445807, "grad_norm": 2.9699931144714355, "learning_rate": 9.837567800052368e-05, "loss": 0.909, "step": 251 }, { "epoch": 0.12883435582822086, "grad_norm": 3.270521640777588, "learning_rate": 9.835421176144035e-05, "loss": 1.0767, "step": 252 }, { "epoch": 0.12934560327198363, "grad_norm": 3.5505306720733643, "learning_rate": 9.833260698145656e-05, "loss": 1.102, "step": 253 }, { "epoch": 0.12985685071574643, "grad_norm": 3.706258535385132, "learning_rate": 9.831086372247272e-05, "loss": 1.0359, "step": 254 }, { "epoch": 0.1303680981595092, "grad_norm": 3.501894950866699, "learning_rate": 9.828898204678603e-05, "loss": 1.0805, "step": 255 }, { "epoch": 0.130879345603272, "grad_norm": 3.5987539291381836, "learning_rate": 9.826696201709021e-05, "loss": 1.0488, "step": 256 }, { "epoch": 0.13139059304703476, "grad_norm": 3.721916437149048, "learning_rate": 9.824480369647544e-05, "loss": 1.0055, "step": 257 }, { "epoch": 0.13190184049079753, "grad_norm": 3.430462121963501, "learning_rate": 9.82225071484281e-05, "loss": 0.9892, "step": 258 }, { "epoch": 0.13241308793456033, "grad_norm": 3.6975953578948975, "learning_rate": 9.820007243683057e-05, "loss": 1.0917, "step": 259 }, { "epoch": 0.1329243353783231, "grad_norm": 3.9236834049224854, "learning_rate": 9.817749962596115e-05, "loss": 1.0534, "step": 260 }, { "epoch": 0.1334355828220859, "grad_norm": 3.7525970935821533, "learning_rate": 9.815478878049378e-05, "loss": 0.9877, "step": 261 }, { "epoch": 0.13394683026584867, "grad_norm": 3.929924488067627, "learning_rate": 9.813193996549789e-05, "loss": 1.0387, "step": 262 }, { "epoch": 0.13445807770961146, "grad_norm": 3.305387496948242, "learning_rate": 9.810895324643821e-05, "loss": 0.8865, "step": 263 }, { "epoch": 0.13496932515337423, "grad_norm": 4.399075984954834, "learning_rate": 9.808582868917458e-05, "loss": 1.048, "step": 264 }, { "epoch": 0.13548057259713703, "grad_norm": 3.7269322872161865, "learning_rate": 9.806256635996175e-05, "loss": 1.0156, "step": 265 }, { "epoch": 0.1359918200408998, "grad_norm": 3.96358323097229, "learning_rate": 9.803916632544924e-05, "loss": 1.0946, "step": 266 }, { "epoch": 0.13650306748466257, "grad_norm": 3.7565560340881348, "learning_rate": 9.801562865268109e-05, "loss": 0.8618, "step": 267 }, { "epoch": 0.13701431492842536, "grad_norm": 3.869610548019409, "learning_rate": 9.79919534090957e-05, "loss": 1.0215, "step": 268 }, { "epoch": 0.13752556237218813, "grad_norm": 4.0557403564453125, "learning_rate": 9.79681406625256e-05, "loss": 1.019, "step": 269 }, { "epoch": 0.13803680981595093, "grad_norm": 4.523059368133545, "learning_rate": 9.794419048119733e-05, "loss": 0.9854, "step": 270 }, { "epoch": 0.1385480572597137, "grad_norm": 4.085573673248291, "learning_rate": 9.792010293373116e-05, "loss": 0.9711, "step": 271 }, { "epoch": 0.1390593047034765, "grad_norm": 4.123945713043213, "learning_rate": 9.789587808914093e-05, "loss": 0.9732, "step": 272 }, { "epoch": 0.13957055214723926, "grad_norm": 4.804892063140869, "learning_rate": 9.78715160168339e-05, "loss": 0.9582, "step": 273 }, { "epoch": 0.14008179959100203, "grad_norm": 4.866881370544434, "learning_rate": 9.784701678661045e-05, "loss": 0.9736, "step": 274 }, { "epoch": 0.14059304703476483, "grad_norm": 4.524308681488037, "learning_rate": 9.782238046866397e-05, "loss": 0.9397, "step": 275 }, { "epoch": 0.1411042944785276, "grad_norm": 4.348950386047363, "learning_rate": 9.779760713358059e-05, "loss": 1.0025, "step": 276 }, { "epoch": 0.1416155419222904, "grad_norm": 4.245871543884277, "learning_rate": 9.777269685233906e-05, "loss": 1.002, "step": 277 }, { "epoch": 0.14212678936605316, "grad_norm": 5.086630344390869, "learning_rate": 9.774764969631043e-05, "loss": 0.9445, "step": 278 }, { "epoch": 0.14263803680981596, "grad_norm": 4.560818195343018, "learning_rate": 9.772246573725799e-05, "loss": 0.9957, "step": 279 }, { "epoch": 0.14314928425357873, "grad_norm": 5.210485458374023, "learning_rate": 9.769714504733694e-05, "loss": 0.903, "step": 280 }, { "epoch": 0.14366053169734153, "grad_norm": 4.558165550231934, "learning_rate": 9.767168769909425e-05, "loss": 1.0033, "step": 281 }, { "epoch": 0.1441717791411043, "grad_norm": 4.681556701660156, "learning_rate": 9.76460937654684e-05, "loss": 0.9035, "step": 282 }, { "epoch": 0.14468302658486706, "grad_norm": 4.702577114105225, "learning_rate": 9.762036331978927e-05, "loss": 1.0228, "step": 283 }, { "epoch": 0.14519427402862986, "grad_norm": 5.256160259246826, "learning_rate": 9.759449643577778e-05, "loss": 1.0509, "step": 284 }, { "epoch": 0.14570552147239263, "grad_norm": 4.62410831451416, "learning_rate": 9.756849318754584e-05, "loss": 0.9495, "step": 285 }, { "epoch": 0.14621676891615543, "grad_norm": 4.917038917541504, "learning_rate": 9.754235364959601e-05, "loss": 1.0343, "step": 286 }, { "epoch": 0.1467280163599182, "grad_norm": 5.558690547943115, "learning_rate": 9.751607789682138e-05, "loss": 0.9006, "step": 287 }, { "epoch": 0.147239263803681, "grad_norm": 5.684017658233643, "learning_rate": 9.748966600450525e-05, "loss": 0.955, "step": 288 }, { "epoch": 0.14775051124744376, "grad_norm": 5.3489508628845215, "learning_rate": 9.746311804832105e-05, "loss": 0.8096, "step": 289 }, { "epoch": 0.14826175869120656, "grad_norm": 5.76107931137085, "learning_rate": 9.743643410433197e-05, "loss": 1.0569, "step": 290 }, { "epoch": 0.14877300613496933, "grad_norm": 5.042691707611084, "learning_rate": 9.740961424899092e-05, "loss": 0.8595, "step": 291 }, { "epoch": 0.1492842535787321, "grad_norm": 5.10003137588501, "learning_rate": 9.738265855914013e-05, "loss": 0.8284, "step": 292 }, { "epoch": 0.1497955010224949, "grad_norm": 5.545564651489258, "learning_rate": 9.735556711201103e-05, "loss": 0.9223, "step": 293 }, { "epoch": 0.15030674846625766, "grad_norm": 5.7029337882995605, "learning_rate": 9.732833998522407e-05, "loss": 0.8667, "step": 294 }, { "epoch": 0.15081799591002046, "grad_norm": 6.165698528289795, "learning_rate": 9.730097725678834e-05, "loss": 0.912, "step": 295 }, { "epoch": 0.15132924335378323, "grad_norm": 6.472256183624268, "learning_rate": 9.727347900510155e-05, "loss": 0.9143, "step": 296 }, { "epoch": 0.15184049079754602, "grad_norm": 6.160712718963623, "learning_rate": 9.724584530894962e-05, "loss": 0.8261, "step": 297 }, { "epoch": 0.1523517382413088, "grad_norm": 7.447754859924316, "learning_rate": 9.721807624750658e-05, "loss": 0.996, "step": 298 }, { "epoch": 0.15286298568507156, "grad_norm": 7.772396087646484, "learning_rate": 9.719017190033425e-05, "loss": 0.7686, "step": 299 }, { "epoch": 0.15337423312883436, "grad_norm": 8.158390045166016, "learning_rate": 9.716213234738215e-05, "loss": 0.5601, "step": 300 }, { "epoch": 0.15388548057259713, "grad_norm": 3.221916437149048, "learning_rate": 9.71339576689871e-05, "loss": 0.8234, "step": 301 }, { "epoch": 0.15439672801635992, "grad_norm": 3.221381187438965, "learning_rate": 9.710564794587309e-05, "loss": 1.1071, "step": 302 }, { "epoch": 0.1549079754601227, "grad_norm": 3.519752025604248, "learning_rate": 9.707720325915104e-05, "loss": 1.0924, "step": 303 }, { "epoch": 0.1554192229038855, "grad_norm": 3.5323832035064697, "learning_rate": 9.704862369031857e-05, "loss": 1.034, "step": 304 }, { "epoch": 0.15593047034764826, "grad_norm": 3.57119083404541, "learning_rate": 9.701990932125976e-05, "loss": 1.1247, "step": 305 }, { "epoch": 0.15644171779141106, "grad_norm": 3.4789652824401855, "learning_rate": 9.699106023424482e-05, "loss": 1.1318, "step": 306 }, { "epoch": 0.15695296523517382, "grad_norm": 3.551086664199829, "learning_rate": 9.696207651193008e-05, "loss": 1.0059, "step": 307 }, { "epoch": 0.1574642126789366, "grad_norm": 3.465805768966675, "learning_rate": 9.693295823735753e-05, "loss": 1.1119, "step": 308 }, { "epoch": 0.1579754601226994, "grad_norm": 4.0047807693481445, "learning_rate": 9.690370549395468e-05, "loss": 0.9925, "step": 309 }, { "epoch": 0.15848670756646216, "grad_norm": 3.724668502807617, "learning_rate": 9.687431836553434e-05, "loss": 1.0408, "step": 310 }, { "epoch": 0.15899795501022496, "grad_norm": 4.307526111602783, "learning_rate": 9.684479693629432e-05, "loss": 1.162, "step": 311 }, { "epoch": 0.15950920245398773, "grad_norm": 4.771853446960449, "learning_rate": 9.681514129081724e-05, "loss": 1.0714, "step": 312 }, { "epoch": 0.16002044989775052, "grad_norm": 3.7539167404174805, "learning_rate": 9.678535151407023e-05, "loss": 0.9453, "step": 313 }, { "epoch": 0.1605316973415133, "grad_norm": 4.115946292877197, "learning_rate": 9.675542769140476e-05, "loss": 1.1075, "step": 314 }, { "epoch": 0.16104294478527606, "grad_norm": 3.835968255996704, "learning_rate": 9.672536990855635e-05, "loss": 0.93, "step": 315 }, { "epoch": 0.16155419222903886, "grad_norm": 4.084163665771484, "learning_rate": 9.669517825164434e-05, "loss": 1.0297, "step": 316 }, { "epoch": 0.16206543967280163, "grad_norm": 5.519273281097412, "learning_rate": 9.666485280717161e-05, "loss": 0.9544, "step": 317 }, { "epoch": 0.16257668711656442, "grad_norm": 4.394032955169678, "learning_rate": 9.663439366202438e-05, "loss": 0.9425, "step": 318 }, { "epoch": 0.1630879345603272, "grad_norm": 4.335214138031006, "learning_rate": 9.660380090347192e-05, "loss": 0.9202, "step": 319 }, { "epoch": 0.16359918200409, "grad_norm": 3.6991333961486816, "learning_rate": 9.657307461916635e-05, "loss": 0.9667, "step": 320 }, { "epoch": 0.16411042944785276, "grad_norm": 4.654758453369141, "learning_rate": 9.654221489714234e-05, "loss": 1.0158, "step": 321 }, { "epoch": 0.16462167689161555, "grad_norm": 4.374093055725098, "learning_rate": 9.651122182581688e-05, "loss": 1.0641, "step": 322 }, { "epoch": 0.16513292433537832, "grad_norm": 4.553246974945068, "learning_rate": 9.6480095493989e-05, "loss": 0.9888, "step": 323 }, { "epoch": 0.1656441717791411, "grad_norm": 3.8664913177490234, "learning_rate": 9.644883599083958e-05, "loss": 0.89, "step": 324 }, { "epoch": 0.1661554192229039, "grad_norm": 4.2256760597229, "learning_rate": 9.6417443405931e-05, "loss": 0.9653, "step": 325 }, { "epoch": 0.16666666666666666, "grad_norm": 4.243971347808838, "learning_rate": 9.638591782920698e-05, "loss": 0.9588, "step": 326 }, { "epoch": 0.16717791411042945, "grad_norm": 4.24473762512207, "learning_rate": 9.635425935099228e-05, "loss": 0.9181, "step": 327 }, { "epoch": 0.16768916155419222, "grad_norm": 4.342863082885742, "learning_rate": 9.632246806199241e-05, "loss": 0.9603, "step": 328 }, { "epoch": 0.16820040899795502, "grad_norm": 4.400531768798828, "learning_rate": 9.62905440532934e-05, "loss": 0.9983, "step": 329 }, { "epoch": 0.1687116564417178, "grad_norm": 4.428309440612793, "learning_rate": 9.625848741636157e-05, "loss": 0.8637, "step": 330 }, { "epoch": 0.16922290388548059, "grad_norm": 4.119474411010742, "learning_rate": 9.62262982430432e-05, "loss": 0.8998, "step": 331 }, { "epoch": 0.16973415132924335, "grad_norm": 4.993500709533691, "learning_rate": 9.619397662556435e-05, "loss": 0.9449, "step": 332 }, { "epoch": 0.17024539877300612, "grad_norm": 4.711348056793213, "learning_rate": 9.616152265653048e-05, "loss": 1.0019, "step": 333 }, { "epoch": 0.17075664621676892, "grad_norm": 4.277347564697266, "learning_rate": 9.612893642892634e-05, "loss": 0.9081, "step": 334 }, { "epoch": 0.1712678936605317, "grad_norm": 4.902201175689697, "learning_rate": 9.609621803611555e-05, "loss": 0.9769, "step": 335 }, { "epoch": 0.17177914110429449, "grad_norm": 4.935635566711426, "learning_rate": 9.60633675718404e-05, "loss": 1.0006, "step": 336 }, { "epoch": 0.17229038854805726, "grad_norm": 4.689595699310303, "learning_rate": 9.603038513022165e-05, "loss": 0.9397, "step": 337 }, { "epoch": 0.17280163599182005, "grad_norm": 5.0966362953186035, "learning_rate": 9.59972708057581e-05, "loss": 0.9297, "step": 338 }, { "epoch": 0.17331288343558282, "grad_norm": 4.845507621765137, "learning_rate": 9.596402469332648e-05, "loss": 0.9348, "step": 339 }, { "epoch": 0.1738241308793456, "grad_norm": 4.724996089935303, "learning_rate": 9.59306468881811e-05, "loss": 0.9311, "step": 340 }, { "epoch": 0.1743353783231084, "grad_norm": 4.883242130279541, "learning_rate": 9.589713748595352e-05, "loss": 0.8652, "step": 341 }, { "epoch": 0.17484662576687116, "grad_norm": 5.192939758300781, "learning_rate": 9.586349658265245e-05, "loss": 0.8496, "step": 342 }, { "epoch": 0.17535787321063395, "grad_norm": 4.756380558013916, "learning_rate": 9.582972427466328e-05, "loss": 0.8408, "step": 343 }, { "epoch": 0.17586912065439672, "grad_norm": 5.135041236877441, "learning_rate": 9.579582065874793e-05, "loss": 0.8428, "step": 344 }, { "epoch": 0.17638036809815952, "grad_norm": 6.151747703552246, "learning_rate": 9.576178583204453e-05, "loss": 0.9985, "step": 345 }, { "epoch": 0.1768916155419223, "grad_norm": 6.850955486297607, "learning_rate": 9.572761989206712e-05, "loss": 0.824, "step": 346 }, { "epoch": 0.17740286298568508, "grad_norm": 6.425426959991455, "learning_rate": 9.569332293670543e-05, "loss": 0.8296, "step": 347 }, { "epoch": 0.17791411042944785, "grad_norm": 6.790580749511719, "learning_rate": 9.565889506422456e-05, "loss": 0.9905, "step": 348 }, { "epoch": 0.17842535787321062, "grad_norm": 6.514560699462891, "learning_rate": 9.562433637326468e-05, "loss": 0.6521, "step": 349 }, { "epoch": 0.17893660531697342, "grad_norm": 9.105517387390137, "learning_rate": 9.558964696284081e-05, "loss": 0.7948, "step": 350 }, { "epoch": 0.1794478527607362, "grad_norm": 3.3637101650238037, "learning_rate": 9.555482693234245e-05, "loss": 1.011, "step": 351 }, { "epoch": 0.17995910020449898, "grad_norm": 2.648695230484009, "learning_rate": 9.551987638153339e-05, "loss": 0.9487, "step": 352 }, { "epoch": 0.18047034764826175, "grad_norm": 3.222560405731201, "learning_rate": 9.548479541055133e-05, "loss": 1.0157, "step": 353 }, { "epoch": 0.18098159509202455, "grad_norm": 3.2273850440979004, "learning_rate": 9.54495841199077e-05, "loss": 1.116, "step": 354 }, { "epoch": 0.18149284253578732, "grad_norm": 3.239750862121582, "learning_rate": 9.541424261048725e-05, "loss": 0.9958, "step": 355 }, { "epoch": 0.18200408997955012, "grad_norm": 3.241530418395996, "learning_rate": 9.537877098354786e-05, "loss": 1.0277, "step": 356 }, { "epoch": 0.18251533742331288, "grad_norm": 3.041856050491333, "learning_rate": 9.534316934072021e-05, "loss": 1.0157, "step": 357 }, { "epoch": 0.18302658486707565, "grad_norm": 3.3174169063568115, "learning_rate": 9.530743778400746e-05, "loss": 1.0286, "step": 358 }, { "epoch": 0.18353783231083845, "grad_norm": 3.524456262588501, "learning_rate": 9.527157641578506e-05, "loss": 1.0808, "step": 359 }, { "epoch": 0.18404907975460122, "grad_norm": 3.1490230560302734, "learning_rate": 9.52355853388003e-05, "loss": 1.0235, "step": 360 }, { "epoch": 0.18456032719836402, "grad_norm": 3.3029637336730957, "learning_rate": 9.519946465617218e-05, "loss": 1.0764, "step": 361 }, { "epoch": 0.18507157464212678, "grad_norm": 3.6158411502838135, "learning_rate": 9.516321447139096e-05, "loss": 1.0236, "step": 362 }, { "epoch": 0.18558282208588958, "grad_norm": 3.52386736869812, "learning_rate": 9.512683488831802e-05, "loss": 1.0615, "step": 363 }, { "epoch": 0.18609406952965235, "grad_norm": 4.17709493637085, "learning_rate": 9.509032601118541e-05, "loss": 0.9873, "step": 364 }, { "epoch": 0.18660531697341512, "grad_norm": 3.8697471618652344, "learning_rate": 9.505368794459568e-05, "loss": 1.0644, "step": 365 }, { "epoch": 0.18711656441717792, "grad_norm": 3.6970717906951904, "learning_rate": 9.50169207935215e-05, "loss": 1.0334, "step": 366 }, { "epoch": 0.18762781186094069, "grad_norm": 3.9770524501800537, "learning_rate": 9.498002466330535e-05, "loss": 0.9283, "step": 367 }, { "epoch": 0.18813905930470348, "grad_norm": 3.809328317642212, "learning_rate": 9.494299965965933e-05, "loss": 0.9207, "step": 368 }, { "epoch": 0.18865030674846625, "grad_norm": 4.586056232452393, "learning_rate": 9.490584588866471e-05, "loss": 0.9965, "step": 369 }, { "epoch": 0.18916155419222905, "grad_norm": 4.179121494293213, "learning_rate": 9.486856345677173e-05, "loss": 0.921, "step": 370 }, { "epoch": 0.18967280163599182, "grad_norm": 4.585204124450684, "learning_rate": 9.483115247079924e-05, "loss": 0.95, "step": 371 }, { "epoch": 0.1901840490797546, "grad_norm": 4.396795749664307, "learning_rate": 9.47936130379344e-05, "loss": 1.0279, "step": 372 }, { "epoch": 0.19069529652351738, "grad_norm": 3.7172300815582275, "learning_rate": 9.475594526573245e-05, "loss": 1.0017, "step": 373 }, { "epoch": 0.19120654396728015, "grad_norm": 3.854496717453003, "learning_rate": 9.471814926211627e-05, "loss": 0.9455, "step": 374 }, { "epoch": 0.19171779141104295, "grad_norm": 4.243702411651611, "learning_rate": 9.468022513537617e-05, "loss": 0.9785, "step": 375 }, { "epoch": 0.19222903885480572, "grad_norm": 4.117717266082764, "learning_rate": 9.464217299416956e-05, "loss": 0.8867, "step": 376 }, { "epoch": 0.19274028629856851, "grad_norm": 4.680307865142822, "learning_rate": 9.460399294752061e-05, "loss": 1.0332, "step": 377 }, { "epoch": 0.19325153374233128, "grad_norm": 3.853069543838501, "learning_rate": 9.456568510481992e-05, "loss": 0.8719, "step": 378 }, { "epoch": 0.19376278118609408, "grad_norm": 4.585060119628906, "learning_rate": 9.452724957582433e-05, "loss": 0.8617, "step": 379 }, { "epoch": 0.19427402862985685, "grad_norm": 4.1220598220825195, "learning_rate": 9.448868647065642e-05, "loss": 0.9412, "step": 380 }, { "epoch": 0.19478527607361965, "grad_norm": 4.529792785644531, "learning_rate": 9.444999589980437e-05, "loss": 0.8709, "step": 381 }, { "epoch": 0.19529652351738241, "grad_norm": 4.1829447746276855, "learning_rate": 9.441117797412154e-05, "loss": 0.9142, "step": 382 }, { "epoch": 0.19580777096114518, "grad_norm": 4.374628067016602, "learning_rate": 9.437223280482613e-05, "loss": 1.0274, "step": 383 }, { "epoch": 0.19631901840490798, "grad_norm": 4.125802516937256, "learning_rate": 9.433316050350099e-05, "loss": 0.937, "step": 384 }, { "epoch": 0.19683026584867075, "grad_norm": 4.6389360427856445, "learning_rate": 9.429396118209316e-05, "loss": 1.1074, "step": 385 }, { "epoch": 0.19734151329243355, "grad_norm": 5.501649379730225, "learning_rate": 9.425463495291363e-05, "loss": 0.9531, "step": 386 }, { "epoch": 0.19785276073619631, "grad_norm": 4.594613552093506, "learning_rate": 9.421518192863701e-05, "loss": 0.8191, "step": 387 }, { "epoch": 0.1983640081799591, "grad_norm": 4.314398288726807, "learning_rate": 9.417560222230115e-05, "loss": 0.8347, "step": 388 }, { "epoch": 0.19887525562372188, "grad_norm": 4.536204814910889, "learning_rate": 9.413589594730692e-05, "loss": 0.9885, "step": 389 }, { "epoch": 0.19938650306748465, "grad_norm": 4.814162254333496, "learning_rate": 9.409606321741775e-05, "loss": 0.8417, "step": 390 }, { "epoch": 0.19989775051124745, "grad_norm": 5.067505359649658, "learning_rate": 9.405610414675948e-05, "loss": 0.9362, "step": 391 }, { "epoch": 0.20040899795501022, "grad_norm": 5.531371593475342, "learning_rate": 9.401601884981983e-05, "loss": 0.8284, "step": 392 }, { "epoch": 0.200920245398773, "grad_norm": 5.133208751678467, "learning_rate": 9.397580744144822e-05, "loss": 1.0076, "step": 393 }, { "epoch": 0.20143149284253578, "grad_norm": 5.163256645202637, "learning_rate": 9.393547003685543e-05, "loss": 0.8889, "step": 394 }, { "epoch": 0.20194274028629858, "grad_norm": 4.979630947113037, "learning_rate": 9.389500675161318e-05, "loss": 0.8315, "step": 395 }, { "epoch": 0.20245398773006135, "grad_norm": 5.355342864990234, "learning_rate": 9.385441770165385e-05, "loss": 0.822, "step": 396 }, { "epoch": 0.20296523517382414, "grad_norm": 5.970470428466797, "learning_rate": 9.381370300327021e-05, "loss": 0.8511, "step": 397 }, { "epoch": 0.2034764826175869, "grad_norm": 6.616512775421143, "learning_rate": 9.377286277311496e-05, "loss": 0.8692, "step": 398 }, { "epoch": 0.20398773006134968, "grad_norm": 6.648594856262207, "learning_rate": 9.373189712820055e-05, "loss": 0.8196, "step": 399 }, { "epoch": 0.20449897750511248, "grad_norm": 9.843427658081055, "learning_rate": 9.369080618589864e-05, "loss": 1.005, "step": 400 }, { "epoch": 0.20501022494887525, "grad_norm": 3.1759207248687744, "learning_rate": 9.364959006394002e-05, "loss": 0.9657, "step": 401 }, { "epoch": 0.20552147239263804, "grad_norm": 3.081962823867798, "learning_rate": 9.3608248880414e-05, "loss": 1.008, "step": 402 }, { "epoch": 0.2060327198364008, "grad_norm": 3.0837666988372803, "learning_rate": 9.356678275376832e-05, "loss": 1.0316, "step": 403 }, { "epoch": 0.2065439672801636, "grad_norm": 3.0529189109802246, "learning_rate": 9.35251918028086e-05, "loss": 1.086, "step": 404 }, { "epoch": 0.20705521472392638, "grad_norm": 3.318873405456543, "learning_rate": 9.34834761466982e-05, "loss": 1.0928, "step": 405 }, { "epoch": 0.20756646216768918, "grad_norm": 3.416926860809326, "learning_rate": 9.344163590495771e-05, "loss": 1.2316, "step": 406 }, { "epoch": 0.20807770961145194, "grad_norm": 3.2749698162078857, "learning_rate": 9.339967119746465e-05, "loss": 1.0556, "step": 407 }, { "epoch": 0.2085889570552147, "grad_norm": 3.5399835109710693, "learning_rate": 9.335758214445324e-05, "loss": 1.0681, "step": 408 }, { "epoch": 0.2091002044989775, "grad_norm": 3.271688222885132, "learning_rate": 9.331536886651387e-05, "loss": 0.903, "step": 409 }, { "epoch": 0.20961145194274028, "grad_norm": 3.6455349922180176, "learning_rate": 9.327303148459293e-05, "loss": 1.0631, "step": 410 }, { "epoch": 0.21012269938650308, "grad_norm": 3.230680227279663, "learning_rate": 9.32305701199923e-05, "loss": 1.0304, "step": 411 }, { "epoch": 0.21063394683026584, "grad_norm": 3.522308349609375, "learning_rate": 9.318798489436917e-05, "loss": 1.0796, "step": 412 }, { "epoch": 0.21114519427402864, "grad_norm": 3.4290144443511963, "learning_rate": 9.314527592973555e-05, "loss": 1.0312, "step": 413 }, { "epoch": 0.2116564417177914, "grad_norm": 3.476480007171631, "learning_rate": 9.310244334845801e-05, "loss": 0.9853, "step": 414 }, { "epoch": 0.21216768916155418, "grad_norm": 3.495586633682251, "learning_rate": 9.305948727325728e-05, "loss": 0.9321, "step": 415 }, { "epoch": 0.21267893660531698, "grad_norm": 3.8010475635528564, "learning_rate": 9.301640782720792e-05, "loss": 0.9269, "step": 416 }, { "epoch": 0.21319018404907975, "grad_norm": 3.931910276412964, "learning_rate": 9.297320513373795e-05, "loss": 1.0371, "step": 417 }, { "epoch": 0.21370143149284254, "grad_norm": 3.3684935569763184, "learning_rate": 9.292987931662855e-05, "loss": 0.9407, "step": 418 }, { "epoch": 0.2142126789366053, "grad_norm": 3.5472538471221924, "learning_rate": 9.288643050001361e-05, "loss": 0.9555, "step": 419 }, { "epoch": 0.2147239263803681, "grad_norm": 4.427033424377441, "learning_rate": 9.284285880837946e-05, "loss": 0.9743, "step": 420 }, { "epoch": 0.21523517382413088, "grad_norm": 3.681061267852783, "learning_rate": 9.279916436656451e-05, "loss": 0.8528, "step": 421 }, { "epoch": 0.21574642126789367, "grad_norm": 3.8058080673217773, "learning_rate": 9.275534729975879e-05, "loss": 0.9228, "step": 422 }, { "epoch": 0.21625766871165644, "grad_norm": 3.4031405448913574, "learning_rate": 9.271140773350373e-05, "loss": 0.8928, "step": 423 }, { "epoch": 0.2167689161554192, "grad_norm": 3.6969170570373535, "learning_rate": 9.266734579369172e-05, "loss": 0.9488, "step": 424 }, { "epoch": 0.217280163599182, "grad_norm": 3.792942762374878, "learning_rate": 9.262316160656575e-05, "loss": 0.9386, "step": 425 }, { "epoch": 0.21779141104294478, "grad_norm": 4.491650581359863, "learning_rate": 9.257885529871908e-05, "loss": 0.9395, "step": 426 }, { "epoch": 0.21830265848670757, "grad_norm": 3.8580541610717773, "learning_rate": 9.253442699709485e-05, "loss": 0.9405, "step": 427 }, { "epoch": 0.21881390593047034, "grad_norm": 4.1208696365356445, "learning_rate": 9.248987682898575e-05, "loss": 0.9959, "step": 428 }, { "epoch": 0.21932515337423314, "grad_norm": 4.131077766418457, "learning_rate": 9.24452049220336e-05, "loss": 1.0929, "step": 429 }, { "epoch": 0.2198364008179959, "grad_norm": 4.098513603210449, "learning_rate": 9.240041140422905e-05, "loss": 0.8605, "step": 430 }, { "epoch": 0.22034764826175868, "grad_norm": 3.7456324100494385, "learning_rate": 9.235549640391115e-05, "loss": 0.88, "step": 431 }, { "epoch": 0.22085889570552147, "grad_norm": 4.278169631958008, "learning_rate": 9.231046004976704e-05, "loss": 1.0368, "step": 432 }, { "epoch": 0.22137014314928424, "grad_norm": 4.015793800354004, "learning_rate": 9.226530247083153e-05, "loss": 1.0141, "step": 433 }, { "epoch": 0.22188139059304704, "grad_norm": 5.274085998535156, "learning_rate": 9.222002379648675e-05, "loss": 0.8493, "step": 434 }, { "epoch": 0.2223926380368098, "grad_norm": 4.8212432861328125, "learning_rate": 9.217462415646185e-05, "loss": 0.9706, "step": 435 }, { "epoch": 0.2229038854805726, "grad_norm": 3.999441385269165, "learning_rate": 9.212910368083245e-05, "loss": 0.8746, "step": 436 }, { "epoch": 0.22341513292433537, "grad_norm": 3.9513213634490967, "learning_rate": 9.208346250002049e-05, "loss": 0.8408, "step": 437 }, { "epoch": 0.22392638036809817, "grad_norm": 4.810235977172852, "learning_rate": 9.203770074479367e-05, "loss": 0.9253, "step": 438 }, { "epoch": 0.22443762781186094, "grad_norm": 5.90993070602417, "learning_rate": 9.199181854626517e-05, "loss": 0.9008, "step": 439 }, { "epoch": 0.2249488752556237, "grad_norm": 5.908392906188965, "learning_rate": 9.194581603589328e-05, "loss": 0.9596, "step": 440 }, { "epoch": 0.2254601226993865, "grad_norm": 4.973458766937256, "learning_rate": 9.189969334548096e-05, "loss": 1.0126, "step": 441 }, { "epoch": 0.22597137014314927, "grad_norm": 5.1067962646484375, "learning_rate": 9.185345060717554e-05, "loss": 0.8453, "step": 442 }, { "epoch": 0.22648261758691207, "grad_norm": 5.444033622741699, "learning_rate": 9.180708795346829e-05, "loss": 0.7763, "step": 443 }, { "epoch": 0.22699386503067484, "grad_norm": 5.796935558319092, "learning_rate": 9.1760605517194e-05, "loss": 1.0804, "step": 444 }, { "epoch": 0.22750511247443764, "grad_norm": 5.549903869628906, "learning_rate": 9.171400343153076e-05, "loss": 1.0069, "step": 445 }, { "epoch": 0.2280163599182004, "grad_norm": 5.64030122756958, "learning_rate": 9.166728182999936e-05, "loss": 0.8366, "step": 446 }, { "epoch": 0.2285276073619632, "grad_norm": 6.40301513671875, "learning_rate": 9.16204408464631e-05, "loss": 0.9099, "step": 447 }, { "epoch": 0.22903885480572597, "grad_norm": 6.116087913513184, "learning_rate": 9.157348061512727e-05, "loss": 0.7831, "step": 448 }, { "epoch": 0.22955010224948874, "grad_norm": 7.226142406463623, "learning_rate": 9.152640127053884e-05, "loss": 0.8518, "step": 449 }, { "epoch": 0.23006134969325154, "grad_norm": 8.906120300292969, "learning_rate": 9.147920294758607e-05, "loss": 0.6824, "step": 450 }, { "epoch": 0.2305725971370143, "grad_norm": 2.9229636192321777, "learning_rate": 9.143188578149809e-05, "loss": 1.0234, "step": 451 }, { "epoch": 0.2310838445807771, "grad_norm": 3.875776529312134, "learning_rate": 9.138444990784453e-05, "loss": 1.0901, "step": 452 }, { "epoch": 0.23159509202453987, "grad_norm": 3.7210988998413086, "learning_rate": 9.133689546253514e-05, "loss": 1.0057, "step": 453 }, { "epoch": 0.23210633946830267, "grad_norm": 3.645733118057251, "learning_rate": 9.128922258181938e-05, "loss": 0.9967, "step": 454 }, { "epoch": 0.23261758691206544, "grad_norm": 3.363543748855591, "learning_rate": 9.124143140228608e-05, "loss": 1.0951, "step": 455 }, { "epoch": 0.2331288343558282, "grad_norm": 3.1317014694213867, "learning_rate": 9.119352206086293e-05, "loss": 1.0259, "step": 456 }, { "epoch": 0.233640081799591, "grad_norm": 3.3655569553375244, "learning_rate": 9.114549469481626e-05, "loss": 1.119, "step": 457 }, { "epoch": 0.23415132924335377, "grad_norm": 3.224952220916748, "learning_rate": 9.10973494417505e-05, "loss": 1.0946, "step": 458 }, { "epoch": 0.23466257668711657, "grad_norm": 3.7233197689056396, "learning_rate": 9.104908643960787e-05, "loss": 0.9717, "step": 459 }, { "epoch": 0.23517382413087934, "grad_norm": 3.78554630279541, "learning_rate": 9.100070582666795e-05, "loss": 1.049, "step": 460 }, { "epoch": 0.23568507157464214, "grad_norm": 3.5677905082702637, "learning_rate": 9.095220774154726e-05, "loss": 0.9873, "step": 461 }, { "epoch": 0.2361963190184049, "grad_norm": 3.1176674365997314, "learning_rate": 9.090359232319893e-05, "loss": 0.8864, "step": 462 }, { "epoch": 0.2367075664621677, "grad_norm": 3.3484392166137695, "learning_rate": 9.085485971091225e-05, "loss": 0.9754, "step": 463 }, { "epoch": 0.23721881390593047, "grad_norm": 3.456024408340454, "learning_rate": 9.080601004431229e-05, "loss": 0.8267, "step": 464 }, { "epoch": 0.23773006134969324, "grad_norm": 3.6923391819000244, "learning_rate": 9.075704346335947e-05, "loss": 0.916, "step": 465 }, { "epoch": 0.23824130879345604, "grad_norm": 3.570155143737793, "learning_rate": 9.070796010834923e-05, "loss": 0.9735, "step": 466 }, { "epoch": 0.2387525562372188, "grad_norm": 4.115891456604004, "learning_rate": 9.065876011991155e-05, "loss": 1.0247, "step": 467 }, { "epoch": 0.2392638036809816, "grad_norm": 3.4448201656341553, "learning_rate": 9.060944363901056e-05, "loss": 0.9963, "step": 468 }, { "epoch": 0.23977505112474437, "grad_norm": 3.9490420818328857, "learning_rate": 9.056001080694423e-05, "loss": 0.9454, "step": 469 }, { "epoch": 0.24028629856850717, "grad_norm": 3.538975954055786, "learning_rate": 9.05104617653438e-05, "loss": 0.9077, "step": 470 }, { "epoch": 0.24079754601226994, "grad_norm": 3.586324453353882, "learning_rate": 9.046079665617354e-05, "loss": 0.9294, "step": 471 }, { "epoch": 0.24130879345603273, "grad_norm": 3.7943360805511475, "learning_rate": 9.041101562173023e-05, "loss": 0.9706, "step": 472 }, { "epoch": 0.2418200408997955, "grad_norm": 3.871798276901245, "learning_rate": 9.036111880464277e-05, "loss": 0.9572, "step": 473 }, { "epoch": 0.24233128834355827, "grad_norm": 3.678107976913452, "learning_rate": 9.031110634787186e-05, "loss": 0.9078, "step": 474 }, { "epoch": 0.24284253578732107, "grad_norm": 3.8214452266693115, "learning_rate": 9.026097839470947e-05, "loss": 0.899, "step": 475 }, { "epoch": 0.24335378323108384, "grad_norm": 4.086860656738281, "learning_rate": 9.021073508877845e-05, "loss": 0.8796, "step": 476 }, { "epoch": 0.24386503067484663, "grad_norm": 3.7152156829833984, "learning_rate": 9.016037657403224e-05, "loss": 0.8959, "step": 477 }, { "epoch": 0.2443762781186094, "grad_norm": 4.333080291748047, "learning_rate": 9.010990299475433e-05, "loss": 0.8454, "step": 478 }, { "epoch": 0.2448875255623722, "grad_norm": 3.956611156463623, "learning_rate": 9.005931449555782e-05, "loss": 1.0178, "step": 479 }, { "epoch": 0.24539877300613497, "grad_norm": 3.650317430496216, "learning_rate": 9.000861122138517e-05, "loss": 0.8329, "step": 480 }, { "epoch": 0.24591002044989774, "grad_norm": 4.062465190887451, "learning_rate": 8.995779331750764e-05, "loss": 0.9728, "step": 481 }, { "epoch": 0.24642126789366053, "grad_norm": 4.255726337432861, "learning_rate": 8.99068609295249e-05, "loss": 0.9331, "step": 482 }, { "epoch": 0.2469325153374233, "grad_norm": 4.312367916107178, "learning_rate": 8.985581420336465e-05, "loss": 0.9291, "step": 483 }, { "epoch": 0.2474437627811861, "grad_norm": 4.059680461883545, "learning_rate": 8.980465328528219e-05, "loss": 0.9143, "step": 484 }, { "epoch": 0.24795501022494887, "grad_norm": 4.124285697937012, "learning_rate": 8.975337832186e-05, "loss": 0.8942, "step": 485 }, { "epoch": 0.24846625766871167, "grad_norm": 4.120577335357666, "learning_rate": 8.970198946000727e-05, "loss": 0.8887, "step": 486 }, { "epoch": 0.24897750511247443, "grad_norm": 4.375182628631592, "learning_rate": 8.965048684695955e-05, "loss": 0.9409, "step": 487 }, { "epoch": 0.24948875255623723, "grad_norm": 4.595582962036133, "learning_rate": 8.959887063027837e-05, "loss": 0.8955, "step": 488 }, { "epoch": 0.25, "grad_norm": 4.253859519958496, "learning_rate": 8.954714095785062e-05, "loss": 0.8379, "step": 489 }, { "epoch": 0.2505112474437628, "grad_norm": 4.908374309539795, "learning_rate": 8.949529797788836e-05, "loss": 0.9577, "step": 490 }, { "epoch": 0.25102249488752554, "grad_norm": 4.503420829772949, "learning_rate": 8.944334183892822e-05, "loss": 0.8906, "step": 491 }, { "epoch": 0.25153374233128833, "grad_norm": 6.037133693695068, "learning_rate": 8.939127268983108e-05, "loss": 0.95, "step": 492 }, { "epoch": 0.25204498977505113, "grad_norm": 4.602914333343506, "learning_rate": 8.933909067978163e-05, "loss": 0.8129, "step": 493 }, { "epoch": 0.25255623721881393, "grad_norm": 5.256838321685791, "learning_rate": 8.928679595828786e-05, "loss": 0.8297, "step": 494 }, { "epoch": 0.25306748466257667, "grad_norm": 5.184623718261719, "learning_rate": 8.923438867518075e-05, "loss": 0.9879, "step": 495 }, { "epoch": 0.25357873210633947, "grad_norm": 5.045862674713135, "learning_rate": 8.918186898061376e-05, "loss": 0.911, "step": 496 }, { "epoch": 0.25408997955010226, "grad_norm": 5.845618724822998, "learning_rate": 8.912923702506241e-05, "loss": 0.8597, "step": 497 }, { "epoch": 0.254601226993865, "grad_norm": 6.090303421020508, "learning_rate": 8.907649295932387e-05, "loss": 0.8402, "step": 498 }, { "epoch": 0.2551124744376278, "grad_norm": 6.933328628540039, "learning_rate": 8.902363693451654e-05, "loss": 0.92, "step": 499 }, { "epoch": 0.2556237218813906, "grad_norm": 8.922635078430176, "learning_rate": 8.897066910207958e-05, "loss": 0.9011, "step": 500 }, { "epoch": 0.2561349693251534, "grad_norm": 2.395228147506714, "learning_rate": 8.89175896137725e-05, "loss": 0.8274, "step": 501 }, { "epoch": 0.25664621676891614, "grad_norm": 2.550231695175171, "learning_rate": 8.886439862167469e-05, "loss": 0.9732, "step": 502 }, { "epoch": 0.25715746421267893, "grad_norm": 3.1747238636016846, "learning_rate": 8.881109627818504e-05, "loss": 1.1108, "step": 503 }, { "epoch": 0.25766871165644173, "grad_norm": 3.0045149326324463, "learning_rate": 8.875768273602148e-05, "loss": 1.0678, "step": 504 }, { "epoch": 0.2581799591002045, "grad_norm": 2.9451730251312256, "learning_rate": 8.870415814822052e-05, "loss": 1.025, "step": 505 }, { "epoch": 0.25869120654396727, "grad_norm": 3.0222766399383545, "learning_rate": 8.865052266813685e-05, "loss": 1.1336, "step": 506 }, { "epoch": 0.25920245398773006, "grad_norm": 3.1302473545074463, "learning_rate": 8.859677644944287e-05, "loss": 1.088, "step": 507 }, { "epoch": 0.25971370143149286, "grad_norm": 3.14927339553833, "learning_rate": 8.854291964612825e-05, "loss": 1.0584, "step": 508 }, { "epoch": 0.2602249488752556, "grad_norm": 3.135263204574585, "learning_rate": 8.848895241249949e-05, "loss": 1.0483, "step": 509 }, { "epoch": 0.2607361963190184, "grad_norm": 3.1471927165985107, "learning_rate": 8.843487490317954e-05, "loss": 1.0553, "step": 510 }, { "epoch": 0.2612474437627812, "grad_norm": 2.991029739379883, "learning_rate": 8.838068727310724e-05, "loss": 0.9881, "step": 511 }, { "epoch": 0.261758691206544, "grad_norm": 3.197028398513794, "learning_rate": 8.832638967753699e-05, "loss": 0.9918, "step": 512 }, { "epoch": 0.26226993865030673, "grad_norm": 3.349247694015503, "learning_rate": 8.827198227203821e-05, "loss": 0.9742, "step": 513 }, { "epoch": 0.26278118609406953, "grad_norm": 3.336989164352417, "learning_rate": 8.821746521249499e-05, "loss": 0.9569, "step": 514 }, { "epoch": 0.2632924335378323, "grad_norm": 3.321845531463623, "learning_rate": 8.816283865510554e-05, "loss": 0.9315, "step": 515 }, { "epoch": 0.26380368098159507, "grad_norm": 3.3562281131744385, "learning_rate": 8.810810275638183e-05, "loss": 1.0072, "step": 516 }, { "epoch": 0.26431492842535786, "grad_norm": 3.2323803901672363, "learning_rate": 8.805325767314909e-05, "loss": 0.9317, "step": 517 }, { "epoch": 0.26482617586912066, "grad_norm": 3.820136547088623, "learning_rate": 8.799830356254539e-05, "loss": 0.9336, "step": 518 }, { "epoch": 0.26533742331288346, "grad_norm": 3.5097525119781494, "learning_rate": 8.794324058202116e-05, "loss": 0.9786, "step": 519 }, { "epoch": 0.2658486707566462, "grad_norm": 3.6844258308410645, "learning_rate": 8.788806888933881e-05, "loss": 0.975, "step": 520 }, { "epoch": 0.266359918200409, "grad_norm": 3.6815667152404785, "learning_rate": 8.783278864257211e-05, "loss": 0.9411, "step": 521 }, { "epoch": 0.2668711656441718, "grad_norm": 3.773531675338745, "learning_rate": 8.777740000010599e-05, "loss": 0.9664, "step": 522 }, { "epoch": 0.26738241308793453, "grad_norm": 3.854552745819092, "learning_rate": 8.772190312063583e-05, "loss": 0.9537, "step": 523 }, { "epoch": 0.26789366053169733, "grad_norm": 4.011033058166504, "learning_rate": 8.766629816316721e-05, "loss": 0.9038, "step": 524 }, { "epoch": 0.2684049079754601, "grad_norm": 3.57070255279541, "learning_rate": 8.761058528701529e-05, "loss": 0.8889, "step": 525 }, { "epoch": 0.2689161554192229, "grad_norm": 3.7002241611480713, "learning_rate": 8.75547646518045e-05, "loss": 0.9296, "step": 526 }, { "epoch": 0.26942740286298567, "grad_norm": 3.872769594192505, "learning_rate": 8.749883641746798e-05, "loss": 0.9211, "step": 527 }, { "epoch": 0.26993865030674846, "grad_norm": 4.013867378234863, "learning_rate": 8.744280074424713e-05, "loss": 1.0296, "step": 528 }, { "epoch": 0.27044989775051126, "grad_norm": 3.432490587234497, "learning_rate": 8.738665779269125e-05, "loss": 0.7884, "step": 529 }, { "epoch": 0.27096114519427406, "grad_norm": 3.7517478466033936, "learning_rate": 8.733040772365693e-05, "loss": 0.8918, "step": 530 }, { "epoch": 0.2714723926380368, "grad_norm": 3.8097121715545654, "learning_rate": 8.72740506983077e-05, "loss": 1.0147, "step": 531 }, { "epoch": 0.2719836400817996, "grad_norm": 4.307250022888184, "learning_rate": 8.721758687811352e-05, "loss": 0.9346, "step": 532 }, { "epoch": 0.2724948875255624, "grad_norm": 4.444828033447266, "learning_rate": 8.716101642485035e-05, "loss": 1.0747, "step": 533 }, { "epoch": 0.27300613496932513, "grad_norm": 4.189756393432617, "learning_rate": 8.710433950059967e-05, "loss": 0.9034, "step": 534 }, { "epoch": 0.27351738241308793, "grad_norm": 4.013859748840332, "learning_rate": 8.704755626774796e-05, "loss": 0.9403, "step": 535 }, { "epoch": 0.2740286298568507, "grad_norm": 4.528006553649902, "learning_rate": 8.699066688898635e-05, "loss": 0.9282, "step": 536 }, { "epoch": 0.2745398773006135, "grad_norm": 4.314260482788086, "learning_rate": 8.69336715273101e-05, "loss": 0.9731, "step": 537 }, { "epoch": 0.27505112474437626, "grad_norm": 4.596576690673828, "learning_rate": 8.687657034601801e-05, "loss": 0.8781, "step": 538 }, { "epoch": 0.27556237218813906, "grad_norm": 4.438699245452881, "learning_rate": 8.681936350871222e-05, "loss": 0.8967, "step": 539 }, { "epoch": 0.27607361963190186, "grad_norm": 5.038392543792725, "learning_rate": 8.676205117929752e-05, "loss": 1.0196, "step": 540 }, { "epoch": 0.2765848670756646, "grad_norm": 4.725849628448486, "learning_rate": 8.670463352198088e-05, "loss": 0.903, "step": 541 }, { "epoch": 0.2770961145194274, "grad_norm": 4.913168907165527, "learning_rate": 8.664711070127117e-05, "loss": 0.9673, "step": 542 }, { "epoch": 0.2776073619631902, "grad_norm": 4.891239166259766, "learning_rate": 8.658948288197848e-05, "loss": 0.9758, "step": 543 }, { "epoch": 0.278118609406953, "grad_norm": 4.979687213897705, "learning_rate": 8.65317502292138e-05, "loss": 0.9457, "step": 544 }, { "epoch": 0.27862985685071573, "grad_norm": 5.6750359535217285, "learning_rate": 8.647391290838838e-05, "loss": 0.8713, "step": 545 }, { "epoch": 0.2791411042944785, "grad_norm": 5.564166069030762, "learning_rate": 8.641597108521348e-05, "loss": 1.0865, "step": 546 }, { "epoch": 0.2796523517382413, "grad_norm": 5.280520439147949, "learning_rate": 8.635792492569967e-05, "loss": 0.8019, "step": 547 }, { "epoch": 0.28016359918200406, "grad_norm": 6.287930965423584, "learning_rate": 8.629977459615655e-05, "loss": 0.8003, "step": 548 }, { "epoch": 0.28067484662576686, "grad_norm": 6.9441914558410645, "learning_rate": 8.624152026319208e-05, "loss": 0.8961, "step": 549 }, { "epoch": 0.28118609406952966, "grad_norm": 8.341976165771484, "learning_rate": 8.618316209371228e-05, "loss": 0.6323, "step": 550 }, { "epoch": 0.28169734151329245, "grad_norm": 2.6429460048675537, "learning_rate": 8.612470025492065e-05, "loss": 1.0268, "step": 551 }, { "epoch": 0.2822085889570552, "grad_norm": 2.9558145999908447, "learning_rate": 8.60661349143177e-05, "loss": 0.9619, "step": 552 }, { "epoch": 0.282719836400818, "grad_norm": 2.9567391872406006, "learning_rate": 8.600746623970051e-05, "loss": 0.9782, "step": 553 }, { "epoch": 0.2832310838445808, "grad_norm": 2.8197598457336426, "learning_rate": 8.59486943991622e-05, "loss": 0.9863, "step": 554 }, { "epoch": 0.2837423312883436, "grad_norm": 2.895512819290161, "learning_rate": 8.588981956109151e-05, "loss": 0.9764, "step": 555 }, { "epoch": 0.2842535787321063, "grad_norm": 2.976269245147705, "learning_rate": 8.583084189417224e-05, "loss": 1.1054, "step": 556 }, { "epoch": 0.2847648261758691, "grad_norm": 2.900879144668579, "learning_rate": 8.577176156738282e-05, "loss": 1.0042, "step": 557 }, { "epoch": 0.2852760736196319, "grad_norm": 3.3032069206237793, "learning_rate": 8.571257874999585e-05, "loss": 1.0139, "step": 558 }, { "epoch": 0.28578732106339466, "grad_norm": 3.3368518352508545, "learning_rate": 8.565329361157752e-05, "loss": 1.037, "step": 559 }, { "epoch": 0.28629856850715746, "grad_norm": 2.9615988731384277, "learning_rate": 8.559390632198723e-05, "loss": 0.916, "step": 560 }, { "epoch": 0.28680981595092025, "grad_norm": 3.3652186393737793, "learning_rate": 8.553441705137702e-05, "loss": 1.0821, "step": 561 }, { "epoch": 0.28732106339468305, "grad_norm": 3.1181633472442627, "learning_rate": 8.547482597019114e-05, "loss": 0.9685, "step": 562 }, { "epoch": 0.2878323108384458, "grad_norm": 3.476203203201294, "learning_rate": 8.541513324916555e-05, "loss": 0.9567, "step": 563 }, { "epoch": 0.2883435582822086, "grad_norm": 3.4987919330596924, "learning_rate": 8.535533905932738e-05, "loss": 1.0481, "step": 564 }, { "epoch": 0.2888548057259714, "grad_norm": 3.677459478378296, "learning_rate": 8.529544357199453e-05, "loss": 0.9656, "step": 565 }, { "epoch": 0.2893660531697341, "grad_norm": 3.3879263401031494, "learning_rate": 8.523544695877508e-05, "loss": 0.9833, "step": 566 }, { "epoch": 0.2898773006134969, "grad_norm": 3.864903688430786, "learning_rate": 8.51753493915669e-05, "loss": 0.9329, "step": 567 }, { "epoch": 0.2903885480572597, "grad_norm": 4.15344762802124, "learning_rate": 8.51151510425571e-05, "loss": 1.049, "step": 568 }, { "epoch": 0.2908997955010225, "grad_norm": 3.7756378650665283, "learning_rate": 8.505485208422146e-05, "loss": 0.9141, "step": 569 }, { "epoch": 0.29141104294478526, "grad_norm": 3.6224896907806396, "learning_rate": 8.499445268932413e-05, "loss": 0.9635, "step": 570 }, { "epoch": 0.29192229038854806, "grad_norm": 3.796037197113037, "learning_rate": 8.493395303091697e-05, "loss": 1.0782, "step": 571 }, { "epoch": 0.29243353783231085, "grad_norm": 3.62247371673584, "learning_rate": 8.487335328233912e-05, "loss": 0.9458, "step": 572 }, { "epoch": 0.2929447852760736, "grad_norm": 3.6064841747283936, "learning_rate": 8.481265361721645e-05, "loss": 0.877, "step": 573 }, { "epoch": 0.2934560327198364, "grad_norm": 3.84397292137146, "learning_rate": 8.475185420946118e-05, "loss": 1.0055, "step": 574 }, { "epoch": 0.2939672801635992, "grad_norm": 3.980118751525879, "learning_rate": 8.469095523327124e-05, "loss": 0.9517, "step": 575 }, { "epoch": 0.294478527607362, "grad_norm": 3.9962069988250732, "learning_rate": 8.462995686312985e-05, "loss": 0.9557, "step": 576 }, { "epoch": 0.2949897750511247, "grad_norm": 3.9836580753326416, "learning_rate": 8.456885927380504e-05, "loss": 1.0057, "step": 577 }, { "epoch": 0.2955010224948875, "grad_norm": 3.9834954738616943, "learning_rate": 8.450766264034907e-05, "loss": 0.9853, "step": 578 }, { "epoch": 0.2960122699386503, "grad_norm": 3.7003509998321533, "learning_rate": 8.444636713809801e-05, "loss": 0.8674, "step": 579 }, { "epoch": 0.2965235173824131, "grad_norm": 3.7649359703063965, "learning_rate": 8.438497294267117e-05, "loss": 0.9366, "step": 580 }, { "epoch": 0.29703476482617586, "grad_norm": 4.7126994132995605, "learning_rate": 8.432348022997066e-05, "loss": 0.911, "step": 581 }, { "epoch": 0.29754601226993865, "grad_norm": 3.622835159301758, "learning_rate": 8.426188917618083e-05, "loss": 0.8138, "step": 582 }, { "epoch": 0.29805725971370145, "grad_norm": 3.986794948577881, "learning_rate": 8.420019995776779e-05, "loss": 0.978, "step": 583 }, { "epoch": 0.2985685071574642, "grad_norm": 4.0647478103637695, "learning_rate": 8.413841275147892e-05, "loss": 0.9557, "step": 584 }, { "epoch": 0.299079754601227, "grad_norm": 4.3159637451171875, "learning_rate": 8.407652773434236e-05, "loss": 0.8454, "step": 585 }, { "epoch": 0.2995910020449898, "grad_norm": 4.256467342376709, "learning_rate": 8.401454508366643e-05, "loss": 0.9827, "step": 586 }, { "epoch": 0.3001022494887526, "grad_norm": 4.750210762023926, "learning_rate": 8.395246497703925e-05, "loss": 0.8911, "step": 587 }, { "epoch": 0.3006134969325153, "grad_norm": 4.348392963409424, "learning_rate": 8.389028759232815e-05, "loss": 0.8436, "step": 588 }, { "epoch": 0.3011247443762781, "grad_norm": 4.388984680175781, "learning_rate": 8.382801310767912e-05, "loss": 0.9827, "step": 589 }, { "epoch": 0.3016359918200409, "grad_norm": 4.420820236206055, "learning_rate": 8.376564170151642e-05, "loss": 0.896, "step": 590 }, { "epoch": 0.30214723926380366, "grad_norm": 4.915138244628906, "learning_rate": 8.370317355254197e-05, "loss": 0.8738, "step": 591 }, { "epoch": 0.30265848670756645, "grad_norm": 5.0341644287109375, "learning_rate": 8.364060883973489e-05, "loss": 0.8735, "step": 592 }, { "epoch": 0.30316973415132925, "grad_norm": 4.664186000823975, "learning_rate": 8.357794774235092e-05, "loss": 0.82, "step": 593 }, { "epoch": 0.30368098159509205, "grad_norm": 5.188631057739258, "learning_rate": 8.351519043992201e-05, "loss": 0.8746, "step": 594 }, { "epoch": 0.3041922290388548, "grad_norm": 5.177921295166016, "learning_rate": 8.345233711225574e-05, "loss": 0.8297, "step": 595 }, { "epoch": 0.3047034764826176, "grad_norm": 5.583620548248291, "learning_rate": 8.338938793943478e-05, "loss": 1.0842, "step": 596 }, { "epoch": 0.3052147239263804, "grad_norm": 6.216187953948975, "learning_rate": 8.332634310181644e-05, "loss": 0.9047, "step": 597 }, { "epoch": 0.3057259713701431, "grad_norm": 5.41096305847168, "learning_rate": 8.326320278003211e-05, "loss": 0.6631, "step": 598 }, { "epoch": 0.3062372188139059, "grad_norm": 7.021457672119141, "learning_rate": 8.319996715498675e-05, "loss": 0.8118, "step": 599 }, { "epoch": 0.3067484662576687, "grad_norm": 9.457502365112305, "learning_rate": 8.313663640785839e-05, "loss": 0.5443, "step": 600 }, { "epoch": 0.3072597137014315, "grad_norm": 2.781672239303589, "learning_rate": 8.307321072009759e-05, "loss": 1.0299, "step": 601 }, { "epoch": 0.30777096114519426, "grad_norm": 3.157245635986328, "learning_rate": 8.300969027342692e-05, "loss": 1.0549, "step": 602 }, { "epoch": 0.30828220858895705, "grad_norm": 3.0656280517578125, "learning_rate": 8.294607524984045e-05, "loss": 1.1016, "step": 603 }, { "epoch": 0.30879345603271985, "grad_norm": 3.2387661933898926, "learning_rate": 8.288236583160322e-05, "loss": 1.0825, "step": 604 }, { "epoch": 0.30930470347648265, "grad_norm": 3.1593637466430664, "learning_rate": 8.281856220125076e-05, "loss": 1.0234, "step": 605 }, { "epoch": 0.3098159509202454, "grad_norm": 3.0978777408599854, "learning_rate": 8.275466454158847e-05, "loss": 1.0093, "step": 606 }, { "epoch": 0.3103271983640082, "grad_norm": 2.8354928493499756, "learning_rate": 8.269067303569118e-05, "loss": 1.0444, "step": 607 }, { "epoch": 0.310838445807771, "grad_norm": 2.848942756652832, "learning_rate": 8.262658786690262e-05, "loss": 0.9818, "step": 608 }, { "epoch": 0.3113496932515337, "grad_norm": 3.241898536682129, "learning_rate": 8.256240921883487e-05, "loss": 1.0262, "step": 609 }, { "epoch": 0.3118609406952965, "grad_norm": 3.1023952960968018, "learning_rate": 8.249813727536781e-05, "loss": 1.1397, "step": 610 }, { "epoch": 0.3123721881390593, "grad_norm": 3.007742166519165, "learning_rate": 8.243377222064865e-05, "loss": 0.9883, "step": 611 }, { "epoch": 0.3128834355828221, "grad_norm": 2.9802401065826416, "learning_rate": 8.236931423909138e-05, "loss": 0.9806, "step": 612 }, { "epoch": 0.31339468302658485, "grad_norm": 3.3660130500793457, "learning_rate": 8.230476351537624e-05, "loss": 0.9672, "step": 613 }, { "epoch": 0.31390593047034765, "grad_norm": 3.3183295726776123, "learning_rate": 8.224012023444913e-05, "loss": 0.9066, "step": 614 }, { "epoch": 0.31441717791411045, "grad_norm": 3.352731466293335, "learning_rate": 8.217538458152122e-05, "loss": 0.9771, "step": 615 }, { "epoch": 0.3149284253578732, "grad_norm": 3.439236640930176, "learning_rate": 8.211055674206828e-05, "loss": 1.0707, "step": 616 }, { "epoch": 0.315439672801636, "grad_norm": 3.3290817737579346, "learning_rate": 8.204563690183024e-05, "loss": 1.0231, "step": 617 }, { "epoch": 0.3159509202453988, "grad_norm": 3.7434792518615723, "learning_rate": 8.198062524681061e-05, "loss": 0.9347, "step": 618 }, { "epoch": 0.3164621676891616, "grad_norm": 3.3235063552856445, "learning_rate": 8.191552196327596e-05, "loss": 0.8501, "step": 619 }, { "epoch": 0.3169734151329243, "grad_norm": 3.5552737712860107, "learning_rate": 8.185032723775539e-05, "loss": 0.971, "step": 620 }, { "epoch": 0.3174846625766871, "grad_norm": 3.5360236167907715, "learning_rate": 8.178504125703997e-05, "loss": 1.0067, "step": 621 }, { "epoch": 0.3179959100204499, "grad_norm": 3.9374284744262695, "learning_rate": 8.171966420818228e-05, "loss": 0.9743, "step": 622 }, { "epoch": 0.31850715746421265, "grad_norm": 3.668915271759033, "learning_rate": 8.165419627849578e-05, "loss": 0.9575, "step": 623 }, { "epoch": 0.31901840490797545, "grad_norm": 3.8559343814849854, "learning_rate": 8.15886376555543e-05, "loss": 0.9918, "step": 624 }, { "epoch": 0.31952965235173825, "grad_norm": 3.806727409362793, "learning_rate": 8.152298852719159e-05, "loss": 0.9559, "step": 625 }, { "epoch": 0.32004089979550104, "grad_norm": 3.8618004322052, "learning_rate": 8.145724908150063e-05, "loss": 0.9213, "step": 626 }, { "epoch": 0.3205521472392638, "grad_norm": 3.607926607131958, "learning_rate": 8.13914195068332e-05, "loss": 0.8309, "step": 627 }, { "epoch": 0.3210633946830266, "grad_norm": 3.718229055404663, "learning_rate": 8.132549999179933e-05, "loss": 0.8815, "step": 628 }, { "epoch": 0.3215746421267894, "grad_norm": 4.1860785484313965, "learning_rate": 8.125949072526673e-05, "loss": 0.9077, "step": 629 }, { "epoch": 0.3220858895705521, "grad_norm": 3.8918352127075195, "learning_rate": 8.119339189636023e-05, "loss": 0.9646, "step": 630 }, { "epoch": 0.3225971370143149, "grad_norm": 3.9071269035339355, "learning_rate": 8.11272036944613e-05, "loss": 0.8756, "step": 631 }, { "epoch": 0.3231083844580777, "grad_norm": 3.965230941772461, "learning_rate": 8.106092630920749e-05, "loss": 0.8488, "step": 632 }, { "epoch": 0.3236196319018405, "grad_norm": 4.22208309173584, "learning_rate": 8.099455993049181e-05, "loss": 0.9365, "step": 633 }, { "epoch": 0.32413087934560325, "grad_norm": 4.312554359436035, "learning_rate": 8.092810474846231e-05, "loss": 0.9539, "step": 634 }, { "epoch": 0.32464212678936605, "grad_norm": 4.0963592529296875, "learning_rate": 8.086156095352144e-05, "loss": 0.9983, "step": 635 }, { "epoch": 0.32515337423312884, "grad_norm": 4.197260856628418, "learning_rate": 8.079492873632554e-05, "loss": 1.0168, "step": 636 }, { "epoch": 0.32566462167689164, "grad_norm": 4.2196173667907715, "learning_rate": 8.07282082877843e-05, "loss": 0.9127, "step": 637 }, { "epoch": 0.3261758691206544, "grad_norm": 4.162968635559082, "learning_rate": 8.06613997990602e-05, "loss": 0.9794, "step": 638 }, { "epoch": 0.3266871165644172, "grad_norm": 4.384978294372559, "learning_rate": 8.059450346156796e-05, "loss": 0.9692, "step": 639 }, { "epoch": 0.32719836400818, "grad_norm": 4.631701469421387, "learning_rate": 8.052751946697403e-05, "loss": 0.9136, "step": 640 }, { "epoch": 0.3277096114519427, "grad_norm": 5.009700298309326, "learning_rate": 8.046044800719594e-05, "loss": 0.8666, "step": 641 }, { "epoch": 0.3282208588957055, "grad_norm": 5.072287082672119, "learning_rate": 8.039328927440188e-05, "loss": 0.9186, "step": 642 }, { "epoch": 0.3287321063394683, "grad_norm": 5.042779445648193, "learning_rate": 8.032604346101009e-05, "loss": 0.843, "step": 643 }, { "epoch": 0.3292433537832311, "grad_norm": 4.215601921081543, "learning_rate": 8.025871075968828e-05, "loss": 0.8339, "step": 644 }, { "epoch": 0.32975460122699385, "grad_norm": 4.962833881378174, "learning_rate": 8.019129136335306e-05, "loss": 0.8041, "step": 645 }, { "epoch": 0.33026584867075665, "grad_norm": 5.858077526092529, "learning_rate": 8.012378546516954e-05, "loss": 0.763, "step": 646 }, { "epoch": 0.33077709611451944, "grad_norm": 6.057731628417969, "learning_rate": 8.00561932585506e-05, "loss": 0.8779, "step": 647 }, { "epoch": 0.3312883435582822, "grad_norm": 6.040656089782715, "learning_rate": 7.99885149371564e-05, "loss": 0.7984, "step": 648 }, { "epoch": 0.331799591002045, "grad_norm": 4.905963897705078, "learning_rate": 7.992075069489387e-05, "loss": 0.5278, "step": 649 }, { "epoch": 0.3323108384458078, "grad_norm": 7.595052242279053, "learning_rate": 7.985290072591605e-05, "loss": 0.5099, "step": 650 }, { "epoch": 0.3328220858895706, "grad_norm": 2.33105206489563, "learning_rate": 7.978496522462167e-05, "loss": 0.8827, "step": 651 }, { "epoch": 0.3333333333333333, "grad_norm": 2.849806070327759, "learning_rate": 7.97169443856545e-05, "loss": 0.9505, "step": 652 }, { "epoch": 0.3338445807770961, "grad_norm": 3.0055220127105713, "learning_rate": 7.964883840390276e-05, "loss": 1.0614, "step": 653 }, { "epoch": 0.3343558282208589, "grad_norm": 3.2699313163757324, "learning_rate": 7.958064747449869e-05, "loss": 1.0308, "step": 654 }, { "epoch": 0.33486707566462165, "grad_norm": 2.836352586746216, "learning_rate": 7.951237179281788e-05, "loss": 1.0131, "step": 655 }, { "epoch": 0.33537832310838445, "grad_norm": 2.913180351257324, "learning_rate": 7.944401155447871e-05, "loss": 0.9972, "step": 656 }, { "epoch": 0.33588957055214724, "grad_norm": 3.03965163230896, "learning_rate": 7.937556695534193e-05, "loss": 0.968, "step": 657 }, { "epoch": 0.33640081799591004, "grad_norm": 2.8682761192321777, "learning_rate": 7.930703819150987e-05, "loss": 0.9992, "step": 658 }, { "epoch": 0.3369120654396728, "grad_norm": 2.7956974506378174, "learning_rate": 7.92384254593261e-05, "loss": 1.0082, "step": 659 }, { "epoch": 0.3374233128834356, "grad_norm": 3.3379359245300293, "learning_rate": 7.916972895537471e-05, "loss": 1.0037, "step": 660 }, { "epoch": 0.3379345603271984, "grad_norm": 3.0874998569488525, "learning_rate": 7.91009488764798e-05, "loss": 1.0145, "step": 661 }, { "epoch": 0.33844580777096117, "grad_norm": 3.038001537322998, "learning_rate": 7.903208541970501e-05, "loss": 0.9002, "step": 662 }, { "epoch": 0.3389570552147239, "grad_norm": 3.2250359058380127, "learning_rate": 7.896313878235278e-05, "loss": 0.9536, "step": 663 }, { "epoch": 0.3394683026584867, "grad_norm": 3.5659546852111816, "learning_rate": 7.889410916196389e-05, "loss": 0.9676, "step": 664 }, { "epoch": 0.3399795501022495, "grad_norm": 3.521686553955078, "learning_rate": 7.882499675631689e-05, "loss": 0.9694, "step": 665 }, { "epoch": 0.34049079754601225, "grad_norm": 3.544236660003662, "learning_rate": 7.875580176342753e-05, "loss": 0.8514, "step": 666 }, { "epoch": 0.34100204498977504, "grad_norm": 3.529057025909424, "learning_rate": 7.868652438154815e-05, "loss": 1.0767, "step": 667 }, { "epoch": 0.34151329243353784, "grad_norm": 3.835726261138916, "learning_rate": 7.86171648091672e-05, "loss": 0.9648, "step": 668 }, { "epoch": 0.34202453987730064, "grad_norm": 3.570894479751587, "learning_rate": 7.854772324500854e-05, "loss": 0.9455, "step": 669 }, { "epoch": 0.3425357873210634, "grad_norm": 3.518613815307617, "learning_rate": 7.847819988803102e-05, "loss": 1.0018, "step": 670 }, { "epoch": 0.3430470347648262, "grad_norm": 3.49399733543396, "learning_rate": 7.84085949374278e-05, "loss": 0.8864, "step": 671 }, { "epoch": 0.34355828220858897, "grad_norm": 3.5140910148620605, "learning_rate": 7.833890859262579e-05, "loss": 1.067, "step": 672 }, { "epoch": 0.3440695296523517, "grad_norm": 3.35764479637146, "learning_rate": 7.826914105328519e-05, "loss": 0.9411, "step": 673 }, { "epoch": 0.3445807770961145, "grad_norm": 3.6672184467315674, "learning_rate": 7.819929251929873e-05, "loss": 0.9742, "step": 674 }, { "epoch": 0.3450920245398773, "grad_norm": 3.7109756469726562, "learning_rate": 7.812936319079127e-05, "loss": 0.9916, "step": 675 }, { "epoch": 0.3456032719836401, "grad_norm": 3.371408224105835, "learning_rate": 7.805935326811912e-05, "loss": 0.9044, "step": 676 }, { "epoch": 0.34611451942740284, "grad_norm": 3.713839054107666, "learning_rate": 7.798926295186955e-05, "loss": 0.9528, "step": 677 }, { "epoch": 0.34662576687116564, "grad_norm": 3.572603702545166, "learning_rate": 7.791909244286009e-05, "loss": 0.8672, "step": 678 }, { "epoch": 0.34713701431492844, "grad_norm": 3.5241007804870605, "learning_rate": 7.784884194213811e-05, "loss": 0.862, "step": 679 }, { "epoch": 0.3476482617586912, "grad_norm": 4.026426792144775, "learning_rate": 7.777851165098012e-05, "loss": 0.96, "step": 680 }, { "epoch": 0.348159509202454, "grad_norm": 4.031967639923096, "learning_rate": 7.770810177089126e-05, "loss": 1.0187, "step": 681 }, { "epoch": 0.3486707566462168, "grad_norm": 3.622410297393799, "learning_rate": 7.763761250360467e-05, "loss": 0.8931, "step": 682 }, { "epoch": 0.34918200408997957, "grad_norm": 3.932042121887207, "learning_rate": 7.756704405108102e-05, "loss": 0.8535, "step": 683 }, { "epoch": 0.3496932515337423, "grad_norm": 3.7137181758880615, "learning_rate": 7.749639661550775e-05, "loss": 0.808, "step": 684 }, { "epoch": 0.3502044989775051, "grad_norm": 3.7734367847442627, "learning_rate": 7.74256703992987e-05, "loss": 0.8274, "step": 685 }, { "epoch": 0.3507157464212679, "grad_norm": 4.37022590637207, "learning_rate": 7.735486560509332e-05, "loss": 0.913, "step": 686 }, { "epoch": 0.3512269938650307, "grad_norm": 3.8798720836639404, "learning_rate": 7.728398243575631e-05, "loss": 0.8297, "step": 687 }, { "epoch": 0.35173824130879344, "grad_norm": 4.2243266105651855, "learning_rate": 7.721302109437685e-05, "loss": 0.9338, "step": 688 }, { "epoch": 0.35224948875255624, "grad_norm": 4.538561820983887, "learning_rate": 7.71419817842681e-05, "loss": 0.917, "step": 689 }, { "epoch": 0.35276073619631904, "grad_norm": 5.12889289855957, "learning_rate": 7.707086470896663e-05, "loss": 0.8964, "step": 690 }, { "epoch": 0.3532719836400818, "grad_norm": 4.37795877456665, "learning_rate": 7.699967007223182e-05, "loss": 0.8095, "step": 691 }, { "epoch": 0.3537832310838446, "grad_norm": 4.92649507522583, "learning_rate": 7.692839807804521e-05, "loss": 0.9034, "step": 692 }, { "epoch": 0.35429447852760737, "grad_norm": 4.200683116912842, "learning_rate": 7.68570489306101e-05, "loss": 0.8081, "step": 693 }, { "epoch": 0.35480572597137017, "grad_norm": 5.237700462341309, "learning_rate": 7.678562283435076e-05, "loss": 1.0181, "step": 694 }, { "epoch": 0.3553169734151329, "grad_norm": 5.287177562713623, "learning_rate": 7.67141199939119e-05, "loss": 1.0058, "step": 695 }, { "epoch": 0.3558282208588957, "grad_norm": 5.213104724884033, "learning_rate": 7.664254061415818e-05, "loss": 0.881, "step": 696 }, { "epoch": 0.3563394683026585, "grad_norm": 5.322814464569092, "learning_rate": 7.657088490017354e-05, "loss": 0.9247, "step": 697 }, { "epoch": 0.35685071574642124, "grad_norm": 5.443381309509277, "learning_rate": 7.64991530572606e-05, "loss": 0.789, "step": 698 }, { "epoch": 0.35736196319018404, "grad_norm": 6.3556060791015625, "learning_rate": 7.642734529094012e-05, "loss": 0.8338, "step": 699 }, { "epoch": 0.35787321063394684, "grad_norm": 8.609705924987793, "learning_rate": 7.635546180695038e-05, "loss": 0.9239, "step": 700 }, { "epoch": 0.35838445807770963, "grad_norm": 2.269519329071045, "learning_rate": 7.628350281124663e-05, "loss": 0.8918, "step": 701 }, { "epoch": 0.3588957055214724, "grad_norm": 2.6576449871063232, "learning_rate": 7.621146851000043e-05, "loss": 1.0788, "step": 702 }, { "epoch": 0.35940695296523517, "grad_norm": 2.7672042846679688, "learning_rate": 7.61393591095991e-05, "loss": 1.0559, "step": 703 }, { "epoch": 0.35991820040899797, "grad_norm": 2.5997817516326904, "learning_rate": 7.606717481664514e-05, "loss": 1.0025, "step": 704 }, { "epoch": 0.3604294478527607, "grad_norm": 2.5708072185516357, "learning_rate": 7.599491583795569e-05, "loss": 0.9982, "step": 705 }, { "epoch": 0.3609406952965235, "grad_norm": 2.8804173469543457, "learning_rate": 7.592258238056174e-05, "loss": 1.0188, "step": 706 }, { "epoch": 0.3614519427402863, "grad_norm": 2.8373563289642334, "learning_rate": 7.58501746517078e-05, "loss": 0.921, "step": 707 }, { "epoch": 0.3619631901840491, "grad_norm": 2.876730442047119, "learning_rate": 7.577769285885109e-05, "loss": 0.9169, "step": 708 }, { "epoch": 0.36247443762781184, "grad_norm": 2.9293148517608643, "learning_rate": 7.570513720966108e-05, "loss": 1.01, "step": 709 }, { "epoch": 0.36298568507157464, "grad_norm": 3.1806929111480713, "learning_rate": 7.563250791201886e-05, "loss": 0.9419, "step": 710 }, { "epoch": 0.36349693251533743, "grad_norm": 3.0284066200256348, "learning_rate": 7.555980517401645e-05, "loss": 1.011, "step": 711 }, { "epoch": 0.36400817995910023, "grad_norm": 3.130155086517334, "learning_rate": 7.548702920395638e-05, "loss": 0.9283, "step": 712 }, { "epoch": 0.36451942740286297, "grad_norm": 3.167332172393799, "learning_rate": 7.541418021035098e-05, "loss": 0.9085, "step": 713 }, { "epoch": 0.36503067484662577, "grad_norm": 3.1641294956207275, "learning_rate": 7.534125840192172e-05, "loss": 0.9266, "step": 714 }, { "epoch": 0.36554192229038857, "grad_norm": 3.073336601257324, "learning_rate": 7.526826398759881e-05, "loss": 0.8859, "step": 715 }, { "epoch": 0.3660531697341513, "grad_norm": 2.870695114135742, "learning_rate": 7.519519717652039e-05, "loss": 0.7983, "step": 716 }, { "epoch": 0.3665644171779141, "grad_norm": 3.243685483932495, "learning_rate": 7.512205817803211e-05, "loss": 0.9613, "step": 717 }, { "epoch": 0.3670756646216769, "grad_norm": 3.117615222930908, "learning_rate": 7.504884720168637e-05, "loss": 0.859, "step": 718 }, { "epoch": 0.3675869120654397, "grad_norm": 3.478621244430542, "learning_rate": 7.497556445724183e-05, "loss": 0.9985, "step": 719 }, { "epoch": 0.36809815950920244, "grad_norm": 3.571955680847168, "learning_rate": 7.490221015466279e-05, "loss": 0.9798, "step": 720 }, { "epoch": 0.36860940695296524, "grad_norm": 3.45243239402771, "learning_rate": 7.482878450411854e-05, "loss": 0.97, "step": 721 }, { "epoch": 0.36912065439672803, "grad_norm": 3.4372360706329346, "learning_rate": 7.475528771598283e-05, "loss": 1.0453, "step": 722 }, { "epoch": 0.3696319018404908, "grad_norm": 3.8749423027038574, "learning_rate": 7.468172000083319e-05, "loss": 0.9453, "step": 723 }, { "epoch": 0.37014314928425357, "grad_norm": 3.3690943717956543, "learning_rate": 7.460808156945036e-05, "loss": 0.8492, "step": 724 }, { "epoch": 0.37065439672801637, "grad_norm": 3.636127471923828, "learning_rate": 7.453437263281777e-05, "loss": 0.9653, "step": 725 }, { "epoch": 0.37116564417177916, "grad_norm": 3.700061082839966, "learning_rate": 7.446059340212072e-05, "loss": 0.9498, "step": 726 }, { "epoch": 0.3716768916155419, "grad_norm": 3.9007370471954346, "learning_rate": 7.438674408874604e-05, "loss": 0.8941, "step": 727 }, { "epoch": 0.3721881390593047, "grad_norm": 3.541825771331787, "learning_rate": 7.431282490428129e-05, "loss": 0.8871, "step": 728 }, { "epoch": 0.3726993865030675, "grad_norm": 3.9885671138763428, "learning_rate": 7.423883606051424e-05, "loss": 1.0297, "step": 729 }, { "epoch": 0.37321063394683024, "grad_norm": 3.7179479598999023, "learning_rate": 7.416477776943222e-05, "loss": 0.8756, "step": 730 }, { "epoch": 0.37372188139059304, "grad_norm": 4.2167229652404785, "learning_rate": 7.409065024322158e-05, "loss": 0.9019, "step": 731 }, { "epoch": 0.37423312883435583, "grad_norm": 3.5955021381378174, "learning_rate": 7.401645369426697e-05, "loss": 0.8644, "step": 732 }, { "epoch": 0.37474437627811863, "grad_norm": 3.7380001544952393, "learning_rate": 7.394218833515088e-05, "loss": 0.8517, "step": 733 }, { "epoch": 0.37525562372188137, "grad_norm": 3.396381378173828, "learning_rate": 7.386785437865287e-05, "loss": 0.7941, "step": 734 }, { "epoch": 0.37576687116564417, "grad_norm": 3.9136502742767334, "learning_rate": 7.37934520377491e-05, "loss": 0.8351, "step": 735 }, { "epoch": 0.37627811860940696, "grad_norm": 3.892547130584717, "learning_rate": 7.371898152561166e-05, "loss": 0.9343, "step": 736 }, { "epoch": 0.37678936605316976, "grad_norm": 3.785649061203003, "learning_rate": 7.364444305560787e-05, "loss": 0.8726, "step": 737 }, { "epoch": 0.3773006134969325, "grad_norm": 4.7827372550964355, "learning_rate": 7.35698368412999e-05, "loss": 0.9685, "step": 738 }, { "epoch": 0.3778118609406953, "grad_norm": 3.921762466430664, "learning_rate": 7.349516309644388e-05, "loss": 0.8303, "step": 739 }, { "epoch": 0.3783231083844581, "grad_norm": 4.252273082733154, "learning_rate": 7.342042203498951e-05, "loss": 0.8746, "step": 740 }, { "epoch": 0.37883435582822084, "grad_norm": 4.284645080566406, "learning_rate": 7.334561387107935e-05, "loss": 0.8135, "step": 741 }, { "epoch": 0.37934560327198363, "grad_norm": 4.3504638671875, "learning_rate": 7.327073881904819e-05, "loss": 0.8595, "step": 742 }, { "epoch": 0.37985685071574643, "grad_norm": 4.201150417327881, "learning_rate": 7.319579709342244e-05, "loss": 0.7961, "step": 743 }, { "epoch": 0.3803680981595092, "grad_norm": 4.760327339172363, "learning_rate": 7.312078890891963e-05, "loss": 0.7495, "step": 744 }, { "epoch": 0.38087934560327197, "grad_norm": 5.05194091796875, "learning_rate": 7.304571448044759e-05, "loss": 0.9073, "step": 745 }, { "epoch": 0.38139059304703476, "grad_norm": 5.017317295074463, "learning_rate": 7.297057402310406e-05, "loss": 0.7862, "step": 746 }, { "epoch": 0.38190184049079756, "grad_norm": 5.787736892700195, "learning_rate": 7.289536775217587e-05, "loss": 0.8488, "step": 747 }, { "epoch": 0.3824130879345603, "grad_norm": 5.555422306060791, "learning_rate": 7.282009588313845e-05, "loss": 0.7949, "step": 748 }, { "epoch": 0.3829243353783231, "grad_norm": 6.039085865020752, "learning_rate": 7.274475863165518e-05, "loss": 0.5657, "step": 749 }, { "epoch": 0.3834355828220859, "grad_norm": 7.193869590759277, "learning_rate": 7.266935621357677e-05, "loss": 0.6427, "step": 750 }, { "epoch": 0.3839468302658487, "grad_norm": 1.978843331336975, "learning_rate": 7.259388884494064e-05, "loss": 0.853, "step": 751 }, { "epoch": 0.38445807770961143, "grad_norm": 2.8742833137512207, "learning_rate": 7.251835674197029e-05, "loss": 1.0315, "step": 752 }, { "epoch": 0.38496932515337423, "grad_norm": 2.5204415321350098, "learning_rate": 7.244276012107467e-05, "loss": 1.0057, "step": 753 }, { "epoch": 0.38548057259713703, "grad_norm": 2.7091565132141113, "learning_rate": 7.236709919884763e-05, "loss": 1.066, "step": 754 }, { "epoch": 0.38599182004089977, "grad_norm": 2.9578049182891846, "learning_rate": 7.229137419206727e-05, "loss": 1.0613, "step": 755 }, { "epoch": 0.38650306748466257, "grad_norm": 2.7542765140533447, "learning_rate": 7.221558531769519e-05, "loss": 0.9996, "step": 756 }, { "epoch": 0.38701431492842536, "grad_norm": 2.6492252349853516, "learning_rate": 7.21397327928761e-05, "loss": 0.9659, "step": 757 }, { "epoch": 0.38752556237218816, "grad_norm": 2.741001605987549, "learning_rate": 7.206381683493702e-05, "loss": 0.9089, "step": 758 }, { "epoch": 0.3880368098159509, "grad_norm": 2.586782693862915, "learning_rate": 7.19878376613867e-05, "loss": 0.9266, "step": 759 }, { "epoch": 0.3885480572597137, "grad_norm": 2.960686206817627, "learning_rate": 7.191179548991507e-05, "loss": 0.9666, "step": 760 }, { "epoch": 0.3890593047034765, "grad_norm": 3.008049249649048, "learning_rate": 7.183569053839248e-05, "loss": 1.0174, "step": 761 }, { "epoch": 0.3895705521472393, "grad_norm": 2.975431203842163, "learning_rate": 7.17595230248692e-05, "loss": 0.9742, "step": 762 }, { "epoch": 0.39008179959100203, "grad_norm": 3.148494243621826, "learning_rate": 7.168329316757475e-05, "loss": 0.9141, "step": 763 }, { "epoch": 0.39059304703476483, "grad_norm": 3.210510730743408, "learning_rate": 7.160700118491728e-05, "loss": 0.9639, "step": 764 }, { "epoch": 0.3911042944785276, "grad_norm": 3.1042428016662598, "learning_rate": 7.15306472954829e-05, "loss": 0.9609, "step": 765 }, { "epoch": 0.39161554192229037, "grad_norm": 3.2748963832855225, "learning_rate": 7.14542317180351e-05, "loss": 0.9741, "step": 766 }, { "epoch": 0.39212678936605316, "grad_norm": 3.0430991649627686, "learning_rate": 7.137775467151411e-05, "loss": 0.9359, "step": 767 }, { "epoch": 0.39263803680981596, "grad_norm": 3.2744600772857666, "learning_rate": 7.130121637503632e-05, "loss": 0.8993, "step": 768 }, { "epoch": 0.39314928425357876, "grad_norm": 3.287580728530884, "learning_rate": 7.122461704789358e-05, "loss": 0.8918, "step": 769 }, { "epoch": 0.3936605316973415, "grad_norm": 3.8326618671417236, "learning_rate": 7.11479569095526e-05, "loss": 0.8956, "step": 770 }, { "epoch": 0.3941717791411043, "grad_norm": 3.353121519088745, "learning_rate": 7.10712361796543e-05, "loss": 0.9102, "step": 771 }, { "epoch": 0.3946830265848671, "grad_norm": 3.537923574447632, "learning_rate": 7.099445507801323e-05, "loss": 0.9339, "step": 772 }, { "epoch": 0.39519427402862983, "grad_norm": 3.2914555072784424, "learning_rate": 7.091761382461692e-05, "loss": 0.8794, "step": 773 }, { "epoch": 0.39570552147239263, "grad_norm": 3.461108922958374, "learning_rate": 7.084071263962521e-05, "loss": 0.9089, "step": 774 }, { "epoch": 0.3962167689161554, "grad_norm": 3.281498908996582, "learning_rate": 7.076375174336972e-05, "loss": 0.8186, "step": 775 }, { "epoch": 0.3967280163599182, "grad_norm": 3.6270294189453125, "learning_rate": 7.068673135635302e-05, "loss": 0.8756, "step": 776 }, { "epoch": 0.39723926380368096, "grad_norm": 3.7898643016815186, "learning_rate": 7.060965169924828e-05, "loss": 0.9115, "step": 777 }, { "epoch": 0.39775051124744376, "grad_norm": 3.4987165927886963, "learning_rate": 7.053251299289837e-05, "loss": 0.8832, "step": 778 }, { "epoch": 0.39826175869120656, "grad_norm": 3.6572365760803223, "learning_rate": 7.045531545831541e-05, "loss": 0.9144, "step": 779 }, { "epoch": 0.3987730061349693, "grad_norm": 4.028952598571777, "learning_rate": 7.037805931668005e-05, "loss": 0.9033, "step": 780 }, { "epoch": 0.3992842535787321, "grad_norm": 3.5009331703186035, "learning_rate": 7.030074478934085e-05, "loss": 0.8903, "step": 781 }, { "epoch": 0.3997955010224949, "grad_norm": 3.7041122913360596, "learning_rate": 7.022337209781367e-05, "loss": 0.8992, "step": 782 }, { "epoch": 0.4003067484662577, "grad_norm": 3.8405399322509766, "learning_rate": 7.014594146378097e-05, "loss": 0.8085, "step": 783 }, { "epoch": 0.40081799591002043, "grad_norm": 3.704244375228882, "learning_rate": 7.006845310909131e-05, "loss": 0.8948, "step": 784 }, { "epoch": 0.4013292433537832, "grad_norm": 3.5414698123931885, "learning_rate": 6.999090725575854e-05, "loss": 0.7511, "step": 785 }, { "epoch": 0.401840490797546, "grad_norm": 4.033692836761475, "learning_rate": 6.99133041259613e-05, "loss": 0.8177, "step": 786 }, { "epoch": 0.4023517382413088, "grad_norm": 4.002652168273926, "learning_rate": 6.98356439420423e-05, "loss": 0.8903, "step": 787 }, { "epoch": 0.40286298568507156, "grad_norm": 4.07108736038208, "learning_rate": 6.975792692650777e-05, "loss": 0.972, "step": 788 }, { "epoch": 0.40337423312883436, "grad_norm": 3.756382942199707, "learning_rate": 6.968015330202672e-05, "loss": 0.8346, "step": 789 }, { "epoch": 0.40388548057259716, "grad_norm": 4.131988525390625, "learning_rate": 6.960232329143039e-05, "loss": 0.9014, "step": 790 }, { "epoch": 0.4043967280163599, "grad_norm": 4.3247833251953125, "learning_rate": 6.952443711771151e-05, "loss": 0.8945, "step": 791 }, { "epoch": 0.4049079754601227, "grad_norm": 4.037295818328857, "learning_rate": 6.94464950040238e-05, "loss": 0.9311, "step": 792 }, { "epoch": 0.4054192229038855, "grad_norm": 4.475683689117432, "learning_rate": 6.936849717368122e-05, "loss": 0.8492, "step": 793 }, { "epoch": 0.4059304703476483, "grad_norm": 4.772801399230957, "learning_rate": 6.929044385015735e-05, "loss": 0.9542, "step": 794 }, { "epoch": 0.40644171779141103, "grad_norm": 4.975774765014648, "learning_rate": 6.921233525708479e-05, "loss": 0.9035, "step": 795 }, { "epoch": 0.4069529652351738, "grad_norm": 4.816725254058838, "learning_rate": 6.91341716182545e-05, "loss": 0.9328, "step": 796 }, { "epoch": 0.4074642126789366, "grad_norm": 5.29307222366333, "learning_rate": 6.905595315761511e-05, "loss": 0.6613, "step": 797 }, { "epoch": 0.40797546012269936, "grad_norm": 5.284379959106445, "learning_rate": 6.897768009927239e-05, "loss": 0.6603, "step": 798 }, { "epoch": 0.40848670756646216, "grad_norm": 6.468302249908447, "learning_rate": 6.889935266748848e-05, "loss": 0.5906, "step": 799 }, { "epoch": 0.40899795501022496, "grad_norm": 7.637302398681641, "learning_rate": 6.882097108668132e-05, "loss": 0.8293, "step": 800 }, { "epoch": 0.40950920245398775, "grad_norm": 2.510533094406128, "learning_rate": 6.8742535581424e-05, "loss": 0.8645, "step": 801 }, { "epoch": 0.4100204498977505, "grad_norm": 2.7541604042053223, "learning_rate": 6.866404637644414e-05, "loss": 1.0526, "step": 802 }, { "epoch": 0.4105316973415133, "grad_norm": 2.601616382598877, "learning_rate": 6.858550369662316e-05, "loss": 0.9845, "step": 803 }, { "epoch": 0.4110429447852761, "grad_norm": 3.236582040786743, "learning_rate": 6.850690776699573e-05, "loss": 1.0338, "step": 804 }, { "epoch": 0.41155419222903883, "grad_norm": 3.11091685295105, "learning_rate": 6.842825881274905e-05, "loss": 0.9695, "step": 805 }, { "epoch": 0.4120654396728016, "grad_norm": 3.053615093231201, "learning_rate": 6.834955705922232e-05, "loss": 0.9887, "step": 806 }, { "epoch": 0.4125766871165644, "grad_norm": 2.8809502124786377, "learning_rate": 6.827080273190593e-05, "loss": 0.9319, "step": 807 }, { "epoch": 0.4130879345603272, "grad_norm": 2.974292039871216, "learning_rate": 6.819199605644094e-05, "loss": 0.9261, "step": 808 }, { "epoch": 0.41359918200408996, "grad_norm": 2.8838043212890625, "learning_rate": 6.81131372586184e-05, "loss": 0.9214, "step": 809 }, { "epoch": 0.41411042944785276, "grad_norm": 3.148404598236084, "learning_rate": 6.803422656437867e-05, "loss": 1.0052, "step": 810 }, { "epoch": 0.41462167689161555, "grad_norm": 4.05100679397583, "learning_rate": 6.795526419981084e-05, "loss": 1.0043, "step": 811 }, { "epoch": 0.41513292433537835, "grad_norm": 3.163527727127075, "learning_rate": 6.7876250391152e-05, "loss": 1.018, "step": 812 }, { "epoch": 0.4156441717791411, "grad_norm": 3.0442557334899902, "learning_rate": 6.779718536478667e-05, "loss": 1.0564, "step": 813 }, { "epoch": 0.4161554192229039, "grad_norm": 3.1469643115997314, "learning_rate": 6.771806934724609e-05, "loss": 0.9531, "step": 814 }, { "epoch": 0.4166666666666667, "grad_norm": 2.9278955459594727, "learning_rate": 6.763890256520761e-05, "loss": 0.8475, "step": 815 }, { "epoch": 0.4171779141104294, "grad_norm": 2.9922728538513184, "learning_rate": 6.755968524549402e-05, "loss": 0.9396, "step": 816 }, { "epoch": 0.4176891615541922, "grad_norm": 3.1033873558044434, "learning_rate": 6.748041761507289e-05, "loss": 0.9287, "step": 817 }, { "epoch": 0.418200408997955, "grad_norm": 3.116170644760132, "learning_rate": 6.740109990105599e-05, "loss": 0.9613, "step": 818 }, { "epoch": 0.4187116564417178, "grad_norm": 3.148620367050171, "learning_rate": 6.73217323306985e-05, "loss": 0.8181, "step": 819 }, { "epoch": 0.41922290388548056, "grad_norm": 3.3708975315093994, "learning_rate": 6.724231513139852e-05, "loss": 0.932, "step": 820 }, { "epoch": 0.41973415132924335, "grad_norm": 3.3369412422180176, "learning_rate": 6.716284853069634e-05, "loss": 0.8951, "step": 821 }, { "epoch": 0.42024539877300615, "grad_norm": 3.5555686950683594, "learning_rate": 6.708333275627374e-05, "loss": 0.9422, "step": 822 }, { "epoch": 0.4207566462167689, "grad_norm": 3.544792890548706, "learning_rate": 6.700376803595343e-05, "loss": 0.9328, "step": 823 }, { "epoch": 0.4212678936605317, "grad_norm": 3.218123197555542, "learning_rate": 6.692415459769836e-05, "loss": 0.9012, "step": 824 }, { "epoch": 0.4217791411042945, "grad_norm": 3.3866398334503174, "learning_rate": 6.6844492669611e-05, "loss": 0.8799, "step": 825 }, { "epoch": 0.4222903885480573, "grad_norm": 4.010083198547363, "learning_rate": 6.676478247993284e-05, "loss": 0.9741, "step": 826 }, { "epoch": 0.42280163599182, "grad_norm": 3.7181904315948486, "learning_rate": 6.668502425704359e-05, "loss": 0.9802, "step": 827 }, { "epoch": 0.4233128834355828, "grad_norm": 3.6271464824676514, "learning_rate": 6.66052182294606e-05, "loss": 0.9424, "step": 828 }, { "epoch": 0.4238241308793456, "grad_norm": 3.841322898864746, "learning_rate": 6.652536462583817e-05, "loss": 0.9182, "step": 829 }, { "epoch": 0.42433537832310836, "grad_norm": 4.184422492980957, "learning_rate": 6.644546367496692e-05, "loss": 0.8529, "step": 830 }, { "epoch": 0.42484662576687116, "grad_norm": 3.9780843257904053, "learning_rate": 6.636551560577312e-05, "loss": 0.881, "step": 831 }, { "epoch": 0.42535787321063395, "grad_norm": 3.784928798675537, "learning_rate": 6.628552064731807e-05, "loss": 0.9653, "step": 832 }, { "epoch": 0.42586912065439675, "grad_norm": 3.955132484436035, "learning_rate": 6.620547902879738e-05, "loss": 0.9304, "step": 833 }, { "epoch": 0.4263803680981595, "grad_norm": 3.502516031265259, "learning_rate": 6.612539097954035e-05, "loss": 0.7941, "step": 834 }, { "epoch": 0.4268916155419223, "grad_norm": 4.0014824867248535, "learning_rate": 6.604525672900932e-05, "loss": 0.9795, "step": 835 }, { "epoch": 0.4274028629856851, "grad_norm": 4.320385932922363, "learning_rate": 6.5965076506799e-05, "loss": 1.14, "step": 836 }, { "epoch": 0.4279141104294479, "grad_norm": 3.5574934482574463, "learning_rate": 6.588485054263583e-05, "loss": 0.8958, "step": 837 }, { "epoch": 0.4284253578732106, "grad_norm": 4.321437358856201, "learning_rate": 6.580457906637727e-05, "loss": 0.9631, "step": 838 }, { "epoch": 0.4289366053169734, "grad_norm": 3.9037375450134277, "learning_rate": 6.57242623080112e-05, "loss": 0.8151, "step": 839 }, { "epoch": 0.4294478527607362, "grad_norm": 4.19363260269165, "learning_rate": 6.564390049765528e-05, "loss": 0.9471, "step": 840 }, { "epoch": 0.42995910020449896, "grad_norm": 3.846937894821167, "learning_rate": 6.556349386555615e-05, "loss": 0.761, "step": 841 }, { "epoch": 0.43047034764826175, "grad_norm": 3.813044786453247, "learning_rate": 6.548304264208894e-05, "loss": 0.7439, "step": 842 }, { "epoch": 0.43098159509202455, "grad_norm": 4.229809284210205, "learning_rate": 6.540254705775657e-05, "loss": 0.8742, "step": 843 }, { "epoch": 0.43149284253578735, "grad_norm": 4.918692588806152, "learning_rate": 6.532200734318896e-05, "loss": 0.9837, "step": 844 }, { "epoch": 0.4320040899795501, "grad_norm": 4.5872955322265625, "learning_rate": 6.524142372914255e-05, "loss": 0.9378, "step": 845 }, { "epoch": 0.4325153374233129, "grad_norm": 4.543177604675293, "learning_rate": 6.516079644649954e-05, "loss": 0.7143, "step": 846 }, { "epoch": 0.4330265848670757, "grad_norm": 5.153186321258545, "learning_rate": 6.508012572626724e-05, "loss": 0.743, "step": 847 }, { "epoch": 0.4335378323108384, "grad_norm": 4.661349296569824, "learning_rate": 6.49994117995774e-05, "loss": 0.7493, "step": 848 }, { "epoch": 0.4340490797546012, "grad_norm": 5.541446685791016, "learning_rate": 6.491865489768556e-05, "loss": 0.6782, "step": 849 }, { "epoch": 0.434560327198364, "grad_norm": 7.626821041107178, "learning_rate": 6.483785525197044e-05, "loss": 0.7414, "step": 850 }, { "epoch": 0.4350715746421268, "grad_norm": 2.0656769275665283, "learning_rate": 6.475701309393318e-05, "loss": 0.898, "step": 851 }, { "epoch": 0.43558282208588955, "grad_norm": 2.457845449447632, "learning_rate": 6.467612865519674e-05, "loss": 0.9019, "step": 852 }, { "epoch": 0.43609406952965235, "grad_norm": 2.4107584953308105, "learning_rate": 6.45952021675052e-05, "loss": 0.9884, "step": 853 }, { "epoch": 0.43660531697341515, "grad_norm": 2.524989128112793, "learning_rate": 6.451423386272312e-05, "loss": 1.0709, "step": 854 }, { "epoch": 0.4371165644171779, "grad_norm": 2.8318159580230713, "learning_rate": 6.44332239728349e-05, "loss": 0.9647, "step": 855 }, { "epoch": 0.4376278118609407, "grad_norm": 2.6179370880126953, "learning_rate": 6.435217272994406e-05, "loss": 1.0725, "step": 856 }, { "epoch": 0.4381390593047035, "grad_norm": 2.7386882305145264, "learning_rate": 6.427108036627262e-05, "loss": 0.9827, "step": 857 }, { "epoch": 0.4386503067484663, "grad_norm": 2.72357439994812, "learning_rate": 6.418994711416038e-05, "loss": 0.945, "step": 858 }, { "epoch": 0.439161554192229, "grad_norm": 2.9313924312591553, "learning_rate": 6.410877320606432e-05, "loss": 1.0524, "step": 859 }, { "epoch": 0.4396728016359918, "grad_norm": 3.1169660091400146, "learning_rate": 6.402755887455792e-05, "loss": 1.0112, "step": 860 }, { "epoch": 0.4401840490797546, "grad_norm": 3.151090621948242, "learning_rate": 6.394630435233045e-05, "loss": 1.1376, "step": 861 }, { "epoch": 0.44069529652351735, "grad_norm": 3.0045671463012695, "learning_rate": 6.386500987218633e-05, "loss": 0.8827, "step": 862 }, { "epoch": 0.44120654396728015, "grad_norm": 3.427995443344116, "learning_rate": 6.378367566704449e-05, "loss": 0.9608, "step": 863 }, { "epoch": 0.44171779141104295, "grad_norm": 3.3372836112976074, "learning_rate": 6.370230196993763e-05, "loss": 1.0425, "step": 864 }, { "epoch": 0.44222903885480574, "grad_norm": 2.8694262504577637, "learning_rate": 6.362088901401165e-05, "loss": 0.9048, "step": 865 }, { "epoch": 0.4427402862985685, "grad_norm": 3.1645376682281494, "learning_rate": 6.353943703252493e-05, "loss": 1.0502, "step": 866 }, { "epoch": 0.4432515337423313, "grad_norm": 3.32002592086792, "learning_rate": 6.345794625884762e-05, "loss": 0.9785, "step": 867 }, { "epoch": 0.4437627811860941, "grad_norm": 3.2119669914245605, "learning_rate": 6.337641692646106e-05, "loss": 0.9322, "step": 868 }, { "epoch": 0.4442740286298569, "grad_norm": 3.1519265174865723, "learning_rate": 6.3294849268957e-05, "loss": 0.9326, "step": 869 }, { "epoch": 0.4447852760736196, "grad_norm": 3.1073238849639893, "learning_rate": 6.321324352003711e-05, "loss": 0.915, "step": 870 }, { "epoch": 0.4452965235173824, "grad_norm": 3.0250244140625, "learning_rate": 6.313159991351206e-05, "loss": 0.8389, "step": 871 }, { "epoch": 0.4458077709611452, "grad_norm": 3.6045401096343994, "learning_rate": 6.30499186833011e-05, "loss": 0.996, "step": 872 }, { "epoch": 0.44631901840490795, "grad_norm": 3.9004733562469482, "learning_rate": 6.296820006343122e-05, "loss": 0.993, "step": 873 }, { "epoch": 0.44683026584867075, "grad_norm": 3.555574417114258, "learning_rate": 6.288644428803653e-05, "loss": 0.918, "step": 874 }, { "epoch": 0.44734151329243355, "grad_norm": 3.5566821098327637, "learning_rate": 6.280465159135763e-05, "loss": 0.8514, "step": 875 }, { "epoch": 0.44785276073619634, "grad_norm": 3.650169849395752, "learning_rate": 6.272282220774091e-05, "loss": 0.9625, "step": 876 }, { "epoch": 0.4483640081799591, "grad_norm": 3.5784823894500732, "learning_rate": 6.264095637163779e-05, "loss": 0.9592, "step": 877 }, { "epoch": 0.4488752556237219, "grad_norm": 3.529385566711426, "learning_rate": 6.255905431760424e-05, "loss": 0.9422, "step": 878 }, { "epoch": 0.4493865030674847, "grad_norm": 3.6039819717407227, "learning_rate": 6.247711628029993e-05, "loss": 0.8743, "step": 879 }, { "epoch": 0.4498977505112474, "grad_norm": 3.4549014568328857, "learning_rate": 6.239514249448767e-05, "loss": 0.889, "step": 880 }, { "epoch": 0.4504089979550102, "grad_norm": 3.8468194007873535, "learning_rate": 6.231313319503264e-05, "loss": 1.023, "step": 881 }, { "epoch": 0.450920245398773, "grad_norm": 4.1849365234375, "learning_rate": 6.22310886169018e-05, "loss": 0.9639, "step": 882 }, { "epoch": 0.4514314928425358, "grad_norm": 3.9194602966308594, "learning_rate": 6.21490089951632e-05, "loss": 0.9048, "step": 883 }, { "epoch": 0.45194274028629855, "grad_norm": 3.7644968032836914, "learning_rate": 6.206689456498529e-05, "loss": 0.8741, "step": 884 }, { "epoch": 0.45245398773006135, "grad_norm": 3.7964026927948, "learning_rate": 6.198474556163623e-05, "loss": 0.8582, "step": 885 }, { "epoch": 0.45296523517382414, "grad_norm": 4.092611312866211, "learning_rate": 6.190256222048327e-05, "loss": 0.8069, "step": 886 }, { "epoch": 0.4534764826175869, "grad_norm": 3.79891300201416, "learning_rate": 6.1820344776992e-05, "loss": 0.887, "step": 887 }, { "epoch": 0.4539877300613497, "grad_norm": 4.135311126708984, "learning_rate": 6.173809346672574e-05, "loss": 0.8481, "step": 888 }, { "epoch": 0.4544989775051125, "grad_norm": 4.441306114196777, "learning_rate": 6.165580852534487e-05, "loss": 0.8705, "step": 889 }, { "epoch": 0.4550102249488753, "grad_norm": 4.326587677001953, "learning_rate": 6.157349018860607e-05, "loss": 0.9348, "step": 890 }, { "epoch": 0.455521472392638, "grad_norm": 5.497438907623291, "learning_rate": 6.149113869236175e-05, "loss": 0.7662, "step": 891 }, { "epoch": 0.4560327198364008, "grad_norm": 4.413537979125977, "learning_rate": 6.14087542725593e-05, "loss": 0.9143, "step": 892 }, { "epoch": 0.4565439672801636, "grad_norm": 4.53402853012085, "learning_rate": 6.132633716524046e-05, "loss": 0.789, "step": 893 }, { "epoch": 0.4570552147239264, "grad_norm": 4.856945037841797, "learning_rate": 6.124388760654059e-05, "loss": 0.9029, "step": 894 }, { "epoch": 0.45756646216768915, "grad_norm": 5.3796610832214355, "learning_rate": 6.116140583268806e-05, "loss": 0.7887, "step": 895 }, { "epoch": 0.45807770961145194, "grad_norm": 4.99921178817749, "learning_rate": 6.107889208000354e-05, "loss": 0.8267, "step": 896 }, { "epoch": 0.45858895705521474, "grad_norm": 5.182647705078125, "learning_rate": 6.0996346584899286e-05, "loss": 0.8042, "step": 897 }, { "epoch": 0.4591002044989775, "grad_norm": 4.6235270500183105, "learning_rate": 6.0913769583878555e-05, "loss": 0.6185, "step": 898 }, { "epoch": 0.4596114519427403, "grad_norm": 5.750977039337158, "learning_rate": 6.083116131353484e-05, "loss": 0.7183, "step": 899 }, { "epoch": 0.4601226993865031, "grad_norm": 7.206790447235107, "learning_rate": 6.0748522010551215e-05, "loss": 0.4708, "step": 900 }, { "epoch": 0.4606339468302659, "grad_norm": 2.1739280223846436, "learning_rate": 6.066585191169969e-05, "loss": 0.8614, "step": 901 }, { "epoch": 0.4611451942740286, "grad_norm": 2.5511627197265625, "learning_rate": 6.0583151253840486e-05, "loss": 0.9832, "step": 902 }, { "epoch": 0.4616564417177914, "grad_norm": 2.4750165939331055, "learning_rate": 6.050042027392142e-05, "loss": 0.9799, "step": 903 }, { "epoch": 0.4621676891615542, "grad_norm": 2.503610849380493, "learning_rate": 6.0417659208977127e-05, "loss": 0.8716, "step": 904 }, { "epoch": 0.46267893660531695, "grad_norm": 2.991971492767334, "learning_rate": 6.033486829612851e-05, "loss": 0.9855, "step": 905 }, { "epoch": 0.46319018404907975, "grad_norm": 2.931039571762085, "learning_rate": 6.025204777258191e-05, "loss": 1.0956, "step": 906 }, { "epoch": 0.46370143149284254, "grad_norm": 2.9422197341918945, "learning_rate": 6.016919787562858e-05, "loss": 0.9248, "step": 907 }, { "epoch": 0.46421267893660534, "grad_norm": 2.9751358032226562, "learning_rate": 6.008631884264388e-05, "loss": 0.9444, "step": 908 }, { "epoch": 0.4647239263803681, "grad_norm": 2.9331753253936768, "learning_rate": 6.000341091108665e-05, "loss": 1.0345, "step": 909 }, { "epoch": 0.4652351738241309, "grad_norm": 2.805023670196533, "learning_rate": 5.99204743184986e-05, "loss": 0.9525, "step": 910 }, { "epoch": 0.4657464212678937, "grad_norm": 2.883270740509033, "learning_rate": 5.983750930250345e-05, "loss": 0.9199, "step": 911 }, { "epoch": 0.4662576687116564, "grad_norm": 3.328139066696167, "learning_rate": 5.9754516100806423e-05, "loss": 1.0521, "step": 912 }, { "epoch": 0.4667689161554192, "grad_norm": 2.935974359512329, "learning_rate": 5.9671494951193486e-05, "loss": 0.9543, "step": 913 }, { "epoch": 0.467280163599182, "grad_norm": 2.9338274002075195, "learning_rate": 5.958844609153068e-05, "loss": 0.8694, "step": 914 }, { "epoch": 0.4677914110429448, "grad_norm": 3.335564374923706, "learning_rate": 5.950536975976345e-05, "loss": 0.9262, "step": 915 }, { "epoch": 0.46830265848670755, "grad_norm": 3.262054681777954, "learning_rate": 5.9422266193915924e-05, "loss": 0.9122, "step": 916 }, { "epoch": 0.46881390593047034, "grad_norm": 3.0267891883850098, "learning_rate": 5.933913563209026e-05, "loss": 0.8729, "step": 917 }, { "epoch": 0.46932515337423314, "grad_norm": 3.102147102355957, "learning_rate": 5.925597831246601e-05, "loss": 0.9029, "step": 918 }, { "epoch": 0.46983640081799594, "grad_norm": 3.2279419898986816, "learning_rate": 5.917279447329933e-05, "loss": 0.9477, "step": 919 }, { "epoch": 0.4703476482617587, "grad_norm": 3.404975652694702, "learning_rate": 5.908958435292241e-05, "loss": 0.9897, "step": 920 }, { "epoch": 0.4708588957055215, "grad_norm": 3.5861120223999023, "learning_rate": 5.900634818974269e-05, "loss": 0.915, "step": 921 }, { "epoch": 0.47137014314928427, "grad_norm": 3.6053731441497803, "learning_rate": 5.8923086222242255e-05, "loss": 0.8687, "step": 922 }, { "epoch": 0.471881390593047, "grad_norm": 3.653125286102295, "learning_rate": 5.883979868897712e-05, "loss": 0.8448, "step": 923 }, { "epoch": 0.4723926380368098, "grad_norm": 3.7469069957733154, "learning_rate": 5.8756485828576544e-05, "loss": 0.971, "step": 924 }, { "epoch": 0.4729038854805726, "grad_norm": 3.449720859527588, "learning_rate": 5.867314787974237e-05, "loss": 0.877, "step": 925 }, { "epoch": 0.4734151329243354, "grad_norm": 3.7333271503448486, "learning_rate": 5.858978508124829e-05, "loss": 0.9484, "step": 926 }, { "epoch": 0.47392638036809814, "grad_norm": 3.670445442199707, "learning_rate": 5.8506397671939214e-05, "loss": 0.9182, "step": 927 }, { "epoch": 0.47443762781186094, "grad_norm": 4.089524745941162, "learning_rate": 5.8422985890730576e-05, "loss": 0.9254, "step": 928 }, { "epoch": 0.47494887525562374, "grad_norm": 3.6270928382873535, "learning_rate": 5.833954997660761e-05, "loss": 0.9148, "step": 929 }, { "epoch": 0.4754601226993865, "grad_norm": 3.7784345149993896, "learning_rate": 5.825609016862469e-05, "loss": 0.8091, "step": 930 }, { "epoch": 0.4759713701431493, "grad_norm": 4.248725414276123, "learning_rate": 5.81726067059047e-05, "loss": 1.0091, "step": 931 }, { "epoch": 0.47648261758691207, "grad_norm": 3.9264259338378906, "learning_rate": 5.808909982763825e-05, "loss": 0.9502, "step": 932 }, { "epoch": 0.47699386503067487, "grad_norm": 3.8590493202209473, "learning_rate": 5.8005569773083035e-05, "loss": 0.8454, "step": 933 }, { "epoch": 0.4775051124744376, "grad_norm": 4.2589802742004395, "learning_rate": 5.7922016781563205e-05, "loss": 1.0871, "step": 934 }, { "epoch": 0.4780163599182004, "grad_norm": 3.9979145526885986, "learning_rate": 5.7838441092468565e-05, "loss": 0.8526, "step": 935 }, { "epoch": 0.4785276073619632, "grad_norm": 4.0638041496276855, "learning_rate": 5.775484294525399e-05, "loss": 0.9021, "step": 936 }, { "epoch": 0.47903885480572594, "grad_norm": 3.9989681243896484, "learning_rate": 5.767122257943869e-05, "loss": 0.8017, "step": 937 }, { "epoch": 0.47955010224948874, "grad_norm": 3.931239366531372, "learning_rate": 5.758758023460553e-05, "loss": 0.8762, "step": 938 }, { "epoch": 0.48006134969325154, "grad_norm": 4.070004463195801, "learning_rate": 5.750391615040037e-05, "loss": 0.7725, "step": 939 }, { "epoch": 0.48057259713701433, "grad_norm": 3.9469223022460938, "learning_rate": 5.742023056653131e-05, "loss": 0.7972, "step": 940 }, { "epoch": 0.4810838445807771, "grad_norm": 4.728452682495117, "learning_rate": 5.733652372276809e-05, "loss": 0.9215, "step": 941 }, { "epoch": 0.4815950920245399, "grad_norm": 4.572932243347168, "learning_rate": 5.7252795858941344e-05, "loss": 0.8456, "step": 942 }, { "epoch": 0.48210633946830267, "grad_norm": 5.1015520095825195, "learning_rate": 5.716904721494195e-05, "loss": 0.9307, "step": 943 }, { "epoch": 0.48261758691206547, "grad_norm": 4.975411415100098, "learning_rate": 5.70852780307203e-05, "loss": 0.7936, "step": 944 }, { "epoch": 0.4831288343558282, "grad_norm": 4.388181209564209, "learning_rate": 5.7001488546285666e-05, "loss": 0.7024, "step": 945 }, { "epoch": 0.483640081799591, "grad_norm": 5.069786548614502, "learning_rate": 5.691767900170542e-05, "loss": 0.6689, "step": 946 }, { "epoch": 0.4841513292433538, "grad_norm": 5.24841833114624, "learning_rate": 5.6833849637104476e-05, "loss": 0.8018, "step": 947 }, { "epoch": 0.48466257668711654, "grad_norm": 5.333656311035156, "learning_rate": 5.675000069266451e-05, "loss": 0.7003, "step": 948 }, { "epoch": 0.48517382413087934, "grad_norm": 6.533324718475342, "learning_rate": 5.666613240862331e-05, "loss": 0.6977, "step": 949 }, { "epoch": 0.48568507157464214, "grad_norm": 8.306028366088867, "learning_rate": 5.658224502527404e-05, "loss": 1.1272, "step": 950 }, { "epoch": 0.48619631901840493, "grad_norm": 2.5492875576019287, "learning_rate": 5.649833878296462e-05, "loss": 0.8884, "step": 951 }, { "epoch": 0.4867075664621677, "grad_norm": 2.5944671630859375, "learning_rate": 5.641441392209699e-05, "loss": 0.8951, "step": 952 }, { "epoch": 0.48721881390593047, "grad_norm": 2.6134727001190186, "learning_rate": 5.6330470683126424e-05, "loss": 0.9544, "step": 953 }, { "epoch": 0.48773006134969327, "grad_norm": 2.720135450363159, "learning_rate": 5.624650930656089e-05, "loss": 0.9283, "step": 954 }, { "epoch": 0.488241308793456, "grad_norm": 2.600879669189453, "learning_rate": 5.616253003296027e-05, "loss": 0.9107, "step": 955 }, { "epoch": 0.4887525562372188, "grad_norm": 2.7407121658325195, "learning_rate": 5.6078533102935745e-05, "loss": 0.95, "step": 956 }, { "epoch": 0.4892638036809816, "grad_norm": 3.158928155899048, "learning_rate": 5.599451875714913e-05, "loss": 0.9942, "step": 957 }, { "epoch": 0.4897750511247444, "grad_norm": 2.9920670986175537, "learning_rate": 5.5910487236312045e-05, "loss": 1.0038, "step": 958 }, { "epoch": 0.49028629856850714, "grad_norm": 3.3155324459075928, "learning_rate": 5.582643878118541e-05, "loss": 0.9121, "step": 959 }, { "epoch": 0.49079754601226994, "grad_norm": 3.5027592182159424, "learning_rate": 5.574237363257858e-05, "loss": 0.9685, "step": 960 }, { "epoch": 0.49130879345603273, "grad_norm": 3.1491141319274902, "learning_rate": 5.565829203134881e-05, "loss": 1.1047, "step": 961 }, { "epoch": 0.4918200408997955, "grad_norm": 3.131809949874878, "learning_rate": 5.557419421840048e-05, "loss": 0.9373, "step": 962 }, { "epoch": 0.49233128834355827, "grad_norm": 3.011101245880127, "learning_rate": 5.549008043468439e-05, "loss": 0.9795, "step": 963 }, { "epoch": 0.49284253578732107, "grad_norm": 3.0677943229675293, "learning_rate": 5.540595092119709e-05, "loss": 0.9323, "step": 964 }, { "epoch": 0.49335378323108386, "grad_norm": 2.9634270668029785, "learning_rate": 5.532180591898026e-05, "loss": 0.917, "step": 965 }, { "epoch": 0.4938650306748466, "grad_norm": 3.0387773513793945, "learning_rate": 5.5237645669119895e-05, "loss": 0.8734, "step": 966 }, { "epoch": 0.4943762781186094, "grad_norm": 3.3674721717834473, "learning_rate": 5.51534704127457e-05, "loss": 0.9791, "step": 967 }, { "epoch": 0.4948875255623722, "grad_norm": 3.1492297649383545, "learning_rate": 5.50692803910304e-05, "loss": 0.9631, "step": 968 }, { "epoch": 0.495398773006135, "grad_norm": 3.6630218029022217, "learning_rate": 5.498507584518896e-05, "loss": 0.9708, "step": 969 }, { "epoch": 0.49591002044989774, "grad_norm": 3.3851146697998047, "learning_rate": 5.490085701647805e-05, "loss": 0.8997, "step": 970 }, { "epoch": 0.49642126789366053, "grad_norm": 3.156371831893921, "learning_rate": 5.481662414619515e-05, "loss": 0.9712, "step": 971 }, { "epoch": 0.49693251533742333, "grad_norm": 3.431939125061035, "learning_rate": 5.473237747567805e-05, "loss": 0.9149, "step": 972 }, { "epoch": 0.49744376278118607, "grad_norm": 3.7921383380889893, "learning_rate": 5.464811724630411e-05, "loss": 0.8872, "step": 973 }, { "epoch": 0.49795501022494887, "grad_norm": 3.2609188556671143, "learning_rate": 5.456384369948943e-05, "loss": 0.8625, "step": 974 }, { "epoch": 0.49846625766871167, "grad_norm": 3.48323392868042, "learning_rate": 5.4479557076688325e-05, "loss": 0.9007, "step": 975 }, { "epoch": 0.49897750511247446, "grad_norm": 3.2786710262298584, "learning_rate": 5.439525761939261e-05, "loss": 0.8592, "step": 976 }, { "epoch": 0.4994887525562372, "grad_norm": 3.686288833618164, "learning_rate": 5.431094556913082e-05, "loss": 0.8544, "step": 977 }, { "epoch": 0.5, "grad_norm": 4.029841899871826, "learning_rate": 5.422662116746759e-05, "loss": 0.9432, "step": 978 }, { "epoch": 0.5005112474437627, "grad_norm": 3.703411102294922, "learning_rate": 5.414228465600293e-05, "loss": 0.9156, "step": 979 }, { "epoch": 0.5010224948875256, "grad_norm": 4.18709659576416, "learning_rate": 5.4057936276371565e-05, "loss": 0.9549, "step": 980 }, { "epoch": 0.5015337423312883, "grad_norm": 3.750430107116699, "learning_rate": 5.397357627024221e-05, "loss": 0.8906, "step": 981 }, { "epoch": 0.5020449897750511, "grad_norm": 3.5980660915374756, "learning_rate": 5.3889204879316913e-05, "loss": 0.8738, "step": 982 }, { "epoch": 0.5025562372188139, "grad_norm": 4.250522136688232, "learning_rate": 5.3804822345330295e-05, "loss": 0.9729, "step": 983 }, { "epoch": 0.5030674846625767, "grad_norm": 4.09278678894043, "learning_rate": 5.372042891004896e-05, "loss": 0.8706, "step": 984 }, { "epoch": 0.5035787321063395, "grad_norm": 4.25390100479126, "learning_rate": 5.36360248152707e-05, "loss": 0.9537, "step": 985 }, { "epoch": 0.5040899795501023, "grad_norm": 3.719179153442383, "learning_rate": 5.355161030282387e-05, "loss": 0.8335, "step": 986 }, { "epoch": 0.504601226993865, "grad_norm": 3.710343837738037, "learning_rate": 5.346718561456669e-05, "loss": 0.8313, "step": 987 }, { "epoch": 0.5051124744376279, "grad_norm": 4.005296230316162, "learning_rate": 5.338275099238647e-05, "loss": 0.9525, "step": 988 }, { "epoch": 0.5056237218813906, "grad_norm": 4.096909523010254, "learning_rate": 5.329830667819905e-05, "loss": 0.8994, "step": 989 }, { "epoch": 0.5061349693251533, "grad_norm": 4.005760669708252, "learning_rate": 5.3213852913948026e-05, "loss": 0.894, "step": 990 }, { "epoch": 0.5066462167689162, "grad_norm": 4.101000785827637, "learning_rate": 5.312938994160404e-05, "loss": 0.872, "step": 991 }, { "epoch": 0.5071574642126789, "grad_norm": 4.393942356109619, "learning_rate": 5.3044918003164156e-05, "loss": 0.8976, "step": 992 }, { "epoch": 0.5076687116564417, "grad_norm": 4.222607135772705, "learning_rate": 5.296043734065108e-05, "loss": 0.7865, "step": 993 }, { "epoch": 0.5081799591002045, "grad_norm": 4.471764087677002, "learning_rate": 5.287594819611256e-05, "loss": 0.8599, "step": 994 }, { "epoch": 0.5086912065439673, "grad_norm": 4.455383777618408, "learning_rate": 5.2791450811620615e-05, "loss": 0.839, "step": 995 }, { "epoch": 0.50920245398773, "grad_norm": 4.264684200286865, "learning_rate": 5.270694542927088e-05, "loss": 0.7249, "step": 996 }, { "epoch": 0.5097137014314929, "grad_norm": 4.615438938140869, "learning_rate": 5.262243229118192e-05, "loss": 0.8047, "step": 997 }, { "epoch": 0.5102249488752556, "grad_norm": 5.186718940734863, "learning_rate": 5.2537911639494494e-05, "loss": 0.773, "step": 998 }, { "epoch": 0.5107361963190185, "grad_norm": 5.383667945861816, "learning_rate": 5.245338371637091e-05, "loss": 0.6268, "step": 999 }, { "epoch": 0.5112474437627812, "grad_norm": 9.21727466583252, "learning_rate": 5.236884876399429e-05, "loss": 0.9014, "step": 1000 }, { "epoch": 0.5117586912065439, "grad_norm": 2.15889573097229, "learning_rate": 5.2284307024567936e-05, "loss": 0.8665, "step": 1001 }, { "epoch": 0.5122699386503068, "grad_norm": 2.28460693359375, "learning_rate": 5.219975874031456e-05, "loss": 0.9948, "step": 1002 }, { "epoch": 0.5127811860940695, "grad_norm": 2.767993211746216, "learning_rate": 5.211520415347562e-05, "loss": 1.0241, "step": 1003 }, { "epoch": 0.5132924335378323, "grad_norm": 2.5393383502960205, "learning_rate": 5.203064350631064e-05, "loss": 0.8823, "step": 1004 }, { "epoch": 0.5138036809815951, "grad_norm": 2.8708670139312744, "learning_rate": 5.194607704109653e-05, "loss": 1.1419, "step": 1005 }, { "epoch": 0.5143149284253579, "grad_norm": 2.744375228881836, "learning_rate": 5.186150500012685e-05, "loss": 1.0083, "step": 1006 }, { "epoch": 0.5148261758691206, "grad_norm": 2.742351770401001, "learning_rate": 5.1776927625711156e-05, "loss": 0.9516, "step": 1007 }, { "epoch": 0.5153374233128835, "grad_norm": 3.1860601902008057, "learning_rate": 5.1692345160174225e-05, "loss": 0.9949, "step": 1008 }, { "epoch": 0.5158486707566462, "grad_norm": 2.853633403778076, "learning_rate": 5.160775784585551e-05, "loss": 0.9623, "step": 1009 }, { "epoch": 0.516359918200409, "grad_norm": 3.070064067840576, "learning_rate": 5.152316592510826e-05, "loss": 0.9656, "step": 1010 }, { "epoch": 0.5168711656441718, "grad_norm": 3.3969080448150635, "learning_rate": 5.1438569640299006e-05, "loss": 0.9152, "step": 1011 }, { "epoch": 0.5173824130879345, "grad_norm": 3.090078830718994, "learning_rate": 5.135396923380673e-05, "loss": 1.0513, "step": 1012 }, { "epoch": 0.5178936605316974, "grad_norm": 3.320444107055664, "learning_rate": 5.1269364948022255e-05, "loss": 0.9246, "step": 1013 }, { "epoch": 0.5184049079754601, "grad_norm": 3.0707502365112305, "learning_rate": 5.118475702534748e-05, "loss": 0.8337, "step": 1014 }, { "epoch": 0.5189161554192229, "grad_norm": 3.0075268745422363, "learning_rate": 5.110014570819478e-05, "loss": 0.9142, "step": 1015 }, { "epoch": 0.5194274028629857, "grad_norm": 2.8207309246063232, "learning_rate": 5.101553123898622e-05, "loss": 0.8493, "step": 1016 }, { "epoch": 0.5199386503067485, "grad_norm": 3.2407922744750977, "learning_rate": 5.093091386015287e-05, "loss": 0.8856, "step": 1017 }, { "epoch": 0.5204498977505112, "grad_norm": 3.477283239364624, "learning_rate": 5.08462938141342e-05, "loss": 0.989, "step": 1018 }, { "epoch": 0.5209611451942741, "grad_norm": 3.3261115550994873, "learning_rate": 5.0761671343377256e-05, "loss": 0.9585, "step": 1019 }, { "epoch": 0.5214723926380368, "grad_norm": 3.273200273513794, "learning_rate": 5.0677046690336096e-05, "loss": 0.914, "step": 1020 }, { "epoch": 0.5219836400817995, "grad_norm": 3.3354554176330566, "learning_rate": 5.0592420097471e-05, "loss": 0.9123, "step": 1021 }, { "epoch": 0.5224948875255624, "grad_norm": 3.29411244392395, "learning_rate": 5.050779180724776e-05, "loss": 0.865, "step": 1022 }, { "epoch": 0.5230061349693251, "grad_norm": 3.2440457344055176, "learning_rate": 5.042316206213712e-05, "loss": 0.7901, "step": 1023 }, { "epoch": 0.523517382413088, "grad_norm": 3.6321732997894287, "learning_rate": 5.0338531104613926e-05, "loss": 0.9108, "step": 1024 }, { "epoch": 0.5240286298568507, "grad_norm": 3.4314496517181396, "learning_rate": 5.025389917715653e-05, "loss": 0.8733, "step": 1025 }, { "epoch": 0.5245398773006135, "grad_norm": 3.502465009689331, "learning_rate": 5.016926652224604e-05, "loss": 1.011, "step": 1026 }, { "epoch": 0.5250511247443763, "grad_norm": 3.4616212844848633, "learning_rate": 5.008463338236566e-05, "loss": 0.8113, "step": 1027 }, { "epoch": 0.5255623721881391, "grad_norm": 3.4691240787506104, "learning_rate": 5e-05, "loss": 0.8563, "step": 1028 }, { "epoch": 0.5260736196319018, "grad_norm": 3.7538325786590576, "learning_rate": 4.991536661763434e-05, "loss": 0.9536, "step": 1029 }, { "epoch": 0.5265848670756647, "grad_norm": 3.647031307220459, "learning_rate": 4.983073347775397e-05, "loss": 0.9124, "step": 1030 }, { "epoch": 0.5270961145194274, "grad_norm": 3.6543843746185303, "learning_rate": 4.9746100822843477e-05, "loss": 0.9421, "step": 1031 }, { "epoch": 0.5276073619631901, "grad_norm": 3.728757619857788, "learning_rate": 4.966146889538608e-05, "loss": 0.8373, "step": 1032 }, { "epoch": 0.528118609406953, "grad_norm": 3.5960378646850586, "learning_rate": 4.95768379378629e-05, "loss": 0.9218, "step": 1033 }, { "epoch": 0.5286298568507157, "grad_norm": 3.6316988468170166, "learning_rate": 4.9492208192752256e-05, "loss": 0.9065, "step": 1034 }, { "epoch": 0.5291411042944786, "grad_norm": 4.244678020477295, "learning_rate": 4.940757990252902e-05, "loss": 0.8062, "step": 1035 }, { "epoch": 0.5296523517382413, "grad_norm": 3.584723711013794, "learning_rate": 4.9322953309663916e-05, "loss": 0.8503, "step": 1036 }, { "epoch": 0.5301635991820041, "grad_norm": 4.005549430847168, "learning_rate": 4.923832865662275e-05, "loss": 0.895, "step": 1037 }, { "epoch": 0.5306748466257669, "grad_norm": 4.480841636657715, "learning_rate": 4.9153706185865815e-05, "loss": 1.0696, "step": 1038 }, { "epoch": 0.5311860940695297, "grad_norm": 4.540712356567383, "learning_rate": 4.9069086139847134e-05, "loss": 1.0161, "step": 1039 }, { "epoch": 0.5316973415132924, "grad_norm": 4.2374372482299805, "learning_rate": 4.898446876101379e-05, "loss": 0.9821, "step": 1040 }, { "epoch": 0.5322085889570553, "grad_norm": 4.287907600402832, "learning_rate": 4.889985429180521e-05, "loss": 0.7927, "step": 1041 }, { "epoch": 0.532719836400818, "grad_norm": 4.230398654937744, "learning_rate": 4.881524297465251e-05, "loss": 0.8622, "step": 1042 }, { "epoch": 0.5332310838445807, "grad_norm": 4.255669593811035, "learning_rate": 4.873063505197777e-05, "loss": 0.7223, "step": 1043 }, { "epoch": 0.5337423312883436, "grad_norm": 4.257006645202637, "learning_rate": 4.8646030766193285e-05, "loss": 0.7052, "step": 1044 }, { "epoch": 0.5342535787321063, "grad_norm": 4.510742664337158, "learning_rate": 4.856143035970101e-05, "loss": 0.7036, "step": 1045 }, { "epoch": 0.5347648261758691, "grad_norm": 5.051736831665039, "learning_rate": 4.847683407489175e-05, "loss": 0.8619, "step": 1046 }, { "epoch": 0.5352760736196319, "grad_norm": 4.3064093589782715, "learning_rate": 4.83922421541445e-05, "loss": 0.6374, "step": 1047 }, { "epoch": 0.5357873210633947, "grad_norm": 4.907425403594971, "learning_rate": 4.830765483982578e-05, "loss": 0.6855, "step": 1048 }, { "epoch": 0.5362985685071575, "grad_norm": 6.278970241546631, "learning_rate": 4.822307237428885e-05, "loss": 0.7517, "step": 1049 }, { "epoch": 0.5368098159509203, "grad_norm": 7.862913608551025, "learning_rate": 4.813849499987314e-05, "loss": 0.7461, "step": 1050 }, { "epoch": 0.537321063394683, "grad_norm": 2.1761088371276855, "learning_rate": 4.8053922958903466e-05, "loss": 1.0014, "step": 1051 }, { "epoch": 0.5378323108384458, "grad_norm": 2.4289817810058594, "learning_rate": 4.796935649368935e-05, "loss": 1.0023, "step": 1052 }, { "epoch": 0.5383435582822086, "grad_norm": 2.4582669734954834, "learning_rate": 4.788479584652441e-05, "loss": 1.0581, "step": 1053 }, { "epoch": 0.5388548057259713, "grad_norm": 2.5545613765716553, "learning_rate": 4.7800241259685464e-05, "loss": 1.0115, "step": 1054 }, { "epoch": 0.5393660531697342, "grad_norm": 2.4820990562438965, "learning_rate": 4.7715692975432076e-05, "loss": 0.941, "step": 1055 }, { "epoch": 0.5398773006134969, "grad_norm": 2.6315999031066895, "learning_rate": 4.763115123600571e-05, "loss": 0.9529, "step": 1056 }, { "epoch": 0.5403885480572597, "grad_norm": 2.7553720474243164, "learning_rate": 4.7546616283629105e-05, "loss": 0.9689, "step": 1057 }, { "epoch": 0.5408997955010225, "grad_norm": 2.7258827686309814, "learning_rate": 4.746208836050551e-05, "loss": 0.9044, "step": 1058 }, { "epoch": 0.5414110429447853, "grad_norm": 2.9439940452575684, "learning_rate": 4.737756770881809e-05, "loss": 0.9474, "step": 1059 }, { "epoch": 0.5419222903885481, "grad_norm": 3.022825241088867, "learning_rate": 4.729305457072913e-05, "loss": 1.0523, "step": 1060 }, { "epoch": 0.5424335378323109, "grad_norm": 3.053812265396118, "learning_rate": 4.720854918837939e-05, "loss": 0.8946, "step": 1061 }, { "epoch": 0.5429447852760736, "grad_norm": 3.0392403602600098, "learning_rate": 4.712405180388745e-05, "loss": 0.9809, "step": 1062 }, { "epoch": 0.5434560327198364, "grad_norm": 2.9778902530670166, "learning_rate": 4.703956265934894e-05, "loss": 0.883, "step": 1063 }, { "epoch": 0.5439672801635992, "grad_norm": 3.0558431148529053, "learning_rate": 4.695508199683586e-05, "loss": 0.8867, "step": 1064 }, { "epoch": 0.5444785276073619, "grad_norm": 3.2434241771698, "learning_rate": 4.687061005839597e-05, "loss": 0.9211, "step": 1065 }, { "epoch": 0.5449897750511248, "grad_norm": 3.273984909057617, "learning_rate": 4.678614708605199e-05, "loss": 0.9619, "step": 1066 }, { "epoch": 0.5455010224948875, "grad_norm": 3.257916212081909, "learning_rate": 4.670169332180096e-05, "loss": 0.9638, "step": 1067 }, { "epoch": 0.5460122699386503, "grad_norm": 3.309859275817871, "learning_rate": 4.6617249007613544e-05, "loss": 1.0217, "step": 1068 }, { "epoch": 0.5465235173824131, "grad_norm": 3.2615549564361572, "learning_rate": 4.653281438543333e-05, "loss": 0.9133, "step": 1069 }, { "epoch": 0.5470347648261759, "grad_norm": 3.446503162384033, "learning_rate": 4.644838969717613e-05, "loss": 0.9331, "step": 1070 }, { "epoch": 0.5475460122699386, "grad_norm": 3.353502035140991, "learning_rate": 4.636397518472931e-05, "loss": 0.9512, "step": 1071 }, { "epoch": 0.5480572597137015, "grad_norm": 3.027780055999756, "learning_rate": 4.6279571089951054e-05, "loss": 0.8765, "step": 1072 }, { "epoch": 0.5485685071574642, "grad_norm": 3.0944087505340576, "learning_rate": 4.619517765466972e-05, "loss": 0.9157, "step": 1073 }, { "epoch": 0.549079754601227, "grad_norm": 3.496917486190796, "learning_rate": 4.61107951206831e-05, "loss": 0.9094, "step": 1074 }, { "epoch": 0.5495910020449898, "grad_norm": 3.2800345420837402, "learning_rate": 4.60264237297578e-05, "loss": 0.9757, "step": 1075 }, { "epoch": 0.5501022494887525, "grad_norm": 3.8397467136383057, "learning_rate": 4.594206372362845e-05, "loss": 0.9101, "step": 1076 }, { "epoch": 0.5506134969325154, "grad_norm": 3.393571376800537, "learning_rate": 4.5857715343997076e-05, "loss": 0.8935, "step": 1077 }, { "epoch": 0.5511247443762781, "grad_norm": 3.46854829788208, "learning_rate": 4.577337883253242e-05, "loss": 0.9132, "step": 1078 }, { "epoch": 0.5516359918200409, "grad_norm": 3.91577410697937, "learning_rate": 4.5689054430869184e-05, "loss": 0.8591, "step": 1079 }, { "epoch": 0.5521472392638037, "grad_norm": 3.561288356781006, "learning_rate": 4.560474238060739e-05, "loss": 0.8733, "step": 1080 }, { "epoch": 0.5526584867075665, "grad_norm": 3.7657833099365234, "learning_rate": 4.5520442923311666e-05, "loss": 0.8395, "step": 1081 }, { "epoch": 0.5531697341513292, "grad_norm": 3.7186317443847656, "learning_rate": 4.54361563005106e-05, "loss": 0.9149, "step": 1082 }, { "epoch": 0.553680981595092, "grad_norm": 4.215698719024658, "learning_rate": 4.535188275369592e-05, "loss": 0.8546, "step": 1083 }, { "epoch": 0.5541922290388548, "grad_norm": 4.031459808349609, "learning_rate": 4.526762252432195e-05, "loss": 0.8887, "step": 1084 }, { "epoch": 0.5547034764826176, "grad_norm": 3.89286208152771, "learning_rate": 4.5183375853804864e-05, "loss": 0.8501, "step": 1085 }, { "epoch": 0.5552147239263804, "grad_norm": 4.312307357788086, "learning_rate": 4.509914298352197e-05, "loss": 0.8708, "step": 1086 }, { "epoch": 0.5557259713701431, "grad_norm": 4.412914752960205, "learning_rate": 4.501492415481105e-05, "loss": 0.923, "step": 1087 }, { "epoch": 0.556237218813906, "grad_norm": 3.436657667160034, "learning_rate": 4.493071960896961e-05, "loss": 0.7221, "step": 1088 }, { "epoch": 0.5567484662576687, "grad_norm": 4.254183292388916, "learning_rate": 4.4846529587254296e-05, "loss": 0.8008, "step": 1089 }, { "epoch": 0.5572597137014315, "grad_norm": 4.078035354614258, "learning_rate": 4.476235433088011e-05, "loss": 0.7969, "step": 1090 }, { "epoch": 0.5577709611451943, "grad_norm": 4.366103649139404, "learning_rate": 4.4678194081019764e-05, "loss": 1.0288, "step": 1091 }, { "epoch": 0.558282208588957, "grad_norm": 4.415543079376221, "learning_rate": 4.4594049078802925e-05, "loss": 0.8554, "step": 1092 }, { "epoch": 0.5587934560327198, "grad_norm": 4.634888648986816, "learning_rate": 4.450991956531564e-05, "loss": 0.82, "step": 1093 }, { "epoch": 0.5593047034764826, "grad_norm": 5.047021865844727, "learning_rate": 4.4425805781599535e-05, "loss": 0.9254, "step": 1094 }, { "epoch": 0.5598159509202454, "grad_norm": 4.863358497619629, "learning_rate": 4.434170796865119e-05, "loss": 0.7494, "step": 1095 }, { "epoch": 0.5603271983640081, "grad_norm": 4.6440863609313965, "learning_rate": 4.425762636742143e-05, "loss": 0.7075, "step": 1096 }, { "epoch": 0.560838445807771, "grad_norm": 5.290436744689941, "learning_rate": 4.417356121881461e-05, "loss": 0.6747, "step": 1097 }, { "epoch": 0.5613496932515337, "grad_norm": 5.783541202545166, "learning_rate": 4.4089512763687966e-05, "loss": 0.764, "step": 1098 }, { "epoch": 0.5618609406952966, "grad_norm": 5.853088855743408, "learning_rate": 4.4005481242850884e-05, "loss": 0.5864, "step": 1099 }, { "epoch": 0.5623721881390593, "grad_norm": 7.8992791175842285, "learning_rate": 4.392146689706425e-05, "loss": 0.712, "step": 1100 }, { "epoch": 0.5628834355828221, "grad_norm": 1.9316519498825073, "learning_rate": 4.3837469967039745e-05, "loss": 0.7383, "step": 1101 }, { "epoch": 0.5633946830265849, "grad_norm": 2.5184829235076904, "learning_rate": 4.3753490693439135e-05, "loss": 0.9712, "step": 1102 }, { "epoch": 0.5639059304703476, "grad_norm": 2.3476760387420654, "learning_rate": 4.3669529316873594e-05, "loss": 0.9155, "step": 1103 }, { "epoch": 0.5644171779141104, "grad_norm": 2.6786763668060303, "learning_rate": 4.358558607790303e-05, "loss": 0.9806, "step": 1104 }, { "epoch": 0.5649284253578732, "grad_norm": 2.709429979324341, "learning_rate": 4.350166121703539e-05, "loss": 0.9601, "step": 1105 }, { "epoch": 0.565439672801636, "grad_norm": 2.7100303173065186, "learning_rate": 4.341775497472597e-05, "loss": 0.9591, "step": 1106 }, { "epoch": 0.5659509202453987, "grad_norm": 3.0667576789855957, "learning_rate": 4.3333867591376704e-05, "loss": 0.9735, "step": 1107 }, { "epoch": 0.5664621676891616, "grad_norm": 2.943694591522217, "learning_rate": 4.3249999307335495e-05, "loss": 0.9863, "step": 1108 }, { "epoch": 0.5669734151329243, "grad_norm": 3.2859537601470947, "learning_rate": 4.316615036289553e-05, "loss": 0.9712, "step": 1109 }, { "epoch": 0.5674846625766872, "grad_norm": 2.9579572677612305, "learning_rate": 4.3082320998294594e-05, "loss": 0.9693, "step": 1110 }, { "epoch": 0.5679959100204499, "grad_norm": 3.6050679683685303, "learning_rate": 4.2998511453714366e-05, "loss": 1.0155, "step": 1111 }, { "epoch": 0.5685071574642127, "grad_norm": 3.0228002071380615, "learning_rate": 4.2914721969279705e-05, "loss": 0.9553, "step": 1112 }, { "epoch": 0.5690184049079755, "grad_norm": 3.024217128753662, "learning_rate": 4.283095278505806e-05, "loss": 1.0179, "step": 1113 }, { "epoch": 0.5695296523517382, "grad_norm": 2.834343433380127, "learning_rate": 4.274720414105866e-05, "loss": 0.8619, "step": 1114 }, { "epoch": 0.570040899795501, "grad_norm": 2.9928078651428223, "learning_rate": 4.2663476277231915e-05, "loss": 0.9013, "step": 1115 }, { "epoch": 0.5705521472392638, "grad_norm": 2.8394672870635986, "learning_rate": 4.2579769433468694e-05, "loss": 0.8572, "step": 1116 }, { "epoch": 0.5710633946830266, "grad_norm": 3.117053508758545, "learning_rate": 4.249608384959964e-05, "loss": 0.9714, "step": 1117 }, { "epoch": 0.5715746421267893, "grad_norm": 3.2112934589385986, "learning_rate": 4.241241976539447e-05, "loss": 0.9014, "step": 1118 }, { "epoch": 0.5720858895705522, "grad_norm": 3.148987293243408, "learning_rate": 4.232877742056131e-05, "loss": 0.8606, "step": 1119 }, { "epoch": 0.5725971370143149, "grad_norm": 3.3100132942199707, "learning_rate": 4.224515705474603e-05, "loss": 0.9315, "step": 1120 }, { "epoch": 0.5731083844580777, "grad_norm": 3.2287204265594482, "learning_rate": 4.2161558907531454e-05, "loss": 0.8845, "step": 1121 }, { "epoch": 0.5736196319018405, "grad_norm": 3.292661428451538, "learning_rate": 4.207798321843681e-05, "loss": 0.9407, "step": 1122 }, { "epoch": 0.5741308793456033, "grad_norm": 3.141087055206299, "learning_rate": 4.199443022691698e-05, "loss": 0.8379, "step": 1123 }, { "epoch": 0.5746421267893661, "grad_norm": 3.3399198055267334, "learning_rate": 4.1910900172361764e-05, "loss": 0.8819, "step": 1124 }, { "epoch": 0.5751533742331288, "grad_norm": 3.5327308177948, "learning_rate": 4.182739329409532e-05, "loss": 0.9396, "step": 1125 }, { "epoch": 0.5756646216768916, "grad_norm": 3.4489853382110596, "learning_rate": 4.1743909831375316e-05, "loss": 0.8109, "step": 1126 }, { "epoch": 0.5761758691206544, "grad_norm": 3.4719979763031006, "learning_rate": 4.16604500233924e-05, "loss": 0.9068, "step": 1127 }, { "epoch": 0.5766871165644172, "grad_norm": 3.4496169090270996, "learning_rate": 4.157701410926943e-05, "loss": 0.898, "step": 1128 }, { "epoch": 0.5771983640081799, "grad_norm": 3.652449131011963, "learning_rate": 4.149360232806078e-05, "loss": 0.794, "step": 1129 }, { "epoch": 0.5777096114519428, "grad_norm": 4.12328577041626, "learning_rate": 4.141021491875172e-05, "loss": 0.9067, "step": 1130 }, { "epoch": 0.5782208588957055, "grad_norm": 3.9710137844085693, "learning_rate": 4.132685212025764e-05, "loss": 0.985, "step": 1131 }, { "epoch": 0.5787321063394683, "grad_norm": 3.8590469360351562, "learning_rate": 4.124351417142347e-05, "loss": 0.9224, "step": 1132 }, { "epoch": 0.5792433537832311, "grad_norm": 3.658128023147583, "learning_rate": 4.11602013110229e-05, "loss": 0.9181, "step": 1133 }, { "epoch": 0.5797546012269938, "grad_norm": 3.682197332382202, "learning_rate": 4.1076913777757757e-05, "loss": 0.8932, "step": 1134 }, { "epoch": 0.5802658486707567, "grad_norm": 3.68892765045166, "learning_rate": 4.099365181025733e-05, "loss": 0.8478, "step": 1135 }, { "epoch": 0.5807770961145194, "grad_norm": 3.860687255859375, "learning_rate": 4.09104156470776e-05, "loss": 0.7584, "step": 1136 }, { "epoch": 0.5812883435582822, "grad_norm": 3.890394926071167, "learning_rate": 4.0827205526700673e-05, "loss": 0.8957, "step": 1137 }, { "epoch": 0.581799591002045, "grad_norm": 4.4587321281433105, "learning_rate": 4.074402168753399e-05, "loss": 0.9948, "step": 1138 }, { "epoch": 0.5823108384458078, "grad_norm": 4.47606897354126, "learning_rate": 4.0660864367909734e-05, "loss": 0.8756, "step": 1139 }, { "epoch": 0.5828220858895705, "grad_norm": 4.071111679077148, "learning_rate": 4.057773380608411e-05, "loss": 0.8526, "step": 1140 }, { "epoch": 0.5833333333333334, "grad_norm": 4.525618553161621, "learning_rate": 4.049463024023657e-05, "loss": 1.0175, "step": 1141 }, { "epoch": 0.5838445807770961, "grad_norm": 4.352678298950195, "learning_rate": 4.041155390846933e-05, "loss": 0.8302, "step": 1142 }, { "epoch": 0.5843558282208589, "grad_norm": 4.422893047332764, "learning_rate": 4.0328505048806525e-05, "loss": 0.871, "step": 1143 }, { "epoch": 0.5848670756646217, "grad_norm": 4.7299275398254395, "learning_rate": 4.0245483899193595e-05, "loss": 0.7768, "step": 1144 }, { "epoch": 0.5853783231083844, "grad_norm": 4.770334720611572, "learning_rate": 4.016249069749657e-05, "loss": 0.8351, "step": 1145 }, { "epoch": 0.5858895705521472, "grad_norm": 5.195528984069824, "learning_rate": 4.0079525681501415e-05, "loss": 0.6847, "step": 1146 }, { "epoch": 0.58640081799591, "grad_norm": 4.96982479095459, "learning_rate": 3.999658908891334e-05, "loss": 0.709, "step": 1147 }, { "epoch": 0.5869120654396728, "grad_norm": 4.830311298370361, "learning_rate": 3.991368115735612e-05, "loss": 0.5307, "step": 1148 }, { "epoch": 0.5874233128834356, "grad_norm": 5.926014423370361, "learning_rate": 3.983080212437144e-05, "loss": 0.706, "step": 1149 }, { "epoch": 0.5879345603271984, "grad_norm": 6.610885143280029, "learning_rate": 3.9747952227418096e-05, "loss": 0.5711, "step": 1150 }, { "epoch": 0.5884458077709611, "grad_norm": 1.9169435501098633, "learning_rate": 3.96651317038715e-05, "loss": 0.923, "step": 1151 }, { "epoch": 0.588957055214724, "grad_norm": 2.1922850608825684, "learning_rate": 3.958234079102288e-05, "loss": 0.9157, "step": 1152 }, { "epoch": 0.5894683026584867, "grad_norm": 2.3200840950012207, "learning_rate": 3.9499579726078594e-05, "loss": 0.8759, "step": 1153 }, { "epoch": 0.5899795501022495, "grad_norm": 2.6276164054870605, "learning_rate": 3.941684874615952e-05, "loss": 0.9767, "step": 1154 }, { "epoch": 0.5904907975460123, "grad_norm": 2.754929304122925, "learning_rate": 3.933414808830033e-05, "loss": 0.9734, "step": 1155 }, { "epoch": 0.591002044989775, "grad_norm": 2.80434513092041, "learning_rate": 3.92514779894488e-05, "loss": 1.0555, "step": 1156 }, { "epoch": 0.5915132924335378, "grad_norm": 2.974435806274414, "learning_rate": 3.916883868646517e-05, "loss": 1.018, "step": 1157 }, { "epoch": 0.5920245398773006, "grad_norm": 2.803382396697998, "learning_rate": 3.908623041612144e-05, "loss": 0.9716, "step": 1158 }, { "epoch": 0.5925357873210634, "grad_norm": 2.8717260360717773, "learning_rate": 3.900365341510073e-05, "loss": 0.9645, "step": 1159 }, { "epoch": 0.5930470347648262, "grad_norm": 3.0683510303497314, "learning_rate": 3.892110791999649e-05, "loss": 0.8748, "step": 1160 }, { "epoch": 0.593558282208589, "grad_norm": 2.787853240966797, "learning_rate": 3.883859416731195e-05, "loss": 0.9037, "step": 1161 }, { "epoch": 0.5940695296523517, "grad_norm": 2.759490728378296, "learning_rate": 3.875611239345942e-05, "loss": 0.8391, "step": 1162 }, { "epoch": 0.5945807770961146, "grad_norm": 2.975919008255005, "learning_rate": 3.867366283475955e-05, "loss": 0.89, "step": 1163 }, { "epoch": 0.5950920245398773, "grad_norm": 2.9492640495300293, "learning_rate": 3.859124572744071e-05, "loss": 0.8719, "step": 1164 }, { "epoch": 0.59560327198364, "grad_norm": 3.2285163402557373, "learning_rate": 3.850886130763825e-05, "loss": 0.9296, "step": 1165 }, { "epoch": 0.5961145194274029, "grad_norm": 3.5377800464630127, "learning_rate": 3.842650981139393e-05, "loss": 0.9344, "step": 1166 }, { "epoch": 0.5966257668711656, "grad_norm": 3.2627103328704834, "learning_rate": 3.834419147465513e-05, "loss": 0.9323, "step": 1167 }, { "epoch": 0.5971370143149284, "grad_norm": 3.227404832839966, "learning_rate": 3.8261906533274254e-05, "loss": 0.8478, "step": 1168 }, { "epoch": 0.5976482617586912, "grad_norm": 3.3979251384735107, "learning_rate": 3.817965522300803e-05, "loss": 0.8301, "step": 1169 }, { "epoch": 0.598159509202454, "grad_norm": 2.9473681449890137, "learning_rate": 3.809743777951675e-05, "loss": 0.881, "step": 1170 }, { "epoch": 0.5986707566462167, "grad_norm": 3.4166736602783203, "learning_rate": 3.801525443836378e-05, "loss": 0.9838, "step": 1171 }, { "epoch": 0.5991820040899796, "grad_norm": 3.37152099609375, "learning_rate": 3.793310543501473e-05, "loss": 0.9654, "step": 1172 }, { "epoch": 0.5996932515337423, "grad_norm": 3.3099162578582764, "learning_rate": 3.785099100483681e-05, "loss": 0.9081, "step": 1173 }, { "epoch": 0.6002044989775052, "grad_norm": 3.897919178009033, "learning_rate": 3.776891138309821e-05, "loss": 0.8691, "step": 1174 }, { "epoch": 0.6007157464212679, "grad_norm": 3.367326259613037, "learning_rate": 3.768686680496737e-05, "loss": 0.8366, "step": 1175 }, { "epoch": 0.6012269938650306, "grad_norm": 3.388650417327881, "learning_rate": 3.7604857505512345e-05, "loss": 0.8758, "step": 1176 }, { "epoch": 0.6017382413087935, "grad_norm": 3.35302996635437, "learning_rate": 3.752288371970006e-05, "loss": 0.9105, "step": 1177 }, { "epoch": 0.6022494887525562, "grad_norm": 3.6092076301574707, "learning_rate": 3.744094568239577e-05, "loss": 0.9705, "step": 1178 }, { "epoch": 0.602760736196319, "grad_norm": 3.6518003940582275, "learning_rate": 3.735904362836222e-05, "loss": 0.9318, "step": 1179 }, { "epoch": 0.6032719836400818, "grad_norm": 3.510641098022461, "learning_rate": 3.7277177792259114e-05, "loss": 0.8585, "step": 1180 }, { "epoch": 0.6037832310838446, "grad_norm": 3.7477800846099854, "learning_rate": 3.719534840864237e-05, "loss": 0.9278, "step": 1181 }, { "epoch": 0.6042944785276073, "grad_norm": 3.890324115753174, "learning_rate": 3.7113555711963474e-05, "loss": 0.9617, "step": 1182 }, { "epoch": 0.6048057259713702, "grad_norm": 3.7491960525512695, "learning_rate": 3.70317999365688e-05, "loss": 0.8225, "step": 1183 }, { "epoch": 0.6053169734151329, "grad_norm": 3.997459888458252, "learning_rate": 3.695008131669891e-05, "loss": 0.8853, "step": 1184 }, { "epoch": 0.6058282208588958, "grad_norm": 3.706289529800415, "learning_rate": 3.686840008648794e-05, "loss": 0.8468, "step": 1185 }, { "epoch": 0.6063394683026585, "grad_norm": 3.999207019805908, "learning_rate": 3.67867564799629e-05, "loss": 0.9332, "step": 1186 }, { "epoch": 0.6068507157464212, "grad_norm": 3.972015857696533, "learning_rate": 3.6705150731043005e-05, "loss": 0.9797, "step": 1187 }, { "epoch": 0.6073619631901841, "grad_norm": 4.267761707305908, "learning_rate": 3.6623583073538966e-05, "loss": 0.984, "step": 1188 }, { "epoch": 0.6078732106339468, "grad_norm": 4.182210445404053, "learning_rate": 3.6542053741152394e-05, "loss": 0.7828, "step": 1189 }, { "epoch": 0.6083844580777096, "grad_norm": 4.105729103088379, "learning_rate": 3.646056296747507e-05, "loss": 0.9388, "step": 1190 }, { "epoch": 0.6088957055214724, "grad_norm": 4.066761493682861, "learning_rate": 3.637911098598836e-05, "loss": 0.9364, "step": 1191 }, { "epoch": 0.6094069529652352, "grad_norm": 4.344300270080566, "learning_rate": 3.629769803006239e-05, "loss": 0.7844, "step": 1192 }, { "epoch": 0.6099182004089979, "grad_norm": 4.508508682250977, "learning_rate": 3.621632433295553e-05, "loss": 0.8825, "step": 1193 }, { "epoch": 0.6104294478527608, "grad_norm": 4.3253583908081055, "learning_rate": 3.613499012781368e-05, "loss": 0.8134, "step": 1194 }, { "epoch": 0.6109406952965235, "grad_norm": 4.819972515106201, "learning_rate": 3.605369564766956e-05, "loss": 0.8738, "step": 1195 }, { "epoch": 0.6114519427402862, "grad_norm": 5.187696933746338, "learning_rate": 3.597244112544208e-05, "loss": 0.8424, "step": 1196 }, { "epoch": 0.6119631901840491, "grad_norm": 5.0562968254089355, "learning_rate": 3.5891226793935675e-05, "loss": 0.7322, "step": 1197 }, { "epoch": 0.6124744376278118, "grad_norm": 5.8710832595825195, "learning_rate": 3.581005288583964e-05, "loss": 0.8522, "step": 1198 }, { "epoch": 0.6129856850715747, "grad_norm": 5.613986968994141, "learning_rate": 3.57289196337274e-05, "loss": 0.8068, "step": 1199 }, { "epoch": 0.6134969325153374, "grad_norm": 5.67006778717041, "learning_rate": 3.5647827270055945e-05, "loss": 0.3523, "step": 1200 }, { "epoch": 0.6140081799591002, "grad_norm": 2.078099489212036, "learning_rate": 3.5566776027165106e-05, "loss": 0.8653, "step": 1201 }, { "epoch": 0.614519427402863, "grad_norm": 2.6257896423339844, "learning_rate": 3.5485766137276894e-05, "loss": 1.0512, "step": 1202 }, { "epoch": 0.6150306748466258, "grad_norm": 2.1455605030059814, "learning_rate": 3.540479783249482e-05, "loss": 0.9154, "step": 1203 }, { "epoch": 0.6155419222903885, "grad_norm": 2.551408290863037, "learning_rate": 3.5323871344803263e-05, "loss": 0.9544, "step": 1204 }, { "epoch": 0.6160531697341514, "grad_norm": 2.475545883178711, "learning_rate": 3.524298690606681e-05, "loss": 0.902, "step": 1205 }, { "epoch": 0.6165644171779141, "grad_norm": 2.6632912158966064, "learning_rate": 3.5162144748029555e-05, "loss": 0.9852, "step": 1206 }, { "epoch": 0.6170756646216768, "grad_norm": 2.898348331451416, "learning_rate": 3.5081345102314454e-05, "loss": 0.9091, "step": 1207 }, { "epoch": 0.6175869120654397, "grad_norm": 2.6006739139556885, "learning_rate": 3.500058820042263e-05, "loss": 0.894, "step": 1208 }, { "epoch": 0.6180981595092024, "grad_norm": 2.914081573486328, "learning_rate": 3.4919874273732784e-05, "loss": 0.9868, "step": 1209 }, { "epoch": 0.6186094069529653, "grad_norm": 2.8525145053863525, "learning_rate": 3.4839203553500474e-05, "loss": 0.9638, "step": 1210 }, { "epoch": 0.619120654396728, "grad_norm": 2.761507034301758, "learning_rate": 3.475857627085746e-05, "loss": 0.9268, "step": 1211 }, { "epoch": 0.6196319018404908, "grad_norm": 2.962073802947998, "learning_rate": 3.467799265681105e-05, "loss": 0.9322, "step": 1212 }, { "epoch": 0.6201431492842536, "grad_norm": 3.011537551879883, "learning_rate": 3.4597452942243446e-05, "loss": 1.0676, "step": 1213 }, { "epoch": 0.6206543967280164, "grad_norm": 2.7569634914398193, "learning_rate": 3.451695735791106e-05, "loss": 0.8432, "step": 1214 }, { "epoch": 0.6211656441717791, "grad_norm": 3.254464864730835, "learning_rate": 3.4436506134443866e-05, "loss": 0.9145, "step": 1215 }, { "epoch": 0.621676891615542, "grad_norm": 3.0407156944274902, "learning_rate": 3.435609950234473e-05, "loss": 0.9686, "step": 1216 }, { "epoch": 0.6221881390593047, "grad_norm": 3.2604401111602783, "learning_rate": 3.427573769198879e-05, "loss": 0.8542, "step": 1217 }, { "epoch": 0.6226993865030674, "grad_norm": 3.351675271987915, "learning_rate": 3.419542093362274e-05, "loss": 0.873, "step": 1218 }, { "epoch": 0.6232106339468303, "grad_norm": 2.7927772998809814, "learning_rate": 3.411514945736419e-05, "loss": 0.7677, "step": 1219 }, { "epoch": 0.623721881390593, "grad_norm": 3.3357203006744385, "learning_rate": 3.403492349320101e-05, "loss": 0.9263, "step": 1220 }, { "epoch": 0.6242331288343558, "grad_norm": 3.1823763847351074, "learning_rate": 3.3954743270990695e-05, "loss": 0.8802, "step": 1221 }, { "epoch": 0.6247443762781186, "grad_norm": 3.5401298999786377, "learning_rate": 3.387460902045967e-05, "loss": 0.9269, "step": 1222 }, { "epoch": 0.6252556237218814, "grad_norm": 3.1181726455688477, "learning_rate": 3.3794520971202635e-05, "loss": 0.8232, "step": 1223 }, { "epoch": 0.6257668711656442, "grad_norm": 3.3269927501678467, "learning_rate": 3.371447935268194e-05, "loss": 0.9379, "step": 1224 }, { "epoch": 0.626278118609407, "grad_norm": 3.3684628009796143, "learning_rate": 3.363448439422688e-05, "loss": 0.7728, "step": 1225 }, { "epoch": 0.6267893660531697, "grad_norm": 3.450457811355591, "learning_rate": 3.3554536325033095e-05, "loss": 0.9889, "step": 1226 }, { "epoch": 0.6273006134969326, "grad_norm": 3.5818159580230713, "learning_rate": 3.3474635374161845e-05, "loss": 0.9753, "step": 1227 }, { "epoch": 0.6278118609406953, "grad_norm": 3.563751697540283, "learning_rate": 3.339478177053941e-05, "loss": 0.9556, "step": 1228 }, { "epoch": 0.628323108384458, "grad_norm": 3.62951922416687, "learning_rate": 3.3314975742956424e-05, "loss": 0.9656, "step": 1229 }, { "epoch": 0.6288343558282209, "grad_norm": 3.5825490951538086, "learning_rate": 3.323521752006716e-05, "loss": 0.8455, "step": 1230 }, { "epoch": 0.6293456032719836, "grad_norm": 3.4373691082000732, "learning_rate": 3.3155507330389e-05, "loss": 0.7958, "step": 1231 }, { "epoch": 0.6298568507157464, "grad_norm": 4.127106666564941, "learning_rate": 3.3075845402301655e-05, "loss": 0.893, "step": 1232 }, { "epoch": 0.6303680981595092, "grad_norm": 3.9865024089813232, "learning_rate": 3.299623196404657e-05, "loss": 0.8928, "step": 1233 }, { "epoch": 0.630879345603272, "grad_norm": 3.4670865535736084, "learning_rate": 3.291666724372626e-05, "loss": 0.8081, "step": 1234 }, { "epoch": 0.6313905930470347, "grad_norm": 3.6847434043884277, "learning_rate": 3.2837151469303664e-05, "loss": 0.744, "step": 1235 }, { "epoch": 0.6319018404907976, "grad_norm": 3.732963800430298, "learning_rate": 3.275768486860149e-05, "loss": 0.7568, "step": 1236 }, { "epoch": 0.6324130879345603, "grad_norm": 3.961927652359009, "learning_rate": 3.267826766930152e-05, "loss": 0.9264, "step": 1237 }, { "epoch": 0.6329243353783232, "grad_norm": 4.04052209854126, "learning_rate": 3.2598900098944043e-05, "loss": 0.9307, "step": 1238 }, { "epoch": 0.6334355828220859, "grad_norm": 4.026215076446533, "learning_rate": 3.251958238492711e-05, "loss": 0.8288, "step": 1239 }, { "epoch": 0.6339468302658486, "grad_norm": 4.72583532333374, "learning_rate": 3.244031475450599e-05, "loss": 0.8492, "step": 1240 }, { "epoch": 0.6344580777096115, "grad_norm": 4.102770805358887, "learning_rate": 3.2361097434792396e-05, "loss": 0.8145, "step": 1241 }, { "epoch": 0.6349693251533742, "grad_norm": 4.25362491607666, "learning_rate": 3.228193065275391e-05, "loss": 0.7773, "step": 1242 }, { "epoch": 0.635480572597137, "grad_norm": 5.0050225257873535, "learning_rate": 3.220281463521333e-05, "loss": 0.9292, "step": 1243 }, { "epoch": 0.6359918200408998, "grad_norm": 4.613053321838379, "learning_rate": 3.2123749608848e-05, "loss": 0.7985, "step": 1244 }, { "epoch": 0.6365030674846626, "grad_norm": 4.7116546630859375, "learning_rate": 3.204473580018916e-05, "loss": 0.858, "step": 1245 }, { "epoch": 0.6370143149284253, "grad_norm": 5.056521892547607, "learning_rate": 3.196577343562135e-05, "loss": 0.8168, "step": 1246 }, { "epoch": 0.6375255623721882, "grad_norm": 5.530933380126953, "learning_rate": 3.188686274138163e-05, "loss": 0.7587, "step": 1247 }, { "epoch": 0.6380368098159509, "grad_norm": 5.684156894683838, "learning_rate": 3.180800394355908e-05, "loss": 0.7681, "step": 1248 }, { "epoch": 0.6385480572597138, "grad_norm": 5.361364841461182, "learning_rate": 3.172919726809409e-05, "loss": 0.7175, "step": 1249 }, { "epoch": 0.6390593047034765, "grad_norm": 6.80435848236084, "learning_rate": 3.1650442940777695e-05, "loss": 0.5696, "step": 1250 }, { "epoch": 0.6395705521472392, "grad_norm": 1.996549367904663, "learning_rate": 3.1571741187250945e-05, "loss": 0.8583, "step": 1251 }, { "epoch": 0.6400817995910021, "grad_norm": 2.412594795227051, "learning_rate": 3.149309223300428e-05, "loss": 0.985, "step": 1252 }, { "epoch": 0.6405930470347648, "grad_norm": 2.6278765201568604, "learning_rate": 3.141449630337685e-05, "loss": 0.9277, "step": 1253 }, { "epoch": 0.6411042944785276, "grad_norm": 2.631162405014038, "learning_rate": 3.1335953623555865e-05, "loss": 0.9586, "step": 1254 }, { "epoch": 0.6416155419222904, "grad_norm": 2.5849802494049072, "learning_rate": 3.1257464418575986e-05, "loss": 0.8985, "step": 1255 }, { "epoch": 0.6421267893660532, "grad_norm": 2.6325793266296387, "learning_rate": 3.11790289133187e-05, "loss": 0.9677, "step": 1256 }, { "epoch": 0.6426380368098159, "grad_norm": 2.7737441062927246, "learning_rate": 3.110064733251154e-05, "loss": 0.9517, "step": 1257 }, { "epoch": 0.6431492842535788, "grad_norm": 2.836144208908081, "learning_rate": 3.102231990072763e-05, "loss": 1.0036, "step": 1258 }, { "epoch": 0.6436605316973415, "grad_norm": 3.148357391357422, "learning_rate": 3.094404684238489e-05, "loss": 0.9177, "step": 1259 }, { "epoch": 0.6441717791411042, "grad_norm": 3.0685768127441406, "learning_rate": 3.086582838174551e-05, "loss": 0.9073, "step": 1260 }, { "epoch": 0.6446830265848671, "grad_norm": 2.947803020477295, "learning_rate": 3.078766474291522e-05, "loss": 0.91, "step": 1261 }, { "epoch": 0.6451942740286298, "grad_norm": 3.012056350708008, "learning_rate": 3.0709556149842664e-05, "loss": 0.8971, "step": 1262 }, { "epoch": 0.6457055214723927, "grad_norm": 3.072740316390991, "learning_rate": 3.063150282631879e-05, "loss": 0.9495, "step": 1263 }, { "epoch": 0.6462167689161554, "grad_norm": 3.0233309268951416, "learning_rate": 3.05535049959762e-05, "loss": 0.9246, "step": 1264 }, { "epoch": 0.6467280163599182, "grad_norm": 3.3996291160583496, "learning_rate": 3.047556288228851e-05, "loss": 0.9428, "step": 1265 }, { "epoch": 0.647239263803681, "grad_norm": 3.211404323577881, "learning_rate": 3.0397676708569633e-05, "loss": 0.8686, "step": 1266 }, { "epoch": 0.6477505112474438, "grad_norm": 3.0419328212738037, "learning_rate": 3.031984669797328e-05, "loss": 0.891, "step": 1267 }, { "epoch": 0.6482617586912065, "grad_norm": 3.230339527130127, "learning_rate": 3.024207307349224e-05, "loss": 0.8741, "step": 1268 }, { "epoch": 0.6487730061349694, "grad_norm": 3.1209537982940674, "learning_rate": 3.0164356057957706e-05, "loss": 0.917, "step": 1269 }, { "epoch": 0.6492842535787321, "grad_norm": 3.3551604747772217, "learning_rate": 3.0086695874038717e-05, "loss": 0.9233, "step": 1270 }, { "epoch": 0.6497955010224948, "grad_norm": 3.477663516998291, "learning_rate": 3.0009092744241472e-05, "loss": 0.8906, "step": 1271 }, { "epoch": 0.6503067484662577, "grad_norm": 3.3048524856567383, "learning_rate": 2.9931546890908697e-05, "loss": 0.9094, "step": 1272 }, { "epoch": 0.6508179959100204, "grad_norm": 3.2910995483398438, "learning_rate": 2.985405853621902e-05, "loss": 0.8444, "step": 1273 }, { "epoch": 0.6513292433537833, "grad_norm": 3.342087507247925, "learning_rate": 2.9776627902186338e-05, "loss": 0.8766, "step": 1274 }, { "epoch": 0.651840490797546, "grad_norm": 3.8440937995910645, "learning_rate": 2.9699255210659166e-05, "loss": 0.9573, "step": 1275 }, { "epoch": 0.6523517382413088, "grad_norm": 3.666816473007202, "learning_rate": 2.962194068331996e-05, "loss": 0.9012, "step": 1276 }, { "epoch": 0.6528629856850716, "grad_norm": 3.627305746078491, "learning_rate": 2.9544684541684598e-05, "loss": 0.9463, "step": 1277 }, { "epoch": 0.6533742331288344, "grad_norm": 3.7801358699798584, "learning_rate": 2.9467487007101636e-05, "loss": 0.9248, "step": 1278 }, { "epoch": 0.6538854805725971, "grad_norm": 3.5962491035461426, "learning_rate": 2.939034830075173e-05, "loss": 0.9153, "step": 1279 }, { "epoch": 0.65439672801636, "grad_norm": 3.716352939605713, "learning_rate": 2.9313268643646986e-05, "loss": 0.8698, "step": 1280 }, { "epoch": 0.6549079754601227, "grad_norm": 3.7603189945220947, "learning_rate": 2.92362482566303e-05, "loss": 1.0128, "step": 1281 }, { "epoch": 0.6554192229038854, "grad_norm": 3.9676363468170166, "learning_rate": 2.915928736037478e-05, "loss": 0.8961, "step": 1282 }, { "epoch": 0.6559304703476483, "grad_norm": 4.314609527587891, "learning_rate": 2.908238617538307e-05, "loss": 0.8685, "step": 1283 }, { "epoch": 0.656441717791411, "grad_norm": 3.607283115386963, "learning_rate": 2.900554492198677e-05, "loss": 0.7664, "step": 1284 }, { "epoch": 0.6569529652351738, "grad_norm": 3.701615810394287, "learning_rate": 2.8928763820345716e-05, "loss": 0.8553, "step": 1285 }, { "epoch": 0.6574642126789366, "grad_norm": 4.535073757171631, "learning_rate": 2.8852043090447423e-05, "loss": 0.849, "step": 1286 }, { "epoch": 0.6579754601226994, "grad_norm": 3.8456053733825684, "learning_rate": 2.877538295210644e-05, "loss": 0.7781, "step": 1287 }, { "epoch": 0.6584867075664622, "grad_norm": 4.1655168533325195, "learning_rate": 2.869878362496368e-05, "loss": 0.8895, "step": 1288 }, { "epoch": 0.658997955010225, "grad_norm": 4.286038398742676, "learning_rate": 2.8622245328485907e-05, "loss": 0.8201, "step": 1289 }, { "epoch": 0.6595092024539877, "grad_norm": 4.441523551940918, "learning_rate": 2.8545768281964925e-05, "loss": 0.7817, "step": 1290 }, { "epoch": 0.6600204498977505, "grad_norm": 4.569100379943848, "learning_rate": 2.846935270451712e-05, "loss": 0.8183, "step": 1291 }, { "epoch": 0.6605316973415133, "grad_norm": 4.366547584533691, "learning_rate": 2.8392998815082717e-05, "loss": 0.8923, "step": 1292 }, { "epoch": 0.661042944785276, "grad_norm": 4.581328392028809, "learning_rate": 2.8316706832425243e-05, "loss": 0.7825, "step": 1293 }, { "epoch": 0.6615541922290389, "grad_norm": 5.016780376434326, "learning_rate": 2.8240476975130803e-05, "loss": 0.912, "step": 1294 }, { "epoch": 0.6620654396728016, "grad_norm": 4.488846302032471, "learning_rate": 2.8164309461607547e-05, "loss": 0.719, "step": 1295 }, { "epoch": 0.6625766871165644, "grad_norm": 5.28438138961792, "learning_rate": 2.808820451008495e-05, "loss": 0.8663, "step": 1296 }, { "epoch": 0.6630879345603272, "grad_norm": 5.2693071365356445, "learning_rate": 2.801216233861331e-05, "loss": 0.7197, "step": 1297 }, { "epoch": 0.66359918200409, "grad_norm": 5.987064838409424, "learning_rate": 2.793618316506299e-05, "loss": 0.6999, "step": 1298 }, { "epoch": 0.6641104294478528, "grad_norm": 5.712385654449463, "learning_rate": 2.7860267207123914e-05, "loss": 0.7776, "step": 1299 }, { "epoch": 0.6646216768916156, "grad_norm": 5.60577917098999, "learning_rate": 2.7784414682304832e-05, "loss": 0.3877, "step": 1300 }, { "epoch": 0.6651329243353783, "grad_norm": 1.8261439800262451, "learning_rate": 2.7708625807932754e-05, "loss": 0.8337, "step": 1301 }, { "epoch": 0.6656441717791411, "grad_norm": 2.3331668376922607, "learning_rate": 2.763290080115238e-05, "loss": 0.8949, "step": 1302 }, { "epoch": 0.6661554192229039, "grad_norm": 2.5809342861175537, "learning_rate": 2.7557239878925335e-05, "loss": 0.9388, "step": 1303 }, { "epoch": 0.6666666666666666, "grad_norm": 2.492849111557007, "learning_rate": 2.7481643258029748e-05, "loss": 0.9331, "step": 1304 }, { "epoch": 0.6671779141104295, "grad_norm": 2.6156399250030518, "learning_rate": 2.740611115505937e-05, "loss": 0.9623, "step": 1305 }, { "epoch": 0.6676891615541922, "grad_norm": 2.552900791168213, "learning_rate": 2.7330643786423237e-05, "loss": 0.8438, "step": 1306 }, { "epoch": 0.668200408997955, "grad_norm": 2.678558588027954, "learning_rate": 2.7255241368344815e-05, "loss": 0.8981, "step": 1307 }, { "epoch": 0.6687116564417178, "grad_norm": 2.8384506702423096, "learning_rate": 2.7179904116861556e-05, "loss": 1.0157, "step": 1308 }, { "epoch": 0.6692229038854806, "grad_norm": 3.0380685329437256, "learning_rate": 2.7104632247824126e-05, "loss": 0.8501, "step": 1309 }, { "epoch": 0.6697341513292433, "grad_norm": 2.7943999767303467, "learning_rate": 2.7029425976895943e-05, "loss": 0.8643, "step": 1310 }, { "epoch": 0.6702453987730062, "grad_norm": 2.9918973445892334, "learning_rate": 2.6954285519552415e-05, "loss": 0.8909, "step": 1311 }, { "epoch": 0.6707566462167689, "grad_norm": 2.9454028606414795, "learning_rate": 2.687921109108038e-05, "loss": 0.919, "step": 1312 }, { "epoch": 0.6712678936605317, "grad_norm": 3.3940649032592773, "learning_rate": 2.680420290657757e-05, "loss": 1.051, "step": 1313 }, { "epoch": 0.6717791411042945, "grad_norm": 3.3205361366271973, "learning_rate": 2.6729261180951847e-05, "loss": 0.8133, "step": 1314 }, { "epoch": 0.6722903885480572, "grad_norm": 3.0560505390167236, "learning_rate": 2.6654386128920683e-05, "loss": 0.8722, "step": 1315 }, { "epoch": 0.6728016359918201, "grad_norm": 3.47533917427063, "learning_rate": 2.65795779650105e-05, "loss": 0.8353, "step": 1316 }, { "epoch": 0.6733128834355828, "grad_norm": 3.5020523071289062, "learning_rate": 2.650483690355614e-05, "loss": 0.9415, "step": 1317 }, { "epoch": 0.6738241308793456, "grad_norm": 3.358147144317627, "learning_rate": 2.6430163158700115e-05, "loss": 0.8847, "step": 1318 }, { "epoch": 0.6743353783231084, "grad_norm": 3.2747859954833984, "learning_rate": 2.6355556944392136e-05, "loss": 0.893, "step": 1319 }, { "epoch": 0.6748466257668712, "grad_norm": 3.4800641536712646, "learning_rate": 2.628101847438835e-05, "loss": 0.8938, "step": 1320 }, { "epoch": 0.6753578732106339, "grad_norm": 3.310898780822754, "learning_rate": 2.6206547962250894e-05, "loss": 0.8795, "step": 1321 }, { "epoch": 0.6758691206543967, "grad_norm": 3.385996103286743, "learning_rate": 2.6132145621347114e-05, "loss": 0.8546, "step": 1322 }, { "epoch": 0.6763803680981595, "grad_norm": 3.280492067337036, "learning_rate": 2.605781166484914e-05, "loss": 0.914, "step": 1323 }, { "epoch": 0.6768916155419223, "grad_norm": 3.5544273853302, "learning_rate": 2.598354630573303e-05, "loss": 0.8673, "step": 1324 }, { "epoch": 0.6774028629856851, "grad_norm": 3.4663734436035156, "learning_rate": 2.5909349756778446e-05, "loss": 0.9238, "step": 1325 }, { "epoch": 0.6779141104294478, "grad_norm": 3.577603578567505, "learning_rate": 2.5835222230567803e-05, "loss": 0.9245, "step": 1326 }, { "epoch": 0.6784253578732107, "grad_norm": 3.5449306964874268, "learning_rate": 2.5761163939485772e-05, "loss": 0.9191, "step": 1327 }, { "epoch": 0.6789366053169734, "grad_norm": 3.638456344604492, "learning_rate": 2.5687175095718723e-05, "loss": 0.8672, "step": 1328 }, { "epoch": 0.6794478527607362, "grad_norm": 3.5813286304473877, "learning_rate": 2.5613255911253963e-05, "loss": 0.9806, "step": 1329 }, { "epoch": 0.679959100204499, "grad_norm": 3.563659191131592, "learning_rate": 2.553940659787929e-05, "loss": 0.8792, "step": 1330 }, { "epoch": 0.6804703476482618, "grad_norm": 3.872799873352051, "learning_rate": 2.5465627367182243e-05, "loss": 0.8539, "step": 1331 }, { "epoch": 0.6809815950920245, "grad_norm": 3.625586986541748, "learning_rate": 2.539191843054963e-05, "loss": 0.968, "step": 1332 }, { "epoch": 0.6814928425357873, "grad_norm": 3.7508835792541504, "learning_rate": 2.5318279999166828e-05, "loss": 0.7757, "step": 1333 }, { "epoch": 0.6820040899795501, "grad_norm": 3.7676475048065186, "learning_rate": 2.5244712284017186e-05, "loss": 0.8444, "step": 1334 }, { "epoch": 0.6825153374233128, "grad_norm": 3.5776097774505615, "learning_rate": 2.5171215495881458e-05, "loss": 0.8295, "step": 1335 }, { "epoch": 0.6830265848670757, "grad_norm": 4.256826877593994, "learning_rate": 2.5097789845337223e-05, "loss": 0.9222, "step": 1336 }, { "epoch": 0.6835378323108384, "grad_norm": 3.883566379547119, "learning_rate": 2.5024435542758173e-05, "loss": 0.8951, "step": 1337 }, { "epoch": 0.6840490797546013, "grad_norm": 4.0634284019470215, "learning_rate": 2.495115279831365e-05, "loss": 0.7853, "step": 1338 }, { "epoch": 0.684560327198364, "grad_norm": 4.287288665771484, "learning_rate": 2.4877941821967914e-05, "loss": 0.7984, "step": 1339 }, { "epoch": 0.6850715746421268, "grad_norm": 4.691751003265381, "learning_rate": 2.4804802823479613e-05, "loss": 0.8143, "step": 1340 }, { "epoch": 0.6855828220858896, "grad_norm": 4.170199394226074, "learning_rate": 2.4731736012401207e-05, "loss": 0.8166, "step": 1341 }, { "epoch": 0.6860940695296524, "grad_norm": 4.324507713317871, "learning_rate": 2.465874159807828e-05, "loss": 0.7594, "step": 1342 }, { "epoch": 0.6866053169734151, "grad_norm": 4.2558913230896, "learning_rate": 2.4585819789649057e-05, "loss": 0.7372, "step": 1343 }, { "epoch": 0.6871165644171779, "grad_norm": 4.4216461181640625, "learning_rate": 2.4512970796043616e-05, "loss": 0.6851, "step": 1344 }, { "epoch": 0.6876278118609407, "grad_norm": 4.879353046417236, "learning_rate": 2.444019482598356e-05, "loss": 0.6924, "step": 1345 }, { "epoch": 0.6881390593047034, "grad_norm": 5.454744338989258, "learning_rate": 2.4367492087981155e-05, "loss": 0.8629, "step": 1346 }, { "epoch": 0.6886503067484663, "grad_norm": 4.753787517547607, "learning_rate": 2.4294862790338917e-05, "loss": 0.7606, "step": 1347 }, { "epoch": 0.689161554192229, "grad_norm": 4.211785316467285, "learning_rate": 2.422230714114891e-05, "loss": 0.469, "step": 1348 }, { "epoch": 0.6896728016359919, "grad_norm": 6.177483558654785, "learning_rate": 2.4149825348292215e-05, "loss": 0.9659, "step": 1349 }, { "epoch": 0.6901840490797546, "grad_norm": 6.019729137420654, "learning_rate": 2.4077417619438275e-05, "loss": 0.4232, "step": 1350 }, { "epoch": 0.6906952965235174, "grad_norm": 1.807778239250183, "learning_rate": 2.400508416204433e-05, "loss": 0.7991, "step": 1351 }, { "epoch": 0.6912065439672802, "grad_norm": 2.253737688064575, "learning_rate": 2.393282518335486e-05, "loss": 0.9311, "step": 1352 }, { "epoch": 0.691717791411043, "grad_norm": 2.301241159439087, "learning_rate": 2.3860640890400925e-05, "loss": 0.9348, "step": 1353 }, { "epoch": 0.6922290388548057, "grad_norm": 2.6120917797088623, "learning_rate": 2.3788531489999604e-05, "loss": 1.0091, "step": 1354 }, { "epoch": 0.6927402862985685, "grad_norm": 2.5849595069885254, "learning_rate": 2.371649718875338e-05, "loss": 0.9379, "step": 1355 }, { "epoch": 0.6932515337423313, "grad_norm": 2.7695066928863525, "learning_rate": 2.3644538193049625e-05, "loss": 0.9027, "step": 1356 }, { "epoch": 0.693762781186094, "grad_norm": 2.7553393840789795, "learning_rate": 2.3572654709059882e-05, "loss": 0.8889, "step": 1357 }, { "epoch": 0.6942740286298569, "grad_norm": 2.850353717803955, "learning_rate": 2.3500846942739406e-05, "loss": 0.9374, "step": 1358 }, { "epoch": 0.6947852760736196, "grad_norm": 2.645514726638794, "learning_rate": 2.3429115099826458e-05, "loss": 0.888, "step": 1359 }, { "epoch": 0.6952965235173824, "grad_norm": 2.834381341934204, "learning_rate": 2.3357459385841823e-05, "loss": 0.8912, "step": 1360 }, { "epoch": 0.6958077709611452, "grad_norm": 2.820368766784668, "learning_rate": 2.328588000608812e-05, "loss": 0.8868, "step": 1361 }, { "epoch": 0.696319018404908, "grad_norm": 2.9356560707092285, "learning_rate": 2.321437716564927e-05, "loss": 0.9155, "step": 1362 }, { "epoch": 0.6968302658486708, "grad_norm": 2.9917140007019043, "learning_rate": 2.3142951069389902e-05, "loss": 0.9129, "step": 1363 }, { "epoch": 0.6973415132924335, "grad_norm": 2.977782726287842, "learning_rate": 2.3071601921954794e-05, "loss": 0.8904, "step": 1364 }, { "epoch": 0.6978527607361963, "grad_norm": 2.948878765106201, "learning_rate": 2.3000329927768214e-05, "loss": 0.8717, "step": 1365 }, { "epoch": 0.6983640081799591, "grad_norm": 3.2418344020843506, "learning_rate": 2.292913529103338e-05, "loss": 0.8991, "step": 1366 }, { "epoch": 0.6988752556237219, "grad_norm": 3.2583718299865723, "learning_rate": 2.285801821573192e-05, "loss": 0.8856, "step": 1367 }, { "epoch": 0.6993865030674846, "grad_norm": 3.554612398147583, "learning_rate": 2.278697890562316e-05, "loss": 0.9651, "step": 1368 }, { "epoch": 0.6998977505112475, "grad_norm": 3.229438543319702, "learning_rate": 2.2716017564243697e-05, "loss": 0.8992, "step": 1369 }, { "epoch": 0.7004089979550102, "grad_norm": 3.106508255004883, "learning_rate": 2.2645134394906676e-05, "loss": 0.9012, "step": 1370 }, { "epoch": 0.700920245398773, "grad_norm": 3.6647939682006836, "learning_rate": 2.257432960070132e-05, "loss": 0.9815, "step": 1371 }, { "epoch": 0.7014314928425358, "grad_norm": 3.2601795196533203, "learning_rate": 2.250360338449226e-05, "loss": 0.8928, "step": 1372 }, { "epoch": 0.7019427402862985, "grad_norm": 3.264993906021118, "learning_rate": 2.243295594891901e-05, "loss": 0.9133, "step": 1373 }, { "epoch": 0.7024539877300614, "grad_norm": 3.4938395023345947, "learning_rate": 2.2362387496395333e-05, "loss": 0.8657, "step": 1374 }, { "epoch": 0.7029652351738241, "grad_norm": 3.3451359272003174, "learning_rate": 2.229189822910876e-05, "loss": 0.8688, "step": 1375 }, { "epoch": 0.7034764826175869, "grad_norm": 3.640519857406616, "learning_rate": 2.2221488349019903e-05, "loss": 0.8744, "step": 1376 }, { "epoch": 0.7039877300613497, "grad_norm": 3.371683359146118, "learning_rate": 2.21511580578619e-05, "loss": 0.887, "step": 1377 }, { "epoch": 0.7044989775051125, "grad_norm": 3.3256847858428955, "learning_rate": 2.2080907557139922e-05, "loss": 0.8098, "step": 1378 }, { "epoch": 0.7050102249488752, "grad_norm": 3.6227996349334717, "learning_rate": 2.201073704813046e-05, "loss": 0.8696, "step": 1379 }, { "epoch": 0.7055214723926381, "grad_norm": 3.5112345218658447, "learning_rate": 2.194064673188089e-05, "loss": 0.8226, "step": 1380 }, { "epoch": 0.7060327198364008, "grad_norm": 3.5785815715789795, "learning_rate": 2.187063680920875e-05, "loss": 0.9444, "step": 1381 }, { "epoch": 0.7065439672801636, "grad_norm": 3.671478271484375, "learning_rate": 2.18007074807013e-05, "loss": 0.8786, "step": 1382 }, { "epoch": 0.7070552147239264, "grad_norm": 3.485398054122925, "learning_rate": 2.1730858946714828e-05, "loss": 0.7943, "step": 1383 }, { "epoch": 0.7075664621676891, "grad_norm": 3.856492280960083, "learning_rate": 2.1661091407374218e-05, "loss": 0.8619, "step": 1384 }, { "epoch": 0.7080777096114519, "grad_norm": 4.099680423736572, "learning_rate": 2.1591405062572213e-05, "loss": 0.8321, "step": 1385 }, { "epoch": 0.7085889570552147, "grad_norm": 4.101649761199951, "learning_rate": 2.1521800111968992e-05, "loss": 0.9423, "step": 1386 }, { "epoch": 0.7091002044989775, "grad_norm": 4.159656047821045, "learning_rate": 2.1452276754991456e-05, "loss": 0.8539, "step": 1387 }, { "epoch": 0.7096114519427403, "grad_norm": 4.193243026733398, "learning_rate": 2.1382835190832813e-05, "loss": 0.8896, "step": 1388 }, { "epoch": 0.7101226993865031, "grad_norm": 4.159454345703125, "learning_rate": 2.1313475618451857e-05, "loss": 0.8223, "step": 1389 }, { "epoch": 0.7106339468302658, "grad_norm": 4.094180107116699, "learning_rate": 2.1244198236572477e-05, "loss": 0.7764, "step": 1390 }, { "epoch": 0.7111451942740287, "grad_norm": 4.319839000701904, "learning_rate": 2.1175003243683117e-05, "loss": 0.7791, "step": 1391 }, { "epoch": 0.7116564417177914, "grad_norm": 4.31974983215332, "learning_rate": 2.110589083803613e-05, "loss": 0.831, "step": 1392 }, { "epoch": 0.7121676891615542, "grad_norm": 4.274682521820068, "learning_rate": 2.1036861217647242e-05, "loss": 0.8928, "step": 1393 }, { "epoch": 0.712678936605317, "grad_norm": 4.526281356811523, "learning_rate": 2.096791458029499e-05, "loss": 0.6852, "step": 1394 }, { "epoch": 0.7131901840490797, "grad_norm": 4.865596771240234, "learning_rate": 2.0899051123520196e-05, "loss": 0.8364, "step": 1395 }, { "epoch": 0.7137014314928425, "grad_norm": 4.703789710998535, "learning_rate": 2.08302710446253e-05, "loss": 0.7496, "step": 1396 }, { "epoch": 0.7142126789366053, "grad_norm": 5.552756309509277, "learning_rate": 2.076157454067391e-05, "loss": 0.9237, "step": 1397 }, { "epoch": 0.7147239263803681, "grad_norm": 4.742315769195557, "learning_rate": 2.069296180849012e-05, "loss": 0.6129, "step": 1398 }, { "epoch": 0.7152351738241309, "grad_norm": 6.7453413009643555, "learning_rate": 2.0624433044658076e-05, "loss": 0.8049, "step": 1399 }, { "epoch": 0.7157464212678937, "grad_norm": 5.575685977935791, "learning_rate": 2.055598844552129e-05, "loss": 0.3437, "step": 1400 }, { "epoch": 0.7162576687116564, "grad_norm": 1.7778542041778564, "learning_rate": 2.0487628207182148e-05, "loss": 0.7406, "step": 1401 }, { "epoch": 0.7167689161554193, "grad_norm": 2.223677158355713, "learning_rate": 2.0419352525501316e-05, "loss": 0.9807, "step": 1402 }, { "epoch": 0.717280163599182, "grad_norm": 2.3648645877838135, "learning_rate": 2.035116159609725e-05, "loss": 0.9588, "step": 1403 }, { "epoch": 0.7177914110429447, "grad_norm": 2.5223405361175537, "learning_rate": 2.0283055614345532e-05, "loss": 0.982, "step": 1404 }, { "epoch": 0.7183026584867076, "grad_norm": 2.8171517848968506, "learning_rate": 2.0215034775378332e-05, "loss": 0.9429, "step": 1405 }, { "epoch": 0.7188139059304703, "grad_norm": 2.740797281265259, "learning_rate": 2.014709927408397e-05, "loss": 0.9401, "step": 1406 }, { "epoch": 0.7193251533742331, "grad_norm": 2.6437759399414062, "learning_rate": 2.0079249305106146e-05, "loss": 0.8826, "step": 1407 }, { "epoch": 0.7198364008179959, "grad_norm": 3.0211195945739746, "learning_rate": 2.001148506284361e-05, "loss": 0.9285, "step": 1408 }, { "epoch": 0.7203476482617587, "grad_norm": 3.08404541015625, "learning_rate": 1.99438067414494e-05, "loss": 0.9163, "step": 1409 }, { "epoch": 0.7208588957055214, "grad_norm": 3.098724842071533, "learning_rate": 1.9876214534830477e-05, "loss": 0.9279, "step": 1410 }, { "epoch": 0.7213701431492843, "grad_norm": 2.9764244556427, "learning_rate": 1.980870863664695e-05, "loss": 0.8716, "step": 1411 }, { "epoch": 0.721881390593047, "grad_norm": 2.8523361682891846, "learning_rate": 1.9741289240311755e-05, "loss": 0.9273, "step": 1412 }, { "epoch": 0.7223926380368099, "grad_norm": 2.991556406021118, "learning_rate": 1.9673956538989912e-05, "loss": 0.928, "step": 1413 }, { "epoch": 0.7229038854805726, "grad_norm": 2.887350559234619, "learning_rate": 1.960671072559812e-05, "loss": 0.8584, "step": 1414 }, { "epoch": 0.7234151329243353, "grad_norm": 3.0709550380706787, "learning_rate": 1.953955199280408e-05, "loss": 0.8571, "step": 1415 }, { "epoch": 0.7239263803680982, "grad_norm": 3.103337287902832, "learning_rate": 1.9472480533025984e-05, "loss": 0.9068, "step": 1416 }, { "epoch": 0.7244376278118609, "grad_norm": 2.9760520458221436, "learning_rate": 1.9405496538432043e-05, "loss": 0.8228, "step": 1417 }, { "epoch": 0.7249488752556237, "grad_norm": 3.2029881477355957, "learning_rate": 1.9338600200939805e-05, "loss": 0.9329, "step": 1418 }, { "epoch": 0.7254601226993865, "grad_norm": 3.1520378589630127, "learning_rate": 1.927179171221571e-05, "loss": 0.9649, "step": 1419 }, { "epoch": 0.7259713701431493, "grad_norm": 3.432126045227051, "learning_rate": 1.920507126367448e-05, "loss": 0.8563, "step": 1420 }, { "epoch": 0.726482617586912, "grad_norm": 3.369917154312134, "learning_rate": 1.9138439046478585e-05, "loss": 0.9227, "step": 1421 }, { "epoch": 0.7269938650306749, "grad_norm": 3.478285551071167, "learning_rate": 1.9071895251537702e-05, "loss": 0.8868, "step": 1422 }, { "epoch": 0.7275051124744376, "grad_norm": 3.168760061264038, "learning_rate": 1.90054400695082e-05, "loss": 0.8772, "step": 1423 }, { "epoch": 0.7280163599182005, "grad_norm": 3.382513999938965, "learning_rate": 1.893907369079252e-05, "loss": 0.7854, "step": 1424 }, { "epoch": 0.7285276073619632, "grad_norm": 3.3271732330322266, "learning_rate": 1.8872796305538698e-05, "loss": 0.9083, "step": 1425 }, { "epoch": 0.7290388548057259, "grad_norm": 3.714656352996826, "learning_rate": 1.8806608103639766e-05, "loss": 0.9312, "step": 1426 }, { "epoch": 0.7295501022494888, "grad_norm": 3.502117872238159, "learning_rate": 1.8740509274733276e-05, "loss": 0.8154, "step": 1427 }, { "epoch": 0.7300613496932515, "grad_norm": 3.463674783706665, "learning_rate": 1.8674500008200674e-05, "loss": 0.8553, "step": 1428 }, { "epoch": 0.7305725971370143, "grad_norm": 3.8900396823883057, "learning_rate": 1.8608580493166795e-05, "loss": 0.972, "step": 1429 }, { "epoch": 0.7310838445807771, "grad_norm": 3.5256094932556152, "learning_rate": 1.8542750918499395e-05, "loss": 0.8967, "step": 1430 }, { "epoch": 0.7315950920245399, "grad_norm": 3.558422327041626, "learning_rate": 1.8477011472808424e-05, "loss": 0.9316, "step": 1431 }, { "epoch": 0.7321063394683026, "grad_norm": 3.8706467151641846, "learning_rate": 1.8411362344445708e-05, "loss": 0.856, "step": 1432 }, { "epoch": 0.7326175869120655, "grad_norm": 4.188419342041016, "learning_rate": 1.8345803721504234e-05, "loss": 0.9773, "step": 1433 }, { "epoch": 0.7331288343558282, "grad_norm": 3.983508825302124, "learning_rate": 1.8280335791817733e-05, "loss": 0.875, "step": 1434 }, { "epoch": 0.733640081799591, "grad_norm": 3.518186569213867, "learning_rate": 1.821495874296003e-05, "loss": 0.7896, "step": 1435 }, { "epoch": 0.7341513292433538, "grad_norm": 4.46856164932251, "learning_rate": 1.8149672762244624e-05, "loss": 1.0133, "step": 1436 }, { "epoch": 0.7346625766871165, "grad_norm": 3.835460662841797, "learning_rate": 1.808447803672404e-05, "loss": 0.837, "step": 1437 }, { "epoch": 0.7351738241308794, "grad_norm": 4.0243916511535645, "learning_rate": 1.801937475318939e-05, "loss": 0.8681, "step": 1438 }, { "epoch": 0.7356850715746421, "grad_norm": 4.4127960205078125, "learning_rate": 1.7954363098169768e-05, "loss": 0.9832, "step": 1439 }, { "epoch": 0.7361963190184049, "grad_norm": 4.0299248695373535, "learning_rate": 1.7889443257931737e-05, "loss": 0.8392, "step": 1440 }, { "epoch": 0.7367075664621677, "grad_norm": 4.006964206695557, "learning_rate": 1.782461541847879e-05, "loss": 0.7381, "step": 1441 }, { "epoch": 0.7372188139059305, "grad_norm": 4.23606014251709, "learning_rate": 1.7759879765550887e-05, "loss": 0.8984, "step": 1442 }, { "epoch": 0.7377300613496932, "grad_norm": 4.232422351837158, "learning_rate": 1.769523648462379e-05, "loss": 0.8403, "step": 1443 }, { "epoch": 0.7382413087934561, "grad_norm": 4.366641044616699, "learning_rate": 1.7630685760908622e-05, "loss": 0.8691, "step": 1444 }, { "epoch": 0.7387525562372188, "grad_norm": 4.925327777862549, "learning_rate": 1.7566227779351357e-05, "loss": 0.9101, "step": 1445 }, { "epoch": 0.7392638036809815, "grad_norm": 5.099564552307129, "learning_rate": 1.750186272463219e-05, "loss": 0.7557, "step": 1446 }, { "epoch": 0.7397750511247444, "grad_norm": 5.312896728515625, "learning_rate": 1.7437590781165138e-05, "loss": 0.6865, "step": 1447 }, { "epoch": 0.7402862985685071, "grad_norm": 5.26222038269043, "learning_rate": 1.7373412133097372e-05, "loss": 0.7639, "step": 1448 }, { "epoch": 0.74079754601227, "grad_norm": 4.816617012023926, "learning_rate": 1.7309326964308838e-05, "loss": 0.4676, "step": 1449 }, { "epoch": 0.7413087934560327, "grad_norm": 6.375493049621582, "learning_rate": 1.7245335458411542e-05, "loss": 0.4741, "step": 1450 }, { "epoch": 0.7418200408997955, "grad_norm": 1.9384807348251343, "learning_rate": 1.7181437798749256e-05, "loss": 0.8033, "step": 1451 }, { "epoch": 0.7423312883435583, "grad_norm": 2.3475661277770996, "learning_rate": 1.7117634168396774e-05, "loss": 0.9676, "step": 1452 }, { "epoch": 0.7428425357873211, "grad_norm": 2.614190101623535, "learning_rate": 1.705392475015956e-05, "loss": 1.0618, "step": 1453 }, { "epoch": 0.7433537832310838, "grad_norm": 2.739084243774414, "learning_rate": 1.6990309726573095e-05, "loss": 0.9619, "step": 1454 }, { "epoch": 0.7438650306748467, "grad_norm": 2.832260847091675, "learning_rate": 1.6926789279902412e-05, "loss": 0.9322, "step": 1455 }, { "epoch": 0.7443762781186094, "grad_norm": 2.8056933879852295, "learning_rate": 1.6863363592141618e-05, "loss": 0.9499, "step": 1456 }, { "epoch": 0.7448875255623721, "grad_norm": 2.8965537548065186, "learning_rate": 1.6800032845013247e-05, "loss": 0.907, "step": 1457 }, { "epoch": 0.745398773006135, "grad_norm": 3.008425235748291, "learning_rate": 1.673679721996789e-05, "loss": 0.9932, "step": 1458 }, { "epoch": 0.7459100204498977, "grad_norm": 2.81937837600708, "learning_rate": 1.6673656898183572e-05, "loss": 0.9586, "step": 1459 }, { "epoch": 0.7464212678936605, "grad_norm": 2.9769203662872314, "learning_rate": 1.6610612060565234e-05, "loss": 0.9148, "step": 1460 }, { "epoch": 0.7469325153374233, "grad_norm": 3.3971211910247803, "learning_rate": 1.6547662887744265e-05, "loss": 0.906, "step": 1461 }, { "epoch": 0.7474437627811861, "grad_norm": 2.9512698650360107, "learning_rate": 1.648480956007799e-05, "loss": 0.9082, "step": 1462 }, { "epoch": 0.7479550102249489, "grad_norm": 3.2894976139068604, "learning_rate": 1.6422052257649078e-05, "loss": 0.9537, "step": 1463 }, { "epoch": 0.7484662576687117, "grad_norm": 2.9925243854522705, "learning_rate": 1.6359391160265125e-05, "loss": 0.9018, "step": 1464 }, { "epoch": 0.7489775051124744, "grad_norm": 3.4137401580810547, "learning_rate": 1.629682644745802e-05, "loss": 0.9413, "step": 1465 }, { "epoch": 0.7494887525562373, "grad_norm": 3.152463436126709, "learning_rate": 1.6234358298483575e-05, "loss": 0.9389, "step": 1466 }, { "epoch": 0.75, "grad_norm": 3.088907480239868, "learning_rate": 1.6171986892320884e-05, "loss": 0.8818, "step": 1467 }, { "epoch": 0.7505112474437627, "grad_norm": 2.9647679328918457, "learning_rate": 1.6109712407671867e-05, "loss": 0.8208, "step": 1468 }, { "epoch": 0.7510224948875256, "grad_norm": 3.121256113052368, "learning_rate": 1.6047535022960757e-05, "loss": 0.8292, "step": 1469 }, { "epoch": 0.7515337423312883, "grad_norm": 3.0089125633239746, "learning_rate": 1.5985454916333577e-05, "loss": 0.8211, "step": 1470 }, { "epoch": 0.7520449897750511, "grad_norm": 3.4958393573760986, "learning_rate": 1.592347226565766e-05, "loss": 0.9042, "step": 1471 }, { "epoch": 0.7525562372188139, "grad_norm": 3.168567419052124, "learning_rate": 1.586158724852108e-05, "loss": 0.783, "step": 1472 }, { "epoch": 0.7530674846625767, "grad_norm": 3.506746530532837, "learning_rate": 1.579980004223222e-05, "loss": 0.8124, "step": 1473 }, { "epoch": 0.7535787321063395, "grad_norm": 3.678039789199829, "learning_rate": 1.573811082381918e-05, "loss": 0.9267, "step": 1474 }, { "epoch": 0.7540899795501023, "grad_norm": 3.4833285808563232, "learning_rate": 1.567651977002935e-05, "loss": 0.8201, "step": 1475 }, { "epoch": 0.754601226993865, "grad_norm": 3.767958879470825, "learning_rate": 1.561502705732883e-05, "loss": 0.9726, "step": 1476 }, { "epoch": 0.7551124744376279, "grad_norm": 3.3126537799835205, "learning_rate": 1.5553632861901995e-05, "loss": 0.7471, "step": 1477 }, { "epoch": 0.7556237218813906, "grad_norm": 3.590383529663086, "learning_rate": 1.5492337359650937e-05, "loss": 0.8543, "step": 1478 }, { "epoch": 0.7561349693251533, "grad_norm": 3.4559550285339355, "learning_rate": 1.5431140726194974e-05, "loss": 0.8802, "step": 1479 }, { "epoch": 0.7566462167689162, "grad_norm": 3.7260870933532715, "learning_rate": 1.5370043136870148e-05, "loss": 0.9178, "step": 1480 }, { "epoch": 0.7571574642126789, "grad_norm": 4.135682106018066, "learning_rate": 1.5309044766728775e-05, "loss": 0.8823, "step": 1481 }, { "epoch": 0.7576687116564417, "grad_norm": 3.654975175857544, "learning_rate": 1.5248145790538837e-05, "loss": 0.8367, "step": 1482 }, { "epoch": 0.7581799591002045, "grad_norm": 3.653458833694458, "learning_rate": 1.5187346382783552e-05, "loss": 0.8005, "step": 1483 }, { "epoch": 0.7586912065439673, "grad_norm": 3.8800647258758545, "learning_rate": 1.5126646717660897e-05, "loss": 0.9004, "step": 1484 }, { "epoch": 0.75920245398773, "grad_norm": 3.79221773147583, "learning_rate": 1.5066046969083026e-05, "loss": 0.7986, "step": 1485 }, { "epoch": 0.7597137014314929, "grad_norm": 4.0721025466918945, "learning_rate": 1.5005547310675872e-05, "loss": 0.9721, "step": 1486 }, { "epoch": 0.7602249488752556, "grad_norm": 3.911992311477661, "learning_rate": 1.4945147915778535e-05, "loss": 0.8429, "step": 1487 }, { "epoch": 0.7607361963190185, "grad_norm": 4.218154430389404, "learning_rate": 1.4884848957442931e-05, "loss": 0.7957, "step": 1488 }, { "epoch": 0.7612474437627812, "grad_norm": 3.988151788711548, "learning_rate": 1.4824650608433099e-05, "loss": 0.8568, "step": 1489 }, { "epoch": 0.7617586912065439, "grad_norm": 4.378921985626221, "learning_rate": 1.4764553041224926e-05, "loss": 0.7963, "step": 1490 }, { "epoch": 0.7622699386503068, "grad_norm": 4.452109336853027, "learning_rate": 1.4704556428005478e-05, "loss": 0.835, "step": 1491 }, { "epoch": 0.7627811860940695, "grad_norm": 6.035761833190918, "learning_rate": 1.4644660940672627e-05, "loss": 0.9188, "step": 1492 }, { "epoch": 0.7632924335378323, "grad_norm": 4.4452996253967285, "learning_rate": 1.4584866750834464e-05, "loss": 0.7396, "step": 1493 }, { "epoch": 0.7638036809815951, "grad_norm": 4.476807594299316, "learning_rate": 1.4525174029808858e-05, "loss": 0.7628, "step": 1494 }, { "epoch": 0.7643149284253579, "grad_norm": 4.541752338409424, "learning_rate": 1.4465582948622986e-05, "loss": 0.6893, "step": 1495 }, { "epoch": 0.7648261758691206, "grad_norm": 4.8742899894714355, "learning_rate": 1.4406093678012766e-05, "loss": 0.7884, "step": 1496 }, { "epoch": 0.7653374233128835, "grad_norm": 5.151451110839844, "learning_rate": 1.4346706388422493e-05, "loss": 0.683, "step": 1497 }, { "epoch": 0.7658486707566462, "grad_norm": 5.411539554595947, "learning_rate": 1.4287421250004157e-05, "loss": 0.79, "step": 1498 }, { "epoch": 0.766359918200409, "grad_norm": 5.503897666931152, "learning_rate": 1.4228238432617186e-05, "loss": 0.556, "step": 1499 }, { "epoch": 0.7668711656441718, "grad_norm": 6.515267372131348, "learning_rate": 1.4169158105827768e-05, "loss": 0.393, "step": 1500 }, { "epoch": 0.7673824130879345, "grad_norm": 2.1720798015594482, "learning_rate": 1.4110180438908505e-05, "loss": 0.7376, "step": 1501 }, { "epoch": 0.7678936605316974, "grad_norm": 2.3930561542510986, "learning_rate": 1.4051305600837799e-05, "loss": 1.0402, "step": 1502 }, { "epoch": 0.7684049079754601, "grad_norm": 2.3610873222351074, "learning_rate": 1.3992533760299498e-05, "loss": 0.9187, "step": 1503 }, { "epoch": 0.7689161554192229, "grad_norm": 2.4556972980499268, "learning_rate": 1.3933865085682312e-05, "loss": 0.9221, "step": 1504 }, { "epoch": 0.7694274028629857, "grad_norm": 2.7687621116638184, "learning_rate": 1.387529974507935e-05, "loss": 1.0278, "step": 1505 }, { "epoch": 0.7699386503067485, "grad_norm": 2.8164188861846924, "learning_rate": 1.3816837906287722e-05, "loss": 0.9995, "step": 1506 }, { "epoch": 0.7704498977505112, "grad_norm": 2.5187416076660156, "learning_rate": 1.3758479736807928e-05, "loss": 0.813, "step": 1507 }, { "epoch": 0.7709611451942741, "grad_norm": 2.9586594104766846, "learning_rate": 1.3700225403843469e-05, "loss": 0.9001, "step": 1508 }, { "epoch": 0.7714723926380368, "grad_norm": 3.797866106033325, "learning_rate": 1.3642075074300325e-05, "loss": 0.9324, "step": 1509 }, { "epoch": 0.7719836400817995, "grad_norm": 2.9937219619750977, "learning_rate": 1.3584028914786539e-05, "loss": 0.922, "step": 1510 }, { "epoch": 0.7724948875255624, "grad_norm": 2.9317002296447754, "learning_rate": 1.3526087091611623e-05, "loss": 0.9387, "step": 1511 }, { "epoch": 0.7730061349693251, "grad_norm": 3.1819257736206055, "learning_rate": 1.3468249770786223e-05, "loss": 0.8964, "step": 1512 }, { "epoch": 0.773517382413088, "grad_norm": 2.7915236949920654, "learning_rate": 1.341051711802151e-05, "loss": 0.9163, "step": 1513 }, { "epoch": 0.7740286298568507, "grad_norm": 2.946099281311035, "learning_rate": 1.3352889298728832e-05, "loss": 0.9546, "step": 1514 }, { "epoch": 0.7745398773006135, "grad_norm": 3.0270745754241943, "learning_rate": 1.3295366478019112e-05, "loss": 0.9431, "step": 1515 }, { "epoch": 0.7750511247443763, "grad_norm": 3.3205618858337402, "learning_rate": 1.3237948820702495e-05, "loss": 0.9058, "step": 1516 }, { "epoch": 0.7755623721881391, "grad_norm": 3.179041624069214, "learning_rate": 1.3180636491287773e-05, "loss": 0.8804, "step": 1517 }, { "epoch": 0.7760736196319018, "grad_norm": 3.682973623275757, "learning_rate": 1.3123429653981995e-05, "loss": 0.985, "step": 1518 }, { "epoch": 0.7765848670756647, "grad_norm": 3.1981019973754883, "learning_rate": 1.3066328472689932e-05, "loss": 0.8148, "step": 1519 }, { "epoch": 0.7770961145194274, "grad_norm": 3.3223016262054443, "learning_rate": 1.300933311101365e-05, "loss": 0.8839, "step": 1520 }, { "epoch": 0.7776073619631901, "grad_norm": 3.214383602142334, "learning_rate": 1.2952443732252057e-05, "loss": 0.9364, "step": 1521 }, { "epoch": 0.778118609406953, "grad_norm": 3.4616458415985107, "learning_rate": 1.2895660499400348e-05, "loss": 0.8707, "step": 1522 }, { "epoch": 0.7786298568507157, "grad_norm": 3.4007458686828613, "learning_rate": 1.283898357514966e-05, "loss": 0.9236, "step": 1523 }, { "epoch": 0.7791411042944786, "grad_norm": 3.370945930480957, "learning_rate": 1.2782413121886483e-05, "loss": 0.8499, "step": 1524 }, { "epoch": 0.7796523517382413, "grad_norm": 3.3444669246673584, "learning_rate": 1.2725949301692314e-05, "loss": 0.8872, "step": 1525 }, { "epoch": 0.7801635991820041, "grad_norm": 3.51122784614563, "learning_rate": 1.2669592276343084e-05, "loss": 0.911, "step": 1526 }, { "epoch": 0.7806748466257669, "grad_norm": 3.6887714862823486, "learning_rate": 1.2613342207308764e-05, "loss": 0.9304, "step": 1527 }, { "epoch": 0.7811860940695297, "grad_norm": 3.856081485748291, "learning_rate": 1.2557199255752867e-05, "loss": 0.9293, "step": 1528 }, { "epoch": 0.7816973415132924, "grad_norm": 3.9948816299438477, "learning_rate": 1.2501163582532038e-05, "loss": 0.837, "step": 1529 }, { "epoch": 0.7822085889570553, "grad_norm": 3.6386806964874268, "learning_rate": 1.24452353481955e-05, "loss": 0.8118, "step": 1530 }, { "epoch": 0.782719836400818, "grad_norm": 3.716519355773926, "learning_rate": 1.2389414712984715e-05, "loss": 0.7942, "step": 1531 }, { "epoch": 0.7832310838445807, "grad_norm": 3.621734142303467, "learning_rate": 1.2333701836832812e-05, "loss": 0.7823, "step": 1532 }, { "epoch": 0.7837423312883436, "grad_norm": 3.675961494445801, "learning_rate": 1.227809687936417e-05, "loss": 0.8813, "step": 1533 }, { "epoch": 0.7842535787321063, "grad_norm": 4.009909629821777, "learning_rate": 1.2222599999894018e-05, "loss": 0.9671, "step": 1534 }, { "epoch": 0.7847648261758691, "grad_norm": 3.6856648921966553, "learning_rate": 1.2167211357427878e-05, "loss": 0.7759, "step": 1535 }, { "epoch": 0.7852760736196319, "grad_norm": 4.252887725830078, "learning_rate": 1.2111931110661212e-05, "loss": 0.8245, "step": 1536 }, { "epoch": 0.7857873210633947, "grad_norm": 4.382503986358643, "learning_rate": 1.2056759417978835e-05, "loss": 1.0605, "step": 1537 }, { "epoch": 0.7862985685071575, "grad_norm": 4.000999927520752, "learning_rate": 1.2001696437454624e-05, "loss": 0.8371, "step": 1538 }, { "epoch": 0.7868098159509203, "grad_norm": 4.442836761474609, "learning_rate": 1.1946742326850912e-05, "loss": 0.8388, "step": 1539 }, { "epoch": 0.787321063394683, "grad_norm": 4.2866530418396, "learning_rate": 1.1891897243618182e-05, "loss": 0.7989, "step": 1540 }, { "epoch": 0.7878323108384458, "grad_norm": 4.245499610900879, "learning_rate": 1.183716134489446e-05, "loss": 0.9314, "step": 1541 }, { "epoch": 0.7883435582822086, "grad_norm": 4.35511589050293, "learning_rate": 1.1782534787505017e-05, "loss": 0.7689, "step": 1542 }, { "epoch": 0.7888548057259713, "grad_norm": 4.421148777008057, "learning_rate": 1.1728017727961794e-05, "loss": 0.7314, "step": 1543 }, { "epoch": 0.7893660531697342, "grad_norm": 4.419631481170654, "learning_rate": 1.1673610322463014e-05, "loss": 0.7021, "step": 1544 }, { "epoch": 0.7898773006134969, "grad_norm": 4.954163074493408, "learning_rate": 1.1619312726892762e-05, "loss": 0.7313, "step": 1545 }, { "epoch": 0.7903885480572597, "grad_norm": 4.796021461486816, "learning_rate": 1.1565125096820473e-05, "loss": 0.6311, "step": 1546 }, { "epoch": 0.7908997955010225, "grad_norm": 4.741822242736816, "learning_rate": 1.1511047587500523e-05, "loss": 0.5983, "step": 1547 }, { "epoch": 0.7914110429447853, "grad_norm": 6.696097373962402, "learning_rate": 1.1457080353871769e-05, "loss": 0.9485, "step": 1548 }, { "epoch": 0.7919222903885481, "grad_norm": 5.633409023284912, "learning_rate": 1.1403223550557146e-05, "loss": 0.6431, "step": 1549 }, { "epoch": 0.7924335378323109, "grad_norm": 7.090452194213867, "learning_rate": 1.134947733186315e-05, "loss": 0.4912, "step": 1550 }, { "epoch": 0.7929447852760736, "grad_norm": 1.965378761291504, "learning_rate": 1.1295841851779488e-05, "loss": 0.9309, "step": 1551 }, { "epoch": 0.7934560327198364, "grad_norm": 2.6269702911376953, "learning_rate": 1.1242317263978525e-05, "loss": 1.0566, "step": 1552 }, { "epoch": 0.7939672801635992, "grad_norm": 2.5522866249084473, "learning_rate": 1.118890372181497e-05, "loss": 0.9996, "step": 1553 }, { "epoch": 0.7944785276073619, "grad_norm": 2.6898603439331055, "learning_rate": 1.1135601378325316e-05, "loss": 0.8959, "step": 1554 }, { "epoch": 0.7949897750511248, "grad_norm": 2.5996921062469482, "learning_rate": 1.1082410386227527e-05, "loss": 0.9355, "step": 1555 }, { "epoch": 0.7955010224948875, "grad_norm": 2.5710082054138184, "learning_rate": 1.102933089792042e-05, "loss": 0.9206, "step": 1556 }, { "epoch": 0.7960122699386503, "grad_norm": 2.8860228061676025, "learning_rate": 1.0976363065483464e-05, "loss": 0.9861, "step": 1557 }, { "epoch": 0.7965235173824131, "grad_norm": 3.0384457111358643, "learning_rate": 1.092350704067614e-05, "loss": 1.0165, "step": 1558 }, { "epoch": 0.7970347648261759, "grad_norm": 2.8058857917785645, "learning_rate": 1.0870762974937598e-05, "loss": 0.8881, "step": 1559 }, { "epoch": 0.7975460122699386, "grad_norm": 2.571446418762207, "learning_rate": 1.0818131019386252e-05, "loss": 0.8571, "step": 1560 }, { "epoch": 0.7980572597137015, "grad_norm": 3.1314709186553955, "learning_rate": 1.0765611324819247e-05, "loss": 0.8383, "step": 1561 }, { "epoch": 0.7985685071574642, "grad_norm": 2.915090799331665, "learning_rate": 1.0713204041712145e-05, "loss": 0.8493, "step": 1562 }, { "epoch": 0.799079754601227, "grad_norm": 3.2132315635681152, "learning_rate": 1.066090932021837e-05, "loss": 0.9181, "step": 1563 }, { "epoch": 0.7995910020449898, "grad_norm": 3.073258399963379, "learning_rate": 1.060872731016892e-05, "loss": 0.8863, "step": 1564 }, { "epoch": 0.8001022494887525, "grad_norm": 3.104003667831421, "learning_rate": 1.0556658161071792e-05, "loss": 0.9108, "step": 1565 }, { "epoch": 0.8006134969325154, "grad_norm": 3.0709762573242188, "learning_rate": 1.0504702022111661e-05, "loss": 0.8055, "step": 1566 }, { "epoch": 0.8011247443762781, "grad_norm": 3.2626142501831055, "learning_rate": 1.0452859042149382e-05, "loss": 0.8732, "step": 1567 }, { "epoch": 0.8016359918200409, "grad_norm": 3.4502737522125244, "learning_rate": 1.040112936972164e-05, "loss": 0.9525, "step": 1568 }, { "epoch": 0.8021472392638037, "grad_norm": 3.2807888984680176, "learning_rate": 1.0349513153040436e-05, "loss": 0.937, "step": 1569 }, { "epoch": 0.8026584867075665, "grad_norm": 3.3237321376800537, "learning_rate": 1.0298010539992748e-05, "loss": 0.8712, "step": 1570 }, { "epoch": 0.8031697341513292, "grad_norm": 3.9305286407470703, "learning_rate": 1.0246621678140023e-05, "loss": 0.9547, "step": 1571 }, { "epoch": 0.803680981595092, "grad_norm": 3.3259196281433105, "learning_rate": 1.0195346714717813e-05, "loss": 0.8361, "step": 1572 }, { "epoch": 0.8041922290388548, "grad_norm": 3.3074426651000977, "learning_rate": 1.0144185796635359e-05, "loss": 0.7841, "step": 1573 }, { "epoch": 0.8047034764826176, "grad_norm": 3.421103000640869, "learning_rate": 1.00931390704751e-05, "loss": 0.8898, "step": 1574 }, { "epoch": 0.8052147239263804, "grad_norm": 3.6833465099334717, "learning_rate": 1.0042206682492372e-05, "loss": 0.9386, "step": 1575 }, { "epoch": 0.8057259713701431, "grad_norm": 3.3095312118530273, "learning_rate": 9.991388778614824e-06, "loss": 0.8398, "step": 1576 }, { "epoch": 0.806237218813906, "grad_norm": 3.4535927772521973, "learning_rate": 9.940685504442183e-06, "loss": 0.86, "step": 1577 }, { "epoch": 0.8067484662576687, "grad_norm": 3.7982585430145264, "learning_rate": 9.89009700524568e-06, "loss": 0.9577, "step": 1578 }, { "epoch": 0.8072597137014315, "grad_norm": 3.5163521766662598, "learning_rate": 9.83962342596776e-06, "loss": 0.7962, "step": 1579 }, { "epoch": 0.8077709611451943, "grad_norm": 3.4356038570404053, "learning_rate": 9.789264911221546e-06, "loss": 0.8193, "step": 1580 }, { "epoch": 0.808282208588957, "grad_norm": 3.920682907104492, "learning_rate": 9.739021605290549e-06, "loss": 0.8903, "step": 1581 }, { "epoch": 0.8087934560327198, "grad_norm": 3.7626466751098633, "learning_rate": 9.688893652128151e-06, "loss": 0.8662, "step": 1582 }, { "epoch": 0.8093047034764826, "grad_norm": 4.124518394470215, "learning_rate": 9.638881195357224e-06, "loss": 0.8829, "step": 1583 }, { "epoch": 0.8098159509202454, "grad_norm": 3.74548602104187, "learning_rate": 9.588984378269783e-06, "loss": 0.7028, "step": 1584 }, { "epoch": 0.8103271983640081, "grad_norm": 3.769134998321533, "learning_rate": 9.539203343826469e-06, "loss": 0.9024, "step": 1585 }, { "epoch": 0.810838445807771, "grad_norm": 3.955554723739624, "learning_rate": 9.489538234656214e-06, "loss": 0.8195, "step": 1586 }, { "epoch": 0.8113496932515337, "grad_norm": 3.704868793487549, "learning_rate": 9.439989193055788e-06, "loss": 0.7934, "step": 1587 }, { "epoch": 0.8118609406952966, "grad_norm": 3.6619386672973633, "learning_rate": 9.39055636098945e-06, "loss": 0.868, "step": 1588 }, { "epoch": 0.8123721881390593, "grad_norm": 4.098423480987549, "learning_rate": 9.341239880088465e-06, "loss": 0.8754, "step": 1589 }, { "epoch": 0.8128834355828221, "grad_norm": 4.314637184143066, "learning_rate": 9.292039891650784e-06, "loss": 0.8408, "step": 1590 }, { "epoch": 0.8133946830265849, "grad_norm": 4.415641784667969, "learning_rate": 9.24295653664053e-06, "loss": 0.8727, "step": 1591 }, { "epoch": 0.8139059304703476, "grad_norm": 4.379350185394287, "learning_rate": 9.193989955687715e-06, "loss": 0.8209, "step": 1592 }, { "epoch": 0.8144171779141104, "grad_norm": 4.419501304626465, "learning_rate": 9.145140289087755e-06, "loss": 0.9432, "step": 1593 }, { "epoch": 0.8149284253578732, "grad_norm": 4.199825763702393, "learning_rate": 9.096407676801077e-06, "loss": 0.7578, "step": 1594 }, { "epoch": 0.815439672801636, "grad_norm": 5.105489253997803, "learning_rate": 9.047792258452742e-06, "loss": 0.8932, "step": 1595 }, { "epoch": 0.8159509202453987, "grad_norm": 5.17318058013916, "learning_rate": 8.999294173332058e-06, "loss": 0.9014, "step": 1596 }, { "epoch": 0.8164621676891616, "grad_norm": 4.764847755432129, "learning_rate": 8.950913560392132e-06, "loss": 0.7204, "step": 1597 }, { "epoch": 0.8169734151329243, "grad_norm": 5.115185737609863, "learning_rate": 8.902650558249499e-06, "loss": 0.7984, "step": 1598 }, { "epoch": 0.8174846625766872, "grad_norm": 5.964483261108398, "learning_rate": 8.854505305183752e-06, "loss": 0.709, "step": 1599 }, { "epoch": 0.8179959100204499, "grad_norm": 5.853104591369629, "learning_rate": 8.80647793913708e-06, "loss": 0.4204, "step": 1600 }, { "epoch": 0.8185071574642127, "grad_norm": 1.909232258796692, "learning_rate": 8.758568597713946e-06, "loss": 0.9726, "step": 1601 }, { "epoch": 0.8190184049079755, "grad_norm": 2.361161231994629, "learning_rate": 8.710777418180615e-06, "loss": 0.939, "step": 1602 }, { "epoch": 0.8195296523517382, "grad_norm": 2.425341844558716, "learning_rate": 8.663104537464866e-06, "loss": 0.8918, "step": 1603 }, { "epoch": 0.820040899795501, "grad_norm": 2.4736742973327637, "learning_rate": 8.615550092155478e-06, "loss": 0.9242, "step": 1604 }, { "epoch": 0.8205521472392638, "grad_norm": 2.6613001823425293, "learning_rate": 8.568114218501922e-06, "loss": 0.9088, "step": 1605 }, { "epoch": 0.8210633946830266, "grad_norm": 2.5696263313293457, "learning_rate": 8.520797052413931e-06, "loss": 0.8776, "step": 1606 }, { "epoch": 0.8215746421267893, "grad_norm": 2.944065570831299, "learning_rate": 8.473598729461163e-06, "loss": 0.9622, "step": 1607 }, { "epoch": 0.8220858895705522, "grad_norm": 2.8112123012542725, "learning_rate": 8.426519384872733e-06, "loss": 0.905, "step": 1608 }, { "epoch": 0.8225971370143149, "grad_norm": 2.9660046100616455, "learning_rate": 8.379559153536909e-06, "loss": 0.9772, "step": 1609 }, { "epoch": 0.8231083844580777, "grad_norm": 2.898458242416382, "learning_rate": 8.332718170000647e-06, "loss": 0.9601, "step": 1610 }, { "epoch": 0.8236196319018405, "grad_norm": 2.934843063354492, "learning_rate": 8.285996568469245e-06, "loss": 0.9621, "step": 1611 }, { "epoch": 0.8241308793456033, "grad_norm": 2.774526834487915, "learning_rate": 8.239394482805996e-06, "loss": 0.8366, "step": 1612 }, { "epoch": 0.8246421267893661, "grad_norm": 3.195984363555908, "learning_rate": 8.192912046531732e-06, "loss": 0.8993, "step": 1613 }, { "epoch": 0.8251533742331288, "grad_norm": 2.9150772094726562, "learning_rate": 8.14654939282447e-06, "loss": 0.818, "step": 1614 }, { "epoch": 0.8256646216768916, "grad_norm": 3.1671900749206543, "learning_rate": 8.10030665451904e-06, "loss": 0.9031, "step": 1615 }, { "epoch": 0.8261758691206544, "grad_norm": 3.0718796253204346, "learning_rate": 8.054183964106738e-06, "loss": 0.902, "step": 1616 }, { "epoch": 0.8266871165644172, "grad_norm": 3.13236403465271, "learning_rate": 8.008181453734832e-06, "loss": 0.8966, "step": 1617 }, { "epoch": 0.8271983640081799, "grad_norm": 3.3213281631469727, "learning_rate": 7.96229925520634e-06, "loss": 0.9051, "step": 1618 }, { "epoch": 0.8277096114519428, "grad_norm": 3.212383508682251, "learning_rate": 7.916537499979509e-06, "loss": 0.8877, "step": 1619 }, { "epoch": 0.8282208588957055, "grad_norm": 3.477694272994995, "learning_rate": 7.870896319167548e-06, "loss": 0.8837, "step": 1620 }, { "epoch": 0.8287321063394683, "grad_norm": 3.3036673069000244, "learning_rate": 7.825375843538163e-06, "loss": 0.8893, "step": 1621 }, { "epoch": 0.8292433537832311, "grad_norm": 3.0736191272735596, "learning_rate": 7.77997620351324e-06, "loss": 0.7746, "step": 1622 }, { "epoch": 0.8297546012269938, "grad_norm": 3.514597177505493, "learning_rate": 7.734697529168483e-06, "loss": 0.9255, "step": 1623 }, { "epoch": 0.8302658486707567, "grad_norm": 3.152017831802368, "learning_rate": 7.689539950232977e-06, "loss": 0.8449, "step": 1624 }, { "epoch": 0.8307770961145194, "grad_norm": 3.652573823928833, "learning_rate": 7.644503596088865e-06, "loss": 0.884, "step": 1625 }, { "epoch": 0.8312883435582822, "grad_norm": 3.5253610610961914, "learning_rate": 7.599588595770957e-06, "loss": 0.8573, "step": 1626 }, { "epoch": 0.831799591002045, "grad_norm": 3.6805419921875, "learning_rate": 7.554795077966409e-06, "loss": 0.7815, "step": 1627 }, { "epoch": 0.8323108384458078, "grad_norm": 3.5299296379089355, "learning_rate": 7.510123171014255e-06, "loss": 0.9062, "step": 1628 }, { "epoch": 0.8328220858895705, "grad_norm": 3.8087053298950195, "learning_rate": 7.4655730029051575e-06, "loss": 0.9056, "step": 1629 }, { "epoch": 0.8333333333333334, "grad_norm": 3.496819496154785, "learning_rate": 7.42114470128093e-06, "loss": 0.7842, "step": 1630 }, { "epoch": 0.8338445807770961, "grad_norm": 3.998002767562866, "learning_rate": 7.376838393434265e-06, "loss": 0.8565, "step": 1631 }, { "epoch": 0.8343558282208589, "grad_norm": 3.822242498397827, "learning_rate": 7.332654206308298e-06, "loss": 0.8017, "step": 1632 }, { "epoch": 0.8348670756646217, "grad_norm": 3.5486857891082764, "learning_rate": 7.288592266496286e-06, "loss": 0.8265, "step": 1633 }, { "epoch": 0.8353783231083844, "grad_norm": 3.6276445388793945, "learning_rate": 7.2446527002412225e-06, "loss": 0.851, "step": 1634 }, { "epoch": 0.8358895705521472, "grad_norm": 3.7170212268829346, "learning_rate": 7.20083563343551e-06, "loss": 0.7564, "step": 1635 }, { "epoch": 0.83640081799591, "grad_norm": 3.684058666229248, "learning_rate": 7.157141191620548e-06, "loss": 0.8008, "step": 1636 }, { "epoch": 0.8369120654396728, "grad_norm": 3.752002000808716, "learning_rate": 7.1135694999864e-06, "loss": 0.7732, "step": 1637 }, { "epoch": 0.8374233128834356, "grad_norm": 4.312587738037109, "learning_rate": 7.070120683371462e-06, "loss": 0.808, "step": 1638 }, { "epoch": 0.8379345603271984, "grad_norm": 4.659461498260498, "learning_rate": 7.026794866262048e-06, "loss": 0.8293, "step": 1639 }, { "epoch": 0.8384458077709611, "grad_norm": 4.59619665145874, "learning_rate": 6.983592172792086e-06, "loss": 0.851, "step": 1640 }, { "epoch": 0.838957055214724, "grad_norm": 4.543966293334961, "learning_rate": 6.940512726742715e-06, "loss": 0.8155, "step": 1641 }, { "epoch": 0.8394683026584867, "grad_norm": 4.165318489074707, "learning_rate": 6.897556651542003e-06, "loss": 0.7734, "step": 1642 }, { "epoch": 0.8399795501022495, "grad_norm": 4.547415733337402, "learning_rate": 6.854724070264451e-06, "loss": 0.9181, "step": 1643 }, { "epoch": 0.8404907975460123, "grad_norm": 4.19881010055542, "learning_rate": 6.812015105630842e-06, "loss": 0.7814, "step": 1644 }, { "epoch": 0.841002044989775, "grad_norm": 4.672346115112305, "learning_rate": 6.769429880007705e-06, "loss": 0.6833, "step": 1645 }, { "epoch": 0.8415132924335378, "grad_norm": 4.165514945983887, "learning_rate": 6.7269685154070895e-06, "loss": 0.5298, "step": 1646 }, { "epoch": 0.8420245398773006, "grad_norm": 5.330099105834961, "learning_rate": 6.6846311334861415e-06, "loss": 0.6836, "step": 1647 }, { "epoch": 0.8425357873210634, "grad_norm": 4.8200860023498535, "learning_rate": 6.642417855546768e-06, "loss": 0.7025, "step": 1648 }, { "epoch": 0.8430470347648262, "grad_norm": 5.211686611175537, "learning_rate": 6.600328802535355e-06, "loss": 0.5772, "step": 1649 }, { "epoch": 0.843558282208589, "grad_norm": 5.803190231323242, "learning_rate": 6.558364095042302e-06, "loss": 0.632, "step": 1650 }, { "epoch": 0.8440695296523517, "grad_norm": 2.0162458419799805, "learning_rate": 6.516523853301804e-06, "loss": 0.7469, "step": 1651 }, { "epoch": 0.8445807770961146, "grad_norm": 2.1648364067077637, "learning_rate": 6.474808197191401e-06, "loss": 0.9474, "step": 1652 }, { "epoch": 0.8450920245398773, "grad_norm": 2.424511194229126, "learning_rate": 6.433217246231704e-06, "loss": 0.9164, "step": 1653 }, { "epoch": 0.84560327198364, "grad_norm": 2.5511527061462402, "learning_rate": 6.391751119586003e-06, "loss": 0.9075, "step": 1654 }, { "epoch": 0.8461145194274029, "grad_norm": 2.9107165336608887, "learning_rate": 6.350409936059998e-06, "loss": 0.9481, "step": 1655 }, { "epoch": 0.8466257668711656, "grad_norm": 2.7874770164489746, "learning_rate": 6.3091938141013495e-06, "loss": 0.9119, "step": 1656 }, { "epoch": 0.8471370143149284, "grad_norm": 2.7134737968444824, "learning_rate": 6.268102871799459e-06, "loss": 0.8897, "step": 1657 }, { "epoch": 0.8476482617586912, "grad_norm": 2.6800270080566406, "learning_rate": 6.227137226885027e-06, "loss": 0.8625, "step": 1658 }, { "epoch": 0.848159509202454, "grad_norm": 3.039987564086914, "learning_rate": 6.186296996729796e-06, "loss": 0.9901, "step": 1659 }, { "epoch": 0.8486707566462167, "grad_norm": 2.883843183517456, "learning_rate": 6.145582298346153e-06, "loss": 0.8557, "step": 1660 }, { "epoch": 0.8491820040899796, "grad_norm": 3.040339469909668, "learning_rate": 6.104993248386831e-06, "loss": 0.9266, "step": 1661 }, { "epoch": 0.8496932515337423, "grad_norm": 3.1167523860931396, "learning_rate": 6.064529963144583e-06, "loss": 0.922, "step": 1662 }, { "epoch": 0.8502044989775052, "grad_norm": 3.086543560028076, "learning_rate": 6.024192558551784e-06, "loss": 0.8847, "step": 1663 }, { "epoch": 0.8507157464212679, "grad_norm": 3.4161489009857178, "learning_rate": 5.98398115018019e-06, "loss": 0.8947, "step": 1664 }, { "epoch": 0.8512269938650306, "grad_norm": 3.196218252182007, "learning_rate": 5.943895853240533e-06, "loss": 0.8973, "step": 1665 }, { "epoch": 0.8517382413087935, "grad_norm": 3.152571201324463, "learning_rate": 5.903936782582253e-06, "loss": 0.8851, "step": 1666 }, { "epoch": 0.8522494887525562, "grad_norm": 3.339434862136841, "learning_rate": 5.8641040526930925e-06, "loss": 0.9871, "step": 1667 }, { "epoch": 0.852760736196319, "grad_norm": 3.3059394359588623, "learning_rate": 5.824397777698859e-06, "loss": 0.8634, "step": 1668 }, { "epoch": 0.8532719836400818, "grad_norm": 3.992922306060791, "learning_rate": 5.784818071362996e-06, "loss": 0.9209, "step": 1669 }, { "epoch": 0.8537832310838446, "grad_norm": 3.1529664993286133, "learning_rate": 5.74536504708637e-06, "loss": 0.889, "step": 1670 }, { "epoch": 0.8542944785276073, "grad_norm": 3.0963938236236572, "learning_rate": 5.706038817906845e-06, "loss": 0.8722, "step": 1671 }, { "epoch": 0.8548057259713702, "grad_norm": 3.323939085006714, "learning_rate": 5.666839496499022e-06, "loss": 0.8879, "step": 1672 }, { "epoch": 0.8553169734151329, "grad_norm": 3.5231525897979736, "learning_rate": 5.6277671951738716e-06, "loss": 0.7867, "step": 1673 }, { "epoch": 0.8558282208588958, "grad_norm": 3.041142225265503, "learning_rate": 5.588822025878476e-06, "loss": 0.7922, "step": 1674 }, { "epoch": 0.8563394683026585, "grad_norm": 3.526838779449463, "learning_rate": 5.550004100195639e-06, "loss": 0.9025, "step": 1675 }, { "epoch": 0.8568507157464212, "grad_norm": 3.6863765716552734, "learning_rate": 5.5113135293435815e-06, "loss": 0.9744, "step": 1676 }, { "epoch": 0.8573619631901841, "grad_norm": 3.2295031547546387, "learning_rate": 5.4727504241756874e-06, "loss": 0.8475, "step": 1677 }, { "epoch": 0.8578732106339468, "grad_norm": 3.341581344604492, "learning_rate": 5.434314895180082e-06, "loss": 0.8515, "step": 1678 }, { "epoch": 0.8583844580777096, "grad_norm": 3.328876495361328, "learning_rate": 5.396007052479407e-06, "loss": 0.8321, "step": 1679 }, { "epoch": 0.8588957055214724, "grad_norm": 3.497668743133545, "learning_rate": 5.357827005830435e-06, "loss": 0.8929, "step": 1680 }, { "epoch": 0.8594069529652352, "grad_norm": 3.690748691558838, "learning_rate": 5.319774864623834e-06, "loss": 0.8603, "step": 1681 }, { "epoch": 0.8599182004089979, "grad_norm": 3.709012508392334, "learning_rate": 5.281850737883731e-06, "loss": 0.9677, "step": 1682 }, { "epoch": 0.8604294478527608, "grad_norm": 3.8703484535217285, "learning_rate": 5.2440547342675614e-06, "loss": 0.8876, "step": 1683 }, { "epoch": 0.8609406952965235, "grad_norm": 3.9959239959716797, "learning_rate": 5.206386962065602e-06, "loss": 0.791, "step": 1684 }, { "epoch": 0.8614519427402862, "grad_norm": 3.8929603099823, "learning_rate": 5.168847529200782e-06, "loss": 0.8654, "step": 1685 }, { "epoch": 0.8619631901840491, "grad_norm": 4.002877235412598, "learning_rate": 5.1314365432282904e-06, "loss": 0.8284, "step": 1686 }, { "epoch": 0.8624744376278118, "grad_norm": 4.066626071929932, "learning_rate": 5.094154111335292e-06, "loss": 0.8603, "step": 1687 }, { "epoch": 0.8629856850715747, "grad_norm": 4.232320785522461, "learning_rate": 5.057000340340678e-06, "loss": 0.9426, "step": 1688 }, { "epoch": 0.8634969325153374, "grad_norm": 3.965895175933838, "learning_rate": 5.019975336694649e-06, "loss": 0.7798, "step": 1689 }, { "epoch": 0.8640081799591002, "grad_norm": 4.249181270599365, "learning_rate": 4.983079206478513e-06, "loss": 0.8211, "step": 1690 }, { "epoch": 0.864519427402863, "grad_norm": 4.230957508087158, "learning_rate": 4.946312055404328e-06, "loss": 0.9142, "step": 1691 }, { "epoch": 0.8650306748466258, "grad_norm": 4.479368209838867, "learning_rate": 4.909673988814601e-06, "loss": 0.9803, "step": 1692 }, { "epoch": 0.8655419222903885, "grad_norm": 4.021700859069824, "learning_rate": 4.873165111681993e-06, "loss": 0.7705, "step": 1693 }, { "epoch": 0.8660531697341514, "grad_norm": 4.131351470947266, "learning_rate": 4.836785528609051e-06, "loss": 0.7206, "step": 1694 }, { "epoch": 0.8665644171779141, "grad_norm": 4.708834171295166, "learning_rate": 4.800535343827833e-06, "loss": 0.8698, "step": 1695 }, { "epoch": 0.8670756646216768, "grad_norm": 4.927844047546387, "learning_rate": 4.764414661199707e-06, "loss": 0.7615, "step": 1696 }, { "epoch": 0.8675869120654397, "grad_norm": 5.510052680969238, "learning_rate": 4.728423584214947e-06, "loss": 0.9676, "step": 1697 }, { "epoch": 0.8680981595092024, "grad_norm": 5.557268142700195, "learning_rate": 4.692562215992541e-06, "loss": 0.6631, "step": 1698 }, { "epoch": 0.8686094069529653, "grad_norm": 6.219789028167725, "learning_rate": 4.656830659279804e-06, "loss": 0.8028, "step": 1699 }, { "epoch": 0.869120654396728, "grad_norm": 5.629024028778076, "learning_rate": 4.621229016452156e-06, "loss": 0.4033, "step": 1700 }, { "epoch": 0.8696319018404908, "grad_norm": 1.9563968181610107, "learning_rate": 4.585757389512768e-06, "loss": 0.8627, "step": 1701 }, { "epoch": 0.8701431492842536, "grad_norm": 2.113006591796875, "learning_rate": 4.550415880092313e-06, "loss": 0.8872, "step": 1702 }, { "epoch": 0.8706543967280164, "grad_norm": 2.489034652709961, "learning_rate": 4.515204589448674e-06, "loss": 0.9124, "step": 1703 }, { "epoch": 0.8711656441717791, "grad_norm": 2.6170196533203125, "learning_rate": 4.48012361846662e-06, "loss": 1.0461, "step": 1704 }, { "epoch": 0.871676891615542, "grad_norm": 2.774226188659668, "learning_rate": 4.445173067657554e-06, "loss": 1.0186, "step": 1705 }, { "epoch": 0.8721881390593047, "grad_norm": 2.7719779014587402, "learning_rate": 4.410353037159193e-06, "loss": 0.9851, "step": 1706 }, { "epoch": 0.8726993865030674, "grad_norm": 2.8371808528900146, "learning_rate": 4.3756636267353214e-06, "loss": 1.0079, "step": 1707 }, { "epoch": 0.8732106339468303, "grad_norm": 3.129444122314453, "learning_rate": 4.341104935775442e-06, "loss": 0.8973, "step": 1708 }, { "epoch": 0.873721881390593, "grad_norm": 2.850924491882324, "learning_rate": 4.306677063294573e-06, "loss": 0.918, "step": 1709 }, { "epoch": 0.8742331288343558, "grad_norm": 2.839162826538086, "learning_rate": 4.272380107932888e-06, "loss": 0.9134, "step": 1710 }, { "epoch": 0.8747443762781186, "grad_norm": 3.149712562561035, "learning_rate": 4.238214167955484e-06, "loss": 0.9679, "step": 1711 }, { "epoch": 0.8752556237218814, "grad_norm": 2.945416212081909, "learning_rate": 4.2041793412520734e-06, "loss": 0.8602, "step": 1712 }, { "epoch": 0.8757668711656442, "grad_norm": 3.0397250652313232, "learning_rate": 4.17027572533672e-06, "loss": 0.9156, "step": 1713 }, { "epoch": 0.876278118609407, "grad_norm": 3.256863832473755, "learning_rate": 4.136503417347554e-06, "loss": 0.7461, "step": 1714 }, { "epoch": 0.8767893660531697, "grad_norm": 3.1958820819854736, "learning_rate": 4.102862514046474e-06, "loss": 0.8761, "step": 1715 }, { "epoch": 0.8773006134969326, "grad_norm": 3.4146556854248047, "learning_rate": 4.069353111818913e-06, "loss": 0.9106, "step": 1716 }, { "epoch": 0.8778118609406953, "grad_norm": 3.057286024093628, "learning_rate": 4.035975306673517e-06, "loss": 0.8755, "step": 1717 }, { "epoch": 0.878323108384458, "grad_norm": 3.0085763931274414, "learning_rate": 4.0027291942419055e-06, "loss": 0.7805, "step": 1718 }, { "epoch": 0.8788343558282209, "grad_norm": 3.2898752689361572, "learning_rate": 3.969614869778354e-06, "loss": 0.8877, "step": 1719 }, { "epoch": 0.8793456032719836, "grad_norm": 3.1025335788726807, "learning_rate": 3.936632428159609e-06, "loss": 0.808, "step": 1720 }, { "epoch": 0.8798568507157464, "grad_norm": 3.2930991649627686, "learning_rate": 3.903781963884467e-06, "loss": 0.8736, "step": 1721 }, { "epoch": 0.8803680981595092, "grad_norm": 3.3883445262908936, "learning_rate": 3.871063571073668e-06, "loss": 0.9969, "step": 1722 }, { "epoch": 0.880879345603272, "grad_norm": 3.6220452785491943, "learning_rate": 3.838477343469516e-06, "loss": 0.88, "step": 1723 }, { "epoch": 0.8813905930470347, "grad_norm": 3.937267303466797, "learning_rate": 3.8060233744356633e-06, "loss": 0.9832, "step": 1724 }, { "epoch": 0.8819018404907976, "grad_norm": 3.5126397609710693, "learning_rate": 3.77370175695681e-06, "loss": 0.8513, "step": 1725 }, { "epoch": 0.8824130879345603, "grad_norm": 3.5317177772521973, "learning_rate": 3.74151258363844e-06, "loss": 0.8387, "step": 1726 }, { "epoch": 0.8829243353783232, "grad_norm": 3.232076406478882, "learning_rate": 3.7094559467066083e-06, "loss": 0.8725, "step": 1727 }, { "epoch": 0.8834355828220859, "grad_norm": 3.6095595359802246, "learning_rate": 3.6775319380076e-06, "loss": 0.9068, "step": 1728 }, { "epoch": 0.8839468302658486, "grad_norm": 3.2999720573425293, "learning_rate": 3.645740649007734e-06, "loss": 0.806, "step": 1729 }, { "epoch": 0.8844580777096115, "grad_norm": 3.733455181121826, "learning_rate": 3.614082170793021e-06, "loss": 0.8415, "step": 1730 }, { "epoch": 0.8849693251533742, "grad_norm": 3.5478620529174805, "learning_rate": 3.5825565940690087e-06, "loss": 0.8471, "step": 1731 }, { "epoch": 0.885480572597137, "grad_norm": 3.889519214630127, "learning_rate": 3.551164009160429e-06, "loss": 0.8128, "step": 1732 }, { "epoch": 0.8859918200408998, "grad_norm": 3.7842485904693604, "learning_rate": 3.5199045060110013e-06, "loss": 0.9556, "step": 1733 }, { "epoch": 0.8865030674846626, "grad_norm": 3.5816397666931152, "learning_rate": 3.488778174183116e-06, "loss": 0.8108, "step": 1734 }, { "epoch": 0.8870143149284253, "grad_norm": 3.812117338180542, "learning_rate": 3.4577851028576523e-06, "loss": 0.7997, "step": 1735 }, { "epoch": 0.8875255623721882, "grad_norm": 3.7888643741607666, "learning_rate": 3.4269253808336455e-06, "loss": 0.8451, "step": 1736 }, { "epoch": 0.8880368098159509, "grad_norm": 3.8936595916748047, "learning_rate": 3.3961990965280745e-06, "loss": 0.8154, "step": 1737 }, { "epoch": 0.8885480572597138, "grad_norm": 4.134406566619873, "learning_rate": 3.36560633797563e-06, "loss": 0.8866, "step": 1738 }, { "epoch": 0.8890593047034765, "grad_norm": 4.125121116638184, "learning_rate": 3.335147192828403e-06, "loss": 0.7727, "step": 1739 }, { "epoch": 0.8895705521472392, "grad_norm": 4.411681175231934, "learning_rate": 3.3048217483556744e-06, "loss": 0.8563, "step": 1740 }, { "epoch": 0.8900817995910021, "grad_norm": 3.7549989223480225, "learning_rate": 3.2746300914436534e-06, "loss": 0.7357, "step": 1741 }, { "epoch": 0.8905930470347648, "grad_norm": 4.538980960845947, "learning_rate": 3.2445723085952504e-06, "loss": 0.818, "step": 1742 }, { "epoch": 0.8911042944785276, "grad_norm": 4.389502048492432, "learning_rate": 3.214648485929783e-06, "loss": 0.8799, "step": 1743 }, { "epoch": 0.8916155419222904, "grad_norm": 4.548320293426514, "learning_rate": 3.184858709182775e-06, "loss": 0.6832, "step": 1744 }, { "epoch": 0.8921267893660532, "grad_norm": 4.614025115966797, "learning_rate": 3.1552030637056806e-06, "loss": 0.8305, "step": 1745 }, { "epoch": 0.8926380368098159, "grad_norm": 4.867095470428467, "learning_rate": 3.1256816344656602e-06, "loss": 0.8145, "step": 1746 }, { "epoch": 0.8931492842535788, "grad_norm": 4.788604259490967, "learning_rate": 3.096294506045311e-06, "loss": 0.8077, "step": 1747 }, { "epoch": 0.8936605316973415, "grad_norm": 5.478014945983887, "learning_rate": 3.067041762642475e-06, "loss": 0.771, "step": 1748 }, { "epoch": 0.8941717791411042, "grad_norm": 6.572366237640381, "learning_rate": 3.037923488069927e-06, "loss": 0.7604, "step": 1749 }, { "epoch": 0.8946830265848671, "grad_norm": 5.753463268280029, "learning_rate": 3.0089397657551865e-06, "loss": 0.4072, "step": 1750 }, { "epoch": 0.8951942740286298, "grad_norm": 1.9616332054138184, "learning_rate": 2.9800906787402716e-06, "loss": 0.9257, "step": 1751 }, { "epoch": 0.8957055214723927, "grad_norm": 2.392256259918213, "learning_rate": 2.9513763096814305e-06, "loss": 1.0013, "step": 1752 }, { "epoch": 0.8962167689161554, "grad_norm": 2.3039324283599854, "learning_rate": 2.9227967408489653e-06, "loss": 0.9961, "step": 1753 }, { "epoch": 0.8967280163599182, "grad_norm": 2.534642457962036, "learning_rate": 2.89435205412692e-06, "loss": 0.9289, "step": 1754 }, { "epoch": 0.897239263803681, "grad_norm": 2.941516160964966, "learning_rate": 2.8660423310129135e-06, "loss": 0.957, "step": 1755 }, { "epoch": 0.8977505112474438, "grad_norm": 2.806290626525879, "learning_rate": 2.8378676526178482e-06, "loss": 1.0622, "step": 1756 }, { "epoch": 0.8982617586912065, "grad_norm": 2.6641461849212646, "learning_rate": 2.8098280996657456e-06, "loss": 0.9309, "step": 1757 }, { "epoch": 0.8987730061349694, "grad_norm": 2.6802642345428467, "learning_rate": 2.781923752493437e-06, "loss": 0.8945, "step": 1758 }, { "epoch": 0.8992842535787321, "grad_norm": 2.851447820663452, "learning_rate": 2.754154691050387e-06, "loss": 0.8722, "step": 1759 }, { "epoch": 0.8997955010224948, "grad_norm": 2.958563804626465, "learning_rate": 2.7265209948984514e-06, "loss": 0.8529, "step": 1760 }, { "epoch": 0.9003067484662577, "grad_norm": 2.867089033126831, "learning_rate": 2.6990227432116544e-06, "loss": 0.8957, "step": 1761 }, { "epoch": 0.9008179959100204, "grad_norm": 3.3198704719543457, "learning_rate": 2.671660014775934e-06, "loss": 0.8905, "step": 1762 }, { "epoch": 0.9013292433537833, "grad_norm": 3.0108888149261475, "learning_rate": 2.6444328879889622e-06, "loss": 0.8434, "step": 1763 }, { "epoch": 0.901840490797546, "grad_norm": 2.9155540466308594, "learning_rate": 2.6173414408598827e-06, "loss": 0.8414, "step": 1764 }, { "epoch": 0.9023517382413088, "grad_norm": 3.178889036178589, "learning_rate": 2.5903857510090835e-06, "loss": 0.9461, "step": 1765 }, { "epoch": 0.9028629856850716, "grad_norm": 3.119640588760376, "learning_rate": 2.56356589566803e-06, "loss": 0.8996, "step": 1766 }, { "epoch": 0.9033742331288344, "grad_norm": 3.1831552982330322, "learning_rate": 2.53688195167896e-06, "loss": 0.9139, "step": 1767 }, { "epoch": 0.9038854805725971, "grad_norm": 3.3082258701324463, "learning_rate": 2.5103339954947626e-06, "loss": 0.8465, "step": 1768 }, { "epoch": 0.90439672801636, "grad_norm": 3.1402430534362793, "learning_rate": 2.483922103178632e-06, "loss": 0.9071, "step": 1769 }, { "epoch": 0.9049079754601227, "grad_norm": 3.3881165981292725, "learning_rate": 2.4576463504039913e-06, "loss": 0.9479, "step": 1770 }, { "epoch": 0.9054192229038854, "grad_norm": 3.3849008083343506, "learning_rate": 2.4315068124541597e-06, "loss": 0.8833, "step": 1771 }, { "epoch": 0.9059304703476483, "grad_norm": 3.725773334503174, "learning_rate": 2.4055035642222224e-06, "loss": 0.8946, "step": 1772 }, { "epoch": 0.906441717791411, "grad_norm": 4.276130676269531, "learning_rate": 2.3796366802107394e-06, "loss": 0.9649, "step": 1773 }, { "epoch": 0.9069529652351738, "grad_norm": 3.543367862701416, "learning_rate": 2.3539062345316e-06, "loss": 0.8461, "step": 1774 }, { "epoch": 0.9074642126789366, "grad_norm": 3.076871156692505, "learning_rate": 2.3283123009057607e-06, "loss": 0.8243, "step": 1775 }, { "epoch": 0.9079754601226994, "grad_norm": 3.6242263317108154, "learning_rate": 2.3028549526630583e-06, "loss": 0.8345, "step": 1776 }, { "epoch": 0.9084867075664622, "grad_norm": 3.5291264057159424, "learning_rate": 2.277534262742015e-06, "loss": 0.892, "step": 1777 }, { "epoch": 0.908997955010225, "grad_norm": 3.4097137451171875, "learning_rate": 2.2523503036895764e-06, "loss": 0.8539, "step": 1778 }, { "epoch": 0.9095092024539877, "grad_norm": 3.7175967693328857, "learning_rate": 2.227303147660964e-06, "loss": 0.9083, "step": 1779 }, { "epoch": 0.9100204498977505, "grad_norm": 3.9345943927764893, "learning_rate": 2.202392866419423e-06, "loss": 0.9569, "step": 1780 }, { "epoch": 0.9105316973415133, "grad_norm": 3.6897552013397217, "learning_rate": 2.1776195313360505e-06, "loss": 0.8444, "step": 1781 }, { "epoch": 0.911042944785276, "grad_norm": 3.9733729362487793, "learning_rate": 2.152983213389559e-06, "loss": 0.9426, "step": 1782 }, { "epoch": 0.9115541922290389, "grad_norm": 4.125808238983154, "learning_rate": 2.1284839831661075e-06, "loss": 0.8886, "step": 1783 }, { "epoch": 0.9120654396728016, "grad_norm": 3.4890663623809814, "learning_rate": 2.1041219108590692e-06, "loss": 0.8138, "step": 1784 }, { "epoch": 0.9125766871165644, "grad_norm": 3.9719114303588867, "learning_rate": 2.0798970662688545e-06, "loss": 0.8747, "step": 1785 }, { "epoch": 0.9130879345603272, "grad_norm": 4.040510654449463, "learning_rate": 2.055809518802676e-06, "loss": 1.0, "step": 1786 }, { "epoch": 0.91359918200409, "grad_norm": 4.081876754760742, "learning_rate": 2.031859337474407e-06, "loss": 0.7761, "step": 1787 }, { "epoch": 0.9141104294478528, "grad_norm": 4.227837562561035, "learning_rate": 2.0080465909043113e-06, "loss": 0.8716, "step": 1788 }, { "epoch": 0.9146216768916156, "grad_norm": 4.269420623779297, "learning_rate": 1.984371347318914e-06, "loss": 0.8066, "step": 1789 }, { "epoch": 0.9151329243353783, "grad_norm": 4.166466236114502, "learning_rate": 1.9608336745507716e-06, "loss": 0.802, "step": 1790 }, { "epoch": 0.9156441717791411, "grad_norm": 4.287985801696777, "learning_rate": 1.937433640038261e-06, "loss": 0.7958, "step": 1791 }, { "epoch": 0.9161554192229039, "grad_norm": 4.09846830368042, "learning_rate": 1.914171310825441e-06, "loss": 0.7164, "step": 1792 }, { "epoch": 0.9166666666666666, "grad_norm": 4.546944618225098, "learning_rate": 1.8910467535617983e-06, "loss": 0.7325, "step": 1793 }, { "epoch": 0.9171779141104295, "grad_norm": 4.4474897384643555, "learning_rate": 1.8680600345021171e-06, "loss": 0.7291, "step": 1794 }, { "epoch": 0.9176891615541922, "grad_norm": 4.722184658050537, "learning_rate": 1.845211219506221e-06, "loss": 0.6715, "step": 1795 }, { "epoch": 0.918200408997955, "grad_norm": 5.06058931350708, "learning_rate": 1.8225003740388547e-06, "loss": 0.9135, "step": 1796 }, { "epoch": 0.9187116564417178, "grad_norm": 5.043420791625977, "learning_rate": 1.79992756316944e-06, "loss": 0.7258, "step": 1797 }, { "epoch": 0.9192229038854806, "grad_norm": 5.176082134246826, "learning_rate": 1.7774928515719157e-06, "loss": 0.7435, "step": 1798 }, { "epoch": 0.9197341513292433, "grad_norm": 6.064294815063477, "learning_rate": 1.7551963035245588e-06, "loss": 0.7142, "step": 1799 }, { "epoch": 0.9202453987730062, "grad_norm": 6.77680778503418, "learning_rate": 1.733037982909791e-06, "loss": 0.4713, "step": 1800 }, { "epoch": 0.9207566462167689, "grad_norm": 1.9696067571640015, "learning_rate": 1.7110179532139781e-06, "loss": 0.8423, "step": 1801 }, { "epoch": 0.9212678936605317, "grad_norm": 2.4090499877929688, "learning_rate": 1.6891362775272812e-06, "loss": 0.9596, "step": 1802 }, { "epoch": 0.9217791411042945, "grad_norm": 2.441483974456787, "learning_rate": 1.6673930185434561e-06, "loss": 0.8638, "step": 1803 }, { "epoch": 0.9222903885480572, "grad_norm": 2.631037712097168, "learning_rate": 1.6457882385596646e-06, "loss": 1.0107, "step": 1804 }, { "epoch": 0.9228016359918201, "grad_norm": 2.69331431388855, "learning_rate": 1.6243219994763304e-06, "loss": 0.9641, "step": 1805 }, { "epoch": 0.9233128834355828, "grad_norm": 2.4592180252075195, "learning_rate": 1.6029943627969223e-06, "loss": 0.8723, "step": 1806 }, { "epoch": 0.9238241308793456, "grad_norm": 2.729130744934082, "learning_rate": 1.5818053896278162e-06, "loss": 0.9765, "step": 1807 }, { "epoch": 0.9243353783231084, "grad_norm": 2.6787896156311035, "learning_rate": 1.5607551406780717e-06, "loss": 0.8601, "step": 1808 }, { "epoch": 0.9248466257668712, "grad_norm": 2.9896671772003174, "learning_rate": 1.5398436762593061e-06, "loss": 0.8525, "step": 1809 }, { "epoch": 0.9253578732106339, "grad_norm": 3.0496749877929688, "learning_rate": 1.519071056285487e-06, "loss": 0.9679, "step": 1810 }, { "epoch": 0.9258691206543967, "grad_norm": 2.976200819015503, "learning_rate": 1.4984373402728014e-06, "loss": 0.8789, "step": 1811 }, { "epoch": 0.9263803680981595, "grad_norm": 3.0850539207458496, "learning_rate": 1.4779425873394259e-06, "loss": 0.85, "step": 1812 }, { "epoch": 0.9268916155419223, "grad_norm": 3.117995023727417, "learning_rate": 1.4575868562054228e-06, "loss": 0.8234, "step": 1813 }, { "epoch": 0.9274028629856851, "grad_norm": 2.9029812812805176, "learning_rate": 1.4373702051925065e-06, "loss": 0.9211, "step": 1814 }, { "epoch": 0.9279141104294478, "grad_norm": 3.243557929992676, "learning_rate": 1.4172926922239315e-06, "loss": 0.8936, "step": 1815 }, { "epoch": 0.9284253578732107, "grad_norm": 2.932993173599243, "learning_rate": 1.3973543748243e-06, "loss": 0.8516, "step": 1816 }, { "epoch": 0.9289366053169734, "grad_norm": 3.073636054992676, "learning_rate": 1.377555310119405e-06, "loss": 0.8922, "step": 1817 }, { "epoch": 0.9294478527607362, "grad_norm": 3.3185694217681885, "learning_rate": 1.3578955548360473e-06, "loss": 0.8148, "step": 1818 }, { "epoch": 0.929959100204499, "grad_norm": 3.1685848236083984, "learning_rate": 1.3383751653019029e-06, "loss": 0.9387, "step": 1819 }, { "epoch": 0.9304703476482618, "grad_norm": 3.10244083404541, "learning_rate": 1.31899419744535e-06, "loss": 0.7719, "step": 1820 }, { "epoch": 0.9309815950920245, "grad_norm": 3.1995668411254883, "learning_rate": 1.2997527067952875e-06, "loss": 0.9624, "step": 1821 }, { "epoch": 0.9314928425357873, "grad_norm": 3.6962995529174805, "learning_rate": 1.2806507484810215e-06, "loss": 0.8737, "step": 1822 }, { "epoch": 0.9320040899795501, "grad_norm": 3.9050567150115967, "learning_rate": 1.2616883772320508e-06, "loss": 1.0082, "step": 1823 }, { "epoch": 0.9325153374233128, "grad_norm": 3.417041540145874, "learning_rate": 1.2428656473779721e-06, "loss": 0.8719, "step": 1824 }, { "epoch": 0.9330265848670757, "grad_norm": 3.355666160583496, "learning_rate": 1.2241826128482625e-06, "loss": 0.9281, "step": 1825 }, { "epoch": 0.9335378323108384, "grad_norm": 3.6375527381896973, "learning_rate": 1.20563932717217e-06, "loss": 0.8613, "step": 1826 }, { "epoch": 0.9340490797546013, "grad_norm": 3.6801555156707764, "learning_rate": 1.1872358434785346e-06, "loss": 0.7958, "step": 1827 }, { "epoch": 0.934560327198364, "grad_norm": 3.775987148284912, "learning_rate": 1.1689722144956671e-06, "loss": 0.8864, "step": 1828 }, { "epoch": 0.9350715746421268, "grad_norm": 3.3887085914611816, "learning_rate": 1.1508484925511542e-06, "loss": 0.8428, "step": 1829 }, { "epoch": 0.9355828220858896, "grad_norm": 3.701101064682007, "learning_rate": 1.132864729571731e-06, "loss": 0.8598, "step": 1830 }, { "epoch": 0.9360940695296524, "grad_norm": 3.671170711517334, "learning_rate": 1.1150209770831588e-06, "loss": 0.8273, "step": 1831 }, { "epoch": 0.9366053169734151, "grad_norm": 3.679232120513916, "learning_rate": 1.0973172862100145e-06, "loss": 0.899, "step": 1832 }, { "epoch": 0.9371165644171779, "grad_norm": 3.9870572090148926, "learning_rate": 1.0797537076756127e-06, "loss": 0.9478, "step": 1833 }, { "epoch": 0.9376278118609407, "grad_norm": 3.767942190170288, "learning_rate": 1.0623302918018108e-06, "loss": 0.8134, "step": 1834 }, { "epoch": 0.9381390593047034, "grad_norm": 4.18915319442749, "learning_rate": 1.0450470885088937e-06, "loss": 0.9433, "step": 1835 }, { "epoch": 0.9386503067484663, "grad_norm": 4.106919765472412, "learning_rate": 1.0279041473154116e-06, "loss": 0.8786, "step": 1836 }, { "epoch": 0.939161554192229, "grad_norm": 4.100769519805908, "learning_rate": 1.010901517338042e-06, "loss": 0.8459, "step": 1837 }, { "epoch": 0.9396728016359919, "grad_norm": 3.8171629905700684, "learning_rate": 9.94039247291456e-07, "loss": 0.7617, "step": 1838 }, { "epoch": 0.9401840490797546, "grad_norm": 4.406624794006348, "learning_rate": 9.773173854881913e-07, "loss": 0.7923, "step": 1839 }, { "epoch": 0.9406952965235174, "grad_norm": 4.20837926864624, "learning_rate": 9.607359798384785e-07, "loss": 0.9773, "step": 1840 }, { "epoch": 0.9412065439672802, "grad_norm": 4.484842777252197, "learning_rate": 9.442950778501325e-07, "loss": 0.8711, "step": 1841 }, { "epoch": 0.941717791411043, "grad_norm": 4.719925403594971, "learning_rate": 9.279947266284061e-07, "loss": 0.8392, "step": 1842 }, { "epoch": 0.9422290388548057, "grad_norm": 4.476531982421875, "learning_rate": 9.118349728758468e-07, "loss": 0.767, "step": 1843 }, { "epoch": 0.9427402862985685, "grad_norm": 4.48267126083374, "learning_rate": 8.958158628922019e-07, "loss": 0.6777, "step": 1844 }, { "epoch": 0.9432515337423313, "grad_norm": 4.858860015869141, "learning_rate": 8.799374425742246e-07, "loss": 0.7469, "step": 1845 }, { "epoch": 0.943762781186094, "grad_norm": 4.577967166900635, "learning_rate": 8.641997574155846e-07, "loss": 0.676, "step": 1846 }, { "epoch": 0.9442740286298569, "grad_norm": 5.150705337524414, "learning_rate": 8.486028525067358e-07, "loss": 0.7106, "step": 1847 }, { "epoch": 0.9447852760736196, "grad_norm": 4.760867595672607, "learning_rate": 8.331467725347708e-07, "loss": 0.7528, "step": 1848 }, { "epoch": 0.9452965235173824, "grad_norm": 6.085768222808838, "learning_rate": 8.178315617832999e-07, "loss": 0.784, "step": 1849 }, { "epoch": 0.9458077709611452, "grad_norm": 6.886128902435303, "learning_rate": 8.026572641323393e-07, "loss": 0.5051, "step": 1850 }, { "epoch": 0.946319018404908, "grad_norm": 2.127772808074951, "learning_rate": 7.876239230581506e-07, "loss": 0.9763, "step": 1851 }, { "epoch": 0.9468302658486708, "grad_norm": 2.3633391857147217, "learning_rate": 7.727315816331515e-07, "loss": 1.012, "step": 1852 }, { "epoch": 0.9473415132924335, "grad_norm": 2.347792625427246, "learning_rate": 7.579802825257775e-07, "loss": 0.9064, "step": 1853 }, { "epoch": 0.9478527607361963, "grad_norm": 2.4126203060150146, "learning_rate": 7.43370068000343e-07, "loss": 0.8778, "step": 1854 }, { "epoch": 0.9483640081799591, "grad_norm": 2.6466588973999023, "learning_rate": 7.289009799169688e-07, "loss": 0.9782, "step": 1855 }, { "epoch": 0.9488752556237219, "grad_norm": 2.6971752643585205, "learning_rate": 7.145730597314049e-07, "loss": 0.979, "step": 1856 }, { "epoch": 0.9493865030674846, "grad_norm": 2.6623589992523193, "learning_rate": 7.003863484949413e-07, "loss": 0.9151, "step": 1857 }, { "epoch": 0.9498977505112475, "grad_norm": 2.865541696548462, "learning_rate": 6.86340886854292e-07, "loss": 0.8949, "step": 1858 }, { "epoch": 0.9504089979550102, "grad_norm": 3.0056710243225098, "learning_rate": 6.724367150514777e-07, "loss": 0.8936, "step": 1859 }, { "epoch": 0.950920245398773, "grad_norm": 2.805217742919922, "learning_rate": 6.58673872923693e-07, "loss": 0.831, "step": 1860 }, { "epoch": 0.9514314928425358, "grad_norm": 3.1351966857910156, "learning_rate": 6.450523999032177e-07, "loss": 1.0224, "step": 1861 }, { "epoch": 0.9519427402862985, "grad_norm": 2.93005633354187, "learning_rate": 6.315723350172775e-07, "loss": 0.8546, "step": 1862 }, { "epoch": 0.9524539877300614, "grad_norm": 3.181835651397705, "learning_rate": 6.182337168879671e-07, "loss": 0.903, "step": 1863 }, { "epoch": 0.9529652351738241, "grad_norm": 3.401155948638916, "learning_rate": 6.050365837320992e-07, "loss": 0.9181, "step": 1864 }, { "epoch": 0.9534764826175869, "grad_norm": 3.0578250885009766, "learning_rate": 5.919809733611171e-07, "loss": 0.8978, "step": 1865 }, { "epoch": 0.9539877300613497, "grad_norm": 3.142280101776123, "learning_rate": 5.790669231809875e-07, "loss": 0.9351, "step": 1866 }, { "epoch": 0.9544989775051125, "grad_norm": 3.1850273609161377, "learning_rate": 5.66294470192097e-07, "loss": 0.9621, "step": 1867 }, { "epoch": 0.9550102249488752, "grad_norm": 3.274036169052124, "learning_rate": 5.536636509891225e-07, "loss": 0.8428, "step": 1868 }, { "epoch": 0.9555214723926381, "grad_norm": 3.323293924331665, "learning_rate": 5.411745017609493e-07, "loss": 0.9623, "step": 1869 }, { "epoch": 0.9560327198364008, "grad_norm": 3.296380043029785, "learning_rate": 5.288270582905708e-07, "loss": 0.86, "step": 1870 }, { "epoch": 0.9565439672801636, "grad_norm": 3.7464077472686768, "learning_rate": 5.166213559549549e-07, "loss": 0.8665, "step": 1871 }, { "epoch": 0.9570552147239264, "grad_norm": 3.5171990394592285, "learning_rate": 5.045574297249833e-07, "loss": 0.9063, "step": 1872 }, { "epoch": 0.9575664621676891, "grad_norm": 3.684037685394287, "learning_rate": 4.926353141653184e-07, "loss": 0.9079, "step": 1873 }, { "epoch": 0.9580777096114519, "grad_norm": 3.424814462661743, "learning_rate": 4.80855043434325e-07, "loss": 0.8508, "step": 1874 }, { "epoch": 0.9585889570552147, "grad_norm": 3.3258156776428223, "learning_rate": 4.692166512839491e-07, "loss": 0.8615, "step": 1875 }, { "epoch": 0.9591002044989775, "grad_norm": 3.7672901153564453, "learning_rate": 4.577201710596612e-07, "loss": 0.9087, "step": 1876 }, { "epoch": 0.9596114519427403, "grad_norm": 3.638936996459961, "learning_rate": 4.4636563570031873e-07, "loss": 0.921, "step": 1877 }, { "epoch": 0.9601226993865031, "grad_norm": 3.7481777667999268, "learning_rate": 4.3515307773809855e-07, "loss": 0.8998, "step": 1878 }, { "epoch": 0.9606339468302658, "grad_norm": 4.0111212730407715, "learning_rate": 4.240825292983808e-07, "loss": 0.8379, "step": 1879 }, { "epoch": 0.9611451942740287, "grad_norm": 3.8063735961914062, "learning_rate": 4.131540220996877e-07, "loss": 1.0216, "step": 1880 }, { "epoch": 0.9616564417177914, "grad_norm": 3.818110942840576, "learning_rate": 4.023675874535671e-07, "loss": 0.9189, "step": 1881 }, { "epoch": 0.9621676891615542, "grad_norm": 3.307727098464966, "learning_rate": 3.917232562645035e-07, "loss": 0.7723, "step": 1882 }, { "epoch": 0.962678936605317, "grad_norm": 3.7070207595825195, "learning_rate": 3.812210590298515e-07, "loss": 0.8208, "step": 1883 }, { "epoch": 0.9631901840490797, "grad_norm": 3.999943971633911, "learning_rate": 3.7086102583972494e-07, "loss": 0.9011, "step": 1884 }, { "epoch": 0.9637014314928425, "grad_norm": 3.9105939865112305, "learning_rate": 3.6064318637693e-07, "loss": 0.9319, "step": 1885 }, { "epoch": 0.9642126789366053, "grad_norm": 3.4953696727752686, "learning_rate": 3.505675699168487e-07, "loss": 0.836, "step": 1886 }, { "epoch": 0.9647239263803681, "grad_norm": 4.094799995422363, "learning_rate": 3.406342053274003e-07, "loss": 0.7909, "step": 1887 }, { "epoch": 0.9652351738241309, "grad_norm": 3.863297700881958, "learning_rate": 3.3084312106892446e-07, "loss": 0.8657, "step": 1888 }, { "epoch": 0.9657464212678937, "grad_norm": 4.07185173034668, "learning_rate": 3.211943451941035e-07, "loss": 0.8233, "step": 1889 }, { "epoch": 0.9662576687116564, "grad_norm": 4.35256814956665, "learning_rate": 3.1168790534789605e-07, "loss": 0.9713, "step": 1890 }, { "epoch": 0.9667689161554193, "grad_norm": 3.9577109813690186, "learning_rate": 3.023238287674479e-07, "loss": 0.8875, "step": 1891 }, { "epoch": 0.967280163599182, "grad_norm": 4.198614597320557, "learning_rate": 2.9310214228202013e-07, "loss": 0.7918, "step": 1892 }, { "epoch": 0.9677914110429447, "grad_norm": 4.020992279052734, "learning_rate": 2.840228723129001e-07, "loss": 0.7969, "step": 1893 }, { "epoch": 0.9683026584867076, "grad_norm": 4.857772350311279, "learning_rate": 2.750860448733461e-07, "loss": 0.7989, "step": 1894 }, { "epoch": 0.9688139059304703, "grad_norm": 4.296475887298584, "learning_rate": 2.662916855684816e-07, "loss": 0.7896, "step": 1895 }, { "epoch": 0.9693251533742331, "grad_norm": 4.845669269561768, "learning_rate": 2.5763981959526786e-07, "loss": 0.9473, "step": 1896 }, { "epoch": 0.9698364008179959, "grad_norm": 5.276214599609375, "learning_rate": 2.4913047174237035e-07, "loss": 0.8389, "step": 1897 }, { "epoch": 0.9703476482617587, "grad_norm": 5.026844501495361, "learning_rate": 2.407636663901591e-07, "loss": 0.6934, "step": 1898 }, { "epoch": 0.9708588957055214, "grad_norm": 5.138829231262207, "learning_rate": 2.3253942751056968e-07, "loss": 0.5662, "step": 1899 }, { "epoch": 0.9713701431492843, "grad_norm": 8.76285457611084, "learning_rate": 2.2445777866709205e-07, "loss": 0.8929, "step": 1900 }, { "epoch": 0.971881390593047, "grad_norm": 2.1116087436676025, "learning_rate": 2.1651874301465979e-07, "loss": 0.8929, "step": 1901 }, { "epoch": 0.9723926380368099, "grad_norm": 2.1844208240509033, "learning_rate": 2.087223432996166e-07, "loss": 0.8937, "step": 1902 }, { "epoch": 0.9729038854805726, "grad_norm": 2.286012887954712, "learning_rate": 2.0106860185962194e-07, "loss": 0.8114, "step": 1903 }, { "epoch": 0.9734151329243353, "grad_norm": 2.505859613418579, "learning_rate": 1.935575406236123e-07, "loss": 0.9787, "step": 1904 }, { "epoch": 0.9739263803680982, "grad_norm": 2.708432674407959, "learning_rate": 1.861891811117178e-07, "loss": 0.9853, "step": 1905 }, { "epoch": 0.9744376278118609, "grad_norm": 2.945042610168457, "learning_rate": 1.7896354443521778e-07, "loss": 0.9986, "step": 1906 }, { "epoch": 0.9749488752556237, "grad_norm": 2.5554847717285156, "learning_rate": 1.7188065129647435e-07, "loss": 0.9626, "step": 1907 }, { "epoch": 0.9754601226993865, "grad_norm": 2.838167190551758, "learning_rate": 1.6494052198886555e-07, "loss": 0.9229, "step": 1908 }, { "epoch": 0.9759713701431493, "grad_norm": 2.6479218006134033, "learning_rate": 1.5814317639673005e-07, "loss": 0.8479, "step": 1909 }, { "epoch": 0.976482617586912, "grad_norm": 3.0106728076934814, "learning_rate": 1.5148863399532254e-07, "loss": 0.8836, "step": 1910 }, { "epoch": 0.9769938650306749, "grad_norm": 3.02433705329895, "learning_rate": 1.4497691385074175e-07, "loss": 0.9304, "step": 1911 }, { "epoch": 0.9775051124744376, "grad_norm": 3.085106134414673, "learning_rate": 1.3860803461989146e-07, "loss": 0.8676, "step": 1912 }, { "epoch": 0.9780163599182005, "grad_norm": 2.835423469543457, "learning_rate": 1.3238201455040844e-07, "loss": 0.8435, "step": 1913 }, { "epoch": 0.9785276073619632, "grad_norm": 3.2276833057403564, "learning_rate": 1.2629887148061792e-07, "loss": 0.9043, "step": 1914 }, { "epoch": 0.9790388548057259, "grad_norm": 3.260972261428833, "learning_rate": 1.203586228395004e-07, "loss": 0.9502, "step": 1915 }, { "epoch": 0.9795501022494888, "grad_norm": 3.3064229488372803, "learning_rate": 1.1456128564660273e-07, "loss": 0.9968, "step": 1916 }, { "epoch": 0.9800613496932515, "grad_norm": 3.274178981781006, "learning_rate": 1.0890687651203823e-07, "loss": 0.8302, "step": 1917 }, { "epoch": 0.9805725971370143, "grad_norm": 3.076536178588867, "learning_rate": 1.0339541163639776e-07, "loss": 0.9421, "step": 1918 }, { "epoch": 0.9810838445807771, "grad_norm": 3.247903823852539, "learning_rate": 9.802690681071647e-08, "loss": 0.9819, "step": 1919 }, { "epoch": 0.9815950920245399, "grad_norm": 3.3538260459899902, "learning_rate": 9.280137741643491e-08, "loss": 0.8744, "step": 1920 }, { "epoch": 0.9821063394683026, "grad_norm": 3.515782356262207, "learning_rate": 8.771883842536021e-08, "loss": 0.9124, "step": 1921 }, { "epoch": 0.9826175869120655, "grad_norm": 3.6226806640625, "learning_rate": 8.277930439959946e-08, "loss": 0.9011, "step": 1922 }, { "epoch": 0.9831288343558282, "grad_norm": 3.3394203186035156, "learning_rate": 7.798278949154303e-08, "loss": 0.8316, "step": 1923 }, { "epoch": 0.983640081799591, "grad_norm": 3.246371030807495, "learning_rate": 7.332930744380906e-08, "loss": 0.8556, "step": 1924 }, { "epoch": 0.9841513292433538, "grad_norm": 3.402927875518799, "learning_rate": 6.881887158920464e-08, "loss": 0.7978, "step": 1925 }, { "epoch": 0.9846625766871165, "grad_norm": 3.8112809658050537, "learning_rate": 6.445149485070357e-08, "loss": 0.9133, "step": 1926 }, { "epoch": 0.9851738241308794, "grad_norm": 3.5460119247436523, "learning_rate": 6.022718974137975e-08, "loss": 0.8158, "step": 1927 }, { "epoch": 0.9856850715746421, "grad_norm": 3.341395854949951, "learning_rate": 5.614596836440722e-08, "loss": 0.8246, "step": 1928 }, { "epoch": 0.9861963190184049, "grad_norm": 3.6873090267181396, "learning_rate": 5.2207842412999034e-08, "loss": 0.8714, "step": 1929 }, { "epoch": 0.9867075664621677, "grad_norm": 3.4815688133239746, "learning_rate": 4.841282317037399e-08, "loss": 0.8948, "step": 1930 }, { "epoch": 0.9872188139059305, "grad_norm": 3.5316038131713867, "learning_rate": 4.476092150975109e-08, "loss": 0.8622, "step": 1931 }, { "epoch": 0.9877300613496932, "grad_norm": 3.5975794792175293, "learning_rate": 4.1252147894277336e-08, "loss": 0.881, "step": 1932 }, { "epoch": 0.9882413087934561, "grad_norm": 3.441171646118164, "learning_rate": 3.7886512377033334e-08, "loss": 0.8396, "step": 1933 }, { "epoch": 0.9887525562372188, "grad_norm": 3.8511383533477783, "learning_rate": 3.4664024600988835e-08, "loss": 0.9208, "step": 1934 }, { "epoch": 0.9892638036809815, "grad_norm": 3.8687822818756104, "learning_rate": 3.158469379898055e-08, "loss": 0.9135, "step": 1935 }, { "epoch": 0.9897750511247444, "grad_norm": 3.593276023864746, "learning_rate": 2.8648528793673302e-08, "loss": 0.8474, "step": 1936 }, { "epoch": 0.9902862985685071, "grad_norm": 4.0986127853393555, "learning_rate": 2.5855537997548917e-08, "loss": 0.8883, "step": 1937 }, { "epoch": 0.99079754601227, "grad_norm": 4.03285551071167, "learning_rate": 2.3205729412884016e-08, "loss": 0.7779, "step": 1938 }, { "epoch": 0.9913087934560327, "grad_norm": 4.346153736114502, "learning_rate": 2.0699110631711148e-08, "loss": 0.8757, "step": 1939 }, { "epoch": 0.9918200408997955, "grad_norm": 4.283609390258789, "learning_rate": 1.8335688835802167e-08, "loss": 0.8173, "step": 1940 }, { "epoch": 0.9923312883435583, "grad_norm": 4.301876068115234, "learning_rate": 1.6115470796662647e-08, "loss": 0.9134, "step": 1941 }, { "epoch": 0.9928425357873211, "grad_norm": 4.885223865509033, "learning_rate": 1.4038462875504143e-08, "loss": 0.8289, "step": 1942 }, { "epoch": 0.9933537832310838, "grad_norm": 4.63042688369751, "learning_rate": 1.2104671023199787e-08, "loss": 0.8625, "step": 1943 }, { "epoch": 0.9938650306748467, "grad_norm": 4.702084064483643, "learning_rate": 1.0314100780317581e-08, "loss": 0.9342, "step": 1944 }, { "epoch": 0.9943762781186094, "grad_norm": 4.4585771560668945, "learning_rate": 8.666757277064897e-09, "loss": 0.6828, "step": 1945 }, { "epoch": 0.9948875255623721, "grad_norm": 4.869369029998779, "learning_rate": 7.162645233282916e-09, "loss": 0.8505, "step": 1946 }, { "epoch": 0.995398773006135, "grad_norm": 4.623004913330078, "learning_rate": 5.8017689584521915e-09, "loss": 0.6772, "step": 1947 }, { "epoch": 0.9959100204498977, "grad_norm": 5.718740940093994, "learning_rate": 4.584132351642678e-09, "loss": 0.8251, "step": 1948 }, { "epoch": 0.9964212678936605, "grad_norm": 5.196649551391602, "learning_rate": 3.509738901547044e-09, "loss": 0.6039, "step": 1949 }, { "epoch": 0.9969325153374233, "grad_norm": 6.253082752227783, "learning_rate": 2.5785916864307092e-09, "loss": 0.5829, "step": 1950 }, { "epoch": 0.9974437627811861, "grad_norm": 2.393533229827881, "learning_rate": 1.7906933741484999e-09, "loss": 0.869, "step": 1951 }, { "epoch": 0.9979550102249489, "grad_norm": 2.994673728942871, "learning_rate": 1.1460462221279944e-09, "loss": 0.9103, "step": 1952 }, { "epoch": 0.9984662576687117, "grad_norm": 3.5846893787384033, "learning_rate": 6.446520773695231e-10, "loss": 0.8366, "step": 1953 }, { "epoch": 0.9989775051124744, "grad_norm": 3.5298497676849365, "learning_rate": 2.8651237642396414e-10, "loss": 0.7694, "step": 1954 }, { "epoch": 0.9994887525562373, "grad_norm": 3.686530828475952, "learning_rate": 7.162814541494811e-11, "loss": 0.67, "step": 1955 }, { "epoch": 1.0, "grad_norm": 5.693774223327637, "learning_rate": 0.0, "loss": 0.7087, "step": 1956 } ], "logging_steps": 1, "max_steps": 1956, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.478583213010452e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }