{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999754305791012, "eval_steps": 1000, "global_step": 10175, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000294833050784993, "grad_norm": 351.9773864746094, "learning_rate": 4.715127701375246e-07, "loss": 174.741, "num_input_tokens_seen": 147204, "step": 3 }, { "epoch": 0.000589666101569986, "grad_norm": 335.4129333496094, "learning_rate": 9.430255402750492e-07, "loss": 172.9366, "num_input_tokens_seen": 298172, "step": 6 }, { "epoch": 0.000884499152354979, "grad_norm": 221.5324249267578, "learning_rate": 1.4145383104125737e-06, "loss": 169.1322, "num_input_tokens_seen": 438208, "step": 9 }, { "epoch": 0.001179332203139972, "grad_norm": 166.86729431152344, "learning_rate": 1.8860510805500984e-06, "loss": 165.9725, "num_input_tokens_seen": 598448, "step": 12 }, { "epoch": 0.001474165253924965, "grad_norm": 115.85004425048828, "learning_rate": 2.357563850687623e-06, "loss": 164.239, "num_input_tokens_seen": 761840, "step": 15 }, { "epoch": 0.001768998304709958, "grad_norm": 92.04082489013672, "learning_rate": 2.8290766208251474e-06, "loss": 162.3242, "num_input_tokens_seen": 913844, "step": 18 }, { "epoch": 0.002063831355494951, "grad_norm": 81.15711212158203, "learning_rate": 3.3005893909626725e-06, "loss": 161.3536, "num_input_tokens_seen": 1079924, "step": 21 }, { "epoch": 0.002358664406279944, "grad_norm": 80.38558959960938, "learning_rate": 3.7721021611001968e-06, "loss": 160.6213, "num_input_tokens_seen": 1217660, "step": 24 }, { "epoch": 0.002653497457064937, "grad_norm": 73.85279083251953, "learning_rate": 4.243614931237721e-06, "loss": 159.9705, "num_input_tokens_seen": 1371872, "step": 27 }, { "epoch": 0.00294833050784993, "grad_norm": 64.86689758300781, "learning_rate": 4.715127701375246e-06, "loss": 159.3978, "num_input_tokens_seen": 1552480, "step": 30 }, { "epoch": 0.003243163558634923, "grad_norm": 64.5094223022461, "learning_rate": 5.1866404715127704e-06, "loss": 159.1339, "num_input_tokens_seen": 1705784, "step": 33 }, { "epoch": 0.003537996609419916, "grad_norm": 58.84899139404297, "learning_rate": 5.658153241650295e-06, "loss": 158.6619, "num_input_tokens_seen": 1866676, "step": 36 }, { "epoch": 0.003832829660204909, "grad_norm": 68.10957336425781, "learning_rate": 6.129666011787819e-06, "loss": 157.7221, "num_input_tokens_seen": 2025592, "step": 39 }, { "epoch": 0.004127662710989902, "grad_norm": 72.53784942626953, "learning_rate": 6.601178781925345e-06, "loss": 157.304, "num_input_tokens_seen": 2184160, "step": 42 }, { "epoch": 0.004422495761774895, "grad_norm": 110.91462707519531, "learning_rate": 7.072691552062869e-06, "loss": 155.9637, "num_input_tokens_seen": 2315356, "step": 45 }, { "epoch": 0.004717328812559888, "grad_norm": 89.97759246826172, "learning_rate": 7.5442043222003935e-06, "loss": 155.4095, "num_input_tokens_seen": 2466780, "step": 48 }, { "epoch": 0.005012161863344881, "grad_norm": 78.38220977783203, "learning_rate": 8.015717092337918e-06, "loss": 154.4634, "num_input_tokens_seen": 2629552, "step": 51 }, { "epoch": 0.005306994914129874, "grad_norm": 106.3893814086914, "learning_rate": 8.487229862475442e-06, "loss": 154.4811, "num_input_tokens_seen": 2762900, "step": 54 }, { "epoch": 0.005601827964914867, "grad_norm": 60.77946472167969, "learning_rate": 8.958742632612968e-06, "loss": 153.6777, "num_input_tokens_seen": 2910676, "step": 57 }, { "epoch": 0.00589666101569986, "grad_norm": 74.85725402832031, "learning_rate": 9.430255402750492e-06, "loss": 153.2267, "num_input_tokens_seen": 3068036, "step": 60 }, { "epoch": 0.006191494066484853, "grad_norm": 108.25921630859375, "learning_rate": 9.901768172888017e-06, "loss": 152.5115, "num_input_tokens_seen": 3225588, "step": 63 }, { "epoch": 0.006486327117269846, "grad_norm": 64.9744644165039, "learning_rate": 1.0373280943025541e-05, "loss": 151.9292, "num_input_tokens_seen": 3366520, "step": 66 }, { "epoch": 0.006781160168054839, "grad_norm": 70.4972152709961, "learning_rate": 1.0844793713163067e-05, "loss": 151.1674, "num_input_tokens_seen": 3544740, "step": 69 }, { "epoch": 0.007075993218839832, "grad_norm": 96.35140991210938, "learning_rate": 1.131630648330059e-05, "loss": 151.7663, "num_input_tokens_seen": 3691996, "step": 72 }, { "epoch": 0.007370826269624825, "grad_norm": 76.31684112548828, "learning_rate": 1.1787819253438115e-05, "loss": 150.7677, "num_input_tokens_seen": 3840260, "step": 75 }, { "epoch": 0.007665659320409818, "grad_norm": 107.214111328125, "learning_rate": 1.2259332023575638e-05, "loss": 149.475, "num_input_tokens_seen": 3989812, "step": 78 }, { "epoch": 0.00796049237119481, "grad_norm": 58.428550720214844, "learning_rate": 1.2730844793713164e-05, "loss": 149.771, "num_input_tokens_seen": 4128548, "step": 81 }, { "epoch": 0.008255325421979804, "grad_norm": 107.3379898071289, "learning_rate": 1.320235756385069e-05, "loss": 148.0709, "num_input_tokens_seen": 4291808, "step": 84 }, { "epoch": 0.008550158472764796, "grad_norm": 79.58226013183594, "learning_rate": 1.3673870333988213e-05, "loss": 147.9338, "num_input_tokens_seen": 4450936, "step": 87 }, { "epoch": 0.00884499152354979, "grad_norm": 90.84200286865234, "learning_rate": 1.4145383104125738e-05, "loss": 147.3218, "num_input_tokens_seen": 4606692, "step": 90 }, { "epoch": 0.009139824574334782, "grad_norm": 82.57324981689453, "learning_rate": 1.4616895874263261e-05, "loss": 146.2605, "num_input_tokens_seen": 4752792, "step": 93 }, { "epoch": 0.009434657625119776, "grad_norm": 127.30277252197266, "learning_rate": 1.5088408644400787e-05, "loss": 146.2332, "num_input_tokens_seen": 4910364, "step": 96 }, { "epoch": 0.009729490675904768, "grad_norm": 79.93926239013672, "learning_rate": 1.555992141453831e-05, "loss": 145.666, "num_input_tokens_seen": 5052164, "step": 99 }, { "epoch": 0.010024323726689762, "grad_norm": 82.26026916503906, "learning_rate": 1.6031434184675836e-05, "loss": 144.4902, "num_input_tokens_seen": 5174080, "step": 102 }, { "epoch": 0.010319156777474754, "grad_norm": 63.15846252441406, "learning_rate": 1.650294695481336e-05, "loss": 143.5527, "num_input_tokens_seen": 5320224, "step": 105 }, { "epoch": 0.010613989828259748, "grad_norm": 57.38979721069336, "learning_rate": 1.6974459724950884e-05, "loss": 143.0506, "num_input_tokens_seen": 5469368, "step": 108 }, { "epoch": 0.01090882287904474, "grad_norm": 47.44795227050781, "learning_rate": 1.7445972495088412e-05, "loss": 142.9269, "num_input_tokens_seen": 5613288, "step": 111 }, { "epoch": 0.011203655929829734, "grad_norm": 46.905216217041016, "learning_rate": 1.7917485265225936e-05, "loss": 142.9721, "num_input_tokens_seen": 5754360, "step": 114 }, { "epoch": 0.011498488980614728, "grad_norm": 79.62482452392578, "learning_rate": 1.838899803536346e-05, "loss": 141.9862, "num_input_tokens_seen": 5928136, "step": 117 }, { "epoch": 0.01179332203139972, "grad_norm": 54.01189422607422, "learning_rate": 1.8860510805500985e-05, "loss": 140.0655, "num_input_tokens_seen": 6077916, "step": 120 }, { "epoch": 0.012088155082184714, "grad_norm": 68.40033721923828, "learning_rate": 1.933202357563851e-05, "loss": 140.2441, "num_input_tokens_seen": 6223140, "step": 123 }, { "epoch": 0.012382988132969706, "grad_norm": 43.50001907348633, "learning_rate": 1.9803536345776033e-05, "loss": 139.027, "num_input_tokens_seen": 6384480, "step": 126 }, { "epoch": 0.0126778211837547, "grad_norm": 80.14400482177734, "learning_rate": 2.0275049115913557e-05, "loss": 138.4401, "num_input_tokens_seen": 6547396, "step": 129 }, { "epoch": 0.012972654234539692, "grad_norm": 44.20638656616211, "learning_rate": 2.0746561886051082e-05, "loss": 138.0098, "num_input_tokens_seen": 6690760, "step": 132 }, { "epoch": 0.013267487285324685, "grad_norm": 55.782615661621094, "learning_rate": 2.1218074656188606e-05, "loss": 136.31, "num_input_tokens_seen": 6834084, "step": 135 }, { "epoch": 0.013562320336109678, "grad_norm": 54.36646270751953, "learning_rate": 2.1689587426326134e-05, "loss": 134.8045, "num_input_tokens_seen": 6981456, "step": 138 }, { "epoch": 0.013857153386894671, "grad_norm": 48.38446807861328, "learning_rate": 2.2161100196463658e-05, "loss": 135.7917, "num_input_tokens_seen": 7150592, "step": 141 }, { "epoch": 0.014151986437679663, "grad_norm": 74.71308898925781, "learning_rate": 2.263261296660118e-05, "loss": 133.7626, "num_input_tokens_seen": 7305448, "step": 144 }, { "epoch": 0.014446819488464657, "grad_norm": 60.53500747680664, "learning_rate": 2.3104125736738707e-05, "loss": 134.0327, "num_input_tokens_seen": 7470308, "step": 147 }, { "epoch": 0.01474165253924965, "grad_norm": 59.86284255981445, "learning_rate": 2.357563850687623e-05, "loss": 132.9785, "num_input_tokens_seen": 7624272, "step": 150 }, { "epoch": 0.015036485590034643, "grad_norm": 43.723575592041016, "learning_rate": 2.4047151277013755e-05, "loss": 132.5055, "num_input_tokens_seen": 7765928, "step": 153 }, { "epoch": 0.015331318640819635, "grad_norm": 48.423545837402344, "learning_rate": 2.4518664047151276e-05, "loss": 131.7431, "num_input_tokens_seen": 7930544, "step": 156 }, { "epoch": 0.01562615169160463, "grad_norm": 50.08180236816406, "learning_rate": 2.4990176817288804e-05, "loss": 130.7851, "num_input_tokens_seen": 8090808, "step": 159 }, { "epoch": 0.01592098474238962, "grad_norm": 36.15312576293945, "learning_rate": 2.5461689587426328e-05, "loss": 129.9209, "num_input_tokens_seen": 8243468, "step": 162 }, { "epoch": 0.016215817793174613, "grad_norm": 51.09037780761719, "learning_rate": 2.5933202357563852e-05, "loss": 128.3312, "num_input_tokens_seen": 8391768, "step": 165 }, { "epoch": 0.01651065084395961, "grad_norm": 42.25214767456055, "learning_rate": 2.640471512770138e-05, "loss": 127.7597, "num_input_tokens_seen": 8561392, "step": 168 }, { "epoch": 0.0168054838947446, "grad_norm": 33.23691177368164, "learning_rate": 2.68762278978389e-05, "loss": 127.3331, "num_input_tokens_seen": 8715268, "step": 171 }, { "epoch": 0.017100316945529593, "grad_norm": 38.06812286376953, "learning_rate": 2.7347740667976425e-05, "loss": 126.6195, "num_input_tokens_seen": 8863028, "step": 174 }, { "epoch": 0.01739514999631459, "grad_norm": 32.519142150878906, "learning_rate": 2.7819253438113953e-05, "loss": 126.7838, "num_input_tokens_seen": 9009124, "step": 177 }, { "epoch": 0.01768998304709958, "grad_norm": 49.16916275024414, "learning_rate": 2.8290766208251477e-05, "loss": 126.2718, "num_input_tokens_seen": 9173364, "step": 180 }, { "epoch": 0.017984816097884573, "grad_norm": 39.809391021728516, "learning_rate": 2.8762278978389e-05, "loss": 124.7026, "num_input_tokens_seen": 9320500, "step": 183 }, { "epoch": 0.018279649148669565, "grad_norm": 37.8610725402832, "learning_rate": 2.9233791748526522e-05, "loss": 124.2418, "num_input_tokens_seen": 9497040, "step": 186 }, { "epoch": 0.01857448219945456, "grad_norm": 28.63737678527832, "learning_rate": 2.970530451866405e-05, "loss": 125.1306, "num_input_tokens_seen": 9651548, "step": 189 }, { "epoch": 0.018869315250239552, "grad_norm": 29.44608497619629, "learning_rate": 3.0176817288801574e-05, "loss": 123.0679, "num_input_tokens_seen": 9811940, "step": 192 }, { "epoch": 0.019164148301024544, "grad_norm": 28.310522079467773, "learning_rate": 3.06483300589391e-05, "loss": 122.1647, "num_input_tokens_seen": 9965988, "step": 195 }, { "epoch": 0.019458981351809537, "grad_norm": 36.38974380493164, "learning_rate": 3.111984282907662e-05, "loss": 122.0278, "num_input_tokens_seen": 10116488, "step": 198 }, { "epoch": 0.019753814402594532, "grad_norm": 45.70870590209961, "learning_rate": 3.159135559921415e-05, "loss": 120.921, "num_input_tokens_seen": 10252748, "step": 201 }, { "epoch": 0.020048647453379524, "grad_norm": 30.969467163085938, "learning_rate": 3.206286836935167e-05, "loss": 122.1355, "num_input_tokens_seen": 10421036, "step": 204 }, { "epoch": 0.020343480504164516, "grad_norm": 29.877321243286133, "learning_rate": 3.2534381139489195e-05, "loss": 121.7966, "num_input_tokens_seen": 10580420, "step": 207 }, { "epoch": 0.02063831355494951, "grad_norm": 29.29607582092285, "learning_rate": 3.300589390962672e-05, "loss": 120.6734, "num_input_tokens_seen": 10721712, "step": 210 }, { "epoch": 0.020933146605734504, "grad_norm": 31.24442481994629, "learning_rate": 3.3477406679764244e-05, "loss": 120.3414, "num_input_tokens_seen": 10887044, "step": 213 }, { "epoch": 0.021227979656519496, "grad_norm": 28.847200393676758, "learning_rate": 3.394891944990177e-05, "loss": 121.1287, "num_input_tokens_seen": 11071896, "step": 216 }, { "epoch": 0.021522812707304488, "grad_norm": 27.561906814575195, "learning_rate": 3.44204322200393e-05, "loss": 121.9215, "num_input_tokens_seen": 11225176, "step": 219 }, { "epoch": 0.02181764575808948, "grad_norm": 25.985300064086914, "learning_rate": 3.4891944990176824e-05, "loss": 118.9775, "num_input_tokens_seen": 11372232, "step": 222 }, { "epoch": 0.022112478808874476, "grad_norm": 34.018558502197266, "learning_rate": 3.536345776031434e-05, "loss": 121.1285, "num_input_tokens_seen": 11494888, "step": 225 }, { "epoch": 0.022407311859659468, "grad_norm": 33.21733856201172, "learning_rate": 3.583497053045187e-05, "loss": 118.8301, "num_input_tokens_seen": 11644560, "step": 228 }, { "epoch": 0.02270214491044446, "grad_norm": 28.910791397094727, "learning_rate": 3.6306483300589396e-05, "loss": 120.0619, "num_input_tokens_seen": 11796652, "step": 231 }, { "epoch": 0.022996977961229455, "grad_norm": 32.809871673583984, "learning_rate": 3.677799607072692e-05, "loss": 117.1904, "num_input_tokens_seen": 11938880, "step": 234 }, { "epoch": 0.023291811012014448, "grad_norm": 32.05815887451172, "learning_rate": 3.724950884086444e-05, "loss": 118.6215, "num_input_tokens_seen": 12077780, "step": 237 }, { "epoch": 0.02358664406279944, "grad_norm": 38.406951904296875, "learning_rate": 3.772102161100197e-05, "loss": 119.2037, "num_input_tokens_seen": 12230832, "step": 240 }, { "epoch": 0.02388147711358443, "grad_norm": 33.12141036987305, "learning_rate": 3.8192534381139494e-05, "loss": 120.1299, "num_input_tokens_seen": 12365416, "step": 243 }, { "epoch": 0.024176310164369427, "grad_norm": 35.890018463134766, "learning_rate": 3.866404715127702e-05, "loss": 119.1851, "num_input_tokens_seen": 12527200, "step": 246 }, { "epoch": 0.02447114321515442, "grad_norm": 32.43567657470703, "learning_rate": 3.913555992141454e-05, "loss": 120.2267, "num_input_tokens_seen": 12662252, "step": 249 }, { "epoch": 0.02476597626593941, "grad_norm": 27.831727981567383, "learning_rate": 3.9607072691552066e-05, "loss": 117.902, "num_input_tokens_seen": 12817284, "step": 252 }, { "epoch": 0.025060809316724404, "grad_norm": 35.014984130859375, "learning_rate": 4.007858546168959e-05, "loss": 118.5048, "num_input_tokens_seen": 12981920, "step": 255 }, { "epoch": 0.0253556423675094, "grad_norm": 31.94507598876953, "learning_rate": 4.0550098231827115e-05, "loss": 116.9797, "num_input_tokens_seen": 13146228, "step": 258 }, { "epoch": 0.02565047541829439, "grad_norm": 34.79327392578125, "learning_rate": 4.102161100196464e-05, "loss": 118.0679, "num_input_tokens_seen": 13299160, "step": 261 }, { "epoch": 0.025945308469079383, "grad_norm": 27.8881778717041, "learning_rate": 4.1493123772102163e-05, "loss": 118.1522, "num_input_tokens_seen": 13430784, "step": 264 }, { "epoch": 0.026240141519864375, "grad_norm": 35.07295227050781, "learning_rate": 4.1964636542239695e-05, "loss": 117.0864, "num_input_tokens_seen": 13568028, "step": 267 }, { "epoch": 0.02653497457064937, "grad_norm": 30.21860694885254, "learning_rate": 4.243614931237721e-05, "loss": 115.8842, "num_input_tokens_seen": 13736356, "step": 270 }, { "epoch": 0.026829807621434363, "grad_norm": 30.91063117980957, "learning_rate": 4.2907662082514736e-05, "loss": 118.1064, "num_input_tokens_seen": 13892812, "step": 273 }, { "epoch": 0.027124640672219355, "grad_norm": 28.20920753479004, "learning_rate": 4.337917485265227e-05, "loss": 116.0458, "num_input_tokens_seen": 14034716, "step": 276 }, { "epoch": 0.027419473723004347, "grad_norm": 27.55144691467285, "learning_rate": 4.3850687622789785e-05, "loss": 117.1214, "num_input_tokens_seen": 14161152, "step": 279 }, { "epoch": 0.027714306773789343, "grad_norm": 25.209985733032227, "learning_rate": 4.4322200392927316e-05, "loss": 115.2847, "num_input_tokens_seen": 14311492, "step": 282 }, { "epoch": 0.028009139824574335, "grad_norm": 31.782129287719727, "learning_rate": 4.479371316306484e-05, "loss": 116.3986, "num_input_tokens_seen": 14468124, "step": 285 }, { "epoch": 0.028303972875359327, "grad_norm": 33.29216003417969, "learning_rate": 4.526522593320236e-05, "loss": 117.0832, "num_input_tokens_seen": 14621272, "step": 288 }, { "epoch": 0.028598805926144322, "grad_norm": 34.79197311401367, "learning_rate": 4.573673870333989e-05, "loss": 115.3495, "num_input_tokens_seen": 14758068, "step": 291 }, { "epoch": 0.028893638976929314, "grad_norm": 29.354154586791992, "learning_rate": 4.620825147347741e-05, "loss": 115.9104, "num_input_tokens_seen": 14908960, "step": 294 }, { "epoch": 0.029188472027714307, "grad_norm": 27.28232192993164, "learning_rate": 4.667976424361493e-05, "loss": 115.7575, "num_input_tokens_seen": 15053712, "step": 297 }, { "epoch": 0.0294833050784993, "grad_norm": 36.298770904541016, "learning_rate": 4.715127701375246e-05, "loss": 114.2888, "num_input_tokens_seen": 15211664, "step": 300 }, { "epoch": 0.029778138129284294, "grad_norm": 34.47058868408203, "learning_rate": 4.762278978388998e-05, "loss": 113.5061, "num_input_tokens_seen": 15359696, "step": 303 }, { "epoch": 0.030072971180069286, "grad_norm": 36.17288589477539, "learning_rate": 4.809430255402751e-05, "loss": 114.0696, "num_input_tokens_seen": 15519284, "step": 306 }, { "epoch": 0.03036780423085428, "grad_norm": 36.91202163696289, "learning_rate": 4.8565815324165034e-05, "loss": 115.5098, "num_input_tokens_seen": 15675800, "step": 309 }, { "epoch": 0.03066263728163927, "grad_norm": 47.51340103149414, "learning_rate": 4.903732809430255e-05, "loss": 115.061, "num_input_tokens_seen": 15828156, "step": 312 }, { "epoch": 0.030957470332424266, "grad_norm": 32.856422424316406, "learning_rate": 4.950884086444008e-05, "loss": 116.2509, "num_input_tokens_seen": 15994660, "step": 315 }, { "epoch": 0.03125230338320926, "grad_norm": 35.165409088134766, "learning_rate": 4.998035363457761e-05, "loss": 115.2751, "num_input_tokens_seen": 16143304, "step": 318 }, { "epoch": 0.031547136433994254, "grad_norm": 33.481590270996094, "learning_rate": 5.045186640471513e-05, "loss": 116.1703, "num_input_tokens_seen": 16291300, "step": 321 }, { "epoch": 0.03184196948477924, "grad_norm": 30.144989013671875, "learning_rate": 5.0923379174852656e-05, "loss": 113.4856, "num_input_tokens_seen": 16469844, "step": 324 }, { "epoch": 0.03213680253556424, "grad_norm": 26.71873664855957, "learning_rate": 5.139489194499019e-05, "loss": 115.6015, "num_input_tokens_seen": 16605332, "step": 327 }, { "epoch": 0.032431635586349226, "grad_norm": 28.083377838134766, "learning_rate": 5.1866404715127704e-05, "loss": 112.2329, "num_input_tokens_seen": 16737376, "step": 330 }, { "epoch": 0.03272646863713422, "grad_norm": 31.44344711303711, "learning_rate": 5.233791748526523e-05, "loss": 113.1789, "num_input_tokens_seen": 16891036, "step": 333 }, { "epoch": 0.03302130168791922, "grad_norm": 31.328773498535156, "learning_rate": 5.280943025540276e-05, "loss": 113.8001, "num_input_tokens_seen": 17041200, "step": 336 }, { "epoch": 0.033316134738704206, "grad_norm": 26.61764144897461, "learning_rate": 5.328094302554028e-05, "loss": 113.1791, "num_input_tokens_seen": 17200492, "step": 339 }, { "epoch": 0.0336109677894892, "grad_norm": 30.37028694152832, "learning_rate": 5.37524557956778e-05, "loss": 114.1511, "num_input_tokens_seen": 17349404, "step": 342 }, { "epoch": 0.0339058008402742, "grad_norm": 31.63671875, "learning_rate": 5.422396856581533e-05, "loss": 112.5117, "num_input_tokens_seen": 17497608, "step": 345 }, { "epoch": 0.034200633891059186, "grad_norm": 34.88505935668945, "learning_rate": 5.469548133595285e-05, "loss": 113.4745, "num_input_tokens_seen": 17630548, "step": 348 }, { "epoch": 0.03449546694184418, "grad_norm": 34.04216003417969, "learning_rate": 5.516699410609038e-05, "loss": 114.2201, "num_input_tokens_seen": 17770232, "step": 351 }, { "epoch": 0.03479029999262918, "grad_norm": 31.125619888305664, "learning_rate": 5.5638506876227905e-05, "loss": 113.5508, "num_input_tokens_seen": 17952832, "step": 354 }, { "epoch": 0.035085133043414166, "grad_norm": 33.099971771240234, "learning_rate": 5.611001964636542e-05, "loss": 112.3604, "num_input_tokens_seen": 18087932, "step": 357 }, { "epoch": 0.03537996609419916, "grad_norm": 27.962017059326172, "learning_rate": 5.6581532416502954e-05, "loss": 111.5293, "num_input_tokens_seen": 18249968, "step": 360 }, { "epoch": 0.03567479914498415, "grad_norm": 34.95933532714844, "learning_rate": 5.705304518664047e-05, "loss": 112.3723, "num_input_tokens_seen": 18420280, "step": 363 }, { "epoch": 0.035969632195769145, "grad_norm": 30.143390655517578, "learning_rate": 5.7524557956778e-05, "loss": 112.1157, "num_input_tokens_seen": 18578408, "step": 366 }, { "epoch": 0.03626446524655414, "grad_norm": 33.200233459472656, "learning_rate": 5.799607072691553e-05, "loss": 111.7586, "num_input_tokens_seen": 18710576, "step": 369 }, { "epoch": 0.03655929829733913, "grad_norm": 36.428489685058594, "learning_rate": 5.8467583497053044e-05, "loss": 112.8608, "num_input_tokens_seen": 18881144, "step": 372 }, { "epoch": 0.036854131348124125, "grad_norm": 202.75164794921875, "learning_rate": 5.8939096267190575e-05, "loss": 111.489, "num_input_tokens_seen": 19037724, "step": 375 }, { "epoch": 0.03714896439890912, "grad_norm": 39.6142578125, "learning_rate": 5.94106090373281e-05, "loss": 113.2192, "num_input_tokens_seen": 19184996, "step": 378 }, { "epoch": 0.03744379744969411, "grad_norm": 37.47132110595703, "learning_rate": 5.9882121807465624e-05, "loss": 111.0813, "num_input_tokens_seen": 19342988, "step": 381 }, { "epoch": 0.037738630500479105, "grad_norm": 46.92488098144531, "learning_rate": 6.035363457760315e-05, "loss": 111.7378, "num_input_tokens_seen": 19500860, "step": 384 }, { "epoch": 0.03803346355126409, "grad_norm": 34.19045639038086, "learning_rate": 6.082514734774068e-05, "loss": 111.4771, "num_input_tokens_seen": 19655396, "step": 387 }, { "epoch": 0.03832829660204909, "grad_norm": 33.17388916015625, "learning_rate": 6.12966601178782e-05, "loss": 111.7661, "num_input_tokens_seen": 19812628, "step": 390 }, { "epoch": 0.038623129652834085, "grad_norm": 37.04789733886719, "learning_rate": 6.176817288801572e-05, "loss": 111.0436, "num_input_tokens_seen": 19962968, "step": 393 }, { "epoch": 0.03891796270361907, "grad_norm": 34.42665100097656, "learning_rate": 6.223968565815325e-05, "loss": 110.9319, "num_input_tokens_seen": 20124412, "step": 396 }, { "epoch": 0.03921279575440407, "grad_norm": 30.080944061279297, "learning_rate": 6.271119842829077e-05, "loss": 109.7078, "num_input_tokens_seen": 20271340, "step": 399 }, { "epoch": 0.039507628805189064, "grad_norm": 37.49620056152344, "learning_rate": 6.31827111984283e-05, "loss": 109.8044, "num_input_tokens_seen": 20416520, "step": 402 }, { "epoch": 0.03980246185597405, "grad_norm": 40.49543762207031, "learning_rate": 6.365422396856582e-05, "loss": 111.5728, "num_input_tokens_seen": 20562864, "step": 405 }, { "epoch": 0.04009729490675905, "grad_norm": 32.68974304199219, "learning_rate": 6.412573673870334e-05, "loss": 110.8815, "num_input_tokens_seen": 20722032, "step": 408 }, { "epoch": 0.040392127957544044, "grad_norm": 31.40269660949707, "learning_rate": 6.459724950884087e-05, "loss": 110.8921, "num_input_tokens_seen": 20876444, "step": 411 }, { "epoch": 0.04068696100832903, "grad_norm": 35.71750259399414, "learning_rate": 6.506876227897839e-05, "loss": 112.1176, "num_input_tokens_seen": 21048444, "step": 414 }, { "epoch": 0.04098179405911403, "grad_norm": 30.60179901123047, "learning_rate": 6.554027504911592e-05, "loss": 110.2898, "num_input_tokens_seen": 21213736, "step": 417 }, { "epoch": 0.04127662710989902, "grad_norm": 33.711368560791016, "learning_rate": 6.601178781925344e-05, "loss": 109.2687, "num_input_tokens_seen": 21386808, "step": 420 }, { "epoch": 0.04157146016068401, "grad_norm": 28.957597732543945, "learning_rate": 6.648330058939096e-05, "loss": 108.5565, "num_input_tokens_seen": 21564740, "step": 423 }, { "epoch": 0.04186629321146901, "grad_norm": 33.985660552978516, "learning_rate": 6.695481335952849e-05, "loss": 108.6807, "num_input_tokens_seen": 21717276, "step": 426 }, { "epoch": 0.042161126262253996, "grad_norm": 35.086524963378906, "learning_rate": 6.742632612966603e-05, "loss": 111.5809, "num_input_tokens_seen": 21886308, "step": 429 }, { "epoch": 0.04245595931303899, "grad_norm": 29.464298248291016, "learning_rate": 6.789783889980354e-05, "loss": 110.4938, "num_input_tokens_seen": 22035420, "step": 432 }, { "epoch": 0.04275079236382399, "grad_norm": 31.72478675842285, "learning_rate": 6.836935166994106e-05, "loss": 108.5007, "num_input_tokens_seen": 22195320, "step": 435 }, { "epoch": 0.043045625414608976, "grad_norm": 29.742385864257812, "learning_rate": 6.88408644400786e-05, "loss": 109.9771, "num_input_tokens_seen": 22353492, "step": 438 }, { "epoch": 0.04334045846539397, "grad_norm": 27.068031311035156, "learning_rate": 6.931237721021611e-05, "loss": 109.8151, "num_input_tokens_seen": 22499056, "step": 441 }, { "epoch": 0.04363529151617896, "grad_norm": 31.5910587310791, "learning_rate": 6.978388998035365e-05, "loss": 109.6762, "num_input_tokens_seen": 22661068, "step": 444 }, { "epoch": 0.043930124566963956, "grad_norm": 32.54317855834961, "learning_rate": 7.025540275049117e-05, "loss": 109.202, "num_input_tokens_seen": 22825316, "step": 447 }, { "epoch": 0.04422495761774895, "grad_norm": 29.726049423217773, "learning_rate": 7.072691552062868e-05, "loss": 109.1553, "num_input_tokens_seen": 22968708, "step": 450 }, { "epoch": 0.04451979066853394, "grad_norm": 29.412227630615234, "learning_rate": 7.119842829076622e-05, "loss": 109.7633, "num_input_tokens_seen": 23115468, "step": 453 }, { "epoch": 0.044814623719318936, "grad_norm": 29.294687271118164, "learning_rate": 7.166994106090374e-05, "loss": 108.4812, "num_input_tokens_seen": 23287960, "step": 456 }, { "epoch": 0.04510945677010393, "grad_norm": 28.74698257446289, "learning_rate": 7.214145383104126e-05, "loss": 108.3726, "num_input_tokens_seen": 23436948, "step": 459 }, { "epoch": 0.04540428982088892, "grad_norm": 47.22918701171875, "learning_rate": 7.261296660117879e-05, "loss": 109.4055, "num_input_tokens_seen": 23600328, "step": 462 }, { "epoch": 0.045699122871673915, "grad_norm": 31.876970291137695, "learning_rate": 7.30844793713163e-05, "loss": 107.6664, "num_input_tokens_seen": 23745424, "step": 465 }, { "epoch": 0.04599395592245891, "grad_norm": 35.708683013916016, "learning_rate": 7.355599214145384e-05, "loss": 108.5909, "num_input_tokens_seen": 23903584, "step": 468 }, { "epoch": 0.0462887889732439, "grad_norm": 28.689655303955078, "learning_rate": 7.402750491159137e-05, "loss": 108.6428, "num_input_tokens_seen": 24052516, "step": 471 }, { "epoch": 0.046583622024028895, "grad_norm": 29.488140106201172, "learning_rate": 7.449901768172888e-05, "loss": 106.9835, "num_input_tokens_seen": 24211128, "step": 474 }, { "epoch": 0.046878455074813884, "grad_norm": 38.3368034362793, "learning_rate": 7.497053045186641e-05, "loss": 108.5497, "num_input_tokens_seen": 24363412, "step": 477 }, { "epoch": 0.04717328812559888, "grad_norm": 30.385234832763672, "learning_rate": 7.544204322200394e-05, "loss": 106.6094, "num_input_tokens_seen": 24524724, "step": 480 }, { "epoch": 0.047468121176383875, "grad_norm": 73.74546813964844, "learning_rate": 7.591355599214146e-05, "loss": 107.275, "num_input_tokens_seen": 24672488, "step": 483 }, { "epoch": 0.04776295422716886, "grad_norm": 30.18967628479004, "learning_rate": 7.638506876227899e-05, "loss": 107.4118, "num_input_tokens_seen": 24822048, "step": 486 }, { "epoch": 0.04805778727795386, "grad_norm": 30.575702667236328, "learning_rate": 7.685658153241651e-05, "loss": 108.2546, "num_input_tokens_seen": 24971528, "step": 489 }, { "epoch": 0.048352620328738855, "grad_norm": 34.882266998291016, "learning_rate": 7.732809430255404e-05, "loss": 106.8039, "num_input_tokens_seen": 25107552, "step": 492 }, { "epoch": 0.04864745337952384, "grad_norm": 31.217613220214844, "learning_rate": 7.779960707269156e-05, "loss": 108.3255, "num_input_tokens_seen": 25288080, "step": 495 }, { "epoch": 0.04894228643030884, "grad_norm": 29.565536499023438, "learning_rate": 7.827111984282908e-05, "loss": 106.2473, "num_input_tokens_seen": 25445284, "step": 498 }, { "epoch": 0.04923711948109383, "grad_norm": 36.96455001831055, "learning_rate": 7.874263261296661e-05, "loss": 106.5716, "num_input_tokens_seen": 25627308, "step": 501 }, { "epoch": 0.04953195253187882, "grad_norm": 41.639625549316406, "learning_rate": 7.921414538310413e-05, "loss": 108.6796, "num_input_tokens_seen": 25782124, "step": 504 }, { "epoch": 0.04982678558266382, "grad_norm": 31.215431213378906, "learning_rate": 7.968565815324166e-05, "loss": 106.9381, "num_input_tokens_seen": 25932328, "step": 507 }, { "epoch": 0.05012161863344881, "grad_norm": 32.43290328979492, "learning_rate": 8e-05, "loss": 107.4913, "num_input_tokens_seen": 26084420, "step": 510 }, { "epoch": 0.0504164516842338, "grad_norm": 48.26959228515625, "learning_rate": 8e-05, "loss": 107.523, "num_input_tokens_seen": 26251152, "step": 513 }, { "epoch": 0.0507112847350188, "grad_norm": 35.35361099243164, "learning_rate": 8e-05, "loss": 108.4295, "num_input_tokens_seen": 26406284, "step": 516 }, { "epoch": 0.05100611778580379, "grad_norm": 37.7109260559082, "learning_rate": 8e-05, "loss": 106.0299, "num_input_tokens_seen": 26544532, "step": 519 }, { "epoch": 0.05130095083658878, "grad_norm": 31.95452880859375, "learning_rate": 8e-05, "loss": 107.3508, "num_input_tokens_seen": 26697740, "step": 522 }, { "epoch": 0.05159578388737378, "grad_norm": 33.43760681152344, "learning_rate": 8e-05, "loss": 105.9003, "num_input_tokens_seen": 26854696, "step": 525 }, { "epoch": 0.051890616938158766, "grad_norm": 34.20586013793945, "learning_rate": 8e-05, "loss": 107.3876, "num_input_tokens_seen": 27009780, "step": 528 }, { "epoch": 0.05218544998894376, "grad_norm": 31.921728134155273, "learning_rate": 8e-05, "loss": 105.3969, "num_input_tokens_seen": 27150128, "step": 531 }, { "epoch": 0.05248028303972875, "grad_norm": 36.0076789855957, "learning_rate": 8e-05, "loss": 106.9184, "num_input_tokens_seen": 27304528, "step": 534 }, { "epoch": 0.052775116090513746, "grad_norm": 29.839731216430664, "learning_rate": 8e-05, "loss": 105.1159, "num_input_tokens_seen": 27450804, "step": 537 }, { "epoch": 0.05306994914129874, "grad_norm": 33.60909652709961, "learning_rate": 8e-05, "loss": 104.5463, "num_input_tokens_seen": 27611428, "step": 540 }, { "epoch": 0.05336478219208373, "grad_norm": 30.84507179260254, "learning_rate": 8e-05, "loss": 103.293, "num_input_tokens_seen": 27759640, "step": 543 }, { "epoch": 0.053659615242868726, "grad_norm": 35.389583587646484, "learning_rate": 8e-05, "loss": 104.1917, "num_input_tokens_seen": 27912976, "step": 546 }, { "epoch": 0.05395444829365372, "grad_norm": 31.530858993530273, "learning_rate": 8e-05, "loss": 105.6748, "num_input_tokens_seen": 28064152, "step": 549 }, { "epoch": 0.05424928134443871, "grad_norm": 41.1005859375, "learning_rate": 8e-05, "loss": 107.298, "num_input_tokens_seen": 28217516, "step": 552 }, { "epoch": 0.054544114395223706, "grad_norm": 61.542354583740234, "learning_rate": 8e-05, "loss": 106.5471, "num_input_tokens_seen": 28356428, "step": 555 }, { "epoch": 0.054838947446008694, "grad_norm": 46.59098434448242, "learning_rate": 8e-05, "loss": 104.5256, "num_input_tokens_seen": 28529116, "step": 558 }, { "epoch": 0.05513378049679369, "grad_norm": 31.784414291381836, "learning_rate": 8e-05, "loss": 107.8473, "num_input_tokens_seen": 28697036, "step": 561 }, { "epoch": 0.055428613547578685, "grad_norm": 35.911521911621094, "learning_rate": 8e-05, "loss": 104.2456, "num_input_tokens_seen": 28851120, "step": 564 }, { "epoch": 0.055723446598363674, "grad_norm": 28.5731201171875, "learning_rate": 8e-05, "loss": 103.9228, "num_input_tokens_seen": 28991428, "step": 567 }, { "epoch": 0.05601827964914867, "grad_norm": 33.67176055908203, "learning_rate": 8e-05, "loss": 105.1696, "num_input_tokens_seen": 29153744, "step": 570 }, { "epoch": 0.056313112699933665, "grad_norm": 30.27126121520996, "learning_rate": 8e-05, "loss": 104.0718, "num_input_tokens_seen": 29291672, "step": 573 }, { "epoch": 0.056607945750718654, "grad_norm": 32.40224075317383, "learning_rate": 8e-05, "loss": 105.3532, "num_input_tokens_seen": 29444228, "step": 576 }, { "epoch": 0.05690277880150365, "grad_norm": 32.94948959350586, "learning_rate": 8e-05, "loss": 105.5668, "num_input_tokens_seen": 29614308, "step": 579 }, { "epoch": 0.057197611852288645, "grad_norm": 36.907508850097656, "learning_rate": 8e-05, "loss": 104.2329, "num_input_tokens_seen": 29782636, "step": 582 }, { "epoch": 0.05749244490307363, "grad_norm": 32.03764343261719, "learning_rate": 8e-05, "loss": 106.6867, "num_input_tokens_seen": 29928448, "step": 585 }, { "epoch": 0.05778727795385863, "grad_norm": 36.27505874633789, "learning_rate": 8e-05, "loss": 103.9458, "num_input_tokens_seen": 30081348, "step": 588 }, { "epoch": 0.05808211100464362, "grad_norm": 48.598350524902344, "learning_rate": 8e-05, "loss": 104.8759, "num_input_tokens_seen": 30230504, "step": 591 }, { "epoch": 0.05837694405542861, "grad_norm": 32.41371536254883, "learning_rate": 8e-05, "loss": 103.2528, "num_input_tokens_seen": 30384920, "step": 594 }, { "epoch": 0.05867177710621361, "grad_norm": 28.43147087097168, "learning_rate": 8e-05, "loss": 102.9143, "num_input_tokens_seen": 30538908, "step": 597 }, { "epoch": 0.0589666101569986, "grad_norm": 28.21474838256836, "learning_rate": 8e-05, "loss": 104.1305, "num_input_tokens_seen": 30696424, "step": 600 }, { "epoch": 0.05926144320778359, "grad_norm": 33.953983306884766, "learning_rate": 8e-05, "loss": 105.6636, "num_input_tokens_seen": 30848860, "step": 603 }, { "epoch": 0.05955627625856859, "grad_norm": 27.20724868774414, "learning_rate": 8e-05, "loss": 102.2961, "num_input_tokens_seen": 30997372, "step": 606 }, { "epoch": 0.05985110930935358, "grad_norm": 33.98468780517578, "learning_rate": 8e-05, "loss": 103.787, "num_input_tokens_seen": 31157788, "step": 609 }, { "epoch": 0.06014594236013857, "grad_norm": 40.239501953125, "learning_rate": 8e-05, "loss": 104.2993, "num_input_tokens_seen": 31325552, "step": 612 }, { "epoch": 0.06044077541092356, "grad_norm": 32.588470458984375, "learning_rate": 8e-05, "loss": 104.0729, "num_input_tokens_seen": 31466752, "step": 615 }, { "epoch": 0.06073560846170856, "grad_norm": 37.641700744628906, "learning_rate": 8e-05, "loss": 103.5312, "num_input_tokens_seen": 31636332, "step": 618 }, { "epoch": 0.06103044151249355, "grad_norm": 31.127992630004883, "learning_rate": 8e-05, "loss": 104.2723, "num_input_tokens_seen": 31781852, "step": 621 }, { "epoch": 0.06132527456327854, "grad_norm": 28.448110580444336, "learning_rate": 8e-05, "loss": 104.1347, "num_input_tokens_seen": 31946376, "step": 624 }, { "epoch": 0.061620107614063536, "grad_norm": 29.03575897216797, "learning_rate": 8e-05, "loss": 102.5833, "num_input_tokens_seen": 32108392, "step": 627 }, { "epoch": 0.06191494066484853, "grad_norm": 37.53111267089844, "learning_rate": 8e-05, "loss": 105.0999, "num_input_tokens_seen": 32269168, "step": 630 }, { "epoch": 0.06220977371563352, "grad_norm": 35.30291748046875, "learning_rate": 8e-05, "loss": 102.7633, "num_input_tokens_seen": 32426728, "step": 633 }, { "epoch": 0.06250460676641852, "grad_norm": 63.35116195678711, "learning_rate": 8e-05, "loss": 105.0097, "num_input_tokens_seen": 32592400, "step": 636 }, { "epoch": 0.06279943981720351, "grad_norm": 28.85284996032715, "learning_rate": 8e-05, "loss": 103.9038, "num_input_tokens_seen": 32760464, "step": 639 }, { "epoch": 0.06309427286798851, "grad_norm": 29.186050415039062, "learning_rate": 8e-05, "loss": 102.84, "num_input_tokens_seen": 32899168, "step": 642 }, { "epoch": 0.06338910591877349, "grad_norm": 35.00688552856445, "learning_rate": 8e-05, "loss": 102.888, "num_input_tokens_seen": 33040460, "step": 645 }, { "epoch": 0.06368393896955848, "grad_norm": 31.484594345092773, "learning_rate": 8e-05, "loss": 101.2444, "num_input_tokens_seen": 33191032, "step": 648 }, { "epoch": 0.06397877202034348, "grad_norm": 28.048425674438477, "learning_rate": 8e-05, "loss": 100.0415, "num_input_tokens_seen": 33344604, "step": 651 }, { "epoch": 0.06427360507112848, "grad_norm": 27.901996612548828, "learning_rate": 8e-05, "loss": 102.9519, "num_input_tokens_seen": 33494576, "step": 654 }, { "epoch": 0.06456843812191347, "grad_norm": 33.018165588378906, "learning_rate": 8e-05, "loss": 102.6747, "num_input_tokens_seen": 33648388, "step": 657 }, { "epoch": 0.06486327117269845, "grad_norm": 29.498411178588867, "learning_rate": 8e-05, "loss": 104.4521, "num_input_tokens_seen": 33772076, "step": 660 }, { "epoch": 0.06515810422348345, "grad_norm": 28.262418746948242, "learning_rate": 8e-05, "loss": 101.4761, "num_input_tokens_seen": 33922880, "step": 663 }, { "epoch": 0.06545293727426844, "grad_norm": 28.457019805908203, "learning_rate": 8e-05, "loss": 102.9336, "num_input_tokens_seen": 34087612, "step": 666 }, { "epoch": 0.06574777032505344, "grad_norm": 31.7344970703125, "learning_rate": 8e-05, "loss": 101.4416, "num_input_tokens_seen": 34246408, "step": 669 }, { "epoch": 0.06604260337583844, "grad_norm": 30.76515007019043, "learning_rate": 8e-05, "loss": 102.94, "num_input_tokens_seen": 34396300, "step": 672 }, { "epoch": 0.06633743642662343, "grad_norm": 29.8140811920166, "learning_rate": 8e-05, "loss": 102.4964, "num_input_tokens_seen": 34566084, "step": 675 }, { "epoch": 0.06663226947740841, "grad_norm": 38.429473876953125, "learning_rate": 8e-05, "loss": 100.1495, "num_input_tokens_seen": 34711580, "step": 678 }, { "epoch": 0.06692710252819341, "grad_norm": 35.76615905761719, "learning_rate": 8e-05, "loss": 101.2931, "num_input_tokens_seen": 34862992, "step": 681 }, { "epoch": 0.0672219355789784, "grad_norm": 29.05952262878418, "learning_rate": 8e-05, "loss": 101.0068, "num_input_tokens_seen": 35016092, "step": 684 }, { "epoch": 0.0675167686297634, "grad_norm": 28.329811096191406, "learning_rate": 8e-05, "loss": 105.2395, "num_input_tokens_seen": 35200604, "step": 687 }, { "epoch": 0.0678116016805484, "grad_norm": 27.511215209960938, "learning_rate": 8e-05, "loss": 102.3719, "num_input_tokens_seen": 35339672, "step": 690 }, { "epoch": 0.06810643473133338, "grad_norm": 29.413833618164062, "learning_rate": 8e-05, "loss": 101.1614, "num_input_tokens_seen": 35488832, "step": 693 }, { "epoch": 0.06840126778211837, "grad_norm": 33.835548400878906, "learning_rate": 8e-05, "loss": 101.2409, "num_input_tokens_seen": 35640116, "step": 696 }, { "epoch": 0.06869610083290337, "grad_norm": 37.69698715209961, "learning_rate": 8e-05, "loss": 101.9321, "num_input_tokens_seen": 35830276, "step": 699 }, { "epoch": 0.06899093388368836, "grad_norm": 34.55872344970703, "learning_rate": 8e-05, "loss": 101.7172, "num_input_tokens_seen": 35986216, "step": 702 }, { "epoch": 0.06928576693447336, "grad_norm": 38.171295166015625, "learning_rate": 8e-05, "loss": 100.8534, "num_input_tokens_seen": 36136840, "step": 705 }, { "epoch": 0.06958059998525835, "grad_norm": 30.427322387695312, "learning_rate": 8e-05, "loss": 101.2733, "num_input_tokens_seen": 36307732, "step": 708 }, { "epoch": 0.06987543303604334, "grad_norm": 33.20026779174805, "learning_rate": 8e-05, "loss": 100.3995, "num_input_tokens_seen": 36468544, "step": 711 }, { "epoch": 0.07017026608682833, "grad_norm": 28.34000015258789, "learning_rate": 8e-05, "loss": 100.1351, "num_input_tokens_seen": 36613340, "step": 714 }, { "epoch": 0.07046509913761333, "grad_norm": 31.049711227416992, "learning_rate": 8e-05, "loss": 99.2558, "num_input_tokens_seen": 36772992, "step": 717 }, { "epoch": 0.07075993218839832, "grad_norm": 28.40256118774414, "learning_rate": 8e-05, "loss": 98.7879, "num_input_tokens_seen": 36917312, "step": 720 }, { "epoch": 0.07105476523918332, "grad_norm": 35.21345901489258, "learning_rate": 8e-05, "loss": 99.0775, "num_input_tokens_seen": 37056508, "step": 723 }, { "epoch": 0.0713495982899683, "grad_norm": 28.639978408813477, "learning_rate": 8e-05, "loss": 100.1574, "num_input_tokens_seen": 37209684, "step": 726 }, { "epoch": 0.0716444313407533, "grad_norm": 35.251808166503906, "learning_rate": 8e-05, "loss": 100.5594, "num_input_tokens_seen": 37374472, "step": 729 }, { "epoch": 0.07193926439153829, "grad_norm": 40.02030563354492, "learning_rate": 8e-05, "loss": 97.8263, "num_input_tokens_seen": 37511060, "step": 732 }, { "epoch": 0.07223409744232329, "grad_norm": 31.680225372314453, "learning_rate": 8e-05, "loss": 98.7203, "num_input_tokens_seen": 37642500, "step": 735 }, { "epoch": 0.07252893049310828, "grad_norm": 31.110706329345703, "learning_rate": 8e-05, "loss": 102.0147, "num_input_tokens_seen": 37799960, "step": 738 }, { "epoch": 0.07282376354389328, "grad_norm": 29.42256736755371, "learning_rate": 8e-05, "loss": 98.3472, "num_input_tokens_seen": 37949436, "step": 741 }, { "epoch": 0.07311859659467826, "grad_norm": 35.685115814208984, "learning_rate": 8e-05, "loss": 99.096, "num_input_tokens_seen": 38114716, "step": 744 }, { "epoch": 0.07341342964546325, "grad_norm": 31.129684448242188, "learning_rate": 8e-05, "loss": 99.8503, "num_input_tokens_seen": 38266548, "step": 747 }, { "epoch": 0.07370826269624825, "grad_norm": 67.06864166259766, "learning_rate": 8e-05, "loss": 99.9585, "num_input_tokens_seen": 38421392, "step": 750 }, { "epoch": 0.07400309574703325, "grad_norm": 28.124284744262695, "learning_rate": 8e-05, "loss": 98.4267, "num_input_tokens_seen": 38577988, "step": 753 }, { "epoch": 0.07429792879781824, "grad_norm": 31.328781127929688, "learning_rate": 8e-05, "loss": 100.2827, "num_input_tokens_seen": 38753868, "step": 756 }, { "epoch": 0.07459276184860322, "grad_norm": 31.482263565063477, "learning_rate": 8e-05, "loss": 100.3612, "num_input_tokens_seen": 38919460, "step": 759 }, { "epoch": 0.07488759489938822, "grad_norm": 28.848894119262695, "learning_rate": 8e-05, "loss": 98.9799, "num_input_tokens_seen": 39051280, "step": 762 }, { "epoch": 0.07518242795017321, "grad_norm": 29.552709579467773, "learning_rate": 8e-05, "loss": 95.8027, "num_input_tokens_seen": 39223880, "step": 765 }, { "epoch": 0.07547726100095821, "grad_norm": 32.752559661865234, "learning_rate": 8e-05, "loss": 96.3181, "num_input_tokens_seen": 39388972, "step": 768 }, { "epoch": 0.0757720940517432, "grad_norm": 27.956424713134766, "learning_rate": 8e-05, "loss": 98.5633, "num_input_tokens_seen": 39524604, "step": 771 }, { "epoch": 0.07606692710252819, "grad_norm": 28.682424545288086, "learning_rate": 8e-05, "loss": 97.9849, "num_input_tokens_seen": 39667896, "step": 774 }, { "epoch": 0.07636176015331318, "grad_norm": 30.985292434692383, "learning_rate": 8e-05, "loss": 98.5518, "num_input_tokens_seen": 39840608, "step": 777 }, { "epoch": 0.07665659320409818, "grad_norm": 34.598148345947266, "learning_rate": 8e-05, "loss": 98.3472, "num_input_tokens_seen": 39975948, "step": 780 }, { "epoch": 0.07695142625488317, "grad_norm": 29.500808715820312, "learning_rate": 8e-05, "loss": 98.7694, "num_input_tokens_seen": 40135764, "step": 783 }, { "epoch": 0.07724625930566817, "grad_norm": 29.20780372619629, "learning_rate": 8e-05, "loss": 99.1242, "num_input_tokens_seen": 40282812, "step": 786 }, { "epoch": 0.07754109235645316, "grad_norm": 31.628633499145508, "learning_rate": 8e-05, "loss": 97.8333, "num_input_tokens_seen": 40432408, "step": 789 }, { "epoch": 0.07783592540723815, "grad_norm": 38.96535873413086, "learning_rate": 8e-05, "loss": 97.465, "num_input_tokens_seen": 40591640, "step": 792 }, { "epoch": 0.07813075845802314, "grad_norm": 30.278688430786133, "learning_rate": 8e-05, "loss": 96.5388, "num_input_tokens_seen": 40745628, "step": 795 }, { "epoch": 0.07842559150880814, "grad_norm": 34.10064697265625, "learning_rate": 8e-05, "loss": 98.7074, "num_input_tokens_seen": 40892836, "step": 798 }, { "epoch": 0.07872042455959313, "grad_norm": 34.84008026123047, "learning_rate": 8e-05, "loss": 98.8112, "num_input_tokens_seen": 41044032, "step": 801 }, { "epoch": 0.07901525761037813, "grad_norm": 29.791622161865234, "learning_rate": 8e-05, "loss": 96.3578, "num_input_tokens_seen": 41189400, "step": 804 }, { "epoch": 0.07931009066116311, "grad_norm": 29.386751174926758, "learning_rate": 8e-05, "loss": 97.5173, "num_input_tokens_seen": 41345100, "step": 807 }, { "epoch": 0.0796049237119481, "grad_norm": 27.887340545654297, "learning_rate": 8e-05, "loss": 97.5752, "num_input_tokens_seen": 41487676, "step": 810 }, { "epoch": 0.0798997567627331, "grad_norm": 29.79248046875, "learning_rate": 8e-05, "loss": 96.6194, "num_input_tokens_seen": 41651120, "step": 813 }, { "epoch": 0.0801945898135181, "grad_norm": 30.108230590820312, "learning_rate": 8e-05, "loss": 96.5995, "num_input_tokens_seen": 41806768, "step": 816 }, { "epoch": 0.08048942286430309, "grad_norm": 32.913536071777344, "learning_rate": 8e-05, "loss": 95.4259, "num_input_tokens_seen": 41950708, "step": 819 }, { "epoch": 0.08078425591508809, "grad_norm": 35.09928512573242, "learning_rate": 8e-05, "loss": 97.372, "num_input_tokens_seen": 42101944, "step": 822 }, { "epoch": 0.08107908896587307, "grad_norm": 36.4766845703125, "learning_rate": 8e-05, "loss": 96.4494, "num_input_tokens_seen": 42244600, "step": 825 }, { "epoch": 0.08137392201665807, "grad_norm": 43.9494743347168, "learning_rate": 8e-05, "loss": 96.8876, "num_input_tokens_seen": 42402116, "step": 828 }, { "epoch": 0.08166875506744306, "grad_norm": 31.989990234375, "learning_rate": 8e-05, "loss": 95.472, "num_input_tokens_seen": 42570484, "step": 831 }, { "epoch": 0.08196358811822806, "grad_norm": 27.526823043823242, "learning_rate": 8e-05, "loss": 94.3046, "num_input_tokens_seen": 42715820, "step": 834 }, { "epoch": 0.08225842116901305, "grad_norm": 29.665773391723633, "learning_rate": 8e-05, "loss": 96.0409, "num_input_tokens_seen": 42885048, "step": 837 }, { "epoch": 0.08255325421979803, "grad_norm": 33.27475357055664, "learning_rate": 8e-05, "loss": 94.5064, "num_input_tokens_seen": 43037472, "step": 840 }, { "epoch": 0.08284808727058303, "grad_norm": 27.184389114379883, "learning_rate": 8e-05, "loss": 95.9627, "num_input_tokens_seen": 43207080, "step": 843 }, { "epoch": 0.08314292032136802, "grad_norm": 30.488483428955078, "learning_rate": 8e-05, "loss": 95.5, "num_input_tokens_seen": 43342624, "step": 846 }, { "epoch": 0.08343775337215302, "grad_norm": 30.834781646728516, "learning_rate": 8e-05, "loss": 96.9813, "num_input_tokens_seen": 43479084, "step": 849 }, { "epoch": 0.08373258642293802, "grad_norm": 31.856342315673828, "learning_rate": 8e-05, "loss": 98.9376, "num_input_tokens_seen": 43625832, "step": 852 }, { "epoch": 0.08402741947372301, "grad_norm": 35.55719757080078, "learning_rate": 8e-05, "loss": 97.2659, "num_input_tokens_seen": 43778084, "step": 855 }, { "epoch": 0.08432225252450799, "grad_norm": 29.947450637817383, "learning_rate": 8e-05, "loss": 95.7474, "num_input_tokens_seen": 43922396, "step": 858 }, { "epoch": 0.08461708557529299, "grad_norm": 31.373952865600586, "learning_rate": 8e-05, "loss": 94.7743, "num_input_tokens_seen": 44085324, "step": 861 }, { "epoch": 0.08491191862607798, "grad_norm": 30.180021286010742, "learning_rate": 8e-05, "loss": 96.9605, "num_input_tokens_seen": 44230488, "step": 864 }, { "epoch": 0.08520675167686298, "grad_norm": 29.01386260986328, "learning_rate": 8e-05, "loss": 93.7627, "num_input_tokens_seen": 44390288, "step": 867 }, { "epoch": 0.08550158472764798, "grad_norm": 29.978891372680664, "learning_rate": 8e-05, "loss": 95.4733, "num_input_tokens_seen": 44548192, "step": 870 }, { "epoch": 0.08579641777843296, "grad_norm": 30.313705444335938, "learning_rate": 8e-05, "loss": 94.0545, "num_input_tokens_seen": 44690768, "step": 873 }, { "epoch": 0.08609125082921795, "grad_norm": 32.91279220581055, "learning_rate": 8e-05, "loss": 96.173, "num_input_tokens_seen": 44834448, "step": 876 }, { "epoch": 0.08638608388000295, "grad_norm": 28.27375602722168, "learning_rate": 8e-05, "loss": 94.625, "num_input_tokens_seen": 44980220, "step": 879 }, { "epoch": 0.08668091693078794, "grad_norm": 29.834308624267578, "learning_rate": 8e-05, "loss": 95.7682, "num_input_tokens_seen": 45130548, "step": 882 }, { "epoch": 0.08697574998157294, "grad_norm": 30.992219924926758, "learning_rate": 8e-05, "loss": 95.9339, "num_input_tokens_seen": 45285992, "step": 885 }, { "epoch": 0.08727058303235792, "grad_norm": 29.38202476501465, "learning_rate": 8e-05, "loss": 94.346, "num_input_tokens_seen": 45423456, "step": 888 }, { "epoch": 0.08756541608314292, "grad_norm": 30.09346580505371, "learning_rate": 8e-05, "loss": 96.734, "num_input_tokens_seen": 45606976, "step": 891 }, { "epoch": 0.08786024913392791, "grad_norm": 30.944683074951172, "learning_rate": 8e-05, "loss": 94.5057, "num_input_tokens_seen": 45742436, "step": 894 }, { "epoch": 0.08815508218471291, "grad_norm": 32.21797561645508, "learning_rate": 8e-05, "loss": 95.2261, "num_input_tokens_seen": 45891856, "step": 897 }, { "epoch": 0.0884499152354979, "grad_norm": 36.828975677490234, "learning_rate": 8e-05, "loss": 93.5124, "num_input_tokens_seen": 46030736, "step": 900 }, { "epoch": 0.0887447482862829, "grad_norm": 32.43253707885742, "learning_rate": 8e-05, "loss": 94.0418, "num_input_tokens_seen": 46180516, "step": 903 }, { "epoch": 0.08903958133706788, "grad_norm": 30.175016403198242, "learning_rate": 8e-05, "loss": 93.9941, "num_input_tokens_seen": 46330568, "step": 906 }, { "epoch": 0.08933441438785288, "grad_norm": 31.575355529785156, "learning_rate": 8e-05, "loss": 94.8749, "num_input_tokens_seen": 46485964, "step": 909 }, { "epoch": 0.08962924743863787, "grad_norm": 30.174617767333984, "learning_rate": 8e-05, "loss": 93.3188, "num_input_tokens_seen": 46641304, "step": 912 }, { "epoch": 0.08992408048942287, "grad_norm": 35.43874740600586, "learning_rate": 8e-05, "loss": 92.9305, "num_input_tokens_seen": 46796272, "step": 915 }, { "epoch": 0.09021891354020786, "grad_norm": 31.57838249206543, "learning_rate": 8e-05, "loss": 96.0303, "num_input_tokens_seen": 46941040, "step": 918 }, { "epoch": 0.09051374659099284, "grad_norm": 30.171436309814453, "learning_rate": 8e-05, "loss": 94.1278, "num_input_tokens_seen": 47111964, "step": 921 }, { "epoch": 0.09080857964177784, "grad_norm": 29.509170532226562, "learning_rate": 8e-05, "loss": 94.6481, "num_input_tokens_seen": 47280680, "step": 924 }, { "epoch": 0.09110341269256284, "grad_norm": 30.14413070678711, "learning_rate": 8e-05, "loss": 93.1313, "num_input_tokens_seen": 47427784, "step": 927 }, { "epoch": 0.09139824574334783, "grad_norm": 30.193668365478516, "learning_rate": 8e-05, "loss": 92.5065, "num_input_tokens_seen": 47585472, "step": 930 }, { "epoch": 0.09169307879413283, "grad_norm": 34.98398971557617, "learning_rate": 8e-05, "loss": 93.7416, "num_input_tokens_seen": 47750156, "step": 933 }, { "epoch": 0.09198791184491782, "grad_norm": 30.99966812133789, "learning_rate": 8e-05, "loss": 91.2385, "num_input_tokens_seen": 47898952, "step": 936 }, { "epoch": 0.0922827448957028, "grad_norm": 30.56644058227539, "learning_rate": 8e-05, "loss": 94.2638, "num_input_tokens_seen": 48053704, "step": 939 }, { "epoch": 0.0925775779464878, "grad_norm": 29.665430068969727, "learning_rate": 8e-05, "loss": 91.8333, "num_input_tokens_seen": 48212428, "step": 942 }, { "epoch": 0.0928724109972728, "grad_norm": 31.677806854248047, "learning_rate": 8e-05, "loss": 93.413, "num_input_tokens_seen": 48372056, "step": 945 }, { "epoch": 0.09316724404805779, "grad_norm": 29.71071434020996, "learning_rate": 8e-05, "loss": 92.5726, "num_input_tokens_seen": 48551512, "step": 948 }, { "epoch": 0.09346207709884279, "grad_norm": 30.616531372070312, "learning_rate": 8e-05, "loss": 92.5297, "num_input_tokens_seen": 48708300, "step": 951 }, { "epoch": 0.09375691014962777, "grad_norm": 31.245548248291016, "learning_rate": 8e-05, "loss": 90.3821, "num_input_tokens_seen": 48869976, "step": 954 }, { "epoch": 0.09405174320041276, "grad_norm": 32.1124382019043, "learning_rate": 8e-05, "loss": 92.5386, "num_input_tokens_seen": 49009364, "step": 957 }, { "epoch": 0.09434657625119776, "grad_norm": 194.75343322753906, "learning_rate": 8e-05, "loss": 94.1239, "num_input_tokens_seen": 49169552, "step": 960 }, { "epoch": 0.09464140930198275, "grad_norm": 32.542686462402344, "learning_rate": 8e-05, "loss": 94.3201, "num_input_tokens_seen": 49326648, "step": 963 }, { "epoch": 0.09493624235276775, "grad_norm": 85.58460235595703, "learning_rate": 8e-05, "loss": 92.559, "num_input_tokens_seen": 49473464, "step": 966 }, { "epoch": 0.09523107540355275, "grad_norm": 31.504518508911133, "learning_rate": 8e-05, "loss": 94.1152, "num_input_tokens_seen": 49641688, "step": 969 }, { "epoch": 0.09552590845433773, "grad_norm": 31.837738037109375, "learning_rate": 8e-05, "loss": 94.4699, "num_input_tokens_seen": 49807428, "step": 972 }, { "epoch": 0.09582074150512272, "grad_norm": 28.046907424926758, "learning_rate": 8e-05, "loss": 92.2552, "num_input_tokens_seen": 49960180, "step": 975 }, { "epoch": 0.09611557455590772, "grad_norm": 31.577808380126953, "learning_rate": 8e-05, "loss": 92.3609, "num_input_tokens_seen": 50130040, "step": 978 }, { "epoch": 0.09641040760669271, "grad_norm": 30.214200973510742, "learning_rate": 8e-05, "loss": 91.6302, "num_input_tokens_seen": 50293532, "step": 981 }, { "epoch": 0.09670524065747771, "grad_norm": 34.3266487121582, "learning_rate": 8e-05, "loss": 91.4269, "num_input_tokens_seen": 50461436, "step": 984 }, { "epoch": 0.09700007370826269, "grad_norm": 35.89109802246094, "learning_rate": 8e-05, "loss": 91.2092, "num_input_tokens_seen": 50615044, "step": 987 }, { "epoch": 0.09729490675904769, "grad_norm": 35.1023063659668, "learning_rate": 8e-05, "loss": 92.4097, "num_input_tokens_seen": 50776320, "step": 990 }, { "epoch": 0.09758973980983268, "grad_norm": 34.04882049560547, "learning_rate": 8e-05, "loss": 90.7081, "num_input_tokens_seen": 50938464, "step": 993 }, { "epoch": 0.09788457286061768, "grad_norm": 35.52146911621094, "learning_rate": 8e-05, "loss": 90.209, "num_input_tokens_seen": 51085308, "step": 996 }, { "epoch": 0.09817940591140267, "grad_norm": 38.20060729980469, "learning_rate": 8e-05, "loss": 90.1752, "num_input_tokens_seen": 51250996, "step": 999 }, { "epoch": 0.09827768359499767, "eval_gen_len": 69.32, "eval_loss": 5.634235382080078, "eval_rouge1": 16.5561, "eval_rouge2": 3.2961, "eval_rougeL": 14.7126, "eval_rougeLsum": 14.7712, "eval_runtime": 223.2645, "eval_samples_per_second": 0.896, "eval_steps_per_second": 0.224, "num_input_tokens_seen": 51291360, "step": 1000 }, { "epoch": 0.09847423896218765, "grad_norm": 32.12675857543945, "learning_rate": 8e-05, "loss": 91.8453, "num_input_tokens_seen": 51385280, "step": 1002 }, { "epoch": 0.09876907201297265, "grad_norm": 35.18289566040039, "learning_rate": 8e-05, "loss": 90.7393, "num_input_tokens_seen": 51541768, "step": 1005 }, { "epoch": 0.09906390506375765, "grad_norm": 33.29011535644531, "learning_rate": 8e-05, "loss": 89.8114, "num_input_tokens_seen": 51695044, "step": 1008 }, { "epoch": 0.09935873811454264, "grad_norm": 33.36168670654297, "learning_rate": 8e-05, "loss": 91.4891, "num_input_tokens_seen": 51853280, "step": 1011 }, { "epoch": 0.09965357116532764, "grad_norm": 33.3763542175293, "learning_rate": 8e-05, "loss": 90.593, "num_input_tokens_seen": 52000664, "step": 1014 }, { "epoch": 0.09994840421611263, "grad_norm": 36.2030029296875, "learning_rate": 8e-05, "loss": 90.8745, "num_input_tokens_seen": 52158944, "step": 1017 }, { "epoch": 0.10024323726689761, "grad_norm": 33.890750885009766, "learning_rate": 8e-05, "loss": 91.0693, "num_input_tokens_seen": 52313252, "step": 1020 }, { "epoch": 0.10053807031768261, "grad_norm": 30.91986846923828, "learning_rate": 8e-05, "loss": 88.7983, "num_input_tokens_seen": 52486164, "step": 1023 }, { "epoch": 0.1008329033684676, "grad_norm": 36.78636932373047, "learning_rate": 8e-05, "loss": 88.6159, "num_input_tokens_seen": 52637680, "step": 1026 }, { "epoch": 0.1011277364192526, "grad_norm": 35.55237579345703, "learning_rate": 8e-05, "loss": 88.7181, "num_input_tokens_seen": 52779392, "step": 1029 }, { "epoch": 0.1014225694700376, "grad_norm": 31.026241302490234, "learning_rate": 8e-05, "loss": 88.8933, "num_input_tokens_seen": 52942088, "step": 1032 }, { "epoch": 0.10171740252082258, "grad_norm": 102.04617309570312, "learning_rate": 8e-05, "loss": 88.6628, "num_input_tokens_seen": 53095664, "step": 1035 }, { "epoch": 0.10201223557160757, "grad_norm": 35.890018463134766, "learning_rate": 8e-05, "loss": 89.8409, "num_input_tokens_seen": 53253208, "step": 1038 }, { "epoch": 0.10230706862239257, "grad_norm": 32.98229217529297, "learning_rate": 8e-05, "loss": 90.546, "num_input_tokens_seen": 53402820, "step": 1041 }, { "epoch": 0.10260190167317756, "grad_norm": 53.527061462402344, "learning_rate": 8e-05, "loss": 89.9919, "num_input_tokens_seen": 53538824, "step": 1044 }, { "epoch": 0.10289673472396256, "grad_norm": 34.09297180175781, "learning_rate": 8e-05, "loss": 89.4842, "num_input_tokens_seen": 53684448, "step": 1047 }, { "epoch": 0.10319156777474756, "grad_norm": 32.06243133544922, "learning_rate": 8e-05, "loss": 89.727, "num_input_tokens_seen": 53865824, "step": 1050 }, { "epoch": 0.10348640082553254, "grad_norm": 37.76670455932617, "learning_rate": 8e-05, "loss": 89.0664, "num_input_tokens_seen": 54009924, "step": 1053 }, { "epoch": 0.10378123387631753, "grad_norm": 31.572418212890625, "learning_rate": 8e-05, "loss": 90.8275, "num_input_tokens_seen": 54159500, "step": 1056 }, { "epoch": 0.10407606692710253, "grad_norm": 40.10283660888672, "learning_rate": 8e-05, "loss": 90.409, "num_input_tokens_seen": 54303812, "step": 1059 }, { "epoch": 0.10437089997788752, "grad_norm": 41.28361129760742, "learning_rate": 8e-05, "loss": 90.1141, "num_input_tokens_seen": 54455676, "step": 1062 }, { "epoch": 0.10466573302867252, "grad_norm": 35.48882293701172, "learning_rate": 8e-05, "loss": 89.0017, "num_input_tokens_seen": 54620984, "step": 1065 }, { "epoch": 0.1049605660794575, "grad_norm": 44.871646881103516, "learning_rate": 8e-05, "loss": 87.272, "num_input_tokens_seen": 54795340, "step": 1068 }, { "epoch": 0.1052553991302425, "grad_norm": 37.673038482666016, "learning_rate": 8e-05, "loss": 89.4379, "num_input_tokens_seen": 54950136, "step": 1071 }, { "epoch": 0.10555023218102749, "grad_norm": 35.36658477783203, "learning_rate": 8e-05, "loss": 88.0971, "num_input_tokens_seen": 55101460, "step": 1074 }, { "epoch": 0.10584506523181249, "grad_norm": 35.320072174072266, "learning_rate": 8e-05, "loss": 89.0408, "num_input_tokens_seen": 55264840, "step": 1077 }, { "epoch": 0.10613989828259748, "grad_norm": 39.10199737548828, "learning_rate": 8e-05, "loss": 88.5933, "num_input_tokens_seen": 55429216, "step": 1080 }, { "epoch": 0.10643473133338248, "grad_norm": 57.2830696105957, "learning_rate": 8e-05, "loss": 88.9289, "num_input_tokens_seen": 55561768, "step": 1083 }, { "epoch": 0.10672956438416746, "grad_norm": 34.813167572021484, "learning_rate": 8e-05, "loss": 89.9692, "num_input_tokens_seen": 55724544, "step": 1086 }, { "epoch": 0.10702439743495246, "grad_norm": 38.25190734863281, "learning_rate": 8e-05, "loss": 86.0509, "num_input_tokens_seen": 55872416, "step": 1089 }, { "epoch": 0.10731923048573745, "grad_norm": 35.670265197753906, "learning_rate": 8e-05, "loss": 88.2196, "num_input_tokens_seen": 56021900, "step": 1092 }, { "epoch": 0.10761406353652245, "grad_norm": 45.03786087036133, "learning_rate": 8e-05, "loss": 90.6019, "num_input_tokens_seen": 56175616, "step": 1095 }, { "epoch": 0.10790889658730744, "grad_norm": 36.3194580078125, "learning_rate": 8e-05, "loss": 86.1967, "num_input_tokens_seen": 56322540, "step": 1098 }, { "epoch": 0.10820372963809242, "grad_norm": 61.72140884399414, "learning_rate": 8e-05, "loss": 85.0621, "num_input_tokens_seen": 56462500, "step": 1101 }, { "epoch": 0.10849856268887742, "grad_norm": 57.37849044799805, "learning_rate": 8e-05, "loss": 86.611, "num_input_tokens_seen": 56610976, "step": 1104 }, { "epoch": 0.10879339573966242, "grad_norm": 36.412654876708984, "learning_rate": 8e-05, "loss": 87.3833, "num_input_tokens_seen": 56758720, "step": 1107 }, { "epoch": 0.10908822879044741, "grad_norm": 77.69864654541016, "learning_rate": 8e-05, "loss": 87.5024, "num_input_tokens_seen": 56931780, "step": 1110 }, { "epoch": 0.1093830618412324, "grad_norm": 35.17158508300781, "learning_rate": 8e-05, "loss": 86.0469, "num_input_tokens_seen": 57083168, "step": 1113 }, { "epoch": 0.10967789489201739, "grad_norm": 39.6878662109375, "learning_rate": 8e-05, "loss": 83.8399, "num_input_tokens_seen": 57233344, "step": 1116 }, { "epoch": 0.10997272794280238, "grad_norm": 41.83913040161133, "learning_rate": 8e-05, "loss": 85.4779, "num_input_tokens_seen": 57389876, "step": 1119 }, { "epoch": 0.11026756099358738, "grad_norm": 35.37332534790039, "learning_rate": 8e-05, "loss": 85.3762, "num_input_tokens_seen": 57549376, "step": 1122 }, { "epoch": 0.11056239404437238, "grad_norm": 34.330841064453125, "learning_rate": 8e-05, "loss": 83.6415, "num_input_tokens_seen": 57718812, "step": 1125 }, { "epoch": 0.11085722709515737, "grad_norm": 33.91706085205078, "learning_rate": 8e-05, "loss": 85.471, "num_input_tokens_seen": 57872976, "step": 1128 }, { "epoch": 0.11115206014594237, "grad_norm": 34.27631759643555, "learning_rate": 8e-05, "loss": 86.9546, "num_input_tokens_seen": 58032316, "step": 1131 }, { "epoch": 0.11144689319672735, "grad_norm": 134.00440979003906, "learning_rate": 8e-05, "loss": 84.8564, "num_input_tokens_seen": 58185864, "step": 1134 }, { "epoch": 0.11174172624751234, "grad_norm": 61.93877410888672, "learning_rate": 8e-05, "loss": 86.4746, "num_input_tokens_seen": 58357596, "step": 1137 }, { "epoch": 0.11203655929829734, "grad_norm": 39.10196304321289, "learning_rate": 8e-05, "loss": 86.2628, "num_input_tokens_seen": 58517260, "step": 1140 }, { "epoch": 0.11233139234908233, "grad_norm": 35.11958694458008, "learning_rate": 8e-05, "loss": 84.882, "num_input_tokens_seen": 58670664, "step": 1143 }, { "epoch": 0.11262622539986733, "grad_norm": 156.38101196289062, "learning_rate": 8e-05, "loss": 85.4139, "num_input_tokens_seen": 58826068, "step": 1146 }, { "epoch": 0.11292105845065231, "grad_norm": 37.681278228759766, "learning_rate": 8e-05, "loss": 85.1363, "num_input_tokens_seen": 58989632, "step": 1149 }, { "epoch": 0.11321589150143731, "grad_norm": 34.654964447021484, "learning_rate": 8e-05, "loss": 85.26, "num_input_tokens_seen": 59149176, "step": 1152 }, { "epoch": 0.1135107245522223, "grad_norm": 34.971920013427734, "learning_rate": 8e-05, "loss": 83.7622, "num_input_tokens_seen": 59307864, "step": 1155 }, { "epoch": 0.1138055576030073, "grad_norm": 36.55862808227539, "learning_rate": 8e-05, "loss": 86.1476, "num_input_tokens_seen": 59477476, "step": 1158 }, { "epoch": 0.1141003906537923, "grad_norm": 37.583221435546875, "learning_rate": 8e-05, "loss": 85.1669, "num_input_tokens_seen": 59619472, "step": 1161 }, { "epoch": 0.11439522370457729, "grad_norm": 36.37376022338867, "learning_rate": 8e-05, "loss": 84.7555, "num_input_tokens_seen": 59786124, "step": 1164 }, { "epoch": 0.11469005675536227, "grad_norm": 36.52484893798828, "learning_rate": 8e-05, "loss": 85.2837, "num_input_tokens_seen": 59946560, "step": 1167 }, { "epoch": 0.11498488980614727, "grad_norm": 33.52820587158203, "learning_rate": 8e-05, "loss": 83.8732, "num_input_tokens_seen": 60107548, "step": 1170 }, { "epoch": 0.11527972285693226, "grad_norm": 32.112953186035156, "learning_rate": 8e-05, "loss": 83.5283, "num_input_tokens_seen": 60265444, "step": 1173 }, { "epoch": 0.11557455590771726, "grad_norm": 35.48290252685547, "learning_rate": 8e-05, "loss": 82.8051, "num_input_tokens_seen": 60426424, "step": 1176 }, { "epoch": 0.11586938895850225, "grad_norm": 37.165374755859375, "learning_rate": 8e-05, "loss": 84.2286, "num_input_tokens_seen": 60559724, "step": 1179 }, { "epoch": 0.11616422200928724, "grad_norm": 38.65311050415039, "learning_rate": 8e-05, "loss": 84.465, "num_input_tokens_seen": 60704680, "step": 1182 }, { "epoch": 0.11645905506007223, "grad_norm": 33.537418365478516, "learning_rate": 8e-05, "loss": 85.1183, "num_input_tokens_seen": 60866020, "step": 1185 }, { "epoch": 0.11675388811085723, "grad_norm": 36.444644927978516, "learning_rate": 8e-05, "loss": 85.455, "num_input_tokens_seen": 61030532, "step": 1188 }, { "epoch": 0.11704872116164222, "grad_norm": 39.73960876464844, "learning_rate": 8e-05, "loss": 84.1651, "num_input_tokens_seen": 61180192, "step": 1191 }, { "epoch": 0.11734355421242722, "grad_norm": 210.22747802734375, "learning_rate": 8e-05, "loss": 82.5961, "num_input_tokens_seen": 61319688, "step": 1194 }, { "epoch": 0.11763838726321221, "grad_norm": 35.85403060913086, "learning_rate": 8e-05, "loss": 86.2338, "num_input_tokens_seen": 61451216, "step": 1197 }, { "epoch": 0.1179332203139972, "grad_norm": 38.668426513671875, "learning_rate": 8e-05, "loss": 84.9885, "num_input_tokens_seen": 61614408, "step": 1200 }, { "epoch": 0.11822805336478219, "grad_norm": 37.449241638183594, "learning_rate": 8e-05, "loss": 84.2673, "num_input_tokens_seen": 61777484, "step": 1203 }, { "epoch": 0.11852288641556719, "grad_norm": 37.551456451416016, "learning_rate": 8e-05, "loss": 83.5775, "num_input_tokens_seen": 61925060, "step": 1206 }, { "epoch": 0.11881771946635218, "grad_norm": 37.979461669921875, "learning_rate": 8e-05, "loss": 85.261, "num_input_tokens_seen": 62069836, "step": 1209 }, { "epoch": 0.11911255251713718, "grad_norm": 41.7076530456543, "learning_rate": 8e-05, "loss": 82.3804, "num_input_tokens_seen": 62228640, "step": 1212 }, { "epoch": 0.11940738556792216, "grad_norm": 46.70987319946289, "learning_rate": 8e-05, "loss": 85.4103, "num_input_tokens_seen": 62379892, "step": 1215 }, { "epoch": 0.11970221861870715, "grad_norm": 40.67140579223633, "learning_rate": 8e-05, "loss": 84.2097, "num_input_tokens_seen": 62518936, "step": 1218 }, { "epoch": 0.11999705166949215, "grad_norm": 46.623600006103516, "learning_rate": 8e-05, "loss": 84.5125, "num_input_tokens_seen": 62695380, "step": 1221 }, { "epoch": 0.12029188472027715, "grad_norm": 36.65542984008789, "learning_rate": 8e-05, "loss": 80.6407, "num_input_tokens_seen": 62862792, "step": 1224 }, { "epoch": 0.12058671777106214, "grad_norm": 42.10529327392578, "learning_rate": 8e-05, "loss": 83.1043, "num_input_tokens_seen": 63016620, "step": 1227 }, { "epoch": 0.12088155082184712, "grad_norm": 44.04954528808594, "learning_rate": 8e-05, "loss": 82.43, "num_input_tokens_seen": 63155064, "step": 1230 }, { "epoch": 0.12117638387263212, "grad_norm": 39.24373245239258, "learning_rate": 8e-05, "loss": 83.1794, "num_input_tokens_seen": 63309192, "step": 1233 }, { "epoch": 0.12147121692341711, "grad_norm": 34.62562942504883, "learning_rate": 8e-05, "loss": 83.2758, "num_input_tokens_seen": 63476200, "step": 1236 }, { "epoch": 0.12176604997420211, "grad_norm": 40.90768051147461, "learning_rate": 8e-05, "loss": 82.4989, "num_input_tokens_seen": 63631280, "step": 1239 }, { "epoch": 0.1220608830249871, "grad_norm": 34.59130096435547, "learning_rate": 8e-05, "loss": 81.9263, "num_input_tokens_seen": 63782788, "step": 1242 }, { "epoch": 0.1223557160757721, "grad_norm": 39.34327697753906, "learning_rate": 8e-05, "loss": 85.3184, "num_input_tokens_seen": 63945800, "step": 1245 }, { "epoch": 0.12265054912655708, "grad_norm": 49.83769607543945, "learning_rate": 8e-05, "loss": 81.4915, "num_input_tokens_seen": 64094332, "step": 1248 }, { "epoch": 0.12294538217734208, "grad_norm": 44.31386947631836, "learning_rate": 8e-05, "loss": 81.2687, "num_input_tokens_seen": 64267812, "step": 1251 }, { "epoch": 0.12324021522812707, "grad_norm": 32.71273422241211, "learning_rate": 8e-05, "loss": 80.4577, "num_input_tokens_seen": 64413464, "step": 1254 }, { "epoch": 0.12353504827891207, "grad_norm": 34.991573333740234, "learning_rate": 8e-05, "loss": 81.0808, "num_input_tokens_seen": 64564716, "step": 1257 }, { "epoch": 0.12382988132969706, "grad_norm": 110.09813690185547, "learning_rate": 8e-05, "loss": 82.0606, "num_input_tokens_seen": 64723184, "step": 1260 }, { "epoch": 0.12412471438048205, "grad_norm": 37.52562713623047, "learning_rate": 8e-05, "loss": 79.5995, "num_input_tokens_seen": 64871304, "step": 1263 }, { "epoch": 0.12441954743126704, "grad_norm": 33.689971923828125, "learning_rate": 8e-05, "loss": 80.8298, "num_input_tokens_seen": 65027664, "step": 1266 }, { "epoch": 0.12471438048205204, "grad_norm": 35.41553497314453, "learning_rate": 8e-05, "loss": 79.461, "num_input_tokens_seen": 65172948, "step": 1269 }, { "epoch": 0.12500921353283703, "grad_norm": 36.45944595336914, "learning_rate": 8e-05, "loss": 80.9711, "num_input_tokens_seen": 65330620, "step": 1272 }, { "epoch": 0.12530404658362201, "grad_norm": 34.718894958496094, "learning_rate": 8e-05, "loss": 82.0436, "num_input_tokens_seen": 65508384, "step": 1275 }, { "epoch": 0.12559887963440702, "grad_norm": 43.59950256347656, "learning_rate": 8e-05, "loss": 81.6451, "num_input_tokens_seen": 65645948, "step": 1278 }, { "epoch": 0.125893712685192, "grad_norm": 37.226043701171875, "learning_rate": 8e-05, "loss": 79.2421, "num_input_tokens_seen": 65787648, "step": 1281 }, { "epoch": 0.12618854573597701, "grad_norm": 43.468685150146484, "learning_rate": 8e-05, "loss": 80.5175, "num_input_tokens_seen": 65953232, "step": 1284 }, { "epoch": 0.126483378786762, "grad_norm": 32.858699798583984, "learning_rate": 8e-05, "loss": 79.5494, "num_input_tokens_seen": 66116840, "step": 1287 }, { "epoch": 0.12677821183754698, "grad_norm": 234.20143127441406, "learning_rate": 8e-05, "loss": 79.8303, "num_input_tokens_seen": 66260528, "step": 1290 }, { "epoch": 0.127073044888332, "grad_norm": 50.944881439208984, "learning_rate": 8e-05, "loss": 82.267, "num_input_tokens_seen": 66437188, "step": 1293 }, { "epoch": 0.12736787793911697, "grad_norm": 37.243194580078125, "learning_rate": 8e-05, "loss": 80.2612, "num_input_tokens_seen": 66609192, "step": 1296 }, { "epoch": 0.12766271098990198, "grad_norm": 35.89582824707031, "learning_rate": 8e-05, "loss": 81.5992, "num_input_tokens_seen": 66774100, "step": 1299 }, { "epoch": 0.12795754404068696, "grad_norm": 32.087738037109375, "learning_rate": 8e-05, "loss": 79.4512, "num_input_tokens_seen": 66912696, "step": 1302 }, { "epoch": 0.12825237709147194, "grad_norm": 36.44207000732422, "learning_rate": 8e-05, "loss": 81.6661, "num_input_tokens_seen": 67073876, "step": 1305 }, { "epoch": 0.12854721014225695, "grad_norm": 36.4789924621582, "learning_rate": 8e-05, "loss": 81.5013, "num_input_tokens_seen": 67234592, "step": 1308 }, { "epoch": 0.12884204319304193, "grad_norm": 39.537139892578125, "learning_rate": 8e-05, "loss": 82.2263, "num_input_tokens_seen": 67389940, "step": 1311 }, { "epoch": 0.12913687624382694, "grad_norm": 36.199737548828125, "learning_rate": 8e-05, "loss": 81.6914, "num_input_tokens_seen": 67542840, "step": 1314 }, { "epoch": 0.12943170929461192, "grad_norm": 34.60406494140625, "learning_rate": 8e-05, "loss": 80.2577, "num_input_tokens_seen": 67690332, "step": 1317 }, { "epoch": 0.1297265423453969, "grad_norm": 39.24729537963867, "learning_rate": 8e-05, "loss": 79.3764, "num_input_tokens_seen": 67839352, "step": 1320 }, { "epoch": 0.13002137539618192, "grad_norm": 35.990760803222656, "learning_rate": 8e-05, "loss": 77.4493, "num_input_tokens_seen": 67981068, "step": 1323 }, { "epoch": 0.1303162084469669, "grad_norm": 37.870235443115234, "learning_rate": 8e-05, "loss": 81.2442, "num_input_tokens_seen": 68133412, "step": 1326 }, { "epoch": 0.1306110414977519, "grad_norm": 37.46533966064453, "learning_rate": 8e-05, "loss": 80.1867, "num_input_tokens_seen": 68287496, "step": 1329 }, { "epoch": 0.1309058745485369, "grad_norm": 39.02689743041992, "learning_rate": 8e-05, "loss": 78.4787, "num_input_tokens_seen": 68451852, "step": 1332 }, { "epoch": 0.1312007075993219, "grad_norm": 34.74725341796875, "learning_rate": 8e-05, "loss": 76.681, "num_input_tokens_seen": 68595968, "step": 1335 }, { "epoch": 0.13149554065010688, "grad_norm": 35.77785873413086, "learning_rate": 8e-05, "loss": 80.0318, "num_input_tokens_seen": 68755292, "step": 1338 }, { "epoch": 0.13179037370089186, "grad_norm": 36.45845413208008, "learning_rate": 8e-05, "loss": 77.0289, "num_input_tokens_seen": 68906272, "step": 1341 }, { "epoch": 0.13208520675167687, "grad_norm": 36.09657287597656, "learning_rate": 8e-05, "loss": 78.4508, "num_input_tokens_seen": 69039068, "step": 1344 }, { "epoch": 0.13238003980246185, "grad_norm": 35.88303756713867, "learning_rate": 8e-05, "loss": 78.0555, "num_input_tokens_seen": 69180160, "step": 1347 }, { "epoch": 0.13267487285324686, "grad_norm": 38.69503402709961, "learning_rate": 8e-05, "loss": 80.4413, "num_input_tokens_seen": 69344756, "step": 1350 }, { "epoch": 0.13296970590403184, "grad_norm": 35.21122360229492, "learning_rate": 8e-05, "loss": 78.5331, "num_input_tokens_seen": 69504472, "step": 1353 }, { "epoch": 0.13326453895481682, "grad_norm": 46.11822509765625, "learning_rate": 8e-05, "loss": 78.9917, "num_input_tokens_seen": 69656792, "step": 1356 }, { "epoch": 0.13355937200560183, "grad_norm": 36.190093994140625, "learning_rate": 8e-05, "loss": 79.9646, "num_input_tokens_seen": 69824388, "step": 1359 }, { "epoch": 0.13385420505638682, "grad_norm": 38.50130081176758, "learning_rate": 8e-05, "loss": 78.924, "num_input_tokens_seen": 69978636, "step": 1362 }, { "epoch": 0.13414903810717183, "grad_norm": 65.22470092773438, "learning_rate": 8e-05, "loss": 75.8164, "num_input_tokens_seen": 70126680, "step": 1365 }, { "epoch": 0.1344438711579568, "grad_norm": 60.97062683105469, "learning_rate": 8e-05, "loss": 77.5916, "num_input_tokens_seen": 70258788, "step": 1368 }, { "epoch": 0.1347387042087418, "grad_norm": 38.67707824707031, "learning_rate": 8e-05, "loss": 79.541, "num_input_tokens_seen": 70428536, "step": 1371 }, { "epoch": 0.1350335372595268, "grad_norm": 35.03982925415039, "learning_rate": 8e-05, "loss": 78.4784, "num_input_tokens_seen": 70595776, "step": 1374 }, { "epoch": 0.13532837031031178, "grad_norm": 38.07881546020508, "learning_rate": 8e-05, "loss": 77.7045, "num_input_tokens_seen": 70746704, "step": 1377 }, { "epoch": 0.1356232033610968, "grad_norm": 38.27223587036133, "learning_rate": 8e-05, "loss": 78.7978, "num_input_tokens_seen": 70918160, "step": 1380 }, { "epoch": 0.13591803641188177, "grad_norm": 38.4968376159668, "learning_rate": 8e-05, "loss": 77.204, "num_input_tokens_seen": 71067852, "step": 1383 }, { "epoch": 0.13621286946266675, "grad_norm": 57.48464584350586, "learning_rate": 8e-05, "loss": 74.9822, "num_input_tokens_seen": 71221176, "step": 1386 }, { "epoch": 0.13650770251345176, "grad_norm": 34.94679260253906, "learning_rate": 8e-05, "loss": 76.3158, "num_input_tokens_seen": 71359204, "step": 1389 }, { "epoch": 0.13680253556423674, "grad_norm": 36.15276336669922, "learning_rate": 8e-05, "loss": 76.0528, "num_input_tokens_seen": 71532088, "step": 1392 }, { "epoch": 0.13709736861502175, "grad_norm": 35.722801208496094, "learning_rate": 8e-05, "loss": 79.0203, "num_input_tokens_seen": 71692128, "step": 1395 }, { "epoch": 0.13739220166580673, "grad_norm": 32.96024703979492, "learning_rate": 8e-05, "loss": 78.2764, "num_input_tokens_seen": 71855924, "step": 1398 }, { "epoch": 0.13768703471659172, "grad_norm": 36.010257720947266, "learning_rate": 8e-05, "loss": 77.3026, "num_input_tokens_seen": 72011396, "step": 1401 }, { "epoch": 0.13798186776737673, "grad_norm": 51.21072769165039, "learning_rate": 8e-05, "loss": 77.9927, "num_input_tokens_seen": 72159264, "step": 1404 }, { "epoch": 0.1382767008181617, "grad_norm": 40.23877716064453, "learning_rate": 8e-05, "loss": 77.9101, "num_input_tokens_seen": 72320092, "step": 1407 }, { "epoch": 0.13857153386894672, "grad_norm": 41.20027160644531, "learning_rate": 8e-05, "loss": 74.7435, "num_input_tokens_seen": 72464520, "step": 1410 }, { "epoch": 0.1388663669197317, "grad_norm": 38.02928924560547, "learning_rate": 8e-05, "loss": 75.4835, "num_input_tokens_seen": 72614312, "step": 1413 }, { "epoch": 0.1391611999705167, "grad_norm": 34.233665466308594, "learning_rate": 8e-05, "loss": 76.4433, "num_input_tokens_seen": 72769836, "step": 1416 }, { "epoch": 0.1394560330213017, "grad_norm": 31.657432556152344, "learning_rate": 8e-05, "loss": 75.8057, "num_input_tokens_seen": 72931408, "step": 1419 }, { "epoch": 0.13975086607208667, "grad_norm": 36.81168746948242, "learning_rate": 8e-05, "loss": 75.7319, "num_input_tokens_seen": 73104460, "step": 1422 }, { "epoch": 0.14004569912287168, "grad_norm": 7573.0126953125, "learning_rate": 8e-05, "loss": 76.7549, "num_input_tokens_seen": 73260520, "step": 1425 }, { "epoch": 0.14034053217365666, "grad_norm": 38.15385437011719, "learning_rate": 8e-05, "loss": 76.9857, "num_input_tokens_seen": 73405808, "step": 1428 }, { "epoch": 0.14063536522444167, "grad_norm": 44.962528228759766, "learning_rate": 8e-05, "loss": 75.2264, "num_input_tokens_seen": 73573148, "step": 1431 }, { "epoch": 0.14093019827522665, "grad_norm": 41.70336151123047, "learning_rate": 8e-05, "loss": 74.1684, "num_input_tokens_seen": 73725792, "step": 1434 }, { "epoch": 0.14122503132601164, "grad_norm": 33.183799743652344, "learning_rate": 8e-05, "loss": 77.1766, "num_input_tokens_seen": 73892364, "step": 1437 }, { "epoch": 0.14151986437679664, "grad_norm": 39.463348388671875, "learning_rate": 8e-05, "loss": 76.8123, "num_input_tokens_seen": 74051972, "step": 1440 }, { "epoch": 0.14181469742758163, "grad_norm": 34.82969665527344, "learning_rate": 8e-05, "loss": 76.6535, "num_input_tokens_seen": 74220148, "step": 1443 }, { "epoch": 0.14210953047836664, "grad_norm": 35.903076171875, "learning_rate": 8e-05, "loss": 75.3105, "num_input_tokens_seen": 74405764, "step": 1446 }, { "epoch": 0.14240436352915162, "grad_norm": 75.70014953613281, "learning_rate": 8e-05, "loss": 75.8482, "num_input_tokens_seen": 74562012, "step": 1449 }, { "epoch": 0.1426991965799366, "grad_norm": 36.09914016723633, "learning_rate": 8e-05, "loss": 73.2226, "num_input_tokens_seen": 74705004, "step": 1452 }, { "epoch": 0.1429940296307216, "grad_norm": 35.636756896972656, "learning_rate": 8e-05, "loss": 78.9578, "num_input_tokens_seen": 74886308, "step": 1455 }, { "epoch": 0.1432888626815066, "grad_norm": 33.12376403808594, "learning_rate": 8e-05, "loss": 76.2449, "num_input_tokens_seen": 75059784, "step": 1458 }, { "epoch": 0.1435836957322916, "grad_norm": 34.66209411621094, "learning_rate": 8e-05, "loss": 74.7041, "num_input_tokens_seen": 75210648, "step": 1461 }, { "epoch": 0.14387852878307658, "grad_norm": 36.110801696777344, "learning_rate": 8e-05, "loss": 74.4723, "num_input_tokens_seen": 75367344, "step": 1464 }, { "epoch": 0.14417336183386156, "grad_norm": 41.02329635620117, "learning_rate": 8e-05, "loss": 74.6284, "num_input_tokens_seen": 75525132, "step": 1467 }, { "epoch": 0.14446819488464657, "grad_norm": 51.964595794677734, "learning_rate": 8e-05, "loss": 76.0471, "num_input_tokens_seen": 75672700, "step": 1470 }, { "epoch": 0.14476302793543155, "grad_norm": 37.155418395996094, "learning_rate": 8e-05, "loss": 77.9106, "num_input_tokens_seen": 75822896, "step": 1473 }, { "epoch": 0.14505786098621656, "grad_norm": 39.485267639160156, "learning_rate": 8e-05, "loss": 76.1002, "num_input_tokens_seen": 75990008, "step": 1476 }, { "epoch": 0.14535269403700155, "grad_norm": 34.41350555419922, "learning_rate": 8e-05, "loss": 76.0894, "num_input_tokens_seen": 76149960, "step": 1479 }, { "epoch": 0.14564752708778655, "grad_norm": 38.971229553222656, "learning_rate": 8e-05, "loss": 75.9462, "num_input_tokens_seen": 76327592, "step": 1482 }, { "epoch": 0.14594236013857154, "grad_norm": 37.98147964477539, "learning_rate": 8e-05, "loss": 73.715, "num_input_tokens_seen": 76479420, "step": 1485 }, { "epoch": 0.14623719318935652, "grad_norm": 43.49007034301758, "learning_rate": 8e-05, "loss": 74.4515, "num_input_tokens_seen": 76630452, "step": 1488 }, { "epoch": 0.14653202624014153, "grad_norm": 35.00507736206055, "learning_rate": 8e-05, "loss": 74.7782, "num_input_tokens_seen": 76781236, "step": 1491 }, { "epoch": 0.1468268592909265, "grad_norm": 34.62977981567383, "learning_rate": 8e-05, "loss": 75.692, "num_input_tokens_seen": 76928432, "step": 1494 }, { "epoch": 0.14712169234171152, "grad_norm": 43.507266998291016, "learning_rate": 8e-05, "loss": 75.112, "num_input_tokens_seen": 77074108, "step": 1497 }, { "epoch": 0.1474165253924965, "grad_norm": 38.37984085083008, "learning_rate": 8e-05, "loss": 73.1151, "num_input_tokens_seen": 77231832, "step": 1500 }, { "epoch": 0.14771135844328148, "grad_norm": 38.74737548828125, "learning_rate": 8e-05, "loss": 76.9576, "num_input_tokens_seen": 77393456, "step": 1503 }, { "epoch": 0.1480061914940665, "grad_norm": 36.35774230957031, "learning_rate": 8e-05, "loss": 77.1779, "num_input_tokens_seen": 77536392, "step": 1506 }, { "epoch": 0.14830102454485147, "grad_norm": 30.660966873168945, "learning_rate": 8e-05, "loss": 72.1792, "num_input_tokens_seen": 77689848, "step": 1509 }, { "epoch": 0.14859585759563648, "grad_norm": 30.7512149810791, "learning_rate": 8e-05, "loss": 72.6574, "num_input_tokens_seen": 77845968, "step": 1512 }, { "epoch": 0.14889069064642146, "grad_norm": 35.85249710083008, "learning_rate": 8e-05, "loss": 74.7064, "num_input_tokens_seen": 78000624, "step": 1515 }, { "epoch": 0.14918552369720645, "grad_norm": 35.147151947021484, "learning_rate": 8e-05, "loss": 72.1662, "num_input_tokens_seen": 78149056, "step": 1518 }, { "epoch": 0.14948035674799146, "grad_norm": 36.17216873168945, "learning_rate": 8e-05, "loss": 73.4778, "num_input_tokens_seen": 78288020, "step": 1521 }, { "epoch": 0.14977518979877644, "grad_norm": 36.134586334228516, "learning_rate": 8e-05, "loss": 74.2239, "num_input_tokens_seen": 78444656, "step": 1524 }, { "epoch": 0.15007002284956145, "grad_norm": 35.32678985595703, "learning_rate": 8e-05, "loss": 75.2825, "num_input_tokens_seen": 78581236, "step": 1527 }, { "epoch": 0.15036485590034643, "grad_norm": 34.928157806396484, "learning_rate": 8e-05, "loss": 71.9196, "num_input_tokens_seen": 78723900, "step": 1530 }, { "epoch": 0.1506596889511314, "grad_norm": 32.27571487426758, "learning_rate": 8e-05, "loss": 74.4344, "num_input_tokens_seen": 78874436, "step": 1533 }, { "epoch": 0.15095452200191642, "grad_norm": 39.34345245361328, "learning_rate": 8e-05, "loss": 72.6868, "num_input_tokens_seen": 79031824, "step": 1536 }, { "epoch": 0.1512493550527014, "grad_norm": 37.49494934082031, "learning_rate": 8e-05, "loss": 73.9715, "num_input_tokens_seen": 79181500, "step": 1539 }, { "epoch": 0.1515441881034864, "grad_norm": 41.84995651245117, "learning_rate": 8e-05, "loss": 74.3241, "num_input_tokens_seen": 79335788, "step": 1542 }, { "epoch": 0.1518390211542714, "grad_norm": 35.9151725769043, "learning_rate": 8e-05, "loss": 74.5754, "num_input_tokens_seen": 79500936, "step": 1545 }, { "epoch": 0.15213385420505637, "grad_norm": 34.1334342956543, "learning_rate": 8e-05, "loss": 71.089, "num_input_tokens_seen": 79672776, "step": 1548 }, { "epoch": 0.15242868725584138, "grad_norm": 42.203880310058594, "learning_rate": 8e-05, "loss": 71.2149, "num_input_tokens_seen": 79831716, "step": 1551 }, { "epoch": 0.15272352030662636, "grad_norm": 38.425052642822266, "learning_rate": 8e-05, "loss": 71.9566, "num_input_tokens_seen": 79988016, "step": 1554 }, { "epoch": 0.15301835335741137, "grad_norm": 39.873870849609375, "learning_rate": 8e-05, "loss": 72.8495, "num_input_tokens_seen": 80144084, "step": 1557 }, { "epoch": 0.15331318640819636, "grad_norm": 35.926002502441406, "learning_rate": 8e-05, "loss": 72.3613, "num_input_tokens_seen": 80267444, "step": 1560 }, { "epoch": 0.15360801945898137, "grad_norm": 38.23421096801758, "learning_rate": 8e-05, "loss": 74.279, "num_input_tokens_seen": 80446852, "step": 1563 }, { "epoch": 0.15390285250976635, "grad_norm": 36.72174072265625, "learning_rate": 8e-05, "loss": 72.2801, "num_input_tokens_seen": 80624428, "step": 1566 }, { "epoch": 0.15419768556055133, "grad_norm": 38.56344985961914, "learning_rate": 8e-05, "loss": 73.2882, "num_input_tokens_seen": 80781668, "step": 1569 }, { "epoch": 0.15449251861133634, "grad_norm": 35.1093864440918, "learning_rate": 8e-05, "loss": 72.6963, "num_input_tokens_seen": 80934068, "step": 1572 }, { "epoch": 0.15478735166212132, "grad_norm": 35.02631378173828, "learning_rate": 8e-05, "loss": 72.4826, "num_input_tokens_seen": 81103204, "step": 1575 }, { "epoch": 0.15508218471290633, "grad_norm": 39.04288101196289, "learning_rate": 8e-05, "loss": 71.8602, "num_input_tokens_seen": 81257460, "step": 1578 }, { "epoch": 0.1553770177636913, "grad_norm": 34.65717697143555, "learning_rate": 8e-05, "loss": 71.3285, "num_input_tokens_seen": 81408692, "step": 1581 }, { "epoch": 0.1556718508144763, "grad_norm": 33.63228988647461, "learning_rate": 8e-05, "loss": 72.5402, "num_input_tokens_seen": 81561644, "step": 1584 }, { "epoch": 0.1559666838652613, "grad_norm": 34.98646545410156, "learning_rate": 8e-05, "loss": 72.3205, "num_input_tokens_seen": 81714672, "step": 1587 }, { "epoch": 0.15626151691604628, "grad_norm": 34.4061279296875, "learning_rate": 8e-05, "loss": 70.3614, "num_input_tokens_seen": 81869272, "step": 1590 }, { "epoch": 0.1565563499668313, "grad_norm": 33.87858200073242, "learning_rate": 8e-05, "loss": 69.5967, "num_input_tokens_seen": 82036368, "step": 1593 }, { "epoch": 0.15685118301761627, "grad_norm": 42.29884338378906, "learning_rate": 8e-05, "loss": 74.2783, "num_input_tokens_seen": 82182920, "step": 1596 }, { "epoch": 0.15714601606840126, "grad_norm": 49.82851791381836, "learning_rate": 8e-05, "loss": 70.0678, "num_input_tokens_seen": 82345176, "step": 1599 }, { "epoch": 0.15744084911918627, "grad_norm": 34.39692306518555, "learning_rate": 8e-05, "loss": 71.5913, "num_input_tokens_seen": 82512116, "step": 1602 }, { "epoch": 0.15773568216997125, "grad_norm": 35.24238204956055, "learning_rate": 8e-05, "loss": 72.218, "num_input_tokens_seen": 82679872, "step": 1605 }, { "epoch": 0.15803051522075626, "grad_norm": 33.634464263916016, "learning_rate": 8e-05, "loss": 72.6587, "num_input_tokens_seen": 82819184, "step": 1608 }, { "epoch": 0.15832534827154124, "grad_norm": 34.30915832519531, "learning_rate": 8e-05, "loss": 71.4704, "num_input_tokens_seen": 82982680, "step": 1611 }, { "epoch": 0.15862018132232622, "grad_norm": 40.74231719970703, "learning_rate": 8e-05, "loss": 70.3003, "num_input_tokens_seen": 83135768, "step": 1614 }, { "epoch": 0.15891501437311123, "grad_norm": 55.77992630004883, "learning_rate": 8e-05, "loss": 71.88, "num_input_tokens_seen": 83308712, "step": 1617 }, { "epoch": 0.1592098474238962, "grad_norm": 39.83135223388672, "learning_rate": 8e-05, "loss": 71.1506, "num_input_tokens_seen": 83483420, "step": 1620 }, { "epoch": 0.15950468047468122, "grad_norm": 34.45026779174805, "learning_rate": 8e-05, "loss": 73.5146, "num_input_tokens_seen": 83641592, "step": 1623 }, { "epoch": 0.1597995135254662, "grad_norm": 42.277793884277344, "learning_rate": 8e-05, "loss": 75.9387, "num_input_tokens_seen": 83804164, "step": 1626 }, { "epoch": 0.16009434657625118, "grad_norm": 39.62792205810547, "learning_rate": 8e-05, "loss": 71.2239, "num_input_tokens_seen": 83966016, "step": 1629 }, { "epoch": 0.1603891796270362, "grad_norm": 49.15724182128906, "learning_rate": 8e-05, "loss": 70.4015, "num_input_tokens_seen": 84129672, "step": 1632 }, { "epoch": 0.16068401267782118, "grad_norm": 38.876102447509766, "learning_rate": 8e-05, "loss": 70.0127, "num_input_tokens_seen": 84278492, "step": 1635 }, { "epoch": 0.16097884572860618, "grad_norm": 34.91112518310547, "learning_rate": 8e-05, "loss": 72.3853, "num_input_tokens_seen": 84439544, "step": 1638 }, { "epoch": 0.16127367877939117, "grad_norm": 35.54895782470703, "learning_rate": 8e-05, "loss": 69.1896, "num_input_tokens_seen": 84600056, "step": 1641 }, { "epoch": 0.16156851183017618, "grad_norm": 39.978538513183594, "learning_rate": 8e-05, "loss": 70.4133, "num_input_tokens_seen": 84750168, "step": 1644 }, { "epoch": 0.16186334488096116, "grad_norm": 33.363372802734375, "learning_rate": 8e-05, "loss": 71.4283, "num_input_tokens_seen": 84899072, "step": 1647 }, { "epoch": 0.16215817793174614, "grad_norm": 33.02912521362305, "learning_rate": 8e-05, "loss": 70.59, "num_input_tokens_seen": 85040176, "step": 1650 }, { "epoch": 0.16245301098253115, "grad_norm": 34.67600631713867, "learning_rate": 8e-05, "loss": 72.9926, "num_input_tokens_seen": 85202388, "step": 1653 }, { "epoch": 0.16274784403331613, "grad_norm": 88.48844909667969, "learning_rate": 8e-05, "loss": 71.2351, "num_input_tokens_seen": 85369400, "step": 1656 }, { "epoch": 0.16304267708410114, "grad_norm": 38.78783416748047, "learning_rate": 8e-05, "loss": 67.4387, "num_input_tokens_seen": 85535784, "step": 1659 }, { "epoch": 0.16333751013488612, "grad_norm": 169.82952880859375, "learning_rate": 8e-05, "loss": 69.9607, "num_input_tokens_seen": 85706820, "step": 1662 }, { "epoch": 0.1636323431856711, "grad_norm": 40.37202453613281, "learning_rate": 8e-05, "loss": 71.8307, "num_input_tokens_seen": 85849804, "step": 1665 }, { "epoch": 0.1639271762364561, "grad_norm": 67.48583984375, "learning_rate": 8e-05, "loss": 69.4109, "num_input_tokens_seen": 85998468, "step": 1668 }, { "epoch": 0.1642220092872411, "grad_norm": 35.94486999511719, "learning_rate": 8e-05, "loss": 70.1855, "num_input_tokens_seen": 86158700, "step": 1671 }, { "epoch": 0.1645168423380261, "grad_norm": 49.25463104248047, "learning_rate": 8e-05, "loss": 69.7001, "num_input_tokens_seen": 86312436, "step": 1674 }, { "epoch": 0.16481167538881109, "grad_norm": 35.989192962646484, "learning_rate": 8e-05, "loss": 70.2536, "num_input_tokens_seen": 86492368, "step": 1677 }, { "epoch": 0.16510650843959607, "grad_norm": 34.7452392578125, "learning_rate": 8e-05, "loss": 70.5204, "num_input_tokens_seen": 86620228, "step": 1680 }, { "epoch": 0.16540134149038108, "grad_norm": 36.26546096801758, "learning_rate": 8e-05, "loss": 69.9993, "num_input_tokens_seen": 86761548, "step": 1683 }, { "epoch": 0.16569617454116606, "grad_norm": 73.40389251708984, "learning_rate": 8e-05, "loss": 71.2321, "num_input_tokens_seen": 86896420, "step": 1686 }, { "epoch": 0.16599100759195107, "grad_norm": 37.27740478515625, "learning_rate": 8e-05, "loss": 71.3492, "num_input_tokens_seen": 87064216, "step": 1689 }, { "epoch": 0.16628584064273605, "grad_norm": 43.147308349609375, "learning_rate": 8e-05, "loss": 70.2469, "num_input_tokens_seen": 87196748, "step": 1692 }, { "epoch": 0.16658067369352103, "grad_norm": 48.543495178222656, "learning_rate": 8e-05, "loss": 69.6811, "num_input_tokens_seen": 87350500, "step": 1695 }, { "epoch": 0.16687550674430604, "grad_norm": 41.09067153930664, "learning_rate": 8e-05, "loss": 68.8399, "num_input_tokens_seen": 87498152, "step": 1698 }, { "epoch": 0.16717033979509102, "grad_norm": 41.002784729003906, "learning_rate": 8e-05, "loss": 70.0191, "num_input_tokens_seen": 87657492, "step": 1701 }, { "epoch": 0.16746517284587603, "grad_norm": 42.89789581298828, "learning_rate": 8e-05, "loss": 67.6702, "num_input_tokens_seen": 87799916, "step": 1704 }, { "epoch": 0.167760005896661, "grad_norm": 38.87138366699219, "learning_rate": 8e-05, "loss": 68.2145, "num_input_tokens_seen": 87935648, "step": 1707 }, { "epoch": 0.16805483894744602, "grad_norm": 256.7908630371094, "learning_rate": 8e-05, "loss": 68.1492, "num_input_tokens_seen": 88081728, "step": 1710 }, { "epoch": 0.168349671998231, "grad_norm": 37.66705322265625, "learning_rate": 8e-05, "loss": 67.9451, "num_input_tokens_seen": 88240552, "step": 1713 }, { "epoch": 0.16864450504901599, "grad_norm": 33.00398635864258, "learning_rate": 8e-05, "loss": 72.0741, "num_input_tokens_seen": 88397404, "step": 1716 }, { "epoch": 0.168939338099801, "grad_norm": 32.67354965209961, "learning_rate": 8e-05, "loss": 67.1234, "num_input_tokens_seen": 88559332, "step": 1719 }, { "epoch": 0.16923417115058598, "grad_norm": 37.69291687011719, "learning_rate": 8e-05, "loss": 68.6124, "num_input_tokens_seen": 88707332, "step": 1722 }, { "epoch": 0.169529004201371, "grad_norm": 34.9688720703125, "learning_rate": 8e-05, "loss": 64.365, "num_input_tokens_seen": 88830788, "step": 1725 }, { "epoch": 0.16982383725215597, "grad_norm": 38.50293731689453, "learning_rate": 8e-05, "loss": 67.384, "num_input_tokens_seen": 88982944, "step": 1728 }, { "epoch": 0.17011867030294095, "grad_norm": 54.379638671875, "learning_rate": 8e-05, "loss": 69.4878, "num_input_tokens_seen": 89120944, "step": 1731 }, { "epoch": 0.17041350335372596, "grad_norm": 56.456138610839844, "learning_rate": 8e-05, "loss": 67.5283, "num_input_tokens_seen": 89290008, "step": 1734 }, { "epoch": 0.17070833640451094, "grad_norm": 34.861175537109375, "learning_rate": 8e-05, "loss": 68.7829, "num_input_tokens_seen": 89442452, "step": 1737 }, { "epoch": 0.17100316945529595, "grad_norm": 35.11691665649414, "learning_rate": 8e-05, "loss": 67.3596, "num_input_tokens_seen": 89587496, "step": 1740 }, { "epoch": 0.17129800250608093, "grad_norm": 43.15464782714844, "learning_rate": 8e-05, "loss": 66.5695, "num_input_tokens_seen": 89743128, "step": 1743 }, { "epoch": 0.1715928355568659, "grad_norm": 46.82964324951172, "learning_rate": 8e-05, "loss": 68.5125, "num_input_tokens_seen": 89900992, "step": 1746 }, { "epoch": 0.17188766860765092, "grad_norm": 38.2276725769043, "learning_rate": 8e-05, "loss": 68.039, "num_input_tokens_seen": 90059608, "step": 1749 }, { "epoch": 0.1721825016584359, "grad_norm": 39.11660385131836, "learning_rate": 8e-05, "loss": 69.9983, "num_input_tokens_seen": 90221080, "step": 1752 }, { "epoch": 0.17247733470922091, "grad_norm": 38.58439254760742, "learning_rate": 8e-05, "loss": 69.3442, "num_input_tokens_seen": 90376304, "step": 1755 }, { "epoch": 0.1727721677600059, "grad_norm": 36.8914680480957, "learning_rate": 8e-05, "loss": 66.3412, "num_input_tokens_seen": 90535060, "step": 1758 }, { "epoch": 0.17306700081079088, "grad_norm": 40.14888381958008, "learning_rate": 8e-05, "loss": 67.6869, "num_input_tokens_seen": 90717824, "step": 1761 }, { "epoch": 0.1733618338615759, "grad_norm": 34.87165451049805, "learning_rate": 8e-05, "loss": 68.2913, "num_input_tokens_seen": 90864896, "step": 1764 }, { "epoch": 0.17365666691236087, "grad_norm": 36.20130920410156, "learning_rate": 8e-05, "loss": 67.6237, "num_input_tokens_seen": 91011224, "step": 1767 }, { "epoch": 0.17395149996314588, "grad_norm": 41.79694747924805, "learning_rate": 8e-05, "loss": 67.4999, "num_input_tokens_seen": 91156916, "step": 1770 }, { "epoch": 0.17424633301393086, "grad_norm": 37.9937629699707, "learning_rate": 8e-05, "loss": 69.9868, "num_input_tokens_seen": 91276748, "step": 1773 }, { "epoch": 0.17454116606471584, "grad_norm": 37.94075012207031, "learning_rate": 8e-05, "loss": 66.9292, "num_input_tokens_seen": 91426680, "step": 1776 }, { "epoch": 0.17483599911550085, "grad_norm": 41.27400588989258, "learning_rate": 8e-05, "loss": 68.5003, "num_input_tokens_seen": 91580812, "step": 1779 }, { "epoch": 0.17513083216628583, "grad_norm": 41.00275802612305, "learning_rate": 8e-05, "loss": 67.9984, "num_input_tokens_seen": 91741720, "step": 1782 }, { "epoch": 0.17542566521707084, "grad_norm": 701.8508911132812, "learning_rate": 8e-05, "loss": 67.7746, "num_input_tokens_seen": 91900988, "step": 1785 }, { "epoch": 0.17572049826785582, "grad_norm": 36.433135986328125, "learning_rate": 8e-05, "loss": 65.6489, "num_input_tokens_seen": 92069236, "step": 1788 }, { "epoch": 0.17601533131864083, "grad_norm": 32.772438049316406, "learning_rate": 8e-05, "loss": 66.0876, "num_input_tokens_seen": 92212252, "step": 1791 }, { "epoch": 0.17631016436942581, "grad_norm": 34.46598434448242, "learning_rate": 8e-05, "loss": 66.1039, "num_input_tokens_seen": 92363816, "step": 1794 }, { "epoch": 0.1766049974202108, "grad_norm": 36.973140716552734, "learning_rate": 8e-05, "loss": 68.7322, "num_input_tokens_seen": 92514400, "step": 1797 }, { "epoch": 0.1768998304709958, "grad_norm": 33.909847259521484, "learning_rate": 8e-05, "loss": 68.1551, "num_input_tokens_seen": 92658624, "step": 1800 }, { "epoch": 0.1771946635217808, "grad_norm": 33.61487579345703, "learning_rate": 8e-05, "loss": 68.2057, "num_input_tokens_seen": 92813240, "step": 1803 }, { "epoch": 0.1774894965725658, "grad_norm": 48.34735107421875, "learning_rate": 8e-05, "loss": 67.1163, "num_input_tokens_seen": 92961244, "step": 1806 }, { "epoch": 0.17778432962335078, "grad_norm": 37.34203338623047, "learning_rate": 8e-05, "loss": 68.1642, "num_input_tokens_seen": 93126052, "step": 1809 }, { "epoch": 0.17807916267413576, "grad_norm": 36.56318283081055, "learning_rate": 8e-05, "loss": 67.9583, "num_input_tokens_seen": 93291776, "step": 1812 }, { "epoch": 0.17837399572492077, "grad_norm": 32.81214141845703, "learning_rate": 8e-05, "loss": 68.2649, "num_input_tokens_seen": 93437804, "step": 1815 }, { "epoch": 0.17866882877570575, "grad_norm": 39.49382781982422, "learning_rate": 8e-05, "loss": 66.587, "num_input_tokens_seen": 93590280, "step": 1818 }, { "epoch": 0.17896366182649076, "grad_norm": 37.001686096191406, "learning_rate": 8e-05, "loss": 68.0501, "num_input_tokens_seen": 93759692, "step": 1821 }, { "epoch": 0.17925849487727574, "grad_norm": 43.17896270751953, "learning_rate": 8e-05, "loss": 70.3608, "num_input_tokens_seen": 93912212, "step": 1824 }, { "epoch": 0.17955332792806072, "grad_norm": 31.220149993896484, "learning_rate": 8e-05, "loss": 67.3302, "num_input_tokens_seen": 94050348, "step": 1827 }, { "epoch": 0.17984816097884573, "grad_norm": 35.91094207763672, "learning_rate": 8e-05, "loss": 66.1876, "num_input_tokens_seen": 94215336, "step": 1830 }, { "epoch": 0.18014299402963072, "grad_norm": 39.12143325805664, "learning_rate": 8e-05, "loss": 67.2838, "num_input_tokens_seen": 94394660, "step": 1833 }, { "epoch": 0.18043782708041572, "grad_norm": 36.33224105834961, "learning_rate": 8e-05, "loss": 68.2328, "num_input_tokens_seen": 94554828, "step": 1836 }, { "epoch": 0.1807326601312007, "grad_norm": 33.85842514038086, "learning_rate": 8e-05, "loss": 64.8147, "num_input_tokens_seen": 94714520, "step": 1839 }, { "epoch": 0.1810274931819857, "grad_norm": 36.62459945678711, "learning_rate": 8e-05, "loss": 65.6021, "num_input_tokens_seen": 94861320, "step": 1842 }, { "epoch": 0.1813223262327707, "grad_norm": 33.322139739990234, "learning_rate": 8e-05, "loss": 63.1429, "num_input_tokens_seen": 95014704, "step": 1845 }, { "epoch": 0.18161715928355568, "grad_norm": 34.736106872558594, "learning_rate": 8e-05, "loss": 66.7304, "num_input_tokens_seen": 95170048, "step": 1848 }, { "epoch": 0.1819119923343407, "grad_norm": 37.72227096557617, "learning_rate": 8e-05, "loss": 63.1081, "num_input_tokens_seen": 95333344, "step": 1851 }, { "epoch": 0.18220682538512567, "grad_norm": 32.574039459228516, "learning_rate": 8e-05, "loss": 69.0083, "num_input_tokens_seen": 95504940, "step": 1854 }, { "epoch": 0.18250165843591065, "grad_norm": 33.293087005615234, "learning_rate": 8e-05, "loss": 68.1676, "num_input_tokens_seen": 95668432, "step": 1857 }, { "epoch": 0.18279649148669566, "grad_norm": 36.98726272583008, "learning_rate": 8e-05, "loss": 69.9773, "num_input_tokens_seen": 95811784, "step": 1860 }, { "epoch": 0.18309132453748064, "grad_norm": 40.061279296875, "learning_rate": 8e-05, "loss": 65.8954, "num_input_tokens_seen": 95986592, "step": 1863 }, { "epoch": 0.18338615758826565, "grad_norm": 56.59510803222656, "learning_rate": 8e-05, "loss": 67.293, "num_input_tokens_seen": 96161604, "step": 1866 }, { "epoch": 0.18368099063905063, "grad_norm": 37.23174285888672, "learning_rate": 8e-05, "loss": 65.7594, "num_input_tokens_seen": 96310164, "step": 1869 }, { "epoch": 0.18397582368983564, "grad_norm": 40.08700942993164, "learning_rate": 8e-05, "loss": 65.6641, "num_input_tokens_seen": 96432028, "step": 1872 }, { "epoch": 0.18427065674062063, "grad_norm": 36.7364501953125, "learning_rate": 8e-05, "loss": 66.722, "num_input_tokens_seen": 96585280, "step": 1875 }, { "epoch": 0.1845654897914056, "grad_norm": 34.42292404174805, "learning_rate": 8e-05, "loss": 65.9969, "num_input_tokens_seen": 96761724, "step": 1878 }, { "epoch": 0.18486032284219062, "grad_norm": 36.30381393432617, "learning_rate": 8e-05, "loss": 65.4957, "num_input_tokens_seen": 96911572, "step": 1881 }, { "epoch": 0.1851551558929756, "grad_norm": 35.37347412109375, "learning_rate": 8e-05, "loss": 64.8689, "num_input_tokens_seen": 97064136, "step": 1884 }, { "epoch": 0.1854499889437606, "grad_norm": 35.22996139526367, "learning_rate": 8e-05, "loss": 62.8097, "num_input_tokens_seen": 97225564, "step": 1887 }, { "epoch": 0.1857448219945456, "grad_norm": 45.67708969116211, "learning_rate": 8e-05, "loss": 64.6179, "num_input_tokens_seen": 97374648, "step": 1890 }, { "epoch": 0.18603965504533057, "grad_norm": 31.792390823364258, "learning_rate": 8e-05, "loss": 66.9663, "num_input_tokens_seen": 97534584, "step": 1893 }, { "epoch": 0.18633448809611558, "grad_norm": 41.057430267333984, "learning_rate": 8e-05, "loss": 65.637, "num_input_tokens_seen": 97684144, "step": 1896 }, { "epoch": 0.18662932114690056, "grad_norm": 31.76915168762207, "learning_rate": 8e-05, "loss": 66.0923, "num_input_tokens_seen": 97843812, "step": 1899 }, { "epoch": 0.18692415419768557, "grad_norm": 38.473182678222656, "learning_rate": 8e-05, "loss": 62.1717, "num_input_tokens_seen": 98004360, "step": 1902 }, { "epoch": 0.18721898724847055, "grad_norm": 32.907623291015625, "learning_rate": 8e-05, "loss": 64.183, "num_input_tokens_seen": 98157508, "step": 1905 }, { "epoch": 0.18751382029925553, "grad_norm": 88.73799896240234, "learning_rate": 8e-05, "loss": 60.3581, "num_input_tokens_seen": 98300784, "step": 1908 }, { "epoch": 0.18780865335004054, "grad_norm": 34.77318572998047, "learning_rate": 8e-05, "loss": 65.4762, "num_input_tokens_seen": 98447492, "step": 1911 }, { "epoch": 0.18810348640082553, "grad_norm": 41.78057861328125, "learning_rate": 8e-05, "loss": 61.738, "num_input_tokens_seen": 98612760, "step": 1914 }, { "epoch": 0.18839831945161054, "grad_norm": 42.5244026184082, "learning_rate": 8e-05, "loss": 63.2749, "num_input_tokens_seen": 98775560, "step": 1917 }, { "epoch": 0.18869315250239552, "grad_norm": 35.50346374511719, "learning_rate": 8e-05, "loss": 62.9821, "num_input_tokens_seen": 98923580, "step": 1920 }, { "epoch": 0.1889879855531805, "grad_norm": 39.55344772338867, "learning_rate": 8e-05, "loss": 63.3595, "num_input_tokens_seen": 99070644, "step": 1923 }, { "epoch": 0.1892828186039655, "grad_norm": 59.06232833862305, "learning_rate": 8e-05, "loss": 67.8251, "num_input_tokens_seen": 99221436, "step": 1926 }, { "epoch": 0.1895776516547505, "grad_norm": 35.27766418457031, "learning_rate": 8e-05, "loss": 65.3675, "num_input_tokens_seen": 99358224, "step": 1929 }, { "epoch": 0.1898724847055355, "grad_norm": 35.66068649291992, "learning_rate": 8e-05, "loss": 61.189, "num_input_tokens_seen": 99499396, "step": 1932 }, { "epoch": 0.19016731775632048, "grad_norm": 41.50188064575195, "learning_rate": 8e-05, "loss": 66.0529, "num_input_tokens_seen": 99641728, "step": 1935 }, { "epoch": 0.1904621508071055, "grad_norm": 35.998775482177734, "learning_rate": 8e-05, "loss": 61.1702, "num_input_tokens_seen": 99775096, "step": 1938 }, { "epoch": 0.19075698385789047, "grad_norm": 125.73440551757812, "learning_rate": 8e-05, "loss": 62.1858, "num_input_tokens_seen": 99931240, "step": 1941 }, { "epoch": 0.19105181690867545, "grad_norm": 39.456817626953125, "learning_rate": 8e-05, "loss": 65.4074, "num_input_tokens_seen": 100080408, "step": 1944 }, { "epoch": 0.19134664995946046, "grad_norm": 47.685970306396484, "learning_rate": 8e-05, "loss": 60.8324, "num_input_tokens_seen": 100230596, "step": 1947 }, { "epoch": 0.19164148301024544, "grad_norm": 48.207733154296875, "learning_rate": 8e-05, "loss": 64.0891, "num_input_tokens_seen": 100387164, "step": 1950 }, { "epoch": 0.19193631606103045, "grad_norm": 97.84766387939453, "learning_rate": 8e-05, "loss": 67.9796, "num_input_tokens_seen": 100531688, "step": 1953 }, { "epoch": 0.19223114911181544, "grad_norm": 53.36372756958008, "learning_rate": 8e-05, "loss": 62.5783, "num_input_tokens_seen": 100688964, "step": 1956 }, { "epoch": 0.19252598216260042, "grad_norm": 42.483978271484375, "learning_rate": 8e-05, "loss": 63.2629, "num_input_tokens_seen": 100848816, "step": 1959 }, { "epoch": 0.19282081521338543, "grad_norm": 100.14268493652344, "learning_rate": 8e-05, "loss": 64.8723, "num_input_tokens_seen": 100998108, "step": 1962 }, { "epoch": 0.1931156482641704, "grad_norm": 51.66161346435547, "learning_rate": 8e-05, "loss": 65.4101, "num_input_tokens_seen": 101160364, "step": 1965 }, { "epoch": 0.19341048131495542, "grad_norm": 86.82875061035156, "learning_rate": 8e-05, "loss": 64.7327, "num_input_tokens_seen": 101314144, "step": 1968 }, { "epoch": 0.1937053143657404, "grad_norm": 37.22882843017578, "learning_rate": 8e-05, "loss": 64.4545, "num_input_tokens_seen": 101461764, "step": 1971 }, { "epoch": 0.19400014741652538, "grad_norm": 54.93822479248047, "learning_rate": 8e-05, "loss": 67.0109, "num_input_tokens_seen": 101594676, "step": 1974 }, { "epoch": 0.1942949804673104, "grad_norm": 37.24103927612305, "learning_rate": 8e-05, "loss": 63.0324, "num_input_tokens_seen": 101758344, "step": 1977 }, { "epoch": 0.19458981351809537, "grad_norm": 32.88393783569336, "learning_rate": 8e-05, "loss": 63.9217, "num_input_tokens_seen": 101909124, "step": 1980 }, { "epoch": 0.19488464656888038, "grad_norm": 73.78339385986328, "learning_rate": 8e-05, "loss": 60.9234, "num_input_tokens_seen": 102057364, "step": 1983 }, { "epoch": 0.19517947961966536, "grad_norm": 40.5337028503418, "learning_rate": 8e-05, "loss": 64.0662, "num_input_tokens_seen": 102223364, "step": 1986 }, { "epoch": 0.19547431267045035, "grad_norm": 82.47228240966797, "learning_rate": 8e-05, "loss": 65.104, "num_input_tokens_seen": 102379700, "step": 1989 }, { "epoch": 0.19576914572123535, "grad_norm": 48.52934265136719, "learning_rate": 8e-05, "loss": 63.7737, "num_input_tokens_seen": 102538772, "step": 1992 }, { "epoch": 0.19606397877202034, "grad_norm": 105.58226013183594, "learning_rate": 8e-05, "loss": 61.4124, "num_input_tokens_seen": 102692596, "step": 1995 }, { "epoch": 0.19635881182280535, "grad_norm": 132.78990173339844, "learning_rate": 8e-05, "loss": 65.7669, "num_input_tokens_seen": 102836944, "step": 1998 }, { "epoch": 0.19655536718999533, "eval_gen_len": 41.59, "eval_loss": 4.052365779876709, "eval_rouge1": 27.4318, "eval_rouge2": 11.4034, "eval_rougeL": 24.5864, "eval_rougeLsum": 24.8835, "eval_runtime": 146.6637, "eval_samples_per_second": 1.364, "eval_steps_per_second": 0.341, "num_input_tokens_seen": 102933044, "step": 2000 }, { "epoch": 0.19665364487359033, "grad_norm": 50.9435920715332, "learning_rate": 8e-05, "loss": 65.0293, "num_input_tokens_seen": 102991844, "step": 2001 }, { "epoch": 0.1969484779243753, "grad_norm": 47.73625564575195, "learning_rate": 8e-05, "loss": 63.8356, "num_input_tokens_seen": 103117500, "step": 2004 }, { "epoch": 0.19724331097516032, "grad_norm": 34.227874755859375, "learning_rate": 8e-05, "loss": 61.5276, "num_input_tokens_seen": 103281116, "step": 2007 }, { "epoch": 0.1975381440259453, "grad_norm": 74.65849304199219, "learning_rate": 8e-05, "loss": 62.754, "num_input_tokens_seen": 103437116, "step": 2010 }, { "epoch": 0.1978329770767303, "grad_norm": 33.34313201904297, "learning_rate": 8e-05, "loss": 63.4937, "num_input_tokens_seen": 103608180, "step": 2013 }, { "epoch": 0.1981278101275153, "grad_norm": 40.12400817871094, "learning_rate": 8e-05, "loss": 64.5351, "num_input_tokens_seen": 103747448, "step": 2016 }, { "epoch": 0.1984226431783003, "grad_norm": 77.02454376220703, "learning_rate": 8e-05, "loss": 63.297, "num_input_tokens_seen": 103911272, "step": 2019 }, { "epoch": 0.19871747622908528, "grad_norm": 133.56906127929688, "learning_rate": 8e-05, "loss": 60.8824, "num_input_tokens_seen": 104053508, "step": 2022 }, { "epoch": 0.19901230927987026, "grad_norm": 41.40303421020508, "learning_rate": 8e-05, "loss": 60.8633, "num_input_tokens_seen": 104197836, "step": 2025 }, { "epoch": 0.19930714233065527, "grad_norm": 44.42088317871094, "learning_rate": 8e-05, "loss": 63.3138, "num_input_tokens_seen": 104321044, "step": 2028 }, { "epoch": 0.19960197538144026, "grad_norm": 48.47317886352539, "learning_rate": 8e-05, "loss": 64.958, "num_input_tokens_seen": 104487040, "step": 2031 }, { "epoch": 0.19989680843222526, "grad_norm": 40.33289337158203, "learning_rate": 8e-05, "loss": 65.5737, "num_input_tokens_seen": 104656620, "step": 2034 }, { "epoch": 0.20019164148301025, "grad_norm": 37.34440612792969, "learning_rate": 8e-05, "loss": 63.7026, "num_input_tokens_seen": 104807568, "step": 2037 }, { "epoch": 0.20048647453379523, "grad_norm": 40.222557067871094, "learning_rate": 8e-05, "loss": 60.0065, "num_input_tokens_seen": 104976912, "step": 2040 }, { "epoch": 0.20078130758458024, "grad_norm": 115.03577423095703, "learning_rate": 8e-05, "loss": 63.1647, "num_input_tokens_seen": 105134276, "step": 2043 }, { "epoch": 0.20107614063536522, "grad_norm": 36.157569885253906, "learning_rate": 8e-05, "loss": 62.082, "num_input_tokens_seen": 105299968, "step": 2046 }, { "epoch": 0.20137097368615023, "grad_norm": 37.57674789428711, "learning_rate": 8e-05, "loss": 62.3581, "num_input_tokens_seen": 105448460, "step": 2049 }, { "epoch": 0.2016658067369352, "grad_norm": 36.60391616821289, "learning_rate": 8e-05, "loss": 60.2257, "num_input_tokens_seen": 105620848, "step": 2052 }, { "epoch": 0.2019606397877202, "grad_norm": 33.81732177734375, "learning_rate": 8e-05, "loss": 63.3224, "num_input_tokens_seen": 105769236, "step": 2055 }, { "epoch": 0.2022554728385052, "grad_norm": 40.38296890258789, "learning_rate": 8e-05, "loss": 63.6429, "num_input_tokens_seen": 105921540, "step": 2058 }, { "epoch": 0.20255030588929018, "grad_norm": 36.80983352661133, "learning_rate": 8e-05, "loss": 60.2437, "num_input_tokens_seen": 106082260, "step": 2061 }, { "epoch": 0.2028451389400752, "grad_norm": 67.15686798095703, "learning_rate": 8e-05, "loss": 60.4693, "num_input_tokens_seen": 106241504, "step": 2064 }, { "epoch": 0.20313997199086017, "grad_norm": 34.85142135620117, "learning_rate": 8e-05, "loss": 62.0851, "num_input_tokens_seen": 106395036, "step": 2067 }, { "epoch": 0.20343480504164516, "grad_norm": 34.57643508911133, "learning_rate": 8e-05, "loss": 67.5032, "num_input_tokens_seen": 106552940, "step": 2070 }, { "epoch": 0.20372963809243017, "grad_norm": 41.602684020996094, "learning_rate": 8e-05, "loss": 59.1889, "num_input_tokens_seen": 106705432, "step": 2073 }, { "epoch": 0.20402447114321515, "grad_norm": 35.24937438964844, "learning_rate": 8e-05, "loss": 64.8413, "num_input_tokens_seen": 106862652, "step": 2076 }, { "epoch": 0.20431930419400016, "grad_norm": 33.99971389770508, "learning_rate": 8e-05, "loss": 65.4623, "num_input_tokens_seen": 107025212, "step": 2079 }, { "epoch": 0.20461413724478514, "grad_norm": 37.50223922729492, "learning_rate": 8e-05, "loss": 61.1447, "num_input_tokens_seen": 107200104, "step": 2082 }, { "epoch": 0.20490897029557012, "grad_norm": 39.0561637878418, "learning_rate": 8e-05, "loss": 64.2402, "num_input_tokens_seen": 107351344, "step": 2085 }, { "epoch": 0.20520380334635513, "grad_norm": 36.96356201171875, "learning_rate": 8e-05, "loss": 64.7978, "num_input_tokens_seen": 107502508, "step": 2088 }, { "epoch": 0.2054986363971401, "grad_norm": 34.923370361328125, "learning_rate": 8e-05, "loss": 62.0066, "num_input_tokens_seen": 107659476, "step": 2091 }, { "epoch": 0.20579346944792512, "grad_norm": 33.66584014892578, "learning_rate": 8e-05, "loss": 62.7879, "num_input_tokens_seen": 107813464, "step": 2094 }, { "epoch": 0.2060883024987101, "grad_norm": 43.05538558959961, "learning_rate": 8e-05, "loss": 65.1983, "num_input_tokens_seen": 107945632, "step": 2097 }, { "epoch": 0.2063831355494951, "grad_norm": 46.15825653076172, "learning_rate": 8e-05, "loss": 62.3188, "num_input_tokens_seen": 108105348, "step": 2100 }, { "epoch": 0.2066779686002801, "grad_norm": 38.95362091064453, "learning_rate": 8e-05, "loss": 63.0942, "num_input_tokens_seen": 108263528, "step": 2103 }, { "epoch": 0.20697280165106507, "grad_norm": 30.686132431030273, "learning_rate": 8e-05, "loss": 60.3785, "num_input_tokens_seen": 108411408, "step": 2106 }, { "epoch": 0.20726763470185008, "grad_norm": 47.61467742919922, "learning_rate": 8e-05, "loss": 61.1736, "num_input_tokens_seen": 108577660, "step": 2109 }, { "epoch": 0.20756246775263507, "grad_norm": 35.973411560058594, "learning_rate": 8e-05, "loss": 65.4033, "num_input_tokens_seen": 108731036, "step": 2112 }, { "epoch": 0.20785730080342008, "grad_norm": 36.56504440307617, "learning_rate": 8e-05, "loss": 59.0829, "num_input_tokens_seen": 108874528, "step": 2115 }, { "epoch": 0.20815213385420506, "grad_norm": 31.043546676635742, "learning_rate": 8e-05, "loss": 61.4159, "num_input_tokens_seen": 109028200, "step": 2118 }, { "epoch": 0.20844696690499004, "grad_norm": 28.701793670654297, "learning_rate": 8e-05, "loss": 59.8725, "num_input_tokens_seen": 109193424, "step": 2121 }, { "epoch": 0.20874179995577505, "grad_norm": 36.71512985229492, "learning_rate": 8e-05, "loss": 61.5933, "num_input_tokens_seen": 109348680, "step": 2124 }, { "epoch": 0.20903663300656003, "grad_norm": 33.578277587890625, "learning_rate": 8e-05, "loss": 62.275, "num_input_tokens_seen": 109490436, "step": 2127 }, { "epoch": 0.20933146605734504, "grad_norm": 35.8470573425293, "learning_rate": 8e-05, "loss": 60.6259, "num_input_tokens_seen": 109651880, "step": 2130 }, { "epoch": 0.20962629910813002, "grad_norm": 30.997970581054688, "learning_rate": 8e-05, "loss": 58.4895, "num_input_tokens_seen": 109786272, "step": 2133 }, { "epoch": 0.209921132158915, "grad_norm": 85.87825012207031, "learning_rate": 8e-05, "loss": 62.2673, "num_input_tokens_seen": 109946588, "step": 2136 }, { "epoch": 0.2102159652097, "grad_norm": 46.45925521850586, "learning_rate": 8e-05, "loss": 61.9533, "num_input_tokens_seen": 110106728, "step": 2139 }, { "epoch": 0.210510798260485, "grad_norm": 35.03594970703125, "learning_rate": 8e-05, "loss": 61.7035, "num_input_tokens_seen": 110278860, "step": 2142 }, { "epoch": 0.21080563131127, "grad_norm": 35.61140441894531, "learning_rate": 8e-05, "loss": 58.79, "num_input_tokens_seen": 110423348, "step": 2145 }, { "epoch": 0.21110046436205498, "grad_norm": 48.249786376953125, "learning_rate": 8e-05, "loss": 60.4113, "num_input_tokens_seen": 110576468, "step": 2148 }, { "epoch": 0.21139529741283997, "grad_norm": 40.02042007446289, "learning_rate": 8e-05, "loss": 60.7596, "num_input_tokens_seen": 110727952, "step": 2151 }, { "epoch": 0.21169013046362498, "grad_norm": 36.13483810424805, "learning_rate": 8e-05, "loss": 62.302, "num_input_tokens_seen": 110873488, "step": 2154 }, { "epoch": 0.21198496351440996, "grad_norm": 37.476776123046875, "learning_rate": 8e-05, "loss": 61.4339, "num_input_tokens_seen": 111019072, "step": 2157 }, { "epoch": 0.21227979656519497, "grad_norm": 31.12283706665039, "learning_rate": 8e-05, "loss": 57.8079, "num_input_tokens_seen": 111175896, "step": 2160 }, { "epoch": 0.21257462961597995, "grad_norm": 33.844364166259766, "learning_rate": 8e-05, "loss": 62.2772, "num_input_tokens_seen": 111309704, "step": 2163 }, { "epoch": 0.21286946266676496, "grad_norm": 39.86318588256836, "learning_rate": 8e-05, "loss": 56.3056, "num_input_tokens_seen": 111484440, "step": 2166 }, { "epoch": 0.21316429571754994, "grad_norm": 36.426536560058594, "learning_rate": 8e-05, "loss": 64.5189, "num_input_tokens_seen": 111623400, "step": 2169 }, { "epoch": 0.21345912876833492, "grad_norm": 32.7545051574707, "learning_rate": 8e-05, "loss": 60.6215, "num_input_tokens_seen": 111775816, "step": 2172 }, { "epoch": 0.21375396181911993, "grad_norm": 80.18124389648438, "learning_rate": 8e-05, "loss": 61.9464, "num_input_tokens_seen": 111924912, "step": 2175 }, { "epoch": 0.2140487948699049, "grad_norm": 40.502376556396484, "learning_rate": 8e-05, "loss": 63.1164, "num_input_tokens_seen": 112065796, "step": 2178 }, { "epoch": 0.21434362792068992, "grad_norm": 61.75509262084961, "learning_rate": 8e-05, "loss": 57.283, "num_input_tokens_seen": 112235840, "step": 2181 }, { "epoch": 0.2146384609714749, "grad_norm": 29.798152923583984, "learning_rate": 8e-05, "loss": 58.6705, "num_input_tokens_seen": 112382700, "step": 2184 }, { "epoch": 0.21493329402225989, "grad_norm": 35.83757781982422, "learning_rate": 8e-05, "loss": 56.2894, "num_input_tokens_seen": 112533292, "step": 2187 }, { "epoch": 0.2152281270730449, "grad_norm": 42.107147216796875, "learning_rate": 8e-05, "loss": 63.6896, "num_input_tokens_seen": 112687044, "step": 2190 }, { "epoch": 0.21552296012382988, "grad_norm": 37.97938537597656, "learning_rate": 8e-05, "loss": 61.2909, "num_input_tokens_seen": 112824956, "step": 2193 }, { "epoch": 0.21581779317461489, "grad_norm": 87.84455871582031, "learning_rate": 8e-05, "loss": 61.1541, "num_input_tokens_seen": 112998488, "step": 2196 }, { "epoch": 0.21611262622539987, "grad_norm": 49.55474853515625, "learning_rate": 8e-05, "loss": 60.0391, "num_input_tokens_seen": 113155280, "step": 2199 }, { "epoch": 0.21640745927618485, "grad_norm": 90.66659545898438, "learning_rate": 8e-05, "loss": 60.7969, "num_input_tokens_seen": 113330904, "step": 2202 }, { "epoch": 0.21670229232696986, "grad_norm": 38.3773078918457, "learning_rate": 8e-05, "loss": 62.971, "num_input_tokens_seen": 113484604, "step": 2205 }, { "epoch": 0.21699712537775484, "grad_norm": 36.9621696472168, "learning_rate": 8e-05, "loss": 61.0875, "num_input_tokens_seen": 113649252, "step": 2208 }, { "epoch": 0.21729195842853985, "grad_norm": 44.32843017578125, "learning_rate": 8e-05, "loss": 56.9424, "num_input_tokens_seen": 113793828, "step": 2211 }, { "epoch": 0.21758679147932483, "grad_norm": 34.17706298828125, "learning_rate": 8e-05, "loss": 56.7555, "num_input_tokens_seen": 113955836, "step": 2214 }, { "epoch": 0.2178816245301098, "grad_norm": 33.80354309082031, "learning_rate": 8e-05, "loss": 59.7268, "num_input_tokens_seen": 114119876, "step": 2217 }, { "epoch": 0.21817645758089482, "grad_norm": 52.79160690307617, "learning_rate": 8e-05, "loss": 59.4334, "num_input_tokens_seen": 114271076, "step": 2220 }, { "epoch": 0.2184712906316798, "grad_norm": 31.56442642211914, "learning_rate": 8e-05, "loss": 61.6311, "num_input_tokens_seen": 114431644, "step": 2223 }, { "epoch": 0.2187661236824648, "grad_norm": 37.805503845214844, "learning_rate": 8e-05, "loss": 59.168, "num_input_tokens_seen": 114598224, "step": 2226 }, { "epoch": 0.2190609567332498, "grad_norm": 36.063968658447266, "learning_rate": 8e-05, "loss": 58.9546, "num_input_tokens_seen": 114751272, "step": 2229 }, { "epoch": 0.21935578978403478, "grad_norm": 278.02459716796875, "learning_rate": 8e-05, "loss": 55.4246, "num_input_tokens_seen": 114878676, "step": 2232 }, { "epoch": 0.2196506228348198, "grad_norm": 34.540164947509766, "learning_rate": 8e-05, "loss": 61.1208, "num_input_tokens_seen": 115022248, "step": 2235 }, { "epoch": 0.21994545588560477, "grad_norm": 43.42741775512695, "learning_rate": 8e-05, "loss": 59.6594, "num_input_tokens_seen": 115202884, "step": 2238 }, { "epoch": 0.22024028893638978, "grad_norm": 53.860260009765625, "learning_rate": 8e-05, "loss": 59.714, "num_input_tokens_seen": 115338936, "step": 2241 }, { "epoch": 0.22053512198717476, "grad_norm": 37.03718185424805, "learning_rate": 8e-05, "loss": 61.3456, "num_input_tokens_seen": 115484732, "step": 2244 }, { "epoch": 0.22082995503795977, "grad_norm": 41.423885345458984, "learning_rate": 8e-05, "loss": 62.5182, "num_input_tokens_seen": 115633372, "step": 2247 }, { "epoch": 0.22112478808874475, "grad_norm": 38.18406295776367, "learning_rate": 8e-05, "loss": 58.236, "num_input_tokens_seen": 115796768, "step": 2250 }, { "epoch": 0.22141962113952973, "grad_norm": 35.71892166137695, "learning_rate": 8e-05, "loss": 56.6222, "num_input_tokens_seen": 115939736, "step": 2253 }, { "epoch": 0.22171445419031474, "grad_norm": 37.585693359375, "learning_rate": 8e-05, "loss": 58.3685, "num_input_tokens_seen": 116096544, "step": 2256 }, { "epoch": 0.22200928724109972, "grad_norm": 31.749067306518555, "learning_rate": 8e-05, "loss": 59.9537, "num_input_tokens_seen": 116267532, "step": 2259 }, { "epoch": 0.22230412029188473, "grad_norm": 32.64338302612305, "learning_rate": 8e-05, "loss": 60.2476, "num_input_tokens_seen": 116429168, "step": 2262 }, { "epoch": 0.22259895334266971, "grad_norm": 36.87150573730469, "learning_rate": 8e-05, "loss": 59.6034, "num_input_tokens_seen": 116583948, "step": 2265 }, { "epoch": 0.2228937863934547, "grad_norm": 35.45121383666992, "learning_rate": 8e-05, "loss": 61.7586, "num_input_tokens_seen": 116727856, "step": 2268 }, { "epoch": 0.2231886194442397, "grad_norm": 57.21012878417969, "learning_rate": 8e-05, "loss": 57.9647, "num_input_tokens_seen": 116878736, "step": 2271 }, { "epoch": 0.2234834524950247, "grad_norm": 37.94432830810547, "learning_rate": 8e-05, "loss": 60.7469, "num_input_tokens_seen": 117057792, "step": 2274 }, { "epoch": 0.2237782855458097, "grad_norm": 30.563480377197266, "learning_rate": 8e-05, "loss": 56.5987, "num_input_tokens_seen": 117225848, "step": 2277 }, { "epoch": 0.22407311859659468, "grad_norm": 31.604551315307617, "learning_rate": 8e-05, "loss": 60.5782, "num_input_tokens_seen": 117357464, "step": 2280 }, { "epoch": 0.22436795164737966, "grad_norm": 36.0341682434082, "learning_rate": 8e-05, "loss": 59.2931, "num_input_tokens_seen": 117506908, "step": 2283 }, { "epoch": 0.22466278469816467, "grad_norm": 35.426395416259766, "learning_rate": 8e-05, "loss": 61.6121, "num_input_tokens_seen": 117663704, "step": 2286 }, { "epoch": 0.22495761774894965, "grad_norm": 43.54740524291992, "learning_rate": 8e-05, "loss": 58.8303, "num_input_tokens_seen": 117807716, "step": 2289 }, { "epoch": 0.22525245079973466, "grad_norm": 35.09873962402344, "learning_rate": 8e-05, "loss": 57.9195, "num_input_tokens_seen": 117980380, "step": 2292 }, { "epoch": 0.22554728385051964, "grad_norm": 37.0145263671875, "learning_rate": 8e-05, "loss": 61.547, "num_input_tokens_seen": 118156204, "step": 2295 }, { "epoch": 0.22584211690130462, "grad_norm": 37.9959831237793, "learning_rate": 8e-05, "loss": 57.3029, "num_input_tokens_seen": 118287944, "step": 2298 }, { "epoch": 0.22613694995208963, "grad_norm": 42.40966033935547, "learning_rate": 8e-05, "loss": 61.7216, "num_input_tokens_seen": 118442436, "step": 2301 }, { "epoch": 0.22643178300287461, "grad_norm": 34.89582061767578, "learning_rate": 8e-05, "loss": 62.3091, "num_input_tokens_seen": 118595600, "step": 2304 }, { "epoch": 0.22672661605365962, "grad_norm": 32.897647857666016, "learning_rate": 8e-05, "loss": 56.707, "num_input_tokens_seen": 118755620, "step": 2307 }, { "epoch": 0.2270214491044446, "grad_norm": 50.87218475341797, "learning_rate": 8e-05, "loss": 58.4343, "num_input_tokens_seen": 118917800, "step": 2310 }, { "epoch": 0.2273162821552296, "grad_norm": 36.530635833740234, "learning_rate": 8e-05, "loss": 55.3719, "num_input_tokens_seen": 119066396, "step": 2313 }, { "epoch": 0.2276111152060146, "grad_norm": 38.81621551513672, "learning_rate": 8e-05, "loss": 62.1988, "num_input_tokens_seen": 119226812, "step": 2316 }, { "epoch": 0.22790594825679958, "grad_norm": 33.9807243347168, "learning_rate": 8e-05, "loss": 55.9138, "num_input_tokens_seen": 119388700, "step": 2319 }, { "epoch": 0.2282007813075846, "grad_norm": 45.930171966552734, "learning_rate": 8e-05, "loss": 58.6786, "num_input_tokens_seen": 119529208, "step": 2322 }, { "epoch": 0.22849561435836957, "grad_norm": 38.361358642578125, "learning_rate": 8e-05, "loss": 60.9187, "num_input_tokens_seen": 119673064, "step": 2325 }, { "epoch": 0.22879044740915458, "grad_norm": 33.112945556640625, "learning_rate": 8e-05, "loss": 59.7167, "num_input_tokens_seen": 119812848, "step": 2328 }, { "epoch": 0.22908528045993956, "grad_norm": 37.15597915649414, "learning_rate": 8e-05, "loss": 62.2376, "num_input_tokens_seen": 119964060, "step": 2331 }, { "epoch": 0.22938011351072454, "grad_norm": 38.77449035644531, "learning_rate": 8e-05, "loss": 61.0384, "num_input_tokens_seen": 120133004, "step": 2334 }, { "epoch": 0.22967494656150955, "grad_norm": 35.40461730957031, "learning_rate": 8e-05, "loss": 57.5117, "num_input_tokens_seen": 120282564, "step": 2337 }, { "epoch": 0.22996977961229453, "grad_norm": 39.79495620727539, "learning_rate": 8e-05, "loss": 61.9955, "num_input_tokens_seen": 120457480, "step": 2340 }, { "epoch": 0.23026461266307954, "grad_norm": 33.81966018676758, "learning_rate": 8e-05, "loss": 57.1984, "num_input_tokens_seen": 120604636, "step": 2343 }, { "epoch": 0.23055944571386452, "grad_norm": 33.839759826660156, "learning_rate": 8e-05, "loss": 59.4311, "num_input_tokens_seen": 120754696, "step": 2346 }, { "epoch": 0.2308542787646495, "grad_norm": 34.786277770996094, "learning_rate": 8e-05, "loss": 61.8979, "num_input_tokens_seen": 120895720, "step": 2349 }, { "epoch": 0.23114911181543452, "grad_norm": 36.697044372558594, "learning_rate": 8e-05, "loss": 56.9062, "num_input_tokens_seen": 121047480, "step": 2352 }, { "epoch": 0.2314439448662195, "grad_norm": 37.01709747314453, "learning_rate": 8e-05, "loss": 61.0646, "num_input_tokens_seen": 121209004, "step": 2355 }, { "epoch": 0.2317387779170045, "grad_norm": 32.902713775634766, "learning_rate": 8e-05, "loss": 57.6042, "num_input_tokens_seen": 121344680, "step": 2358 }, { "epoch": 0.2320336109677895, "grad_norm": 48.72361755371094, "learning_rate": 8e-05, "loss": 59.6539, "num_input_tokens_seen": 121501404, "step": 2361 }, { "epoch": 0.23232844401857447, "grad_norm": 33.87553405761719, "learning_rate": 8e-05, "loss": 57.2243, "num_input_tokens_seen": 121660472, "step": 2364 }, { "epoch": 0.23262327706935948, "grad_norm": 35.26018142700195, "learning_rate": 8e-05, "loss": 57.8523, "num_input_tokens_seen": 121818284, "step": 2367 }, { "epoch": 0.23291811012014446, "grad_norm": 36.639827728271484, "learning_rate": 8e-05, "loss": 59.1539, "num_input_tokens_seen": 121966424, "step": 2370 }, { "epoch": 0.23321294317092947, "grad_norm": 39.612640380859375, "learning_rate": 8e-05, "loss": 60.4686, "num_input_tokens_seen": 122139672, "step": 2373 }, { "epoch": 0.23350777622171445, "grad_norm": 37.069705963134766, "learning_rate": 8e-05, "loss": 58.0885, "num_input_tokens_seen": 122294392, "step": 2376 }, { "epoch": 0.23380260927249943, "grad_norm": 35.78841781616211, "learning_rate": 8e-05, "loss": 56.5211, "num_input_tokens_seen": 122444948, "step": 2379 }, { "epoch": 0.23409744232328444, "grad_norm": 116.17157745361328, "learning_rate": 8e-05, "loss": 59.1547, "num_input_tokens_seen": 122598056, "step": 2382 }, { "epoch": 0.23439227537406943, "grad_norm": 35.66117477416992, "learning_rate": 8e-05, "loss": 57.1662, "num_input_tokens_seen": 122744792, "step": 2385 }, { "epoch": 0.23468710842485443, "grad_norm": 40.41534423828125, "learning_rate": 8e-05, "loss": 59.0607, "num_input_tokens_seen": 122896372, "step": 2388 }, { "epoch": 0.23498194147563942, "grad_norm": 38.96350860595703, "learning_rate": 8e-05, "loss": 61.4238, "num_input_tokens_seen": 123069076, "step": 2391 }, { "epoch": 0.23527677452642443, "grad_norm": 31.967472076416016, "learning_rate": 8e-05, "loss": 57.8557, "num_input_tokens_seen": 123226456, "step": 2394 }, { "epoch": 0.2355716075772094, "grad_norm": 31.418663024902344, "learning_rate": 8e-05, "loss": 56.2966, "num_input_tokens_seen": 123376080, "step": 2397 }, { "epoch": 0.2358664406279944, "grad_norm": 32.175514221191406, "learning_rate": 8e-05, "loss": 57.1659, "num_input_tokens_seen": 123541412, "step": 2400 }, { "epoch": 0.2361612736787794, "grad_norm": 34.742496490478516, "learning_rate": 8e-05, "loss": 59.9754, "num_input_tokens_seen": 123706440, "step": 2403 }, { "epoch": 0.23645610672956438, "grad_norm": 29.358062744140625, "learning_rate": 8e-05, "loss": 54.3595, "num_input_tokens_seen": 123858644, "step": 2406 }, { "epoch": 0.2367509397803494, "grad_norm": 33.7983283996582, "learning_rate": 8e-05, "loss": 55.3638, "num_input_tokens_seen": 124025808, "step": 2409 }, { "epoch": 0.23704577283113437, "grad_norm": 39.6863899230957, "learning_rate": 8e-05, "loss": 56.7066, "num_input_tokens_seen": 124198856, "step": 2412 }, { "epoch": 0.23734060588191935, "grad_norm": 35.42102813720703, "learning_rate": 8e-05, "loss": 58.0268, "num_input_tokens_seen": 124340672, "step": 2415 }, { "epoch": 0.23763543893270436, "grad_norm": 33.165950775146484, "learning_rate": 8e-05, "loss": 59.3338, "num_input_tokens_seen": 124474688, "step": 2418 }, { "epoch": 0.23793027198348934, "grad_norm": 35.83243179321289, "learning_rate": 8e-05, "loss": 56.8695, "num_input_tokens_seen": 124628504, "step": 2421 }, { "epoch": 0.23822510503427435, "grad_norm": 30.844099044799805, "learning_rate": 8e-05, "loss": 53.5109, "num_input_tokens_seen": 124770256, "step": 2424 }, { "epoch": 0.23851993808505934, "grad_norm": 47.932647705078125, "learning_rate": 8e-05, "loss": 55.0004, "num_input_tokens_seen": 124920860, "step": 2427 }, { "epoch": 0.23881477113584432, "grad_norm": 37.77656173706055, "learning_rate": 8e-05, "loss": 56.5262, "num_input_tokens_seen": 125088196, "step": 2430 }, { "epoch": 0.23910960418662933, "grad_norm": 36.80366897583008, "learning_rate": 8e-05, "loss": 56.8007, "num_input_tokens_seen": 125243848, "step": 2433 }, { "epoch": 0.2394044372374143, "grad_norm": 34.99855041503906, "learning_rate": 8e-05, "loss": 58.5576, "num_input_tokens_seen": 125404240, "step": 2436 }, { "epoch": 0.23969927028819932, "grad_norm": 338.2091979980469, "learning_rate": 8e-05, "loss": 57.9598, "num_input_tokens_seen": 125566664, "step": 2439 }, { "epoch": 0.2399941033389843, "grad_norm": 41.78690719604492, "learning_rate": 8e-05, "loss": 56.5716, "num_input_tokens_seen": 125708800, "step": 2442 }, { "epoch": 0.24028893638976928, "grad_norm": 33.10038757324219, "learning_rate": 8e-05, "loss": 55.7882, "num_input_tokens_seen": 125865424, "step": 2445 }, { "epoch": 0.2405837694405543, "grad_norm": 32.90071487426758, "learning_rate": 8e-05, "loss": 54.1681, "num_input_tokens_seen": 125992848, "step": 2448 }, { "epoch": 0.24087860249133927, "grad_norm": 38.925724029541016, "learning_rate": 8e-05, "loss": 56.0581, "num_input_tokens_seen": 126151048, "step": 2451 }, { "epoch": 0.24117343554212428, "grad_norm": 36.73653793334961, "learning_rate": 8e-05, "loss": 57.7794, "num_input_tokens_seen": 126319592, "step": 2454 }, { "epoch": 0.24146826859290926, "grad_norm": 41.0250244140625, "learning_rate": 8e-05, "loss": 54.5532, "num_input_tokens_seen": 126481832, "step": 2457 }, { "epoch": 0.24176310164369424, "grad_norm": 30.037355422973633, "learning_rate": 8e-05, "loss": 58.5533, "num_input_tokens_seen": 126632916, "step": 2460 }, { "epoch": 0.24205793469447925, "grad_norm": 44.85157775878906, "learning_rate": 8e-05, "loss": 58.7409, "num_input_tokens_seen": 126777564, "step": 2463 }, { "epoch": 0.24235276774526424, "grad_norm": 29.838281631469727, "learning_rate": 8e-05, "loss": 55.2551, "num_input_tokens_seen": 126935656, "step": 2466 }, { "epoch": 0.24264760079604925, "grad_norm": 35.31085205078125, "learning_rate": 8e-05, "loss": 55.8649, "num_input_tokens_seen": 127082204, "step": 2469 }, { "epoch": 0.24294243384683423, "grad_norm": 36.91703796386719, "learning_rate": 8e-05, "loss": 57.6778, "num_input_tokens_seen": 127241464, "step": 2472 }, { "epoch": 0.24323726689761924, "grad_norm": 31.66358757019043, "learning_rate": 8e-05, "loss": 56.7686, "num_input_tokens_seen": 127397924, "step": 2475 }, { "epoch": 0.24353209994840422, "grad_norm": 35.33116912841797, "learning_rate": 8e-05, "loss": 56.6367, "num_input_tokens_seen": 127559408, "step": 2478 }, { "epoch": 0.2438269329991892, "grad_norm": 34.71982192993164, "learning_rate": 8e-05, "loss": 60.8994, "num_input_tokens_seen": 127728204, "step": 2481 }, { "epoch": 0.2441217660499742, "grad_norm": 35.25178146362305, "learning_rate": 8e-05, "loss": 58.6995, "num_input_tokens_seen": 127885936, "step": 2484 }, { "epoch": 0.2444165991007592, "grad_norm": 39.45205307006836, "learning_rate": 8e-05, "loss": 55.208, "num_input_tokens_seen": 128030752, "step": 2487 }, { "epoch": 0.2447114321515442, "grad_norm": 36.48596954345703, "learning_rate": 8e-05, "loss": 55.4749, "num_input_tokens_seen": 128191944, "step": 2490 }, { "epoch": 0.24500626520232918, "grad_norm": 34.035316467285156, "learning_rate": 8e-05, "loss": 53.0131, "num_input_tokens_seen": 128345520, "step": 2493 }, { "epoch": 0.24530109825311416, "grad_norm": 30.92428207397461, "learning_rate": 8e-05, "loss": 54.5648, "num_input_tokens_seen": 128490620, "step": 2496 }, { "epoch": 0.24559593130389917, "grad_norm": 35.92305374145508, "learning_rate": 8e-05, "loss": 57.5855, "num_input_tokens_seen": 128639248, "step": 2499 }, { "epoch": 0.24589076435468415, "grad_norm": 35.16416931152344, "learning_rate": 8e-05, "loss": 55.8813, "num_input_tokens_seen": 128806916, "step": 2502 }, { "epoch": 0.24618559740546916, "grad_norm": 35.76316452026367, "learning_rate": 8e-05, "loss": 56.8321, "num_input_tokens_seen": 128949452, "step": 2505 }, { "epoch": 0.24648043045625415, "grad_norm": 32.33674240112305, "learning_rate": 8e-05, "loss": 53.35, "num_input_tokens_seen": 129100852, "step": 2508 }, { "epoch": 0.24677526350703913, "grad_norm": 33.43571472167969, "learning_rate": 8e-05, "loss": 57.0671, "num_input_tokens_seen": 129252204, "step": 2511 }, { "epoch": 0.24707009655782414, "grad_norm": 140.02317810058594, "learning_rate": 8e-05, "loss": 55.7291, "num_input_tokens_seen": 129397452, "step": 2514 }, { "epoch": 0.24736492960860912, "grad_norm": 37.19168472290039, "learning_rate": 8e-05, "loss": 53.1093, "num_input_tokens_seen": 129535376, "step": 2517 }, { "epoch": 0.24765976265939413, "grad_norm": 48.09902572631836, "learning_rate": 8e-05, "loss": 56.053, "num_input_tokens_seen": 129676028, "step": 2520 }, { "epoch": 0.2479545957101791, "grad_norm": 37.414947509765625, "learning_rate": 8e-05, "loss": 54.0977, "num_input_tokens_seen": 129844620, "step": 2523 }, { "epoch": 0.2482494287609641, "grad_norm": 77.83485412597656, "learning_rate": 8e-05, "loss": 53.7719, "num_input_tokens_seen": 129978612, "step": 2526 }, { "epoch": 0.2485442618117491, "grad_norm": 33.74465560913086, "learning_rate": 8e-05, "loss": 52.7374, "num_input_tokens_seen": 130135364, "step": 2529 }, { "epoch": 0.24883909486253408, "grad_norm": 35.31605911254883, "learning_rate": 8e-05, "loss": 55.9591, "num_input_tokens_seen": 130296404, "step": 2532 }, { "epoch": 0.2491339279133191, "grad_norm": 28.435291290283203, "learning_rate": 8e-05, "loss": 50.9603, "num_input_tokens_seen": 130467904, "step": 2535 }, { "epoch": 0.24942876096410407, "grad_norm": 40.538909912109375, "learning_rate": 8e-05, "loss": 55.7693, "num_input_tokens_seen": 130627960, "step": 2538 }, { "epoch": 0.24972359401488906, "grad_norm": 34.60548782348633, "learning_rate": 8e-05, "loss": 53.3829, "num_input_tokens_seen": 130777832, "step": 2541 }, { "epoch": 0.25001842706567406, "grad_norm": 36.60821533203125, "learning_rate": 8e-05, "loss": 56.8237, "num_input_tokens_seen": 130921848, "step": 2544 }, { "epoch": 0.25031326011645905, "grad_norm": 31.987855911254883, "learning_rate": 8e-05, "loss": 53.2189, "num_input_tokens_seen": 131084284, "step": 2547 }, { "epoch": 0.25060809316724403, "grad_norm": 40.75633239746094, "learning_rate": 8e-05, "loss": 57.2221, "num_input_tokens_seen": 131236140, "step": 2550 }, { "epoch": 0.25090292621802907, "grad_norm": 34.93666076660156, "learning_rate": 8e-05, "loss": 58.1397, "num_input_tokens_seen": 131394640, "step": 2553 }, { "epoch": 0.25119775926881405, "grad_norm": 30.523983001708984, "learning_rate": 8e-05, "loss": 56.3125, "num_input_tokens_seen": 131544288, "step": 2556 }, { "epoch": 0.25149259231959903, "grad_norm": 36.5615348815918, "learning_rate": 8e-05, "loss": 58.1565, "num_input_tokens_seen": 131694888, "step": 2559 }, { "epoch": 0.251787425370384, "grad_norm": 41.425392150878906, "learning_rate": 8e-05, "loss": 53.5081, "num_input_tokens_seen": 131859732, "step": 2562 }, { "epoch": 0.252082258421169, "grad_norm": 39.53923034667969, "learning_rate": 8e-05, "loss": 52.4747, "num_input_tokens_seen": 131998368, "step": 2565 }, { "epoch": 0.25237709147195403, "grad_norm": 36.601383209228516, "learning_rate": 8e-05, "loss": 56.8967, "num_input_tokens_seen": 132155796, "step": 2568 }, { "epoch": 0.252671924522739, "grad_norm": 31.163291931152344, "learning_rate": 8e-05, "loss": 54.3867, "num_input_tokens_seen": 132304292, "step": 2571 }, { "epoch": 0.252966757573524, "grad_norm": 49.13053512573242, "learning_rate": 8e-05, "loss": 57.8686, "num_input_tokens_seen": 132467932, "step": 2574 }, { "epoch": 0.253261590624309, "grad_norm": 38.996482849121094, "learning_rate": 8e-05, "loss": 52.0673, "num_input_tokens_seen": 132617624, "step": 2577 }, { "epoch": 0.25355642367509396, "grad_norm": 43.19272994995117, "learning_rate": 8e-05, "loss": 55.1637, "num_input_tokens_seen": 132774048, "step": 2580 }, { "epoch": 0.253851256725879, "grad_norm": 34.15322494506836, "learning_rate": 8e-05, "loss": 58.8447, "num_input_tokens_seen": 132933688, "step": 2583 }, { "epoch": 0.254146089776664, "grad_norm": 31.180694580078125, "learning_rate": 8e-05, "loss": 55.6801, "num_input_tokens_seen": 133091016, "step": 2586 }, { "epoch": 0.25444092282744896, "grad_norm": 35.01714324951172, "learning_rate": 8e-05, "loss": 54.9311, "num_input_tokens_seen": 133224776, "step": 2589 }, { "epoch": 0.25473575587823394, "grad_norm": 39.41815185546875, "learning_rate": 8e-05, "loss": 55.9144, "num_input_tokens_seen": 133376372, "step": 2592 }, { "epoch": 0.2550305889290189, "grad_norm": 36.53614044189453, "learning_rate": 8e-05, "loss": 57.2327, "num_input_tokens_seen": 133537224, "step": 2595 }, { "epoch": 0.25532542197980396, "grad_norm": 34.72364807128906, "learning_rate": 8e-05, "loss": 56.9296, "num_input_tokens_seen": 133704596, "step": 2598 }, { "epoch": 0.25562025503058894, "grad_norm": 34.47258758544922, "learning_rate": 8e-05, "loss": 52.5994, "num_input_tokens_seen": 133853996, "step": 2601 }, { "epoch": 0.2559150880813739, "grad_norm": 38.56904220581055, "learning_rate": 8e-05, "loss": 53.1313, "num_input_tokens_seen": 134009856, "step": 2604 }, { "epoch": 0.2562099211321589, "grad_norm": 32.298828125, "learning_rate": 8e-05, "loss": 55.5707, "num_input_tokens_seen": 134152368, "step": 2607 }, { "epoch": 0.2565047541829439, "grad_norm": 38.61540985107422, "learning_rate": 8e-05, "loss": 57.645, "num_input_tokens_seen": 134319592, "step": 2610 }, { "epoch": 0.2567995872337289, "grad_norm": 49.49209213256836, "learning_rate": 8e-05, "loss": 52.0047, "num_input_tokens_seen": 134464892, "step": 2613 }, { "epoch": 0.2570944202845139, "grad_norm": 51.2288932800293, "learning_rate": 8e-05, "loss": 54.2843, "num_input_tokens_seen": 134616000, "step": 2616 }, { "epoch": 0.2573892533352989, "grad_norm": 31.375375747680664, "learning_rate": 8e-05, "loss": 53.1026, "num_input_tokens_seen": 134774184, "step": 2619 }, { "epoch": 0.25768408638608387, "grad_norm": 34.40566635131836, "learning_rate": 8e-05, "loss": 56.4985, "num_input_tokens_seen": 134940048, "step": 2622 }, { "epoch": 0.25797891943686885, "grad_norm": 33.19921112060547, "learning_rate": 8e-05, "loss": 53.7378, "num_input_tokens_seen": 135099000, "step": 2625 }, { "epoch": 0.2582737524876539, "grad_norm": 35.480255126953125, "learning_rate": 8e-05, "loss": 51.1283, "num_input_tokens_seen": 135251292, "step": 2628 }, { "epoch": 0.25856858553843887, "grad_norm": 51.189632415771484, "learning_rate": 8e-05, "loss": 58.2569, "num_input_tokens_seen": 135414044, "step": 2631 }, { "epoch": 0.25886341858922385, "grad_norm": 33.631473541259766, "learning_rate": 8e-05, "loss": 54.2228, "num_input_tokens_seen": 135587632, "step": 2634 }, { "epoch": 0.25915825164000883, "grad_norm": 35.20783615112305, "learning_rate": 8e-05, "loss": 50.3905, "num_input_tokens_seen": 135736856, "step": 2637 }, { "epoch": 0.2594530846907938, "grad_norm": 41.462608337402344, "learning_rate": 8e-05, "loss": 53.096, "num_input_tokens_seen": 135891044, "step": 2640 }, { "epoch": 0.25974791774157885, "grad_norm": 36.87857437133789, "learning_rate": 8e-05, "loss": 57.6014, "num_input_tokens_seen": 136053652, "step": 2643 }, { "epoch": 0.26004275079236383, "grad_norm": 34.586448669433594, "learning_rate": 8e-05, "loss": 56.0422, "num_input_tokens_seen": 136177132, "step": 2646 }, { "epoch": 0.2603375838431488, "grad_norm": 34.89237594604492, "learning_rate": 8e-05, "loss": 54.252, "num_input_tokens_seen": 136318884, "step": 2649 }, { "epoch": 0.2606324168939338, "grad_norm": 30.432071685791016, "learning_rate": 8e-05, "loss": 54.7018, "num_input_tokens_seen": 136478960, "step": 2652 }, { "epoch": 0.2609272499447188, "grad_norm": 37.170528411865234, "learning_rate": 8e-05, "loss": 52.9007, "num_input_tokens_seen": 136650348, "step": 2655 }, { "epoch": 0.2612220829955038, "grad_norm": 27.782188415527344, "learning_rate": 8e-05, "loss": 53.955, "num_input_tokens_seen": 136797140, "step": 2658 }, { "epoch": 0.2615169160462888, "grad_norm": 34.65721893310547, "learning_rate": 8e-05, "loss": 54.2823, "num_input_tokens_seen": 136949440, "step": 2661 }, { "epoch": 0.2618117490970738, "grad_norm": 37.2616081237793, "learning_rate": 8e-05, "loss": 56.0293, "num_input_tokens_seen": 137098148, "step": 2664 }, { "epoch": 0.26210658214785876, "grad_norm": 34.62560272216797, "learning_rate": 8e-05, "loss": 53.8384, "num_input_tokens_seen": 137253928, "step": 2667 }, { "epoch": 0.2624014151986438, "grad_norm": 31.226455688476562, "learning_rate": 8e-05, "loss": 55.0638, "num_input_tokens_seen": 137390820, "step": 2670 }, { "epoch": 0.2626962482494288, "grad_norm": 48.27977752685547, "learning_rate": 8e-05, "loss": 53.2792, "num_input_tokens_seen": 137559424, "step": 2673 }, { "epoch": 0.26299108130021376, "grad_norm": 32.660499572753906, "learning_rate": 8e-05, "loss": 52.7827, "num_input_tokens_seen": 137719552, "step": 2676 }, { "epoch": 0.26328591435099874, "grad_norm": 33.29769515991211, "learning_rate": 8e-05, "loss": 52.1215, "num_input_tokens_seen": 137877180, "step": 2679 }, { "epoch": 0.2635807474017837, "grad_norm": 33.17557144165039, "learning_rate": 8e-05, "loss": 54.5644, "num_input_tokens_seen": 138048340, "step": 2682 }, { "epoch": 0.26387558045256876, "grad_norm": 29.700239181518555, "learning_rate": 8e-05, "loss": 49.6223, "num_input_tokens_seen": 138212264, "step": 2685 }, { "epoch": 0.26417041350335374, "grad_norm": 33.049434661865234, "learning_rate": 8e-05, "loss": 56.5728, "num_input_tokens_seen": 138395368, "step": 2688 }, { "epoch": 0.2644652465541387, "grad_norm": 35.083763122558594, "learning_rate": 8e-05, "loss": 54.3003, "num_input_tokens_seen": 138527808, "step": 2691 }, { "epoch": 0.2647600796049237, "grad_norm": 29.783483505249023, "learning_rate": 8e-05, "loss": 51.9218, "num_input_tokens_seen": 138693232, "step": 2694 }, { "epoch": 0.2650549126557087, "grad_norm": 28.74640464782715, "learning_rate": 8e-05, "loss": 52.2437, "num_input_tokens_seen": 138844128, "step": 2697 }, { "epoch": 0.2653497457064937, "grad_norm": 33.24985122680664, "learning_rate": 8e-05, "loss": 54.1239, "num_input_tokens_seen": 138994916, "step": 2700 }, { "epoch": 0.2656445787572787, "grad_norm": 29.64424705505371, "learning_rate": 8e-05, "loss": 53.2436, "num_input_tokens_seen": 139143228, "step": 2703 }, { "epoch": 0.2659394118080637, "grad_norm": 35.86513137817383, "learning_rate": 8e-05, "loss": 56.2783, "num_input_tokens_seen": 139297640, "step": 2706 }, { "epoch": 0.26623424485884867, "grad_norm": 31.42705726623535, "learning_rate": 8e-05, "loss": 56.3895, "num_input_tokens_seen": 139449136, "step": 2709 }, { "epoch": 0.26652907790963365, "grad_norm": 170.20506286621094, "learning_rate": 8e-05, "loss": 53.5718, "num_input_tokens_seen": 139597664, "step": 2712 }, { "epoch": 0.2668239109604187, "grad_norm": 37.06209945678711, "learning_rate": 8e-05, "loss": 51.3754, "num_input_tokens_seen": 139744708, "step": 2715 }, { "epoch": 0.26711874401120367, "grad_norm": 32.0537223815918, "learning_rate": 8e-05, "loss": 54.4363, "num_input_tokens_seen": 139911304, "step": 2718 }, { "epoch": 0.26741357706198865, "grad_norm": 31.959020614624023, "learning_rate": 8e-05, "loss": 57.0125, "num_input_tokens_seen": 140068588, "step": 2721 }, { "epoch": 0.26770841011277363, "grad_norm": 32.1031608581543, "learning_rate": 8e-05, "loss": 48.1442, "num_input_tokens_seen": 140220864, "step": 2724 }, { "epoch": 0.2680032431635586, "grad_norm": 53.579872131347656, "learning_rate": 8e-05, "loss": 53.3052, "num_input_tokens_seen": 140390072, "step": 2727 }, { "epoch": 0.26829807621434365, "grad_norm": 32.496768951416016, "learning_rate": 8e-05, "loss": 55.885, "num_input_tokens_seen": 140551612, "step": 2730 }, { "epoch": 0.26859290926512863, "grad_norm": 33.73159408569336, "learning_rate": 8e-05, "loss": 49.2143, "num_input_tokens_seen": 140705784, "step": 2733 }, { "epoch": 0.2688877423159136, "grad_norm": 44.1909294128418, "learning_rate": 8e-05, "loss": 51.0975, "num_input_tokens_seen": 140851328, "step": 2736 }, { "epoch": 0.2691825753666986, "grad_norm": 35.7469367980957, "learning_rate": 8e-05, "loss": 52.4786, "num_input_tokens_seen": 141018260, "step": 2739 }, { "epoch": 0.2694774084174836, "grad_norm": 35.578575134277344, "learning_rate": 8e-05, "loss": 54.4309, "num_input_tokens_seen": 141157520, "step": 2742 }, { "epoch": 0.2697722414682686, "grad_norm": 31.34946060180664, "learning_rate": 8e-05, "loss": 54.8189, "num_input_tokens_seen": 141322616, "step": 2745 }, { "epoch": 0.2700670745190536, "grad_norm": 29.354907989501953, "learning_rate": 8e-05, "loss": 55.8781, "num_input_tokens_seen": 141496008, "step": 2748 }, { "epoch": 0.2703619075698386, "grad_norm": 30.352764129638672, "learning_rate": 8e-05, "loss": 52.979, "num_input_tokens_seen": 141654120, "step": 2751 }, { "epoch": 0.27065674062062356, "grad_norm": 29.738435745239258, "learning_rate": 8e-05, "loss": 50.5746, "num_input_tokens_seen": 141794060, "step": 2754 }, { "epoch": 0.27095157367140854, "grad_norm": 30.069454193115234, "learning_rate": 8e-05, "loss": 53.589, "num_input_tokens_seen": 141943596, "step": 2757 }, { "epoch": 0.2712464067221936, "grad_norm": 33.227054595947266, "learning_rate": 8e-05, "loss": 49.3517, "num_input_tokens_seen": 142090492, "step": 2760 }, { "epoch": 0.27154123977297856, "grad_norm": 32.27566909790039, "learning_rate": 8e-05, "loss": 53.5934, "num_input_tokens_seen": 142261100, "step": 2763 }, { "epoch": 0.27183607282376354, "grad_norm": 64.10994720458984, "learning_rate": 8e-05, "loss": 52.9152, "num_input_tokens_seen": 142407892, "step": 2766 }, { "epoch": 0.2721309058745485, "grad_norm": 96.94121551513672, "learning_rate": 8e-05, "loss": 51.15, "num_input_tokens_seen": 142552576, "step": 2769 }, { "epoch": 0.2724257389253335, "grad_norm": 40.26789093017578, "learning_rate": 8e-05, "loss": 52.0754, "num_input_tokens_seen": 142702412, "step": 2772 }, { "epoch": 0.27272057197611854, "grad_norm": 31.475791931152344, "learning_rate": 8e-05, "loss": 55.3629, "num_input_tokens_seen": 142858748, "step": 2775 }, { "epoch": 0.2730154050269035, "grad_norm": 32.535865783691406, "learning_rate": 8e-05, "loss": 48.8535, "num_input_tokens_seen": 143027580, "step": 2778 }, { "epoch": 0.2733102380776885, "grad_norm": 31.21608543395996, "learning_rate": 8e-05, "loss": 52.8587, "num_input_tokens_seen": 143182316, "step": 2781 }, { "epoch": 0.2736050711284735, "grad_norm": 33.09787368774414, "learning_rate": 8e-05, "loss": 52.7387, "num_input_tokens_seen": 143350384, "step": 2784 }, { "epoch": 0.27389990417925847, "grad_norm": 32.19013977050781, "learning_rate": 8e-05, "loss": 55.4064, "num_input_tokens_seen": 143494136, "step": 2787 }, { "epoch": 0.2741947372300435, "grad_norm": 33.92564010620117, "learning_rate": 8e-05, "loss": 52.2523, "num_input_tokens_seen": 143658476, "step": 2790 }, { "epoch": 0.2744895702808285, "grad_norm": 28.820104598999023, "learning_rate": 8e-05, "loss": 52.7883, "num_input_tokens_seen": 143809704, "step": 2793 }, { "epoch": 0.27478440333161347, "grad_norm": 32.38165283203125, "learning_rate": 8e-05, "loss": 55.9081, "num_input_tokens_seen": 143979960, "step": 2796 }, { "epoch": 0.27507923638239845, "grad_norm": 30.862777709960938, "learning_rate": 8e-05, "loss": 52.7154, "num_input_tokens_seen": 144132200, "step": 2799 }, { "epoch": 0.27537406943318343, "grad_norm": 31.817401885986328, "learning_rate": 8e-05, "loss": 52.9121, "num_input_tokens_seen": 144283840, "step": 2802 }, { "epoch": 0.27566890248396847, "grad_norm": 30.313737869262695, "learning_rate": 8e-05, "loss": 55.0518, "num_input_tokens_seen": 144446960, "step": 2805 }, { "epoch": 0.27596373553475345, "grad_norm": 35.830440521240234, "learning_rate": 8e-05, "loss": 53.0562, "num_input_tokens_seen": 144599380, "step": 2808 }, { "epoch": 0.27625856858553843, "grad_norm": 42.904781341552734, "learning_rate": 8e-05, "loss": 51.9012, "num_input_tokens_seen": 144765832, "step": 2811 }, { "epoch": 0.2765534016363234, "grad_norm": 36.148956298828125, "learning_rate": 8e-05, "loss": 50.594, "num_input_tokens_seen": 144901240, "step": 2814 }, { "epoch": 0.27684823468710845, "grad_norm": 28.935997009277344, "learning_rate": 8e-05, "loss": 52.4193, "num_input_tokens_seen": 145063040, "step": 2817 }, { "epoch": 0.27714306773789343, "grad_norm": 33.863101959228516, "learning_rate": 8e-05, "loss": 56.4542, "num_input_tokens_seen": 145217092, "step": 2820 }, { "epoch": 0.2774379007886784, "grad_norm": 30.103792190551758, "learning_rate": 8e-05, "loss": 47.8212, "num_input_tokens_seen": 145380040, "step": 2823 }, { "epoch": 0.2777327338394634, "grad_norm": 34.798763275146484, "learning_rate": 8e-05, "loss": 53.4973, "num_input_tokens_seen": 145519664, "step": 2826 }, { "epoch": 0.2780275668902484, "grad_norm": 32.65745544433594, "learning_rate": 8e-05, "loss": 51.7497, "num_input_tokens_seen": 145678828, "step": 2829 }, { "epoch": 0.2783223999410334, "grad_norm": 29.108938217163086, "learning_rate": 8e-05, "loss": 53.0858, "num_input_tokens_seen": 145853384, "step": 2832 }, { "epoch": 0.2786172329918184, "grad_norm": 32.11029815673828, "learning_rate": 8e-05, "loss": 47.0779, "num_input_tokens_seen": 146002992, "step": 2835 }, { "epoch": 0.2789120660426034, "grad_norm": 37.39936828613281, "learning_rate": 8e-05, "loss": 51.646, "num_input_tokens_seen": 146148328, "step": 2838 }, { "epoch": 0.27920689909338836, "grad_norm": 34.64923858642578, "learning_rate": 8e-05, "loss": 50.4198, "num_input_tokens_seen": 146291240, "step": 2841 }, { "epoch": 0.27950173214417334, "grad_norm": 34.844642639160156, "learning_rate": 8e-05, "loss": 53.106, "num_input_tokens_seen": 146413520, "step": 2844 }, { "epoch": 0.2797965651949584, "grad_norm": 41.19792556762695, "learning_rate": 8e-05, "loss": 51.1535, "num_input_tokens_seen": 146537460, "step": 2847 }, { "epoch": 0.28009139824574336, "grad_norm": 30.525279998779297, "learning_rate": 8e-05, "loss": 48.1843, "num_input_tokens_seen": 146686000, "step": 2850 }, { "epoch": 0.28038623129652834, "grad_norm": 32.820343017578125, "learning_rate": 8e-05, "loss": 51.6588, "num_input_tokens_seen": 146833596, "step": 2853 }, { "epoch": 0.2806810643473133, "grad_norm": 30.4845027923584, "learning_rate": 8e-05, "loss": 53.6651, "num_input_tokens_seen": 146986500, "step": 2856 }, { "epoch": 0.2809758973980983, "grad_norm": 28.711761474609375, "learning_rate": 8e-05, "loss": 48.2291, "num_input_tokens_seen": 147126368, "step": 2859 }, { "epoch": 0.28127073044888334, "grad_norm": 35.70820999145508, "learning_rate": 8e-05, "loss": 52.7987, "num_input_tokens_seen": 147291712, "step": 2862 }, { "epoch": 0.2815655634996683, "grad_norm": 36.2431526184082, "learning_rate": 8e-05, "loss": 52.1996, "num_input_tokens_seen": 147430972, "step": 2865 }, { "epoch": 0.2818603965504533, "grad_norm": 41.38965606689453, "learning_rate": 8e-05, "loss": 51.0413, "num_input_tokens_seen": 147601428, "step": 2868 }, { "epoch": 0.2821552296012383, "grad_norm": 34.36966323852539, "learning_rate": 8e-05, "loss": 49.9894, "num_input_tokens_seen": 147741796, "step": 2871 }, { "epoch": 0.28245006265202327, "grad_norm": 29.51498794555664, "learning_rate": 8e-05, "loss": 51.1804, "num_input_tokens_seen": 147896472, "step": 2874 }, { "epoch": 0.2827448957028083, "grad_norm": 31.154193878173828, "learning_rate": 8e-05, "loss": 50.2609, "num_input_tokens_seen": 148044756, "step": 2877 }, { "epoch": 0.2830397287535933, "grad_norm": 44.29144287109375, "learning_rate": 8e-05, "loss": 50.1375, "num_input_tokens_seen": 148207236, "step": 2880 }, { "epoch": 0.28333456180437827, "grad_norm": 32.92866897583008, "learning_rate": 8e-05, "loss": 50.9692, "num_input_tokens_seen": 148347756, "step": 2883 }, { "epoch": 0.28362939485516325, "grad_norm": 82.20730590820312, "learning_rate": 8e-05, "loss": 52.9466, "num_input_tokens_seen": 148507104, "step": 2886 }, { "epoch": 0.28392422790594823, "grad_norm": 31.4622859954834, "learning_rate": 8e-05, "loss": 52.708, "num_input_tokens_seen": 148645608, "step": 2889 }, { "epoch": 0.28421906095673327, "grad_norm": 31.294513702392578, "learning_rate": 8e-05, "loss": 51.7986, "num_input_tokens_seen": 148807032, "step": 2892 }, { "epoch": 0.28451389400751825, "grad_norm": 32.09864044189453, "learning_rate": 8e-05, "loss": 51.8296, "num_input_tokens_seen": 148955096, "step": 2895 }, { "epoch": 0.28480872705830323, "grad_norm": 32.155982971191406, "learning_rate": 8e-05, "loss": 51.9833, "num_input_tokens_seen": 149115876, "step": 2898 }, { "epoch": 0.2851035601090882, "grad_norm": 30.856664657592773, "learning_rate": 8e-05, "loss": 45.5968, "num_input_tokens_seen": 149283748, "step": 2901 }, { "epoch": 0.2853983931598732, "grad_norm": 37.33554458618164, "learning_rate": 8e-05, "loss": 53.588, "num_input_tokens_seen": 149424712, "step": 2904 }, { "epoch": 0.28569322621065824, "grad_norm": 34.55818557739258, "learning_rate": 8e-05, "loss": 52.8435, "num_input_tokens_seen": 149576360, "step": 2907 }, { "epoch": 0.2859880592614432, "grad_norm": 33.608280181884766, "learning_rate": 8e-05, "loss": 49.3937, "num_input_tokens_seen": 149747444, "step": 2910 }, { "epoch": 0.2862828923122282, "grad_norm": 43.21138000488281, "learning_rate": 8e-05, "loss": 53.2086, "num_input_tokens_seen": 149905412, "step": 2913 }, { "epoch": 0.2865777253630132, "grad_norm": 31.36229705810547, "learning_rate": 8e-05, "loss": 54.6847, "num_input_tokens_seen": 150066696, "step": 2916 }, { "epoch": 0.28687255841379816, "grad_norm": 34.487369537353516, "learning_rate": 8e-05, "loss": 50.4352, "num_input_tokens_seen": 150233700, "step": 2919 }, { "epoch": 0.2871673914645832, "grad_norm": 31.998851776123047, "learning_rate": 8e-05, "loss": 49.0981, "num_input_tokens_seen": 150388200, "step": 2922 }, { "epoch": 0.2874622245153682, "grad_norm": 29.5037841796875, "learning_rate": 8e-05, "loss": 52.8468, "num_input_tokens_seen": 150534184, "step": 2925 }, { "epoch": 0.28775705756615316, "grad_norm": 37.64830017089844, "learning_rate": 8e-05, "loss": 54.9494, "num_input_tokens_seen": 150687716, "step": 2928 }, { "epoch": 0.28805189061693814, "grad_norm": 29.77463722229004, "learning_rate": 8e-05, "loss": 51.2639, "num_input_tokens_seen": 150848480, "step": 2931 }, { "epoch": 0.2883467236677231, "grad_norm": 41.943302154541016, "learning_rate": 8e-05, "loss": 50.5784, "num_input_tokens_seen": 151012276, "step": 2934 }, { "epoch": 0.28864155671850816, "grad_norm": 29.216768264770508, "learning_rate": 8e-05, "loss": 48.4823, "num_input_tokens_seen": 151147724, "step": 2937 }, { "epoch": 0.28893638976929314, "grad_norm": 31.684417724609375, "learning_rate": 8e-05, "loss": 51.4178, "num_input_tokens_seen": 151286992, "step": 2940 }, { "epoch": 0.2892312228200781, "grad_norm": 29.715232849121094, "learning_rate": 8e-05, "loss": 52.4719, "num_input_tokens_seen": 151434844, "step": 2943 }, { "epoch": 0.2895260558708631, "grad_norm": 29.671850204467773, "learning_rate": 8e-05, "loss": 48.1795, "num_input_tokens_seen": 151572760, "step": 2946 }, { "epoch": 0.2898208889216481, "grad_norm": 30.16161346435547, "learning_rate": 8e-05, "loss": 50.0184, "num_input_tokens_seen": 151719104, "step": 2949 }, { "epoch": 0.2901157219724331, "grad_norm": 32.42359161376953, "learning_rate": 8e-05, "loss": 53.1842, "num_input_tokens_seen": 151864812, "step": 2952 }, { "epoch": 0.2904105550232181, "grad_norm": 37.412479400634766, "learning_rate": 8e-05, "loss": 52.6939, "num_input_tokens_seen": 152000620, "step": 2955 }, { "epoch": 0.2907053880740031, "grad_norm": 27.60318946838379, "learning_rate": 8e-05, "loss": 48.8245, "num_input_tokens_seen": 152163128, "step": 2958 }, { "epoch": 0.29100022112478807, "grad_norm": 31.002967834472656, "learning_rate": 8e-05, "loss": 52.472, "num_input_tokens_seen": 152312572, "step": 2961 }, { "epoch": 0.2912950541755731, "grad_norm": 32.0164680480957, "learning_rate": 8e-05, "loss": 51.9189, "num_input_tokens_seen": 152464844, "step": 2964 }, { "epoch": 0.2915898872263581, "grad_norm": 34.06315231323242, "learning_rate": 8e-05, "loss": 51.1857, "num_input_tokens_seen": 152627208, "step": 2967 }, { "epoch": 0.2918847202771431, "grad_norm": 34.70756530761719, "learning_rate": 8e-05, "loss": 52.5184, "num_input_tokens_seen": 152788952, "step": 2970 }, { "epoch": 0.29217955332792805, "grad_norm": 32.645355224609375, "learning_rate": 8e-05, "loss": 54.0848, "num_input_tokens_seen": 152941456, "step": 2973 }, { "epoch": 0.29247438637871304, "grad_norm": 35.99182891845703, "learning_rate": 8e-05, "loss": 49.7962, "num_input_tokens_seen": 153102644, "step": 2976 }, { "epoch": 0.2927692194294981, "grad_norm": 30.75571632385254, "learning_rate": 8e-05, "loss": 51.04, "num_input_tokens_seen": 153244812, "step": 2979 }, { "epoch": 0.29306405248028305, "grad_norm": 31.208608627319336, "learning_rate": 8e-05, "loss": 48.5619, "num_input_tokens_seen": 153404796, "step": 2982 }, { "epoch": 0.29335888553106804, "grad_norm": 35.94192123413086, "learning_rate": 8e-05, "loss": 49.4916, "num_input_tokens_seen": 153544740, "step": 2985 }, { "epoch": 0.293653718581853, "grad_norm": 33.36006546020508, "learning_rate": 8e-05, "loss": 52.1915, "num_input_tokens_seen": 153695968, "step": 2988 }, { "epoch": 0.293948551632638, "grad_norm": 29.09237289428711, "learning_rate": 8e-05, "loss": 50.3356, "num_input_tokens_seen": 153874444, "step": 2991 }, { "epoch": 0.29424338468342304, "grad_norm": 44.539207458496094, "learning_rate": 8e-05, "loss": 49.5421, "num_input_tokens_seen": 154035432, "step": 2994 }, { "epoch": 0.294538217734208, "grad_norm": 31.166061401367188, "learning_rate": 8e-05, "loss": 48.624, "num_input_tokens_seen": 154182420, "step": 2997 }, { "epoch": 0.294833050784993, "grad_norm": 27.469114303588867, "learning_rate": 8e-05, "loss": 51.9327, "num_input_tokens_seen": 154351440, "step": 3000 }, { "epoch": 0.294833050784993, "eval_gen_len": 30.495, "eval_loss": 3.2430038452148438, "eval_rouge1": 40.1723, "eval_rouge2": 21.3863, "eval_rougeL": 36.5277, "eval_rougeLsum": 36.8678, "eval_runtime": 91.7301, "eval_samples_per_second": 2.18, "eval_steps_per_second": 0.545, "num_input_tokens_seen": 154351440, "step": 3000 }, { "epoch": 0.295127883835778, "grad_norm": 27.834260940551758, "learning_rate": 8e-05, "loss": 51.2899, "num_input_tokens_seen": 154512120, "step": 3003 }, { "epoch": 0.29542271688656296, "grad_norm": 30.697715759277344, "learning_rate": 8e-05, "loss": 48.9938, "num_input_tokens_seen": 154684184, "step": 3006 }, { "epoch": 0.295717549937348, "grad_norm": 28.035795211791992, "learning_rate": 8e-05, "loss": 50.7757, "num_input_tokens_seen": 154848936, "step": 3009 }, { "epoch": 0.296012382988133, "grad_norm": 31.7027645111084, "learning_rate": 8e-05, "loss": 48.1236, "num_input_tokens_seen": 155004780, "step": 3012 }, { "epoch": 0.29630721603891796, "grad_norm": 30.000329971313477, "learning_rate": 8e-05, "loss": 49.6937, "num_input_tokens_seen": 155170616, "step": 3015 }, { "epoch": 0.29660204908970295, "grad_norm": 31.308128356933594, "learning_rate": 8e-05, "loss": 51.6159, "num_input_tokens_seen": 155325012, "step": 3018 }, { "epoch": 0.2968968821404879, "grad_norm": 40.389461517333984, "learning_rate": 8e-05, "loss": 45.708, "num_input_tokens_seen": 155476104, "step": 3021 }, { "epoch": 0.29719171519127296, "grad_norm": 93.52242279052734, "learning_rate": 8e-05, "loss": 45.6057, "num_input_tokens_seen": 155620040, "step": 3024 }, { "epoch": 0.29748654824205795, "grad_norm": 30.37114906311035, "learning_rate": 8e-05, "loss": 50.7246, "num_input_tokens_seen": 155776340, "step": 3027 }, { "epoch": 0.29778138129284293, "grad_norm": 32.123416900634766, "learning_rate": 8e-05, "loss": 47.2173, "num_input_tokens_seen": 155892028, "step": 3030 }, { "epoch": 0.2980762143436279, "grad_norm": 67.35137939453125, "learning_rate": 8e-05, "loss": 47.236, "num_input_tokens_seen": 156037452, "step": 3033 }, { "epoch": 0.2983710473944129, "grad_norm": 31.5852108001709, "learning_rate": 8e-05, "loss": 48.5519, "num_input_tokens_seen": 156179356, "step": 3036 }, { "epoch": 0.29866588044519793, "grad_norm": 31.071697235107422, "learning_rate": 8e-05, "loss": 52.0375, "num_input_tokens_seen": 156335380, "step": 3039 }, { "epoch": 0.2989607134959829, "grad_norm": 48.0234489440918, "learning_rate": 8e-05, "loss": 51.3958, "num_input_tokens_seen": 156519724, "step": 3042 }, { "epoch": 0.2992555465467679, "grad_norm": 32.876136779785156, "learning_rate": 8e-05, "loss": 50.8297, "num_input_tokens_seen": 156690848, "step": 3045 }, { "epoch": 0.2995503795975529, "grad_norm": 31.397253036499023, "learning_rate": 8e-05, "loss": 50.5913, "num_input_tokens_seen": 156848912, "step": 3048 }, { "epoch": 0.29984521264833786, "grad_norm": 30.054244995117188, "learning_rate": 8e-05, "loss": 45.5249, "num_input_tokens_seen": 157005116, "step": 3051 }, { "epoch": 0.3001400456991229, "grad_norm": 32.175376892089844, "learning_rate": 8e-05, "loss": 54.039, "num_input_tokens_seen": 157163468, "step": 3054 }, { "epoch": 0.3004348787499079, "grad_norm": 27.12468910217285, "learning_rate": 8e-05, "loss": 52.333, "num_input_tokens_seen": 157321272, "step": 3057 }, { "epoch": 0.30072971180069286, "grad_norm": 30.750747680664062, "learning_rate": 8e-05, "loss": 48.1032, "num_input_tokens_seen": 157498128, "step": 3060 }, { "epoch": 0.30102454485147784, "grad_norm": 35.073326110839844, "learning_rate": 8e-05, "loss": 47.2467, "num_input_tokens_seen": 157654200, "step": 3063 }, { "epoch": 0.3013193779022628, "grad_norm": 36.32673645019531, "learning_rate": 8e-05, "loss": 54.9197, "num_input_tokens_seen": 157818088, "step": 3066 }, { "epoch": 0.30161421095304786, "grad_norm": 26.123674392700195, "learning_rate": 8e-05, "loss": 50.2599, "num_input_tokens_seen": 157982596, "step": 3069 }, { "epoch": 0.30190904400383284, "grad_norm": 33.341209411621094, "learning_rate": 8e-05, "loss": 50.698, "num_input_tokens_seen": 158152676, "step": 3072 }, { "epoch": 0.3022038770546178, "grad_norm": 26.35466194152832, "learning_rate": 8e-05, "loss": 46.5832, "num_input_tokens_seen": 158311820, "step": 3075 }, { "epoch": 0.3024987101054028, "grad_norm": 34.99408721923828, "learning_rate": 8e-05, "loss": 51.7367, "num_input_tokens_seen": 158483932, "step": 3078 }, { "epoch": 0.3027935431561878, "grad_norm": 35.281272888183594, "learning_rate": 8e-05, "loss": 48.9734, "num_input_tokens_seen": 158620712, "step": 3081 }, { "epoch": 0.3030883762069728, "grad_norm": 28.188995361328125, "learning_rate": 8e-05, "loss": 47.357, "num_input_tokens_seen": 158771128, "step": 3084 }, { "epoch": 0.3033832092577578, "grad_norm": 31.777433395385742, "learning_rate": 8e-05, "loss": 51.7049, "num_input_tokens_seen": 158922868, "step": 3087 }, { "epoch": 0.3036780423085428, "grad_norm": 29.47471046447754, "learning_rate": 8e-05, "loss": 50.8354, "num_input_tokens_seen": 159070688, "step": 3090 }, { "epoch": 0.30397287535932777, "grad_norm": 31.329797744750977, "learning_rate": 8e-05, "loss": 48.8359, "num_input_tokens_seen": 159204564, "step": 3093 }, { "epoch": 0.30426770841011275, "grad_norm": 36.26081466674805, "learning_rate": 8e-05, "loss": 51.5042, "num_input_tokens_seen": 159341748, "step": 3096 }, { "epoch": 0.3045625414608978, "grad_norm": 46.48896789550781, "learning_rate": 8e-05, "loss": 48.3833, "num_input_tokens_seen": 159487756, "step": 3099 }, { "epoch": 0.30485737451168277, "grad_norm": 27.038835525512695, "learning_rate": 8e-05, "loss": 45.7966, "num_input_tokens_seen": 159646172, "step": 3102 }, { "epoch": 0.30515220756246775, "grad_norm": 29.98095703125, "learning_rate": 8e-05, "loss": 43.3869, "num_input_tokens_seen": 159823372, "step": 3105 }, { "epoch": 0.30544704061325273, "grad_norm": 30.19727897644043, "learning_rate": 8e-05, "loss": 47.5798, "num_input_tokens_seen": 159975388, "step": 3108 }, { "epoch": 0.3057418736640377, "grad_norm": 29.022533416748047, "learning_rate": 8e-05, "loss": 52.4457, "num_input_tokens_seen": 160103260, "step": 3111 }, { "epoch": 0.30603670671482275, "grad_norm": 90.77041625976562, "learning_rate": 8e-05, "loss": 54.8584, "num_input_tokens_seen": 160259356, "step": 3114 }, { "epoch": 0.30633153976560773, "grad_norm": 41.759761810302734, "learning_rate": 8e-05, "loss": 48.1346, "num_input_tokens_seen": 160427328, "step": 3117 }, { "epoch": 0.3066263728163927, "grad_norm": 38.2276496887207, "learning_rate": 8e-05, "loss": 48.8283, "num_input_tokens_seen": 160572824, "step": 3120 }, { "epoch": 0.3069212058671777, "grad_norm": 31.891613006591797, "learning_rate": 8e-05, "loss": 47.1253, "num_input_tokens_seen": 160751252, "step": 3123 }, { "epoch": 0.30721603891796273, "grad_norm": 29.48491668701172, "learning_rate": 8e-05, "loss": 47.7418, "num_input_tokens_seen": 160934284, "step": 3126 }, { "epoch": 0.3075108719687477, "grad_norm": 30.440406799316406, "learning_rate": 8e-05, "loss": 48.4942, "num_input_tokens_seen": 161106808, "step": 3129 }, { "epoch": 0.3078057050195327, "grad_norm": 32.63410186767578, "learning_rate": 8e-05, "loss": 51.9228, "num_input_tokens_seen": 161259732, "step": 3132 }, { "epoch": 0.3081005380703177, "grad_norm": 30.53582000732422, "learning_rate": 8e-05, "loss": 54.3659, "num_input_tokens_seen": 161410464, "step": 3135 }, { "epoch": 0.30839537112110266, "grad_norm": 31.640932083129883, "learning_rate": 8e-05, "loss": 49.5831, "num_input_tokens_seen": 161557152, "step": 3138 }, { "epoch": 0.3086902041718877, "grad_norm": 32.702518463134766, "learning_rate": 8e-05, "loss": 50.4781, "num_input_tokens_seen": 161707312, "step": 3141 }, { "epoch": 0.3089850372226727, "grad_norm": 30.084012985229492, "learning_rate": 8e-05, "loss": 51.5196, "num_input_tokens_seen": 161855552, "step": 3144 }, { "epoch": 0.30927987027345766, "grad_norm": 33.723819732666016, "learning_rate": 8e-05, "loss": 49.4337, "num_input_tokens_seen": 161991804, "step": 3147 }, { "epoch": 0.30957470332424264, "grad_norm": 37.2187614440918, "learning_rate": 8e-05, "loss": 52.0158, "num_input_tokens_seen": 162132524, "step": 3150 }, { "epoch": 0.3098695363750276, "grad_norm": 31.11838722229004, "learning_rate": 8e-05, "loss": 50.1612, "num_input_tokens_seen": 162286832, "step": 3153 }, { "epoch": 0.31016436942581266, "grad_norm": 29.569923400878906, "learning_rate": 8e-05, "loss": 45.6304, "num_input_tokens_seen": 162426504, "step": 3156 }, { "epoch": 0.31045920247659764, "grad_norm": 33.4188232421875, "learning_rate": 8e-05, "loss": 48.863, "num_input_tokens_seen": 162582048, "step": 3159 }, { "epoch": 0.3107540355273826, "grad_norm": 30.900367736816406, "learning_rate": 8e-05, "loss": 45.2414, "num_input_tokens_seen": 162735952, "step": 3162 }, { "epoch": 0.3110488685781676, "grad_norm": 30.14768409729004, "learning_rate": 8e-05, "loss": 51.5442, "num_input_tokens_seen": 162880820, "step": 3165 }, { "epoch": 0.3113437016289526, "grad_norm": 30.156339645385742, "learning_rate": 8e-05, "loss": 52.2109, "num_input_tokens_seen": 163037460, "step": 3168 }, { "epoch": 0.3116385346797376, "grad_norm": 29.608909606933594, "learning_rate": 8e-05, "loss": 48.732, "num_input_tokens_seen": 163150724, "step": 3171 }, { "epoch": 0.3119333677305226, "grad_norm": 30.16400718688965, "learning_rate": 8e-05, "loss": 51.2494, "num_input_tokens_seen": 163317320, "step": 3174 }, { "epoch": 0.3122282007813076, "grad_norm": 33.92653274536133, "learning_rate": 8e-05, "loss": 51.0058, "num_input_tokens_seen": 163478164, "step": 3177 }, { "epoch": 0.31252303383209257, "grad_norm": 34.29814910888672, "learning_rate": 8e-05, "loss": 49.5705, "num_input_tokens_seen": 163593236, "step": 3180 }, { "epoch": 0.31281786688287755, "grad_norm": 43.815528869628906, "learning_rate": 8e-05, "loss": 47.95, "num_input_tokens_seen": 163751472, "step": 3183 }, { "epoch": 0.3131126999336626, "grad_norm": 34.95302200317383, "learning_rate": 8e-05, "loss": 48.482, "num_input_tokens_seen": 163909016, "step": 3186 }, { "epoch": 0.31340753298444757, "grad_norm": 32.39896774291992, "learning_rate": 8e-05, "loss": 47.6268, "num_input_tokens_seen": 164061776, "step": 3189 }, { "epoch": 0.31370236603523255, "grad_norm": 29.34246063232422, "learning_rate": 8e-05, "loss": 44.7197, "num_input_tokens_seen": 164228080, "step": 3192 }, { "epoch": 0.31399719908601753, "grad_norm": 32.78367233276367, "learning_rate": 8e-05, "loss": 49.5443, "num_input_tokens_seen": 164372944, "step": 3195 }, { "epoch": 0.3142920321368025, "grad_norm": 29.293537139892578, "learning_rate": 8e-05, "loss": 48.2356, "num_input_tokens_seen": 164539204, "step": 3198 }, { "epoch": 0.31458686518758755, "grad_norm": 27.77086067199707, "learning_rate": 8e-05, "loss": 48.6342, "num_input_tokens_seen": 164700484, "step": 3201 }, { "epoch": 0.31488169823837253, "grad_norm": 30.448762893676758, "learning_rate": 8e-05, "loss": 50.1135, "num_input_tokens_seen": 164858072, "step": 3204 }, { "epoch": 0.3151765312891575, "grad_norm": 31.73259925842285, "learning_rate": 8e-05, "loss": 48.8052, "num_input_tokens_seen": 164994416, "step": 3207 }, { "epoch": 0.3154713643399425, "grad_norm": 24.573047637939453, "learning_rate": 8e-05, "loss": 48.3418, "num_input_tokens_seen": 165168004, "step": 3210 }, { "epoch": 0.3157661973907275, "grad_norm": 30.762184143066406, "learning_rate": 8e-05, "loss": 51.0315, "num_input_tokens_seen": 165320468, "step": 3213 }, { "epoch": 0.3160610304415125, "grad_norm": 31.318260192871094, "learning_rate": 8e-05, "loss": 50.0707, "num_input_tokens_seen": 165487880, "step": 3216 }, { "epoch": 0.3163558634922975, "grad_norm": 34.796592712402344, "learning_rate": 8e-05, "loss": 47.4512, "num_input_tokens_seen": 165647096, "step": 3219 }, { "epoch": 0.3166506965430825, "grad_norm": 31.09071159362793, "learning_rate": 8e-05, "loss": 48.0372, "num_input_tokens_seen": 165813096, "step": 3222 }, { "epoch": 0.31694552959386746, "grad_norm": 29.88203239440918, "learning_rate": 8e-05, "loss": 44.7359, "num_input_tokens_seen": 165981372, "step": 3225 }, { "epoch": 0.31724036264465244, "grad_norm": 29.312549591064453, "learning_rate": 8e-05, "loss": 47.5664, "num_input_tokens_seen": 166140500, "step": 3228 }, { "epoch": 0.3175351956954375, "grad_norm": 34.14030456542969, "learning_rate": 8e-05, "loss": 47.5588, "num_input_tokens_seen": 166289596, "step": 3231 }, { "epoch": 0.31783002874622246, "grad_norm": 32.20860290527344, "learning_rate": 8e-05, "loss": 50.7921, "num_input_tokens_seen": 166451524, "step": 3234 }, { "epoch": 0.31812486179700744, "grad_norm": 31.75952911376953, "learning_rate": 8e-05, "loss": 49.2161, "num_input_tokens_seen": 166602148, "step": 3237 }, { "epoch": 0.3184196948477924, "grad_norm": 29.15970802307129, "learning_rate": 8e-05, "loss": 49.655, "num_input_tokens_seen": 166750708, "step": 3240 }, { "epoch": 0.3187145278985774, "grad_norm": 28.13986587524414, "learning_rate": 8e-05, "loss": 50.4226, "num_input_tokens_seen": 166894808, "step": 3243 }, { "epoch": 0.31900936094936244, "grad_norm": 31.35550308227539, "learning_rate": 8e-05, "loss": 48.0589, "num_input_tokens_seen": 167055288, "step": 3246 }, { "epoch": 0.3193041940001474, "grad_norm": 41.20752716064453, "learning_rate": 8e-05, "loss": 46.9058, "num_input_tokens_seen": 167214404, "step": 3249 }, { "epoch": 0.3195990270509324, "grad_norm": 31.76648712158203, "learning_rate": 8e-05, "loss": 51.1456, "num_input_tokens_seen": 167352292, "step": 3252 }, { "epoch": 0.3198938601017174, "grad_norm": 25.93441390991211, "learning_rate": 8e-05, "loss": 45.1572, "num_input_tokens_seen": 167508120, "step": 3255 }, { "epoch": 0.32018869315250237, "grad_norm": 31.922340393066406, "learning_rate": 8e-05, "loss": 48.1204, "num_input_tokens_seen": 167658524, "step": 3258 }, { "epoch": 0.3204835262032874, "grad_norm": 31.73157501220703, "learning_rate": 8e-05, "loss": 47.138, "num_input_tokens_seen": 167802924, "step": 3261 }, { "epoch": 0.3207783592540724, "grad_norm": 32.577701568603516, "learning_rate": 8e-05, "loss": 47.1694, "num_input_tokens_seen": 167933868, "step": 3264 }, { "epoch": 0.32107319230485737, "grad_norm": 29.493078231811523, "learning_rate": 8e-05, "loss": 46.81, "num_input_tokens_seen": 168090452, "step": 3267 }, { "epoch": 0.32136802535564235, "grad_norm": 29.012908935546875, "learning_rate": 8e-05, "loss": 43.3255, "num_input_tokens_seen": 168232328, "step": 3270 }, { "epoch": 0.3216628584064274, "grad_norm": 32.09306716918945, "learning_rate": 8e-05, "loss": 45.4934, "num_input_tokens_seen": 168361412, "step": 3273 }, { "epoch": 0.32195769145721237, "grad_norm": 31.578340530395508, "learning_rate": 8e-05, "loss": 44.2819, "num_input_tokens_seen": 168491948, "step": 3276 }, { "epoch": 0.32225252450799735, "grad_norm": 32.133750915527344, "learning_rate": 8e-05, "loss": 48.465, "num_input_tokens_seen": 168631568, "step": 3279 }, { "epoch": 0.32254735755878233, "grad_norm": 29.717243194580078, "learning_rate": 8e-05, "loss": 50.4961, "num_input_tokens_seen": 168768060, "step": 3282 }, { "epoch": 0.3228421906095673, "grad_norm": 30.31847381591797, "learning_rate": 8e-05, "loss": 51.0925, "num_input_tokens_seen": 168917240, "step": 3285 }, { "epoch": 0.32313702366035235, "grad_norm": 28.687097549438477, "learning_rate": 8e-05, "loss": 50.5125, "num_input_tokens_seen": 169092584, "step": 3288 }, { "epoch": 0.32343185671113733, "grad_norm": 35.459190368652344, "learning_rate": 8e-05, "loss": 49.7695, "num_input_tokens_seen": 169246108, "step": 3291 }, { "epoch": 0.3237266897619223, "grad_norm": 29.45003318786621, "learning_rate": 8e-05, "loss": 53.2221, "num_input_tokens_seen": 169419860, "step": 3294 }, { "epoch": 0.3240215228127073, "grad_norm": 37.26349639892578, "learning_rate": 8e-05, "loss": 47.2518, "num_input_tokens_seen": 169574904, "step": 3297 }, { "epoch": 0.3243163558634923, "grad_norm": 31.292814254760742, "learning_rate": 8e-05, "loss": 48.2314, "num_input_tokens_seen": 169734168, "step": 3300 }, { "epoch": 0.3246111889142773, "grad_norm": 32.58222961425781, "learning_rate": 8e-05, "loss": 49.1216, "num_input_tokens_seen": 169904416, "step": 3303 }, { "epoch": 0.3249060219650623, "grad_norm": 31.01238250732422, "learning_rate": 8e-05, "loss": 45.0155, "num_input_tokens_seen": 170060788, "step": 3306 }, { "epoch": 0.3252008550158473, "grad_norm": 34.21432876586914, "learning_rate": 8e-05, "loss": 48.9929, "num_input_tokens_seen": 170202916, "step": 3309 }, { "epoch": 0.32549568806663226, "grad_norm": 28.22661590576172, "learning_rate": 8e-05, "loss": 45.7777, "num_input_tokens_seen": 170362740, "step": 3312 }, { "epoch": 0.32579052111741724, "grad_norm": 26.78307342529297, "learning_rate": 8e-05, "loss": 44.7937, "num_input_tokens_seen": 170521040, "step": 3315 }, { "epoch": 0.3260853541682023, "grad_norm": 29.794517517089844, "learning_rate": 8e-05, "loss": 47.9855, "num_input_tokens_seen": 170682436, "step": 3318 }, { "epoch": 0.32638018721898726, "grad_norm": 32.7954216003418, "learning_rate": 8e-05, "loss": 43.2529, "num_input_tokens_seen": 170844748, "step": 3321 }, { "epoch": 0.32667502026977224, "grad_norm": 29.20732307434082, "learning_rate": 8e-05, "loss": 46.1486, "num_input_tokens_seen": 171001688, "step": 3324 }, { "epoch": 0.3269698533205572, "grad_norm": 30.342622756958008, "learning_rate": 8e-05, "loss": 49.8051, "num_input_tokens_seen": 171153672, "step": 3327 }, { "epoch": 0.3272646863713422, "grad_norm": 59.030792236328125, "learning_rate": 8e-05, "loss": 46.6331, "num_input_tokens_seen": 171301812, "step": 3330 }, { "epoch": 0.32755951942212724, "grad_norm": 33.16861343383789, "learning_rate": 8e-05, "loss": 44.3875, "num_input_tokens_seen": 171457740, "step": 3333 }, { "epoch": 0.3278543524729122, "grad_norm": 31.25284194946289, "learning_rate": 8e-05, "loss": 47.5177, "num_input_tokens_seen": 171607032, "step": 3336 }, { "epoch": 0.3281491855236972, "grad_norm": 33.96405029296875, "learning_rate": 8e-05, "loss": 46.1177, "num_input_tokens_seen": 171742676, "step": 3339 }, { "epoch": 0.3284440185744822, "grad_norm": 31.06889533996582, "learning_rate": 8e-05, "loss": 46.4838, "num_input_tokens_seen": 171906656, "step": 3342 }, { "epoch": 0.32873885162526717, "grad_norm": 31.36388397216797, "learning_rate": 8e-05, "loss": 48.7096, "num_input_tokens_seen": 172072992, "step": 3345 }, { "epoch": 0.3290336846760522, "grad_norm": 30.852571487426758, "learning_rate": 8e-05, "loss": 50.6002, "num_input_tokens_seen": 172230152, "step": 3348 }, { "epoch": 0.3293285177268372, "grad_norm": 30.819143295288086, "learning_rate": 8e-05, "loss": 49.1265, "num_input_tokens_seen": 172386988, "step": 3351 }, { "epoch": 0.32962335077762217, "grad_norm": 41.371429443359375, "learning_rate": 8e-05, "loss": 46.3821, "num_input_tokens_seen": 172524532, "step": 3354 }, { "epoch": 0.32991818382840715, "grad_norm": 28.965272903442383, "learning_rate": 8e-05, "loss": 47.383, "num_input_tokens_seen": 172683024, "step": 3357 }, { "epoch": 0.33021301687919213, "grad_norm": 53.77383804321289, "learning_rate": 8e-05, "loss": 47.8472, "num_input_tokens_seen": 172821612, "step": 3360 }, { "epoch": 0.33050784992997717, "grad_norm": 30.76011085510254, "learning_rate": 8e-05, "loss": 45.129, "num_input_tokens_seen": 172967344, "step": 3363 }, { "epoch": 0.33080268298076215, "grad_norm": 27.747955322265625, "learning_rate": 8e-05, "loss": 46.2741, "num_input_tokens_seen": 173115604, "step": 3366 }, { "epoch": 0.33109751603154713, "grad_norm": 34.3202018737793, "learning_rate": 8e-05, "loss": 51.4671, "num_input_tokens_seen": 173262284, "step": 3369 }, { "epoch": 0.3313923490823321, "grad_norm": 40.12452697753906, "learning_rate": 8e-05, "loss": 41.3438, "num_input_tokens_seen": 173425756, "step": 3372 }, { "epoch": 0.3316871821331171, "grad_norm": 37.801883697509766, "learning_rate": 8e-05, "loss": 48.3023, "num_input_tokens_seen": 173566892, "step": 3375 }, { "epoch": 0.33198201518390213, "grad_norm": 35.19266891479492, "learning_rate": 8e-05, "loss": 49.593, "num_input_tokens_seen": 173737684, "step": 3378 }, { "epoch": 0.3322768482346871, "grad_norm": 28.83980369567871, "learning_rate": 8e-05, "loss": 44.9847, "num_input_tokens_seen": 173916400, "step": 3381 }, { "epoch": 0.3325716812854721, "grad_norm": 28.931739807128906, "learning_rate": 8e-05, "loss": 46.7851, "num_input_tokens_seen": 174045520, "step": 3384 }, { "epoch": 0.3328665143362571, "grad_norm": 32.318565368652344, "learning_rate": 8e-05, "loss": 48.6233, "num_input_tokens_seen": 174204072, "step": 3387 }, { "epoch": 0.33316134738704206, "grad_norm": 31.32320213317871, "learning_rate": 8e-05, "loss": 48.7227, "num_input_tokens_seen": 174375316, "step": 3390 }, { "epoch": 0.3334561804378271, "grad_norm": 34.406246185302734, "learning_rate": 8e-05, "loss": 43.7779, "num_input_tokens_seen": 174514908, "step": 3393 }, { "epoch": 0.3337510134886121, "grad_norm": 29.341218948364258, "learning_rate": 8e-05, "loss": 44.7992, "num_input_tokens_seen": 174646708, "step": 3396 }, { "epoch": 0.33404584653939706, "grad_norm": 29.529809951782227, "learning_rate": 8e-05, "loss": 44.8276, "num_input_tokens_seen": 174789488, "step": 3399 }, { "epoch": 0.33434067959018204, "grad_norm": 31.401277542114258, "learning_rate": 8e-05, "loss": 53.1048, "num_input_tokens_seen": 174963152, "step": 3402 }, { "epoch": 0.334635512640967, "grad_norm": 85.02721405029297, "learning_rate": 8e-05, "loss": 42.7275, "num_input_tokens_seen": 175137252, "step": 3405 }, { "epoch": 0.33493034569175206, "grad_norm": 36.67193603515625, "learning_rate": 8e-05, "loss": 48.8682, "num_input_tokens_seen": 175315524, "step": 3408 }, { "epoch": 0.33522517874253704, "grad_norm": 34.90581512451172, "learning_rate": 8e-05, "loss": 45.0905, "num_input_tokens_seen": 175465152, "step": 3411 }, { "epoch": 0.335520011793322, "grad_norm": 30.830902099609375, "learning_rate": 8e-05, "loss": 52.4906, "num_input_tokens_seen": 175642048, "step": 3414 }, { "epoch": 0.335814844844107, "grad_norm": 31.64945411682129, "learning_rate": 8e-05, "loss": 49.6005, "num_input_tokens_seen": 175791692, "step": 3417 }, { "epoch": 0.33610967789489204, "grad_norm": 30.259660720825195, "learning_rate": 8e-05, "loss": 45.7076, "num_input_tokens_seen": 175928204, "step": 3420 }, { "epoch": 0.336404510945677, "grad_norm": 32.44395065307617, "learning_rate": 8e-05, "loss": 49.5071, "num_input_tokens_seen": 176079484, "step": 3423 }, { "epoch": 0.336699343996462, "grad_norm": 494.6167907714844, "learning_rate": 8e-05, "loss": 47.7877, "num_input_tokens_seen": 176223024, "step": 3426 }, { "epoch": 0.336994177047247, "grad_norm": 31.663257598876953, "learning_rate": 8e-05, "loss": 47.6809, "num_input_tokens_seen": 176362164, "step": 3429 }, { "epoch": 0.33728901009803197, "grad_norm": 32.32041549682617, "learning_rate": 8e-05, "loss": 48.758, "num_input_tokens_seen": 176521920, "step": 3432 }, { "epoch": 0.337583843148817, "grad_norm": 30.0501766204834, "learning_rate": 8e-05, "loss": 47.7477, "num_input_tokens_seen": 176662252, "step": 3435 }, { "epoch": 0.337878676199602, "grad_norm": 34.510536193847656, "learning_rate": 8e-05, "loss": 47.1626, "num_input_tokens_seen": 176824600, "step": 3438 }, { "epoch": 0.33817350925038697, "grad_norm": 28.301755905151367, "learning_rate": 8e-05, "loss": 43.903, "num_input_tokens_seen": 177002840, "step": 3441 }, { "epoch": 0.33846834230117195, "grad_norm": 29.808353424072266, "learning_rate": 8e-05, "loss": 45.0669, "num_input_tokens_seen": 177133472, "step": 3444 }, { "epoch": 0.33876317535195694, "grad_norm": 34.070655822753906, "learning_rate": 8e-05, "loss": 48.548, "num_input_tokens_seen": 177298748, "step": 3447 }, { "epoch": 0.339058008402742, "grad_norm": 37.89089584350586, "learning_rate": 8e-05, "loss": 53.683, "num_input_tokens_seen": 177438948, "step": 3450 }, { "epoch": 0.33935284145352695, "grad_norm": 29.450450897216797, "learning_rate": 8e-05, "loss": 44.3855, "num_input_tokens_seen": 177602176, "step": 3453 }, { "epoch": 0.33964767450431194, "grad_norm": 31.299362182617188, "learning_rate": 8e-05, "loss": 48.8359, "num_input_tokens_seen": 177758804, "step": 3456 }, { "epoch": 0.3399425075550969, "grad_norm": 34.62590408325195, "learning_rate": 8e-05, "loss": 50.2165, "num_input_tokens_seen": 177913172, "step": 3459 }, { "epoch": 0.3402373406058819, "grad_norm": 30.38800811767578, "learning_rate": 8e-05, "loss": 49.167, "num_input_tokens_seen": 178064440, "step": 3462 }, { "epoch": 0.34053217365666694, "grad_norm": 31.36223030090332, "learning_rate": 8e-05, "loss": 48.3271, "num_input_tokens_seen": 178230792, "step": 3465 }, { "epoch": 0.3408270067074519, "grad_norm": 28.67354965209961, "learning_rate": 8e-05, "loss": 44.4037, "num_input_tokens_seen": 178367876, "step": 3468 }, { "epoch": 0.3411218397582369, "grad_norm": 27.097543716430664, "learning_rate": 8e-05, "loss": 46.3705, "num_input_tokens_seen": 178521252, "step": 3471 }, { "epoch": 0.3414166728090219, "grad_norm": 28.45320701599121, "learning_rate": 8e-05, "loss": 47.9262, "num_input_tokens_seen": 178673632, "step": 3474 }, { "epoch": 0.34171150585980686, "grad_norm": 34.57194137573242, "learning_rate": 8e-05, "loss": 46.3569, "num_input_tokens_seen": 178831004, "step": 3477 }, { "epoch": 0.3420063389105919, "grad_norm": 30.40506935119629, "learning_rate": 8e-05, "loss": 43.096, "num_input_tokens_seen": 178980504, "step": 3480 }, { "epoch": 0.3423011719613769, "grad_norm": 31.882638931274414, "learning_rate": 8e-05, "loss": 48.569, "num_input_tokens_seen": 179121392, "step": 3483 }, { "epoch": 0.34259600501216186, "grad_norm": 28.25156593322754, "learning_rate": 8e-05, "loss": 47.3365, "num_input_tokens_seen": 179268492, "step": 3486 }, { "epoch": 0.34289083806294685, "grad_norm": 30.753129959106445, "learning_rate": 8e-05, "loss": 47.6602, "num_input_tokens_seen": 179410660, "step": 3489 }, { "epoch": 0.3431856711137318, "grad_norm": 30.181903839111328, "learning_rate": 8e-05, "loss": 50.688, "num_input_tokens_seen": 179595220, "step": 3492 }, { "epoch": 0.34348050416451686, "grad_norm": 31.859561920166016, "learning_rate": 8e-05, "loss": 46.9394, "num_input_tokens_seen": 179753316, "step": 3495 }, { "epoch": 0.34377533721530185, "grad_norm": 58.434322357177734, "learning_rate": 8e-05, "loss": 49.7456, "num_input_tokens_seen": 179901772, "step": 3498 }, { "epoch": 0.3440701702660868, "grad_norm": 35.47056579589844, "learning_rate": 8e-05, "loss": 46.3298, "num_input_tokens_seen": 180042052, "step": 3501 }, { "epoch": 0.3443650033168718, "grad_norm": 28.941143035888672, "learning_rate": 8e-05, "loss": 46.3121, "num_input_tokens_seen": 180183700, "step": 3504 }, { "epoch": 0.3446598363676568, "grad_norm": 31.060523986816406, "learning_rate": 8e-05, "loss": 47.7124, "num_input_tokens_seen": 180345032, "step": 3507 }, { "epoch": 0.34495466941844183, "grad_norm": 32.34244918823242, "learning_rate": 8e-05, "loss": 45.1072, "num_input_tokens_seen": 180500748, "step": 3510 }, { "epoch": 0.3452495024692268, "grad_norm": 102.28973388671875, "learning_rate": 8e-05, "loss": 49.6808, "num_input_tokens_seen": 180648820, "step": 3513 }, { "epoch": 0.3455443355200118, "grad_norm": 31.596895217895508, "learning_rate": 8e-05, "loss": 47.586, "num_input_tokens_seen": 180816336, "step": 3516 }, { "epoch": 0.3458391685707968, "grad_norm": 84.94961547851562, "learning_rate": 8e-05, "loss": 48.6347, "num_input_tokens_seen": 180981176, "step": 3519 }, { "epoch": 0.34613400162158175, "grad_norm": 30.54353141784668, "learning_rate": 8e-05, "loss": 45.0397, "num_input_tokens_seen": 181162392, "step": 3522 }, { "epoch": 0.3464288346723668, "grad_norm": 29.32094383239746, "learning_rate": 8e-05, "loss": 41.4492, "num_input_tokens_seen": 181312360, "step": 3525 }, { "epoch": 0.3467236677231518, "grad_norm": 31.889060974121094, "learning_rate": 8e-05, "loss": 47.0884, "num_input_tokens_seen": 181481268, "step": 3528 }, { "epoch": 0.34701850077393676, "grad_norm": 28.774999618530273, "learning_rate": 8e-05, "loss": 46.5325, "num_input_tokens_seen": 181645000, "step": 3531 }, { "epoch": 0.34731333382472174, "grad_norm": 35.49319839477539, "learning_rate": 8e-05, "loss": 49.6535, "num_input_tokens_seen": 181804300, "step": 3534 }, { "epoch": 0.3476081668755067, "grad_norm": 29.114641189575195, "learning_rate": 8e-05, "loss": 48.0461, "num_input_tokens_seen": 181956832, "step": 3537 }, { "epoch": 0.34790299992629176, "grad_norm": 31.43871307373047, "learning_rate": 8e-05, "loss": 45.1125, "num_input_tokens_seen": 182101356, "step": 3540 }, { "epoch": 0.34819783297707674, "grad_norm": 36.504127502441406, "learning_rate": 8e-05, "loss": 46.4158, "num_input_tokens_seen": 182254904, "step": 3543 }, { "epoch": 0.3484926660278617, "grad_norm": 29.40753746032715, "learning_rate": 8e-05, "loss": 48.6721, "num_input_tokens_seen": 182427084, "step": 3546 }, { "epoch": 0.3487874990786467, "grad_norm": 33.908546447753906, "learning_rate": 8e-05, "loss": 47.7057, "num_input_tokens_seen": 182574352, "step": 3549 }, { "epoch": 0.3490823321294317, "grad_norm": 32.480323791503906, "learning_rate": 8e-05, "loss": 50.7387, "num_input_tokens_seen": 182704996, "step": 3552 }, { "epoch": 0.3493771651802167, "grad_norm": 34.784576416015625, "learning_rate": 8e-05, "loss": 47.4257, "num_input_tokens_seen": 182855420, "step": 3555 }, { "epoch": 0.3496719982310017, "grad_norm": 30.89937400817871, "learning_rate": 8e-05, "loss": 46.5586, "num_input_tokens_seen": 182987680, "step": 3558 }, { "epoch": 0.3499668312817867, "grad_norm": 28.803722381591797, "learning_rate": 8e-05, "loss": 46.2221, "num_input_tokens_seen": 183137976, "step": 3561 }, { "epoch": 0.35026166433257166, "grad_norm": 30.891504287719727, "learning_rate": 8e-05, "loss": 48.2359, "num_input_tokens_seen": 183293772, "step": 3564 }, { "epoch": 0.35055649738335665, "grad_norm": 38.08857727050781, "learning_rate": 8e-05, "loss": 47.384, "num_input_tokens_seen": 183426252, "step": 3567 }, { "epoch": 0.3508513304341417, "grad_norm": 33.9434928894043, "learning_rate": 8e-05, "loss": 49.6966, "num_input_tokens_seen": 183569584, "step": 3570 }, { "epoch": 0.35114616348492667, "grad_norm": 33.663787841796875, "learning_rate": 8e-05, "loss": 41.6853, "num_input_tokens_seen": 183716720, "step": 3573 }, { "epoch": 0.35144099653571165, "grad_norm": 35.80391311645508, "learning_rate": 8e-05, "loss": 48.0787, "num_input_tokens_seen": 183861440, "step": 3576 }, { "epoch": 0.35173582958649663, "grad_norm": 48.684268951416016, "learning_rate": 8e-05, "loss": 50.4618, "num_input_tokens_seen": 184031560, "step": 3579 }, { "epoch": 0.35203066263728167, "grad_norm": 33.56404113769531, "learning_rate": 8e-05, "loss": 46.1341, "num_input_tokens_seen": 184182688, "step": 3582 }, { "epoch": 0.35232549568806665, "grad_norm": 32.63978958129883, "learning_rate": 8e-05, "loss": 45.7058, "num_input_tokens_seen": 184364896, "step": 3585 }, { "epoch": 0.35262032873885163, "grad_norm": 34.40457534790039, "learning_rate": 8e-05, "loss": 49.1106, "num_input_tokens_seen": 184508572, "step": 3588 }, { "epoch": 0.3529151617896366, "grad_norm": 32.84516906738281, "learning_rate": 8e-05, "loss": 50.3652, "num_input_tokens_seen": 184686200, "step": 3591 }, { "epoch": 0.3532099948404216, "grad_norm": 81.30366516113281, "learning_rate": 8e-05, "loss": 46.5863, "num_input_tokens_seen": 184814416, "step": 3594 }, { "epoch": 0.35350482789120663, "grad_norm": 32.37525177001953, "learning_rate": 8e-05, "loss": 44.6372, "num_input_tokens_seen": 184982744, "step": 3597 }, { "epoch": 0.3537996609419916, "grad_norm": 34.05870056152344, "learning_rate": 8e-05, "loss": 44.6898, "num_input_tokens_seen": 185135216, "step": 3600 }, { "epoch": 0.3540944939927766, "grad_norm": 38.33725357055664, "learning_rate": 8e-05, "loss": 42.983, "num_input_tokens_seen": 185304348, "step": 3603 }, { "epoch": 0.3543893270435616, "grad_norm": 204.94512939453125, "learning_rate": 8e-05, "loss": 45.561, "num_input_tokens_seen": 185459268, "step": 3606 }, { "epoch": 0.35468416009434656, "grad_norm": 31.346824645996094, "learning_rate": 8e-05, "loss": 45.4419, "num_input_tokens_seen": 185624232, "step": 3609 }, { "epoch": 0.3549789931451316, "grad_norm": 29.82939338684082, "learning_rate": 8e-05, "loss": 45.6828, "num_input_tokens_seen": 185787720, "step": 3612 }, { "epoch": 0.3552738261959166, "grad_norm": 31.14798355102539, "learning_rate": 8e-05, "loss": 44.7127, "num_input_tokens_seen": 185941444, "step": 3615 }, { "epoch": 0.35556865924670156, "grad_norm": 29.58880615234375, "learning_rate": 8e-05, "loss": 50.7512, "num_input_tokens_seen": 186088412, "step": 3618 }, { "epoch": 0.35586349229748654, "grad_norm": 107.43943786621094, "learning_rate": 8e-05, "loss": 40.4814, "num_input_tokens_seen": 186255008, "step": 3621 }, { "epoch": 0.3561583253482715, "grad_norm": 32.718505859375, "learning_rate": 8e-05, "loss": 46.1933, "num_input_tokens_seen": 186415012, "step": 3624 }, { "epoch": 0.35645315839905656, "grad_norm": 35.0687370300293, "learning_rate": 8e-05, "loss": 47.947, "num_input_tokens_seen": 186574040, "step": 3627 }, { "epoch": 0.35674799144984154, "grad_norm": 33.25041961669922, "learning_rate": 8e-05, "loss": 47.4406, "num_input_tokens_seen": 186759404, "step": 3630 }, { "epoch": 0.3570428245006265, "grad_norm": 34.515506744384766, "learning_rate": 8e-05, "loss": 45.0484, "num_input_tokens_seen": 186909100, "step": 3633 }, { "epoch": 0.3573376575514115, "grad_norm": 33.785972595214844, "learning_rate": 8e-05, "loss": 42.7564, "num_input_tokens_seen": 187064440, "step": 3636 }, { "epoch": 0.3576324906021965, "grad_norm": 29.22235870361328, "learning_rate": 8e-05, "loss": 45.6804, "num_input_tokens_seen": 187185516, "step": 3639 }, { "epoch": 0.3579273236529815, "grad_norm": 24.949066162109375, "learning_rate": 8e-05, "loss": 44.1007, "num_input_tokens_seen": 187337652, "step": 3642 }, { "epoch": 0.3582221567037665, "grad_norm": 26.641557693481445, "learning_rate": 8e-05, "loss": 47.4558, "num_input_tokens_seen": 187493736, "step": 3645 }, { "epoch": 0.3585169897545515, "grad_norm": 32.34521484375, "learning_rate": 8e-05, "loss": 44.5307, "num_input_tokens_seen": 187669108, "step": 3648 }, { "epoch": 0.35881182280533647, "grad_norm": 31.242477416992188, "learning_rate": 8e-05, "loss": 49.4255, "num_input_tokens_seen": 187822444, "step": 3651 }, { "epoch": 0.35910665585612145, "grad_norm": 28.323322296142578, "learning_rate": 8e-05, "loss": 47.169, "num_input_tokens_seen": 187977892, "step": 3654 }, { "epoch": 0.3594014889069065, "grad_norm": 32.317771911621094, "learning_rate": 8e-05, "loss": 42.7603, "num_input_tokens_seen": 188121380, "step": 3657 }, { "epoch": 0.35969632195769147, "grad_norm": 55.2293586730957, "learning_rate": 8e-05, "loss": 42.8748, "num_input_tokens_seen": 188246832, "step": 3660 }, { "epoch": 0.35999115500847645, "grad_norm": 32.014766693115234, "learning_rate": 8e-05, "loss": 41.6695, "num_input_tokens_seen": 188404404, "step": 3663 }, { "epoch": 0.36028598805926143, "grad_norm": 30.330198287963867, "learning_rate": 8e-05, "loss": 45.1016, "num_input_tokens_seen": 188555360, "step": 3666 }, { "epoch": 0.3605808211100464, "grad_norm": 29.632488250732422, "learning_rate": 8e-05, "loss": 49.119, "num_input_tokens_seen": 188718512, "step": 3669 }, { "epoch": 0.36087565416083145, "grad_norm": 29.9589786529541, "learning_rate": 8e-05, "loss": 50.1, "num_input_tokens_seen": 188897648, "step": 3672 }, { "epoch": 0.36117048721161643, "grad_norm": 28.243938446044922, "learning_rate": 8e-05, "loss": 46.6535, "num_input_tokens_seen": 189052008, "step": 3675 }, { "epoch": 0.3614653202624014, "grad_norm": 28.271282196044922, "learning_rate": 8e-05, "loss": 44.8027, "num_input_tokens_seen": 189184824, "step": 3678 }, { "epoch": 0.3617601533131864, "grad_norm": 31.67430305480957, "learning_rate": 8e-05, "loss": 42.1192, "num_input_tokens_seen": 189313464, "step": 3681 }, { "epoch": 0.3620549863639714, "grad_norm": 31.47785186767578, "learning_rate": 8e-05, "loss": 45.3955, "num_input_tokens_seen": 189454948, "step": 3684 }, { "epoch": 0.3623498194147564, "grad_norm": 31.909595489501953, "learning_rate": 8e-05, "loss": 44.9947, "num_input_tokens_seen": 189609212, "step": 3687 }, { "epoch": 0.3626446524655414, "grad_norm": 32.878177642822266, "learning_rate": 8e-05, "loss": 47.5328, "num_input_tokens_seen": 189758364, "step": 3690 }, { "epoch": 0.3629394855163264, "grad_norm": 30.43533706665039, "learning_rate": 8e-05, "loss": 45.8225, "num_input_tokens_seen": 189911720, "step": 3693 }, { "epoch": 0.36323431856711136, "grad_norm": 29.782085418701172, "learning_rate": 8e-05, "loss": 46.4523, "num_input_tokens_seen": 190058724, "step": 3696 }, { "epoch": 0.36352915161789634, "grad_norm": 36.34353256225586, "learning_rate": 8e-05, "loss": 45.7055, "num_input_tokens_seen": 190232372, "step": 3699 }, { "epoch": 0.3638239846686814, "grad_norm": 31.679168701171875, "learning_rate": 8e-05, "loss": 43.4331, "num_input_tokens_seen": 190397056, "step": 3702 }, { "epoch": 0.36411881771946636, "grad_norm": 30.544857025146484, "learning_rate": 8e-05, "loss": 48.4687, "num_input_tokens_seen": 190567812, "step": 3705 }, { "epoch": 0.36441365077025134, "grad_norm": 34.39679718017578, "learning_rate": 8e-05, "loss": 46.3253, "num_input_tokens_seen": 190727292, "step": 3708 }, { "epoch": 0.3647084838210363, "grad_norm": 28.82375144958496, "learning_rate": 8e-05, "loss": 44.8942, "num_input_tokens_seen": 190882620, "step": 3711 }, { "epoch": 0.3650033168718213, "grad_norm": 34.30107498168945, "learning_rate": 8e-05, "loss": 48.7777, "num_input_tokens_seen": 191060832, "step": 3714 }, { "epoch": 0.36529814992260634, "grad_norm": 26.91672134399414, "learning_rate": 8e-05, "loss": 44.2889, "num_input_tokens_seen": 191229352, "step": 3717 }, { "epoch": 0.3655929829733913, "grad_norm": 31.14057159423828, "learning_rate": 8e-05, "loss": 44.6834, "num_input_tokens_seen": 191379112, "step": 3720 }, { "epoch": 0.3658878160241763, "grad_norm": 29.06418800354004, "learning_rate": 8e-05, "loss": 43.0235, "num_input_tokens_seen": 191536348, "step": 3723 }, { "epoch": 0.3661826490749613, "grad_norm": 29.524255752563477, "learning_rate": 8e-05, "loss": 48.2543, "num_input_tokens_seen": 191688912, "step": 3726 }, { "epoch": 0.3664774821257463, "grad_norm": 30.087139129638672, "learning_rate": 8e-05, "loss": 47.2003, "num_input_tokens_seen": 191845312, "step": 3729 }, { "epoch": 0.3667723151765313, "grad_norm": 30.029184341430664, "learning_rate": 8e-05, "loss": 40.1694, "num_input_tokens_seen": 192019360, "step": 3732 }, { "epoch": 0.3670671482273163, "grad_norm": 27.986467361450195, "learning_rate": 8e-05, "loss": 47.4301, "num_input_tokens_seen": 192174176, "step": 3735 }, { "epoch": 0.36736198127810127, "grad_norm": 28.899826049804688, "learning_rate": 8e-05, "loss": 42.4361, "num_input_tokens_seen": 192312400, "step": 3738 }, { "epoch": 0.36765681432888625, "grad_norm": 33.11496353149414, "learning_rate": 8e-05, "loss": 46.0006, "num_input_tokens_seen": 192488888, "step": 3741 }, { "epoch": 0.3679516473796713, "grad_norm": 28.488046646118164, "learning_rate": 8e-05, "loss": 45.7107, "num_input_tokens_seen": 192646096, "step": 3744 }, { "epoch": 0.36824648043045627, "grad_norm": 42.096595764160156, "learning_rate": 8e-05, "loss": 45.9505, "num_input_tokens_seen": 192802116, "step": 3747 }, { "epoch": 0.36854131348124125, "grad_norm": 34.51819610595703, "learning_rate": 8e-05, "loss": 46.8947, "num_input_tokens_seen": 192960540, "step": 3750 }, { "epoch": 0.36883614653202623, "grad_norm": 30.888742446899414, "learning_rate": 8e-05, "loss": 46.85, "num_input_tokens_seen": 193127332, "step": 3753 }, { "epoch": 0.3691309795828112, "grad_norm": 29.665699005126953, "learning_rate": 8e-05, "loss": 42.1983, "num_input_tokens_seen": 193276052, "step": 3756 }, { "epoch": 0.36942581263359625, "grad_norm": 30.412384033203125, "learning_rate": 8e-05, "loss": 45.8014, "num_input_tokens_seen": 193437632, "step": 3759 }, { "epoch": 0.36972064568438123, "grad_norm": 29.54482078552246, "learning_rate": 8e-05, "loss": 42.5811, "num_input_tokens_seen": 193583420, "step": 3762 }, { "epoch": 0.3700154787351662, "grad_norm": 39.85192108154297, "learning_rate": 8e-05, "loss": 46.2251, "num_input_tokens_seen": 193717944, "step": 3765 }, { "epoch": 0.3703103117859512, "grad_norm": 34.36831283569336, "learning_rate": 8e-05, "loss": 44.9958, "num_input_tokens_seen": 193871112, "step": 3768 }, { "epoch": 0.3706051448367362, "grad_norm": 29.24009132385254, "learning_rate": 8e-05, "loss": 43.8158, "num_input_tokens_seen": 194048240, "step": 3771 }, { "epoch": 0.3708999778875212, "grad_norm": 31.27372169494629, "learning_rate": 8e-05, "loss": 45.5604, "num_input_tokens_seen": 194196752, "step": 3774 }, { "epoch": 0.3711948109383062, "grad_norm": 51.641380310058594, "learning_rate": 8e-05, "loss": 40.0764, "num_input_tokens_seen": 194380572, "step": 3777 }, { "epoch": 0.3714896439890912, "grad_norm": 31.002944946289062, "learning_rate": 8e-05, "loss": 51.0142, "num_input_tokens_seen": 194532176, "step": 3780 }, { "epoch": 0.37178447703987616, "grad_norm": 62.89687728881836, "learning_rate": 8e-05, "loss": 40.5023, "num_input_tokens_seen": 194676504, "step": 3783 }, { "epoch": 0.37207931009066114, "grad_norm": 30.240619659423828, "learning_rate": 8e-05, "loss": 42.1637, "num_input_tokens_seen": 194828120, "step": 3786 }, { "epoch": 0.3723741431414462, "grad_norm": 30.373289108276367, "learning_rate": 8e-05, "loss": 48.1604, "num_input_tokens_seen": 195013152, "step": 3789 }, { "epoch": 0.37266897619223116, "grad_norm": 27.48198127746582, "learning_rate": 8e-05, "loss": 46.033, "num_input_tokens_seen": 195186880, "step": 3792 }, { "epoch": 0.37296380924301614, "grad_norm": 30.48610496520996, "learning_rate": 8e-05, "loss": 44.5144, "num_input_tokens_seen": 195357388, "step": 3795 }, { "epoch": 0.3732586422938011, "grad_norm": 37.86562728881836, "learning_rate": 8e-05, "loss": 46.8633, "num_input_tokens_seen": 195523872, "step": 3798 }, { "epoch": 0.3735534753445861, "grad_norm": 25.786422729492188, "learning_rate": 8e-05, "loss": 42.4548, "num_input_tokens_seen": 195677988, "step": 3801 }, { "epoch": 0.37384830839537114, "grad_norm": 26.696611404418945, "learning_rate": 8e-05, "loss": 43.9501, "num_input_tokens_seen": 195837960, "step": 3804 }, { "epoch": 0.3741431414461561, "grad_norm": 28.53610610961914, "learning_rate": 8e-05, "loss": 47.663, "num_input_tokens_seen": 195989804, "step": 3807 }, { "epoch": 0.3744379744969411, "grad_norm": 31.87685203552246, "learning_rate": 8e-05, "loss": 47.7918, "num_input_tokens_seen": 196165920, "step": 3810 }, { "epoch": 0.3747328075477261, "grad_norm": 28.79266357421875, "learning_rate": 8e-05, "loss": 40.603, "num_input_tokens_seen": 196325116, "step": 3813 }, { "epoch": 0.37502764059851107, "grad_norm": 27.765893936157227, "learning_rate": 8e-05, "loss": 44.3665, "num_input_tokens_seen": 196467160, "step": 3816 }, { "epoch": 0.3753224736492961, "grad_norm": 31.322372436523438, "learning_rate": 8e-05, "loss": 40.6007, "num_input_tokens_seen": 196608832, "step": 3819 }, { "epoch": 0.3756173067000811, "grad_norm": 26.41129493713379, "learning_rate": 8e-05, "loss": 39.7225, "num_input_tokens_seen": 196769656, "step": 3822 }, { "epoch": 0.37591213975086607, "grad_norm": 32.98222351074219, "learning_rate": 8e-05, "loss": 46.4776, "num_input_tokens_seen": 196926960, "step": 3825 }, { "epoch": 0.37620697280165105, "grad_norm": 31.1025333404541, "learning_rate": 8e-05, "loss": 48.5201, "num_input_tokens_seen": 197081808, "step": 3828 }, { "epoch": 0.37650180585243603, "grad_norm": 35.10872268676758, "learning_rate": 8e-05, "loss": 47.4361, "num_input_tokens_seen": 197228816, "step": 3831 }, { "epoch": 0.37679663890322107, "grad_norm": 63.819210052490234, "learning_rate": 8e-05, "loss": 44.656, "num_input_tokens_seen": 197366464, "step": 3834 }, { "epoch": 0.37709147195400605, "grad_norm": 88.14225006103516, "learning_rate": 8e-05, "loss": 39.3489, "num_input_tokens_seen": 197515700, "step": 3837 }, { "epoch": 0.37738630500479103, "grad_norm": 30.590761184692383, "learning_rate": 8e-05, "loss": 48.5193, "num_input_tokens_seen": 197661556, "step": 3840 }, { "epoch": 0.377681138055576, "grad_norm": 27.369964599609375, "learning_rate": 8e-05, "loss": 46.7822, "num_input_tokens_seen": 197807528, "step": 3843 }, { "epoch": 0.377975971106361, "grad_norm": 27.713287353515625, "learning_rate": 8e-05, "loss": 44.0746, "num_input_tokens_seen": 197986864, "step": 3846 }, { "epoch": 0.37827080415714603, "grad_norm": 27.770008087158203, "learning_rate": 8e-05, "loss": 42.8452, "num_input_tokens_seen": 198150944, "step": 3849 }, { "epoch": 0.378565637207931, "grad_norm": 31.2003173828125, "learning_rate": 8e-05, "loss": 41.3652, "num_input_tokens_seen": 198302148, "step": 3852 }, { "epoch": 0.378860470258716, "grad_norm": 27.300168991088867, "learning_rate": 8e-05, "loss": 40.8527, "num_input_tokens_seen": 198452080, "step": 3855 }, { "epoch": 0.379155303309501, "grad_norm": 27.111713409423828, "learning_rate": 8e-05, "loss": 45.4798, "num_input_tokens_seen": 198600556, "step": 3858 }, { "epoch": 0.37945013636028596, "grad_norm": 34.327232360839844, "learning_rate": 8e-05, "loss": 40.3459, "num_input_tokens_seen": 198756508, "step": 3861 }, { "epoch": 0.379744969411071, "grad_norm": 35.697811126708984, "learning_rate": 8e-05, "loss": 45.5265, "num_input_tokens_seen": 198906604, "step": 3864 }, { "epoch": 0.380039802461856, "grad_norm": 24.162860870361328, "learning_rate": 8e-05, "loss": 42.994, "num_input_tokens_seen": 199062408, "step": 3867 }, { "epoch": 0.38033463551264096, "grad_norm": 29.72947883605957, "learning_rate": 8e-05, "loss": 46.2078, "num_input_tokens_seen": 199224256, "step": 3870 }, { "epoch": 0.38062946856342594, "grad_norm": 29.636106491088867, "learning_rate": 8e-05, "loss": 41.394, "num_input_tokens_seen": 199377520, "step": 3873 }, { "epoch": 0.380924301614211, "grad_norm": 33.16059112548828, "learning_rate": 8e-05, "loss": 45.7516, "num_input_tokens_seen": 199522628, "step": 3876 }, { "epoch": 0.38121913466499596, "grad_norm": 29.432018280029297, "learning_rate": 8e-05, "loss": 46.2355, "num_input_tokens_seen": 199680984, "step": 3879 }, { "epoch": 0.38151396771578094, "grad_norm": 26.524246215820312, "learning_rate": 8e-05, "loss": 43.8269, "num_input_tokens_seen": 199827376, "step": 3882 }, { "epoch": 0.3818088007665659, "grad_norm": 27.051618576049805, "learning_rate": 8e-05, "loss": 41.0068, "num_input_tokens_seen": 199965272, "step": 3885 }, { "epoch": 0.3821036338173509, "grad_norm": 25.889406204223633, "learning_rate": 8e-05, "loss": 41.8811, "num_input_tokens_seen": 200108116, "step": 3888 }, { "epoch": 0.38239846686813594, "grad_norm": 27.41223907470703, "learning_rate": 8e-05, "loss": 44.4539, "num_input_tokens_seen": 200259528, "step": 3891 }, { "epoch": 0.3826932999189209, "grad_norm": 29.76633071899414, "learning_rate": 8e-05, "loss": 45.3152, "num_input_tokens_seen": 200425348, "step": 3894 }, { "epoch": 0.3829881329697059, "grad_norm": 30.8220272064209, "learning_rate": 8e-05, "loss": 44.7649, "num_input_tokens_seen": 200581448, "step": 3897 }, { "epoch": 0.3832829660204909, "grad_norm": 34.690086364746094, "learning_rate": 8e-05, "loss": 42.6207, "num_input_tokens_seen": 200735580, "step": 3900 }, { "epoch": 0.38357779907127587, "grad_norm": 31.798429489135742, "learning_rate": 8e-05, "loss": 42.9762, "num_input_tokens_seen": 200871456, "step": 3903 }, { "epoch": 0.3838726321220609, "grad_norm": 31.97600746154785, "learning_rate": 8e-05, "loss": 45.5173, "num_input_tokens_seen": 201020476, "step": 3906 }, { "epoch": 0.3841674651728459, "grad_norm": 28.01764488220215, "learning_rate": 8e-05, "loss": 46.0845, "num_input_tokens_seen": 201189760, "step": 3909 }, { "epoch": 0.38446229822363087, "grad_norm": 24.853553771972656, "learning_rate": 8e-05, "loss": 45.8736, "num_input_tokens_seen": 201347932, "step": 3912 }, { "epoch": 0.38475713127441585, "grad_norm": 33.82841873168945, "learning_rate": 8e-05, "loss": 43.8107, "num_input_tokens_seen": 201498192, "step": 3915 }, { "epoch": 0.38505196432520084, "grad_norm": 30.44049644470215, "learning_rate": 8e-05, "loss": 47.1945, "num_input_tokens_seen": 201641892, "step": 3918 }, { "epoch": 0.38534679737598587, "grad_norm": 33.982540130615234, "learning_rate": 8e-05, "loss": 42.0145, "num_input_tokens_seen": 201791476, "step": 3921 }, { "epoch": 0.38564163042677085, "grad_norm": 34.017024993896484, "learning_rate": 8e-05, "loss": 45.6285, "num_input_tokens_seen": 201937304, "step": 3924 }, { "epoch": 0.38593646347755584, "grad_norm": 188.57647705078125, "learning_rate": 8e-05, "loss": 42.9968, "num_input_tokens_seen": 202069036, "step": 3927 }, { "epoch": 0.3862312965283408, "grad_norm": 29.63610076904297, "learning_rate": 8e-05, "loss": 46.5057, "num_input_tokens_seen": 202218752, "step": 3930 }, { "epoch": 0.3865261295791258, "grad_norm": 29.076887130737305, "learning_rate": 8e-05, "loss": 43.2685, "num_input_tokens_seen": 202393000, "step": 3933 }, { "epoch": 0.38682096262991084, "grad_norm": 28.38823127746582, "learning_rate": 8e-05, "loss": 42.1467, "num_input_tokens_seen": 202534764, "step": 3936 }, { "epoch": 0.3871157956806958, "grad_norm": 27.43347930908203, "learning_rate": 8e-05, "loss": 44.0863, "num_input_tokens_seen": 202698572, "step": 3939 }, { "epoch": 0.3874106287314808, "grad_norm": 30.09478187561035, "learning_rate": 8e-05, "loss": 46.4954, "num_input_tokens_seen": 202857640, "step": 3942 }, { "epoch": 0.3877054617822658, "grad_norm": 32.25828170776367, "learning_rate": 8e-05, "loss": 42.9919, "num_input_tokens_seen": 203014816, "step": 3945 }, { "epoch": 0.38800029483305076, "grad_norm": 30.99892234802246, "learning_rate": 8e-05, "loss": 49.7405, "num_input_tokens_seen": 203188984, "step": 3948 }, { "epoch": 0.3882951278838358, "grad_norm": 32.622127532958984, "learning_rate": 8e-05, "loss": 44.2072, "num_input_tokens_seen": 203340960, "step": 3951 }, { "epoch": 0.3885899609346208, "grad_norm": 27.56840705871582, "learning_rate": 8e-05, "loss": 42.6712, "num_input_tokens_seen": 203509848, "step": 3954 }, { "epoch": 0.38888479398540576, "grad_norm": 34.69350814819336, "learning_rate": 8e-05, "loss": 44.4039, "num_input_tokens_seen": 203668996, "step": 3957 }, { "epoch": 0.38917962703619075, "grad_norm": 31.97882652282715, "learning_rate": 8e-05, "loss": 44.0893, "num_input_tokens_seen": 203823772, "step": 3960 }, { "epoch": 0.3894744600869757, "grad_norm": 29.57229995727539, "learning_rate": 8e-05, "loss": 42.1541, "num_input_tokens_seen": 203977648, "step": 3963 }, { "epoch": 0.38976929313776076, "grad_norm": 29.408475875854492, "learning_rate": 8e-05, "loss": 41.2887, "num_input_tokens_seen": 204153060, "step": 3966 }, { "epoch": 0.39006412618854575, "grad_norm": 25.34873390197754, "learning_rate": 8e-05, "loss": 44.2761, "num_input_tokens_seen": 204311144, "step": 3969 }, { "epoch": 0.3903589592393307, "grad_norm": 31.603593826293945, "learning_rate": 8e-05, "loss": 44.0217, "num_input_tokens_seen": 204461156, "step": 3972 }, { "epoch": 0.3906537922901157, "grad_norm": 29.555217742919922, "learning_rate": 8e-05, "loss": 43.9669, "num_input_tokens_seen": 204626196, "step": 3975 }, { "epoch": 0.3909486253409007, "grad_norm": 33.9174690246582, "learning_rate": 8e-05, "loss": 42.4694, "num_input_tokens_seen": 204789336, "step": 3978 }, { "epoch": 0.39124345839168573, "grad_norm": 30.75056266784668, "learning_rate": 8e-05, "loss": 44.382, "num_input_tokens_seen": 204947000, "step": 3981 }, { "epoch": 0.3915382914424707, "grad_norm": 26.133014678955078, "learning_rate": 8e-05, "loss": 40.7875, "num_input_tokens_seen": 205113876, "step": 3984 }, { "epoch": 0.3918331244932557, "grad_norm": 30.40180015563965, "learning_rate": 8e-05, "loss": 44.6807, "num_input_tokens_seen": 205288396, "step": 3987 }, { "epoch": 0.3921279575440407, "grad_norm": 25.849224090576172, "learning_rate": 8e-05, "loss": 44.2802, "num_input_tokens_seen": 205448184, "step": 3990 }, { "epoch": 0.39242279059482565, "grad_norm": 29.1865291595459, "learning_rate": 8e-05, "loss": 40.1745, "num_input_tokens_seen": 205600468, "step": 3993 }, { "epoch": 0.3927176236456107, "grad_norm": 28.203125, "learning_rate": 8e-05, "loss": 43.8436, "num_input_tokens_seen": 205759340, "step": 3996 }, { "epoch": 0.3930124566963957, "grad_norm": 28.577714920043945, "learning_rate": 8e-05, "loss": 41.8728, "num_input_tokens_seen": 205922228, "step": 3999 }, { "epoch": 0.39311073437999067, "eval_gen_len": 30.17, "eval_loss": 2.8102006912231445, "eval_rouge1": 43.9268, "eval_rouge2": 26.793, "eval_rougeL": 40.1378, "eval_rougeLsum": 40.7026, "eval_runtime": 87.0957, "eval_samples_per_second": 2.296, "eval_steps_per_second": 0.574, "num_input_tokens_seen": 205979564, "step": 4000 }, { "epoch": 0.39330728974718066, "grad_norm": 34.915348052978516, "learning_rate": 8e-05, "loss": 46.0877, "num_input_tokens_seen": 206074996, "step": 4002 }, { "epoch": 0.39360212279796564, "grad_norm": 28.753887176513672, "learning_rate": 8e-05, "loss": 43.9296, "num_input_tokens_seen": 206246808, "step": 4005 }, { "epoch": 0.3938969558487506, "grad_norm": 82.86294555664062, "learning_rate": 8e-05, "loss": 44.0698, "num_input_tokens_seen": 206390204, "step": 4008 }, { "epoch": 0.39419178889953566, "grad_norm": 31.36377716064453, "learning_rate": 8e-05, "loss": 43.5785, "num_input_tokens_seen": 206545372, "step": 4011 }, { "epoch": 0.39448662195032064, "grad_norm": 29.626392364501953, "learning_rate": 8e-05, "loss": 41.7103, "num_input_tokens_seen": 206689100, "step": 4014 }, { "epoch": 0.3947814550011056, "grad_norm": 28.289608001708984, "learning_rate": 8e-05, "loss": 43.7036, "num_input_tokens_seen": 206850696, "step": 4017 }, { "epoch": 0.3950762880518906, "grad_norm": 29.048423767089844, "learning_rate": 8e-05, "loss": 45.4158, "num_input_tokens_seen": 207013176, "step": 4020 }, { "epoch": 0.3953711211026756, "grad_norm": 32.74452209472656, "learning_rate": 8e-05, "loss": 44.8341, "num_input_tokens_seen": 207154572, "step": 4023 }, { "epoch": 0.3956659541534606, "grad_norm": 30.65328025817871, "learning_rate": 8e-05, "loss": 41.3659, "num_input_tokens_seen": 207302600, "step": 4026 }, { "epoch": 0.3959607872042456, "grad_norm": 29.986570358276367, "learning_rate": 8e-05, "loss": 46.9741, "num_input_tokens_seen": 207460036, "step": 4029 }, { "epoch": 0.3962556202550306, "grad_norm": 30.823116302490234, "learning_rate": 8e-05, "loss": 46.9343, "num_input_tokens_seen": 207606992, "step": 4032 }, { "epoch": 0.39655045330581556, "grad_norm": 43.00481033325195, "learning_rate": 8e-05, "loss": 42.793, "num_input_tokens_seen": 207755144, "step": 4035 }, { "epoch": 0.3968452863566006, "grad_norm": 35.6334342956543, "learning_rate": 8e-05, "loss": 40.773, "num_input_tokens_seen": 207923108, "step": 4038 }, { "epoch": 0.3971401194073856, "grad_norm": 33.873558044433594, "learning_rate": 8e-05, "loss": 44.2082, "num_input_tokens_seen": 208070340, "step": 4041 }, { "epoch": 0.39743495245817057, "grad_norm": 48.23017501831055, "learning_rate": 8e-05, "loss": 42.1629, "num_input_tokens_seen": 208231296, "step": 4044 }, { "epoch": 0.39772978550895555, "grad_norm": 32.64248275756836, "learning_rate": 8e-05, "loss": 42.2485, "num_input_tokens_seen": 208386000, "step": 4047 }, { "epoch": 0.39802461855974053, "grad_norm": 52.33050537109375, "learning_rate": 8e-05, "loss": 42.8339, "num_input_tokens_seen": 208543976, "step": 4050 }, { "epoch": 0.39831945161052557, "grad_norm": 27.67913055419922, "learning_rate": 8e-05, "loss": 46.7309, "num_input_tokens_seen": 208710496, "step": 4053 }, { "epoch": 0.39861428466131055, "grad_norm": 72.23770904541016, "learning_rate": 8e-05, "loss": 45.5283, "num_input_tokens_seen": 208855908, "step": 4056 }, { "epoch": 0.39890911771209553, "grad_norm": 63.03166198730469, "learning_rate": 8e-05, "loss": 45.1794, "num_input_tokens_seen": 209009292, "step": 4059 }, { "epoch": 0.3992039507628805, "grad_norm": 34.75033950805664, "learning_rate": 8e-05, "loss": 46.1443, "num_input_tokens_seen": 209156760, "step": 4062 }, { "epoch": 0.3994987838136655, "grad_norm": 31.29888916015625, "learning_rate": 8e-05, "loss": 42.9468, "num_input_tokens_seen": 209310320, "step": 4065 }, { "epoch": 0.39979361686445053, "grad_norm": 30.495737075805664, "learning_rate": 8e-05, "loss": 44.2227, "num_input_tokens_seen": 209461532, "step": 4068 }, { "epoch": 0.4000884499152355, "grad_norm": 29.618284225463867, "learning_rate": 8e-05, "loss": 40.4326, "num_input_tokens_seen": 209602624, "step": 4071 }, { "epoch": 0.4003832829660205, "grad_norm": 33.6846923828125, "learning_rate": 8e-05, "loss": 43.9579, "num_input_tokens_seen": 209776420, "step": 4074 }, { "epoch": 0.4006781160168055, "grad_norm": 26.407421112060547, "learning_rate": 8e-05, "loss": 43.0974, "num_input_tokens_seen": 209937612, "step": 4077 }, { "epoch": 0.40097294906759046, "grad_norm": 27.561464309692383, "learning_rate": 8e-05, "loss": 40.6611, "num_input_tokens_seen": 210068780, "step": 4080 }, { "epoch": 0.4012677821183755, "grad_norm": 27.589263916015625, "learning_rate": 8e-05, "loss": 42.478, "num_input_tokens_seen": 210205900, "step": 4083 }, { "epoch": 0.4015626151691605, "grad_norm": 33.44409942626953, "learning_rate": 8e-05, "loss": 41.9828, "num_input_tokens_seen": 210337076, "step": 4086 }, { "epoch": 0.40185744821994546, "grad_norm": 28.39436149597168, "learning_rate": 8e-05, "loss": 47.7314, "num_input_tokens_seen": 210488940, "step": 4089 }, { "epoch": 0.40215228127073044, "grad_norm": 30.144987106323242, "learning_rate": 8e-05, "loss": 43.2947, "num_input_tokens_seen": 210640804, "step": 4092 }, { "epoch": 0.4024471143215154, "grad_norm": 31.10938262939453, "learning_rate": 8e-05, "loss": 43.9627, "num_input_tokens_seen": 210806580, "step": 4095 }, { "epoch": 0.40274194737230046, "grad_norm": 29.420190811157227, "learning_rate": 8e-05, "loss": 40.8033, "num_input_tokens_seen": 210966164, "step": 4098 }, { "epoch": 0.40303678042308544, "grad_norm": 32.46398162841797, "learning_rate": 8e-05, "loss": 45.5083, "num_input_tokens_seen": 211131532, "step": 4101 }, { "epoch": 0.4033316134738704, "grad_norm": 25.459470748901367, "learning_rate": 8e-05, "loss": 44.1949, "num_input_tokens_seen": 211277416, "step": 4104 }, { "epoch": 0.4036264465246554, "grad_norm": 26.684600830078125, "learning_rate": 8e-05, "loss": 41.2496, "num_input_tokens_seen": 211434740, "step": 4107 }, { "epoch": 0.4039212795754404, "grad_norm": 30.317163467407227, "learning_rate": 8e-05, "loss": 46.2663, "num_input_tokens_seen": 211590464, "step": 4110 }, { "epoch": 0.4042161126262254, "grad_norm": 28.028276443481445, "learning_rate": 8e-05, "loss": 43.1111, "num_input_tokens_seen": 211754212, "step": 4113 }, { "epoch": 0.4045109456770104, "grad_norm": 25.986167907714844, "learning_rate": 8e-05, "loss": 42.6019, "num_input_tokens_seen": 211931260, "step": 4116 }, { "epoch": 0.4048057787277954, "grad_norm": 28.613994598388672, "learning_rate": 8e-05, "loss": 43.5144, "num_input_tokens_seen": 212103276, "step": 4119 }, { "epoch": 0.40510061177858037, "grad_norm": 31.608320236206055, "learning_rate": 8e-05, "loss": 43.4318, "num_input_tokens_seen": 212264392, "step": 4122 }, { "epoch": 0.40539544482936535, "grad_norm": 29.455188751220703, "learning_rate": 8e-05, "loss": 45.3574, "num_input_tokens_seen": 212403240, "step": 4125 }, { "epoch": 0.4056902778801504, "grad_norm": 27.21038818359375, "learning_rate": 8e-05, "loss": 41.9461, "num_input_tokens_seen": 212547536, "step": 4128 }, { "epoch": 0.40598511093093537, "grad_norm": 26.4250545501709, "learning_rate": 8e-05, "loss": 43.9856, "num_input_tokens_seen": 212699424, "step": 4131 }, { "epoch": 0.40627994398172035, "grad_norm": 27.913158416748047, "learning_rate": 8e-05, "loss": 43.1773, "num_input_tokens_seen": 212856380, "step": 4134 }, { "epoch": 0.40657477703250533, "grad_norm": 26.234939575195312, "learning_rate": 8e-05, "loss": 42.7789, "num_input_tokens_seen": 213029056, "step": 4137 }, { "epoch": 0.4068696100832903, "grad_norm": 31.88821029663086, "learning_rate": 8e-05, "loss": 44.5928, "num_input_tokens_seen": 213185192, "step": 4140 }, { "epoch": 0.40716444313407535, "grad_norm": 30.04606056213379, "learning_rate": 8e-05, "loss": 45.7953, "num_input_tokens_seen": 213323056, "step": 4143 }, { "epoch": 0.40745927618486033, "grad_norm": 30.563106536865234, "learning_rate": 8e-05, "loss": 47.0879, "num_input_tokens_seen": 213479292, "step": 4146 }, { "epoch": 0.4077541092356453, "grad_norm": 29.05677604675293, "learning_rate": 8e-05, "loss": 43.9433, "num_input_tokens_seen": 213626828, "step": 4149 }, { "epoch": 0.4080489422864303, "grad_norm": 28.374475479125977, "learning_rate": 8e-05, "loss": 46.5581, "num_input_tokens_seen": 213772128, "step": 4152 }, { "epoch": 0.4083437753372153, "grad_norm": 27.08576774597168, "learning_rate": 8e-05, "loss": 46.3264, "num_input_tokens_seen": 213917224, "step": 4155 }, { "epoch": 0.4086386083880003, "grad_norm": 27.269411087036133, "learning_rate": 8e-05, "loss": 42.0309, "num_input_tokens_seen": 214065624, "step": 4158 }, { "epoch": 0.4089334414387853, "grad_norm": 29.288543701171875, "learning_rate": 8e-05, "loss": 42.4004, "num_input_tokens_seen": 214230256, "step": 4161 }, { "epoch": 0.4092282744895703, "grad_norm": 25.37685775756836, "learning_rate": 8e-05, "loss": 41.6511, "num_input_tokens_seen": 214381884, "step": 4164 }, { "epoch": 0.40952310754035526, "grad_norm": 25.916948318481445, "learning_rate": 8e-05, "loss": 39.7151, "num_input_tokens_seen": 214540220, "step": 4167 }, { "epoch": 0.40981794059114024, "grad_norm": 32.05540466308594, "learning_rate": 8e-05, "loss": 42.6181, "num_input_tokens_seen": 214689304, "step": 4170 }, { "epoch": 0.4101127736419253, "grad_norm": 27.41069984436035, "learning_rate": 8e-05, "loss": 42.0901, "num_input_tokens_seen": 214841736, "step": 4173 }, { "epoch": 0.41040760669271026, "grad_norm": 23.779354095458984, "learning_rate": 8e-05, "loss": 40.2321, "num_input_tokens_seen": 215011564, "step": 4176 }, { "epoch": 0.41070243974349524, "grad_norm": 26.35748291015625, "learning_rate": 8e-05, "loss": 40.0868, "num_input_tokens_seen": 215159752, "step": 4179 }, { "epoch": 0.4109972727942802, "grad_norm": 25.243736267089844, "learning_rate": 8e-05, "loss": 42.1468, "num_input_tokens_seen": 215309800, "step": 4182 }, { "epoch": 0.41129210584506526, "grad_norm": 40.49623107910156, "learning_rate": 8e-05, "loss": 42.6442, "num_input_tokens_seen": 215459680, "step": 4185 }, { "epoch": 0.41158693889585024, "grad_norm": 27.0998477935791, "learning_rate": 8e-05, "loss": 44.6483, "num_input_tokens_seen": 215603644, "step": 4188 }, { "epoch": 0.4118817719466352, "grad_norm": 32.15782928466797, "learning_rate": 8e-05, "loss": 44.5412, "num_input_tokens_seen": 215768376, "step": 4191 }, { "epoch": 0.4121766049974202, "grad_norm": 30.283430099487305, "learning_rate": 8e-05, "loss": 43.9261, "num_input_tokens_seen": 215907312, "step": 4194 }, { "epoch": 0.4124714380482052, "grad_norm": 27.773475646972656, "learning_rate": 8e-05, "loss": 40.9794, "num_input_tokens_seen": 216057128, "step": 4197 }, { "epoch": 0.4127662710989902, "grad_norm": 72.8736801147461, "learning_rate": 8e-05, "loss": 44.0748, "num_input_tokens_seen": 216211216, "step": 4200 }, { "epoch": 0.4130611041497752, "grad_norm": 32.80086898803711, "learning_rate": 8e-05, "loss": 42.9728, "num_input_tokens_seen": 216349276, "step": 4203 }, { "epoch": 0.4133559372005602, "grad_norm": 30.961280822753906, "learning_rate": 8e-05, "loss": 46.0797, "num_input_tokens_seen": 216517136, "step": 4206 }, { "epoch": 0.41365077025134517, "grad_norm": 32.26238250732422, "learning_rate": 8e-05, "loss": 38.8171, "num_input_tokens_seen": 216665920, "step": 4209 }, { "epoch": 0.41394560330213015, "grad_norm": 28.24716567993164, "learning_rate": 8e-05, "loss": 44.8195, "num_input_tokens_seen": 216825804, "step": 4212 }, { "epoch": 0.4142404363529152, "grad_norm": 28.280357360839844, "learning_rate": 8e-05, "loss": 42.5102, "num_input_tokens_seen": 216990172, "step": 4215 }, { "epoch": 0.41453526940370017, "grad_norm": 26.093664169311523, "learning_rate": 8e-05, "loss": 46.2611, "num_input_tokens_seen": 217147152, "step": 4218 }, { "epoch": 0.41483010245448515, "grad_norm": 28.501445770263672, "learning_rate": 8e-05, "loss": 40.6934, "num_input_tokens_seen": 217337740, "step": 4221 }, { "epoch": 0.41512493550527013, "grad_norm": 25.08894157409668, "learning_rate": 8e-05, "loss": 44.784, "num_input_tokens_seen": 217471444, "step": 4224 }, { "epoch": 0.4154197685560551, "grad_norm": 30.653091430664062, "learning_rate": 8e-05, "loss": 43.6489, "num_input_tokens_seen": 217620536, "step": 4227 }, { "epoch": 0.41571460160684015, "grad_norm": 30.691848754882812, "learning_rate": 8e-05, "loss": 41.9049, "num_input_tokens_seen": 217765936, "step": 4230 }, { "epoch": 0.41600943465762513, "grad_norm": 30.53278160095215, "learning_rate": 8e-05, "loss": 39.0308, "num_input_tokens_seen": 217949424, "step": 4233 }, { "epoch": 0.4163042677084101, "grad_norm": 28.368154525756836, "learning_rate": 8e-05, "loss": 42.2364, "num_input_tokens_seen": 218101452, "step": 4236 }, { "epoch": 0.4165991007591951, "grad_norm": 30.020780563354492, "learning_rate": 8e-05, "loss": 45.0031, "num_input_tokens_seen": 218280512, "step": 4239 }, { "epoch": 0.4168939338099801, "grad_norm": 24.755632400512695, "learning_rate": 8e-05, "loss": 39.9639, "num_input_tokens_seen": 218440480, "step": 4242 }, { "epoch": 0.4171887668607651, "grad_norm": 28.246004104614258, "learning_rate": 8e-05, "loss": 48.4517, "num_input_tokens_seen": 218584376, "step": 4245 }, { "epoch": 0.4174835999115501, "grad_norm": 29.797582626342773, "learning_rate": 8e-05, "loss": 43.5954, "num_input_tokens_seen": 218739188, "step": 4248 }, { "epoch": 0.4177784329623351, "grad_norm": 30.69160270690918, "learning_rate": 8e-05, "loss": 42.1095, "num_input_tokens_seen": 218895136, "step": 4251 }, { "epoch": 0.41807326601312006, "grad_norm": 33.107330322265625, "learning_rate": 8e-05, "loss": 44.5923, "num_input_tokens_seen": 219049092, "step": 4254 }, { "epoch": 0.41836809906390504, "grad_norm": 35.53581237792969, "learning_rate": 8e-05, "loss": 39.4062, "num_input_tokens_seen": 219199092, "step": 4257 }, { "epoch": 0.4186629321146901, "grad_norm": 27.54696273803711, "learning_rate": 8e-05, "loss": 42.3835, "num_input_tokens_seen": 219367748, "step": 4260 }, { "epoch": 0.41895776516547506, "grad_norm": 27.74174690246582, "learning_rate": 8e-05, "loss": 43.5722, "num_input_tokens_seen": 219543440, "step": 4263 }, { "epoch": 0.41925259821626004, "grad_norm": 28.04417610168457, "learning_rate": 8e-05, "loss": 46.6899, "num_input_tokens_seen": 219691600, "step": 4266 }, { "epoch": 0.419547431267045, "grad_norm": 27.233768463134766, "learning_rate": 8e-05, "loss": 44.2905, "num_input_tokens_seen": 219840124, "step": 4269 }, { "epoch": 0.41984226431783, "grad_norm": 31.485761642456055, "learning_rate": 8e-05, "loss": 43.1472, "num_input_tokens_seen": 219978572, "step": 4272 }, { "epoch": 0.42013709736861504, "grad_norm": 27.343427658081055, "learning_rate": 8e-05, "loss": 40.3824, "num_input_tokens_seen": 220118616, "step": 4275 }, { "epoch": 0.4204319304194, "grad_norm": 28.364166259765625, "learning_rate": 8e-05, "loss": 41.5778, "num_input_tokens_seen": 220275124, "step": 4278 }, { "epoch": 0.420726763470185, "grad_norm": 31.391408920288086, "learning_rate": 8e-05, "loss": 45.4767, "num_input_tokens_seen": 220432824, "step": 4281 }, { "epoch": 0.42102159652097, "grad_norm": 27.511157989501953, "learning_rate": 8e-05, "loss": 44.5242, "num_input_tokens_seen": 220564500, "step": 4284 }, { "epoch": 0.42131642957175497, "grad_norm": 27.571096420288086, "learning_rate": 8e-05, "loss": 43.9533, "num_input_tokens_seen": 220720920, "step": 4287 }, { "epoch": 0.42161126262254, "grad_norm": 31.85966682434082, "learning_rate": 8e-05, "loss": 40.4081, "num_input_tokens_seen": 220865572, "step": 4290 }, { "epoch": 0.421906095673325, "grad_norm": 30.02121353149414, "learning_rate": 8e-05, "loss": 45.8891, "num_input_tokens_seen": 221000948, "step": 4293 }, { "epoch": 0.42220092872410997, "grad_norm": 34.88405227661133, "learning_rate": 8e-05, "loss": 43.8774, "num_input_tokens_seen": 221152948, "step": 4296 }, { "epoch": 0.42249576177489495, "grad_norm": 29.184438705444336, "learning_rate": 8e-05, "loss": 45.0154, "num_input_tokens_seen": 221312820, "step": 4299 }, { "epoch": 0.42279059482567993, "grad_norm": 28.55838394165039, "learning_rate": 8e-05, "loss": 41.2003, "num_input_tokens_seen": 221458184, "step": 4302 }, { "epoch": 0.42308542787646497, "grad_norm": 27.928804397583008, "learning_rate": 8e-05, "loss": 42.1826, "num_input_tokens_seen": 221616952, "step": 4305 }, { "epoch": 0.42338026092724995, "grad_norm": 26.76680564880371, "learning_rate": 8e-05, "loss": 40.2512, "num_input_tokens_seen": 221761448, "step": 4308 }, { "epoch": 0.42367509397803493, "grad_norm": 28.121938705444336, "learning_rate": 8e-05, "loss": 44.0944, "num_input_tokens_seen": 221938332, "step": 4311 }, { "epoch": 0.4239699270288199, "grad_norm": 31.460044860839844, "learning_rate": 8e-05, "loss": 41.3197, "num_input_tokens_seen": 222095052, "step": 4314 }, { "epoch": 0.4242647600796049, "grad_norm": 28.240819931030273, "learning_rate": 8e-05, "loss": 43.4479, "num_input_tokens_seen": 222245720, "step": 4317 }, { "epoch": 0.42455959313038993, "grad_norm": 29.07929229736328, "learning_rate": 8e-05, "loss": 47.6133, "num_input_tokens_seen": 222377240, "step": 4320 }, { "epoch": 0.4248544261811749, "grad_norm": 29.97142791748047, "learning_rate": 8e-05, "loss": 43.3428, "num_input_tokens_seen": 222550048, "step": 4323 }, { "epoch": 0.4251492592319599, "grad_norm": 34.53770446777344, "learning_rate": 8e-05, "loss": 38.3135, "num_input_tokens_seen": 222694712, "step": 4326 }, { "epoch": 0.4254440922827449, "grad_norm": 27.796838760375977, "learning_rate": 8e-05, "loss": 41.4311, "num_input_tokens_seen": 222846660, "step": 4329 }, { "epoch": 0.4257389253335299, "grad_norm": 30.644004821777344, "learning_rate": 8e-05, "loss": 46.5237, "num_input_tokens_seen": 222995488, "step": 4332 }, { "epoch": 0.4260337583843149, "grad_norm": 25.572084426879883, "learning_rate": 8e-05, "loss": 41.2685, "num_input_tokens_seen": 223141724, "step": 4335 }, { "epoch": 0.4263285914350999, "grad_norm": 26.065940856933594, "learning_rate": 8e-05, "loss": 42.0256, "num_input_tokens_seen": 223288620, "step": 4338 }, { "epoch": 0.42662342448588486, "grad_norm": 35.70410919189453, "learning_rate": 8e-05, "loss": 45.6876, "num_input_tokens_seen": 223451668, "step": 4341 }, { "epoch": 0.42691825753666984, "grad_norm": 21.78587532043457, "learning_rate": 8e-05, "loss": 39.8147, "num_input_tokens_seen": 223600916, "step": 4344 }, { "epoch": 0.4272130905874549, "grad_norm": 26.8822078704834, "learning_rate": 8e-05, "loss": 44.0621, "num_input_tokens_seen": 223762988, "step": 4347 }, { "epoch": 0.42750792363823986, "grad_norm": 26.29570960998535, "learning_rate": 8e-05, "loss": 42.4211, "num_input_tokens_seen": 223927452, "step": 4350 }, { "epoch": 0.42780275668902484, "grad_norm": 27.68350601196289, "learning_rate": 8e-05, "loss": 42.0873, "num_input_tokens_seen": 224105740, "step": 4353 }, { "epoch": 0.4280975897398098, "grad_norm": 25.874874114990234, "learning_rate": 8e-05, "loss": 37.8914, "num_input_tokens_seen": 224268092, "step": 4356 }, { "epoch": 0.4283924227905948, "grad_norm": 29.17751121520996, "learning_rate": 8e-05, "loss": 42.4487, "num_input_tokens_seen": 224435168, "step": 4359 }, { "epoch": 0.42868725584137984, "grad_norm": 30.268043518066406, "learning_rate": 8e-05, "loss": 44.7985, "num_input_tokens_seen": 224577196, "step": 4362 }, { "epoch": 0.4289820888921648, "grad_norm": 24.906023025512695, "learning_rate": 8e-05, "loss": 38.5836, "num_input_tokens_seen": 224734276, "step": 4365 }, { "epoch": 0.4292769219429498, "grad_norm": 37.84556198120117, "learning_rate": 8e-05, "loss": 44.6142, "num_input_tokens_seen": 224892396, "step": 4368 }, { "epoch": 0.4295717549937348, "grad_norm": 26.417160034179688, "learning_rate": 8e-05, "loss": 39.8458, "num_input_tokens_seen": 225043828, "step": 4371 }, { "epoch": 0.42986658804451977, "grad_norm": 27.29924774169922, "learning_rate": 8e-05, "loss": 38.1589, "num_input_tokens_seen": 225198824, "step": 4374 }, { "epoch": 0.4301614210953048, "grad_norm": 39.75818634033203, "learning_rate": 8e-05, "loss": 47.3034, "num_input_tokens_seen": 225340136, "step": 4377 }, { "epoch": 0.4304562541460898, "grad_norm": 28.143962860107422, "learning_rate": 8e-05, "loss": 42.0063, "num_input_tokens_seen": 225469012, "step": 4380 }, { "epoch": 0.43075108719687477, "grad_norm": 31.327285766601562, "learning_rate": 8e-05, "loss": 42.4765, "num_input_tokens_seen": 225608504, "step": 4383 }, { "epoch": 0.43104592024765975, "grad_norm": 27.978796005249023, "learning_rate": 8e-05, "loss": 39.2096, "num_input_tokens_seen": 225760756, "step": 4386 }, { "epoch": 0.43134075329844473, "grad_norm": 28.254173278808594, "learning_rate": 8e-05, "loss": 45.1552, "num_input_tokens_seen": 225918220, "step": 4389 }, { "epoch": 0.43163558634922977, "grad_norm": 28.915897369384766, "learning_rate": 8e-05, "loss": 43.1089, "num_input_tokens_seen": 226082244, "step": 4392 }, { "epoch": 0.43193041940001475, "grad_norm": 28.157947540283203, "learning_rate": 8e-05, "loss": 41.2043, "num_input_tokens_seen": 226247280, "step": 4395 }, { "epoch": 0.43222525245079974, "grad_norm": 28.28232765197754, "learning_rate": 8e-05, "loss": 43.0462, "num_input_tokens_seen": 226400352, "step": 4398 }, { "epoch": 0.4325200855015847, "grad_norm": 25.633373260498047, "learning_rate": 8e-05, "loss": 43.7799, "num_input_tokens_seen": 226565792, "step": 4401 }, { "epoch": 0.4328149185523697, "grad_norm": 27.941856384277344, "learning_rate": 8e-05, "loss": 43.6231, "num_input_tokens_seen": 226710228, "step": 4404 }, { "epoch": 0.43310975160315474, "grad_norm": 28.795846939086914, "learning_rate": 8e-05, "loss": 43.6229, "num_input_tokens_seen": 226848448, "step": 4407 }, { "epoch": 0.4334045846539397, "grad_norm": 33.9630241394043, "learning_rate": 8e-05, "loss": 39.9804, "num_input_tokens_seen": 226998428, "step": 4410 }, { "epoch": 0.4336994177047247, "grad_norm": 24.979286193847656, "learning_rate": 8e-05, "loss": 40.9833, "num_input_tokens_seen": 227156348, "step": 4413 }, { "epoch": 0.4339942507555097, "grad_norm": 29.209545135498047, "learning_rate": 8e-05, "loss": 38.7767, "num_input_tokens_seen": 227288052, "step": 4416 }, { "epoch": 0.43428908380629466, "grad_norm": 33.29966354370117, "learning_rate": 8e-05, "loss": 42.0673, "num_input_tokens_seen": 227429904, "step": 4419 }, { "epoch": 0.4345839168570797, "grad_norm": 24.679658889770508, "learning_rate": 8e-05, "loss": 42.4092, "num_input_tokens_seen": 227604476, "step": 4422 }, { "epoch": 0.4348787499078647, "grad_norm": 29.85538673400879, "learning_rate": 8e-05, "loss": 39.9113, "num_input_tokens_seen": 227752576, "step": 4425 }, { "epoch": 0.43517358295864966, "grad_norm": 29.75106430053711, "learning_rate": 8e-05, "loss": 44.6768, "num_input_tokens_seen": 227894956, "step": 4428 }, { "epoch": 0.43546841600943464, "grad_norm": 25.508901596069336, "learning_rate": 8e-05, "loss": 39.05, "num_input_tokens_seen": 228065148, "step": 4431 }, { "epoch": 0.4357632490602196, "grad_norm": 28.596651077270508, "learning_rate": 8e-05, "loss": 40.1365, "num_input_tokens_seen": 228228056, "step": 4434 }, { "epoch": 0.43605808211100466, "grad_norm": 45.46236038208008, "learning_rate": 8e-05, "loss": 42.0655, "num_input_tokens_seen": 228375532, "step": 4437 }, { "epoch": 0.43635291516178965, "grad_norm": 105.65780639648438, "learning_rate": 8e-05, "loss": 39.8979, "num_input_tokens_seen": 228520848, "step": 4440 }, { "epoch": 0.4366477482125746, "grad_norm": 27.998929977416992, "learning_rate": 8e-05, "loss": 45.1238, "num_input_tokens_seen": 228682420, "step": 4443 }, { "epoch": 0.4369425812633596, "grad_norm": 31.12704086303711, "learning_rate": 8e-05, "loss": 42.5966, "num_input_tokens_seen": 228826300, "step": 4446 }, { "epoch": 0.4372374143141446, "grad_norm": 29.78077507019043, "learning_rate": 8e-05, "loss": 43.7748, "num_input_tokens_seen": 228999788, "step": 4449 }, { "epoch": 0.4375322473649296, "grad_norm": 26.121967315673828, "learning_rate": 8e-05, "loss": 44.1384, "num_input_tokens_seen": 229163636, "step": 4452 }, { "epoch": 0.4378270804157146, "grad_norm": 27.11899757385254, "learning_rate": 8e-05, "loss": 42.037, "num_input_tokens_seen": 229301104, "step": 4455 }, { "epoch": 0.4381219134664996, "grad_norm": 27.865236282348633, "learning_rate": 8e-05, "loss": 41.5314, "num_input_tokens_seen": 229470728, "step": 4458 }, { "epoch": 0.4384167465172846, "grad_norm": 26.886306762695312, "learning_rate": 8e-05, "loss": 43.424, "num_input_tokens_seen": 229639428, "step": 4461 }, { "epoch": 0.43871157956806955, "grad_norm": 30.441774368286133, "learning_rate": 8e-05, "loss": 44.4104, "num_input_tokens_seen": 229773216, "step": 4464 }, { "epoch": 0.4390064126188546, "grad_norm": 26.78765869140625, "learning_rate": 8e-05, "loss": 44.3086, "num_input_tokens_seen": 229930740, "step": 4467 }, { "epoch": 0.4393012456696396, "grad_norm": 28.085712432861328, "learning_rate": 8e-05, "loss": 39.6616, "num_input_tokens_seen": 230100392, "step": 4470 }, { "epoch": 0.43959607872042455, "grad_norm": 28.30702781677246, "learning_rate": 8e-05, "loss": 42.1998, "num_input_tokens_seen": 230261584, "step": 4473 }, { "epoch": 0.43989091177120954, "grad_norm": 26.2158260345459, "learning_rate": 8e-05, "loss": 42.0976, "num_input_tokens_seen": 230425084, "step": 4476 }, { "epoch": 0.4401857448219945, "grad_norm": 32.34695053100586, "learning_rate": 8e-05, "loss": 46.1886, "num_input_tokens_seen": 230576716, "step": 4479 }, { "epoch": 0.44048057787277956, "grad_norm": 26.81767463684082, "learning_rate": 8e-05, "loss": 40.8735, "num_input_tokens_seen": 230736824, "step": 4482 }, { "epoch": 0.44077541092356454, "grad_norm": 26.143571853637695, "learning_rate": 8e-05, "loss": 40.0293, "num_input_tokens_seen": 230870868, "step": 4485 }, { "epoch": 0.4410702439743495, "grad_norm": 26.403610229492188, "learning_rate": 8e-05, "loss": 41.7636, "num_input_tokens_seen": 231008172, "step": 4488 }, { "epoch": 0.4413650770251345, "grad_norm": 26.66645050048828, "learning_rate": 8e-05, "loss": 42.2956, "num_input_tokens_seen": 231173044, "step": 4491 }, { "epoch": 0.44165991007591954, "grad_norm": 26.82290267944336, "learning_rate": 8e-05, "loss": 41.3145, "num_input_tokens_seen": 231342420, "step": 4494 }, { "epoch": 0.4419547431267045, "grad_norm": 26.77899169921875, "learning_rate": 8e-05, "loss": 37.7972, "num_input_tokens_seen": 231483824, "step": 4497 }, { "epoch": 0.4422495761774895, "grad_norm": 27.854171752929688, "learning_rate": 8e-05, "loss": 41.4553, "num_input_tokens_seen": 231629132, "step": 4500 }, { "epoch": 0.4425444092282745, "grad_norm": 26.862014770507812, "learning_rate": 8e-05, "loss": 40.6043, "num_input_tokens_seen": 231776520, "step": 4503 }, { "epoch": 0.44283924227905946, "grad_norm": 24.268115997314453, "learning_rate": 8e-05, "loss": 40.1136, "num_input_tokens_seen": 231962636, "step": 4506 }, { "epoch": 0.4431340753298445, "grad_norm": 30.513568878173828, "learning_rate": 8e-05, "loss": 40.4665, "num_input_tokens_seen": 232125376, "step": 4509 }, { "epoch": 0.4434289083806295, "grad_norm": 26.28969955444336, "learning_rate": 8e-05, "loss": 39.5851, "num_input_tokens_seen": 232266024, "step": 4512 }, { "epoch": 0.44372374143141446, "grad_norm": 27.46841812133789, "learning_rate": 8e-05, "loss": 44.978, "num_input_tokens_seen": 232420048, "step": 4515 }, { "epoch": 0.44401857448219945, "grad_norm": 25.948152542114258, "learning_rate": 8e-05, "loss": 43.6427, "num_input_tokens_seen": 232570936, "step": 4518 }, { "epoch": 0.44431340753298443, "grad_norm": 27.499948501586914, "learning_rate": 8e-05, "loss": 44.9228, "num_input_tokens_seen": 232737172, "step": 4521 }, { "epoch": 0.44460824058376947, "grad_norm": 28.01111602783203, "learning_rate": 8e-05, "loss": 43.929, "num_input_tokens_seen": 232890980, "step": 4524 }, { "epoch": 0.44490307363455445, "grad_norm": 27.918964385986328, "learning_rate": 8e-05, "loss": 38.9864, "num_input_tokens_seen": 233027788, "step": 4527 }, { "epoch": 0.44519790668533943, "grad_norm": 29.82014274597168, "learning_rate": 8e-05, "loss": 42.5304, "num_input_tokens_seen": 233198452, "step": 4530 }, { "epoch": 0.4454927397361244, "grad_norm": 26.141271591186523, "learning_rate": 8e-05, "loss": 41.4499, "num_input_tokens_seen": 233350428, "step": 4533 }, { "epoch": 0.4457875727869094, "grad_norm": 57.809268951416016, "learning_rate": 8e-05, "loss": 39.5788, "num_input_tokens_seen": 233493092, "step": 4536 }, { "epoch": 0.44608240583769443, "grad_norm": 56.92467498779297, "learning_rate": 8e-05, "loss": 39.9422, "num_input_tokens_seen": 233646260, "step": 4539 }, { "epoch": 0.4463772388884794, "grad_norm": 25.191301345825195, "learning_rate": 8e-05, "loss": 42.8789, "num_input_tokens_seen": 233789360, "step": 4542 }, { "epoch": 0.4466720719392644, "grad_norm": 31.032283782958984, "learning_rate": 8e-05, "loss": 42.8801, "num_input_tokens_seen": 233939884, "step": 4545 }, { "epoch": 0.4469669049900494, "grad_norm": 24.119443893432617, "learning_rate": 8e-05, "loss": 38.5954, "num_input_tokens_seen": 234093744, "step": 4548 }, { "epoch": 0.44726173804083436, "grad_norm": 30.361286163330078, "learning_rate": 8e-05, "loss": 42.6672, "num_input_tokens_seen": 234253980, "step": 4551 }, { "epoch": 0.4475565710916194, "grad_norm": 26.771743774414062, "learning_rate": 8e-05, "loss": 38.8298, "num_input_tokens_seen": 234412916, "step": 4554 }, { "epoch": 0.4478514041424044, "grad_norm": 31.644166946411133, "learning_rate": 8e-05, "loss": 45.0399, "num_input_tokens_seen": 234564704, "step": 4557 }, { "epoch": 0.44814623719318936, "grad_norm": 27.465606689453125, "learning_rate": 8e-05, "loss": 37.7804, "num_input_tokens_seen": 234719408, "step": 4560 }, { "epoch": 0.44844107024397434, "grad_norm": 25.798643112182617, "learning_rate": 8e-05, "loss": 41.3184, "num_input_tokens_seen": 234877952, "step": 4563 }, { "epoch": 0.4487359032947593, "grad_norm": 25.45863914489746, "learning_rate": 8e-05, "loss": 41.4128, "num_input_tokens_seen": 235018628, "step": 4566 }, { "epoch": 0.44903073634554436, "grad_norm": 27.793365478515625, "learning_rate": 8e-05, "loss": 45.1726, "num_input_tokens_seen": 235183500, "step": 4569 }, { "epoch": 0.44932556939632934, "grad_norm": 24.252897262573242, "learning_rate": 8e-05, "loss": 42.8122, "num_input_tokens_seen": 235353172, "step": 4572 }, { "epoch": 0.4496204024471143, "grad_norm": 26.199750900268555, "learning_rate": 8e-05, "loss": 41.2429, "num_input_tokens_seen": 235498740, "step": 4575 }, { "epoch": 0.4499152354978993, "grad_norm": 26.499221801757812, "learning_rate": 8e-05, "loss": 42.1125, "num_input_tokens_seen": 235684608, "step": 4578 }, { "epoch": 0.4502100685486843, "grad_norm": 26.480804443359375, "learning_rate": 8e-05, "loss": 40.9153, "num_input_tokens_seen": 235850480, "step": 4581 }, { "epoch": 0.4505049015994693, "grad_norm": 26.42413902282715, "learning_rate": 8e-05, "loss": 42.1996, "num_input_tokens_seen": 236034140, "step": 4584 }, { "epoch": 0.4507997346502543, "grad_norm": 24.659976959228516, "learning_rate": 8e-05, "loss": 39.7046, "num_input_tokens_seen": 236180908, "step": 4587 }, { "epoch": 0.4510945677010393, "grad_norm": 31.82207679748535, "learning_rate": 8e-05, "loss": 40.1822, "num_input_tokens_seen": 236331080, "step": 4590 }, { "epoch": 0.45138940075182427, "grad_norm": 22.85356903076172, "learning_rate": 8e-05, "loss": 37.4057, "num_input_tokens_seen": 236491800, "step": 4593 }, { "epoch": 0.45168423380260925, "grad_norm": 35.563621520996094, "learning_rate": 8e-05, "loss": 41.0396, "num_input_tokens_seen": 236637464, "step": 4596 }, { "epoch": 0.4519790668533943, "grad_norm": 36.02199172973633, "learning_rate": 8e-05, "loss": 45.6636, "num_input_tokens_seen": 236803816, "step": 4599 }, { "epoch": 0.45227389990417927, "grad_norm": 28.48723793029785, "learning_rate": 8e-05, "loss": 39.8946, "num_input_tokens_seen": 236949088, "step": 4602 }, { "epoch": 0.45256873295496425, "grad_norm": 29.46847915649414, "learning_rate": 8e-05, "loss": 40.2725, "num_input_tokens_seen": 237105976, "step": 4605 }, { "epoch": 0.45286356600574923, "grad_norm": 24.744489669799805, "learning_rate": 8e-05, "loss": 42.5878, "num_input_tokens_seen": 237259064, "step": 4608 }, { "epoch": 0.4531583990565342, "grad_norm": 28.40730857849121, "learning_rate": 8e-05, "loss": 41.7184, "num_input_tokens_seen": 237429340, "step": 4611 }, { "epoch": 0.45345323210731925, "grad_norm": 26.473224639892578, "learning_rate": 8e-05, "loss": 43.0435, "num_input_tokens_seen": 237587500, "step": 4614 }, { "epoch": 0.45374806515810423, "grad_norm": 26.486181259155273, "learning_rate": 8e-05, "loss": 40.3067, "num_input_tokens_seen": 237748580, "step": 4617 }, { "epoch": 0.4540428982088892, "grad_norm": 38.76091766357422, "learning_rate": 8e-05, "loss": 39.2949, "num_input_tokens_seen": 237924340, "step": 4620 }, { "epoch": 0.4543377312596742, "grad_norm": 27.670326232910156, "learning_rate": 8e-05, "loss": 40.5704, "num_input_tokens_seen": 238066828, "step": 4623 }, { "epoch": 0.4546325643104592, "grad_norm": 40.92820739746094, "learning_rate": 8e-05, "loss": 44.0541, "num_input_tokens_seen": 238220812, "step": 4626 }, { "epoch": 0.4549273973612442, "grad_norm": 27.452932357788086, "learning_rate": 8e-05, "loss": 42.7587, "num_input_tokens_seen": 238384084, "step": 4629 }, { "epoch": 0.4552222304120292, "grad_norm": 25.271718978881836, "learning_rate": 8e-05, "loss": 42.8695, "num_input_tokens_seen": 238523988, "step": 4632 }, { "epoch": 0.4555170634628142, "grad_norm": 23.655532836914062, "learning_rate": 8e-05, "loss": 41.0458, "num_input_tokens_seen": 238669068, "step": 4635 }, { "epoch": 0.45581189651359916, "grad_norm": 28.211811065673828, "learning_rate": 8e-05, "loss": 43.0585, "num_input_tokens_seen": 238825804, "step": 4638 }, { "epoch": 0.4561067295643842, "grad_norm": 28.751880645751953, "learning_rate": 8e-05, "loss": 40.9551, "num_input_tokens_seen": 238982256, "step": 4641 }, { "epoch": 0.4564015626151692, "grad_norm": 25.98259925842285, "learning_rate": 8e-05, "loss": 39.8824, "num_input_tokens_seen": 239180620, "step": 4644 }, { "epoch": 0.45669639566595416, "grad_norm": 28.03835678100586, "learning_rate": 8e-05, "loss": 44.4102, "num_input_tokens_seen": 239351024, "step": 4647 }, { "epoch": 0.45699122871673914, "grad_norm": 26.699846267700195, "learning_rate": 8e-05, "loss": 44.0684, "num_input_tokens_seen": 239523072, "step": 4650 }, { "epoch": 0.4572860617675241, "grad_norm": 24.692668914794922, "learning_rate": 8e-05, "loss": 36.3624, "num_input_tokens_seen": 239675340, "step": 4653 }, { "epoch": 0.45758089481830916, "grad_norm": 45.91609191894531, "learning_rate": 8e-05, "loss": 42.5926, "num_input_tokens_seen": 239824032, "step": 4656 }, { "epoch": 0.45787572786909414, "grad_norm": 29.405675888061523, "learning_rate": 8e-05, "loss": 37.3812, "num_input_tokens_seen": 239957640, "step": 4659 }, { "epoch": 0.4581705609198791, "grad_norm": 27.97926139831543, "learning_rate": 8e-05, "loss": 40.5367, "num_input_tokens_seen": 240118696, "step": 4662 }, { "epoch": 0.4584653939706641, "grad_norm": 41.609432220458984, "learning_rate": 8e-05, "loss": 39.9011, "num_input_tokens_seen": 240257624, "step": 4665 }, { "epoch": 0.4587602270214491, "grad_norm": 28.44548797607422, "learning_rate": 8e-05, "loss": 40.396, "num_input_tokens_seen": 240413784, "step": 4668 }, { "epoch": 0.4590550600722341, "grad_norm": 27.161584854125977, "learning_rate": 8e-05, "loss": 41.7049, "num_input_tokens_seen": 240559368, "step": 4671 }, { "epoch": 0.4593498931230191, "grad_norm": 28.079381942749023, "learning_rate": 8e-05, "loss": 45.1337, "num_input_tokens_seen": 240711084, "step": 4674 }, { "epoch": 0.4596447261738041, "grad_norm": 31.016523361206055, "learning_rate": 8e-05, "loss": 40.1331, "num_input_tokens_seen": 240873416, "step": 4677 }, { "epoch": 0.45993955922458907, "grad_norm": 29.084857940673828, "learning_rate": 8e-05, "loss": 43.6345, "num_input_tokens_seen": 241011492, "step": 4680 }, { "epoch": 0.46023439227537405, "grad_norm": 24.55000877380371, "learning_rate": 8e-05, "loss": 40.8207, "num_input_tokens_seen": 241161368, "step": 4683 }, { "epoch": 0.4605292253261591, "grad_norm": 28.673322677612305, "learning_rate": 8e-05, "loss": 40.0417, "num_input_tokens_seen": 241311936, "step": 4686 }, { "epoch": 0.46082405837694407, "grad_norm": 25.411218643188477, "learning_rate": 8e-05, "loss": 38.8036, "num_input_tokens_seen": 241464440, "step": 4689 }, { "epoch": 0.46111889142772905, "grad_norm": 20.186403274536133, "learning_rate": 8e-05, "loss": 36.3097, "num_input_tokens_seen": 241636584, "step": 4692 }, { "epoch": 0.46141372447851403, "grad_norm": 30.097230911254883, "learning_rate": 8e-05, "loss": 40.7838, "num_input_tokens_seen": 241809340, "step": 4695 }, { "epoch": 0.461708557529299, "grad_norm": 31.64427947998047, "learning_rate": 8e-05, "loss": 46.0323, "num_input_tokens_seen": 241963564, "step": 4698 }, { "epoch": 0.46200339058008405, "grad_norm": 28.308427810668945, "learning_rate": 8e-05, "loss": 40.2799, "num_input_tokens_seen": 242148272, "step": 4701 }, { "epoch": 0.46229822363086903, "grad_norm": 25.227632522583008, "learning_rate": 8e-05, "loss": 41.1382, "num_input_tokens_seen": 242279320, "step": 4704 }, { "epoch": 0.462593056681654, "grad_norm": 28.968778610229492, "learning_rate": 8e-05, "loss": 42.7915, "num_input_tokens_seen": 242449204, "step": 4707 }, { "epoch": 0.462887889732439, "grad_norm": 31.740821838378906, "learning_rate": 8e-05, "loss": 45.0049, "num_input_tokens_seen": 242613292, "step": 4710 }, { "epoch": 0.463182722783224, "grad_norm": 29.082109451293945, "learning_rate": 8e-05, "loss": 39.2121, "num_input_tokens_seen": 242768116, "step": 4713 }, { "epoch": 0.463477555834009, "grad_norm": 24.574909210205078, "learning_rate": 8e-05, "loss": 41.5347, "num_input_tokens_seen": 242905276, "step": 4716 }, { "epoch": 0.463772388884794, "grad_norm": 28.00779914855957, "learning_rate": 8e-05, "loss": 41.8152, "num_input_tokens_seen": 243051876, "step": 4719 }, { "epoch": 0.464067221935579, "grad_norm": 25.955181121826172, "learning_rate": 8e-05, "loss": 38.6152, "num_input_tokens_seen": 243200648, "step": 4722 }, { "epoch": 0.46436205498636396, "grad_norm": 29.894330978393555, "learning_rate": 8e-05, "loss": 37.0003, "num_input_tokens_seen": 243360924, "step": 4725 }, { "epoch": 0.46465688803714894, "grad_norm": 27.062658309936523, "learning_rate": 8e-05, "loss": 41.574, "num_input_tokens_seen": 243529096, "step": 4728 }, { "epoch": 0.464951721087934, "grad_norm": 32.036903381347656, "learning_rate": 8e-05, "loss": 43.9895, "num_input_tokens_seen": 243677944, "step": 4731 }, { "epoch": 0.46524655413871896, "grad_norm": 29.033061981201172, "learning_rate": 8e-05, "loss": 40.0667, "num_input_tokens_seen": 243829408, "step": 4734 }, { "epoch": 0.46554138718950394, "grad_norm": 27.246234893798828, "learning_rate": 8e-05, "loss": 38.1965, "num_input_tokens_seen": 243986544, "step": 4737 }, { "epoch": 0.4658362202402889, "grad_norm": 30.715890884399414, "learning_rate": 8e-05, "loss": 41.338, "num_input_tokens_seen": 244140580, "step": 4740 }, { "epoch": 0.4661310532910739, "grad_norm": 25.131591796875, "learning_rate": 8e-05, "loss": 39.5073, "num_input_tokens_seen": 244284620, "step": 4743 }, { "epoch": 0.46642588634185894, "grad_norm": 27.105449676513672, "learning_rate": 8e-05, "loss": 42.4629, "num_input_tokens_seen": 244422736, "step": 4746 }, { "epoch": 0.4667207193926439, "grad_norm": 25.411161422729492, "learning_rate": 8e-05, "loss": 39.7882, "num_input_tokens_seen": 244575136, "step": 4749 }, { "epoch": 0.4670155524434289, "grad_norm": 27.345781326293945, "learning_rate": 8e-05, "loss": 38.6773, "num_input_tokens_seen": 244722500, "step": 4752 }, { "epoch": 0.4673103854942139, "grad_norm": 29.45749282836914, "learning_rate": 8e-05, "loss": 40.3697, "num_input_tokens_seen": 244873336, "step": 4755 }, { "epoch": 0.46760521854499887, "grad_norm": 29.298477172851562, "learning_rate": 8e-05, "loss": 42.2155, "num_input_tokens_seen": 245010504, "step": 4758 }, { "epoch": 0.4679000515957839, "grad_norm": 27.368099212646484, "learning_rate": 8e-05, "loss": 41.6353, "num_input_tokens_seen": 245187116, "step": 4761 }, { "epoch": 0.4681948846465689, "grad_norm": 25.604515075683594, "learning_rate": 8e-05, "loss": 41.4931, "num_input_tokens_seen": 245352472, "step": 4764 }, { "epoch": 0.46848971769735387, "grad_norm": 26.513587951660156, "learning_rate": 8e-05, "loss": 40.9986, "num_input_tokens_seen": 245524364, "step": 4767 }, { "epoch": 0.46878455074813885, "grad_norm": 23.54816246032715, "learning_rate": 8e-05, "loss": 41.5214, "num_input_tokens_seen": 245676924, "step": 4770 }, { "epoch": 0.46907938379892383, "grad_norm": 26.805103302001953, "learning_rate": 8e-05, "loss": 37.1733, "num_input_tokens_seen": 245828260, "step": 4773 }, { "epoch": 0.46937421684970887, "grad_norm": 30.842025756835938, "learning_rate": 8e-05, "loss": 40.2481, "num_input_tokens_seen": 245993472, "step": 4776 }, { "epoch": 0.46966904990049385, "grad_norm": 26.739294052124023, "learning_rate": 8e-05, "loss": 36.3944, "num_input_tokens_seen": 246172744, "step": 4779 }, { "epoch": 0.46996388295127883, "grad_norm": 33.225921630859375, "learning_rate": 8e-05, "loss": 44.1665, "num_input_tokens_seen": 246331176, "step": 4782 }, { "epoch": 0.4702587160020638, "grad_norm": 28.183168411254883, "learning_rate": 8e-05, "loss": 43.6505, "num_input_tokens_seen": 246481616, "step": 4785 }, { "epoch": 0.47055354905284885, "grad_norm": 26.746055603027344, "learning_rate": 8e-05, "loss": 42.6295, "num_input_tokens_seen": 246633420, "step": 4788 }, { "epoch": 0.47084838210363383, "grad_norm": 30.00533103942871, "learning_rate": 8e-05, "loss": 46.9378, "num_input_tokens_seen": 246794108, "step": 4791 }, { "epoch": 0.4711432151544188, "grad_norm": 24.430938720703125, "learning_rate": 8e-05, "loss": 42.2782, "num_input_tokens_seen": 246945580, "step": 4794 }, { "epoch": 0.4714380482052038, "grad_norm": 34.13113021850586, "learning_rate": 8e-05, "loss": 42.1327, "num_input_tokens_seen": 247093080, "step": 4797 }, { "epoch": 0.4717328812559888, "grad_norm": 26.586788177490234, "learning_rate": 8e-05, "loss": 38.0685, "num_input_tokens_seen": 247236392, "step": 4800 }, { "epoch": 0.4720277143067738, "grad_norm": 31.26131820678711, "learning_rate": 8e-05, "loss": 39.4935, "num_input_tokens_seen": 247378920, "step": 4803 }, { "epoch": 0.4723225473575588, "grad_norm": 27.38518714904785, "learning_rate": 8e-05, "loss": 40.852, "num_input_tokens_seen": 247532256, "step": 4806 }, { "epoch": 0.4726173804083438, "grad_norm": 27.600831985473633, "learning_rate": 8e-05, "loss": 37.4856, "num_input_tokens_seen": 247698124, "step": 4809 }, { "epoch": 0.47291221345912876, "grad_norm": 27.928665161132812, "learning_rate": 8e-05, "loss": 40.5996, "num_input_tokens_seen": 247855388, "step": 4812 }, { "epoch": 0.47320704650991374, "grad_norm": 29.014537811279297, "learning_rate": 8e-05, "loss": 45.1572, "num_input_tokens_seen": 248008312, "step": 4815 }, { "epoch": 0.4735018795606988, "grad_norm": 24.911880493164062, "learning_rate": 8e-05, "loss": 44.1829, "num_input_tokens_seen": 248169076, "step": 4818 }, { "epoch": 0.47379671261148376, "grad_norm": 28.43665313720703, "learning_rate": 8e-05, "loss": 38.3031, "num_input_tokens_seen": 248336860, "step": 4821 }, { "epoch": 0.47409154566226874, "grad_norm": 29.070302963256836, "learning_rate": 8e-05, "loss": 40.3561, "num_input_tokens_seen": 248490776, "step": 4824 }, { "epoch": 0.4743863787130537, "grad_norm": 26.32981300354004, "learning_rate": 8e-05, "loss": 41.947, "num_input_tokens_seen": 248638392, "step": 4827 }, { "epoch": 0.4746812117638387, "grad_norm": 28.874282836914062, "learning_rate": 8e-05, "loss": 43.5164, "num_input_tokens_seen": 248789532, "step": 4830 }, { "epoch": 0.47497604481462374, "grad_norm": 26.124303817749023, "learning_rate": 8e-05, "loss": 40.266, "num_input_tokens_seen": 248957436, "step": 4833 }, { "epoch": 0.4752708778654087, "grad_norm": 25.164854049682617, "learning_rate": 8e-05, "loss": 41.942, "num_input_tokens_seen": 249109452, "step": 4836 }, { "epoch": 0.4755657109161937, "grad_norm": 25.05072784423828, "learning_rate": 8e-05, "loss": 37.8196, "num_input_tokens_seen": 249255184, "step": 4839 }, { "epoch": 0.4758605439669787, "grad_norm": 27.30223846435547, "learning_rate": 8e-05, "loss": 39.4101, "num_input_tokens_seen": 249397136, "step": 4842 }, { "epoch": 0.47615537701776367, "grad_norm": 33.2149658203125, "learning_rate": 8e-05, "loss": 43.5654, "num_input_tokens_seen": 249558888, "step": 4845 }, { "epoch": 0.4764502100685487, "grad_norm": 25.12885856628418, "learning_rate": 8e-05, "loss": 41.83, "num_input_tokens_seen": 249706876, "step": 4848 }, { "epoch": 0.4767450431193337, "grad_norm": 27.43396759033203, "learning_rate": 8e-05, "loss": 41.058, "num_input_tokens_seen": 249894004, "step": 4851 }, { "epoch": 0.47703987617011867, "grad_norm": 27.396812438964844, "learning_rate": 8e-05, "loss": 35.1668, "num_input_tokens_seen": 250050096, "step": 4854 }, { "epoch": 0.47733470922090365, "grad_norm": 31.309123992919922, "learning_rate": 8e-05, "loss": 41.7648, "num_input_tokens_seen": 250215436, "step": 4857 }, { "epoch": 0.47762954227168863, "grad_norm": 34.78447341918945, "learning_rate": 8e-05, "loss": 43.7872, "num_input_tokens_seen": 250357180, "step": 4860 }, { "epoch": 0.47792437532247367, "grad_norm": 30.251394271850586, "learning_rate": 8e-05, "loss": 42.9092, "num_input_tokens_seen": 250499540, "step": 4863 }, { "epoch": 0.47821920837325865, "grad_norm": 31.96343231201172, "learning_rate": 8e-05, "loss": 42.077, "num_input_tokens_seen": 250649300, "step": 4866 }, { "epoch": 0.47851404142404363, "grad_norm": 26.872812271118164, "learning_rate": 8e-05, "loss": 41.2801, "num_input_tokens_seen": 250813788, "step": 4869 }, { "epoch": 0.4788088744748286, "grad_norm": 26.450227737426758, "learning_rate": 8e-05, "loss": 42.2355, "num_input_tokens_seen": 250984424, "step": 4872 }, { "epoch": 0.4791037075256136, "grad_norm": 28.50580596923828, "learning_rate": 8e-05, "loss": 42.1893, "num_input_tokens_seen": 251143068, "step": 4875 }, { "epoch": 0.47939854057639864, "grad_norm": 29.96875762939453, "learning_rate": 8e-05, "loss": 41.5831, "num_input_tokens_seen": 251301592, "step": 4878 }, { "epoch": 0.4796933736271836, "grad_norm": 25.501129150390625, "learning_rate": 8e-05, "loss": 39.8773, "num_input_tokens_seen": 251466600, "step": 4881 }, { "epoch": 0.4799882066779686, "grad_norm": 27.416034698486328, "learning_rate": 8e-05, "loss": 39.085, "num_input_tokens_seen": 251626984, "step": 4884 }, { "epoch": 0.4802830397287536, "grad_norm": 27.230443954467773, "learning_rate": 8e-05, "loss": 42.5695, "num_input_tokens_seen": 251801908, "step": 4887 }, { "epoch": 0.48057787277953856, "grad_norm": 24.647361755371094, "learning_rate": 8e-05, "loss": 40.7212, "num_input_tokens_seen": 251952904, "step": 4890 }, { "epoch": 0.4808727058303236, "grad_norm": 28.154359817504883, "learning_rate": 8e-05, "loss": 40.6429, "num_input_tokens_seen": 252107504, "step": 4893 }, { "epoch": 0.4811675388811086, "grad_norm": 29.207487106323242, "learning_rate": 8e-05, "loss": 41.0142, "num_input_tokens_seen": 252256372, "step": 4896 }, { "epoch": 0.48146237193189356, "grad_norm": 26.0395450592041, "learning_rate": 8e-05, "loss": 39.9626, "num_input_tokens_seen": 252403820, "step": 4899 }, { "epoch": 0.48175720498267854, "grad_norm": 24.243412017822266, "learning_rate": 8e-05, "loss": 39.2682, "num_input_tokens_seen": 252560904, "step": 4902 }, { "epoch": 0.4820520380334635, "grad_norm": 32.20317459106445, "learning_rate": 8e-05, "loss": 42.3904, "num_input_tokens_seen": 252696252, "step": 4905 }, { "epoch": 0.48234687108424856, "grad_norm": 33.271995544433594, "learning_rate": 8e-05, "loss": 42.833, "num_input_tokens_seen": 252835252, "step": 4908 }, { "epoch": 0.48264170413503354, "grad_norm": 32.50652313232422, "learning_rate": 8e-05, "loss": 40.7188, "num_input_tokens_seen": 252977368, "step": 4911 }, { "epoch": 0.4829365371858185, "grad_norm": 21.94767189025879, "learning_rate": 8e-05, "loss": 39.2929, "num_input_tokens_seen": 253131608, "step": 4914 }, { "epoch": 0.4832313702366035, "grad_norm": 25.962169647216797, "learning_rate": 8e-05, "loss": 40.0814, "num_input_tokens_seen": 253289132, "step": 4917 }, { "epoch": 0.4835262032873885, "grad_norm": 26.929094314575195, "learning_rate": 8e-05, "loss": 38.078, "num_input_tokens_seen": 253461620, "step": 4920 }, { "epoch": 0.4838210363381735, "grad_norm": 28.08810043334961, "learning_rate": 8e-05, "loss": 41.8377, "num_input_tokens_seen": 253609532, "step": 4923 }, { "epoch": 0.4841158693889585, "grad_norm": 27.889724731445312, "learning_rate": 8e-05, "loss": 39.9923, "num_input_tokens_seen": 253759116, "step": 4926 }, { "epoch": 0.4844107024397435, "grad_norm": 23.67461585998535, "learning_rate": 8e-05, "loss": 38.6749, "num_input_tokens_seen": 253940252, "step": 4929 }, { "epoch": 0.48470553549052847, "grad_norm": 25.49771499633789, "learning_rate": 8e-05, "loss": 39.5138, "num_input_tokens_seen": 254101524, "step": 4932 }, { "epoch": 0.48500036854131345, "grad_norm": 27.745712280273438, "learning_rate": 8e-05, "loss": 42.9393, "num_input_tokens_seen": 254265708, "step": 4935 }, { "epoch": 0.4852952015920985, "grad_norm": 27.445417404174805, "learning_rate": 8e-05, "loss": 43.3623, "num_input_tokens_seen": 254427608, "step": 4938 }, { "epoch": 0.4855900346428835, "grad_norm": 23.81502342224121, "learning_rate": 8e-05, "loss": 39.5791, "num_input_tokens_seen": 254585740, "step": 4941 }, { "epoch": 0.48588486769366845, "grad_norm": 27.11995506286621, "learning_rate": 8e-05, "loss": 41.631, "num_input_tokens_seen": 254735444, "step": 4944 }, { "epoch": 0.48617970074445344, "grad_norm": 28.302568435668945, "learning_rate": 8e-05, "loss": 39.621, "num_input_tokens_seen": 254873632, "step": 4947 }, { "epoch": 0.4864745337952385, "grad_norm": 25.918787002563477, "learning_rate": 8e-05, "loss": 36.2581, "num_input_tokens_seen": 255018324, "step": 4950 }, { "epoch": 0.48676936684602345, "grad_norm": 26.976848602294922, "learning_rate": 8e-05, "loss": 39.609, "num_input_tokens_seen": 255170504, "step": 4953 }, { "epoch": 0.48706419989680844, "grad_norm": 39.22269821166992, "learning_rate": 8e-05, "loss": 42.433, "num_input_tokens_seen": 255327264, "step": 4956 }, { "epoch": 0.4873590329475934, "grad_norm": 31.607404708862305, "learning_rate": 8e-05, "loss": 41.0775, "num_input_tokens_seen": 255484364, "step": 4959 }, { "epoch": 0.4876538659983784, "grad_norm": 25.68681526184082, "learning_rate": 8e-05, "loss": 39.5636, "num_input_tokens_seen": 255642880, "step": 4962 }, { "epoch": 0.48794869904916344, "grad_norm": 30.098154067993164, "learning_rate": 8e-05, "loss": 38.4692, "num_input_tokens_seen": 255793688, "step": 4965 }, { "epoch": 0.4882435320999484, "grad_norm": 32.68474197387695, "learning_rate": 8e-05, "loss": 38.1969, "num_input_tokens_seen": 255951492, "step": 4968 }, { "epoch": 0.4885383651507334, "grad_norm": 26.910491943359375, "learning_rate": 8e-05, "loss": 40.7008, "num_input_tokens_seen": 256109592, "step": 4971 }, { "epoch": 0.4888331982015184, "grad_norm": 29.480241775512695, "learning_rate": 8e-05, "loss": 40.1681, "num_input_tokens_seen": 256263356, "step": 4974 }, { "epoch": 0.48912803125230336, "grad_norm": 23.866363525390625, "learning_rate": 8e-05, "loss": 38.0946, "num_input_tokens_seen": 256423220, "step": 4977 }, { "epoch": 0.4894228643030884, "grad_norm": 41.37398910522461, "learning_rate": 8e-05, "loss": 40.5847, "num_input_tokens_seen": 256572760, "step": 4980 }, { "epoch": 0.4897176973538734, "grad_norm": 27.146371841430664, "learning_rate": 8e-05, "loss": 41.5017, "num_input_tokens_seen": 256729552, "step": 4983 }, { "epoch": 0.49001253040465836, "grad_norm": 25.587785720825195, "learning_rate": 8e-05, "loss": 38.1012, "num_input_tokens_seen": 256882972, "step": 4986 }, { "epoch": 0.49030736345544335, "grad_norm": 24.43895149230957, "learning_rate": 8e-05, "loss": 39.9138, "num_input_tokens_seen": 257060216, "step": 4989 }, { "epoch": 0.4906021965062283, "grad_norm": 24.547412872314453, "learning_rate": 8e-05, "loss": 38.8881, "num_input_tokens_seen": 257207456, "step": 4992 }, { "epoch": 0.49089702955701336, "grad_norm": 32.7952880859375, "learning_rate": 8e-05, "loss": 39.1508, "num_input_tokens_seen": 257373996, "step": 4995 }, { "epoch": 0.49119186260779835, "grad_norm": 29.346824645996094, "learning_rate": 8e-05, "loss": 41.7305, "num_input_tokens_seen": 257536592, "step": 4998 }, { "epoch": 0.49138841797498833, "eval_gen_len": 32.985, "eval_loss": 2.6099908351898193, "eval_rouge1": 44.4312, "eval_rouge2": 27.6447, "eval_rougeL": 40.525, "eval_rougeLsum": 40.7945, "eval_runtime": 96.8114, "eval_samples_per_second": 2.066, "eval_steps_per_second": 0.516, "num_input_tokens_seen": 257628708, "step": 5000 }, { "epoch": 0.49148669565858333, "grad_norm": 33.791316986083984, "learning_rate": 8e-05, "loss": 42.0111, "num_input_tokens_seen": 257674852, "step": 5001 }, { "epoch": 0.4917815287093683, "grad_norm": 28.114267349243164, "learning_rate": 8e-05, "loss": 37.5229, "num_input_tokens_seen": 257807336, "step": 5004 }, { "epoch": 0.4920763617601533, "grad_norm": 26.51993751525879, "learning_rate": 8e-05, "loss": 35.2494, "num_input_tokens_seen": 257960640, "step": 5007 }, { "epoch": 0.49237119481093833, "grad_norm": 28.530141830444336, "learning_rate": 8e-05, "loss": 41.4613, "num_input_tokens_seen": 258099104, "step": 5010 }, { "epoch": 0.4926660278617233, "grad_norm": 23.611448287963867, "learning_rate": 8e-05, "loss": 42.0885, "num_input_tokens_seen": 258290644, "step": 5013 }, { "epoch": 0.4929608609125083, "grad_norm": 24.45577049255371, "learning_rate": 8e-05, "loss": 40.2475, "num_input_tokens_seen": 258448336, "step": 5016 }, { "epoch": 0.4932556939632933, "grad_norm": 27.3090877532959, "learning_rate": 8e-05, "loss": 38.2567, "num_input_tokens_seen": 258608620, "step": 5019 }, { "epoch": 0.49355052701407826, "grad_norm": 23.795654296875, "learning_rate": 8e-05, "loss": 42.4711, "num_input_tokens_seen": 258771812, "step": 5022 }, { "epoch": 0.4938453600648633, "grad_norm": 29.56174087524414, "learning_rate": 8e-05, "loss": 43.0615, "num_input_tokens_seen": 258923164, "step": 5025 }, { "epoch": 0.4941401931156483, "grad_norm": 37.0699577331543, "learning_rate": 8e-05, "loss": 38.6001, "num_input_tokens_seen": 259096096, "step": 5028 }, { "epoch": 0.49443502616643326, "grad_norm": 26.982820510864258, "learning_rate": 8e-05, "loss": 39.4593, "num_input_tokens_seen": 259260152, "step": 5031 }, { "epoch": 0.49472985921721824, "grad_norm": 28.042970657348633, "learning_rate": 8e-05, "loss": 39.738, "num_input_tokens_seen": 259412384, "step": 5034 }, { "epoch": 0.4950246922680032, "grad_norm": 28.633056640625, "learning_rate": 8e-05, "loss": 39.8979, "num_input_tokens_seen": 259573016, "step": 5037 }, { "epoch": 0.49531952531878826, "grad_norm": 28.497529983520508, "learning_rate": 8e-05, "loss": 40.1717, "num_input_tokens_seen": 259706416, "step": 5040 }, { "epoch": 0.49561435836957324, "grad_norm": 25.843650817871094, "learning_rate": 8e-05, "loss": 39.9081, "num_input_tokens_seen": 259860744, "step": 5043 }, { "epoch": 0.4959091914203582, "grad_norm": 24.251995086669922, "learning_rate": 8e-05, "loss": 40.0976, "num_input_tokens_seen": 260015464, "step": 5046 }, { "epoch": 0.4962040244711432, "grad_norm": 32.20277786254883, "learning_rate": 8e-05, "loss": 40.7806, "num_input_tokens_seen": 260181292, "step": 5049 }, { "epoch": 0.4964988575219282, "grad_norm": 42.7421875, "learning_rate": 8e-05, "loss": 38.9794, "num_input_tokens_seen": 260325680, "step": 5052 }, { "epoch": 0.4967936905727132, "grad_norm": 25.865787506103516, "learning_rate": 8e-05, "loss": 41.3198, "num_input_tokens_seen": 260462880, "step": 5055 }, { "epoch": 0.4970885236234982, "grad_norm": 25.28655433654785, "learning_rate": 8e-05, "loss": 41.7386, "num_input_tokens_seen": 260644488, "step": 5058 }, { "epoch": 0.4973833566742832, "grad_norm": 27.9177303314209, "learning_rate": 8e-05, "loss": 41.1107, "num_input_tokens_seen": 260798904, "step": 5061 }, { "epoch": 0.49767818972506817, "grad_norm": 39.34760284423828, "learning_rate": 8e-05, "loss": 34.9837, "num_input_tokens_seen": 260955768, "step": 5064 }, { "epoch": 0.49797302277585315, "grad_norm": 29.348892211914062, "learning_rate": 8e-05, "loss": 39.5821, "num_input_tokens_seen": 261126252, "step": 5067 }, { "epoch": 0.4982678558266382, "grad_norm": 35.165401458740234, "learning_rate": 8e-05, "loss": 41.6351, "num_input_tokens_seen": 261285852, "step": 5070 }, { "epoch": 0.49856268887742317, "grad_norm": 30.85509490966797, "learning_rate": 8e-05, "loss": 40.2213, "num_input_tokens_seen": 261453024, "step": 5073 }, { "epoch": 0.49885752192820815, "grad_norm": 58.00692367553711, "learning_rate": 8e-05, "loss": 39.2461, "num_input_tokens_seen": 261609220, "step": 5076 }, { "epoch": 0.49915235497899313, "grad_norm": 27.91078758239746, "learning_rate": 8e-05, "loss": 42.8827, "num_input_tokens_seen": 261769640, "step": 5079 }, { "epoch": 0.4994471880297781, "grad_norm": 26.631685256958008, "learning_rate": 8e-05, "loss": 39.4102, "num_input_tokens_seen": 261931560, "step": 5082 }, { "epoch": 0.49974202108056315, "grad_norm": 26.376930236816406, "learning_rate": 8e-05, "loss": 41.2661, "num_input_tokens_seen": 262073004, "step": 5085 }, { "epoch": 0.5000368541313481, "grad_norm": 28.25586700439453, "learning_rate": 8e-05, "loss": 41.2577, "num_input_tokens_seen": 262208488, "step": 5088 }, { "epoch": 0.5003316871821332, "grad_norm": 23.296409606933594, "learning_rate": 8e-05, "loss": 39.7283, "num_input_tokens_seen": 262367280, "step": 5091 }, { "epoch": 0.5006265202329181, "grad_norm": 28.338619232177734, "learning_rate": 8e-05, "loss": 38.4883, "num_input_tokens_seen": 262520148, "step": 5094 }, { "epoch": 0.5009213532837031, "grad_norm": 30.3863582611084, "learning_rate": 8e-05, "loss": 43.1131, "num_input_tokens_seen": 262690632, "step": 5097 }, { "epoch": 0.5012161863344881, "grad_norm": 28.374164581298828, "learning_rate": 8e-05, "loss": 40.7878, "num_input_tokens_seen": 262837896, "step": 5100 }, { "epoch": 0.5015110193852731, "grad_norm": 25.643203735351562, "learning_rate": 8e-05, "loss": 39.6263, "num_input_tokens_seen": 263004624, "step": 5103 }, { "epoch": 0.5018058524360581, "grad_norm": 26.574125289916992, "learning_rate": 8e-05, "loss": 39.7303, "num_input_tokens_seen": 263162740, "step": 5106 }, { "epoch": 0.5021006854868431, "grad_norm": 27.768224716186523, "learning_rate": 8e-05, "loss": 40.8774, "num_input_tokens_seen": 263300304, "step": 5109 }, { "epoch": 0.5023955185376281, "grad_norm": 25.535547256469727, "learning_rate": 8e-05, "loss": 40.4186, "num_input_tokens_seen": 263434248, "step": 5112 }, { "epoch": 0.502690351588413, "grad_norm": 30.100975036621094, "learning_rate": 8e-05, "loss": 41.7356, "num_input_tokens_seen": 263587684, "step": 5115 }, { "epoch": 0.5029851846391981, "grad_norm": 28.115644454956055, "learning_rate": 8e-05, "loss": 38.5396, "num_input_tokens_seen": 263755936, "step": 5118 }, { "epoch": 0.5032800176899831, "grad_norm": 24.499034881591797, "learning_rate": 8e-05, "loss": 39.5854, "num_input_tokens_seen": 263900592, "step": 5121 }, { "epoch": 0.503574850740768, "grad_norm": 30.202110290527344, "learning_rate": 8e-05, "loss": 39.8082, "num_input_tokens_seen": 264063248, "step": 5124 }, { "epoch": 0.5038696837915531, "grad_norm": 29.38237762451172, "learning_rate": 8e-05, "loss": 37.7482, "num_input_tokens_seen": 264214796, "step": 5127 }, { "epoch": 0.504164516842338, "grad_norm": 21.479717254638672, "learning_rate": 8e-05, "loss": 39.0967, "num_input_tokens_seen": 264383492, "step": 5130 }, { "epoch": 0.504459349893123, "grad_norm": 23.876405715942383, "learning_rate": 8e-05, "loss": 41.2343, "num_input_tokens_seen": 264528208, "step": 5133 }, { "epoch": 0.5047541829439081, "grad_norm": 26.351425170898438, "learning_rate": 8e-05, "loss": 41.1, "num_input_tokens_seen": 264677424, "step": 5136 }, { "epoch": 0.505049015994693, "grad_norm": 33.22755813598633, "learning_rate": 8e-05, "loss": 42.8171, "num_input_tokens_seen": 264823504, "step": 5139 }, { "epoch": 0.505343849045478, "grad_norm": 27.761079788208008, "learning_rate": 8e-05, "loss": 40.8142, "num_input_tokens_seen": 264996748, "step": 5142 }, { "epoch": 0.505638682096263, "grad_norm": 27.130084991455078, "learning_rate": 8e-05, "loss": 38.0833, "num_input_tokens_seen": 265131520, "step": 5145 }, { "epoch": 0.505933515147048, "grad_norm": 26.463390350341797, "learning_rate": 8e-05, "loss": 41.0535, "num_input_tokens_seen": 265291396, "step": 5148 }, { "epoch": 0.506228348197833, "grad_norm": 26.453123092651367, "learning_rate": 8e-05, "loss": 40.6416, "num_input_tokens_seen": 265462212, "step": 5151 }, { "epoch": 0.506523181248618, "grad_norm": 64.61177825927734, "learning_rate": 8e-05, "loss": 41.241, "num_input_tokens_seen": 265616688, "step": 5154 }, { "epoch": 0.506818014299403, "grad_norm": 31.10032844543457, "learning_rate": 8e-05, "loss": 36.3523, "num_input_tokens_seen": 265765660, "step": 5157 }, { "epoch": 0.5071128473501879, "grad_norm": 25.7410831451416, "learning_rate": 8e-05, "loss": 38.3414, "num_input_tokens_seen": 265919712, "step": 5160 }, { "epoch": 0.507407680400973, "grad_norm": 29.47089958190918, "learning_rate": 8e-05, "loss": 41.5284, "num_input_tokens_seen": 266091432, "step": 5163 }, { "epoch": 0.507702513451758, "grad_norm": 28.647890090942383, "learning_rate": 8e-05, "loss": 41.0231, "num_input_tokens_seen": 266275472, "step": 5166 }, { "epoch": 0.5079973465025429, "grad_norm": 26.218273162841797, "learning_rate": 8e-05, "loss": 42.1872, "num_input_tokens_seen": 266422520, "step": 5169 }, { "epoch": 0.508292179553328, "grad_norm": 24.72483253479004, "learning_rate": 8e-05, "loss": 39.21, "num_input_tokens_seen": 266575292, "step": 5172 }, { "epoch": 0.5085870126041129, "grad_norm": 24.378684997558594, "learning_rate": 8e-05, "loss": 41.5883, "num_input_tokens_seen": 266727956, "step": 5175 }, { "epoch": 0.5088818456548979, "grad_norm": 26.51215362548828, "learning_rate": 8e-05, "loss": 37.2083, "num_input_tokens_seen": 266907284, "step": 5178 }, { "epoch": 0.509176678705683, "grad_norm": 62.5871696472168, "learning_rate": 8e-05, "loss": 38.3234, "num_input_tokens_seen": 267052280, "step": 5181 }, { "epoch": 0.5094715117564679, "grad_norm": 29.808427810668945, "learning_rate": 8e-05, "loss": 39.3261, "num_input_tokens_seen": 267225372, "step": 5184 }, { "epoch": 0.5097663448072529, "grad_norm": 28.35696792602539, "learning_rate": 8e-05, "loss": 39.9704, "num_input_tokens_seen": 267380868, "step": 5187 }, { "epoch": 0.5100611778580378, "grad_norm": 30.325870513916016, "learning_rate": 8e-05, "loss": 39.2605, "num_input_tokens_seen": 267536984, "step": 5190 }, { "epoch": 0.5103560109088229, "grad_norm": 41.838809967041016, "learning_rate": 8e-05, "loss": 38.4647, "num_input_tokens_seen": 267685792, "step": 5193 }, { "epoch": 0.5106508439596079, "grad_norm": 25.790443420410156, "learning_rate": 8e-05, "loss": 41.0957, "num_input_tokens_seen": 267849848, "step": 5196 }, { "epoch": 0.5109456770103928, "grad_norm": 27.69879913330078, "learning_rate": 8e-05, "loss": 37.9932, "num_input_tokens_seen": 267990528, "step": 5199 }, { "epoch": 0.5112405100611779, "grad_norm": 28.44215965270996, "learning_rate": 8e-05, "loss": 39.1879, "num_input_tokens_seen": 268122272, "step": 5202 }, { "epoch": 0.5115353431119628, "grad_norm": 27.362436294555664, "learning_rate": 8e-05, "loss": 41.4922, "num_input_tokens_seen": 268290084, "step": 5205 }, { "epoch": 0.5118301761627478, "grad_norm": 25.73969268798828, "learning_rate": 8e-05, "loss": 38.0888, "num_input_tokens_seen": 268466960, "step": 5208 }, { "epoch": 0.5121250092135329, "grad_norm": 26.495397567749023, "learning_rate": 8e-05, "loss": 34.8907, "num_input_tokens_seen": 268619264, "step": 5211 }, { "epoch": 0.5124198422643178, "grad_norm": 28.732437133789062, "learning_rate": 8e-05, "loss": 41.5123, "num_input_tokens_seen": 268765252, "step": 5214 }, { "epoch": 0.5127146753151028, "grad_norm": 28.184734344482422, "learning_rate": 8e-05, "loss": 43.1832, "num_input_tokens_seen": 268926384, "step": 5217 }, { "epoch": 0.5130095083658878, "grad_norm": 38.591148376464844, "learning_rate": 8e-05, "loss": 41.2605, "num_input_tokens_seen": 269076024, "step": 5220 }, { "epoch": 0.5133043414166728, "grad_norm": 28.382539749145508, "learning_rate": 8e-05, "loss": 40.6811, "num_input_tokens_seen": 269236980, "step": 5223 }, { "epoch": 0.5135991744674578, "grad_norm": 34.059329986572266, "learning_rate": 8e-05, "loss": 42.5639, "num_input_tokens_seen": 269408772, "step": 5226 }, { "epoch": 0.5138940075182428, "grad_norm": 43.94654083251953, "learning_rate": 8e-05, "loss": 38.8053, "num_input_tokens_seen": 269570864, "step": 5229 }, { "epoch": 0.5141888405690278, "grad_norm": 34.34612274169922, "learning_rate": 8e-05, "loss": 39.3392, "num_input_tokens_seen": 269725332, "step": 5232 }, { "epoch": 0.5144836736198127, "grad_norm": 24.64056968688965, "learning_rate": 8e-05, "loss": 35.7388, "num_input_tokens_seen": 269860884, "step": 5235 }, { "epoch": 0.5147785066705978, "grad_norm": 31.60344123840332, "learning_rate": 8e-05, "loss": 36.9823, "num_input_tokens_seen": 270002004, "step": 5238 }, { "epoch": 0.5150733397213828, "grad_norm": 37.74430465698242, "learning_rate": 8e-05, "loss": 41.4918, "num_input_tokens_seen": 270167456, "step": 5241 }, { "epoch": 0.5153681727721677, "grad_norm": 28.199970245361328, "learning_rate": 8e-05, "loss": 40.808, "num_input_tokens_seen": 270324812, "step": 5244 }, { "epoch": 0.5156630058229528, "grad_norm": 65.03779602050781, "learning_rate": 8e-05, "loss": 38.1227, "num_input_tokens_seen": 270470852, "step": 5247 }, { "epoch": 0.5159578388737377, "grad_norm": 24.968914031982422, "learning_rate": 8e-05, "loss": 40.2922, "num_input_tokens_seen": 270619848, "step": 5250 }, { "epoch": 0.5162526719245227, "grad_norm": 28.412931442260742, "learning_rate": 8e-05, "loss": 41.0161, "num_input_tokens_seen": 270776848, "step": 5253 }, { "epoch": 0.5165475049753078, "grad_norm": 30.238481521606445, "learning_rate": 8e-05, "loss": 37.7615, "num_input_tokens_seen": 270937948, "step": 5256 }, { "epoch": 0.5168423380260927, "grad_norm": 27.592735290527344, "learning_rate": 8e-05, "loss": 40.6843, "num_input_tokens_seen": 271093220, "step": 5259 }, { "epoch": 0.5171371710768777, "grad_norm": 27.805315017700195, "learning_rate": 8e-05, "loss": 38.7642, "num_input_tokens_seen": 271235880, "step": 5262 }, { "epoch": 0.5174320041276627, "grad_norm": 64.01702880859375, "learning_rate": 8e-05, "loss": 40.7117, "num_input_tokens_seen": 271373476, "step": 5265 }, { "epoch": 0.5177268371784477, "grad_norm": 29.8955135345459, "learning_rate": 8e-05, "loss": 38.2741, "num_input_tokens_seen": 271531824, "step": 5268 }, { "epoch": 0.5180216702292327, "grad_norm": 26.748109817504883, "learning_rate": 8e-05, "loss": 38.4718, "num_input_tokens_seen": 271685428, "step": 5271 }, { "epoch": 0.5183165032800177, "grad_norm": 29.331661224365234, "learning_rate": 8e-05, "loss": 40.647, "num_input_tokens_seen": 271834884, "step": 5274 }, { "epoch": 0.5186113363308027, "grad_norm": 25.69898796081543, "learning_rate": 8e-05, "loss": 45.1337, "num_input_tokens_seen": 271984356, "step": 5277 }, { "epoch": 0.5189061693815876, "grad_norm": 29.571247100830078, "learning_rate": 8e-05, "loss": 38.5389, "num_input_tokens_seen": 272155252, "step": 5280 }, { "epoch": 0.5192010024323727, "grad_norm": 32.80549240112305, "learning_rate": 8e-05, "loss": 40.5171, "num_input_tokens_seen": 272295720, "step": 5283 }, { "epoch": 0.5194958354831577, "grad_norm": 33.686519622802734, "learning_rate": 8e-05, "loss": 39.3815, "num_input_tokens_seen": 272456400, "step": 5286 }, { "epoch": 0.5197906685339426, "grad_norm": 27.436264038085938, "learning_rate": 8e-05, "loss": 40.2075, "num_input_tokens_seen": 272612176, "step": 5289 }, { "epoch": 0.5200855015847277, "grad_norm": 29.439571380615234, "learning_rate": 8e-05, "loss": 41.1885, "num_input_tokens_seen": 272764716, "step": 5292 }, { "epoch": 0.5203803346355126, "grad_norm": 29.08220672607422, "learning_rate": 8e-05, "loss": 41.7982, "num_input_tokens_seen": 272908684, "step": 5295 }, { "epoch": 0.5206751676862976, "grad_norm": 29.931116104125977, "learning_rate": 8e-05, "loss": 40.7925, "num_input_tokens_seen": 273075112, "step": 5298 }, { "epoch": 0.5209700007370827, "grad_norm": 26.853681564331055, "learning_rate": 8e-05, "loss": 37.7096, "num_input_tokens_seen": 273216512, "step": 5301 }, { "epoch": 0.5212648337878676, "grad_norm": 25.798200607299805, "learning_rate": 8e-05, "loss": 38.8098, "num_input_tokens_seen": 273352688, "step": 5304 }, { "epoch": 0.5215596668386526, "grad_norm": 28.257875442504883, "learning_rate": 8e-05, "loss": 41.5688, "num_input_tokens_seen": 273502340, "step": 5307 }, { "epoch": 0.5218544998894376, "grad_norm": 23.321849822998047, "learning_rate": 8e-05, "loss": 37.043, "num_input_tokens_seen": 273653772, "step": 5310 }, { "epoch": 0.5221493329402226, "grad_norm": 27.79505729675293, "learning_rate": 8e-05, "loss": 38.3766, "num_input_tokens_seen": 273808488, "step": 5313 }, { "epoch": 0.5224441659910076, "grad_norm": 23.351730346679688, "learning_rate": 8e-05, "loss": 41.9723, "num_input_tokens_seen": 273983832, "step": 5316 }, { "epoch": 0.5227389990417926, "grad_norm": 30.498489379882812, "learning_rate": 8e-05, "loss": 38.6851, "num_input_tokens_seen": 274136064, "step": 5319 }, { "epoch": 0.5230338320925776, "grad_norm": 27.102434158325195, "learning_rate": 8e-05, "loss": 39.6621, "num_input_tokens_seen": 274299484, "step": 5322 }, { "epoch": 0.5233286651433626, "grad_norm": 30.713741302490234, "learning_rate": 8e-05, "loss": 36.4895, "num_input_tokens_seen": 274437288, "step": 5325 }, { "epoch": 0.5236234981941476, "grad_norm": 25.068334579467773, "learning_rate": 8e-05, "loss": 34.8924, "num_input_tokens_seen": 274585476, "step": 5328 }, { "epoch": 0.5239183312449326, "grad_norm": 28.447080612182617, "learning_rate": 8e-05, "loss": 36.4545, "num_input_tokens_seen": 274737720, "step": 5331 }, { "epoch": 0.5242131642957175, "grad_norm": 26.790447235107422, "learning_rate": 8e-05, "loss": 38.516, "num_input_tokens_seen": 274876548, "step": 5334 }, { "epoch": 0.5245079973465026, "grad_norm": 31.397802352905273, "learning_rate": 8e-05, "loss": 40.8084, "num_input_tokens_seen": 275038836, "step": 5337 }, { "epoch": 0.5248028303972876, "grad_norm": 30.601938247680664, "learning_rate": 8e-05, "loss": 40.2357, "num_input_tokens_seen": 275189300, "step": 5340 }, { "epoch": 0.5250976634480725, "grad_norm": 24.533906936645508, "learning_rate": 8e-05, "loss": 40.7632, "num_input_tokens_seen": 275331292, "step": 5343 }, { "epoch": 0.5253924964988576, "grad_norm": 28.960649490356445, "learning_rate": 8e-05, "loss": 36.632, "num_input_tokens_seen": 275483224, "step": 5346 }, { "epoch": 0.5256873295496425, "grad_norm": 28.03540802001953, "learning_rate": 8e-05, "loss": 40.7449, "num_input_tokens_seen": 275641832, "step": 5349 }, { "epoch": 0.5259821626004275, "grad_norm": 25.529850006103516, "learning_rate": 8e-05, "loss": 38.6161, "num_input_tokens_seen": 275804628, "step": 5352 }, { "epoch": 0.5262769956512126, "grad_norm": 24.701250076293945, "learning_rate": 8e-05, "loss": 38.1369, "num_input_tokens_seen": 275967540, "step": 5355 }, { "epoch": 0.5265718287019975, "grad_norm": 23.570316314697266, "learning_rate": 8e-05, "loss": 32.6844, "num_input_tokens_seen": 276122968, "step": 5358 }, { "epoch": 0.5268666617527825, "grad_norm": 29.606403350830078, "learning_rate": 8e-05, "loss": 38.6557, "num_input_tokens_seen": 276278380, "step": 5361 }, { "epoch": 0.5271614948035674, "grad_norm": 25.70172882080078, "learning_rate": 8e-05, "loss": 36.8258, "num_input_tokens_seen": 276430096, "step": 5364 }, { "epoch": 0.5274563278543525, "grad_norm": 26.238239288330078, "learning_rate": 8e-05, "loss": 35.7773, "num_input_tokens_seen": 276591252, "step": 5367 }, { "epoch": 0.5277511609051375, "grad_norm": 27.975414276123047, "learning_rate": 8e-05, "loss": 36.2485, "num_input_tokens_seen": 276746740, "step": 5370 }, { "epoch": 0.5280459939559224, "grad_norm": 27.955827713012695, "learning_rate": 8e-05, "loss": 38.1719, "num_input_tokens_seen": 276897792, "step": 5373 }, { "epoch": 0.5283408270067075, "grad_norm": 29.52302360534668, "learning_rate": 8e-05, "loss": 39.6888, "num_input_tokens_seen": 277027124, "step": 5376 }, { "epoch": 0.5286356600574924, "grad_norm": 27.78631591796875, "learning_rate": 8e-05, "loss": 37.4463, "num_input_tokens_seen": 277181080, "step": 5379 }, { "epoch": 0.5289304931082774, "grad_norm": 26.559904098510742, "learning_rate": 8e-05, "loss": 38.7405, "num_input_tokens_seen": 277323696, "step": 5382 }, { "epoch": 0.5292253261590625, "grad_norm": 24.3076171875, "learning_rate": 8e-05, "loss": 39.4059, "num_input_tokens_seen": 277470088, "step": 5385 }, { "epoch": 0.5295201592098474, "grad_norm": 33.071311950683594, "learning_rate": 8e-05, "loss": 42.2808, "num_input_tokens_seen": 277613564, "step": 5388 }, { "epoch": 0.5298149922606324, "grad_norm": 36.52372360229492, "learning_rate": 8e-05, "loss": 41.923, "num_input_tokens_seen": 277801728, "step": 5391 }, { "epoch": 0.5301098253114174, "grad_norm": 24.653995513916016, "learning_rate": 8e-05, "loss": 38.6579, "num_input_tokens_seen": 277942992, "step": 5394 }, { "epoch": 0.5304046583622024, "grad_norm": 22.624168395996094, "learning_rate": 8e-05, "loss": 37.3442, "num_input_tokens_seen": 278105864, "step": 5397 }, { "epoch": 0.5306994914129874, "grad_norm": 25.598228454589844, "learning_rate": 8e-05, "loss": 41.0039, "num_input_tokens_seen": 278261296, "step": 5400 }, { "epoch": 0.5309943244637724, "grad_norm": 26.133739471435547, "learning_rate": 8e-05, "loss": 44.2222, "num_input_tokens_seen": 278411744, "step": 5403 }, { "epoch": 0.5312891575145574, "grad_norm": 26.089935302734375, "learning_rate": 8e-05, "loss": 37.4215, "num_input_tokens_seen": 278548772, "step": 5406 }, { "epoch": 0.5315839905653423, "grad_norm": 26.627443313598633, "learning_rate": 8e-05, "loss": 38.2373, "num_input_tokens_seen": 278710876, "step": 5409 }, { "epoch": 0.5318788236161274, "grad_norm": 27.520763397216797, "learning_rate": 8e-05, "loss": 44.3792, "num_input_tokens_seen": 278877056, "step": 5412 }, { "epoch": 0.5321736566669124, "grad_norm": 29.1707763671875, "learning_rate": 8e-05, "loss": 38.5313, "num_input_tokens_seen": 279032424, "step": 5415 }, { "epoch": 0.5324684897176973, "grad_norm": 31.549942016601562, "learning_rate": 8e-05, "loss": 38.1194, "num_input_tokens_seen": 279190248, "step": 5418 }, { "epoch": 0.5327633227684824, "grad_norm": 28.314420700073242, "learning_rate": 8e-05, "loss": 39.1158, "num_input_tokens_seen": 279324216, "step": 5421 }, { "epoch": 0.5330581558192673, "grad_norm": 23.590492248535156, "learning_rate": 8e-05, "loss": 39.1442, "num_input_tokens_seen": 279463144, "step": 5424 }, { "epoch": 0.5333529888700523, "grad_norm": 24.80891990661621, "learning_rate": 8e-05, "loss": 42.6141, "num_input_tokens_seen": 279603700, "step": 5427 }, { "epoch": 0.5336478219208374, "grad_norm": 29.72337532043457, "learning_rate": 8e-05, "loss": 41.3286, "num_input_tokens_seen": 279745296, "step": 5430 }, { "epoch": 0.5339426549716223, "grad_norm": 20.50883674621582, "learning_rate": 8e-05, "loss": 35.5831, "num_input_tokens_seen": 279890616, "step": 5433 }, { "epoch": 0.5342374880224073, "grad_norm": 32.3402099609375, "learning_rate": 8e-05, "loss": 35.8485, "num_input_tokens_seen": 280027824, "step": 5436 }, { "epoch": 0.5345323210731923, "grad_norm": 25.347986221313477, "learning_rate": 8e-05, "loss": 40.0463, "num_input_tokens_seen": 280194656, "step": 5439 }, { "epoch": 0.5348271541239773, "grad_norm": 24.94376564025879, "learning_rate": 8e-05, "loss": 44.0798, "num_input_tokens_seen": 280348208, "step": 5442 }, { "epoch": 0.5351219871747623, "grad_norm": 25.23859214782715, "learning_rate": 8e-05, "loss": 44.2179, "num_input_tokens_seen": 280535056, "step": 5445 }, { "epoch": 0.5354168202255473, "grad_norm": 26.1563777923584, "learning_rate": 8e-05, "loss": 36.5628, "num_input_tokens_seen": 280695328, "step": 5448 }, { "epoch": 0.5357116532763323, "grad_norm": 25.728984832763672, "learning_rate": 8e-05, "loss": 40.156, "num_input_tokens_seen": 280851680, "step": 5451 }, { "epoch": 0.5360064863271172, "grad_norm": 28.252504348754883, "learning_rate": 8e-05, "loss": 42.1762, "num_input_tokens_seen": 281001832, "step": 5454 }, { "epoch": 0.5363013193779023, "grad_norm": 22.63050079345703, "learning_rate": 8e-05, "loss": 35.0196, "num_input_tokens_seen": 281148572, "step": 5457 }, { "epoch": 0.5365961524286873, "grad_norm": 55.22769546508789, "learning_rate": 8e-05, "loss": 38.8902, "num_input_tokens_seen": 281315884, "step": 5460 }, { "epoch": 0.5368909854794722, "grad_norm": 27.13551139831543, "learning_rate": 8e-05, "loss": 36.7139, "num_input_tokens_seen": 281458016, "step": 5463 }, { "epoch": 0.5371858185302573, "grad_norm": 25.429719924926758, "learning_rate": 8e-05, "loss": 35.9278, "num_input_tokens_seen": 281624088, "step": 5466 }, { "epoch": 0.5374806515810422, "grad_norm": 28.88753318786621, "learning_rate": 8e-05, "loss": 41.2427, "num_input_tokens_seen": 281753072, "step": 5469 }, { "epoch": 0.5377754846318272, "grad_norm": 31.249370574951172, "learning_rate": 8e-05, "loss": 38.2066, "num_input_tokens_seen": 281881532, "step": 5472 }, { "epoch": 0.5380703176826123, "grad_norm": 23.7862491607666, "learning_rate": 8e-05, "loss": 41.0832, "num_input_tokens_seen": 282036740, "step": 5475 }, { "epoch": 0.5383651507333972, "grad_norm": 20.942829132080078, "learning_rate": 8e-05, "loss": 37.1708, "num_input_tokens_seen": 282192456, "step": 5478 }, { "epoch": 0.5386599837841822, "grad_norm": 24.284210205078125, "learning_rate": 8e-05, "loss": 40.6561, "num_input_tokens_seen": 282320636, "step": 5481 }, { "epoch": 0.5389548168349672, "grad_norm": 26.83125877380371, "learning_rate": 8e-05, "loss": 44.5538, "num_input_tokens_seen": 282455496, "step": 5484 }, { "epoch": 0.5392496498857522, "grad_norm": 25.163188934326172, "learning_rate": 8e-05, "loss": 38.9524, "num_input_tokens_seen": 282598584, "step": 5487 }, { "epoch": 0.5395444829365372, "grad_norm": 25.366954803466797, "learning_rate": 8e-05, "loss": 40.435, "num_input_tokens_seen": 282751360, "step": 5490 }, { "epoch": 0.5398393159873222, "grad_norm": 26.106281280517578, "learning_rate": 8e-05, "loss": 38.6259, "num_input_tokens_seen": 282882692, "step": 5493 }, { "epoch": 0.5401341490381072, "grad_norm": 24.25537872314453, "learning_rate": 8e-05, "loss": 39.2648, "num_input_tokens_seen": 283045352, "step": 5496 }, { "epoch": 0.5404289820888921, "grad_norm": 24.524158477783203, "learning_rate": 8e-05, "loss": 34.532, "num_input_tokens_seen": 283198744, "step": 5499 }, { "epoch": 0.5407238151396772, "grad_norm": 27.091638565063477, "learning_rate": 8e-05, "loss": 40.4187, "num_input_tokens_seen": 283367704, "step": 5502 }, { "epoch": 0.5410186481904622, "grad_norm": 27.984676361083984, "learning_rate": 8e-05, "loss": 39.8698, "num_input_tokens_seen": 283504976, "step": 5505 }, { "epoch": 0.5413134812412471, "grad_norm": 39.2283935546875, "learning_rate": 8e-05, "loss": 39.939, "num_input_tokens_seen": 283677932, "step": 5508 }, { "epoch": 0.5416083142920322, "grad_norm": 25.75412368774414, "learning_rate": 8e-05, "loss": 39.2238, "num_input_tokens_seen": 283826660, "step": 5511 }, { "epoch": 0.5419031473428171, "grad_norm": 25.7208194732666, "learning_rate": 8e-05, "loss": 38.1721, "num_input_tokens_seen": 283989516, "step": 5514 }, { "epoch": 0.5421979803936021, "grad_norm": 24.40972328186035, "learning_rate": 8e-05, "loss": 38.1401, "num_input_tokens_seen": 284151680, "step": 5517 }, { "epoch": 0.5424928134443872, "grad_norm": 28.160717010498047, "learning_rate": 8e-05, "loss": 37.8972, "num_input_tokens_seen": 284318576, "step": 5520 }, { "epoch": 0.5427876464951721, "grad_norm": 23.772441864013672, "learning_rate": 8e-05, "loss": 40.8021, "num_input_tokens_seen": 284479420, "step": 5523 }, { "epoch": 0.5430824795459571, "grad_norm": 26.15060043334961, "learning_rate": 8e-05, "loss": 38.6573, "num_input_tokens_seen": 284650476, "step": 5526 }, { "epoch": 0.543377312596742, "grad_norm": 22.040708541870117, "learning_rate": 8e-05, "loss": 35.2758, "num_input_tokens_seen": 284813264, "step": 5529 }, { "epoch": 0.5436721456475271, "grad_norm": 25.442060470581055, "learning_rate": 8e-05, "loss": 40.1561, "num_input_tokens_seen": 284955988, "step": 5532 }, { "epoch": 0.5439669786983121, "grad_norm": 41.13759231567383, "learning_rate": 8e-05, "loss": 37.8666, "num_input_tokens_seen": 285102048, "step": 5535 }, { "epoch": 0.544261811749097, "grad_norm": 24.975589752197266, "learning_rate": 8e-05, "loss": 42.4398, "num_input_tokens_seen": 285270928, "step": 5538 }, { "epoch": 0.5445566447998821, "grad_norm": 22.179227828979492, "learning_rate": 8e-05, "loss": 36.8113, "num_input_tokens_seen": 285419668, "step": 5541 }, { "epoch": 0.544851477850667, "grad_norm": 26.153484344482422, "learning_rate": 8e-05, "loss": 37.6961, "num_input_tokens_seen": 285605556, "step": 5544 }, { "epoch": 0.545146310901452, "grad_norm": 26.558202743530273, "learning_rate": 8e-05, "loss": 39.7822, "num_input_tokens_seen": 285758500, "step": 5547 }, { "epoch": 0.5454411439522371, "grad_norm": 26.40770149230957, "learning_rate": 8e-05, "loss": 35.4442, "num_input_tokens_seen": 285907832, "step": 5550 }, { "epoch": 0.545735977003022, "grad_norm": 24.307918548583984, "learning_rate": 8e-05, "loss": 41.7595, "num_input_tokens_seen": 286077336, "step": 5553 }, { "epoch": 0.546030810053807, "grad_norm": 32.74250411987305, "learning_rate": 8e-05, "loss": 45.9219, "num_input_tokens_seen": 286245036, "step": 5556 }, { "epoch": 0.546325643104592, "grad_norm": 32.992637634277344, "learning_rate": 8e-05, "loss": 38.9375, "num_input_tokens_seen": 286388136, "step": 5559 }, { "epoch": 0.546620476155377, "grad_norm": 49.21634292602539, "learning_rate": 8e-05, "loss": 37.6279, "num_input_tokens_seen": 286549512, "step": 5562 }, { "epoch": 0.546915309206162, "grad_norm": 24.97784996032715, "learning_rate": 8e-05, "loss": 37.8887, "num_input_tokens_seen": 286677032, "step": 5565 }, { "epoch": 0.547210142256947, "grad_norm": 28.935182571411133, "learning_rate": 8e-05, "loss": 39.9942, "num_input_tokens_seen": 286821188, "step": 5568 }, { "epoch": 0.547504975307732, "grad_norm": 345.0877990722656, "learning_rate": 8e-05, "loss": 35.3478, "num_input_tokens_seen": 286976392, "step": 5571 }, { "epoch": 0.5477998083585169, "grad_norm": 30.12696647644043, "learning_rate": 8e-05, "loss": 43.53, "num_input_tokens_seen": 287129980, "step": 5574 }, { "epoch": 0.548094641409302, "grad_norm": 45.15864944458008, "learning_rate": 8e-05, "loss": 38.9156, "num_input_tokens_seen": 287279712, "step": 5577 }, { "epoch": 0.548389474460087, "grad_norm": 22.9730224609375, "learning_rate": 8e-05, "loss": 41.7912, "num_input_tokens_seen": 287439780, "step": 5580 }, { "epoch": 0.5486843075108719, "grad_norm": 25.395217895507812, "learning_rate": 8e-05, "loss": 36.8195, "num_input_tokens_seen": 287598532, "step": 5583 }, { "epoch": 0.548979140561657, "grad_norm": 24.853181838989258, "learning_rate": 8e-05, "loss": 37.4314, "num_input_tokens_seen": 287745808, "step": 5586 }, { "epoch": 0.5492739736124419, "grad_norm": 35.19365310668945, "learning_rate": 8e-05, "loss": 42.1886, "num_input_tokens_seen": 287914132, "step": 5589 }, { "epoch": 0.5495688066632269, "grad_norm": 27.032106399536133, "learning_rate": 8e-05, "loss": 37.3116, "num_input_tokens_seen": 288060960, "step": 5592 }, { "epoch": 0.549863639714012, "grad_norm": 25.702754974365234, "learning_rate": 8e-05, "loss": 37.2868, "num_input_tokens_seen": 288202716, "step": 5595 }, { "epoch": 0.5501584727647969, "grad_norm": 30.17405891418457, "learning_rate": 8e-05, "loss": 40.0769, "num_input_tokens_seen": 288361760, "step": 5598 }, { "epoch": 0.5504533058155819, "grad_norm": 34.64820861816406, "learning_rate": 8e-05, "loss": 36.3623, "num_input_tokens_seen": 288520980, "step": 5601 }, { "epoch": 0.5507481388663669, "grad_norm": 23.259239196777344, "learning_rate": 8e-05, "loss": 37.8757, "num_input_tokens_seen": 288704772, "step": 5604 }, { "epoch": 0.5510429719171519, "grad_norm": 26.68288803100586, "learning_rate": 8e-05, "loss": 37.9769, "num_input_tokens_seen": 288846252, "step": 5607 }, { "epoch": 0.5513378049679369, "grad_norm": 27.598785400390625, "learning_rate": 8e-05, "loss": 40.1756, "num_input_tokens_seen": 288999052, "step": 5610 }, { "epoch": 0.5516326380187219, "grad_norm": 25.46784210205078, "learning_rate": 8e-05, "loss": 44.6034, "num_input_tokens_seen": 289152324, "step": 5613 }, { "epoch": 0.5519274710695069, "grad_norm": 30.939964294433594, "learning_rate": 8e-05, "loss": 42.1413, "num_input_tokens_seen": 289317628, "step": 5616 }, { "epoch": 0.5522223041202918, "grad_norm": 24.61944007873535, "learning_rate": 8e-05, "loss": 38.7557, "num_input_tokens_seen": 289449632, "step": 5619 }, { "epoch": 0.5525171371710769, "grad_norm": 24.867664337158203, "learning_rate": 8e-05, "loss": 39.5265, "num_input_tokens_seen": 289611300, "step": 5622 }, { "epoch": 0.5528119702218619, "grad_norm": 22.521526336669922, "learning_rate": 8e-05, "loss": 39.4616, "num_input_tokens_seen": 289771848, "step": 5625 }, { "epoch": 0.5531068032726468, "grad_norm": 23.784482955932617, "learning_rate": 8e-05, "loss": 38.105, "num_input_tokens_seen": 289938788, "step": 5628 }, { "epoch": 0.5534016363234319, "grad_norm": 27.707380294799805, "learning_rate": 8e-05, "loss": 37.8987, "num_input_tokens_seen": 290092404, "step": 5631 }, { "epoch": 0.5536964693742169, "grad_norm": 23.659029006958008, "learning_rate": 8e-05, "loss": 37.5206, "num_input_tokens_seen": 290262368, "step": 5634 }, { "epoch": 0.5539913024250018, "grad_norm": 33.99428176879883, "learning_rate": 8e-05, "loss": 39.2867, "num_input_tokens_seen": 290424784, "step": 5637 }, { "epoch": 0.5542861354757869, "grad_norm": 29.01900291442871, "learning_rate": 8e-05, "loss": 36.2682, "num_input_tokens_seen": 290579540, "step": 5640 }, { "epoch": 0.5545809685265718, "grad_norm": 28.39067268371582, "learning_rate": 8e-05, "loss": 38.8032, "num_input_tokens_seen": 290725264, "step": 5643 }, { "epoch": 0.5548758015773568, "grad_norm": 28.794584274291992, "learning_rate": 8e-05, "loss": 42.0969, "num_input_tokens_seen": 290897136, "step": 5646 }, { "epoch": 0.5551706346281419, "grad_norm": 88.10995483398438, "learning_rate": 8e-05, "loss": 39.5867, "num_input_tokens_seen": 291060408, "step": 5649 }, { "epoch": 0.5554654676789268, "grad_norm": 44.05754089355469, "learning_rate": 8e-05, "loss": 39.5617, "num_input_tokens_seen": 291232472, "step": 5652 }, { "epoch": 0.5557603007297118, "grad_norm": 28.9007625579834, "learning_rate": 8e-05, "loss": 39.7402, "num_input_tokens_seen": 291382420, "step": 5655 }, { "epoch": 0.5560551337804968, "grad_norm": 32.80019760131836, "learning_rate": 8e-05, "loss": 41.9413, "num_input_tokens_seen": 291516656, "step": 5658 }, { "epoch": 0.5563499668312818, "grad_norm": 47.72719955444336, "learning_rate": 8e-05, "loss": 33.9347, "num_input_tokens_seen": 291667636, "step": 5661 }, { "epoch": 0.5566447998820668, "grad_norm": 66.66146850585938, "learning_rate": 8e-05, "loss": 43.2117, "num_input_tokens_seen": 291800104, "step": 5664 }, { "epoch": 0.5569396329328518, "grad_norm": 25.505979537963867, "learning_rate": 8e-05, "loss": 38.9609, "num_input_tokens_seen": 291938656, "step": 5667 }, { "epoch": 0.5572344659836368, "grad_norm": 27.359821319580078, "learning_rate": 8e-05, "loss": 35.8896, "num_input_tokens_seen": 292071348, "step": 5670 }, { "epoch": 0.5575292990344217, "grad_norm": 27.773998260498047, "learning_rate": 8e-05, "loss": 43.5044, "num_input_tokens_seen": 292231860, "step": 5673 }, { "epoch": 0.5578241320852068, "grad_norm": 29.372474670410156, "learning_rate": 8e-05, "loss": 34.7316, "num_input_tokens_seen": 292390360, "step": 5676 }, { "epoch": 0.5581189651359918, "grad_norm": 32.9428596496582, "learning_rate": 8e-05, "loss": 42.433, "num_input_tokens_seen": 292555324, "step": 5679 }, { "epoch": 0.5584137981867767, "grad_norm": 26.41676139831543, "learning_rate": 8e-05, "loss": 36.0981, "num_input_tokens_seen": 292720044, "step": 5682 }, { "epoch": 0.5587086312375618, "grad_norm": 24.948963165283203, "learning_rate": 8e-05, "loss": 41.193, "num_input_tokens_seen": 292871312, "step": 5685 }, { "epoch": 0.5590034642883467, "grad_norm": 26.889123916625977, "learning_rate": 8e-05, "loss": 40.1803, "num_input_tokens_seen": 293044880, "step": 5688 }, { "epoch": 0.5592982973391317, "grad_norm": 26.73687744140625, "learning_rate": 8e-05, "loss": 39.5284, "num_input_tokens_seen": 293182688, "step": 5691 }, { "epoch": 0.5595931303899168, "grad_norm": 24.835887908935547, "learning_rate": 8e-05, "loss": 38.3529, "num_input_tokens_seen": 293343424, "step": 5694 }, { "epoch": 0.5598879634407017, "grad_norm": 32.0245361328125, "learning_rate": 8e-05, "loss": 39.3238, "num_input_tokens_seen": 293502672, "step": 5697 }, { "epoch": 0.5601827964914867, "grad_norm": 30.10677146911621, "learning_rate": 8e-05, "loss": 40.3795, "num_input_tokens_seen": 293684264, "step": 5700 }, { "epoch": 0.5604776295422716, "grad_norm": 32.71257400512695, "learning_rate": 8e-05, "loss": 41.1775, "num_input_tokens_seen": 293870016, "step": 5703 }, { "epoch": 0.5607724625930567, "grad_norm": 21.719552993774414, "learning_rate": 8e-05, "loss": 35.4193, "num_input_tokens_seen": 294033016, "step": 5706 }, { "epoch": 0.5610672956438417, "grad_norm": 53.36152267456055, "learning_rate": 8e-05, "loss": 37.5693, "num_input_tokens_seen": 294184876, "step": 5709 }, { "epoch": 0.5613621286946266, "grad_norm": 36.04158020019531, "learning_rate": 8e-05, "loss": 35.0886, "num_input_tokens_seen": 294352844, "step": 5712 }, { "epoch": 0.5616569617454117, "grad_norm": 22.147815704345703, "learning_rate": 8e-05, "loss": 37.5072, "num_input_tokens_seen": 294515272, "step": 5715 }, { "epoch": 0.5619517947961966, "grad_norm": 30.99330711364746, "learning_rate": 8e-05, "loss": 41.2776, "num_input_tokens_seen": 294669808, "step": 5718 }, { "epoch": 0.5622466278469817, "grad_norm": 41.663692474365234, "learning_rate": 8e-05, "loss": 41.1665, "num_input_tokens_seen": 294833380, "step": 5721 }, { "epoch": 0.5625414608977667, "grad_norm": 24.908546447753906, "learning_rate": 8e-05, "loss": 37.6573, "num_input_tokens_seen": 294981576, "step": 5724 }, { "epoch": 0.5628362939485516, "grad_norm": 28.312366485595703, "learning_rate": 8e-05, "loss": 44.9366, "num_input_tokens_seen": 295159120, "step": 5727 }, { "epoch": 0.5631311269993367, "grad_norm": 32.46244812011719, "learning_rate": 8e-05, "loss": 32.9336, "num_input_tokens_seen": 295319032, "step": 5730 }, { "epoch": 0.5634259600501216, "grad_norm": 27.558486938476562, "learning_rate": 8e-05, "loss": 33.7036, "num_input_tokens_seen": 295485416, "step": 5733 }, { "epoch": 0.5637207931009066, "grad_norm": 35.52197265625, "learning_rate": 8e-05, "loss": 37.3093, "num_input_tokens_seen": 295622688, "step": 5736 }, { "epoch": 0.5640156261516917, "grad_norm": 26.202285766601562, "learning_rate": 8e-05, "loss": 36.7879, "num_input_tokens_seen": 295773952, "step": 5739 }, { "epoch": 0.5643104592024766, "grad_norm": 26.322406768798828, "learning_rate": 8e-05, "loss": 43.3144, "num_input_tokens_seen": 295932736, "step": 5742 }, { "epoch": 0.5646052922532616, "grad_norm": 35.22903823852539, "learning_rate": 8e-05, "loss": 40.8689, "num_input_tokens_seen": 296092228, "step": 5745 }, { "epoch": 0.5649001253040465, "grad_norm": 25.881752014160156, "learning_rate": 8e-05, "loss": 41.6622, "num_input_tokens_seen": 296261440, "step": 5748 }, { "epoch": 0.5651949583548316, "grad_norm": 25.221384048461914, "learning_rate": 8e-05, "loss": 39.834, "num_input_tokens_seen": 296402096, "step": 5751 }, { "epoch": 0.5654897914056166, "grad_norm": 46.59519958496094, "learning_rate": 8e-05, "loss": 36.7, "num_input_tokens_seen": 296558864, "step": 5754 }, { "epoch": 0.5657846244564015, "grad_norm": 25.258193969726562, "learning_rate": 8e-05, "loss": 39.4638, "num_input_tokens_seen": 296705336, "step": 5757 }, { "epoch": 0.5660794575071866, "grad_norm": 26.750333786010742, "learning_rate": 8e-05, "loss": 41.9382, "num_input_tokens_seen": 296854004, "step": 5760 }, { "epoch": 0.5663742905579715, "grad_norm": 28.431350708007812, "learning_rate": 8e-05, "loss": 39.838, "num_input_tokens_seen": 297019044, "step": 5763 }, { "epoch": 0.5666691236087565, "grad_norm": 27.761972427368164, "learning_rate": 8e-05, "loss": 35.0378, "num_input_tokens_seen": 297178948, "step": 5766 }, { "epoch": 0.5669639566595416, "grad_norm": 34.624732971191406, "learning_rate": 8e-05, "loss": 37.0351, "num_input_tokens_seen": 297351116, "step": 5769 }, { "epoch": 0.5672587897103265, "grad_norm": 24.31560707092285, "learning_rate": 8e-05, "loss": 37.4997, "num_input_tokens_seen": 297504184, "step": 5772 }, { "epoch": 0.5675536227611115, "grad_norm": 24.228517532348633, "learning_rate": 8e-05, "loss": 36.6194, "num_input_tokens_seen": 297677124, "step": 5775 }, { "epoch": 0.5678484558118965, "grad_norm": 39.17292785644531, "learning_rate": 8e-05, "loss": 39.9698, "num_input_tokens_seen": 297827896, "step": 5778 }, { "epoch": 0.5681432888626815, "grad_norm": 25.8333683013916, "learning_rate": 8e-05, "loss": 37.8598, "num_input_tokens_seen": 297975784, "step": 5781 }, { "epoch": 0.5684381219134665, "grad_norm": 30.754497528076172, "learning_rate": 8e-05, "loss": 35.6395, "num_input_tokens_seen": 298138908, "step": 5784 }, { "epoch": 0.5687329549642515, "grad_norm": 26.73647689819336, "learning_rate": 8e-05, "loss": 39.9627, "num_input_tokens_seen": 298317840, "step": 5787 }, { "epoch": 0.5690277880150365, "grad_norm": 128.11517333984375, "learning_rate": 8e-05, "loss": 38.1483, "num_input_tokens_seen": 298477572, "step": 5790 }, { "epoch": 0.5693226210658214, "grad_norm": 26.494300842285156, "learning_rate": 8e-05, "loss": 36.4683, "num_input_tokens_seen": 298633940, "step": 5793 }, { "epoch": 0.5696174541166065, "grad_norm": 62.236572265625, "learning_rate": 8e-05, "loss": 38.026, "num_input_tokens_seen": 298784060, "step": 5796 }, { "epoch": 0.5699122871673915, "grad_norm": 42.91933822631836, "learning_rate": 8e-05, "loss": 42.2562, "num_input_tokens_seen": 298958820, "step": 5799 }, { "epoch": 0.5702071202181764, "grad_norm": 50.08773422241211, "learning_rate": 8e-05, "loss": 41.6174, "num_input_tokens_seen": 299099592, "step": 5802 }, { "epoch": 0.5705019532689615, "grad_norm": 53.82916259765625, "learning_rate": 8e-05, "loss": 37.39, "num_input_tokens_seen": 299235808, "step": 5805 }, { "epoch": 0.5707967863197464, "grad_norm": 55.40131378173828, "learning_rate": 8e-05, "loss": 34.8677, "num_input_tokens_seen": 299383952, "step": 5808 }, { "epoch": 0.5710916193705314, "grad_norm": 37.975929260253906, "learning_rate": 8e-05, "loss": 41.1252, "num_input_tokens_seen": 299550972, "step": 5811 }, { "epoch": 0.5713864524213165, "grad_norm": 26.348045349121094, "learning_rate": 8e-05, "loss": 40.0975, "num_input_tokens_seen": 299721044, "step": 5814 }, { "epoch": 0.5716812854721014, "grad_norm": 80.32865905761719, "learning_rate": 8e-05, "loss": 33.7063, "num_input_tokens_seen": 299875096, "step": 5817 }, { "epoch": 0.5719761185228864, "grad_norm": 25.530847549438477, "learning_rate": 8e-05, "loss": 36.4637, "num_input_tokens_seen": 300014580, "step": 5820 }, { "epoch": 0.5722709515736714, "grad_norm": 29.553768157958984, "learning_rate": 8e-05, "loss": 40.6987, "num_input_tokens_seen": 300195144, "step": 5823 }, { "epoch": 0.5725657846244564, "grad_norm": 61.815162658691406, "learning_rate": 8e-05, "loss": 40.1162, "num_input_tokens_seen": 300351100, "step": 5826 }, { "epoch": 0.5728606176752414, "grad_norm": 29.161407470703125, "learning_rate": 8e-05, "loss": 40.9905, "num_input_tokens_seen": 300521468, "step": 5829 }, { "epoch": 0.5731554507260264, "grad_norm": 27.665782928466797, "learning_rate": 8e-05, "loss": 39.3531, "num_input_tokens_seen": 300677212, "step": 5832 }, { "epoch": 0.5734502837768114, "grad_norm": 27.592004776000977, "learning_rate": 8e-05, "loss": 37.4998, "num_input_tokens_seen": 300817448, "step": 5835 }, { "epoch": 0.5737451168275963, "grad_norm": 27.71179962158203, "learning_rate": 8e-05, "loss": 38.4142, "num_input_tokens_seen": 300989444, "step": 5838 }, { "epoch": 0.5740399498783814, "grad_norm": 34.622825622558594, "learning_rate": 8e-05, "loss": 40.1114, "num_input_tokens_seen": 301118496, "step": 5841 }, { "epoch": 0.5743347829291664, "grad_norm": 24.764328002929688, "learning_rate": 8e-05, "loss": 41.068, "num_input_tokens_seen": 301279716, "step": 5844 }, { "epoch": 0.5746296159799513, "grad_norm": 30.744380950927734, "learning_rate": 8e-05, "loss": 36.6774, "num_input_tokens_seen": 301427464, "step": 5847 }, { "epoch": 0.5749244490307364, "grad_norm": 200.72665405273438, "learning_rate": 8e-05, "loss": 40.9329, "num_input_tokens_seen": 301587056, "step": 5850 }, { "epoch": 0.5752192820815213, "grad_norm": 31.990041732788086, "learning_rate": 8e-05, "loss": 32.8286, "num_input_tokens_seen": 301730364, "step": 5853 }, { "epoch": 0.5755141151323063, "grad_norm": 29.088926315307617, "learning_rate": 8e-05, "loss": 38.0881, "num_input_tokens_seen": 301900344, "step": 5856 }, { "epoch": 0.5758089481830914, "grad_norm": 142.562255859375, "learning_rate": 8e-05, "loss": 41.9923, "num_input_tokens_seen": 302048480, "step": 5859 }, { "epoch": 0.5761037812338763, "grad_norm": 34.03981399536133, "learning_rate": 8e-05, "loss": 40.1016, "num_input_tokens_seen": 302207504, "step": 5862 }, { "epoch": 0.5763986142846613, "grad_norm": 26.095165252685547, "learning_rate": 8e-05, "loss": 35.2068, "num_input_tokens_seen": 302378048, "step": 5865 }, { "epoch": 0.5766934473354463, "grad_norm": 26.977977752685547, "learning_rate": 8e-05, "loss": 39.1896, "num_input_tokens_seen": 302527312, "step": 5868 }, { "epoch": 0.5769882803862313, "grad_norm": 36.75228500366211, "learning_rate": 8e-05, "loss": 36.731, "num_input_tokens_seen": 302663536, "step": 5871 }, { "epoch": 0.5772831134370163, "grad_norm": 51.440711975097656, "learning_rate": 8e-05, "loss": 38.3708, "num_input_tokens_seen": 302832764, "step": 5874 }, { "epoch": 0.5775779464878013, "grad_norm": 24.259418487548828, "learning_rate": 8e-05, "loss": 36.0445, "num_input_tokens_seen": 302984212, "step": 5877 }, { "epoch": 0.5778727795385863, "grad_norm": 69.90709686279297, "learning_rate": 8e-05, "loss": 38.2045, "num_input_tokens_seen": 303133324, "step": 5880 }, { "epoch": 0.5781676125893712, "grad_norm": 26.804569244384766, "learning_rate": 8e-05, "loss": 37.7684, "num_input_tokens_seen": 303275508, "step": 5883 }, { "epoch": 0.5784624456401563, "grad_norm": 27.848865509033203, "learning_rate": 8e-05, "loss": 36.8484, "num_input_tokens_seen": 303443396, "step": 5886 }, { "epoch": 0.5787572786909413, "grad_norm": 22.267452239990234, "learning_rate": 8e-05, "loss": 38.3815, "num_input_tokens_seen": 303595712, "step": 5889 }, { "epoch": 0.5790521117417262, "grad_norm": 41.911293029785156, "learning_rate": 8e-05, "loss": 42.0451, "num_input_tokens_seen": 303737876, "step": 5892 }, { "epoch": 0.5793469447925113, "grad_norm": 32.45378494262695, "learning_rate": 8e-05, "loss": 37.216, "num_input_tokens_seen": 303900824, "step": 5895 }, { "epoch": 0.5796417778432962, "grad_norm": 40.52968215942383, "learning_rate": 8e-05, "loss": 39.0123, "num_input_tokens_seen": 304048528, "step": 5898 }, { "epoch": 0.5799366108940812, "grad_norm": 39.96379470825195, "learning_rate": 8e-05, "loss": 38.7395, "num_input_tokens_seen": 304218184, "step": 5901 }, { "epoch": 0.5802314439448663, "grad_norm": 26.308931350708008, "learning_rate": 8e-05, "loss": 40.2331, "num_input_tokens_seen": 304377180, "step": 5904 }, { "epoch": 0.5805262769956512, "grad_norm": 26.30266761779785, "learning_rate": 8e-05, "loss": 37.2614, "num_input_tokens_seen": 304539968, "step": 5907 }, { "epoch": 0.5808211100464362, "grad_norm": 26.29747200012207, "learning_rate": 8e-05, "loss": 41.3342, "num_input_tokens_seen": 304706308, "step": 5910 }, { "epoch": 0.5811159430972211, "grad_norm": 27.593076705932617, "learning_rate": 8e-05, "loss": 38.9741, "num_input_tokens_seen": 304848948, "step": 5913 }, { "epoch": 0.5814107761480062, "grad_norm": 25.446651458740234, "learning_rate": 8e-05, "loss": 37.807, "num_input_tokens_seen": 304998176, "step": 5916 }, { "epoch": 0.5817056091987912, "grad_norm": 29.21063995361328, "learning_rate": 8e-05, "loss": 36.8093, "num_input_tokens_seen": 305159976, "step": 5919 }, { "epoch": 0.5820004422495761, "grad_norm": 34.09979248046875, "learning_rate": 8e-05, "loss": 36.7409, "num_input_tokens_seen": 305308808, "step": 5922 }, { "epoch": 0.5822952753003612, "grad_norm": 32.126407623291016, "learning_rate": 8e-05, "loss": 38.0265, "num_input_tokens_seen": 305461308, "step": 5925 }, { "epoch": 0.5825901083511462, "grad_norm": 27.993932723999023, "learning_rate": 8e-05, "loss": 42.3727, "num_input_tokens_seen": 305601456, "step": 5928 }, { "epoch": 0.5828849414019311, "grad_norm": 32.2390022277832, "learning_rate": 8e-05, "loss": 39.7092, "num_input_tokens_seen": 305746720, "step": 5931 }, { "epoch": 0.5831797744527162, "grad_norm": 77.48739624023438, "learning_rate": 8e-05, "loss": 34.6095, "num_input_tokens_seen": 305876036, "step": 5934 }, { "epoch": 0.5834746075035011, "grad_norm": 130.36790466308594, "learning_rate": 8e-05, "loss": 33.3341, "num_input_tokens_seen": 306028364, "step": 5937 }, { "epoch": 0.5837694405542861, "grad_norm": 76.7963638305664, "learning_rate": 8e-05, "loss": 37.3014, "num_input_tokens_seen": 306192280, "step": 5940 }, { "epoch": 0.5840642736050712, "grad_norm": 30.70940399169922, "learning_rate": 8e-05, "loss": 36.3198, "num_input_tokens_seen": 306329048, "step": 5943 }, { "epoch": 0.5843591066558561, "grad_norm": 61.189674377441406, "learning_rate": 8e-05, "loss": 43.0917, "num_input_tokens_seen": 306471744, "step": 5946 }, { "epoch": 0.5846539397066411, "grad_norm": 31.423398971557617, "learning_rate": 8e-05, "loss": 39.6598, "num_input_tokens_seen": 306616996, "step": 5949 }, { "epoch": 0.5849487727574261, "grad_norm": 283.5697021484375, "learning_rate": 8e-05, "loss": 40.6435, "num_input_tokens_seen": 306763840, "step": 5952 }, { "epoch": 0.5852436058082111, "grad_norm": 42.0274543762207, "learning_rate": 8e-05, "loss": 38.9727, "num_input_tokens_seen": 306925976, "step": 5955 }, { "epoch": 0.5855384388589961, "grad_norm": 29.649784088134766, "learning_rate": 8e-05, "loss": 37.5955, "num_input_tokens_seen": 307096356, "step": 5958 }, { "epoch": 0.5858332719097811, "grad_norm": 34.0390625, "learning_rate": 8e-05, "loss": 41.1632, "num_input_tokens_seen": 307248556, "step": 5961 }, { "epoch": 0.5861281049605661, "grad_norm": 36.18404769897461, "learning_rate": 8e-05, "loss": 36.7726, "num_input_tokens_seen": 307399908, "step": 5964 }, { "epoch": 0.586422938011351, "grad_norm": 27.936336517333984, "learning_rate": 8e-05, "loss": 35.0917, "num_input_tokens_seen": 307577524, "step": 5967 }, { "epoch": 0.5867177710621361, "grad_norm": 25.70185661315918, "learning_rate": 8e-05, "loss": 40.0438, "num_input_tokens_seen": 307716788, "step": 5970 }, { "epoch": 0.5870126041129211, "grad_norm": 32.73580551147461, "learning_rate": 8e-05, "loss": 37.6364, "num_input_tokens_seen": 307876480, "step": 5973 }, { "epoch": 0.587307437163706, "grad_norm": 34.62762451171875, "learning_rate": 8e-05, "loss": 39.1228, "num_input_tokens_seen": 308014456, "step": 5976 }, { "epoch": 0.5876022702144911, "grad_norm": 46.6585693359375, "learning_rate": 8e-05, "loss": 34.5612, "num_input_tokens_seen": 308170444, "step": 5979 }, { "epoch": 0.587897103265276, "grad_norm": 29.657745361328125, "learning_rate": 8e-05, "loss": 43.7115, "num_input_tokens_seen": 308332068, "step": 5982 }, { "epoch": 0.588191936316061, "grad_norm": 30.494150161743164, "learning_rate": 8e-05, "loss": 38.0942, "num_input_tokens_seen": 308493208, "step": 5985 }, { "epoch": 0.5884867693668461, "grad_norm": 31.967741012573242, "learning_rate": 8e-05, "loss": 40.2974, "num_input_tokens_seen": 308640716, "step": 5988 }, { "epoch": 0.588781602417631, "grad_norm": 29.766223907470703, "learning_rate": 8e-05, "loss": 39.3607, "num_input_tokens_seen": 308776512, "step": 5991 }, { "epoch": 0.589076435468416, "grad_norm": 51.61750793457031, "learning_rate": 8e-05, "loss": 36.1088, "num_input_tokens_seen": 308914184, "step": 5994 }, { "epoch": 0.589371268519201, "grad_norm": 38.873905181884766, "learning_rate": 8e-05, "loss": 34.3019, "num_input_tokens_seen": 309069328, "step": 5997 }, { "epoch": 0.589666101569986, "grad_norm": 62.840354919433594, "learning_rate": 8e-05, "loss": 41.428, "num_input_tokens_seen": 309218384, "step": 6000 }, { "epoch": 0.589666101569986, "eval_gen_len": 35.03, "eval_loss": 2.4841418266296387, "eval_rouge1": 44.7711, "eval_rouge2": 28.0903, "eval_rougeL": 40.7346, "eval_rougeLsum": 40.9658, "eval_runtime": 111.847, "eval_samples_per_second": 1.788, "eval_steps_per_second": 0.447, "num_input_tokens_seen": 309218384, "step": 6000 }, { "epoch": 0.589960934620771, "grad_norm": 29.889745712280273, "learning_rate": 8e-05, "loss": 39.2844, "num_input_tokens_seen": 309394920, "step": 6003 }, { "epoch": 0.590255767671556, "grad_norm": 27.655027389526367, "learning_rate": 8e-05, "loss": 36.9213, "num_input_tokens_seen": 309554232, "step": 6006 }, { "epoch": 0.590550600722341, "grad_norm": 29.12769317626953, "learning_rate": 8e-05, "loss": 36.2834, "num_input_tokens_seen": 309700692, "step": 6009 }, { "epoch": 0.5908454337731259, "grad_norm": 30.606224060058594, "learning_rate": 8e-05, "loss": 41.0752, "num_input_tokens_seen": 309863240, "step": 6012 }, { "epoch": 0.591140266823911, "grad_norm": 26.596939086914062, "learning_rate": 8e-05, "loss": 39.0024, "num_input_tokens_seen": 310023020, "step": 6015 }, { "epoch": 0.591435099874696, "grad_norm": 32.189170837402344, "learning_rate": 8e-05, "loss": 40.5805, "num_input_tokens_seen": 310161712, "step": 6018 }, { "epoch": 0.5917299329254809, "grad_norm": 24.282939910888672, "learning_rate": 8e-05, "loss": 37.5122, "num_input_tokens_seen": 310313060, "step": 6021 }, { "epoch": 0.592024765976266, "grad_norm": 26.229461669921875, "learning_rate": 8e-05, "loss": 38.3322, "num_input_tokens_seen": 310472164, "step": 6024 }, { "epoch": 0.5923195990270509, "grad_norm": 24.86408805847168, "learning_rate": 8e-05, "loss": 39.9136, "num_input_tokens_seen": 310640152, "step": 6027 }, { "epoch": 0.5926144320778359, "grad_norm": 56.45329666137695, "learning_rate": 8e-05, "loss": 36.4206, "num_input_tokens_seen": 310796800, "step": 6030 }, { "epoch": 0.592909265128621, "grad_norm": 26.79425048828125, "learning_rate": 8e-05, "loss": 39.6583, "num_input_tokens_seen": 310954172, "step": 6033 }, { "epoch": 0.5932040981794059, "grad_norm": 25.8710880279541, "learning_rate": 8e-05, "loss": 36.8764, "num_input_tokens_seen": 311108384, "step": 6036 }, { "epoch": 0.5934989312301909, "grad_norm": 39.8235969543457, "learning_rate": 8e-05, "loss": 40.9173, "num_input_tokens_seen": 311274412, "step": 6039 }, { "epoch": 0.5937937642809759, "grad_norm": 27.080610275268555, "learning_rate": 8e-05, "loss": 39.3676, "num_input_tokens_seen": 311438424, "step": 6042 }, { "epoch": 0.5940885973317609, "grad_norm": 28.4052677154541, "learning_rate": 8e-05, "loss": 36.2356, "num_input_tokens_seen": 311595236, "step": 6045 }, { "epoch": 0.5943834303825459, "grad_norm": 24.13104820251465, "learning_rate": 8e-05, "loss": 34.4232, "num_input_tokens_seen": 311741048, "step": 6048 }, { "epoch": 0.5946782634333309, "grad_norm": 24.363216400146484, "learning_rate": 8e-05, "loss": 38.3822, "num_input_tokens_seen": 311916556, "step": 6051 }, { "epoch": 0.5949730964841159, "grad_norm": 26.450504302978516, "learning_rate": 8e-05, "loss": 42.0056, "num_input_tokens_seen": 312087284, "step": 6054 }, { "epoch": 0.5952679295349008, "grad_norm": 25.021625518798828, "learning_rate": 8e-05, "loss": 34.846, "num_input_tokens_seen": 312239932, "step": 6057 }, { "epoch": 0.5955627625856859, "grad_norm": 41.91618728637695, "learning_rate": 8e-05, "loss": 34.7107, "num_input_tokens_seen": 312380608, "step": 6060 }, { "epoch": 0.5958575956364709, "grad_norm": 28.016080856323242, "learning_rate": 8e-05, "loss": 35.2353, "num_input_tokens_seen": 312542388, "step": 6063 }, { "epoch": 0.5961524286872558, "grad_norm": 29.0964412689209, "learning_rate": 8e-05, "loss": 38.4793, "num_input_tokens_seen": 312704672, "step": 6066 }, { "epoch": 0.5964472617380409, "grad_norm": 29.767629623413086, "learning_rate": 8e-05, "loss": 41.6059, "num_input_tokens_seen": 312866032, "step": 6069 }, { "epoch": 0.5967420947888258, "grad_norm": 27.889211654663086, "learning_rate": 8e-05, "loss": 37.9207, "num_input_tokens_seen": 313003224, "step": 6072 }, { "epoch": 0.5970369278396108, "grad_norm": 25.92238998413086, "learning_rate": 8e-05, "loss": 37.741, "num_input_tokens_seen": 313140552, "step": 6075 }, { "epoch": 0.5973317608903959, "grad_norm": 31.18004035949707, "learning_rate": 8e-05, "loss": 40.4069, "num_input_tokens_seen": 313296652, "step": 6078 }, { "epoch": 0.5976265939411808, "grad_norm": 31.6116943359375, "learning_rate": 8e-05, "loss": 39.7078, "num_input_tokens_seen": 313443404, "step": 6081 }, { "epoch": 0.5979214269919658, "grad_norm": 33.67839050292969, "learning_rate": 8e-05, "loss": 40.8099, "num_input_tokens_seen": 313616320, "step": 6084 }, { "epoch": 0.5982162600427507, "grad_norm": 25.062952041625977, "learning_rate": 8e-05, "loss": 39.9874, "num_input_tokens_seen": 313772216, "step": 6087 }, { "epoch": 0.5985110930935358, "grad_norm": 23.47759246826172, "learning_rate": 8e-05, "loss": 37.2121, "num_input_tokens_seen": 313914312, "step": 6090 }, { "epoch": 0.5988059261443208, "grad_norm": 23.165061950683594, "learning_rate": 8e-05, "loss": 36.6471, "num_input_tokens_seen": 314070528, "step": 6093 }, { "epoch": 0.5991007591951057, "grad_norm": 26.055076599121094, "learning_rate": 8e-05, "loss": 38.3907, "num_input_tokens_seen": 314206052, "step": 6096 }, { "epoch": 0.5993955922458908, "grad_norm": 30.39374351501465, "learning_rate": 8e-05, "loss": 36.5309, "num_input_tokens_seen": 314366628, "step": 6099 }, { "epoch": 0.5996904252966757, "grad_norm": 25.340938568115234, "learning_rate": 8e-05, "loss": 35.0818, "num_input_tokens_seen": 314491676, "step": 6102 }, { "epoch": 0.5999852583474607, "grad_norm": 27.407005310058594, "learning_rate": 8e-05, "loss": 38.7906, "num_input_tokens_seen": 314658508, "step": 6105 }, { "epoch": 0.6002800913982458, "grad_norm": 35.49182891845703, "learning_rate": 8e-05, "loss": 39.3374, "num_input_tokens_seen": 314821812, "step": 6108 }, { "epoch": 0.6005749244490307, "grad_norm": 30.370254516601562, "learning_rate": 8e-05, "loss": 39.6636, "num_input_tokens_seen": 314976568, "step": 6111 }, { "epoch": 0.6008697574998157, "grad_norm": 25.774280548095703, "learning_rate": 8e-05, "loss": 36.3847, "num_input_tokens_seen": 315122416, "step": 6114 }, { "epoch": 0.6011645905506007, "grad_norm": 29.883007049560547, "learning_rate": 8e-05, "loss": 42.0746, "num_input_tokens_seen": 315307612, "step": 6117 }, { "epoch": 0.6014594236013857, "grad_norm": 25.592613220214844, "learning_rate": 8e-05, "loss": 38.5165, "num_input_tokens_seen": 315455472, "step": 6120 }, { "epoch": 0.6017542566521707, "grad_norm": 24.037139892578125, "learning_rate": 8e-05, "loss": 36.9325, "num_input_tokens_seen": 315587648, "step": 6123 }, { "epoch": 0.6020490897029557, "grad_norm": 26.017288208007812, "learning_rate": 8e-05, "loss": 38.6846, "num_input_tokens_seen": 315739604, "step": 6126 }, { "epoch": 0.6023439227537407, "grad_norm": 25.019466400146484, "learning_rate": 8e-05, "loss": 33.4267, "num_input_tokens_seen": 315901624, "step": 6129 }, { "epoch": 0.6026387558045256, "grad_norm": 44.18895721435547, "learning_rate": 8e-05, "loss": 39.0203, "num_input_tokens_seen": 316061572, "step": 6132 }, { "epoch": 0.6029335888553107, "grad_norm": 29.336572647094727, "learning_rate": 8e-05, "loss": 37.8433, "num_input_tokens_seen": 316219240, "step": 6135 }, { "epoch": 0.6032284219060957, "grad_norm": 54.04864501953125, "learning_rate": 8e-05, "loss": 39.3524, "num_input_tokens_seen": 316358264, "step": 6138 }, { "epoch": 0.6035232549568806, "grad_norm": 59.748313903808594, "learning_rate": 8e-05, "loss": 41.2101, "num_input_tokens_seen": 316524004, "step": 6141 }, { "epoch": 0.6038180880076657, "grad_norm": 27.957366943359375, "learning_rate": 8e-05, "loss": 35.7421, "num_input_tokens_seen": 316698732, "step": 6144 }, { "epoch": 0.6041129210584506, "grad_norm": 25.74254035949707, "learning_rate": 8e-05, "loss": 37.5683, "num_input_tokens_seen": 316845568, "step": 6147 }, { "epoch": 0.6044077541092356, "grad_norm": 22.848718643188477, "learning_rate": 8e-05, "loss": 39.1254, "num_input_tokens_seen": 317006924, "step": 6150 }, { "epoch": 0.6047025871600207, "grad_norm": 23.866653442382812, "learning_rate": 8e-05, "loss": 34.2666, "num_input_tokens_seen": 317151316, "step": 6153 }, { "epoch": 0.6049974202108056, "grad_norm": 26.67428207397461, "learning_rate": 8e-05, "loss": 38.899, "num_input_tokens_seen": 317308624, "step": 6156 }, { "epoch": 0.6052922532615906, "grad_norm": 24.60082244873047, "learning_rate": 8e-05, "loss": 37.5207, "num_input_tokens_seen": 317467004, "step": 6159 }, { "epoch": 0.6055870863123756, "grad_norm": 26.759737014770508, "learning_rate": 8e-05, "loss": 41.3066, "num_input_tokens_seen": 317632616, "step": 6162 }, { "epoch": 0.6058819193631606, "grad_norm": 25.69239044189453, "learning_rate": 8e-05, "loss": 36.3191, "num_input_tokens_seen": 317795032, "step": 6165 }, { "epoch": 0.6061767524139456, "grad_norm": 25.120275497436523, "learning_rate": 8e-05, "loss": 39.3155, "num_input_tokens_seen": 317938576, "step": 6168 }, { "epoch": 0.6064715854647306, "grad_norm": 27.428653717041016, "learning_rate": 8e-05, "loss": 39.5125, "num_input_tokens_seen": 318095440, "step": 6171 }, { "epoch": 0.6067664185155156, "grad_norm": 26.741228103637695, "learning_rate": 8e-05, "loss": 34.3678, "num_input_tokens_seen": 318268832, "step": 6174 }, { "epoch": 0.6070612515663005, "grad_norm": 33.27923583984375, "learning_rate": 8e-05, "loss": 37.9281, "num_input_tokens_seen": 318438624, "step": 6177 }, { "epoch": 0.6073560846170856, "grad_norm": 33.121944427490234, "learning_rate": 8e-05, "loss": 38.8562, "num_input_tokens_seen": 318587356, "step": 6180 }, { "epoch": 0.6076509176678706, "grad_norm": 25.744670867919922, "learning_rate": 8e-05, "loss": 35.9841, "num_input_tokens_seen": 318738416, "step": 6183 }, { "epoch": 0.6079457507186555, "grad_norm": 35.156089782714844, "learning_rate": 8e-05, "loss": 39.9863, "num_input_tokens_seen": 318900128, "step": 6186 }, { "epoch": 0.6082405837694406, "grad_norm": 25.188457489013672, "learning_rate": 8e-05, "loss": 37.4518, "num_input_tokens_seen": 319047520, "step": 6189 }, { "epoch": 0.6085354168202255, "grad_norm": 31.52955436706543, "learning_rate": 8e-05, "loss": 41.9319, "num_input_tokens_seen": 319213976, "step": 6192 }, { "epoch": 0.6088302498710105, "grad_norm": 24.839195251464844, "learning_rate": 8e-05, "loss": 38.5171, "num_input_tokens_seen": 319363700, "step": 6195 }, { "epoch": 0.6091250829217956, "grad_norm": 26.186077117919922, "learning_rate": 8e-05, "loss": 38.4776, "num_input_tokens_seen": 319510404, "step": 6198 }, { "epoch": 0.6094199159725805, "grad_norm": 28.5213623046875, "learning_rate": 8e-05, "loss": 40.5308, "num_input_tokens_seen": 319679432, "step": 6201 }, { "epoch": 0.6097147490233655, "grad_norm": 25.307594299316406, "learning_rate": 8e-05, "loss": 36.822, "num_input_tokens_seen": 319803740, "step": 6204 }, { "epoch": 0.6100095820741505, "grad_norm": 28.7880802154541, "learning_rate": 8e-05, "loss": 37.6342, "num_input_tokens_seen": 319950552, "step": 6207 }, { "epoch": 0.6103044151249355, "grad_norm": 31.373199462890625, "learning_rate": 8e-05, "loss": 40.4808, "num_input_tokens_seen": 320084952, "step": 6210 }, { "epoch": 0.6105992481757205, "grad_norm": 22.3890438079834, "learning_rate": 8e-05, "loss": 38.3386, "num_input_tokens_seen": 320259860, "step": 6213 }, { "epoch": 0.6108940812265055, "grad_norm": 22.981782913208008, "learning_rate": 8e-05, "loss": 37.9938, "num_input_tokens_seen": 320413164, "step": 6216 }, { "epoch": 0.6111889142772905, "grad_norm": 28.02817153930664, "learning_rate": 8e-05, "loss": 37.9879, "num_input_tokens_seen": 320565088, "step": 6219 }, { "epoch": 0.6114837473280754, "grad_norm": 26.173526763916016, "learning_rate": 8e-05, "loss": 40.7751, "num_input_tokens_seen": 320720500, "step": 6222 }, { "epoch": 0.6117785803788605, "grad_norm": 24.532196044921875, "learning_rate": 8e-05, "loss": 38.0332, "num_input_tokens_seen": 320866336, "step": 6225 }, { "epoch": 0.6120734134296455, "grad_norm": 24.801034927368164, "learning_rate": 8e-05, "loss": 33.5498, "num_input_tokens_seen": 321025468, "step": 6228 }, { "epoch": 0.6123682464804304, "grad_norm": 24.207077026367188, "learning_rate": 8e-05, "loss": 35.1215, "num_input_tokens_seen": 321190080, "step": 6231 }, { "epoch": 0.6126630795312155, "grad_norm": 23.834396362304688, "learning_rate": 8e-05, "loss": 37.5769, "num_input_tokens_seen": 321342708, "step": 6234 }, { "epoch": 0.6129579125820005, "grad_norm": 30.3687744140625, "learning_rate": 8e-05, "loss": 37.4518, "num_input_tokens_seen": 321499416, "step": 6237 }, { "epoch": 0.6132527456327854, "grad_norm": 33.319515228271484, "learning_rate": 8e-05, "loss": 33.2451, "num_input_tokens_seen": 321653200, "step": 6240 }, { "epoch": 0.6135475786835705, "grad_norm": 22.197099685668945, "learning_rate": 8e-05, "loss": 37.2735, "num_input_tokens_seen": 321814068, "step": 6243 }, { "epoch": 0.6138424117343554, "grad_norm": 24.674034118652344, "learning_rate": 8e-05, "loss": 41.9069, "num_input_tokens_seen": 321963460, "step": 6246 }, { "epoch": 0.6141372447851404, "grad_norm": 25.704599380493164, "learning_rate": 8e-05, "loss": 35.3355, "num_input_tokens_seen": 322105484, "step": 6249 }, { "epoch": 0.6144320778359255, "grad_norm": 68.95780181884766, "learning_rate": 8e-05, "loss": 36.8214, "num_input_tokens_seen": 322262320, "step": 6252 }, { "epoch": 0.6147269108867104, "grad_norm": 24.768585205078125, "learning_rate": 8e-05, "loss": 38.9012, "num_input_tokens_seen": 322430264, "step": 6255 }, { "epoch": 0.6150217439374954, "grad_norm": 23.958892822265625, "learning_rate": 8e-05, "loss": 36.9476, "num_input_tokens_seen": 322584252, "step": 6258 }, { "epoch": 0.6153165769882804, "grad_norm": 22.662036895751953, "learning_rate": 8e-05, "loss": 35.6357, "num_input_tokens_seen": 322735532, "step": 6261 }, { "epoch": 0.6156114100390654, "grad_norm": 32.92559814453125, "learning_rate": 8e-05, "loss": 39.298, "num_input_tokens_seen": 322882780, "step": 6264 }, { "epoch": 0.6159062430898504, "grad_norm": 25.98016357421875, "learning_rate": 8e-05, "loss": 39.7257, "num_input_tokens_seen": 323034520, "step": 6267 }, { "epoch": 0.6162010761406354, "grad_norm": 25.833646774291992, "learning_rate": 8e-05, "loss": 36.4177, "num_input_tokens_seen": 323181256, "step": 6270 }, { "epoch": 0.6164959091914204, "grad_norm": 26.506811141967773, "learning_rate": 8e-05, "loss": 39.6621, "num_input_tokens_seen": 323347496, "step": 6273 }, { "epoch": 0.6167907422422053, "grad_norm": 24.709272384643555, "learning_rate": 8e-05, "loss": 38.6652, "num_input_tokens_seen": 323495496, "step": 6276 }, { "epoch": 0.6170855752929904, "grad_norm": 23.8092041015625, "learning_rate": 8e-05, "loss": 30.2883, "num_input_tokens_seen": 323672384, "step": 6279 }, { "epoch": 0.6173804083437754, "grad_norm": 26.053071975708008, "learning_rate": 8e-05, "loss": 38.5194, "num_input_tokens_seen": 323816864, "step": 6282 }, { "epoch": 0.6176752413945603, "grad_norm": 22.75402069091797, "learning_rate": 8e-05, "loss": 35.6399, "num_input_tokens_seen": 323975752, "step": 6285 }, { "epoch": 0.6179700744453454, "grad_norm": 28.435270309448242, "learning_rate": 8e-05, "loss": 40.2471, "num_input_tokens_seen": 324128112, "step": 6288 }, { "epoch": 0.6182649074961303, "grad_norm": 26.416706085205078, "learning_rate": 8e-05, "loss": 38.8477, "num_input_tokens_seen": 324308656, "step": 6291 }, { "epoch": 0.6185597405469153, "grad_norm": 27.599233627319336, "learning_rate": 8e-05, "loss": 38.1784, "num_input_tokens_seen": 324462412, "step": 6294 }, { "epoch": 0.6188545735977004, "grad_norm": 27.04106330871582, "learning_rate": 8e-05, "loss": 40.2296, "num_input_tokens_seen": 324615088, "step": 6297 }, { "epoch": 0.6191494066484853, "grad_norm": 25.297691345214844, "learning_rate": 8e-05, "loss": 35.8062, "num_input_tokens_seen": 324758700, "step": 6300 }, { "epoch": 0.6194442396992703, "grad_norm": 40.44841003417969, "learning_rate": 8e-05, "loss": 36.9515, "num_input_tokens_seen": 324912956, "step": 6303 }, { "epoch": 0.6197390727500552, "grad_norm": 22.532255172729492, "learning_rate": 8e-05, "loss": 35.7363, "num_input_tokens_seen": 325065772, "step": 6306 }, { "epoch": 0.6200339058008403, "grad_norm": 30.187307357788086, "learning_rate": 8e-05, "loss": 37.0602, "num_input_tokens_seen": 325211688, "step": 6309 }, { "epoch": 0.6203287388516253, "grad_norm": 29.660106658935547, "learning_rate": 8e-05, "loss": 42.3845, "num_input_tokens_seen": 325346160, "step": 6312 }, { "epoch": 0.6206235719024102, "grad_norm": 23.635356903076172, "learning_rate": 8e-05, "loss": 34.5203, "num_input_tokens_seen": 325485300, "step": 6315 }, { "epoch": 0.6209184049531953, "grad_norm": 25.654155731201172, "learning_rate": 8e-05, "loss": 34.3353, "num_input_tokens_seen": 325640184, "step": 6318 }, { "epoch": 0.6212132380039802, "grad_norm": 26.068546295166016, "learning_rate": 8e-05, "loss": 38.6867, "num_input_tokens_seen": 325788596, "step": 6321 }, { "epoch": 0.6215080710547652, "grad_norm": 26.19140625, "learning_rate": 8e-05, "loss": 39.2954, "num_input_tokens_seen": 325935036, "step": 6324 }, { "epoch": 0.6218029041055503, "grad_norm": 25.26287841796875, "learning_rate": 8e-05, "loss": 39.2243, "num_input_tokens_seen": 326107384, "step": 6327 }, { "epoch": 0.6220977371563352, "grad_norm": 42.484676361083984, "learning_rate": 8e-05, "loss": 38.0859, "num_input_tokens_seen": 326258240, "step": 6330 }, { "epoch": 0.6223925702071202, "grad_norm": 28.638322830200195, "learning_rate": 8e-05, "loss": 38.5841, "num_input_tokens_seen": 326408044, "step": 6333 }, { "epoch": 0.6226874032579052, "grad_norm": 32.85466384887695, "learning_rate": 8e-05, "loss": 38.6516, "num_input_tokens_seen": 326574272, "step": 6336 }, { "epoch": 0.6229822363086902, "grad_norm": 25.046932220458984, "learning_rate": 8e-05, "loss": 36.5864, "num_input_tokens_seen": 326728284, "step": 6339 }, { "epoch": 0.6232770693594752, "grad_norm": 25.095548629760742, "learning_rate": 8e-05, "loss": 39.4286, "num_input_tokens_seen": 326868400, "step": 6342 }, { "epoch": 0.6235719024102602, "grad_norm": 27.86627960205078, "learning_rate": 8e-05, "loss": 36.5636, "num_input_tokens_seen": 327017628, "step": 6345 }, { "epoch": 0.6238667354610452, "grad_norm": 26.67412567138672, "learning_rate": 8e-05, "loss": 34.9673, "num_input_tokens_seen": 327176840, "step": 6348 }, { "epoch": 0.6241615685118301, "grad_norm": 21.75296974182129, "learning_rate": 8e-05, "loss": 37.4662, "num_input_tokens_seen": 327348448, "step": 6351 }, { "epoch": 0.6244564015626152, "grad_norm": 24.94108009338379, "learning_rate": 8e-05, "loss": 37.269, "num_input_tokens_seen": 327502008, "step": 6354 }, { "epoch": 0.6247512346134002, "grad_norm": 25.871793746948242, "learning_rate": 8e-05, "loss": 39.1569, "num_input_tokens_seen": 327664740, "step": 6357 }, { "epoch": 0.6250460676641851, "grad_norm": 25.637754440307617, "learning_rate": 8e-05, "loss": 35.1057, "num_input_tokens_seen": 327805228, "step": 6360 }, { "epoch": 0.6253409007149702, "grad_norm": 24.5385799407959, "learning_rate": 8e-05, "loss": 36.3612, "num_input_tokens_seen": 327975752, "step": 6363 }, { "epoch": 0.6256357337657551, "grad_norm": 22.548460006713867, "learning_rate": 8e-05, "loss": 37.2956, "num_input_tokens_seen": 328108840, "step": 6366 }, { "epoch": 0.6259305668165401, "grad_norm": 27.364675521850586, "learning_rate": 8e-05, "loss": 37.3522, "num_input_tokens_seen": 328267088, "step": 6369 }, { "epoch": 0.6262253998673252, "grad_norm": 25.948596954345703, "learning_rate": 8e-05, "loss": 37.2168, "num_input_tokens_seen": 328409936, "step": 6372 }, { "epoch": 0.6265202329181101, "grad_norm": 25.288524627685547, "learning_rate": 8e-05, "loss": 39.4214, "num_input_tokens_seen": 328557584, "step": 6375 }, { "epoch": 0.6268150659688951, "grad_norm": 24.71294593811035, "learning_rate": 8e-05, "loss": 40.4651, "num_input_tokens_seen": 328723240, "step": 6378 }, { "epoch": 0.6271098990196801, "grad_norm": 28.631532669067383, "learning_rate": 8e-05, "loss": 35.7814, "num_input_tokens_seen": 328869796, "step": 6381 }, { "epoch": 0.6274047320704651, "grad_norm": 26.28819465637207, "learning_rate": 8e-05, "loss": 39.6646, "num_input_tokens_seen": 329019092, "step": 6384 }, { "epoch": 0.6276995651212501, "grad_norm": 24.583751678466797, "learning_rate": 8e-05, "loss": 36.6027, "num_input_tokens_seen": 329150600, "step": 6387 }, { "epoch": 0.6279943981720351, "grad_norm": 26.986366271972656, "learning_rate": 8e-05, "loss": 38.0961, "num_input_tokens_seen": 329290408, "step": 6390 }, { "epoch": 0.6282892312228201, "grad_norm": 24.84749984741211, "learning_rate": 8e-05, "loss": 39.3869, "num_input_tokens_seen": 329450420, "step": 6393 }, { "epoch": 0.628584064273605, "grad_norm": 28.76380157470703, "learning_rate": 8e-05, "loss": 36.9979, "num_input_tokens_seen": 329609004, "step": 6396 }, { "epoch": 0.6288788973243901, "grad_norm": 23.334930419921875, "learning_rate": 8e-05, "loss": 37.4399, "num_input_tokens_seen": 329778436, "step": 6399 }, { "epoch": 0.6291737303751751, "grad_norm": 25.433956146240234, "learning_rate": 8e-05, "loss": 36.705, "num_input_tokens_seen": 329929608, "step": 6402 }, { "epoch": 0.62946856342596, "grad_norm": 27.61007308959961, "learning_rate": 8e-05, "loss": 40.0462, "num_input_tokens_seen": 330078108, "step": 6405 }, { "epoch": 0.6297633964767451, "grad_norm": 35.30503845214844, "learning_rate": 8e-05, "loss": 40.6168, "num_input_tokens_seen": 330244708, "step": 6408 }, { "epoch": 0.63005822952753, "grad_norm": 35.08722686767578, "learning_rate": 8e-05, "loss": 38.4732, "num_input_tokens_seen": 330386136, "step": 6411 }, { "epoch": 0.630353062578315, "grad_norm": 37.645442962646484, "learning_rate": 8e-05, "loss": 38.4413, "num_input_tokens_seen": 330530964, "step": 6414 }, { "epoch": 0.6306478956291001, "grad_norm": 27.654539108276367, "learning_rate": 8e-05, "loss": 37.3986, "num_input_tokens_seen": 330677096, "step": 6417 }, { "epoch": 0.630942728679885, "grad_norm": 32.413185119628906, "learning_rate": 8e-05, "loss": 36.1898, "num_input_tokens_seen": 330847940, "step": 6420 }, { "epoch": 0.63123756173067, "grad_norm": 32.70518112182617, "learning_rate": 8e-05, "loss": 35.777, "num_input_tokens_seen": 331014036, "step": 6423 }, { "epoch": 0.631532394781455, "grad_norm": 27.792139053344727, "learning_rate": 8e-05, "loss": 41.193, "num_input_tokens_seen": 331203516, "step": 6426 }, { "epoch": 0.63182722783224, "grad_norm": 28.370925903320312, "learning_rate": 8e-05, "loss": 35.5362, "num_input_tokens_seen": 331365772, "step": 6429 }, { "epoch": 0.632122060883025, "grad_norm": 26.484954833984375, "learning_rate": 8e-05, "loss": 34.9362, "num_input_tokens_seen": 331512204, "step": 6432 }, { "epoch": 0.63241689393381, "grad_norm": 27.78957748413086, "learning_rate": 8e-05, "loss": 38.4179, "num_input_tokens_seen": 331654600, "step": 6435 }, { "epoch": 0.632711726984595, "grad_norm": 24.038084030151367, "learning_rate": 8e-05, "loss": 40.5222, "num_input_tokens_seen": 331813988, "step": 6438 }, { "epoch": 0.6330065600353799, "grad_norm": 25.796804428100586, "learning_rate": 8e-05, "loss": 37.0437, "num_input_tokens_seen": 331970924, "step": 6441 }, { "epoch": 0.633301393086165, "grad_norm": 24.9102725982666, "learning_rate": 8e-05, "loss": 38.9791, "num_input_tokens_seen": 332128504, "step": 6444 }, { "epoch": 0.63359622613695, "grad_norm": 36.67914581298828, "learning_rate": 8e-05, "loss": 38.2588, "num_input_tokens_seen": 332294876, "step": 6447 }, { "epoch": 0.6338910591877349, "grad_norm": 24.813623428344727, "learning_rate": 8e-05, "loss": 36.5417, "num_input_tokens_seen": 332435244, "step": 6450 }, { "epoch": 0.63418589223852, "grad_norm": 25.02754020690918, "learning_rate": 8e-05, "loss": 38.0281, "num_input_tokens_seen": 332585000, "step": 6453 }, { "epoch": 0.6344807252893049, "grad_norm": 28.043087005615234, "learning_rate": 8e-05, "loss": 36.1322, "num_input_tokens_seen": 332746468, "step": 6456 }, { "epoch": 0.6347755583400899, "grad_norm": 26.460355758666992, "learning_rate": 8e-05, "loss": 37.103, "num_input_tokens_seen": 332895956, "step": 6459 }, { "epoch": 0.635070391390875, "grad_norm": 25.79451560974121, "learning_rate": 8e-05, "loss": 41.6694, "num_input_tokens_seen": 333051188, "step": 6462 }, { "epoch": 0.6353652244416599, "grad_norm": 33.36618423461914, "learning_rate": 8e-05, "loss": 39.547, "num_input_tokens_seen": 333195848, "step": 6465 }, { "epoch": 0.6356600574924449, "grad_norm": 36.27760314941406, "learning_rate": 8e-05, "loss": 32.7442, "num_input_tokens_seen": 333349552, "step": 6468 }, { "epoch": 0.6359548905432298, "grad_norm": 34.557708740234375, "learning_rate": 8e-05, "loss": 36.7705, "num_input_tokens_seen": 333490864, "step": 6471 }, { "epoch": 0.6362497235940149, "grad_norm": 40.10776138305664, "learning_rate": 8e-05, "loss": 38.0712, "num_input_tokens_seen": 333639924, "step": 6474 }, { "epoch": 0.6365445566447999, "grad_norm": 27.282073974609375, "learning_rate": 8e-05, "loss": 37.5385, "num_input_tokens_seen": 333774356, "step": 6477 }, { "epoch": 0.6368393896955848, "grad_norm": 35.44887924194336, "learning_rate": 8e-05, "loss": 38.0226, "num_input_tokens_seen": 333934716, "step": 6480 }, { "epoch": 0.6371342227463699, "grad_norm": 25.15498161315918, "learning_rate": 8e-05, "loss": 40.2181, "num_input_tokens_seen": 334098292, "step": 6483 }, { "epoch": 0.6374290557971548, "grad_norm": 23.980417251586914, "learning_rate": 8e-05, "loss": 38.9857, "num_input_tokens_seen": 334266788, "step": 6486 }, { "epoch": 0.6377238888479398, "grad_norm": 26.44454002380371, "learning_rate": 8e-05, "loss": 37.4722, "num_input_tokens_seen": 334416364, "step": 6489 }, { "epoch": 0.6380187218987249, "grad_norm": 30.107446670532227, "learning_rate": 8e-05, "loss": 39.8594, "num_input_tokens_seen": 334556328, "step": 6492 }, { "epoch": 0.6383135549495098, "grad_norm": 25.60369300842285, "learning_rate": 8e-05, "loss": 38.6356, "num_input_tokens_seen": 334715092, "step": 6495 }, { "epoch": 0.6386083880002948, "grad_norm": 23.690120697021484, "learning_rate": 8e-05, "loss": 37.2159, "num_input_tokens_seen": 334872568, "step": 6498 }, { "epoch": 0.6389032210510798, "grad_norm": 25.064729690551758, "learning_rate": 8e-05, "loss": 38.6088, "num_input_tokens_seen": 335003784, "step": 6501 }, { "epoch": 0.6391980541018648, "grad_norm": 25.150876998901367, "learning_rate": 8e-05, "loss": 34.2671, "num_input_tokens_seen": 335149108, "step": 6504 }, { "epoch": 0.6394928871526498, "grad_norm": 27.666296005249023, "learning_rate": 8e-05, "loss": 39.7811, "num_input_tokens_seen": 335302152, "step": 6507 }, { "epoch": 0.6397877202034348, "grad_norm": 29.01523208618164, "learning_rate": 8e-05, "loss": 35.9424, "num_input_tokens_seen": 335457552, "step": 6510 }, { "epoch": 0.6400825532542198, "grad_norm": 20.90727996826172, "learning_rate": 8e-05, "loss": 32.7056, "num_input_tokens_seen": 335620096, "step": 6513 }, { "epoch": 0.6403773863050047, "grad_norm": 22.06777572631836, "learning_rate": 8e-05, "loss": 38.796, "num_input_tokens_seen": 335769336, "step": 6516 }, { "epoch": 0.6406722193557898, "grad_norm": 23.688201904296875, "learning_rate": 8e-05, "loss": 35.616, "num_input_tokens_seen": 335919728, "step": 6519 }, { "epoch": 0.6409670524065748, "grad_norm": 33.622276306152344, "learning_rate": 8e-05, "loss": 37.4969, "num_input_tokens_seen": 336067104, "step": 6522 }, { "epoch": 0.6412618854573597, "grad_norm": 26.230440139770508, "learning_rate": 8e-05, "loss": 38.4008, "num_input_tokens_seen": 336207588, "step": 6525 }, { "epoch": 0.6415567185081448, "grad_norm": 46.02008819580078, "learning_rate": 8e-05, "loss": 35.475, "num_input_tokens_seen": 336358368, "step": 6528 }, { "epoch": 0.6418515515589298, "grad_norm": 26.169721603393555, "learning_rate": 8e-05, "loss": 41.3722, "num_input_tokens_seen": 336497716, "step": 6531 }, { "epoch": 0.6421463846097147, "grad_norm": 26.567733764648438, "learning_rate": 8e-05, "loss": 38.4448, "num_input_tokens_seen": 336646012, "step": 6534 }, { "epoch": 0.6424412176604998, "grad_norm": 26.18414306640625, "learning_rate": 8e-05, "loss": 33.4987, "num_input_tokens_seen": 336785844, "step": 6537 }, { "epoch": 0.6427360507112847, "grad_norm": 27.476449966430664, "learning_rate": 8e-05, "loss": 38.427, "num_input_tokens_seen": 336945340, "step": 6540 }, { "epoch": 0.6430308837620697, "grad_norm": 24.81687355041504, "learning_rate": 8e-05, "loss": 38.1676, "num_input_tokens_seen": 337073892, "step": 6543 }, { "epoch": 0.6433257168128548, "grad_norm": 28.00323486328125, "learning_rate": 8e-05, "loss": 38.1138, "num_input_tokens_seen": 337223080, "step": 6546 }, { "epoch": 0.6436205498636397, "grad_norm": 31.119070053100586, "learning_rate": 8e-05, "loss": 36.6106, "num_input_tokens_seen": 337369276, "step": 6549 }, { "epoch": 0.6439153829144247, "grad_norm": 26.03184700012207, "learning_rate": 8e-05, "loss": 34.1828, "num_input_tokens_seen": 337537580, "step": 6552 }, { "epoch": 0.6442102159652097, "grad_norm": 25.940513610839844, "learning_rate": 8e-05, "loss": 37.7545, "num_input_tokens_seen": 337735200, "step": 6555 }, { "epoch": 0.6445050490159947, "grad_norm": 27.552566528320312, "learning_rate": 8e-05, "loss": 36.6377, "num_input_tokens_seen": 337885680, "step": 6558 }, { "epoch": 0.6447998820667797, "grad_norm": 25.005868911743164, "learning_rate": 8e-05, "loss": 38.7374, "num_input_tokens_seen": 338018188, "step": 6561 }, { "epoch": 0.6450947151175647, "grad_norm": 26.592487335205078, "learning_rate": 8e-05, "loss": 39.5499, "num_input_tokens_seen": 338171564, "step": 6564 }, { "epoch": 0.6453895481683497, "grad_norm": 27.41492462158203, "learning_rate": 8e-05, "loss": 36.1295, "num_input_tokens_seen": 338342292, "step": 6567 }, { "epoch": 0.6456843812191346, "grad_norm": 30.22612190246582, "learning_rate": 8e-05, "loss": 39.7426, "num_input_tokens_seen": 338498444, "step": 6570 }, { "epoch": 0.6459792142699197, "grad_norm": 28.708240509033203, "learning_rate": 8e-05, "loss": 39.4286, "num_input_tokens_seen": 338668028, "step": 6573 }, { "epoch": 0.6462740473207047, "grad_norm": 23.029550552368164, "learning_rate": 8e-05, "loss": 36.3736, "num_input_tokens_seen": 338820384, "step": 6576 }, { "epoch": 0.6465688803714896, "grad_norm": 37.06300735473633, "learning_rate": 8e-05, "loss": 38.5943, "num_input_tokens_seen": 338972760, "step": 6579 }, { "epoch": 0.6468637134222747, "grad_norm": 32.833038330078125, "learning_rate": 8e-05, "loss": 37.1746, "num_input_tokens_seen": 339144456, "step": 6582 }, { "epoch": 0.6471585464730596, "grad_norm": 29.135648727416992, "learning_rate": 8e-05, "loss": 38.2403, "num_input_tokens_seen": 339275080, "step": 6585 }, { "epoch": 0.6474533795238446, "grad_norm": 28.29663848876953, "learning_rate": 8e-05, "loss": 41.0382, "num_input_tokens_seen": 339421484, "step": 6588 }, { "epoch": 0.6477482125746297, "grad_norm": 28.951173782348633, "learning_rate": 8e-05, "loss": 36.4348, "num_input_tokens_seen": 339600020, "step": 6591 }, { "epoch": 0.6480430456254146, "grad_norm": 28.281675338745117, "learning_rate": 8e-05, "loss": 38.8957, "num_input_tokens_seen": 339757804, "step": 6594 }, { "epoch": 0.6483378786761996, "grad_norm": 34.26532745361328, "learning_rate": 8e-05, "loss": 35.5098, "num_input_tokens_seen": 339923508, "step": 6597 }, { "epoch": 0.6486327117269846, "grad_norm": 23.300016403198242, "learning_rate": 8e-05, "loss": 33.0052, "num_input_tokens_seen": 340083980, "step": 6600 }, { "epoch": 0.6489275447777696, "grad_norm": 27.63614845275879, "learning_rate": 8e-05, "loss": 38.4134, "num_input_tokens_seen": 340239028, "step": 6603 }, { "epoch": 0.6492223778285546, "grad_norm": 33.11357498168945, "learning_rate": 8e-05, "loss": 42.3899, "num_input_tokens_seen": 340405744, "step": 6606 }, { "epoch": 0.6495172108793396, "grad_norm": 25.984708786010742, "learning_rate": 8e-05, "loss": 40.0451, "num_input_tokens_seen": 340542892, "step": 6609 }, { "epoch": 0.6498120439301246, "grad_norm": 24.340328216552734, "learning_rate": 8e-05, "loss": 41.1376, "num_input_tokens_seen": 340678260, "step": 6612 }, { "epoch": 0.6501068769809095, "grad_norm": 28.840999603271484, "learning_rate": 8e-05, "loss": 29.5664, "num_input_tokens_seen": 340839812, "step": 6615 }, { "epoch": 0.6504017100316946, "grad_norm": 29.218814849853516, "learning_rate": 8e-05, "loss": 35.1434, "num_input_tokens_seen": 340998380, "step": 6618 }, { "epoch": 0.6506965430824796, "grad_norm": 221.42181396484375, "learning_rate": 8e-05, "loss": 39.2533, "num_input_tokens_seen": 341162476, "step": 6621 }, { "epoch": 0.6509913761332645, "grad_norm": 33.743621826171875, "learning_rate": 8e-05, "loss": 35.9878, "num_input_tokens_seen": 341301756, "step": 6624 }, { "epoch": 0.6512862091840496, "grad_norm": 31.24152183532715, "learning_rate": 8e-05, "loss": 36.8398, "num_input_tokens_seen": 341458280, "step": 6627 }, { "epoch": 0.6515810422348345, "grad_norm": 29.4301815032959, "learning_rate": 8e-05, "loss": 36.0449, "num_input_tokens_seen": 341624296, "step": 6630 }, { "epoch": 0.6518758752856195, "grad_norm": 22.053237915039062, "learning_rate": 8e-05, "loss": 34.3092, "num_input_tokens_seen": 341804404, "step": 6633 }, { "epoch": 0.6521707083364046, "grad_norm": 23.275348663330078, "learning_rate": 8e-05, "loss": 36.9765, "num_input_tokens_seen": 341964004, "step": 6636 }, { "epoch": 0.6524655413871895, "grad_norm": 26.191434860229492, "learning_rate": 8e-05, "loss": 39.8154, "num_input_tokens_seen": 342114328, "step": 6639 }, { "epoch": 0.6527603744379745, "grad_norm": 38.87674331665039, "learning_rate": 8e-05, "loss": 41.3826, "num_input_tokens_seen": 342277812, "step": 6642 }, { "epoch": 0.6530552074887594, "grad_norm": 24.583362579345703, "learning_rate": 8e-05, "loss": 35.0064, "num_input_tokens_seen": 342421428, "step": 6645 }, { "epoch": 0.6533500405395445, "grad_norm": 441.7207946777344, "learning_rate": 8e-05, "loss": 32.8596, "num_input_tokens_seen": 342567780, "step": 6648 }, { "epoch": 0.6536448735903295, "grad_norm": 24.749300003051758, "learning_rate": 8e-05, "loss": 37.5455, "num_input_tokens_seen": 342740652, "step": 6651 }, { "epoch": 0.6539397066411144, "grad_norm": 26.746244430541992, "learning_rate": 8e-05, "loss": 36.2025, "num_input_tokens_seen": 342880932, "step": 6654 }, { "epoch": 0.6542345396918995, "grad_norm": 72.51927947998047, "learning_rate": 8e-05, "loss": 39.4842, "num_input_tokens_seen": 343042572, "step": 6657 }, { "epoch": 0.6545293727426844, "grad_norm": 24.986759185791016, "learning_rate": 8e-05, "loss": 34.271, "num_input_tokens_seen": 343196592, "step": 6660 }, { "epoch": 0.6548242057934694, "grad_norm": 23.670583724975586, "learning_rate": 8e-05, "loss": 38.3177, "num_input_tokens_seen": 343341208, "step": 6663 }, { "epoch": 0.6551190388442545, "grad_norm": 22.490602493286133, "learning_rate": 8e-05, "loss": 37.3046, "num_input_tokens_seen": 343512440, "step": 6666 }, { "epoch": 0.6554138718950394, "grad_norm": 29.031286239624023, "learning_rate": 8e-05, "loss": 42.1513, "num_input_tokens_seen": 343664580, "step": 6669 }, { "epoch": 0.6557087049458244, "grad_norm": 27.807151794433594, "learning_rate": 8e-05, "loss": 36.1717, "num_input_tokens_seen": 343818232, "step": 6672 }, { "epoch": 0.6560035379966094, "grad_norm": 41.40616226196289, "learning_rate": 8e-05, "loss": 33.0627, "num_input_tokens_seen": 343975788, "step": 6675 }, { "epoch": 0.6562983710473944, "grad_norm": 27.92901611328125, "learning_rate": 8e-05, "loss": 32.2173, "num_input_tokens_seen": 344110348, "step": 6678 }, { "epoch": 0.6565932040981795, "grad_norm": 27.682600021362305, "learning_rate": 8e-05, "loss": 39.627, "num_input_tokens_seen": 344263136, "step": 6681 }, { "epoch": 0.6568880371489644, "grad_norm": 25.742345809936523, "learning_rate": 8e-05, "loss": 36.5412, "num_input_tokens_seen": 344415904, "step": 6684 }, { "epoch": 0.6571828701997494, "grad_norm": 23.010465621948242, "learning_rate": 8e-05, "loss": 37.7615, "num_input_tokens_seen": 344578904, "step": 6687 }, { "epoch": 0.6574777032505343, "grad_norm": 25.112548828125, "learning_rate": 8e-05, "loss": 34.0272, "num_input_tokens_seen": 344717536, "step": 6690 }, { "epoch": 0.6577725363013194, "grad_norm": 26.201248168945312, "learning_rate": 8e-05, "loss": 35.2759, "num_input_tokens_seen": 344869812, "step": 6693 }, { "epoch": 0.6580673693521044, "grad_norm": 25.718441009521484, "learning_rate": 8e-05, "loss": 38.9038, "num_input_tokens_seen": 345037436, "step": 6696 }, { "epoch": 0.6583622024028893, "grad_norm": 31.64290428161621, "learning_rate": 8e-05, "loss": 34.065, "num_input_tokens_seen": 345174996, "step": 6699 }, { "epoch": 0.6586570354536744, "grad_norm": 27.697294235229492, "learning_rate": 8e-05, "loss": 34.2109, "num_input_tokens_seen": 345324196, "step": 6702 }, { "epoch": 0.6589518685044593, "grad_norm": 30.634992599487305, "learning_rate": 8e-05, "loss": 35.4552, "num_input_tokens_seen": 345477268, "step": 6705 }, { "epoch": 0.6592467015552443, "grad_norm": 25.306087493896484, "learning_rate": 8e-05, "loss": 39.5236, "num_input_tokens_seen": 345619356, "step": 6708 }, { "epoch": 0.6595415346060294, "grad_norm": 26.60824966430664, "learning_rate": 8e-05, "loss": 38.9368, "num_input_tokens_seen": 345756940, "step": 6711 }, { "epoch": 0.6598363676568143, "grad_norm": 25.774150848388672, "learning_rate": 8e-05, "loss": 40.7406, "num_input_tokens_seen": 345918180, "step": 6714 }, { "epoch": 0.6601312007075993, "grad_norm": 23.252714157104492, "learning_rate": 8e-05, "loss": 38.6923, "num_input_tokens_seen": 346046492, "step": 6717 }, { "epoch": 0.6604260337583843, "grad_norm": 23.236473083496094, "learning_rate": 8e-05, "loss": 37.0205, "num_input_tokens_seen": 346207188, "step": 6720 }, { "epoch": 0.6607208668091693, "grad_norm": 22.641433715820312, "learning_rate": 8e-05, "loss": 35.8195, "num_input_tokens_seen": 346358216, "step": 6723 }, { "epoch": 0.6610156998599543, "grad_norm": 32.38287353515625, "learning_rate": 8e-05, "loss": 37.4374, "num_input_tokens_seen": 346508352, "step": 6726 }, { "epoch": 0.6613105329107393, "grad_norm": 23.829994201660156, "learning_rate": 8e-05, "loss": 38.3075, "num_input_tokens_seen": 346658684, "step": 6729 }, { "epoch": 0.6616053659615243, "grad_norm": 20.449419021606445, "learning_rate": 8e-05, "loss": 32.9127, "num_input_tokens_seen": 346829520, "step": 6732 }, { "epoch": 0.6619001990123092, "grad_norm": 25.241458892822266, "learning_rate": 8e-05, "loss": 37.9814, "num_input_tokens_seen": 346978620, "step": 6735 }, { "epoch": 0.6621950320630943, "grad_norm": 40.08452224731445, "learning_rate": 8e-05, "loss": 35.3977, "num_input_tokens_seen": 347129724, "step": 6738 }, { "epoch": 0.6624898651138793, "grad_norm": 25.62432289123535, "learning_rate": 8e-05, "loss": 37.1599, "num_input_tokens_seen": 347281612, "step": 6741 }, { "epoch": 0.6627846981646642, "grad_norm": 26.09302520751953, "learning_rate": 8e-05, "loss": 35.738, "num_input_tokens_seen": 347432960, "step": 6744 }, { "epoch": 0.6630795312154493, "grad_norm": 28.28545570373535, "learning_rate": 8e-05, "loss": 37.3327, "num_input_tokens_seen": 347581192, "step": 6747 }, { "epoch": 0.6633743642662342, "grad_norm": 65.24707794189453, "learning_rate": 8e-05, "loss": 34.8417, "num_input_tokens_seen": 347696544, "step": 6750 }, { "epoch": 0.6636691973170192, "grad_norm": 37.428218841552734, "learning_rate": 8e-05, "loss": 39.3613, "num_input_tokens_seen": 347855764, "step": 6753 }, { "epoch": 0.6639640303678043, "grad_norm": 23.67691993713379, "learning_rate": 8e-05, "loss": 38.6025, "num_input_tokens_seen": 348022128, "step": 6756 }, { "epoch": 0.6642588634185892, "grad_norm": 25.112234115600586, "learning_rate": 8e-05, "loss": 40.1824, "num_input_tokens_seen": 348192976, "step": 6759 }, { "epoch": 0.6645536964693742, "grad_norm": 29.855688095092773, "learning_rate": 8e-05, "loss": 41.8466, "num_input_tokens_seen": 348363652, "step": 6762 }, { "epoch": 0.6648485295201592, "grad_norm": 25.25415802001953, "learning_rate": 8e-05, "loss": 37.8966, "num_input_tokens_seen": 348530436, "step": 6765 }, { "epoch": 0.6651433625709442, "grad_norm": 25.98432159423828, "learning_rate": 8e-05, "loss": 36.1825, "num_input_tokens_seen": 348684660, "step": 6768 }, { "epoch": 0.6654381956217292, "grad_norm": 29.960161209106445, "learning_rate": 8e-05, "loss": 37.6181, "num_input_tokens_seen": 348820632, "step": 6771 }, { "epoch": 0.6657330286725142, "grad_norm": 36.65909957885742, "learning_rate": 8e-05, "loss": 35.4403, "num_input_tokens_seen": 348985784, "step": 6774 }, { "epoch": 0.6660278617232992, "grad_norm": 34.468509674072266, "learning_rate": 8e-05, "loss": 40.4557, "num_input_tokens_seen": 349137960, "step": 6777 }, { "epoch": 0.6663226947740841, "grad_norm": 24.24013328552246, "learning_rate": 8e-05, "loss": 37.4488, "num_input_tokens_seen": 349282196, "step": 6780 }, { "epoch": 0.6666175278248692, "grad_norm": 38.20613479614258, "learning_rate": 8e-05, "loss": 39.7592, "num_input_tokens_seen": 349430124, "step": 6783 }, { "epoch": 0.6669123608756542, "grad_norm": 39.29615783691406, "learning_rate": 8e-05, "loss": 36.0311, "num_input_tokens_seen": 349590896, "step": 6786 }, { "epoch": 0.6672071939264391, "grad_norm": 27.678237915039062, "learning_rate": 8e-05, "loss": 41.4173, "num_input_tokens_seen": 349741976, "step": 6789 }, { "epoch": 0.6675020269772242, "grad_norm": 23.54377555847168, "learning_rate": 8e-05, "loss": 35.8037, "num_input_tokens_seen": 349891584, "step": 6792 }, { "epoch": 0.6677968600280091, "grad_norm": 22.418794631958008, "learning_rate": 8e-05, "loss": 37.0058, "num_input_tokens_seen": 350048688, "step": 6795 }, { "epoch": 0.6680916930787941, "grad_norm": 27.08586883544922, "learning_rate": 8e-05, "loss": 35.055, "num_input_tokens_seen": 350209312, "step": 6798 }, { "epoch": 0.6683865261295792, "grad_norm": 32.486961364746094, "learning_rate": 8e-05, "loss": 40.9057, "num_input_tokens_seen": 350366576, "step": 6801 }, { "epoch": 0.6686813591803641, "grad_norm": 29.93039321899414, "learning_rate": 8e-05, "loss": 41.0914, "num_input_tokens_seen": 350534288, "step": 6804 }, { "epoch": 0.6689761922311491, "grad_norm": 26.834508895874023, "learning_rate": 8e-05, "loss": 35.2028, "num_input_tokens_seen": 350683428, "step": 6807 }, { "epoch": 0.669271025281934, "grad_norm": 23.54694175720215, "learning_rate": 8e-05, "loss": 37.2137, "num_input_tokens_seen": 350839888, "step": 6810 }, { "epoch": 0.6695658583327191, "grad_norm": 53.113704681396484, "learning_rate": 8e-05, "loss": 35.2484, "num_input_tokens_seen": 350995556, "step": 6813 }, { "epoch": 0.6698606913835041, "grad_norm": 28.740299224853516, "learning_rate": 8e-05, "loss": 38.2213, "num_input_tokens_seen": 351154384, "step": 6816 }, { "epoch": 0.670155524434289, "grad_norm": 25.740116119384766, "learning_rate": 8e-05, "loss": 36.9882, "num_input_tokens_seen": 351302160, "step": 6819 }, { "epoch": 0.6704503574850741, "grad_norm": 26.176889419555664, "learning_rate": 8e-05, "loss": 35.5851, "num_input_tokens_seen": 351441112, "step": 6822 }, { "epoch": 0.670745190535859, "grad_norm": 24.30504608154297, "learning_rate": 8e-05, "loss": 36.2588, "num_input_tokens_seen": 351586372, "step": 6825 }, { "epoch": 0.671040023586644, "grad_norm": 28.314659118652344, "learning_rate": 8e-05, "loss": 39.4041, "num_input_tokens_seen": 351748720, "step": 6828 }, { "epoch": 0.6713348566374291, "grad_norm": 25.78873062133789, "learning_rate": 8e-05, "loss": 35.5167, "num_input_tokens_seen": 351929488, "step": 6831 }, { "epoch": 0.671629689688214, "grad_norm": 29.422073364257812, "learning_rate": 8e-05, "loss": 38.3875, "num_input_tokens_seen": 352077568, "step": 6834 }, { "epoch": 0.671924522738999, "grad_norm": 25.27082633972168, "learning_rate": 8e-05, "loss": 35.4953, "num_input_tokens_seen": 352236692, "step": 6837 }, { "epoch": 0.6722193557897841, "grad_norm": 25.59380340576172, "learning_rate": 8e-05, "loss": 38.9723, "num_input_tokens_seen": 352393932, "step": 6840 }, { "epoch": 0.672514188840569, "grad_norm": 29.449504852294922, "learning_rate": 8e-05, "loss": 43.4062, "num_input_tokens_seen": 352557020, "step": 6843 }, { "epoch": 0.672809021891354, "grad_norm": 23.7353458404541, "learning_rate": 8e-05, "loss": 35.9395, "num_input_tokens_seen": 352720728, "step": 6846 }, { "epoch": 0.673103854942139, "grad_norm": 20.535308837890625, "learning_rate": 8e-05, "loss": 37.3628, "num_input_tokens_seen": 352873480, "step": 6849 }, { "epoch": 0.673398687992924, "grad_norm": 24.83087158203125, "learning_rate": 8e-05, "loss": 36.0663, "num_input_tokens_seen": 353039368, "step": 6852 }, { "epoch": 0.673693521043709, "grad_norm": 27.142587661743164, "learning_rate": 8e-05, "loss": 39.2065, "num_input_tokens_seen": 353182328, "step": 6855 }, { "epoch": 0.673988354094494, "grad_norm": 26.528867721557617, "learning_rate": 8e-05, "loss": 37.2345, "num_input_tokens_seen": 353341440, "step": 6858 }, { "epoch": 0.674283187145279, "grad_norm": 26.108625411987305, "learning_rate": 8e-05, "loss": 35.812, "num_input_tokens_seen": 353509616, "step": 6861 }, { "epoch": 0.6745780201960639, "grad_norm": 23.73592758178711, "learning_rate": 8e-05, "loss": 36.0127, "num_input_tokens_seen": 353648016, "step": 6864 }, { "epoch": 0.674872853246849, "grad_norm": 22.257362365722656, "learning_rate": 8e-05, "loss": 39.6556, "num_input_tokens_seen": 353797716, "step": 6867 }, { "epoch": 0.675167686297634, "grad_norm": 24.35578727722168, "learning_rate": 8e-05, "loss": 36.7126, "num_input_tokens_seen": 353957804, "step": 6870 }, { "epoch": 0.6754625193484189, "grad_norm": 26.841032028198242, "learning_rate": 8e-05, "loss": 35.9122, "num_input_tokens_seen": 354111364, "step": 6873 }, { "epoch": 0.675757352399204, "grad_norm": 33.47208023071289, "learning_rate": 8e-05, "loss": 40.0805, "num_input_tokens_seen": 354243644, "step": 6876 }, { "epoch": 0.6760521854499889, "grad_norm": 23.950185775756836, "learning_rate": 8e-05, "loss": 39.7134, "num_input_tokens_seen": 354412388, "step": 6879 }, { "epoch": 0.6763470185007739, "grad_norm": 23.655397415161133, "learning_rate": 8e-05, "loss": 39.7507, "num_input_tokens_seen": 354566740, "step": 6882 }, { "epoch": 0.676641851551559, "grad_norm": 25.052453994750977, "learning_rate": 8e-05, "loss": 40.9295, "num_input_tokens_seen": 354715756, "step": 6885 }, { "epoch": 0.6769366846023439, "grad_norm": 36.31843948364258, "learning_rate": 8e-05, "loss": 36.5385, "num_input_tokens_seen": 354867004, "step": 6888 }, { "epoch": 0.677231517653129, "grad_norm": 32.722171783447266, "learning_rate": 8e-05, "loss": 34.0506, "num_input_tokens_seen": 355026580, "step": 6891 }, { "epoch": 0.6775263507039139, "grad_norm": 45.20307922363281, "learning_rate": 8e-05, "loss": 35.6528, "num_input_tokens_seen": 355162704, "step": 6894 }, { "epoch": 0.6778211837546989, "grad_norm": 24.55483055114746, "learning_rate": 8e-05, "loss": 37.1783, "num_input_tokens_seen": 355307844, "step": 6897 }, { "epoch": 0.678116016805484, "grad_norm": 24.945140838623047, "learning_rate": 8e-05, "loss": 38.1234, "num_input_tokens_seen": 355464488, "step": 6900 }, { "epoch": 0.6784108498562689, "grad_norm": 31.348560333251953, "learning_rate": 8e-05, "loss": 35.7855, "num_input_tokens_seen": 355599336, "step": 6903 }, { "epoch": 0.6787056829070539, "grad_norm": 86.59671783447266, "learning_rate": 8e-05, "loss": 35.6197, "num_input_tokens_seen": 355755832, "step": 6906 }, { "epoch": 0.6790005159578388, "grad_norm": 28.258621215820312, "learning_rate": 8e-05, "loss": 32.1772, "num_input_tokens_seen": 355903376, "step": 6909 }, { "epoch": 0.6792953490086239, "grad_norm": 25.099407196044922, "learning_rate": 8e-05, "loss": 36.2638, "num_input_tokens_seen": 356063268, "step": 6912 }, { "epoch": 0.6795901820594089, "grad_norm": 23.317407608032227, "learning_rate": 8e-05, "loss": 36.8374, "num_input_tokens_seen": 356208332, "step": 6915 }, { "epoch": 0.6798850151101938, "grad_norm": 23.524396896362305, "learning_rate": 8e-05, "loss": 34.3438, "num_input_tokens_seen": 356350224, "step": 6918 }, { "epoch": 0.6801798481609789, "grad_norm": 40.0164794921875, "learning_rate": 8e-05, "loss": 35.6302, "num_input_tokens_seen": 356505032, "step": 6921 }, { "epoch": 0.6804746812117638, "grad_norm": 30.623050689697266, "learning_rate": 8e-05, "loss": 35.4187, "num_input_tokens_seen": 356672508, "step": 6924 }, { "epoch": 0.6807695142625488, "grad_norm": 25.530649185180664, "learning_rate": 8e-05, "loss": 40.2457, "num_input_tokens_seen": 356835012, "step": 6927 }, { "epoch": 0.6810643473133339, "grad_norm": 23.30413818359375, "learning_rate": 8e-05, "loss": 36.5556, "num_input_tokens_seen": 356974760, "step": 6930 }, { "epoch": 0.6813591803641188, "grad_norm": 26.084671020507812, "learning_rate": 8e-05, "loss": 38.8782, "num_input_tokens_seen": 357150712, "step": 6933 }, { "epoch": 0.6816540134149038, "grad_norm": 30.265125274658203, "learning_rate": 8e-05, "loss": 36.9992, "num_input_tokens_seen": 357317688, "step": 6936 }, { "epoch": 0.6819488464656888, "grad_norm": 26.261505126953125, "learning_rate": 8e-05, "loss": 39.1126, "num_input_tokens_seen": 357463828, "step": 6939 }, { "epoch": 0.6822436795164738, "grad_norm": 25.053955078125, "learning_rate": 8e-05, "loss": 34.9128, "num_input_tokens_seen": 357611476, "step": 6942 }, { "epoch": 0.6825385125672588, "grad_norm": 26.563329696655273, "learning_rate": 8e-05, "loss": 34.9007, "num_input_tokens_seen": 357749340, "step": 6945 }, { "epoch": 0.6828333456180438, "grad_norm": 23.786418914794922, "learning_rate": 8e-05, "loss": 36.6414, "num_input_tokens_seen": 357896284, "step": 6948 }, { "epoch": 0.6831281786688288, "grad_norm": 21.139434814453125, "learning_rate": 8e-05, "loss": 32.9007, "num_input_tokens_seen": 358048648, "step": 6951 }, { "epoch": 0.6834230117196137, "grad_norm": 28.922199249267578, "learning_rate": 8e-05, "loss": 39.5529, "num_input_tokens_seen": 358200792, "step": 6954 }, { "epoch": 0.6837178447703988, "grad_norm": 52.790138244628906, "learning_rate": 8e-05, "loss": 39.8963, "num_input_tokens_seen": 358374328, "step": 6957 }, { "epoch": 0.6840126778211838, "grad_norm": 38.69890594482422, "learning_rate": 8e-05, "loss": 34.4037, "num_input_tokens_seen": 358541504, "step": 6960 }, { "epoch": 0.6843075108719687, "grad_norm": 31.100345611572266, "learning_rate": 8e-05, "loss": 37.5846, "num_input_tokens_seen": 358692296, "step": 6963 }, { "epoch": 0.6846023439227538, "grad_norm": 33.161014556884766, "learning_rate": 8e-05, "loss": 35.0341, "num_input_tokens_seen": 358851288, "step": 6966 }, { "epoch": 0.6848971769735387, "grad_norm": 24.75032615661621, "learning_rate": 8e-05, "loss": 34.567, "num_input_tokens_seen": 359008992, "step": 6969 }, { "epoch": 0.6851920100243237, "grad_norm": 39.028160095214844, "learning_rate": 8e-05, "loss": 38.9036, "num_input_tokens_seen": 359165044, "step": 6972 }, { "epoch": 0.6854868430751088, "grad_norm": 23.45261001586914, "learning_rate": 8e-05, "loss": 37.1691, "num_input_tokens_seen": 359314900, "step": 6975 }, { "epoch": 0.6857816761258937, "grad_norm": 28.804195404052734, "learning_rate": 8e-05, "loss": 39.5592, "num_input_tokens_seen": 359479956, "step": 6978 }, { "epoch": 0.6860765091766787, "grad_norm": 44.11396408081055, "learning_rate": 8e-05, "loss": 37.8574, "num_input_tokens_seen": 359625208, "step": 6981 }, { "epoch": 0.6863713422274637, "grad_norm": 25.33087921142578, "learning_rate": 8e-05, "loss": 38.5327, "num_input_tokens_seen": 359761980, "step": 6984 }, { "epoch": 0.6866661752782487, "grad_norm": 28.133926391601562, "learning_rate": 8e-05, "loss": 37.843, "num_input_tokens_seen": 359908984, "step": 6987 }, { "epoch": 0.6869610083290337, "grad_norm": 26.1141300201416, "learning_rate": 8e-05, "loss": 40.021, "num_input_tokens_seen": 360079052, "step": 6990 }, { "epoch": 0.6872558413798187, "grad_norm": 28.791933059692383, "learning_rate": 8e-05, "loss": 34.2899, "num_input_tokens_seen": 360206192, "step": 6993 }, { "epoch": 0.6875506744306037, "grad_norm": 28.28318977355957, "learning_rate": 8e-05, "loss": 38.1626, "num_input_tokens_seen": 360341872, "step": 6996 }, { "epoch": 0.6878455074813886, "grad_norm": 23.083023071289062, "learning_rate": 8e-05, "loss": 36.5789, "num_input_tokens_seen": 360499676, "step": 6999 }, { "epoch": 0.6879437851649837, "eval_gen_len": 30.805, "eval_loss": 2.384366750717163, "eval_rouge1": 44.8011, "eval_rouge2": 28.0367, "eval_rougeL": 40.8555, "eval_rougeLsum": 41.1516, "eval_runtime": 93.8691, "eval_samples_per_second": 2.131, "eval_steps_per_second": 0.533, "num_input_tokens_seen": 360560352, "step": 7000 }, { "epoch": 0.6881403405321737, "grad_norm": 21.221134185791016, "learning_rate": 8e-05, "loss": 31.7868, "num_input_tokens_seen": 360664356, "step": 7002 }, { "epoch": 0.6884351735829587, "grad_norm": 28.533222198486328, "learning_rate": 8e-05, "loss": 39.2583, "num_input_tokens_seen": 360821316, "step": 7005 }, { "epoch": 0.6887300066337436, "grad_norm": 27.524171829223633, "learning_rate": 8e-05, "loss": 37.4306, "num_input_tokens_seen": 360971048, "step": 7008 }, { "epoch": 0.6890248396845287, "grad_norm": 28.358911514282227, "learning_rate": 8e-05, "loss": 35.6021, "num_input_tokens_seen": 361120852, "step": 7011 }, { "epoch": 0.6893196727353136, "grad_norm": 21.3004207611084, "learning_rate": 8e-05, "loss": 37.0142, "num_input_tokens_seen": 361290472, "step": 7014 }, { "epoch": 0.6896145057860986, "grad_norm": 26.657236099243164, "learning_rate": 8e-05, "loss": 38.8584, "num_input_tokens_seen": 361439848, "step": 7017 }, { "epoch": 0.6899093388368837, "grad_norm": 26.211191177368164, "learning_rate": 8e-05, "loss": 36.6387, "num_input_tokens_seen": 361582308, "step": 7020 }, { "epoch": 0.6902041718876686, "grad_norm": 28.06895637512207, "learning_rate": 8e-05, "loss": 38.8741, "num_input_tokens_seen": 361734312, "step": 7023 }, { "epoch": 0.6904990049384536, "grad_norm": 23.55493927001953, "learning_rate": 8e-05, "loss": 40.4776, "num_input_tokens_seen": 361888652, "step": 7026 }, { "epoch": 0.6907938379892385, "grad_norm": 23.64281463623047, "learning_rate": 8e-05, "loss": 34.3881, "num_input_tokens_seen": 362026428, "step": 7029 }, { "epoch": 0.6910886710400236, "grad_norm": 46.84697723388672, "learning_rate": 8e-05, "loss": 38.394, "num_input_tokens_seen": 362179384, "step": 7032 }, { "epoch": 0.6913835040908086, "grad_norm": 24.269325256347656, "learning_rate": 8e-05, "loss": 33.2829, "num_input_tokens_seen": 362321776, "step": 7035 }, { "epoch": 0.6916783371415935, "grad_norm": 39.650611877441406, "learning_rate": 8e-05, "loss": 36.2346, "num_input_tokens_seen": 362471104, "step": 7038 }, { "epoch": 0.6919731701923786, "grad_norm": 23.979177474975586, "learning_rate": 8e-05, "loss": 37.6138, "num_input_tokens_seen": 362605908, "step": 7041 }, { "epoch": 0.6922680032431635, "grad_norm": 40.64398956298828, "learning_rate": 8e-05, "loss": 38.7976, "num_input_tokens_seen": 362781236, "step": 7044 }, { "epoch": 0.6925628362939485, "grad_norm": 33.43219757080078, "learning_rate": 8e-05, "loss": 35.2467, "num_input_tokens_seen": 362947916, "step": 7047 }, { "epoch": 0.6928576693447336, "grad_norm": 26.71827507019043, "learning_rate": 8e-05, "loss": 39.5095, "num_input_tokens_seen": 363100592, "step": 7050 }, { "epoch": 0.6931525023955185, "grad_norm": 24.0466365814209, "learning_rate": 8e-05, "loss": 40.0637, "num_input_tokens_seen": 363263896, "step": 7053 }, { "epoch": 0.6934473354463035, "grad_norm": 29.3724308013916, "learning_rate": 8e-05, "loss": 36.4387, "num_input_tokens_seen": 363397720, "step": 7056 }, { "epoch": 0.6937421684970885, "grad_norm": 26.97890281677246, "learning_rate": 8e-05, "loss": 36.6981, "num_input_tokens_seen": 363549992, "step": 7059 }, { "epoch": 0.6940370015478735, "grad_norm": 25.043485641479492, "learning_rate": 8e-05, "loss": 37.4143, "num_input_tokens_seen": 363724944, "step": 7062 }, { "epoch": 0.6943318345986585, "grad_norm": 22.60007095336914, "learning_rate": 8e-05, "loss": 34.5618, "num_input_tokens_seen": 363885164, "step": 7065 }, { "epoch": 0.6946266676494435, "grad_norm": 25.03348159790039, "learning_rate": 8e-05, "loss": 37.147, "num_input_tokens_seen": 364053588, "step": 7068 }, { "epoch": 0.6949215007002285, "grad_norm": 24.149120330810547, "learning_rate": 8e-05, "loss": 36.2789, "num_input_tokens_seen": 364200544, "step": 7071 }, { "epoch": 0.6952163337510134, "grad_norm": 24.27806282043457, "learning_rate": 8e-05, "loss": 37.3367, "num_input_tokens_seen": 364365032, "step": 7074 }, { "epoch": 0.6955111668017985, "grad_norm": 27.3250732421875, "learning_rate": 8e-05, "loss": 38.3819, "num_input_tokens_seen": 364500012, "step": 7077 }, { "epoch": 0.6958059998525835, "grad_norm": 24.45925521850586, "learning_rate": 8e-05, "loss": 39.4937, "num_input_tokens_seen": 364660220, "step": 7080 }, { "epoch": 0.6961008329033684, "grad_norm": 32.768638610839844, "learning_rate": 8e-05, "loss": 38.4864, "num_input_tokens_seen": 364805288, "step": 7083 }, { "epoch": 0.6963956659541535, "grad_norm": 25.589962005615234, "learning_rate": 8e-05, "loss": 37.92, "num_input_tokens_seen": 364960252, "step": 7086 }, { "epoch": 0.6966904990049384, "grad_norm": 29.774845123291016, "learning_rate": 8e-05, "loss": 33.9415, "num_input_tokens_seen": 365121300, "step": 7089 }, { "epoch": 0.6969853320557234, "grad_norm": 24.776142120361328, "learning_rate": 8e-05, "loss": 37.4063, "num_input_tokens_seen": 365267452, "step": 7092 }, { "epoch": 0.6972801651065085, "grad_norm": 23.382896423339844, "learning_rate": 8e-05, "loss": 34.9722, "num_input_tokens_seen": 365402260, "step": 7095 }, { "epoch": 0.6975749981572934, "grad_norm": 24.839080810546875, "learning_rate": 8e-05, "loss": 35.6654, "num_input_tokens_seen": 365544004, "step": 7098 }, { "epoch": 0.6978698312080784, "grad_norm": 22.468021392822266, "learning_rate": 8e-05, "loss": 36.9169, "num_input_tokens_seen": 365711024, "step": 7101 }, { "epoch": 0.6981646642588634, "grad_norm": 55.25310134887695, "learning_rate": 8e-05, "loss": 38.9989, "num_input_tokens_seen": 365853052, "step": 7104 }, { "epoch": 0.6984594973096484, "grad_norm": 37.976470947265625, "learning_rate": 8e-05, "loss": 37.0344, "num_input_tokens_seen": 366006644, "step": 7107 }, { "epoch": 0.6987543303604334, "grad_norm": 25.312318801879883, "learning_rate": 8e-05, "loss": 37.1849, "num_input_tokens_seen": 366154148, "step": 7110 }, { "epoch": 0.6990491634112184, "grad_norm": 22.66741943359375, "learning_rate": 8e-05, "loss": 39.7188, "num_input_tokens_seen": 366303968, "step": 7113 }, { "epoch": 0.6993439964620034, "grad_norm": 26.739595413208008, "learning_rate": 8e-05, "loss": 39.6954, "num_input_tokens_seen": 366450852, "step": 7116 }, { "epoch": 0.6996388295127883, "grad_norm": 26.273880004882812, "learning_rate": 8e-05, "loss": 38.4002, "num_input_tokens_seen": 366619148, "step": 7119 }, { "epoch": 0.6999336625635734, "grad_norm": 28.57887840270996, "learning_rate": 8e-05, "loss": 35.176, "num_input_tokens_seen": 366776044, "step": 7122 }, { "epoch": 0.7002284956143584, "grad_norm": 34.28238296508789, "learning_rate": 8e-05, "loss": 39.8619, "num_input_tokens_seen": 366925244, "step": 7125 }, { "epoch": 0.7005233286651433, "grad_norm": 28.01417350769043, "learning_rate": 8e-05, "loss": 35.6729, "num_input_tokens_seen": 367080504, "step": 7128 }, { "epoch": 0.7008181617159284, "grad_norm": 22.694480895996094, "learning_rate": 8e-05, "loss": 37.7513, "num_input_tokens_seen": 367246724, "step": 7131 }, { "epoch": 0.7011129947667133, "grad_norm": 22.39271354675293, "learning_rate": 8e-05, "loss": 34.6424, "num_input_tokens_seen": 367386972, "step": 7134 }, { "epoch": 0.7014078278174983, "grad_norm": 24.380094528198242, "learning_rate": 8e-05, "loss": 34.6489, "num_input_tokens_seen": 367571520, "step": 7137 }, { "epoch": 0.7017026608682834, "grad_norm": 22.910329818725586, "learning_rate": 8e-05, "loss": 34.3629, "num_input_tokens_seen": 367711476, "step": 7140 }, { "epoch": 0.7019974939190683, "grad_norm": 26.820985794067383, "learning_rate": 8e-05, "loss": 38.2424, "num_input_tokens_seen": 367863076, "step": 7143 }, { "epoch": 0.7022923269698533, "grad_norm": 55.195499420166016, "learning_rate": 8e-05, "loss": 36.6898, "num_input_tokens_seen": 368022532, "step": 7146 }, { "epoch": 0.7025871600206384, "grad_norm": 23.07423210144043, "learning_rate": 8e-05, "loss": 39.7422, "num_input_tokens_seen": 368191684, "step": 7149 }, { "epoch": 0.7028819930714233, "grad_norm": 25.3260555267334, "learning_rate": 8e-05, "loss": 37.7671, "num_input_tokens_seen": 368350552, "step": 7152 }, { "epoch": 0.7031768261222083, "grad_norm": 27.317092895507812, "learning_rate": 8e-05, "loss": 36.9679, "num_input_tokens_seen": 368497796, "step": 7155 }, { "epoch": 0.7034716591729933, "grad_norm": 24.9680233001709, "learning_rate": 8e-05, "loss": 35.1555, "num_input_tokens_seen": 368641600, "step": 7158 }, { "epoch": 0.7037664922237783, "grad_norm": 23.467241287231445, "learning_rate": 8e-05, "loss": 36.8177, "num_input_tokens_seen": 368788680, "step": 7161 }, { "epoch": 0.7040613252745633, "grad_norm": 27.392885208129883, "learning_rate": 8e-05, "loss": 38.4683, "num_input_tokens_seen": 368962924, "step": 7164 }, { "epoch": 0.7043561583253483, "grad_norm": 25.242094039916992, "learning_rate": 8e-05, "loss": 36.1093, "num_input_tokens_seen": 369141044, "step": 7167 }, { "epoch": 0.7046509913761333, "grad_norm": 24.719947814941406, "learning_rate": 8e-05, "loss": 37.4259, "num_input_tokens_seen": 369280964, "step": 7170 }, { "epoch": 0.7049458244269182, "grad_norm": 22.53449058532715, "learning_rate": 8e-05, "loss": 34.1221, "num_input_tokens_seen": 369436492, "step": 7173 }, { "epoch": 0.7052406574777033, "grad_norm": 26.51321792602539, "learning_rate": 8e-05, "loss": 35.0682, "num_input_tokens_seen": 369583500, "step": 7176 }, { "epoch": 0.7055354905284883, "grad_norm": 21.050174713134766, "learning_rate": 8e-05, "loss": 35.6422, "num_input_tokens_seen": 369748328, "step": 7179 }, { "epoch": 0.7058303235792732, "grad_norm": 27.241004943847656, "learning_rate": 8e-05, "loss": 39.6832, "num_input_tokens_seen": 369907856, "step": 7182 }, { "epoch": 0.7061251566300583, "grad_norm": 25.388357162475586, "learning_rate": 8e-05, "loss": 41.2166, "num_input_tokens_seen": 370097256, "step": 7185 }, { "epoch": 0.7064199896808432, "grad_norm": 24.880773544311523, "learning_rate": 8e-05, "loss": 36.4923, "num_input_tokens_seen": 370255600, "step": 7188 }, { "epoch": 0.7067148227316282, "grad_norm": 28.246479034423828, "learning_rate": 8e-05, "loss": 35.7393, "num_input_tokens_seen": 370389872, "step": 7191 }, { "epoch": 0.7070096557824133, "grad_norm": 24.847553253173828, "learning_rate": 8e-05, "loss": 35.294, "num_input_tokens_seen": 370519424, "step": 7194 }, { "epoch": 0.7073044888331982, "grad_norm": 26.302785873413086, "learning_rate": 8e-05, "loss": 39.1979, "num_input_tokens_seen": 370689432, "step": 7197 }, { "epoch": 0.7075993218839832, "grad_norm": 21.54235076904297, "learning_rate": 8e-05, "loss": 35.4982, "num_input_tokens_seen": 370857272, "step": 7200 }, { "epoch": 0.7078941549347681, "grad_norm": 29.961332321166992, "learning_rate": 8e-05, "loss": 35.7793, "num_input_tokens_seen": 370983976, "step": 7203 }, { "epoch": 0.7081889879855532, "grad_norm": 24.172889709472656, "learning_rate": 8e-05, "loss": 41.917, "num_input_tokens_seen": 371140492, "step": 7206 }, { "epoch": 0.7084838210363382, "grad_norm": 25.67815399169922, "learning_rate": 8e-05, "loss": 41.9321, "num_input_tokens_seen": 371300236, "step": 7209 }, { "epoch": 0.7087786540871231, "grad_norm": 27.027847290039062, "learning_rate": 8e-05, "loss": 39.0412, "num_input_tokens_seen": 371459420, "step": 7212 }, { "epoch": 0.7090734871379082, "grad_norm": 180.06234741210938, "learning_rate": 8e-05, "loss": 38.4666, "num_input_tokens_seen": 371615372, "step": 7215 }, { "epoch": 0.7093683201886931, "grad_norm": 116.11947631835938, "learning_rate": 8e-05, "loss": 35.4061, "num_input_tokens_seen": 371772936, "step": 7218 }, { "epoch": 0.7096631532394782, "grad_norm": 25.75341796875, "learning_rate": 8e-05, "loss": 38.7179, "num_input_tokens_seen": 371928012, "step": 7221 }, { "epoch": 0.7099579862902632, "grad_norm": 22.445053100585938, "learning_rate": 8e-05, "loss": 35.927, "num_input_tokens_seen": 372086308, "step": 7224 }, { "epoch": 0.7102528193410481, "grad_norm": 29.624927520751953, "learning_rate": 8e-05, "loss": 37.9876, "num_input_tokens_seen": 372253944, "step": 7227 }, { "epoch": 0.7105476523918332, "grad_norm": 23.438127517700195, "learning_rate": 8e-05, "loss": 35.5025, "num_input_tokens_seen": 372408864, "step": 7230 }, { "epoch": 0.7108424854426181, "grad_norm": 29.223533630371094, "learning_rate": 8e-05, "loss": 32.6952, "num_input_tokens_seen": 372563228, "step": 7233 }, { "epoch": 0.7111373184934031, "grad_norm": 23.90278434753418, "learning_rate": 8e-05, "loss": 38.4112, "num_input_tokens_seen": 372716044, "step": 7236 }, { "epoch": 0.7114321515441882, "grad_norm": 24.182998657226562, "learning_rate": 8e-05, "loss": 33.5361, "num_input_tokens_seen": 372868828, "step": 7239 }, { "epoch": 0.7117269845949731, "grad_norm": 32.440555572509766, "learning_rate": 8e-05, "loss": 35.9163, "num_input_tokens_seen": 372999444, "step": 7242 }, { "epoch": 0.7120218176457581, "grad_norm": 47.51459884643555, "learning_rate": 8e-05, "loss": 35.3844, "num_input_tokens_seen": 373163628, "step": 7245 }, { "epoch": 0.712316650696543, "grad_norm": 31.754146575927734, "learning_rate": 8e-05, "loss": 38.3827, "num_input_tokens_seen": 373315648, "step": 7248 }, { "epoch": 0.7126114837473281, "grad_norm": 23.967334747314453, "learning_rate": 8e-05, "loss": 36.374, "num_input_tokens_seen": 373463044, "step": 7251 }, { "epoch": 0.7129063167981131, "grad_norm": 26.474748611450195, "learning_rate": 8e-05, "loss": 36.7464, "num_input_tokens_seen": 373622436, "step": 7254 }, { "epoch": 0.713201149848898, "grad_norm": 25.624454498291016, "learning_rate": 8e-05, "loss": 36.7132, "num_input_tokens_seen": 373771608, "step": 7257 }, { "epoch": 0.7134959828996831, "grad_norm": 21.67783546447754, "learning_rate": 8e-05, "loss": 35.3883, "num_input_tokens_seen": 373929268, "step": 7260 }, { "epoch": 0.713790815950468, "grad_norm": 31.39051628112793, "learning_rate": 8e-05, "loss": 38.0616, "num_input_tokens_seen": 374095272, "step": 7263 }, { "epoch": 0.714085649001253, "grad_norm": 23.98355484008789, "learning_rate": 8e-05, "loss": 35.5447, "num_input_tokens_seen": 374229768, "step": 7266 }, { "epoch": 0.7143804820520381, "grad_norm": 28.991931915283203, "learning_rate": 8e-05, "loss": 38.2876, "num_input_tokens_seen": 374376744, "step": 7269 }, { "epoch": 0.714675315102823, "grad_norm": 25.730976104736328, "learning_rate": 8e-05, "loss": 36.7238, "num_input_tokens_seen": 374525608, "step": 7272 }, { "epoch": 0.714970148153608, "grad_norm": 27.842273712158203, "learning_rate": 8e-05, "loss": 40.2203, "num_input_tokens_seen": 374684756, "step": 7275 }, { "epoch": 0.715264981204393, "grad_norm": 37.037139892578125, "learning_rate": 8e-05, "loss": 38.8265, "num_input_tokens_seen": 374826580, "step": 7278 }, { "epoch": 0.715559814255178, "grad_norm": 26.971187591552734, "learning_rate": 8e-05, "loss": 35.9391, "num_input_tokens_seen": 374989824, "step": 7281 }, { "epoch": 0.715854647305963, "grad_norm": 46.22418975830078, "learning_rate": 8e-05, "loss": 36.5632, "num_input_tokens_seen": 375159888, "step": 7284 }, { "epoch": 0.716149480356748, "grad_norm": 23.984878540039062, "learning_rate": 8e-05, "loss": 35.8848, "num_input_tokens_seen": 375315320, "step": 7287 }, { "epoch": 0.716444313407533, "grad_norm": 23.67920684814453, "learning_rate": 8e-05, "loss": 36.7169, "num_input_tokens_seen": 375480140, "step": 7290 }, { "epoch": 0.7167391464583179, "grad_norm": 24.245229721069336, "learning_rate": 8e-05, "loss": 36.4315, "num_input_tokens_seen": 375655892, "step": 7293 }, { "epoch": 0.717033979509103, "grad_norm": 24.02587127685547, "learning_rate": 8e-05, "loss": 37.9774, "num_input_tokens_seen": 375809988, "step": 7296 }, { "epoch": 0.717328812559888, "grad_norm": 26.92994499206543, "learning_rate": 8e-05, "loss": 40.8964, "num_input_tokens_seen": 375970568, "step": 7299 }, { "epoch": 0.7176236456106729, "grad_norm": 22.372398376464844, "learning_rate": 8e-05, "loss": 36.488, "num_input_tokens_seen": 376134096, "step": 7302 }, { "epoch": 0.717918478661458, "grad_norm": 23.640180587768555, "learning_rate": 8e-05, "loss": 33.9567, "num_input_tokens_seen": 376277268, "step": 7305 }, { "epoch": 0.7182133117122429, "grad_norm": 22.062767028808594, "learning_rate": 8e-05, "loss": 38.0409, "num_input_tokens_seen": 376454000, "step": 7308 }, { "epoch": 0.7185081447630279, "grad_norm": 22.81031608581543, "learning_rate": 8e-05, "loss": 36.8436, "num_input_tokens_seen": 376615928, "step": 7311 }, { "epoch": 0.718802977813813, "grad_norm": 32.10073471069336, "learning_rate": 8e-05, "loss": 38.9314, "num_input_tokens_seen": 376772032, "step": 7314 }, { "epoch": 0.7190978108645979, "grad_norm": 28.078523635864258, "learning_rate": 8e-05, "loss": 41.4978, "num_input_tokens_seen": 376893188, "step": 7317 }, { "epoch": 0.7193926439153829, "grad_norm": 43.02801513671875, "learning_rate": 8e-05, "loss": 38.946, "num_input_tokens_seen": 377049156, "step": 7320 }, { "epoch": 0.7196874769661679, "grad_norm": 34.06877517700195, "learning_rate": 8e-05, "loss": 40.1617, "num_input_tokens_seen": 377217384, "step": 7323 }, { "epoch": 0.7199823100169529, "grad_norm": 26.74899673461914, "learning_rate": 8e-05, "loss": 38.5652, "num_input_tokens_seen": 377380312, "step": 7326 }, { "epoch": 0.7202771430677379, "grad_norm": 33.939579010009766, "learning_rate": 8e-05, "loss": 37.3883, "num_input_tokens_seen": 377536968, "step": 7329 }, { "epoch": 0.7205719761185229, "grad_norm": 123.61614227294922, "learning_rate": 8e-05, "loss": 35.7296, "num_input_tokens_seen": 377700640, "step": 7332 }, { "epoch": 0.7208668091693079, "grad_norm": 24.66319465637207, "learning_rate": 8e-05, "loss": 35.3646, "num_input_tokens_seen": 377846188, "step": 7335 }, { "epoch": 0.7211616422200928, "grad_norm": 23.91818618774414, "learning_rate": 8e-05, "loss": 38.0516, "num_input_tokens_seen": 377998996, "step": 7338 }, { "epoch": 0.7214564752708779, "grad_norm": 27.984975814819336, "learning_rate": 8e-05, "loss": 36.3984, "num_input_tokens_seen": 378149336, "step": 7341 }, { "epoch": 0.7217513083216629, "grad_norm": 22.733352661132812, "learning_rate": 8e-05, "loss": 35.5865, "num_input_tokens_seen": 378306384, "step": 7344 }, { "epoch": 0.7220461413724478, "grad_norm": 46.85588836669922, "learning_rate": 8e-05, "loss": 34.2926, "num_input_tokens_seen": 378455268, "step": 7347 }, { "epoch": 0.7223409744232329, "grad_norm": 22.355670928955078, "learning_rate": 8e-05, "loss": 38.9565, "num_input_tokens_seen": 378615884, "step": 7350 }, { "epoch": 0.7226358074740178, "grad_norm": 23.133432388305664, "learning_rate": 8e-05, "loss": 39.7225, "num_input_tokens_seen": 378767252, "step": 7353 }, { "epoch": 0.7229306405248028, "grad_norm": 25.17456817626953, "learning_rate": 8e-05, "loss": 37.6457, "num_input_tokens_seen": 378915276, "step": 7356 }, { "epoch": 0.7232254735755879, "grad_norm": 22.047069549560547, "learning_rate": 8e-05, "loss": 38.0102, "num_input_tokens_seen": 379071112, "step": 7359 }, { "epoch": 0.7235203066263728, "grad_norm": 23.050128936767578, "learning_rate": 8e-05, "loss": 33.5328, "num_input_tokens_seen": 379239648, "step": 7362 }, { "epoch": 0.7238151396771578, "grad_norm": 22.13313865661621, "learning_rate": 8e-05, "loss": 35.4146, "num_input_tokens_seen": 379379252, "step": 7365 }, { "epoch": 0.7241099727279428, "grad_norm": 109.38043212890625, "learning_rate": 8e-05, "loss": 35.2421, "num_input_tokens_seen": 379521348, "step": 7368 }, { "epoch": 0.7244048057787278, "grad_norm": 28.971952438354492, "learning_rate": 8e-05, "loss": 37.7952, "num_input_tokens_seen": 379636528, "step": 7371 }, { "epoch": 0.7246996388295128, "grad_norm": 25.20716094970703, "learning_rate": 8e-05, "loss": 41.4213, "num_input_tokens_seen": 379788648, "step": 7374 }, { "epoch": 0.7249944718802978, "grad_norm": 24.693050384521484, "learning_rate": 8e-05, "loss": 40.8675, "num_input_tokens_seen": 379958864, "step": 7377 }, { "epoch": 0.7252893049310828, "grad_norm": 24.950502395629883, "learning_rate": 8e-05, "loss": 36.5391, "num_input_tokens_seen": 380118128, "step": 7380 }, { "epoch": 0.7255841379818677, "grad_norm": 44.643550872802734, "learning_rate": 8e-05, "loss": 37.2319, "num_input_tokens_seen": 380272692, "step": 7383 }, { "epoch": 0.7258789710326528, "grad_norm": 20.62811279296875, "learning_rate": 8e-05, "loss": 35.3777, "num_input_tokens_seen": 380421916, "step": 7386 }, { "epoch": 0.7261738040834378, "grad_norm": 29.82221221923828, "learning_rate": 8e-05, "loss": 40.0476, "num_input_tokens_seen": 380595664, "step": 7389 }, { "epoch": 0.7264686371342227, "grad_norm": 24.89944839477539, "learning_rate": 8e-05, "loss": 36.1097, "num_input_tokens_seen": 380733088, "step": 7392 }, { "epoch": 0.7267634701850078, "grad_norm": 26.290952682495117, "learning_rate": 8e-05, "loss": 38.7241, "num_input_tokens_seen": 380878792, "step": 7395 }, { "epoch": 0.7270583032357927, "grad_norm": 24.127761840820312, "learning_rate": 8e-05, "loss": 36.7168, "num_input_tokens_seen": 381028172, "step": 7398 }, { "epoch": 0.7273531362865777, "grad_norm": 24.939001083374023, "learning_rate": 8e-05, "loss": 40.1282, "num_input_tokens_seen": 381186648, "step": 7401 }, { "epoch": 0.7276479693373628, "grad_norm": 23.647138595581055, "learning_rate": 8e-05, "loss": 34.5384, "num_input_tokens_seen": 381351084, "step": 7404 }, { "epoch": 0.7279428023881477, "grad_norm": 31.73210906982422, "learning_rate": 8e-05, "loss": 39.659, "num_input_tokens_seen": 381497584, "step": 7407 }, { "epoch": 0.7282376354389327, "grad_norm": 23.531461715698242, "learning_rate": 8e-05, "loss": 40.1637, "num_input_tokens_seen": 381669016, "step": 7410 }, { "epoch": 0.7285324684897176, "grad_norm": 23.627595901489258, "learning_rate": 8e-05, "loss": 37.0926, "num_input_tokens_seen": 381820036, "step": 7413 }, { "epoch": 0.7288273015405027, "grad_norm": 27.529569625854492, "learning_rate": 8e-05, "loss": 40.0547, "num_input_tokens_seen": 381981184, "step": 7416 }, { "epoch": 0.7291221345912877, "grad_norm": 22.73372459411621, "learning_rate": 8e-05, "loss": 38.2892, "num_input_tokens_seen": 382138552, "step": 7419 }, { "epoch": 0.7294169676420726, "grad_norm": 28.180679321289062, "learning_rate": 8e-05, "loss": 31.7725, "num_input_tokens_seen": 382298084, "step": 7422 }, { "epoch": 0.7297118006928577, "grad_norm": 34.909271240234375, "learning_rate": 8e-05, "loss": 36.6046, "num_input_tokens_seen": 382461200, "step": 7425 }, { "epoch": 0.7300066337436426, "grad_norm": 25.757848739624023, "learning_rate": 8e-05, "loss": 35.9155, "num_input_tokens_seen": 382609888, "step": 7428 }, { "epoch": 0.7303014667944276, "grad_norm": 27.34947395324707, "learning_rate": 8e-05, "loss": 37.7583, "num_input_tokens_seen": 382776656, "step": 7431 }, { "epoch": 0.7305962998452127, "grad_norm": 1279.990234375, "learning_rate": 8e-05, "loss": 37.3187, "num_input_tokens_seen": 382937396, "step": 7434 }, { "epoch": 0.7308911328959976, "grad_norm": 21.36997413635254, "learning_rate": 8e-05, "loss": 34.1862, "num_input_tokens_seen": 383099080, "step": 7437 }, { "epoch": 0.7311859659467826, "grad_norm": 23.636608123779297, "learning_rate": 8e-05, "loss": 35.8693, "num_input_tokens_seen": 383258328, "step": 7440 }, { "epoch": 0.7314807989975677, "grad_norm": 27.117843627929688, "learning_rate": 8e-05, "loss": 33.6174, "num_input_tokens_seen": 383413472, "step": 7443 }, { "epoch": 0.7317756320483526, "grad_norm": 23.628999710083008, "learning_rate": 8e-05, "loss": 34.2789, "num_input_tokens_seen": 383569168, "step": 7446 }, { "epoch": 0.7320704650991376, "grad_norm": 28.383270263671875, "learning_rate": 8e-05, "loss": 34.7162, "num_input_tokens_seen": 383741804, "step": 7449 }, { "epoch": 0.7323652981499226, "grad_norm": 22.602317810058594, "learning_rate": 8e-05, "loss": 36.2921, "num_input_tokens_seen": 383906228, "step": 7452 }, { "epoch": 0.7326601312007076, "grad_norm": 34.03465270996094, "learning_rate": 8e-05, "loss": 35.5612, "num_input_tokens_seen": 384033912, "step": 7455 }, { "epoch": 0.7329549642514926, "grad_norm": 25.446636199951172, "learning_rate": 8e-05, "loss": 36.915, "num_input_tokens_seen": 384194144, "step": 7458 }, { "epoch": 0.7332497973022776, "grad_norm": 23.60262680053711, "learning_rate": 8e-05, "loss": 38.7143, "num_input_tokens_seen": 384353784, "step": 7461 }, { "epoch": 0.7335446303530626, "grad_norm": 22.44038200378418, "learning_rate": 8e-05, "loss": 35.0795, "num_input_tokens_seen": 384487928, "step": 7464 }, { "epoch": 0.7338394634038475, "grad_norm": 23.007976531982422, "learning_rate": 8e-05, "loss": 39.2289, "num_input_tokens_seen": 384646676, "step": 7467 }, { "epoch": 0.7341342964546326, "grad_norm": 25.747835159301758, "learning_rate": 8e-05, "loss": 38.7275, "num_input_tokens_seen": 384803580, "step": 7470 }, { "epoch": 0.7344291295054176, "grad_norm": 34.949222564697266, "learning_rate": 8e-05, "loss": 38.5794, "num_input_tokens_seen": 384950216, "step": 7473 }, { "epoch": 0.7347239625562025, "grad_norm": 25.456125259399414, "learning_rate": 8e-05, "loss": 36.9796, "num_input_tokens_seen": 385096200, "step": 7476 }, { "epoch": 0.7350187956069876, "grad_norm": 22.768922805786133, "learning_rate": 8e-05, "loss": 35.3669, "num_input_tokens_seen": 385247424, "step": 7479 }, { "epoch": 0.7353136286577725, "grad_norm": 25.837984085083008, "learning_rate": 8e-05, "loss": 34.7811, "num_input_tokens_seen": 385401552, "step": 7482 }, { "epoch": 0.7356084617085575, "grad_norm": 28.84712791442871, "learning_rate": 8e-05, "loss": 36.131, "num_input_tokens_seen": 385575468, "step": 7485 }, { "epoch": 0.7359032947593426, "grad_norm": 27.865156173706055, "learning_rate": 8e-05, "loss": 40.8759, "num_input_tokens_seen": 385722096, "step": 7488 }, { "epoch": 0.7361981278101275, "grad_norm": 25.10171890258789, "learning_rate": 8e-05, "loss": 34.8859, "num_input_tokens_seen": 385912764, "step": 7491 }, { "epoch": 0.7364929608609125, "grad_norm": 27.673572540283203, "learning_rate": 8e-05, "loss": 36.1053, "num_input_tokens_seen": 386058424, "step": 7494 }, { "epoch": 0.7367877939116975, "grad_norm": 24.04930305480957, "learning_rate": 8e-05, "loss": 38.4439, "num_input_tokens_seen": 386203764, "step": 7497 }, { "epoch": 0.7370826269624825, "grad_norm": 24.450237274169922, "learning_rate": 8e-05, "loss": 37.3987, "num_input_tokens_seen": 386386776, "step": 7500 }, { "epoch": 0.7373774600132675, "grad_norm": 22.757221221923828, "learning_rate": 8e-05, "loss": 37.2368, "num_input_tokens_seen": 386534316, "step": 7503 }, { "epoch": 0.7376722930640525, "grad_norm": 23.33014488220215, "learning_rate": 8e-05, "loss": 32.5769, "num_input_tokens_seen": 386681532, "step": 7506 }, { "epoch": 0.7379671261148375, "grad_norm": 29.790695190429688, "learning_rate": 8e-05, "loss": 41.2972, "num_input_tokens_seen": 386835200, "step": 7509 }, { "epoch": 0.7382619591656224, "grad_norm": 23.4545841217041, "learning_rate": 8e-05, "loss": 34.3512, "num_input_tokens_seen": 386983248, "step": 7512 }, { "epoch": 0.7385567922164075, "grad_norm": 24.699615478515625, "learning_rate": 8e-05, "loss": 35.651, "num_input_tokens_seen": 387119612, "step": 7515 }, { "epoch": 0.7388516252671925, "grad_norm": 23.103931427001953, "learning_rate": 8e-05, "loss": 37.3486, "num_input_tokens_seen": 387275496, "step": 7518 }, { "epoch": 0.7391464583179774, "grad_norm": 22.726150512695312, "learning_rate": 8e-05, "loss": 35.1715, "num_input_tokens_seen": 387428384, "step": 7521 }, { "epoch": 0.7394412913687625, "grad_norm": 22.575422286987305, "learning_rate": 8e-05, "loss": 36.0418, "num_input_tokens_seen": 387570884, "step": 7524 }, { "epoch": 0.7397361244195474, "grad_norm": 28.128877639770508, "learning_rate": 8e-05, "loss": 34.6036, "num_input_tokens_seen": 387737660, "step": 7527 }, { "epoch": 0.7400309574703324, "grad_norm": 25.145950317382812, "learning_rate": 8e-05, "loss": 31.6659, "num_input_tokens_seen": 387893116, "step": 7530 }, { "epoch": 0.7403257905211175, "grad_norm": 27.49928092956543, "learning_rate": 8e-05, "loss": 35.0847, "num_input_tokens_seen": 388043176, "step": 7533 }, { "epoch": 0.7406206235719024, "grad_norm": 27.566316604614258, "learning_rate": 8e-05, "loss": 37.3643, "num_input_tokens_seen": 388189188, "step": 7536 }, { "epoch": 0.7409154566226874, "grad_norm": 25.077884674072266, "learning_rate": 8e-05, "loss": 38.4539, "num_input_tokens_seen": 388340236, "step": 7539 }, { "epoch": 0.7412102896734724, "grad_norm": 26.655046463012695, "learning_rate": 8e-05, "loss": 31.6471, "num_input_tokens_seen": 388481968, "step": 7542 }, { "epoch": 0.7415051227242574, "grad_norm": 23.787466049194336, "learning_rate": 8e-05, "loss": 39.579, "num_input_tokens_seen": 388656976, "step": 7545 }, { "epoch": 0.7417999557750424, "grad_norm": 25.586387634277344, "learning_rate": 8e-05, "loss": 30.4443, "num_input_tokens_seen": 388807476, "step": 7548 }, { "epoch": 0.7420947888258274, "grad_norm": 27.737192153930664, "learning_rate": 8e-05, "loss": 34.7925, "num_input_tokens_seen": 388959276, "step": 7551 }, { "epoch": 0.7423896218766124, "grad_norm": 33.840476989746094, "learning_rate": 8e-05, "loss": 38.1397, "num_input_tokens_seen": 389100868, "step": 7554 }, { "epoch": 0.7426844549273973, "grad_norm": 24.45082664489746, "learning_rate": 8e-05, "loss": 36.8545, "num_input_tokens_seen": 389255556, "step": 7557 }, { "epoch": 0.7429792879781824, "grad_norm": 27.7251033782959, "learning_rate": 8e-05, "loss": 36.6671, "num_input_tokens_seen": 389404368, "step": 7560 }, { "epoch": 0.7432741210289674, "grad_norm": 25.771398544311523, "learning_rate": 8e-05, "loss": 34.4683, "num_input_tokens_seen": 389579920, "step": 7563 }, { "epoch": 0.7435689540797523, "grad_norm": 24.235137939453125, "learning_rate": 8e-05, "loss": 35.3631, "num_input_tokens_seen": 389748364, "step": 7566 }, { "epoch": 0.7438637871305374, "grad_norm": 22.721860885620117, "learning_rate": 8e-05, "loss": 34.7737, "num_input_tokens_seen": 389891996, "step": 7569 }, { "epoch": 0.7441586201813223, "grad_norm": 24.083951950073242, "learning_rate": 8e-05, "loss": 35.6209, "num_input_tokens_seen": 390052128, "step": 7572 }, { "epoch": 0.7444534532321073, "grad_norm": 28.595867156982422, "learning_rate": 8e-05, "loss": 34.9005, "num_input_tokens_seen": 390217948, "step": 7575 }, { "epoch": 0.7447482862828924, "grad_norm": 23.231950759887695, "learning_rate": 8e-05, "loss": 34.9005, "num_input_tokens_seen": 390363036, "step": 7578 }, { "epoch": 0.7450431193336773, "grad_norm": 152.58456420898438, "learning_rate": 8e-05, "loss": 32.4912, "num_input_tokens_seen": 390528868, "step": 7581 }, { "epoch": 0.7453379523844623, "grad_norm": 29.192127227783203, "learning_rate": 8e-05, "loss": 33.675, "num_input_tokens_seen": 390681416, "step": 7584 }, { "epoch": 0.7456327854352472, "grad_norm": 26.711700439453125, "learning_rate": 8e-05, "loss": 37.3324, "num_input_tokens_seen": 390834660, "step": 7587 }, { "epoch": 0.7459276184860323, "grad_norm": 31.674036026000977, "learning_rate": 8e-05, "loss": 36.7416, "num_input_tokens_seen": 390990760, "step": 7590 }, { "epoch": 0.7462224515368173, "grad_norm": 59.84733963012695, "learning_rate": 8e-05, "loss": 35.4196, "num_input_tokens_seen": 391138464, "step": 7593 }, { "epoch": 0.7465172845876022, "grad_norm": 33.0748405456543, "learning_rate": 8e-05, "loss": 35.9725, "num_input_tokens_seen": 391293492, "step": 7596 }, { "epoch": 0.7468121176383873, "grad_norm": 34.8997688293457, "learning_rate": 8e-05, "loss": 36.4004, "num_input_tokens_seen": 391473388, "step": 7599 }, { "epoch": 0.7471069506891722, "grad_norm": 27.16301918029785, "learning_rate": 8e-05, "loss": 36.9327, "num_input_tokens_seen": 391645156, "step": 7602 }, { "epoch": 0.7474017837399572, "grad_norm": 30.942466735839844, "learning_rate": 8e-05, "loss": 33.8328, "num_input_tokens_seen": 391786612, "step": 7605 }, { "epoch": 0.7476966167907423, "grad_norm": 35.61334991455078, "learning_rate": 8e-05, "loss": 38.7541, "num_input_tokens_seen": 391931904, "step": 7608 }, { "epoch": 0.7479914498415272, "grad_norm": 105.30634307861328, "learning_rate": 8e-05, "loss": 39.6637, "num_input_tokens_seen": 392086016, "step": 7611 }, { "epoch": 0.7482862828923122, "grad_norm": 49.63365936279297, "learning_rate": 8e-05, "loss": 34.3093, "num_input_tokens_seen": 392214580, "step": 7614 }, { "epoch": 0.7485811159430972, "grad_norm": 25.027118682861328, "learning_rate": 8e-05, "loss": 34.7893, "num_input_tokens_seen": 392371752, "step": 7617 }, { "epoch": 0.7488759489938822, "grad_norm": 27.529848098754883, "learning_rate": 8e-05, "loss": 34.8015, "num_input_tokens_seen": 392525552, "step": 7620 }, { "epoch": 0.7491707820446672, "grad_norm": 26.25337791442871, "learning_rate": 8e-05, "loss": 36.5017, "num_input_tokens_seen": 392649924, "step": 7623 }, { "epoch": 0.7494656150954522, "grad_norm": 52.97135925292969, "learning_rate": 8e-05, "loss": 40.43, "num_input_tokens_seen": 392801924, "step": 7626 }, { "epoch": 0.7497604481462372, "grad_norm": 21.912799835205078, "learning_rate": 8e-05, "loss": 34.4445, "num_input_tokens_seen": 392964832, "step": 7629 }, { "epoch": 0.7500552811970221, "grad_norm": 23.80307960510254, "learning_rate": 8e-05, "loss": 36.7419, "num_input_tokens_seen": 393123684, "step": 7632 }, { "epoch": 0.7503501142478072, "grad_norm": 23.64950180053711, "learning_rate": 8e-05, "loss": 35.1361, "num_input_tokens_seen": 393286600, "step": 7635 }, { "epoch": 0.7506449472985922, "grad_norm": 24.831253051757812, "learning_rate": 8e-05, "loss": 34.1056, "num_input_tokens_seen": 393443688, "step": 7638 }, { "epoch": 0.7509397803493771, "grad_norm": 23.34168815612793, "learning_rate": 8e-05, "loss": 36.7828, "num_input_tokens_seen": 393601180, "step": 7641 }, { "epoch": 0.7512346134001622, "grad_norm": 30.1611328125, "learning_rate": 8e-05, "loss": 33.5108, "num_input_tokens_seen": 393738140, "step": 7644 }, { "epoch": 0.7515294464509471, "grad_norm": 27.13521385192871, "learning_rate": 8e-05, "loss": 40.2198, "num_input_tokens_seen": 393886392, "step": 7647 }, { "epoch": 0.7518242795017321, "grad_norm": 27.22330093383789, "learning_rate": 8e-05, "loss": 37.5227, "num_input_tokens_seen": 394037900, "step": 7650 }, { "epoch": 0.7521191125525172, "grad_norm": 21.45113182067871, "learning_rate": 8e-05, "loss": 37.1464, "num_input_tokens_seen": 394194376, "step": 7653 }, { "epoch": 0.7524139456033021, "grad_norm": 20.75296974182129, "learning_rate": 8e-05, "loss": 31.1379, "num_input_tokens_seen": 394340592, "step": 7656 }, { "epoch": 0.7527087786540871, "grad_norm": 25.353015899658203, "learning_rate": 8e-05, "loss": 32.2977, "num_input_tokens_seen": 394499732, "step": 7659 }, { "epoch": 0.7530036117048721, "grad_norm": 23.255964279174805, "learning_rate": 8e-05, "loss": 38.8994, "num_input_tokens_seen": 394655136, "step": 7662 }, { "epoch": 0.7532984447556571, "grad_norm": 26.22134017944336, "learning_rate": 8e-05, "loss": 40.9635, "num_input_tokens_seen": 394797248, "step": 7665 }, { "epoch": 0.7535932778064421, "grad_norm": 23.839248657226562, "learning_rate": 8e-05, "loss": 35.2768, "num_input_tokens_seen": 394952920, "step": 7668 }, { "epoch": 0.7538881108572271, "grad_norm": 26.770469665527344, "learning_rate": 8e-05, "loss": 34.1852, "num_input_tokens_seen": 395082960, "step": 7671 }, { "epoch": 0.7541829439080121, "grad_norm": 26.16197967529297, "learning_rate": 8e-05, "loss": 39.2824, "num_input_tokens_seen": 395222292, "step": 7674 }, { "epoch": 0.754477776958797, "grad_norm": 22.692888259887695, "learning_rate": 8e-05, "loss": 35.6573, "num_input_tokens_seen": 395378916, "step": 7677 }, { "epoch": 0.7547726100095821, "grad_norm": 29.958852767944336, "learning_rate": 8e-05, "loss": 37.3705, "num_input_tokens_seen": 395524172, "step": 7680 }, { "epoch": 0.7550674430603671, "grad_norm": 25.045122146606445, "learning_rate": 8e-05, "loss": 37.268, "num_input_tokens_seen": 395680420, "step": 7683 }, { "epoch": 0.755362276111152, "grad_norm": 21.826213836669922, "learning_rate": 8e-05, "loss": 33.6373, "num_input_tokens_seen": 395855216, "step": 7686 }, { "epoch": 0.7556571091619371, "grad_norm": 27.030338287353516, "learning_rate": 8e-05, "loss": 35.9748, "num_input_tokens_seen": 396020044, "step": 7689 }, { "epoch": 0.755951942212722, "grad_norm": 32.4162483215332, "learning_rate": 8e-05, "loss": 37.4893, "num_input_tokens_seen": 396178644, "step": 7692 }, { "epoch": 0.756246775263507, "grad_norm": 39.71582794189453, "learning_rate": 8e-05, "loss": 34.4656, "num_input_tokens_seen": 396337480, "step": 7695 }, { "epoch": 0.7565416083142921, "grad_norm": 39.37307357788086, "learning_rate": 8e-05, "loss": 37.185, "num_input_tokens_seen": 396513736, "step": 7698 }, { "epoch": 0.756836441365077, "grad_norm": 26.994949340820312, "learning_rate": 8e-05, "loss": 36.2526, "num_input_tokens_seen": 396694672, "step": 7701 }, { "epoch": 0.757131274415862, "grad_norm": 25.13683319091797, "learning_rate": 8e-05, "loss": 33.1264, "num_input_tokens_seen": 396859188, "step": 7704 }, { "epoch": 0.757426107466647, "grad_norm": 28.42525291442871, "learning_rate": 8e-05, "loss": 36.3175, "num_input_tokens_seen": 396999932, "step": 7707 }, { "epoch": 0.757720940517432, "grad_norm": 25.426481246948242, "learning_rate": 8e-05, "loss": 35.1644, "num_input_tokens_seen": 397149508, "step": 7710 }, { "epoch": 0.758015773568217, "grad_norm": 25.66521644592285, "learning_rate": 8e-05, "loss": 33.9718, "num_input_tokens_seen": 397309256, "step": 7713 }, { "epoch": 0.758310606619002, "grad_norm": 25.486326217651367, "learning_rate": 8e-05, "loss": 36.1877, "num_input_tokens_seen": 397470364, "step": 7716 }, { "epoch": 0.758605439669787, "grad_norm": 25.189796447753906, "learning_rate": 8e-05, "loss": 33.6886, "num_input_tokens_seen": 397634620, "step": 7719 }, { "epoch": 0.7589002727205719, "grad_norm": 48.64822769165039, "learning_rate": 8e-05, "loss": 38.49, "num_input_tokens_seen": 397795032, "step": 7722 }, { "epoch": 0.759195105771357, "grad_norm": 29.638479232788086, "learning_rate": 8e-05, "loss": 37.9385, "num_input_tokens_seen": 397962232, "step": 7725 }, { "epoch": 0.759489938822142, "grad_norm": 32.13875961303711, "learning_rate": 8e-05, "loss": 35.1371, "num_input_tokens_seen": 398092100, "step": 7728 }, { "epoch": 0.7597847718729269, "grad_norm": 34.010276794433594, "learning_rate": 8e-05, "loss": 35.8284, "num_input_tokens_seen": 398269600, "step": 7731 }, { "epoch": 0.760079604923712, "grad_norm": 27.5978946685791, "learning_rate": 8e-05, "loss": 39.5413, "num_input_tokens_seen": 398418832, "step": 7734 }, { "epoch": 0.7603744379744969, "grad_norm": 28.058717727661133, "learning_rate": 8e-05, "loss": 38.8165, "num_input_tokens_seen": 398581128, "step": 7737 }, { "epoch": 0.7606692710252819, "grad_norm": 22.413394927978516, "learning_rate": 8e-05, "loss": 36.9416, "num_input_tokens_seen": 398735244, "step": 7740 }, { "epoch": 0.760964104076067, "grad_norm": 22.70956802368164, "learning_rate": 8e-05, "loss": 33.7998, "num_input_tokens_seen": 398882056, "step": 7743 }, { "epoch": 0.7612589371268519, "grad_norm": 34.98361587524414, "learning_rate": 8e-05, "loss": 37.2277, "num_input_tokens_seen": 399034480, "step": 7746 }, { "epoch": 0.7615537701776369, "grad_norm": 21.232566833496094, "learning_rate": 8e-05, "loss": 37.4406, "num_input_tokens_seen": 399183192, "step": 7749 }, { "epoch": 0.761848603228422, "grad_norm": 23.489994049072266, "learning_rate": 8e-05, "loss": 37.4302, "num_input_tokens_seen": 399360340, "step": 7752 }, { "epoch": 0.7621434362792069, "grad_norm": 29.505022048950195, "learning_rate": 8e-05, "loss": 35.7313, "num_input_tokens_seen": 399537496, "step": 7755 }, { "epoch": 0.7624382693299919, "grad_norm": 22.35037612915039, "learning_rate": 8e-05, "loss": 36.3991, "num_input_tokens_seen": 399697048, "step": 7758 }, { "epoch": 0.7627331023807769, "grad_norm": 22.932846069335938, "learning_rate": 8e-05, "loss": 41.2068, "num_input_tokens_seen": 399855616, "step": 7761 }, { "epoch": 0.7630279354315619, "grad_norm": 26.854496002197266, "learning_rate": 8e-05, "loss": 32.3901, "num_input_tokens_seen": 400006544, "step": 7764 }, { "epoch": 0.7633227684823469, "grad_norm": 26.923646926879883, "learning_rate": 8e-05, "loss": 39.4342, "num_input_tokens_seen": 400169056, "step": 7767 }, { "epoch": 0.7636176015331319, "grad_norm": 24.59294891357422, "learning_rate": 8e-05, "loss": 35.3785, "num_input_tokens_seen": 400324716, "step": 7770 }, { "epoch": 0.7639124345839169, "grad_norm": 44.60205841064453, "learning_rate": 8e-05, "loss": 36.1143, "num_input_tokens_seen": 400469124, "step": 7773 }, { "epoch": 0.7642072676347018, "grad_norm": 30.42624855041504, "learning_rate": 8e-05, "loss": 37.0372, "num_input_tokens_seen": 400632716, "step": 7776 }, { "epoch": 0.7645021006854869, "grad_norm": 23.208675384521484, "learning_rate": 8e-05, "loss": 32.5705, "num_input_tokens_seen": 400798540, "step": 7779 }, { "epoch": 0.7647969337362719, "grad_norm": 29.40182113647461, "learning_rate": 8e-05, "loss": 42.2359, "num_input_tokens_seen": 400970840, "step": 7782 }, { "epoch": 0.7650917667870568, "grad_norm": 22.822834014892578, "learning_rate": 8e-05, "loss": 35.2608, "num_input_tokens_seen": 401137072, "step": 7785 }, { "epoch": 0.7653865998378419, "grad_norm": 22.621362686157227, "learning_rate": 8e-05, "loss": 35.9634, "num_input_tokens_seen": 401297236, "step": 7788 }, { "epoch": 0.7656814328886268, "grad_norm": 23.71027183532715, "learning_rate": 8e-05, "loss": 37.8802, "num_input_tokens_seen": 401453132, "step": 7791 }, { "epoch": 0.7659762659394118, "grad_norm": 21.791362762451172, "learning_rate": 8e-05, "loss": 36.5754, "num_input_tokens_seen": 401615016, "step": 7794 }, { "epoch": 0.7662710989901969, "grad_norm": 22.979232788085938, "learning_rate": 8e-05, "loss": 37.7297, "num_input_tokens_seen": 401752296, "step": 7797 }, { "epoch": 0.7665659320409818, "grad_norm": 28.699113845825195, "learning_rate": 8e-05, "loss": 34.315, "num_input_tokens_seen": 401908632, "step": 7800 }, { "epoch": 0.7668607650917668, "grad_norm": 25.213712692260742, "learning_rate": 8e-05, "loss": 35.3176, "num_input_tokens_seen": 402057280, "step": 7803 }, { "epoch": 0.7671555981425517, "grad_norm": 23.474750518798828, "learning_rate": 8e-05, "loss": 38.3909, "num_input_tokens_seen": 402225972, "step": 7806 }, { "epoch": 0.7674504311933368, "grad_norm": 32.31821060180664, "learning_rate": 8e-05, "loss": 39.6794, "num_input_tokens_seen": 402383744, "step": 7809 }, { "epoch": 0.7677452642441218, "grad_norm": 26.248733520507812, "learning_rate": 8e-05, "loss": 35.504, "num_input_tokens_seen": 402537292, "step": 7812 }, { "epoch": 0.7680400972949067, "grad_norm": 31.621732711791992, "learning_rate": 8e-05, "loss": 33.5155, "num_input_tokens_seen": 402678688, "step": 7815 }, { "epoch": 0.7683349303456918, "grad_norm": 22.54973793029785, "learning_rate": 8e-05, "loss": 35.1997, "num_input_tokens_seen": 402824144, "step": 7818 }, { "epoch": 0.7686297633964767, "grad_norm": 23.661958694458008, "learning_rate": 8e-05, "loss": 39.7663, "num_input_tokens_seen": 402990972, "step": 7821 }, { "epoch": 0.7689245964472617, "grad_norm": 48.45589828491211, "learning_rate": 8e-05, "loss": 35.8389, "num_input_tokens_seen": 403146060, "step": 7824 }, { "epoch": 0.7692194294980468, "grad_norm": 28.95506477355957, "learning_rate": 8e-05, "loss": 38.4829, "num_input_tokens_seen": 403293764, "step": 7827 }, { "epoch": 0.7695142625488317, "grad_norm": 24.518653869628906, "learning_rate": 8e-05, "loss": 36.5637, "num_input_tokens_seen": 403467032, "step": 7830 }, { "epoch": 0.7698090955996167, "grad_norm": 187.11962890625, "learning_rate": 8e-05, "loss": 35.7274, "num_input_tokens_seen": 403621044, "step": 7833 }, { "epoch": 0.7701039286504017, "grad_norm": 29.056535720825195, "learning_rate": 8e-05, "loss": 34.5355, "num_input_tokens_seen": 403776084, "step": 7836 }, { "epoch": 0.7703987617011867, "grad_norm": 25.228620529174805, "learning_rate": 8e-05, "loss": 36.6865, "num_input_tokens_seen": 403929324, "step": 7839 }, { "epoch": 0.7706935947519717, "grad_norm": 29.44222640991211, "learning_rate": 8e-05, "loss": 36.8643, "num_input_tokens_seen": 404084380, "step": 7842 }, { "epoch": 0.7709884278027567, "grad_norm": 27.131668090820312, "learning_rate": 8e-05, "loss": 37.6504, "num_input_tokens_seen": 404237284, "step": 7845 }, { "epoch": 0.7712832608535417, "grad_norm": 268.1426086425781, "learning_rate": 8e-05, "loss": 33.8298, "num_input_tokens_seen": 404377928, "step": 7848 }, { "epoch": 0.7715780939043266, "grad_norm": 24.818805694580078, "learning_rate": 8e-05, "loss": 39.3884, "num_input_tokens_seen": 404542604, "step": 7851 }, { "epoch": 0.7718729269551117, "grad_norm": 29.92938995361328, "learning_rate": 8e-05, "loss": 34.5737, "num_input_tokens_seen": 404701704, "step": 7854 }, { "epoch": 0.7721677600058967, "grad_norm": 22.9791259765625, "learning_rate": 8e-05, "loss": 29.5506, "num_input_tokens_seen": 404848704, "step": 7857 }, { "epoch": 0.7724625930566816, "grad_norm": 56.71049880981445, "learning_rate": 8e-05, "loss": 38.5634, "num_input_tokens_seen": 405007396, "step": 7860 }, { "epoch": 0.7727574261074667, "grad_norm": 40.992950439453125, "learning_rate": 8e-05, "loss": 37.3678, "num_input_tokens_seen": 405178424, "step": 7863 }, { "epoch": 0.7730522591582516, "grad_norm": 29.336580276489258, "learning_rate": 8e-05, "loss": 36.4169, "num_input_tokens_seen": 405340732, "step": 7866 }, { "epoch": 0.7733470922090366, "grad_norm": 24.261932373046875, "learning_rate": 8e-05, "loss": 36.5978, "num_input_tokens_seen": 405507356, "step": 7869 }, { "epoch": 0.7736419252598217, "grad_norm": 26.533891677856445, "learning_rate": 8e-05, "loss": 38.5256, "num_input_tokens_seen": 405644324, "step": 7872 }, { "epoch": 0.7739367583106066, "grad_norm": 26.40947723388672, "learning_rate": 8e-05, "loss": 38.562, "num_input_tokens_seen": 405801196, "step": 7875 }, { "epoch": 0.7742315913613916, "grad_norm": 23.67729377746582, "learning_rate": 8e-05, "loss": 34.4802, "num_input_tokens_seen": 405938068, "step": 7878 }, { "epoch": 0.7745264244121766, "grad_norm": 30.940881729125977, "learning_rate": 8e-05, "loss": 35.3962, "num_input_tokens_seen": 406090084, "step": 7881 }, { "epoch": 0.7748212574629616, "grad_norm": 38.28596115112305, "learning_rate": 8e-05, "loss": 37.0547, "num_input_tokens_seen": 406233952, "step": 7884 }, { "epoch": 0.7751160905137466, "grad_norm": 26.43083381652832, "learning_rate": 8e-05, "loss": 37.8771, "num_input_tokens_seen": 406400136, "step": 7887 }, { "epoch": 0.7754109235645316, "grad_norm": 50.69662094116211, "learning_rate": 8e-05, "loss": 34.8928, "num_input_tokens_seen": 406577860, "step": 7890 }, { "epoch": 0.7757057566153166, "grad_norm": 26.12557029724121, "learning_rate": 8e-05, "loss": 34.7082, "num_input_tokens_seen": 406727848, "step": 7893 }, { "epoch": 0.7760005896661015, "grad_norm": 26.89127540588379, "learning_rate": 8e-05, "loss": 35.4053, "num_input_tokens_seen": 406882000, "step": 7896 }, { "epoch": 0.7762954227168866, "grad_norm": 25.696537017822266, "learning_rate": 8e-05, "loss": 32.8174, "num_input_tokens_seen": 407037656, "step": 7899 }, { "epoch": 0.7765902557676716, "grad_norm": 21.538856506347656, "learning_rate": 8e-05, "loss": 33.3996, "num_input_tokens_seen": 407201080, "step": 7902 }, { "epoch": 0.7768850888184565, "grad_norm": 28.80377769470215, "learning_rate": 8e-05, "loss": 36.5668, "num_input_tokens_seen": 407377780, "step": 7905 }, { "epoch": 0.7771799218692416, "grad_norm": 30.080427169799805, "learning_rate": 8e-05, "loss": 37.3908, "num_input_tokens_seen": 407533544, "step": 7908 }, { "epoch": 0.7774747549200265, "grad_norm": 35.93389892578125, "learning_rate": 8e-05, "loss": 36.7938, "num_input_tokens_seen": 407698404, "step": 7911 }, { "epoch": 0.7777695879708115, "grad_norm": 26.772830963134766, "learning_rate": 8e-05, "loss": 35.0283, "num_input_tokens_seen": 407853008, "step": 7914 }, { "epoch": 0.7780644210215966, "grad_norm": 32.96590042114258, "learning_rate": 8e-05, "loss": 36.3987, "num_input_tokens_seen": 408013784, "step": 7917 }, { "epoch": 0.7783592540723815, "grad_norm": 26.807809829711914, "learning_rate": 8e-05, "loss": 37.276, "num_input_tokens_seen": 408172336, "step": 7920 }, { "epoch": 0.7786540871231665, "grad_norm": 25.73488998413086, "learning_rate": 8e-05, "loss": 34.6074, "num_input_tokens_seen": 408320856, "step": 7923 }, { "epoch": 0.7789489201739515, "grad_norm": 23.03981590270996, "learning_rate": 8e-05, "loss": 39.2311, "num_input_tokens_seen": 408461160, "step": 7926 }, { "epoch": 0.7792437532247365, "grad_norm": 24.349489212036133, "learning_rate": 8e-05, "loss": 36.4847, "num_input_tokens_seen": 408610200, "step": 7929 }, { "epoch": 0.7795385862755215, "grad_norm": 25.687152862548828, "learning_rate": 8e-05, "loss": 37.7461, "num_input_tokens_seen": 408765556, "step": 7932 }, { "epoch": 0.7798334193263065, "grad_norm": 25.157957077026367, "learning_rate": 8e-05, "loss": 38.5068, "num_input_tokens_seen": 408937720, "step": 7935 }, { "epoch": 0.7801282523770915, "grad_norm": 21.769214630126953, "learning_rate": 8e-05, "loss": 36.2921, "num_input_tokens_seen": 409109784, "step": 7938 }, { "epoch": 0.7804230854278764, "grad_norm": 20.435977935791016, "learning_rate": 8e-05, "loss": 35.1186, "num_input_tokens_seen": 409279036, "step": 7941 }, { "epoch": 0.7807179184786615, "grad_norm": 25.90553092956543, "learning_rate": 8e-05, "loss": 36.176, "num_input_tokens_seen": 409425764, "step": 7944 }, { "epoch": 0.7810127515294465, "grad_norm": 23.491077423095703, "learning_rate": 8e-05, "loss": 38.1959, "num_input_tokens_seen": 409588992, "step": 7947 }, { "epoch": 0.7813075845802314, "grad_norm": 24.66498565673828, "learning_rate": 8e-05, "loss": 38.0055, "num_input_tokens_seen": 409767288, "step": 7950 }, { "epoch": 0.7816024176310165, "grad_norm": 27.49262046813965, "learning_rate": 8e-05, "loss": 35.4031, "num_input_tokens_seen": 409910484, "step": 7953 }, { "epoch": 0.7818972506818014, "grad_norm": 21.672725677490234, "learning_rate": 8e-05, "loss": 34.6222, "num_input_tokens_seen": 410071500, "step": 7956 }, { "epoch": 0.7821920837325864, "grad_norm": 25.07799530029297, "learning_rate": 8e-05, "loss": 33.0897, "num_input_tokens_seen": 410234160, "step": 7959 }, { "epoch": 0.7824869167833715, "grad_norm": 24.90636444091797, "learning_rate": 8e-05, "loss": 38.1614, "num_input_tokens_seen": 410373068, "step": 7962 }, { "epoch": 0.7827817498341564, "grad_norm": 20.44605827331543, "learning_rate": 8e-05, "loss": 32.2739, "num_input_tokens_seen": 410509172, "step": 7965 }, { "epoch": 0.7830765828849414, "grad_norm": 26.52505111694336, "learning_rate": 8e-05, "loss": 36.0589, "num_input_tokens_seen": 410680432, "step": 7968 }, { "epoch": 0.7833714159357263, "grad_norm": 56.93138122558594, "learning_rate": 8e-05, "loss": 32.2403, "num_input_tokens_seen": 410834396, "step": 7971 }, { "epoch": 0.7836662489865114, "grad_norm": 24.301488876342773, "learning_rate": 8e-05, "loss": 35.1784, "num_input_tokens_seen": 410997416, "step": 7974 }, { "epoch": 0.7839610820372964, "grad_norm": 22.580333709716797, "learning_rate": 8e-05, "loss": 31.316, "num_input_tokens_seen": 411129556, "step": 7977 }, { "epoch": 0.7842559150880813, "grad_norm": 20.43970489501953, "learning_rate": 8e-05, "loss": 37.1739, "num_input_tokens_seen": 411311424, "step": 7980 }, { "epoch": 0.7845507481388664, "grad_norm": 42.44162368774414, "learning_rate": 8e-05, "loss": 37.3535, "num_input_tokens_seen": 411484096, "step": 7983 }, { "epoch": 0.7848455811896513, "grad_norm": 26.987335205078125, "learning_rate": 8e-05, "loss": 39.5258, "num_input_tokens_seen": 411633808, "step": 7986 }, { "epoch": 0.7851404142404363, "grad_norm": 20.923507690429688, "learning_rate": 8e-05, "loss": 34.7262, "num_input_tokens_seen": 411790656, "step": 7989 }, { "epoch": 0.7854352472912214, "grad_norm": 21.965740203857422, "learning_rate": 8e-05, "loss": 37.8557, "num_input_tokens_seen": 411961344, "step": 7992 }, { "epoch": 0.7857300803420063, "grad_norm": 23.653724670410156, "learning_rate": 8e-05, "loss": 35.9979, "num_input_tokens_seen": 412100232, "step": 7995 }, { "epoch": 0.7860249133927913, "grad_norm": 20.85638427734375, "learning_rate": 8e-05, "loss": 36.1657, "num_input_tokens_seen": 412259464, "step": 7998 }, { "epoch": 0.7862214687599813, "eval_gen_len": 35.32, "eval_loss": 2.3185036182403564, "eval_rouge1": 46.647, "eval_rouge2": 29.8361, "eval_rougeL": 42.7361, "eval_rougeLsum": 43.0175, "eval_runtime": 104.6242, "eval_samples_per_second": 1.912, "eval_steps_per_second": 0.478, "num_input_tokens_seen": 412353688, "step": 8000 }, { "epoch": 0.7863197464435763, "grad_norm": 22.64118194580078, "learning_rate": 8e-05, "loss": 36.9061, "num_input_tokens_seen": 412401800, "step": 8001 }, { "epoch": 0.7866145794943613, "grad_norm": 24.01075553894043, "learning_rate": 8e-05, "loss": 37.7225, "num_input_tokens_seen": 412571928, "step": 8004 }, { "epoch": 0.7869094125451463, "grad_norm": 24.196077346801758, "learning_rate": 8e-05, "loss": 36.2338, "num_input_tokens_seen": 412716236, "step": 8007 }, { "epoch": 0.7872042455959313, "grad_norm": 23.942615509033203, "learning_rate": 8e-05, "loss": 36.0792, "num_input_tokens_seen": 412874488, "step": 8010 }, { "epoch": 0.7874990786467163, "grad_norm": 23.444011688232422, "learning_rate": 8e-05, "loss": 34.9672, "num_input_tokens_seen": 413001512, "step": 8013 }, { "epoch": 0.7877939116975012, "grad_norm": 24.958213806152344, "learning_rate": 8e-05, "loss": 32.6181, "num_input_tokens_seen": 413153004, "step": 8016 }, { "epoch": 0.7880887447482863, "grad_norm": 29.166627883911133, "learning_rate": 8e-05, "loss": 35.5564, "num_input_tokens_seen": 413308188, "step": 8019 }, { "epoch": 0.7883835777990713, "grad_norm": 26.949697494506836, "learning_rate": 8e-05, "loss": 36.4201, "num_input_tokens_seen": 413440888, "step": 8022 }, { "epoch": 0.7886784108498562, "grad_norm": 41.7303352355957, "learning_rate": 8e-05, "loss": 31.4589, "num_input_tokens_seen": 413578524, "step": 8025 }, { "epoch": 0.7889732439006413, "grad_norm": 37.500732421875, "learning_rate": 8e-05, "loss": 35.4136, "num_input_tokens_seen": 413735264, "step": 8028 }, { "epoch": 0.7892680769514262, "grad_norm": 33.136962890625, "learning_rate": 8e-05, "loss": 35.4613, "num_input_tokens_seen": 413891816, "step": 8031 }, { "epoch": 0.7895629100022112, "grad_norm": 20.599218368530273, "learning_rate": 8e-05, "loss": 36.9574, "num_input_tokens_seen": 414043084, "step": 8034 }, { "epoch": 0.7898577430529963, "grad_norm": 21.741914749145508, "learning_rate": 8e-05, "loss": 32.622, "num_input_tokens_seen": 414202624, "step": 8037 }, { "epoch": 0.7901525761037812, "grad_norm": 20.87574005126953, "learning_rate": 8e-05, "loss": 36.8067, "num_input_tokens_seen": 414367648, "step": 8040 }, { "epoch": 0.7904474091545662, "grad_norm": 23.262584686279297, "learning_rate": 8e-05, "loss": 38.4502, "num_input_tokens_seen": 414525216, "step": 8043 }, { "epoch": 0.7907422422053512, "grad_norm": 25.928823471069336, "learning_rate": 8e-05, "loss": 40.3445, "num_input_tokens_seen": 414672264, "step": 8046 }, { "epoch": 0.7910370752561362, "grad_norm": 24.576786041259766, "learning_rate": 8e-05, "loss": 36.1848, "num_input_tokens_seen": 414833988, "step": 8049 }, { "epoch": 0.7913319083069212, "grad_norm": 60.29381561279297, "learning_rate": 8e-05, "loss": 38.3726, "num_input_tokens_seen": 414964680, "step": 8052 }, { "epoch": 0.7916267413577062, "grad_norm": 21.849842071533203, "learning_rate": 8e-05, "loss": 34.8281, "num_input_tokens_seen": 415133388, "step": 8055 }, { "epoch": 0.7919215744084912, "grad_norm": 30.273502349853516, "learning_rate": 8e-05, "loss": 39.4244, "num_input_tokens_seen": 415283668, "step": 8058 }, { "epoch": 0.7922164074592762, "grad_norm": 24.501184463500977, "learning_rate": 8e-05, "loss": 37.741, "num_input_tokens_seen": 415433676, "step": 8061 }, { "epoch": 0.7925112405100612, "grad_norm": 24.874170303344727, "learning_rate": 8e-05, "loss": 34.0968, "num_input_tokens_seen": 415591080, "step": 8064 }, { "epoch": 0.7928060735608462, "grad_norm": 31.724990844726562, "learning_rate": 8e-05, "loss": 35.3581, "num_input_tokens_seen": 415754600, "step": 8067 }, { "epoch": 0.7931009066116311, "grad_norm": 28.470970153808594, "learning_rate": 8e-05, "loss": 34.9877, "num_input_tokens_seen": 415916000, "step": 8070 }, { "epoch": 0.7933957396624162, "grad_norm": 28.722963333129883, "learning_rate": 8e-05, "loss": 36.3972, "num_input_tokens_seen": 416085508, "step": 8073 }, { "epoch": 0.7936905727132012, "grad_norm": 30.28119659423828, "learning_rate": 8e-05, "loss": 38.09, "num_input_tokens_seen": 416220544, "step": 8076 }, { "epoch": 0.7939854057639861, "grad_norm": 27.0206241607666, "learning_rate": 8e-05, "loss": 41.4609, "num_input_tokens_seen": 416375272, "step": 8079 }, { "epoch": 0.7942802388147712, "grad_norm": 23.414587020874023, "learning_rate": 8e-05, "loss": 35.2382, "num_input_tokens_seen": 416544904, "step": 8082 }, { "epoch": 0.7945750718655561, "grad_norm": 24.774656295776367, "learning_rate": 8e-05, "loss": 36.4614, "num_input_tokens_seen": 416707732, "step": 8085 }, { "epoch": 0.7948699049163411, "grad_norm": 26.332231521606445, "learning_rate": 8e-05, "loss": 34.6727, "num_input_tokens_seen": 416846512, "step": 8088 }, { "epoch": 0.7951647379671262, "grad_norm": 23.728219985961914, "learning_rate": 8e-05, "loss": 38.4909, "num_input_tokens_seen": 417024300, "step": 8091 }, { "epoch": 0.7954595710179111, "grad_norm": 22.694969177246094, "learning_rate": 8e-05, "loss": 33.3829, "num_input_tokens_seen": 417169300, "step": 8094 }, { "epoch": 0.7957544040686961, "grad_norm": 21.721418380737305, "learning_rate": 8e-05, "loss": 37.5297, "num_input_tokens_seen": 417310940, "step": 8097 }, { "epoch": 0.7960492371194811, "grad_norm": 25.580392837524414, "learning_rate": 8e-05, "loss": 35.933, "num_input_tokens_seen": 417492312, "step": 8100 }, { "epoch": 0.7963440701702661, "grad_norm": 24.695316314697266, "learning_rate": 8e-05, "loss": 39.1894, "num_input_tokens_seen": 417636016, "step": 8103 }, { "epoch": 0.7966389032210511, "grad_norm": 27.233322143554688, "learning_rate": 8e-05, "loss": 36.8955, "num_input_tokens_seen": 417801812, "step": 8106 }, { "epoch": 0.7969337362718361, "grad_norm": 24.737655639648438, "learning_rate": 8e-05, "loss": 33.5013, "num_input_tokens_seen": 417945108, "step": 8109 }, { "epoch": 0.7972285693226211, "grad_norm": 26.381988525390625, "learning_rate": 8e-05, "loss": 37.7245, "num_input_tokens_seen": 418084320, "step": 8112 }, { "epoch": 0.797523402373406, "grad_norm": 20.971017837524414, "learning_rate": 8e-05, "loss": 32.9355, "num_input_tokens_seen": 418243772, "step": 8115 }, { "epoch": 0.7978182354241911, "grad_norm": 56.757423400878906, "learning_rate": 8e-05, "loss": 33.5788, "num_input_tokens_seen": 418376468, "step": 8118 }, { "epoch": 0.7981130684749761, "grad_norm": 23.269350051879883, "learning_rate": 8e-05, "loss": 33.1199, "num_input_tokens_seen": 418523632, "step": 8121 }, { "epoch": 0.798407901525761, "grad_norm": 27.43719482421875, "learning_rate": 8e-05, "loss": 37.3559, "num_input_tokens_seen": 418683152, "step": 8124 }, { "epoch": 0.7987027345765461, "grad_norm": 22.803434371948242, "learning_rate": 8e-05, "loss": 36.1157, "num_input_tokens_seen": 418843020, "step": 8127 }, { "epoch": 0.798997567627331, "grad_norm": 21.03816032409668, "learning_rate": 8e-05, "loss": 34.2601, "num_input_tokens_seen": 419014256, "step": 8130 }, { "epoch": 0.799292400678116, "grad_norm": 71.1124038696289, "learning_rate": 8e-05, "loss": 35.6755, "num_input_tokens_seen": 419153168, "step": 8133 }, { "epoch": 0.7995872337289011, "grad_norm": 21.754837036132812, "learning_rate": 8e-05, "loss": 33.553, "num_input_tokens_seen": 419317532, "step": 8136 }, { "epoch": 0.799882066779686, "grad_norm": 23.042221069335938, "learning_rate": 8e-05, "loss": 33.131, "num_input_tokens_seen": 419481224, "step": 8139 }, { "epoch": 0.800176899830471, "grad_norm": 24.852903366088867, "learning_rate": 8e-05, "loss": 32.4917, "num_input_tokens_seen": 419639672, "step": 8142 }, { "epoch": 0.800471732881256, "grad_norm": 48.62297058105469, "learning_rate": 8e-05, "loss": 33.921, "num_input_tokens_seen": 419791948, "step": 8145 }, { "epoch": 0.800766565932041, "grad_norm": 26.25032615661621, "learning_rate": 8e-05, "loss": 32.9486, "num_input_tokens_seen": 419946984, "step": 8148 }, { "epoch": 0.801061398982826, "grad_norm": 49.425872802734375, "learning_rate": 8e-05, "loss": 33.7382, "num_input_tokens_seen": 420080392, "step": 8151 }, { "epoch": 0.801356232033611, "grad_norm": 22.22492790222168, "learning_rate": 8e-05, "loss": 31.2606, "num_input_tokens_seen": 420221328, "step": 8154 }, { "epoch": 0.801651065084396, "grad_norm": 23.21967315673828, "learning_rate": 8e-05, "loss": 35.8521, "num_input_tokens_seen": 420374892, "step": 8157 }, { "epoch": 0.8019458981351809, "grad_norm": 21.875211715698242, "learning_rate": 8e-05, "loss": 37.2863, "num_input_tokens_seen": 420528800, "step": 8160 }, { "epoch": 0.802240731185966, "grad_norm": 23.19055938720703, "learning_rate": 8e-05, "loss": 39.5513, "num_input_tokens_seen": 420685724, "step": 8163 }, { "epoch": 0.802535564236751, "grad_norm": 53.719539642333984, "learning_rate": 8e-05, "loss": 37.4689, "num_input_tokens_seen": 420843844, "step": 8166 }, { "epoch": 0.8028303972875359, "grad_norm": 23.912782669067383, "learning_rate": 8e-05, "loss": 32.1369, "num_input_tokens_seen": 421008060, "step": 8169 }, { "epoch": 0.803125230338321, "grad_norm": 45.83617401123047, "learning_rate": 8e-05, "loss": 34.3147, "num_input_tokens_seen": 421145352, "step": 8172 }, { "epoch": 0.8034200633891059, "grad_norm": 37.71995544433594, "learning_rate": 8e-05, "loss": 35.0719, "num_input_tokens_seen": 421296184, "step": 8175 }, { "epoch": 0.8037148964398909, "grad_norm": 24.80175018310547, "learning_rate": 8e-05, "loss": 35.9003, "num_input_tokens_seen": 421432260, "step": 8178 }, { "epoch": 0.804009729490676, "grad_norm": 18.523286819458008, "learning_rate": 8e-05, "loss": 33.581, "num_input_tokens_seen": 421601412, "step": 8181 }, { "epoch": 0.8043045625414609, "grad_norm": 26.430187225341797, "learning_rate": 8e-05, "loss": 39.4251, "num_input_tokens_seen": 421768668, "step": 8184 }, { "epoch": 0.8045993955922459, "grad_norm": 19.468984603881836, "learning_rate": 8e-05, "loss": 35.396, "num_input_tokens_seen": 421925492, "step": 8187 }, { "epoch": 0.8048942286430308, "grad_norm": 24.663354873657227, "learning_rate": 8e-05, "loss": 36.6025, "num_input_tokens_seen": 422102116, "step": 8190 }, { "epoch": 0.8051890616938159, "grad_norm": 24.94499397277832, "learning_rate": 8e-05, "loss": 33.9554, "num_input_tokens_seen": 422254632, "step": 8193 }, { "epoch": 0.8054838947446009, "grad_norm": 27.692119598388672, "learning_rate": 8e-05, "loss": 36.4076, "num_input_tokens_seen": 422399280, "step": 8196 }, { "epoch": 0.8057787277953858, "grad_norm": 24.469003677368164, "learning_rate": 8e-05, "loss": 36.6231, "num_input_tokens_seen": 422564724, "step": 8199 }, { "epoch": 0.8060735608461709, "grad_norm": 24.645511627197266, "learning_rate": 8e-05, "loss": 33.4182, "num_input_tokens_seen": 422693920, "step": 8202 }, { "epoch": 0.8063683938969558, "grad_norm": 24.154550552368164, "learning_rate": 8e-05, "loss": 37.1308, "num_input_tokens_seen": 422852320, "step": 8205 }, { "epoch": 0.8066632269477408, "grad_norm": 20.739830017089844, "learning_rate": 8e-05, "loss": 34.6883, "num_input_tokens_seen": 423036660, "step": 8208 }, { "epoch": 0.8069580599985259, "grad_norm": 22.852832794189453, "learning_rate": 8e-05, "loss": 35.2849, "num_input_tokens_seen": 423171956, "step": 8211 }, { "epoch": 0.8072528930493108, "grad_norm": 22.849538803100586, "learning_rate": 8e-05, "loss": 37.724, "num_input_tokens_seen": 423331536, "step": 8214 }, { "epoch": 0.8075477261000958, "grad_norm": 26.494213104248047, "learning_rate": 8e-05, "loss": 34.4807, "num_input_tokens_seen": 423458460, "step": 8217 }, { "epoch": 0.8078425591508808, "grad_norm": 25.496356964111328, "learning_rate": 8e-05, "loss": 38.5775, "num_input_tokens_seen": 423610528, "step": 8220 }, { "epoch": 0.8081373922016658, "grad_norm": 20.508310317993164, "learning_rate": 8e-05, "loss": 31.4715, "num_input_tokens_seen": 423747124, "step": 8223 }, { "epoch": 0.8084322252524508, "grad_norm": 216.43284606933594, "learning_rate": 8e-05, "loss": 31.5416, "num_input_tokens_seen": 423890224, "step": 8226 }, { "epoch": 0.8087270583032358, "grad_norm": 24.748380661010742, "learning_rate": 8e-05, "loss": 35.1916, "num_input_tokens_seen": 424043772, "step": 8229 }, { "epoch": 0.8090218913540208, "grad_norm": 57.81846618652344, "learning_rate": 8e-05, "loss": 33.6554, "num_input_tokens_seen": 424210108, "step": 8232 }, { "epoch": 0.8093167244048057, "grad_norm": 24.118955612182617, "learning_rate": 8e-05, "loss": 34.5261, "num_input_tokens_seen": 424371540, "step": 8235 }, { "epoch": 0.8096115574555908, "grad_norm": 27.515722274780273, "learning_rate": 8e-05, "loss": 33.6207, "num_input_tokens_seen": 424535776, "step": 8238 }, { "epoch": 0.8099063905063758, "grad_norm": 26.33366584777832, "learning_rate": 8e-05, "loss": 38.9131, "num_input_tokens_seen": 424710928, "step": 8241 }, { "epoch": 0.8102012235571607, "grad_norm": 23.612756729125977, "learning_rate": 8e-05, "loss": 35.7337, "num_input_tokens_seen": 424873100, "step": 8244 }, { "epoch": 0.8104960566079458, "grad_norm": 27.79807472229004, "learning_rate": 8e-05, "loss": 35.9495, "num_input_tokens_seen": 425019436, "step": 8247 }, { "epoch": 0.8107908896587307, "grad_norm": 27.202306747436523, "learning_rate": 8e-05, "loss": 35.9226, "num_input_tokens_seen": 425193320, "step": 8250 }, { "epoch": 0.8110857227095157, "grad_norm": 34.29397964477539, "learning_rate": 8e-05, "loss": 35.7965, "num_input_tokens_seen": 425337520, "step": 8253 }, { "epoch": 0.8113805557603008, "grad_norm": 24.603361129760742, "learning_rate": 8e-05, "loss": 35.9459, "num_input_tokens_seen": 425479424, "step": 8256 }, { "epoch": 0.8116753888110857, "grad_norm": 26.088085174560547, "learning_rate": 8e-05, "loss": 36.1599, "num_input_tokens_seen": 425633808, "step": 8259 }, { "epoch": 0.8119702218618707, "grad_norm": 24.88553810119629, "learning_rate": 8e-05, "loss": 36.8701, "num_input_tokens_seen": 425789476, "step": 8262 }, { "epoch": 0.8122650549126557, "grad_norm": 26.12474250793457, "learning_rate": 8e-05, "loss": 36.3213, "num_input_tokens_seen": 425921604, "step": 8265 }, { "epoch": 0.8125598879634407, "grad_norm": 24.976924896240234, "learning_rate": 8e-05, "loss": 38.2543, "num_input_tokens_seen": 426076664, "step": 8268 }, { "epoch": 0.8128547210142257, "grad_norm": 25.509859085083008, "learning_rate": 8e-05, "loss": 35.7021, "num_input_tokens_seen": 426225572, "step": 8271 }, { "epoch": 0.8131495540650107, "grad_norm": 23.46617317199707, "learning_rate": 8e-05, "loss": 36.6647, "num_input_tokens_seen": 426368208, "step": 8274 }, { "epoch": 0.8134443871157957, "grad_norm": 23.891382217407227, "learning_rate": 8e-05, "loss": 36.64, "num_input_tokens_seen": 426528880, "step": 8277 }, { "epoch": 0.8137392201665806, "grad_norm": 25.366939544677734, "learning_rate": 8e-05, "loss": 36.1003, "num_input_tokens_seen": 426708144, "step": 8280 }, { "epoch": 0.8140340532173657, "grad_norm": 24.68431854248047, "learning_rate": 8e-05, "loss": 36.2022, "num_input_tokens_seen": 426870136, "step": 8283 }, { "epoch": 0.8143288862681507, "grad_norm": 21.63142204284668, "learning_rate": 8e-05, "loss": 33.1946, "num_input_tokens_seen": 427021880, "step": 8286 }, { "epoch": 0.8146237193189356, "grad_norm": 20.929309844970703, "learning_rate": 8e-05, "loss": 36.1105, "num_input_tokens_seen": 427174376, "step": 8289 }, { "epoch": 0.8149185523697207, "grad_norm": 23.457090377807617, "learning_rate": 8e-05, "loss": 35.412, "num_input_tokens_seen": 427319372, "step": 8292 }, { "epoch": 0.8152133854205056, "grad_norm": 24.533313751220703, "learning_rate": 8e-05, "loss": 39.5555, "num_input_tokens_seen": 427484580, "step": 8295 }, { "epoch": 0.8155082184712906, "grad_norm": 19.934356689453125, "learning_rate": 8e-05, "loss": 33.2841, "num_input_tokens_seen": 427633100, "step": 8298 }, { "epoch": 0.8158030515220757, "grad_norm": 25.337459564208984, "learning_rate": 8e-05, "loss": 39.6166, "num_input_tokens_seen": 427768212, "step": 8301 }, { "epoch": 0.8160978845728606, "grad_norm": 24.47291374206543, "learning_rate": 8e-05, "loss": 36.4094, "num_input_tokens_seen": 427922476, "step": 8304 }, { "epoch": 0.8163927176236456, "grad_norm": 21.60689353942871, "learning_rate": 8e-05, "loss": 34.1587, "num_input_tokens_seen": 428073292, "step": 8307 }, { "epoch": 0.8166875506744306, "grad_norm": 25.36764907836914, "learning_rate": 8e-05, "loss": 38.1677, "num_input_tokens_seen": 428248860, "step": 8310 }, { "epoch": 0.8169823837252156, "grad_norm": 27.005537033081055, "learning_rate": 8e-05, "loss": 38.2046, "num_input_tokens_seen": 428395584, "step": 8313 }, { "epoch": 0.8172772167760006, "grad_norm": 21.089380264282227, "learning_rate": 8e-05, "loss": 32.8911, "num_input_tokens_seen": 428549272, "step": 8316 }, { "epoch": 0.8175720498267856, "grad_norm": 24.877689361572266, "learning_rate": 8e-05, "loss": 35.3285, "num_input_tokens_seen": 428698504, "step": 8319 }, { "epoch": 0.8178668828775706, "grad_norm": 29.171049118041992, "learning_rate": 8e-05, "loss": 34.9713, "num_input_tokens_seen": 428851904, "step": 8322 }, { "epoch": 0.8181617159283555, "grad_norm": 25.88189697265625, "learning_rate": 8e-05, "loss": 30.8571, "num_input_tokens_seen": 429018916, "step": 8325 }, { "epoch": 0.8184565489791406, "grad_norm": 23.980445861816406, "learning_rate": 8e-05, "loss": 37.0643, "num_input_tokens_seen": 429173112, "step": 8328 }, { "epoch": 0.8187513820299256, "grad_norm": 21.506813049316406, "learning_rate": 8e-05, "loss": 37.2138, "num_input_tokens_seen": 429331240, "step": 8331 }, { "epoch": 0.8190462150807105, "grad_norm": 20.923595428466797, "learning_rate": 8e-05, "loss": 33.7857, "num_input_tokens_seen": 429477164, "step": 8334 }, { "epoch": 0.8193410481314956, "grad_norm": 24.962444305419922, "learning_rate": 8e-05, "loss": 36.0604, "num_input_tokens_seen": 429642332, "step": 8337 }, { "epoch": 0.8196358811822805, "grad_norm": 21.74043846130371, "learning_rate": 8e-05, "loss": 41.008, "num_input_tokens_seen": 429789980, "step": 8340 }, { "epoch": 0.8199307142330655, "grad_norm": 33.76387405395508, "learning_rate": 8e-05, "loss": 35.5676, "num_input_tokens_seen": 429946564, "step": 8343 }, { "epoch": 0.8202255472838506, "grad_norm": 19.511245727539062, "learning_rate": 8e-05, "loss": 34.3012, "num_input_tokens_seen": 430082288, "step": 8346 }, { "epoch": 0.8205203803346355, "grad_norm": 25.11481285095215, "learning_rate": 8e-05, "loss": 35.0994, "num_input_tokens_seen": 430241712, "step": 8349 }, { "epoch": 0.8208152133854205, "grad_norm": 28.375946044921875, "learning_rate": 8e-05, "loss": 35.1421, "num_input_tokens_seen": 430369624, "step": 8352 }, { "epoch": 0.8211100464362056, "grad_norm": 23.08304214477539, "learning_rate": 8e-05, "loss": 36.7995, "num_input_tokens_seen": 430523416, "step": 8355 }, { "epoch": 0.8214048794869905, "grad_norm": 21.1984806060791, "learning_rate": 8e-05, "loss": 35.5492, "num_input_tokens_seen": 430676216, "step": 8358 }, { "epoch": 0.8216997125377755, "grad_norm": 25.081396102905273, "learning_rate": 8e-05, "loss": 33.4143, "num_input_tokens_seen": 430813496, "step": 8361 }, { "epoch": 0.8219945455885604, "grad_norm": 22.87725257873535, "learning_rate": 8e-05, "loss": 35.4886, "num_input_tokens_seen": 430970352, "step": 8364 }, { "epoch": 0.8222893786393455, "grad_norm": 22.641258239746094, "learning_rate": 8e-05, "loss": 36.1933, "num_input_tokens_seen": 431131588, "step": 8367 }, { "epoch": 0.8225842116901305, "grad_norm": 20.716995239257812, "learning_rate": 8e-05, "loss": 33.6509, "num_input_tokens_seen": 431276432, "step": 8370 }, { "epoch": 0.8228790447409154, "grad_norm": 41.984832763671875, "learning_rate": 8e-05, "loss": 33.6686, "num_input_tokens_seen": 431435588, "step": 8373 }, { "epoch": 0.8231738777917005, "grad_norm": 23.140209197998047, "learning_rate": 8e-05, "loss": 37.4149, "num_input_tokens_seen": 431600448, "step": 8376 }, { "epoch": 0.8234687108424854, "grad_norm": 30.463956832885742, "learning_rate": 8e-05, "loss": 36.6518, "num_input_tokens_seen": 431745244, "step": 8379 }, { "epoch": 0.8237635438932704, "grad_norm": 27.39523696899414, "learning_rate": 8e-05, "loss": 37.3414, "num_input_tokens_seen": 431892992, "step": 8382 }, { "epoch": 0.8240583769440555, "grad_norm": 22.793724060058594, "learning_rate": 8e-05, "loss": 37.5179, "num_input_tokens_seen": 432059272, "step": 8385 }, { "epoch": 0.8243532099948404, "grad_norm": 19.464582443237305, "learning_rate": 8e-05, "loss": 33.5467, "num_input_tokens_seen": 432208184, "step": 8388 }, { "epoch": 0.8246480430456254, "grad_norm": 21.68764305114746, "learning_rate": 8e-05, "loss": 34.8924, "num_input_tokens_seen": 432345912, "step": 8391 }, { "epoch": 0.8249428760964104, "grad_norm": 22.38986587524414, "learning_rate": 8e-05, "loss": 32.9228, "num_input_tokens_seen": 432490172, "step": 8394 }, { "epoch": 0.8252377091471954, "grad_norm": 25.4932861328125, "learning_rate": 8e-05, "loss": 37.1201, "num_input_tokens_seen": 432661932, "step": 8397 }, { "epoch": 0.8255325421979804, "grad_norm": 24.564067840576172, "learning_rate": 8e-05, "loss": 37.7674, "num_input_tokens_seen": 432830700, "step": 8400 }, { "epoch": 0.8258273752487654, "grad_norm": 23.00069808959961, "learning_rate": 8e-05, "loss": 36.0192, "num_input_tokens_seen": 432990432, "step": 8403 }, { "epoch": 0.8261222082995504, "grad_norm": 23.691499710083008, "learning_rate": 8e-05, "loss": 36.5851, "num_input_tokens_seen": 433140292, "step": 8406 }, { "epoch": 0.8264170413503353, "grad_norm": 23.410005569458008, "learning_rate": 8e-05, "loss": 34.4295, "num_input_tokens_seen": 433277012, "step": 8409 }, { "epoch": 0.8267118744011204, "grad_norm": 23.00798988342285, "learning_rate": 8e-05, "loss": 35.6506, "num_input_tokens_seen": 433426164, "step": 8412 }, { "epoch": 0.8270067074519054, "grad_norm": 21.08866310119629, "learning_rate": 8e-05, "loss": 32.9966, "num_input_tokens_seen": 433586000, "step": 8415 }, { "epoch": 0.8273015405026903, "grad_norm": 24.12877655029297, "learning_rate": 8e-05, "loss": 37.5246, "num_input_tokens_seen": 433742360, "step": 8418 }, { "epoch": 0.8275963735534754, "grad_norm": 23.096635818481445, "learning_rate": 8e-05, "loss": 35.271, "num_input_tokens_seen": 433883892, "step": 8421 }, { "epoch": 0.8278912066042603, "grad_norm": 23.10941505432129, "learning_rate": 8e-05, "loss": 33.7624, "num_input_tokens_seen": 434026752, "step": 8424 }, { "epoch": 0.8281860396550453, "grad_norm": 25.72688102722168, "learning_rate": 8e-05, "loss": 37.7114, "num_input_tokens_seen": 434179836, "step": 8427 }, { "epoch": 0.8284808727058304, "grad_norm": 23.44504737854004, "learning_rate": 8e-05, "loss": 37.4356, "num_input_tokens_seen": 434342732, "step": 8430 }, { "epoch": 0.8287757057566153, "grad_norm": 23.88396644592285, "learning_rate": 8e-05, "loss": 34.6538, "num_input_tokens_seen": 434483484, "step": 8433 }, { "epoch": 0.8290705388074003, "grad_norm": 24.157764434814453, "learning_rate": 8e-05, "loss": 35.9479, "num_input_tokens_seen": 434631000, "step": 8436 }, { "epoch": 0.8293653718581853, "grad_norm": 25.783918380737305, "learning_rate": 8e-05, "loss": 34.5447, "num_input_tokens_seen": 434777308, "step": 8439 }, { "epoch": 0.8296602049089703, "grad_norm": 22.558677673339844, "learning_rate": 8e-05, "loss": 34.3088, "num_input_tokens_seen": 434959584, "step": 8442 }, { "epoch": 0.8299550379597553, "grad_norm": 22.762332916259766, "learning_rate": 8e-05, "loss": 37.8718, "num_input_tokens_seen": 435115636, "step": 8445 }, { "epoch": 0.8302498710105403, "grad_norm": 21.599580764770508, "learning_rate": 8e-05, "loss": 33.6288, "num_input_tokens_seen": 435268344, "step": 8448 }, { "epoch": 0.8305447040613253, "grad_norm": 23.022197723388672, "learning_rate": 8e-05, "loss": 39.1464, "num_input_tokens_seen": 435408740, "step": 8451 }, { "epoch": 0.8308395371121102, "grad_norm": 21.746292114257812, "learning_rate": 8e-05, "loss": 34.105, "num_input_tokens_seen": 435567116, "step": 8454 }, { "epoch": 0.8311343701628953, "grad_norm": 28.59079360961914, "learning_rate": 8e-05, "loss": 36.243, "num_input_tokens_seen": 435724948, "step": 8457 }, { "epoch": 0.8314292032136803, "grad_norm": 25.476356506347656, "learning_rate": 8e-05, "loss": 37.6103, "num_input_tokens_seen": 435870588, "step": 8460 }, { "epoch": 0.8317240362644652, "grad_norm": 23.399967193603516, "learning_rate": 8e-05, "loss": 33.0977, "num_input_tokens_seen": 436014732, "step": 8463 }, { "epoch": 0.8320188693152503, "grad_norm": 29.340322494506836, "learning_rate": 8e-05, "loss": 35.883, "num_input_tokens_seen": 436163556, "step": 8466 }, { "epoch": 0.8323137023660352, "grad_norm": 25.349374771118164, "learning_rate": 8e-05, "loss": 38.2025, "num_input_tokens_seen": 436319124, "step": 8469 }, { "epoch": 0.8326085354168202, "grad_norm": 25.654415130615234, "learning_rate": 8e-05, "loss": 35.1885, "num_input_tokens_seen": 436452984, "step": 8472 }, { "epoch": 0.8329033684676053, "grad_norm": 34.60222625732422, "learning_rate": 8e-05, "loss": 34.6596, "num_input_tokens_seen": 436597484, "step": 8475 }, { "epoch": 0.8331982015183902, "grad_norm": 21.619489669799805, "learning_rate": 8e-05, "loss": 33.9803, "num_input_tokens_seen": 436738324, "step": 8478 }, { "epoch": 0.8334930345691752, "grad_norm": 27.300823211669922, "learning_rate": 8e-05, "loss": 37.2664, "num_input_tokens_seen": 436902392, "step": 8481 }, { "epoch": 0.8337878676199602, "grad_norm": 18.91798210144043, "learning_rate": 8e-05, "loss": 32.6294, "num_input_tokens_seen": 437055092, "step": 8484 }, { "epoch": 0.8340827006707452, "grad_norm": 22.521039962768555, "learning_rate": 8e-05, "loss": 38.8009, "num_input_tokens_seen": 437212152, "step": 8487 }, { "epoch": 0.8343775337215302, "grad_norm": 39.985023498535156, "learning_rate": 8e-05, "loss": 33.9871, "num_input_tokens_seen": 437373860, "step": 8490 }, { "epoch": 0.8346723667723152, "grad_norm": 23.792661666870117, "learning_rate": 8e-05, "loss": 41.1392, "num_input_tokens_seen": 437543544, "step": 8493 }, { "epoch": 0.8349671998231002, "grad_norm": 22.138065338134766, "learning_rate": 8e-05, "loss": 35.8017, "num_input_tokens_seen": 437715852, "step": 8496 }, { "epoch": 0.8352620328738851, "grad_norm": 21.686561584472656, "learning_rate": 8e-05, "loss": 34.2745, "num_input_tokens_seen": 437861004, "step": 8499 }, { "epoch": 0.8355568659246702, "grad_norm": 24.75675392150879, "learning_rate": 8e-05, "loss": 33.8077, "num_input_tokens_seen": 438041664, "step": 8502 }, { "epoch": 0.8358516989754552, "grad_norm": 27.919275283813477, "learning_rate": 8e-05, "loss": 35.537, "num_input_tokens_seen": 438220600, "step": 8505 }, { "epoch": 0.8361465320262401, "grad_norm": 25.631576538085938, "learning_rate": 8e-05, "loss": 35.2494, "num_input_tokens_seen": 438362040, "step": 8508 }, { "epoch": 0.8364413650770252, "grad_norm": 23.383817672729492, "learning_rate": 8e-05, "loss": 38.0361, "num_input_tokens_seen": 438518572, "step": 8511 }, { "epoch": 0.8367361981278101, "grad_norm": 28.416120529174805, "learning_rate": 8e-05, "loss": 37.6057, "num_input_tokens_seen": 438686732, "step": 8514 }, { "epoch": 0.8370310311785951, "grad_norm": 23.688934326171875, "learning_rate": 8e-05, "loss": 34.7264, "num_input_tokens_seen": 438825692, "step": 8517 }, { "epoch": 0.8373258642293802, "grad_norm": 23.2996883392334, "learning_rate": 8e-05, "loss": 32.613, "num_input_tokens_seen": 438966772, "step": 8520 }, { "epoch": 0.8376206972801651, "grad_norm": 23.35087776184082, "learning_rate": 8e-05, "loss": 35.096, "num_input_tokens_seen": 439151532, "step": 8523 }, { "epoch": 0.8379155303309501, "grad_norm": 24.346364974975586, "learning_rate": 8e-05, "loss": 37.9784, "num_input_tokens_seen": 439316152, "step": 8526 }, { "epoch": 0.838210363381735, "grad_norm": 24.31075668334961, "learning_rate": 8e-05, "loss": 39.3151, "num_input_tokens_seen": 439476964, "step": 8529 }, { "epoch": 0.8385051964325201, "grad_norm": 24.63071060180664, "learning_rate": 8e-05, "loss": 36.7107, "num_input_tokens_seen": 439631960, "step": 8532 }, { "epoch": 0.8388000294833051, "grad_norm": 24.59788703918457, "learning_rate": 8e-05, "loss": 35.1999, "num_input_tokens_seen": 439790576, "step": 8535 }, { "epoch": 0.83909486253409, "grad_norm": 23.76664924621582, "learning_rate": 8e-05, "loss": 36.6624, "num_input_tokens_seen": 439941332, "step": 8538 }, { "epoch": 0.8393896955848751, "grad_norm": 33.22909927368164, "learning_rate": 8e-05, "loss": 33.9235, "num_input_tokens_seen": 440082820, "step": 8541 }, { "epoch": 0.83968452863566, "grad_norm": 26.295854568481445, "learning_rate": 8e-05, "loss": 35.2236, "num_input_tokens_seen": 440242188, "step": 8544 }, { "epoch": 0.839979361686445, "grad_norm": 21.903047561645508, "learning_rate": 8e-05, "loss": 34.7781, "num_input_tokens_seen": 440408484, "step": 8547 }, { "epoch": 0.8402741947372301, "grad_norm": 28.9826717376709, "learning_rate": 8e-05, "loss": 36.0424, "num_input_tokens_seen": 440564132, "step": 8550 }, { "epoch": 0.840569027788015, "grad_norm": 26.4545955657959, "learning_rate": 8e-05, "loss": 33.648, "num_input_tokens_seen": 440725396, "step": 8553 }, { "epoch": 0.8408638608388, "grad_norm": 21.50971221923828, "learning_rate": 8e-05, "loss": 33.2473, "num_input_tokens_seen": 440871052, "step": 8556 }, { "epoch": 0.841158693889585, "grad_norm": 21.93988800048828, "learning_rate": 8e-05, "loss": 36.9516, "num_input_tokens_seen": 441011984, "step": 8559 }, { "epoch": 0.84145352694037, "grad_norm": 25.327268600463867, "learning_rate": 8e-05, "loss": 34.4141, "num_input_tokens_seen": 441164920, "step": 8562 }, { "epoch": 0.841748359991155, "grad_norm": 25.629833221435547, "learning_rate": 8e-05, "loss": 35.3238, "num_input_tokens_seen": 441309320, "step": 8565 }, { "epoch": 0.84204319304194, "grad_norm": 31.811267852783203, "learning_rate": 8e-05, "loss": 35.6597, "num_input_tokens_seen": 441475116, "step": 8568 }, { "epoch": 0.842338026092725, "grad_norm": 24.13558006286621, "learning_rate": 8e-05, "loss": 32.54, "num_input_tokens_seen": 441604948, "step": 8571 }, { "epoch": 0.8426328591435099, "grad_norm": 22.01540756225586, "learning_rate": 8e-05, "loss": 35.1774, "num_input_tokens_seen": 441776644, "step": 8574 }, { "epoch": 0.842927692194295, "grad_norm": 20.971839904785156, "learning_rate": 8e-05, "loss": 37.2688, "num_input_tokens_seen": 441924832, "step": 8577 }, { "epoch": 0.84322252524508, "grad_norm": 21.542495727539062, "learning_rate": 8e-05, "loss": 37.1687, "num_input_tokens_seen": 442075104, "step": 8580 }, { "epoch": 0.8435173582958649, "grad_norm": 26.712690353393555, "learning_rate": 8e-05, "loss": 33.4731, "num_input_tokens_seen": 442214348, "step": 8583 }, { "epoch": 0.84381219134665, "grad_norm": 24.981464385986328, "learning_rate": 8e-05, "loss": 35.7284, "num_input_tokens_seen": 442369648, "step": 8586 }, { "epoch": 0.8441070243974349, "grad_norm": 24.999849319458008, "learning_rate": 8e-05, "loss": 35.2039, "num_input_tokens_seen": 442540736, "step": 8589 }, { "epoch": 0.8444018574482199, "grad_norm": 24.549636840820312, "learning_rate": 8e-05, "loss": 35.5619, "num_input_tokens_seen": 442713112, "step": 8592 }, { "epoch": 0.844696690499005, "grad_norm": 25.977907180786133, "learning_rate": 8e-05, "loss": 37.0179, "num_input_tokens_seen": 442869764, "step": 8595 }, { "epoch": 0.8449915235497899, "grad_norm": 30.217334747314453, "learning_rate": 8e-05, "loss": 34.2794, "num_input_tokens_seen": 443017248, "step": 8598 }, { "epoch": 0.8452863566005749, "grad_norm": 21.913938522338867, "learning_rate": 8e-05, "loss": 37.9163, "num_input_tokens_seen": 443159376, "step": 8601 }, { "epoch": 0.8455811896513599, "grad_norm": 23.640243530273438, "learning_rate": 8e-05, "loss": 34.5981, "num_input_tokens_seen": 443288428, "step": 8604 }, { "epoch": 0.8458760227021449, "grad_norm": 20.208879470825195, "learning_rate": 8e-05, "loss": 33.8835, "num_input_tokens_seen": 443449712, "step": 8607 }, { "epoch": 0.8461708557529299, "grad_norm": 21.805574417114258, "learning_rate": 8e-05, "loss": 36.3991, "num_input_tokens_seen": 443612872, "step": 8610 }, { "epoch": 0.8464656888037149, "grad_norm": 22.304637908935547, "learning_rate": 8e-05, "loss": 38.1468, "num_input_tokens_seen": 443766068, "step": 8613 }, { "epoch": 0.8467605218544999, "grad_norm": 24.58745765686035, "learning_rate": 8e-05, "loss": 37.1603, "num_input_tokens_seen": 443937012, "step": 8616 }, { "epoch": 0.8470553549052848, "grad_norm": 24.43326187133789, "learning_rate": 8e-05, "loss": 34.2103, "num_input_tokens_seen": 444088396, "step": 8619 }, { "epoch": 0.8473501879560699, "grad_norm": 24.7650089263916, "learning_rate": 8e-05, "loss": 36.8356, "num_input_tokens_seen": 444250968, "step": 8622 }, { "epoch": 0.8476450210068549, "grad_norm": 20.300933837890625, "learning_rate": 8e-05, "loss": 35.2136, "num_input_tokens_seen": 444411484, "step": 8625 }, { "epoch": 0.8479398540576398, "grad_norm": 21.370935440063477, "learning_rate": 8e-05, "loss": 36.4811, "num_input_tokens_seen": 444578168, "step": 8628 }, { "epoch": 0.8482346871084249, "grad_norm": 24.828235626220703, "learning_rate": 8e-05, "loss": 36.1291, "num_input_tokens_seen": 444727816, "step": 8631 }, { "epoch": 0.8485295201592098, "grad_norm": 25.074390411376953, "learning_rate": 8e-05, "loss": 36.5011, "num_input_tokens_seen": 444885136, "step": 8634 }, { "epoch": 0.8488243532099948, "grad_norm": 24.60084342956543, "learning_rate": 8e-05, "loss": 40.491, "num_input_tokens_seen": 445047136, "step": 8637 }, { "epoch": 0.8491191862607799, "grad_norm": 19.854074478149414, "learning_rate": 8e-05, "loss": 36.0442, "num_input_tokens_seen": 445217352, "step": 8640 }, { "epoch": 0.8494140193115648, "grad_norm": 24.5972843170166, "learning_rate": 8e-05, "loss": 35.3802, "num_input_tokens_seen": 445358576, "step": 8643 }, { "epoch": 0.8497088523623498, "grad_norm": 24.3794002532959, "learning_rate": 8e-05, "loss": 34.9128, "num_input_tokens_seen": 445496724, "step": 8646 }, { "epoch": 0.8500036854131348, "grad_norm": 44.509586334228516, "learning_rate": 8e-05, "loss": 37.9704, "num_input_tokens_seen": 445634380, "step": 8649 }, { "epoch": 0.8502985184639198, "grad_norm": 22.681522369384766, "learning_rate": 8e-05, "loss": 38.7094, "num_input_tokens_seen": 445791312, "step": 8652 }, { "epoch": 0.8505933515147048, "grad_norm": 31.228179931640625, "learning_rate": 8e-05, "loss": 35.9362, "num_input_tokens_seen": 445939464, "step": 8655 }, { "epoch": 0.8508881845654898, "grad_norm": 26.90880584716797, "learning_rate": 8e-05, "loss": 36.8482, "num_input_tokens_seen": 446091944, "step": 8658 }, { "epoch": 0.8511830176162748, "grad_norm": 23.329309463500977, "learning_rate": 8e-05, "loss": 33.0599, "num_input_tokens_seen": 446275900, "step": 8661 }, { "epoch": 0.8514778506670598, "grad_norm": 25.813411712646484, "learning_rate": 8e-05, "loss": 37.8099, "num_input_tokens_seen": 446417860, "step": 8664 }, { "epoch": 0.8517726837178448, "grad_norm": 26.515968322753906, "learning_rate": 8e-05, "loss": 35.298, "num_input_tokens_seen": 446582760, "step": 8667 }, { "epoch": 0.8520675167686298, "grad_norm": 32.57109832763672, "learning_rate": 8e-05, "loss": 34.4064, "num_input_tokens_seen": 446744728, "step": 8670 }, { "epoch": 0.8523623498194147, "grad_norm": 21.738887786865234, "learning_rate": 8e-05, "loss": 29.7742, "num_input_tokens_seen": 446889608, "step": 8673 }, { "epoch": 0.8526571828701998, "grad_norm": 24.397785186767578, "learning_rate": 8e-05, "loss": 34.2036, "num_input_tokens_seen": 447058880, "step": 8676 }, { "epoch": 0.8529520159209848, "grad_norm": 25.896568298339844, "learning_rate": 8e-05, "loss": 36.5112, "num_input_tokens_seen": 447198284, "step": 8679 }, { "epoch": 0.8532468489717697, "grad_norm": 25.91545867919922, "learning_rate": 8e-05, "loss": 35.4539, "num_input_tokens_seen": 447326096, "step": 8682 }, { "epoch": 0.8535416820225548, "grad_norm": 235.16880798339844, "learning_rate": 8e-05, "loss": 30.8254, "num_input_tokens_seen": 447471968, "step": 8685 }, { "epoch": 0.8538365150733397, "grad_norm": 25.915328979492188, "learning_rate": 8e-05, "loss": 34.4214, "num_input_tokens_seen": 447621200, "step": 8688 }, { "epoch": 0.8541313481241247, "grad_norm": 22.61929702758789, "learning_rate": 8e-05, "loss": 33.0711, "num_input_tokens_seen": 447759144, "step": 8691 }, { "epoch": 0.8544261811749098, "grad_norm": 64.58021545410156, "learning_rate": 8e-05, "loss": 35.5584, "num_input_tokens_seen": 447912668, "step": 8694 }, { "epoch": 0.8547210142256947, "grad_norm": 23.018518447875977, "learning_rate": 8e-05, "loss": 36.516, "num_input_tokens_seen": 448061476, "step": 8697 }, { "epoch": 0.8550158472764797, "grad_norm": 26.483489990234375, "learning_rate": 8e-05, "loss": 37.8034, "num_input_tokens_seen": 448227060, "step": 8700 }, { "epoch": 0.8553106803272646, "grad_norm": 24.367496490478516, "learning_rate": 8e-05, "loss": 33.9807, "num_input_tokens_seen": 448373212, "step": 8703 }, { "epoch": 0.8556055133780497, "grad_norm": 23.358795166015625, "learning_rate": 8e-05, "loss": 33.6428, "num_input_tokens_seen": 448524656, "step": 8706 }, { "epoch": 0.8559003464288347, "grad_norm": 23.506254196166992, "learning_rate": 8e-05, "loss": 38.0909, "num_input_tokens_seen": 448690976, "step": 8709 }, { "epoch": 0.8561951794796197, "grad_norm": 22.54275131225586, "learning_rate": 8e-05, "loss": 35.3043, "num_input_tokens_seen": 448850068, "step": 8712 }, { "epoch": 0.8564900125304047, "grad_norm": 28.30064582824707, "learning_rate": 8e-05, "loss": 36.4259, "num_input_tokens_seen": 448999892, "step": 8715 }, { "epoch": 0.8567848455811896, "grad_norm": 23.732145309448242, "learning_rate": 8e-05, "loss": 36.0793, "num_input_tokens_seen": 449151824, "step": 8718 }, { "epoch": 0.8570796786319747, "grad_norm": 26.53993797302246, "learning_rate": 8e-05, "loss": 35.8181, "num_input_tokens_seen": 449326968, "step": 8721 }, { "epoch": 0.8573745116827597, "grad_norm": 21.584989547729492, "learning_rate": 8e-05, "loss": 33.9483, "num_input_tokens_seen": 449489480, "step": 8724 }, { "epoch": 0.8576693447335446, "grad_norm": 20.787649154663086, "learning_rate": 8e-05, "loss": 35.5196, "num_input_tokens_seen": 449643276, "step": 8727 }, { "epoch": 0.8579641777843297, "grad_norm": 36.44699478149414, "learning_rate": 8e-05, "loss": 35.5228, "num_input_tokens_seen": 449813676, "step": 8730 }, { "epoch": 0.8582590108351146, "grad_norm": 24.554771423339844, "learning_rate": 8e-05, "loss": 33.7362, "num_input_tokens_seen": 449952952, "step": 8733 }, { "epoch": 0.8585538438858996, "grad_norm": 21.766437530517578, "learning_rate": 8e-05, "loss": 36.0747, "num_input_tokens_seen": 450100052, "step": 8736 }, { "epoch": 0.8588486769366847, "grad_norm": 25.506664276123047, "learning_rate": 8e-05, "loss": 36.1313, "num_input_tokens_seen": 450234676, "step": 8739 }, { "epoch": 0.8591435099874696, "grad_norm": 20.974655151367188, "learning_rate": 8e-05, "loss": 36.4953, "num_input_tokens_seen": 450399724, "step": 8742 }, { "epoch": 0.8594383430382546, "grad_norm": 21.562509536743164, "learning_rate": 8e-05, "loss": 34.0653, "num_input_tokens_seen": 450567292, "step": 8745 }, { "epoch": 0.8597331760890395, "grad_norm": 27.16927146911621, "learning_rate": 8e-05, "loss": 32.6643, "num_input_tokens_seen": 450756072, "step": 8748 }, { "epoch": 0.8600280091398246, "grad_norm": 27.619579315185547, "learning_rate": 8e-05, "loss": 32.074, "num_input_tokens_seen": 450934544, "step": 8751 }, { "epoch": 0.8603228421906096, "grad_norm": 20.881961822509766, "learning_rate": 8e-05, "loss": 34.7699, "num_input_tokens_seen": 451095764, "step": 8754 }, { "epoch": 0.8606176752413945, "grad_norm": 20.835350036621094, "learning_rate": 8e-05, "loss": 33.1017, "num_input_tokens_seen": 451260184, "step": 8757 }, { "epoch": 0.8609125082921796, "grad_norm": 38.06818771362305, "learning_rate": 8e-05, "loss": 35.8208, "num_input_tokens_seen": 451409368, "step": 8760 }, { "epoch": 0.8612073413429645, "grad_norm": 28.952232360839844, "learning_rate": 8e-05, "loss": 33.0805, "num_input_tokens_seen": 451566212, "step": 8763 }, { "epoch": 0.8615021743937495, "grad_norm": 22.913902282714844, "learning_rate": 8e-05, "loss": 35.8183, "num_input_tokens_seen": 451722756, "step": 8766 }, { "epoch": 0.8617970074445346, "grad_norm": 21.72722625732422, "learning_rate": 8e-05, "loss": 30.1934, "num_input_tokens_seen": 451869548, "step": 8769 }, { "epoch": 0.8620918404953195, "grad_norm": 20.529747009277344, "learning_rate": 8e-05, "loss": 37.0564, "num_input_tokens_seen": 452039380, "step": 8772 }, { "epoch": 0.8623866735461045, "grad_norm": 25.314756393432617, "learning_rate": 8e-05, "loss": 35.3177, "num_input_tokens_seen": 452186872, "step": 8775 }, { "epoch": 0.8626815065968895, "grad_norm": 22.90513801574707, "learning_rate": 8e-05, "loss": 37.0339, "num_input_tokens_seen": 452336444, "step": 8778 }, { "epoch": 0.8629763396476745, "grad_norm": 23.360958099365234, "learning_rate": 8e-05, "loss": 34.8022, "num_input_tokens_seen": 452475436, "step": 8781 }, { "epoch": 0.8632711726984595, "grad_norm": 21.173948287963867, "learning_rate": 8e-05, "loss": 32.445, "num_input_tokens_seen": 452653208, "step": 8784 }, { "epoch": 0.8635660057492445, "grad_norm": 21.700834274291992, "learning_rate": 8e-05, "loss": 33.6184, "num_input_tokens_seen": 452806456, "step": 8787 }, { "epoch": 0.8638608388000295, "grad_norm": 22.173959732055664, "learning_rate": 8e-05, "loss": 34.0303, "num_input_tokens_seen": 452968584, "step": 8790 }, { "epoch": 0.8641556718508144, "grad_norm": 24.537803649902344, "learning_rate": 8e-05, "loss": 39.1815, "num_input_tokens_seen": 453119356, "step": 8793 }, { "epoch": 0.8644505049015995, "grad_norm": 20.4522762298584, "learning_rate": 8e-05, "loss": 35.7433, "num_input_tokens_seen": 453280312, "step": 8796 }, { "epoch": 0.8647453379523845, "grad_norm": 22.030925750732422, "learning_rate": 8e-05, "loss": 34.1829, "num_input_tokens_seen": 453433044, "step": 8799 }, { "epoch": 0.8650401710031694, "grad_norm": 22.265579223632812, "learning_rate": 8e-05, "loss": 32.7075, "num_input_tokens_seen": 453573752, "step": 8802 }, { "epoch": 0.8653350040539545, "grad_norm": 33.58341598510742, "learning_rate": 8e-05, "loss": 35.9601, "num_input_tokens_seen": 453732492, "step": 8805 }, { "epoch": 0.8656298371047394, "grad_norm": 20.91718292236328, "learning_rate": 8e-05, "loss": 36.4737, "num_input_tokens_seen": 453883240, "step": 8808 }, { "epoch": 0.8659246701555244, "grad_norm": 17.35066795349121, "learning_rate": 8e-05, "loss": 31.8058, "num_input_tokens_seen": 454042468, "step": 8811 }, { "epoch": 0.8662195032063095, "grad_norm": 23.744104385375977, "learning_rate": 8e-05, "loss": 34.8083, "num_input_tokens_seen": 454194980, "step": 8814 }, { "epoch": 0.8665143362570944, "grad_norm": 21.31110954284668, "learning_rate": 8e-05, "loss": 36.2735, "num_input_tokens_seen": 454358280, "step": 8817 }, { "epoch": 0.8668091693078794, "grad_norm": 31.886720657348633, "learning_rate": 8e-05, "loss": 34.488, "num_input_tokens_seen": 454501644, "step": 8820 }, { "epoch": 0.8671040023586644, "grad_norm": 26.252796173095703, "learning_rate": 8e-05, "loss": 34.3354, "num_input_tokens_seen": 454654120, "step": 8823 }, { "epoch": 0.8673988354094494, "grad_norm": 26.040729522705078, "learning_rate": 8e-05, "loss": 35.9982, "num_input_tokens_seen": 454844272, "step": 8826 }, { "epoch": 0.8676936684602344, "grad_norm": 23.30434226989746, "learning_rate": 8e-05, "loss": 34.3949, "num_input_tokens_seen": 454992640, "step": 8829 }, { "epoch": 0.8679885015110194, "grad_norm": 22.62255096435547, "learning_rate": 8e-05, "loss": 35.0844, "num_input_tokens_seen": 455155308, "step": 8832 }, { "epoch": 0.8682833345618044, "grad_norm": 20.258991241455078, "learning_rate": 8e-05, "loss": 34.8521, "num_input_tokens_seen": 455306464, "step": 8835 }, { "epoch": 0.8685781676125893, "grad_norm": 37.642086029052734, "learning_rate": 8e-05, "loss": 31.3964, "num_input_tokens_seen": 455458584, "step": 8838 }, { "epoch": 0.8688730006633744, "grad_norm": 20.638486862182617, "learning_rate": 8e-05, "loss": 35.4098, "num_input_tokens_seen": 455603556, "step": 8841 }, { "epoch": 0.8691678337141594, "grad_norm": 19.583791732788086, "learning_rate": 8e-05, "loss": 33.9662, "num_input_tokens_seen": 455750708, "step": 8844 }, { "epoch": 0.8694626667649443, "grad_norm": 22.0402774810791, "learning_rate": 8e-05, "loss": 39.1685, "num_input_tokens_seen": 455912140, "step": 8847 }, { "epoch": 0.8697574998157294, "grad_norm": 23.528413772583008, "learning_rate": 8e-05, "loss": 37.4517, "num_input_tokens_seen": 456084876, "step": 8850 }, { "epoch": 0.8700523328665143, "grad_norm": 22.013490676879883, "learning_rate": 8e-05, "loss": 34.666, "num_input_tokens_seen": 456246060, "step": 8853 }, { "epoch": 0.8703471659172993, "grad_norm": 22.675901412963867, "learning_rate": 8e-05, "loss": 34.4143, "num_input_tokens_seen": 456389620, "step": 8856 }, { "epoch": 0.8706419989680844, "grad_norm": 23.27752113342285, "learning_rate": 8e-05, "loss": 35.8503, "num_input_tokens_seen": 456567184, "step": 8859 }, { "epoch": 0.8709368320188693, "grad_norm": 20.808027267456055, "learning_rate": 8e-05, "loss": 36.6624, "num_input_tokens_seen": 456726572, "step": 8862 }, { "epoch": 0.8712316650696543, "grad_norm": 21.771181106567383, "learning_rate": 8e-05, "loss": 32.4634, "num_input_tokens_seen": 456871844, "step": 8865 }, { "epoch": 0.8715264981204393, "grad_norm": 28.904388427734375, "learning_rate": 8e-05, "loss": 33.9851, "num_input_tokens_seen": 457027984, "step": 8868 }, { "epoch": 0.8718213311712243, "grad_norm": 32.225040435791016, "learning_rate": 8e-05, "loss": 33.2702, "num_input_tokens_seen": 457181788, "step": 8871 }, { "epoch": 0.8721161642220093, "grad_norm": 22.55632209777832, "learning_rate": 8e-05, "loss": 35.1886, "num_input_tokens_seen": 457349848, "step": 8874 }, { "epoch": 0.8724109972727943, "grad_norm": 22.402708053588867, "learning_rate": 8e-05, "loss": 28.6777, "num_input_tokens_seen": 457518292, "step": 8877 }, { "epoch": 0.8727058303235793, "grad_norm": 21.531339645385742, "learning_rate": 8e-05, "loss": 32.3938, "num_input_tokens_seen": 457672928, "step": 8880 }, { "epoch": 0.8730006633743642, "grad_norm": 23.64670181274414, "learning_rate": 8e-05, "loss": 35.7909, "num_input_tokens_seen": 457831076, "step": 8883 }, { "epoch": 0.8732954964251493, "grad_norm": 27.580236434936523, "learning_rate": 8e-05, "loss": 35.1965, "num_input_tokens_seen": 457994408, "step": 8886 }, { "epoch": 0.8735903294759343, "grad_norm": 20.507062911987305, "learning_rate": 8e-05, "loss": 34.1822, "num_input_tokens_seen": 458163696, "step": 8889 }, { "epoch": 0.8738851625267192, "grad_norm": 22.15749168395996, "learning_rate": 8e-05, "loss": 33.8258, "num_input_tokens_seen": 458312932, "step": 8892 }, { "epoch": 0.8741799955775043, "grad_norm": 40.3120002746582, "learning_rate": 8e-05, "loss": 34.9081, "num_input_tokens_seen": 458438996, "step": 8895 }, { "epoch": 0.8744748286282892, "grad_norm": 25.363845825195312, "learning_rate": 8e-05, "loss": 35.5967, "num_input_tokens_seen": 458578764, "step": 8898 }, { "epoch": 0.8747696616790742, "grad_norm": 22.670642852783203, "learning_rate": 8e-05, "loss": 33.0508, "num_input_tokens_seen": 458726804, "step": 8901 }, { "epoch": 0.8750644947298593, "grad_norm": 24.797653198242188, "learning_rate": 8e-05, "loss": 32.97, "num_input_tokens_seen": 458882348, "step": 8904 }, { "epoch": 0.8753593277806442, "grad_norm": 26.080148696899414, "learning_rate": 8e-05, "loss": 34.2268, "num_input_tokens_seen": 459035532, "step": 8907 }, { "epoch": 0.8756541608314292, "grad_norm": 27.912677764892578, "learning_rate": 8e-05, "loss": 30.2107, "num_input_tokens_seen": 459189060, "step": 8910 }, { "epoch": 0.8759489938822141, "grad_norm": 23.80186653137207, "learning_rate": 8e-05, "loss": 36.9899, "num_input_tokens_seen": 459339772, "step": 8913 }, { "epoch": 0.8762438269329992, "grad_norm": 23.507909774780273, "learning_rate": 8e-05, "loss": 38.24, "num_input_tokens_seen": 459525644, "step": 8916 }, { "epoch": 0.8765386599837842, "grad_norm": 20.912324905395508, "learning_rate": 8e-05, "loss": 35.2431, "num_input_tokens_seen": 459672648, "step": 8919 }, { "epoch": 0.8768334930345691, "grad_norm": 23.98154640197754, "learning_rate": 8e-05, "loss": 34.0235, "num_input_tokens_seen": 459825628, "step": 8922 }, { "epoch": 0.8771283260853542, "grad_norm": 20.608129501342773, "learning_rate": 8e-05, "loss": 35.8222, "num_input_tokens_seen": 459977856, "step": 8925 }, { "epoch": 0.8774231591361391, "grad_norm": 27.58140754699707, "learning_rate": 8e-05, "loss": 34.5154, "num_input_tokens_seen": 460137076, "step": 8928 }, { "epoch": 0.8777179921869241, "grad_norm": 24.529319763183594, "learning_rate": 8e-05, "loss": 38.3434, "num_input_tokens_seen": 460300812, "step": 8931 }, { "epoch": 0.8780128252377092, "grad_norm": 26.84439468383789, "learning_rate": 8e-05, "loss": 31.479, "num_input_tokens_seen": 460439784, "step": 8934 }, { "epoch": 0.8783076582884941, "grad_norm": 22.796037673950195, "learning_rate": 8e-05, "loss": 36.2946, "num_input_tokens_seen": 460583252, "step": 8937 }, { "epoch": 0.8786024913392791, "grad_norm": 25.805952072143555, "learning_rate": 8e-05, "loss": 35.5886, "num_input_tokens_seen": 460727672, "step": 8940 }, { "epoch": 0.8788973243900641, "grad_norm": 20.5937557220459, "learning_rate": 8e-05, "loss": 36.3242, "num_input_tokens_seen": 460884440, "step": 8943 }, { "epoch": 0.8791921574408491, "grad_norm": 20.25699234008789, "learning_rate": 8e-05, "loss": 32.9865, "num_input_tokens_seen": 461038208, "step": 8946 }, { "epoch": 0.8794869904916341, "grad_norm": 25.006460189819336, "learning_rate": 8e-05, "loss": 37.2056, "num_input_tokens_seen": 461195296, "step": 8949 }, { "epoch": 0.8797818235424191, "grad_norm": 23.678054809570312, "learning_rate": 8e-05, "loss": 33.6716, "num_input_tokens_seen": 461355160, "step": 8952 }, { "epoch": 0.8800766565932041, "grad_norm": 24.38894271850586, "learning_rate": 8e-05, "loss": 39.0701, "num_input_tokens_seen": 461502236, "step": 8955 }, { "epoch": 0.880371489643989, "grad_norm": 22.873247146606445, "learning_rate": 8e-05, "loss": 36.7938, "num_input_tokens_seen": 461676032, "step": 8958 }, { "epoch": 0.8806663226947741, "grad_norm": 24.20355796813965, "learning_rate": 8e-05, "loss": 38.5804, "num_input_tokens_seen": 461823272, "step": 8961 }, { "epoch": 0.8809611557455591, "grad_norm": 23.963706970214844, "learning_rate": 8e-05, "loss": 34.3477, "num_input_tokens_seen": 461965960, "step": 8964 }, { "epoch": 0.881255988796344, "grad_norm": 21.74570655822754, "learning_rate": 8e-05, "loss": 38.7081, "num_input_tokens_seen": 462104796, "step": 8967 }, { "epoch": 0.8815508218471291, "grad_norm": 23.517004013061523, "learning_rate": 8e-05, "loss": 38.1686, "num_input_tokens_seen": 462271072, "step": 8970 }, { "epoch": 0.8818456548979141, "grad_norm": 23.73151206970215, "learning_rate": 8e-05, "loss": 29.5466, "num_input_tokens_seen": 462399608, "step": 8973 }, { "epoch": 0.882140487948699, "grad_norm": 27.569501876831055, "learning_rate": 8e-05, "loss": 38.4012, "num_input_tokens_seen": 462539888, "step": 8976 }, { "epoch": 0.8824353209994841, "grad_norm": 58.874732971191406, "learning_rate": 8e-05, "loss": 34.0895, "num_input_tokens_seen": 462694848, "step": 8979 }, { "epoch": 0.882730154050269, "grad_norm": 21.32411003112793, "learning_rate": 8e-05, "loss": 37.7922, "num_input_tokens_seen": 462872488, "step": 8982 }, { "epoch": 0.883024987101054, "grad_norm": 24.835783004760742, "learning_rate": 8e-05, "loss": 36.0279, "num_input_tokens_seen": 463015268, "step": 8985 }, { "epoch": 0.8833198201518391, "grad_norm": 34.98569869995117, "learning_rate": 8e-05, "loss": 38.1697, "num_input_tokens_seen": 463182416, "step": 8988 }, { "epoch": 0.883614653202624, "grad_norm": 20.140106201171875, "learning_rate": 8e-05, "loss": 33.6916, "num_input_tokens_seen": 463330240, "step": 8991 }, { "epoch": 0.883909486253409, "grad_norm": 22.520090103149414, "learning_rate": 8e-05, "loss": 35.2222, "num_input_tokens_seen": 463491564, "step": 8994 }, { "epoch": 0.884204319304194, "grad_norm": 22.722288131713867, "learning_rate": 8e-05, "loss": 38.6709, "num_input_tokens_seen": 463644744, "step": 8997 }, { "epoch": 0.884499152354979, "grad_norm": 19.819669723510742, "learning_rate": 8e-05, "loss": 33.1455, "num_input_tokens_seen": 463798308, "step": 9000 }, { "epoch": 0.884499152354979, "eval_gen_len": 36.3, "eval_loss": 2.2607827186584473, "eval_rouge1": 48.6856, "eval_rouge2": 32.331, "eval_rougeL": 44.6585, "eval_rougeLsum": 45.0587, "eval_runtime": 119.5771, "eval_samples_per_second": 1.673, "eval_steps_per_second": 0.418, "num_input_tokens_seen": 463798308, "step": 9000 }, { "epoch": 0.884793985405764, "grad_norm": 23.7694149017334, "learning_rate": 8e-05, "loss": 37.3976, "num_input_tokens_seen": 463964648, "step": 9003 }, { "epoch": 0.885088818456549, "grad_norm": 22.439407348632812, "learning_rate": 8e-05, "loss": 34.1451, "num_input_tokens_seen": 464146436, "step": 9006 }, { "epoch": 0.885383651507334, "grad_norm": 18.74619483947754, "learning_rate": 8e-05, "loss": 32.3557, "num_input_tokens_seen": 464315752, "step": 9009 }, { "epoch": 0.8856784845581189, "grad_norm": 24.110286712646484, "learning_rate": 8e-05, "loss": 34.7745, "num_input_tokens_seen": 464469780, "step": 9012 }, { "epoch": 0.885973317608904, "grad_norm": 21.31003189086914, "learning_rate": 8e-05, "loss": 33.854, "num_input_tokens_seen": 464621000, "step": 9015 }, { "epoch": 0.886268150659689, "grad_norm": 21.028785705566406, "learning_rate": 8e-05, "loss": 33.7743, "num_input_tokens_seen": 464771256, "step": 9018 }, { "epoch": 0.8865629837104739, "grad_norm": 22.16194725036621, "learning_rate": 8e-05, "loss": 35.7378, "num_input_tokens_seen": 464938064, "step": 9021 }, { "epoch": 0.886857816761259, "grad_norm": 23.61879539489746, "learning_rate": 8e-05, "loss": 32.3147, "num_input_tokens_seen": 465078100, "step": 9024 }, { "epoch": 0.8871526498120439, "grad_norm": 23.015331268310547, "learning_rate": 8e-05, "loss": 36.0143, "num_input_tokens_seen": 465224376, "step": 9027 }, { "epoch": 0.8874474828628289, "grad_norm": 25.31874656677246, "learning_rate": 8e-05, "loss": 35.5241, "num_input_tokens_seen": 465373676, "step": 9030 }, { "epoch": 0.887742315913614, "grad_norm": 26.688716888427734, "learning_rate": 8e-05, "loss": 35.9495, "num_input_tokens_seen": 465511792, "step": 9033 }, { "epoch": 0.8880371489643989, "grad_norm": 22.31456184387207, "learning_rate": 8e-05, "loss": 38.7842, "num_input_tokens_seen": 465648152, "step": 9036 }, { "epoch": 0.8883319820151839, "grad_norm": 21.19448471069336, "learning_rate": 8e-05, "loss": 37.5663, "num_input_tokens_seen": 465808344, "step": 9039 }, { "epoch": 0.8886268150659689, "grad_norm": 24.864177703857422, "learning_rate": 8e-05, "loss": 36.8419, "num_input_tokens_seen": 465973604, "step": 9042 }, { "epoch": 0.8889216481167539, "grad_norm": 22.296798706054688, "learning_rate": 8e-05, "loss": 36.3395, "num_input_tokens_seen": 466124000, "step": 9045 }, { "epoch": 0.8892164811675389, "grad_norm": 20.440757751464844, "learning_rate": 8e-05, "loss": 38.0934, "num_input_tokens_seen": 466280940, "step": 9048 }, { "epoch": 0.8895113142183239, "grad_norm": 21.70374870300293, "learning_rate": 8e-05, "loss": 34.7552, "num_input_tokens_seen": 466427172, "step": 9051 }, { "epoch": 0.8898061472691089, "grad_norm": 21.369592666625977, "learning_rate": 8e-05, "loss": 33.519, "num_input_tokens_seen": 466596836, "step": 9054 }, { "epoch": 0.8901009803198938, "grad_norm": 21.856243133544922, "learning_rate": 8e-05, "loss": 37.3898, "num_input_tokens_seen": 466776248, "step": 9057 }, { "epoch": 0.8903958133706789, "grad_norm": 23.62397003173828, "learning_rate": 8e-05, "loss": 37.3079, "num_input_tokens_seen": 466937708, "step": 9060 }, { "epoch": 0.8906906464214639, "grad_norm": 20.472566604614258, "learning_rate": 8e-05, "loss": 32.3763, "num_input_tokens_seen": 467100392, "step": 9063 }, { "epoch": 0.8909854794722488, "grad_norm": 21.549814224243164, "learning_rate": 8e-05, "loss": 35.363, "num_input_tokens_seen": 467240956, "step": 9066 }, { "epoch": 0.8912803125230339, "grad_norm": 23.798681259155273, "learning_rate": 8e-05, "loss": 33.1227, "num_input_tokens_seen": 467400232, "step": 9069 }, { "epoch": 0.8915751455738188, "grad_norm": 22.498197555541992, "learning_rate": 8e-05, "loss": 37.3493, "num_input_tokens_seen": 467548700, "step": 9072 }, { "epoch": 0.8918699786246038, "grad_norm": 27.08021354675293, "learning_rate": 8e-05, "loss": 37.5852, "num_input_tokens_seen": 467701476, "step": 9075 }, { "epoch": 0.8921648116753889, "grad_norm": 21.05506706237793, "learning_rate": 8e-05, "loss": 32.7373, "num_input_tokens_seen": 467854984, "step": 9078 }, { "epoch": 0.8924596447261738, "grad_norm": 22.088733673095703, "learning_rate": 8e-05, "loss": 34.3866, "num_input_tokens_seen": 468016524, "step": 9081 }, { "epoch": 0.8927544777769588, "grad_norm": 34.91509246826172, "learning_rate": 8e-05, "loss": 32.2981, "num_input_tokens_seen": 468170612, "step": 9084 }, { "epoch": 0.8930493108277437, "grad_norm": 21.367778778076172, "learning_rate": 8e-05, "loss": 33.8347, "num_input_tokens_seen": 468329472, "step": 9087 }, { "epoch": 0.8933441438785288, "grad_norm": 20.561328887939453, "learning_rate": 8e-05, "loss": 32.3517, "num_input_tokens_seen": 468483892, "step": 9090 }, { "epoch": 0.8936389769293138, "grad_norm": 22.309688568115234, "learning_rate": 8e-05, "loss": 35.2497, "num_input_tokens_seen": 468605508, "step": 9093 }, { "epoch": 0.8939338099800987, "grad_norm": 26.220806121826172, "learning_rate": 8e-05, "loss": 37.4711, "num_input_tokens_seen": 468758424, "step": 9096 }, { "epoch": 0.8942286430308838, "grad_norm": 25.198400497436523, "learning_rate": 8e-05, "loss": 32.77, "num_input_tokens_seen": 468913748, "step": 9099 }, { "epoch": 0.8945234760816687, "grad_norm": 24.01615333557129, "learning_rate": 8e-05, "loss": 33.9408, "num_input_tokens_seen": 469049056, "step": 9102 }, { "epoch": 0.8948183091324537, "grad_norm": 25.380462646484375, "learning_rate": 8e-05, "loss": 37.3165, "num_input_tokens_seen": 469189400, "step": 9105 }, { "epoch": 0.8951131421832388, "grad_norm": 23.461862564086914, "learning_rate": 8e-05, "loss": 28.8939, "num_input_tokens_seen": 469344408, "step": 9108 }, { "epoch": 0.8954079752340237, "grad_norm": 23.690677642822266, "learning_rate": 8e-05, "loss": 35.6869, "num_input_tokens_seen": 469513492, "step": 9111 }, { "epoch": 0.8957028082848087, "grad_norm": 19.593894958496094, "learning_rate": 8e-05, "loss": 33.1535, "num_input_tokens_seen": 469666376, "step": 9114 }, { "epoch": 0.8959976413355937, "grad_norm": 100.64704895019531, "learning_rate": 8e-05, "loss": 32.9694, "num_input_tokens_seen": 469842892, "step": 9117 }, { "epoch": 0.8962924743863787, "grad_norm": 24.357036590576172, "learning_rate": 8e-05, "loss": 36.882, "num_input_tokens_seen": 470014480, "step": 9120 }, { "epoch": 0.8965873074371637, "grad_norm": 21.07242774963379, "learning_rate": 8e-05, "loss": 32.1337, "num_input_tokens_seen": 470179932, "step": 9123 }, { "epoch": 0.8968821404879487, "grad_norm": 20.460813522338867, "learning_rate": 8e-05, "loss": 31.8768, "num_input_tokens_seen": 470352160, "step": 9126 }, { "epoch": 0.8971769735387337, "grad_norm": 24.652921676635742, "learning_rate": 8e-05, "loss": 36.6415, "num_input_tokens_seen": 470513512, "step": 9129 }, { "epoch": 0.8974718065895186, "grad_norm": 23.364362716674805, "learning_rate": 8e-05, "loss": 36.4622, "num_input_tokens_seen": 470668512, "step": 9132 }, { "epoch": 0.8977666396403037, "grad_norm": 20.231983184814453, "learning_rate": 8e-05, "loss": 37.4117, "num_input_tokens_seen": 470838268, "step": 9135 }, { "epoch": 0.8980614726910887, "grad_norm": 18.93602752685547, "learning_rate": 8e-05, "loss": 34.4865, "num_input_tokens_seen": 470981460, "step": 9138 }, { "epoch": 0.8983563057418736, "grad_norm": 20.244422912597656, "learning_rate": 8e-05, "loss": 36.4161, "num_input_tokens_seen": 471131972, "step": 9141 }, { "epoch": 0.8986511387926587, "grad_norm": 28.71910858154297, "learning_rate": 8e-05, "loss": 35.6331, "num_input_tokens_seen": 471275176, "step": 9144 }, { "epoch": 0.8989459718434436, "grad_norm": 20.917680740356445, "learning_rate": 8e-05, "loss": 37.4839, "num_input_tokens_seen": 471451868, "step": 9147 }, { "epoch": 0.8992408048942286, "grad_norm": 22.487375259399414, "learning_rate": 8e-05, "loss": 36.7346, "num_input_tokens_seen": 471614456, "step": 9150 }, { "epoch": 0.8995356379450137, "grad_norm": 18.380311965942383, "learning_rate": 8e-05, "loss": 33.7283, "num_input_tokens_seen": 471782060, "step": 9153 }, { "epoch": 0.8998304709957986, "grad_norm": 22.763168334960938, "learning_rate": 8e-05, "loss": 35.6685, "num_input_tokens_seen": 471939804, "step": 9156 }, { "epoch": 0.9001253040465836, "grad_norm": 25.03064727783203, "learning_rate": 8e-05, "loss": 30.6905, "num_input_tokens_seen": 472067844, "step": 9159 }, { "epoch": 0.9004201370973686, "grad_norm": 20.84009552001953, "learning_rate": 8e-05, "loss": 39.4543, "num_input_tokens_seen": 472217136, "step": 9162 }, { "epoch": 0.9007149701481536, "grad_norm": 21.24103546142578, "learning_rate": 8e-05, "loss": 32.795, "num_input_tokens_seen": 472381476, "step": 9165 }, { "epoch": 0.9010098031989386, "grad_norm": 29.891502380371094, "learning_rate": 8e-05, "loss": 35.9941, "num_input_tokens_seen": 472536628, "step": 9168 }, { "epoch": 0.9013046362497236, "grad_norm": 20.72770118713379, "learning_rate": 8e-05, "loss": 34.2618, "num_input_tokens_seen": 472696500, "step": 9171 }, { "epoch": 0.9015994693005086, "grad_norm": 21.615867614746094, "learning_rate": 8e-05, "loss": 35.2642, "num_input_tokens_seen": 472862236, "step": 9174 }, { "epoch": 0.9018943023512935, "grad_norm": 17.681720733642578, "learning_rate": 8e-05, "loss": 33.8807, "num_input_tokens_seen": 473041936, "step": 9177 }, { "epoch": 0.9021891354020786, "grad_norm": 18.36534309387207, "learning_rate": 8e-05, "loss": 32.7494, "num_input_tokens_seen": 473215748, "step": 9180 }, { "epoch": 0.9024839684528636, "grad_norm": 21.646133422851562, "learning_rate": 8e-05, "loss": 33.7257, "num_input_tokens_seen": 473363992, "step": 9183 }, { "epoch": 0.9027788015036485, "grad_norm": 20.912612915039062, "learning_rate": 8e-05, "loss": 32.9426, "num_input_tokens_seen": 473505952, "step": 9186 }, { "epoch": 0.9030736345544336, "grad_norm": 22.79306983947754, "learning_rate": 8e-05, "loss": 37.0397, "num_input_tokens_seen": 473665556, "step": 9189 }, { "epoch": 0.9033684676052185, "grad_norm": 21.08576202392578, "learning_rate": 8e-05, "loss": 34.9447, "num_input_tokens_seen": 473829616, "step": 9192 }, { "epoch": 0.9036633006560035, "grad_norm": 21.607378005981445, "learning_rate": 8e-05, "loss": 32.5755, "num_input_tokens_seen": 473981004, "step": 9195 }, { "epoch": 0.9039581337067886, "grad_norm": 19.860673904418945, "learning_rate": 8e-05, "loss": 32.9528, "num_input_tokens_seen": 474139044, "step": 9198 }, { "epoch": 0.9042529667575735, "grad_norm": 20.233686447143555, "learning_rate": 8e-05, "loss": 34.9714, "num_input_tokens_seen": 474286740, "step": 9201 }, { "epoch": 0.9045477998083585, "grad_norm": 24.46356201171875, "learning_rate": 8e-05, "loss": 33.5463, "num_input_tokens_seen": 474438148, "step": 9204 }, { "epoch": 0.9048426328591435, "grad_norm": 21.49631690979004, "learning_rate": 8e-05, "loss": 37.0035, "num_input_tokens_seen": 474570456, "step": 9207 }, { "epoch": 0.9051374659099285, "grad_norm": 20.027463912963867, "learning_rate": 8e-05, "loss": 34.2798, "num_input_tokens_seen": 474720192, "step": 9210 }, { "epoch": 0.9054322989607135, "grad_norm": 22.349233627319336, "learning_rate": 8e-05, "loss": 35.0284, "num_input_tokens_seen": 474892740, "step": 9213 }, { "epoch": 0.9057271320114985, "grad_norm": 20.50714683532715, "learning_rate": 8e-05, "loss": 32.0477, "num_input_tokens_seen": 475034648, "step": 9216 }, { "epoch": 0.9060219650622835, "grad_norm": 18.592439651489258, "learning_rate": 8e-05, "loss": 36.5711, "num_input_tokens_seen": 475203076, "step": 9219 }, { "epoch": 0.9063167981130684, "grad_norm": 22.731468200683594, "learning_rate": 8e-05, "loss": 35.8516, "num_input_tokens_seen": 475359344, "step": 9222 }, { "epoch": 0.9066116311638535, "grad_norm": 21.352195739746094, "learning_rate": 8e-05, "loss": 34.0699, "num_input_tokens_seen": 475514552, "step": 9225 }, { "epoch": 0.9069064642146385, "grad_norm": 22.647829055786133, "learning_rate": 8e-05, "loss": 35.6129, "num_input_tokens_seen": 475658052, "step": 9228 }, { "epoch": 0.9072012972654234, "grad_norm": 21.473758697509766, "learning_rate": 8e-05, "loss": 30.6012, "num_input_tokens_seen": 475828848, "step": 9231 }, { "epoch": 0.9074961303162085, "grad_norm": 22.36081886291504, "learning_rate": 8e-05, "loss": 36.0751, "num_input_tokens_seen": 475984260, "step": 9234 }, { "epoch": 0.9077909633669934, "grad_norm": 26.658342361450195, "learning_rate": 8e-05, "loss": 34.4761, "num_input_tokens_seen": 476143672, "step": 9237 }, { "epoch": 0.9080857964177784, "grad_norm": 23.835533142089844, "learning_rate": 8e-05, "loss": 39.206, "num_input_tokens_seen": 476301540, "step": 9240 }, { "epoch": 0.9083806294685635, "grad_norm": 20.80655288696289, "learning_rate": 8e-05, "loss": 33.6508, "num_input_tokens_seen": 476446140, "step": 9243 }, { "epoch": 0.9086754625193484, "grad_norm": 42.49269485473633, "learning_rate": 8e-05, "loss": 33.288, "num_input_tokens_seen": 476600308, "step": 9246 }, { "epoch": 0.9089702955701334, "grad_norm": 21.22631072998047, "learning_rate": 8e-05, "loss": 36.4164, "num_input_tokens_seen": 476778056, "step": 9249 }, { "epoch": 0.9092651286209184, "grad_norm": 24.015443801879883, "learning_rate": 8e-05, "loss": 35.8094, "num_input_tokens_seen": 476921668, "step": 9252 }, { "epoch": 0.9095599616717034, "grad_norm": 21.08645248413086, "learning_rate": 8e-05, "loss": 33.3251, "num_input_tokens_seen": 477090876, "step": 9255 }, { "epoch": 0.9098547947224884, "grad_norm": 29.930673599243164, "learning_rate": 8e-05, "loss": 34.2575, "num_input_tokens_seen": 477265768, "step": 9258 }, { "epoch": 0.9101496277732734, "grad_norm": 19.717514038085938, "learning_rate": 8e-05, "loss": 31.6078, "num_input_tokens_seen": 477436472, "step": 9261 }, { "epoch": 0.9104444608240584, "grad_norm": 24.465761184692383, "learning_rate": 8e-05, "loss": 32.7405, "num_input_tokens_seen": 477603040, "step": 9264 }, { "epoch": 0.9107392938748434, "grad_norm": 25.059980392456055, "learning_rate": 8e-05, "loss": 36.9096, "num_input_tokens_seen": 477755684, "step": 9267 }, { "epoch": 0.9110341269256284, "grad_norm": 21.387956619262695, "learning_rate": 8e-05, "loss": 35.4109, "num_input_tokens_seen": 477917476, "step": 9270 }, { "epoch": 0.9113289599764134, "grad_norm": 35.50089645385742, "learning_rate": 8e-05, "loss": 33.5537, "num_input_tokens_seen": 478062884, "step": 9273 }, { "epoch": 0.9116237930271983, "grad_norm": 19.14139747619629, "learning_rate": 8e-05, "loss": 32.7446, "num_input_tokens_seen": 478226776, "step": 9276 }, { "epoch": 0.9119186260779834, "grad_norm": 22.263626098632812, "learning_rate": 8e-05, "loss": 37.0658, "num_input_tokens_seen": 478393780, "step": 9279 }, { "epoch": 0.9122134591287684, "grad_norm": 23.79678726196289, "learning_rate": 8e-05, "loss": 37.1102, "num_input_tokens_seen": 478563348, "step": 9282 }, { "epoch": 0.9125082921795533, "grad_norm": 23.68954849243164, "learning_rate": 8e-05, "loss": 38.8632, "num_input_tokens_seen": 478716036, "step": 9285 }, { "epoch": 0.9128031252303384, "grad_norm": 29.16128921508789, "learning_rate": 8e-05, "loss": 33.1692, "num_input_tokens_seen": 478861352, "step": 9288 }, { "epoch": 0.9130979582811233, "grad_norm": 25.06321144104004, "learning_rate": 8e-05, "loss": 33.0169, "num_input_tokens_seen": 479003680, "step": 9291 }, { "epoch": 0.9133927913319083, "grad_norm": 23.62033462524414, "learning_rate": 8e-05, "loss": 37.3165, "num_input_tokens_seen": 479174860, "step": 9294 }, { "epoch": 0.9136876243826934, "grad_norm": 25.722423553466797, "learning_rate": 8e-05, "loss": 39.9271, "num_input_tokens_seen": 479333436, "step": 9297 }, { "epoch": 0.9139824574334783, "grad_norm": 21.272403717041016, "learning_rate": 8e-05, "loss": 35.6935, "num_input_tokens_seen": 479494764, "step": 9300 }, { "epoch": 0.9142772904842633, "grad_norm": 21.769561767578125, "learning_rate": 8e-05, "loss": 31.1685, "num_input_tokens_seen": 479666964, "step": 9303 }, { "epoch": 0.9145721235350482, "grad_norm": 22.09537124633789, "learning_rate": 8e-05, "loss": 32.3473, "num_input_tokens_seen": 479846588, "step": 9306 }, { "epoch": 0.9148669565858333, "grad_norm": 21.59555435180664, "learning_rate": 8e-05, "loss": 37.1373, "num_input_tokens_seen": 480008892, "step": 9309 }, { "epoch": 0.9151617896366183, "grad_norm": 28.728235244750977, "learning_rate": 8e-05, "loss": 35.4527, "num_input_tokens_seen": 480182680, "step": 9312 }, { "epoch": 0.9154566226874032, "grad_norm": 19.65328598022461, "learning_rate": 8e-05, "loss": 31.485, "num_input_tokens_seen": 480319308, "step": 9315 }, { "epoch": 0.9157514557381883, "grad_norm": 21.808292388916016, "learning_rate": 8e-05, "loss": 35.7691, "num_input_tokens_seen": 480467744, "step": 9318 }, { "epoch": 0.9160462887889732, "grad_norm": 23.805320739746094, "learning_rate": 8e-05, "loss": 34.7721, "num_input_tokens_seen": 480607360, "step": 9321 }, { "epoch": 0.9163411218397582, "grad_norm": 23.046770095825195, "learning_rate": 8e-05, "loss": 32.3187, "num_input_tokens_seen": 480765240, "step": 9324 }, { "epoch": 0.9166359548905433, "grad_norm": 21.922595977783203, "learning_rate": 8e-05, "loss": 35.2356, "num_input_tokens_seen": 480915016, "step": 9327 }, { "epoch": 0.9169307879413282, "grad_norm": 21.19693374633789, "learning_rate": 8e-05, "loss": 32.9518, "num_input_tokens_seen": 481050844, "step": 9330 }, { "epoch": 0.9172256209921132, "grad_norm": 30.573301315307617, "learning_rate": 8e-05, "loss": 35.4635, "num_input_tokens_seen": 481198144, "step": 9333 }, { "epoch": 0.9175204540428982, "grad_norm": 20.04607391357422, "learning_rate": 8e-05, "loss": 36.8555, "num_input_tokens_seen": 481357400, "step": 9336 }, { "epoch": 0.9178152870936832, "grad_norm": 22.094623565673828, "learning_rate": 8e-05, "loss": 37.6246, "num_input_tokens_seen": 481501936, "step": 9339 }, { "epoch": 0.9181101201444682, "grad_norm": 21.94713020324707, "learning_rate": 8e-05, "loss": 34.7503, "num_input_tokens_seen": 481656604, "step": 9342 }, { "epoch": 0.9184049531952532, "grad_norm": 18.993497848510742, "learning_rate": 8e-05, "loss": 34.0214, "num_input_tokens_seen": 481831376, "step": 9345 }, { "epoch": 0.9186997862460382, "grad_norm": 20.522165298461914, "learning_rate": 8e-05, "loss": 33.8274, "num_input_tokens_seen": 481989768, "step": 9348 }, { "epoch": 0.9189946192968231, "grad_norm": 22.100162506103516, "learning_rate": 8e-05, "loss": 32.6869, "num_input_tokens_seen": 482148876, "step": 9351 }, { "epoch": 0.9192894523476082, "grad_norm": 31.93699073791504, "learning_rate": 8e-05, "loss": 38.894, "num_input_tokens_seen": 482302396, "step": 9354 }, { "epoch": 0.9195842853983932, "grad_norm": 18.395660400390625, "learning_rate": 8e-05, "loss": 35.2791, "num_input_tokens_seen": 482459164, "step": 9357 }, { "epoch": 0.9198791184491781, "grad_norm": 20.613759994506836, "learning_rate": 8e-05, "loss": 32.9776, "num_input_tokens_seen": 482634520, "step": 9360 }, { "epoch": 0.9201739514999632, "grad_norm": 26.639373779296875, "learning_rate": 8e-05, "loss": 36.3913, "num_input_tokens_seen": 482779476, "step": 9363 }, { "epoch": 0.9204687845507481, "grad_norm": 21.73651885986328, "learning_rate": 8e-05, "loss": 33.269, "num_input_tokens_seen": 482931304, "step": 9366 }, { "epoch": 0.9207636176015331, "grad_norm": 19.23166847229004, "learning_rate": 8e-05, "loss": 34.0668, "num_input_tokens_seen": 483111640, "step": 9369 }, { "epoch": 0.9210584506523182, "grad_norm": 21.559226989746094, "learning_rate": 8e-05, "loss": 40.3595, "num_input_tokens_seen": 483274320, "step": 9372 }, { "epoch": 0.9213532837031031, "grad_norm": 22.628686904907227, "learning_rate": 8e-05, "loss": 33.3141, "num_input_tokens_seen": 483416020, "step": 9375 }, { "epoch": 0.9216481167538881, "grad_norm": 18.740367889404297, "learning_rate": 8e-05, "loss": 31.2498, "num_input_tokens_seen": 483552496, "step": 9378 }, { "epoch": 0.9219429498046731, "grad_norm": 20.061677932739258, "learning_rate": 8e-05, "loss": 34.287, "num_input_tokens_seen": 483715768, "step": 9381 }, { "epoch": 0.9222377828554581, "grad_norm": 19.16664695739746, "learning_rate": 8e-05, "loss": 33.4455, "num_input_tokens_seen": 483880060, "step": 9384 }, { "epoch": 0.9225326159062431, "grad_norm": 20.502145767211914, "learning_rate": 8e-05, "loss": 29.6471, "num_input_tokens_seen": 484019340, "step": 9387 }, { "epoch": 0.9228274489570281, "grad_norm": 21.350358963012695, "learning_rate": 8e-05, "loss": 35.5749, "num_input_tokens_seen": 484185872, "step": 9390 }, { "epoch": 0.9231222820078131, "grad_norm": 20.89253807067871, "learning_rate": 8e-05, "loss": 34.3302, "num_input_tokens_seen": 484346760, "step": 9393 }, { "epoch": 0.923417115058598, "grad_norm": 24.376724243164062, "learning_rate": 8e-05, "loss": 36.6064, "num_input_tokens_seen": 484479972, "step": 9396 }, { "epoch": 0.9237119481093831, "grad_norm": 19.522863388061523, "learning_rate": 8e-05, "loss": 34.7432, "num_input_tokens_seen": 484613160, "step": 9399 }, { "epoch": 0.9240067811601681, "grad_norm": 199.71450805664062, "learning_rate": 8e-05, "loss": 32.396, "num_input_tokens_seen": 484759556, "step": 9402 }, { "epoch": 0.924301614210953, "grad_norm": 21.016319274902344, "learning_rate": 8e-05, "loss": 33.2017, "num_input_tokens_seen": 484918408, "step": 9405 }, { "epoch": 0.9245964472617381, "grad_norm": 21.742298126220703, "learning_rate": 8e-05, "loss": 36.4643, "num_input_tokens_seen": 485084048, "step": 9408 }, { "epoch": 0.924891280312523, "grad_norm": 21.95002555847168, "learning_rate": 8e-05, "loss": 34.1213, "num_input_tokens_seen": 485233412, "step": 9411 }, { "epoch": 0.925186113363308, "grad_norm": 23.653345108032227, "learning_rate": 8e-05, "loss": 34.5019, "num_input_tokens_seen": 485369200, "step": 9414 }, { "epoch": 0.9254809464140931, "grad_norm": 19.90846061706543, "learning_rate": 8e-05, "loss": 33.594, "num_input_tokens_seen": 485519936, "step": 9417 }, { "epoch": 0.925775779464878, "grad_norm": 22.45507049560547, "learning_rate": 8e-05, "loss": 40.2271, "num_input_tokens_seen": 485667392, "step": 9420 }, { "epoch": 0.926070612515663, "grad_norm": 23.564464569091797, "learning_rate": 8e-05, "loss": 33.0576, "num_input_tokens_seen": 485811800, "step": 9423 }, { "epoch": 0.926365445566448, "grad_norm": 21.314844131469727, "learning_rate": 8e-05, "loss": 30.8356, "num_input_tokens_seen": 485970732, "step": 9426 }, { "epoch": 0.926660278617233, "grad_norm": 18.391977310180664, "learning_rate": 8e-05, "loss": 33.5133, "num_input_tokens_seen": 486143860, "step": 9429 }, { "epoch": 0.926955111668018, "grad_norm": 19.696285247802734, "learning_rate": 8e-05, "loss": 33.489, "num_input_tokens_seen": 486277652, "step": 9432 }, { "epoch": 0.927249944718803, "grad_norm": 22.3781681060791, "learning_rate": 8e-05, "loss": 36.3257, "num_input_tokens_seen": 486453628, "step": 9435 }, { "epoch": 0.927544777769588, "grad_norm": 21.230148315429688, "learning_rate": 8e-05, "loss": 33.2605, "num_input_tokens_seen": 486595912, "step": 9438 }, { "epoch": 0.9278396108203729, "grad_norm": 21.168331146240234, "learning_rate": 8e-05, "loss": 33.8501, "num_input_tokens_seen": 486731804, "step": 9441 }, { "epoch": 0.928134443871158, "grad_norm": 18.55724334716797, "learning_rate": 8e-05, "loss": 28.1622, "num_input_tokens_seen": 486884636, "step": 9444 }, { "epoch": 0.928429276921943, "grad_norm": 24.025060653686523, "learning_rate": 8e-05, "loss": 37.0399, "num_input_tokens_seen": 487042888, "step": 9447 }, { "epoch": 0.9287241099727279, "grad_norm": 27.229087829589844, "learning_rate": 8e-05, "loss": 37.3008, "num_input_tokens_seen": 487211120, "step": 9450 }, { "epoch": 0.929018943023513, "grad_norm": 18.948274612426758, "learning_rate": 8e-05, "loss": 33.0499, "num_input_tokens_seen": 487353712, "step": 9453 }, { "epoch": 0.9293137760742979, "grad_norm": 21.234724044799805, "learning_rate": 8e-05, "loss": 38.9776, "num_input_tokens_seen": 487529332, "step": 9456 }, { "epoch": 0.9296086091250829, "grad_norm": 25.416913986206055, "learning_rate": 8e-05, "loss": 34.834, "num_input_tokens_seen": 487704728, "step": 9459 }, { "epoch": 0.929903442175868, "grad_norm": 24.957584381103516, "learning_rate": 8e-05, "loss": 37.8111, "num_input_tokens_seen": 487870980, "step": 9462 }, { "epoch": 0.9301982752266529, "grad_norm": 21.949615478515625, "learning_rate": 8e-05, "loss": 37.4198, "num_input_tokens_seen": 488027824, "step": 9465 }, { "epoch": 0.9304931082774379, "grad_norm": 21.102859497070312, "learning_rate": 8e-05, "loss": 35.4306, "num_input_tokens_seen": 488174928, "step": 9468 }, { "epoch": 0.9307879413282228, "grad_norm": 20.244783401489258, "learning_rate": 8e-05, "loss": 34.1812, "num_input_tokens_seen": 488327380, "step": 9471 }, { "epoch": 0.9310827743790079, "grad_norm": 22.8046817779541, "learning_rate": 8e-05, "loss": 32.5576, "num_input_tokens_seen": 488475732, "step": 9474 }, { "epoch": 0.9313776074297929, "grad_norm": 24.047414779663086, "learning_rate": 8e-05, "loss": 36.3854, "num_input_tokens_seen": 488624304, "step": 9477 }, { "epoch": 0.9316724404805778, "grad_norm": 22.388870239257812, "learning_rate": 8e-05, "loss": 34.9084, "num_input_tokens_seen": 488770808, "step": 9480 }, { "epoch": 0.9319672735313629, "grad_norm": 22.250295639038086, "learning_rate": 8e-05, "loss": 35.0276, "num_input_tokens_seen": 488912556, "step": 9483 }, { "epoch": 0.9322621065821478, "grad_norm": 24.2645206451416, "learning_rate": 8e-05, "loss": 35.794, "num_input_tokens_seen": 489060396, "step": 9486 }, { "epoch": 0.9325569396329328, "grad_norm": 21.49842643737793, "learning_rate": 8e-05, "loss": 34.637, "num_input_tokens_seen": 489218892, "step": 9489 }, { "epoch": 0.9328517726837179, "grad_norm": 21.13848114013672, "learning_rate": 8e-05, "loss": 34.4821, "num_input_tokens_seen": 489372688, "step": 9492 }, { "epoch": 0.9331466057345028, "grad_norm": 24.668495178222656, "learning_rate": 8e-05, "loss": 32.8465, "num_input_tokens_seen": 489505468, "step": 9495 }, { "epoch": 0.9334414387852878, "grad_norm": 19.518667221069336, "learning_rate": 8e-05, "loss": 33.5588, "num_input_tokens_seen": 489659504, "step": 9498 }, { "epoch": 0.9337362718360728, "grad_norm": 19.34673309326172, "learning_rate": 8e-05, "loss": 32.4264, "num_input_tokens_seen": 489810660, "step": 9501 }, { "epoch": 0.9340311048868578, "grad_norm": 22.143505096435547, "learning_rate": 8e-05, "loss": 37.9556, "num_input_tokens_seen": 489979256, "step": 9504 }, { "epoch": 0.9343259379376428, "grad_norm": 20.706947326660156, "learning_rate": 8e-05, "loss": 35.6361, "num_input_tokens_seen": 490134768, "step": 9507 }, { "epoch": 0.9346207709884278, "grad_norm": 20.448240280151367, "learning_rate": 8e-05, "loss": 32.0137, "num_input_tokens_seen": 490292724, "step": 9510 }, { "epoch": 0.9349156040392128, "grad_norm": 22.021400451660156, "learning_rate": 8e-05, "loss": 36.6951, "num_input_tokens_seen": 490450020, "step": 9513 }, { "epoch": 0.9352104370899977, "grad_norm": 21.240678787231445, "learning_rate": 8e-05, "loss": 35.2441, "num_input_tokens_seen": 490605652, "step": 9516 }, { "epoch": 0.9355052701407828, "grad_norm": 21.042343139648438, "learning_rate": 8e-05, "loss": 36.4855, "num_input_tokens_seen": 490769044, "step": 9519 }, { "epoch": 0.9358001031915678, "grad_norm": 21.686046600341797, "learning_rate": 8e-05, "loss": 31.8773, "num_input_tokens_seen": 490922684, "step": 9522 }, { "epoch": 0.9360949362423527, "grad_norm": 31.692363739013672, "learning_rate": 8e-05, "loss": 38.7701, "num_input_tokens_seen": 491073924, "step": 9525 }, { "epoch": 0.9363897692931378, "grad_norm": 20.821338653564453, "learning_rate": 8e-05, "loss": 37.0108, "num_input_tokens_seen": 491229148, "step": 9528 }, { "epoch": 0.9366846023439227, "grad_norm": 24.331012725830078, "learning_rate": 8e-05, "loss": 36.7816, "num_input_tokens_seen": 491401840, "step": 9531 }, { "epoch": 0.9369794353947077, "grad_norm": 22.13974952697754, "learning_rate": 8e-05, "loss": 36.4159, "num_input_tokens_seen": 491558052, "step": 9534 }, { "epoch": 0.9372742684454928, "grad_norm": 20.529359817504883, "learning_rate": 8e-05, "loss": 33.4253, "num_input_tokens_seen": 491710880, "step": 9537 }, { "epoch": 0.9375691014962777, "grad_norm": 22.973304748535156, "learning_rate": 8e-05, "loss": 38.1455, "num_input_tokens_seen": 491863964, "step": 9540 }, { "epoch": 0.9378639345470627, "grad_norm": 20.182170867919922, "learning_rate": 8e-05, "loss": 36.4646, "num_input_tokens_seen": 492011328, "step": 9543 }, { "epoch": 0.9381587675978477, "grad_norm": 21.98539161682129, "learning_rate": 8e-05, "loss": 35.9327, "num_input_tokens_seen": 492181992, "step": 9546 }, { "epoch": 0.9384536006486327, "grad_norm": 22.3552303314209, "learning_rate": 8e-05, "loss": 36.1662, "num_input_tokens_seen": 492328688, "step": 9549 }, { "epoch": 0.9387484336994177, "grad_norm": 18.721784591674805, "learning_rate": 8e-05, "loss": 36.1254, "num_input_tokens_seen": 492490200, "step": 9552 }, { "epoch": 0.9390432667502027, "grad_norm": 22.69650650024414, "learning_rate": 8e-05, "loss": 34.9092, "num_input_tokens_seen": 492650216, "step": 9555 }, { "epoch": 0.9393380998009877, "grad_norm": 25.502473831176758, "learning_rate": 8e-05, "loss": 33.9575, "num_input_tokens_seen": 492808952, "step": 9558 }, { "epoch": 0.9396329328517726, "grad_norm": 19.732494354248047, "learning_rate": 8e-05, "loss": 32.2993, "num_input_tokens_seen": 492963472, "step": 9561 }, { "epoch": 0.9399277659025577, "grad_norm": 20.05377960205078, "learning_rate": 8e-05, "loss": 33.9659, "num_input_tokens_seen": 493119448, "step": 9564 }, { "epoch": 0.9402225989533427, "grad_norm": 30.67023277282715, "learning_rate": 8e-05, "loss": 34.9658, "num_input_tokens_seen": 493269844, "step": 9567 }, { "epoch": 0.9405174320041276, "grad_norm": 26.700212478637695, "learning_rate": 8e-05, "loss": 35.2712, "num_input_tokens_seen": 493403864, "step": 9570 }, { "epoch": 0.9408122650549127, "grad_norm": 24.358205795288086, "learning_rate": 8e-05, "loss": 34.8117, "num_input_tokens_seen": 493562716, "step": 9573 }, { "epoch": 0.9411070981056977, "grad_norm": 22.915876388549805, "learning_rate": 8e-05, "loss": 34.7899, "num_input_tokens_seen": 493726740, "step": 9576 }, { "epoch": 0.9414019311564826, "grad_norm": 25.79482650756836, "learning_rate": 8e-05, "loss": 36.4804, "num_input_tokens_seen": 493875008, "step": 9579 }, { "epoch": 0.9416967642072677, "grad_norm": 18.314313888549805, "learning_rate": 8e-05, "loss": 33.5188, "num_input_tokens_seen": 494033616, "step": 9582 }, { "epoch": 0.9419915972580526, "grad_norm": 22.25737953186035, "learning_rate": 8e-05, "loss": 37.1847, "num_input_tokens_seen": 494174624, "step": 9585 }, { "epoch": 0.9422864303088376, "grad_norm": 23.608041763305664, "learning_rate": 8e-05, "loss": 33.9381, "num_input_tokens_seen": 494322532, "step": 9588 }, { "epoch": 0.9425812633596227, "grad_norm": 20.048398971557617, "learning_rate": 8e-05, "loss": 36.2292, "num_input_tokens_seen": 494474040, "step": 9591 }, { "epoch": 0.9428760964104076, "grad_norm": 18.939929962158203, "learning_rate": 8e-05, "loss": 30.8911, "num_input_tokens_seen": 494617224, "step": 9594 }, { "epoch": 0.9431709294611926, "grad_norm": 17.95427894592285, "learning_rate": 8e-05, "loss": 35.8452, "num_input_tokens_seen": 494774616, "step": 9597 }, { "epoch": 0.9434657625119776, "grad_norm": 21.61246109008789, "learning_rate": 8e-05, "loss": 36.3985, "num_input_tokens_seen": 494931148, "step": 9600 }, { "epoch": 0.9437605955627626, "grad_norm": 21.577884674072266, "learning_rate": 8e-05, "loss": 35.3917, "num_input_tokens_seen": 495082104, "step": 9603 }, { "epoch": 0.9440554286135476, "grad_norm": 20.373001098632812, "learning_rate": 8e-05, "loss": 33.8704, "num_input_tokens_seen": 495222352, "step": 9606 }, { "epoch": 0.9443502616643326, "grad_norm": 20.00078010559082, "learning_rate": 8e-05, "loss": 37.882, "num_input_tokens_seen": 495387952, "step": 9609 }, { "epoch": 0.9446450947151176, "grad_norm": 20.51920509338379, "learning_rate": 8e-05, "loss": 34.1078, "num_input_tokens_seen": 495531268, "step": 9612 }, { "epoch": 0.9449399277659025, "grad_norm": 21.88003158569336, "learning_rate": 8e-05, "loss": 37.5921, "num_input_tokens_seen": 495702868, "step": 9615 }, { "epoch": 0.9452347608166876, "grad_norm": 28.009687423706055, "learning_rate": 8e-05, "loss": 36.7524, "num_input_tokens_seen": 495851672, "step": 9618 }, { "epoch": 0.9455295938674726, "grad_norm": 23.272294998168945, "learning_rate": 8e-05, "loss": 34.6014, "num_input_tokens_seen": 495997208, "step": 9621 }, { "epoch": 0.9458244269182575, "grad_norm": 27.648483276367188, "learning_rate": 8e-05, "loss": 35.1428, "num_input_tokens_seen": 496153292, "step": 9624 }, { "epoch": 0.9461192599690426, "grad_norm": 21.118453979492188, "learning_rate": 8e-05, "loss": 35.3225, "num_input_tokens_seen": 496308760, "step": 9627 }, { "epoch": 0.9464140930198275, "grad_norm": 19.653175354003906, "learning_rate": 8e-05, "loss": 28.756, "num_input_tokens_seen": 496479760, "step": 9630 }, { "epoch": 0.9467089260706125, "grad_norm": 24.253093719482422, "learning_rate": 8e-05, "loss": 35.9993, "num_input_tokens_seen": 496618096, "step": 9633 }, { "epoch": 0.9470037591213976, "grad_norm": 25.256576538085938, "learning_rate": 8e-05, "loss": 38.7862, "num_input_tokens_seen": 496770116, "step": 9636 }, { "epoch": 0.9472985921721825, "grad_norm": 28.437040328979492, "learning_rate": 8e-05, "loss": 31.186, "num_input_tokens_seen": 496941048, "step": 9639 }, { "epoch": 0.9475934252229675, "grad_norm": 20.87740707397461, "learning_rate": 8e-05, "loss": 37.1772, "num_input_tokens_seen": 497102572, "step": 9642 }, { "epoch": 0.9478882582737524, "grad_norm": 20.97877311706543, "learning_rate": 8e-05, "loss": 34.5502, "num_input_tokens_seen": 497254912, "step": 9645 }, { "epoch": 0.9481830913245375, "grad_norm": 23.761564254760742, "learning_rate": 8e-05, "loss": 38.1404, "num_input_tokens_seen": 497407040, "step": 9648 }, { "epoch": 0.9484779243753225, "grad_norm": 22.26776695251465, "learning_rate": 8e-05, "loss": 36.5098, "num_input_tokens_seen": 497547540, "step": 9651 }, { "epoch": 0.9487727574261074, "grad_norm": 22.641294479370117, "learning_rate": 8e-05, "loss": 31.9351, "num_input_tokens_seen": 497700032, "step": 9654 }, { "epoch": 0.9490675904768925, "grad_norm": 22.351139068603516, "learning_rate": 8e-05, "loss": 37.8459, "num_input_tokens_seen": 497862068, "step": 9657 }, { "epoch": 0.9493624235276774, "grad_norm": 19.74834632873535, "learning_rate": 8e-05, "loss": 36.4842, "num_input_tokens_seen": 498024708, "step": 9660 }, { "epoch": 0.9496572565784624, "grad_norm": 26.638202667236328, "learning_rate": 8e-05, "loss": 37.338, "num_input_tokens_seen": 498177824, "step": 9663 }, { "epoch": 0.9499520896292475, "grad_norm": 22.298736572265625, "learning_rate": 8e-05, "loss": 33.4881, "num_input_tokens_seen": 498333556, "step": 9666 }, { "epoch": 0.9502469226800324, "grad_norm": 19.22551155090332, "learning_rate": 8e-05, "loss": 30.2335, "num_input_tokens_seen": 498469020, "step": 9669 }, { "epoch": 0.9505417557308175, "grad_norm": 24.792818069458008, "learning_rate": 8e-05, "loss": 33.5654, "num_input_tokens_seen": 498619116, "step": 9672 }, { "epoch": 0.9508365887816024, "grad_norm": 20.585163116455078, "learning_rate": 8e-05, "loss": 30.6095, "num_input_tokens_seen": 498767456, "step": 9675 }, { "epoch": 0.9511314218323874, "grad_norm": 18.556869506835938, "learning_rate": 8e-05, "loss": 35.2455, "num_input_tokens_seen": 498925584, "step": 9678 }, { "epoch": 0.9514262548831725, "grad_norm": 23.33407974243164, "learning_rate": 8e-05, "loss": 35.0502, "num_input_tokens_seen": 499102064, "step": 9681 }, { "epoch": 0.9517210879339574, "grad_norm": 20.793521881103516, "learning_rate": 8e-05, "loss": 35.8787, "num_input_tokens_seen": 499253736, "step": 9684 }, { "epoch": 0.9520159209847424, "grad_norm": 20.79303741455078, "learning_rate": 8e-05, "loss": 38.7343, "num_input_tokens_seen": 499411668, "step": 9687 }, { "epoch": 0.9523107540355273, "grad_norm": 19.87205696105957, "learning_rate": 8e-05, "loss": 34.112, "num_input_tokens_seen": 499575948, "step": 9690 }, { "epoch": 0.9526055870863124, "grad_norm": 24.625654220581055, "learning_rate": 8e-05, "loss": 34.9216, "num_input_tokens_seen": 499714888, "step": 9693 }, { "epoch": 0.9529004201370974, "grad_norm": 21.79035758972168, "learning_rate": 8e-05, "loss": 35.6022, "num_input_tokens_seen": 499867056, "step": 9696 }, { "epoch": 0.9531952531878823, "grad_norm": 17.22002601623535, "learning_rate": 8e-05, "loss": 33.1651, "num_input_tokens_seen": 500020932, "step": 9699 }, { "epoch": 0.9534900862386674, "grad_norm": 20.668373107910156, "learning_rate": 8e-05, "loss": 28.9119, "num_input_tokens_seen": 500185388, "step": 9702 }, { "epoch": 0.9537849192894523, "grad_norm": 24.211820602416992, "learning_rate": 8e-05, "loss": 37.2794, "num_input_tokens_seen": 500339608, "step": 9705 }, { "epoch": 0.9540797523402373, "grad_norm": 22.269710540771484, "learning_rate": 8e-05, "loss": 33.7264, "num_input_tokens_seen": 500474852, "step": 9708 }, { "epoch": 0.9543745853910224, "grad_norm": 20.687097549438477, "learning_rate": 8e-05, "loss": 35.9343, "num_input_tokens_seen": 500621900, "step": 9711 }, { "epoch": 0.9546694184418073, "grad_norm": 18.96708869934082, "learning_rate": 8e-05, "loss": 34.9415, "num_input_tokens_seen": 500796820, "step": 9714 }, { "epoch": 0.9549642514925923, "grad_norm": 481.27459716796875, "learning_rate": 8e-05, "loss": 33.8589, "num_input_tokens_seen": 500949388, "step": 9717 }, { "epoch": 0.9552590845433773, "grad_norm": 23.094690322875977, "learning_rate": 8e-05, "loss": 32.8335, "num_input_tokens_seen": 501096632, "step": 9720 }, { "epoch": 0.9555539175941623, "grad_norm": 20.492908477783203, "learning_rate": 8e-05, "loss": 34.0337, "num_input_tokens_seen": 501247176, "step": 9723 }, { "epoch": 0.9558487506449473, "grad_norm": 20.346040725708008, "learning_rate": 8e-05, "loss": 34.3415, "num_input_tokens_seen": 501389276, "step": 9726 }, { "epoch": 0.9561435836957323, "grad_norm": 25.47278594970703, "learning_rate": 8e-05, "loss": 30.5468, "num_input_tokens_seen": 501528648, "step": 9729 }, { "epoch": 0.9564384167465173, "grad_norm": 21.43868064880371, "learning_rate": 8e-05, "loss": 37.3752, "num_input_tokens_seen": 501666952, "step": 9732 }, { "epoch": 0.9567332497973022, "grad_norm": 23.923418045043945, "learning_rate": 8e-05, "loss": 37.6275, "num_input_tokens_seen": 501814412, "step": 9735 }, { "epoch": 0.9570280828480873, "grad_norm": 22.2674503326416, "learning_rate": 8e-05, "loss": 39.7825, "num_input_tokens_seen": 501952240, "step": 9738 }, { "epoch": 0.9573229158988723, "grad_norm": 19.780776977539062, "learning_rate": 8e-05, "loss": 33.9042, "num_input_tokens_seen": 502113492, "step": 9741 }, { "epoch": 0.9576177489496572, "grad_norm": 18.62359046936035, "learning_rate": 8e-05, "loss": 29.9758, "num_input_tokens_seen": 502272680, "step": 9744 }, { "epoch": 0.9579125820004423, "grad_norm": 45.219757080078125, "learning_rate": 8e-05, "loss": 38.9523, "num_input_tokens_seen": 502412496, "step": 9747 }, { "epoch": 0.9582074150512272, "grad_norm": 21.53115463256836, "learning_rate": 8e-05, "loss": 35.1453, "num_input_tokens_seen": 502565504, "step": 9750 }, { "epoch": 0.9585022481020122, "grad_norm": 19.353473663330078, "learning_rate": 8e-05, "loss": 36.7383, "num_input_tokens_seen": 502697288, "step": 9753 }, { "epoch": 0.9587970811527973, "grad_norm": 18.89324951171875, "learning_rate": 8e-05, "loss": 33.598, "num_input_tokens_seen": 502848920, "step": 9756 }, { "epoch": 0.9590919142035822, "grad_norm": 21.069604873657227, "learning_rate": 8e-05, "loss": 34.0274, "num_input_tokens_seen": 503030192, "step": 9759 }, { "epoch": 0.9593867472543672, "grad_norm": 37.08634948730469, "learning_rate": 8e-05, "loss": 33.2065, "num_input_tokens_seen": 503184856, "step": 9762 }, { "epoch": 0.9596815803051522, "grad_norm": 22.28505516052246, "learning_rate": 8e-05, "loss": 35.7401, "num_input_tokens_seen": 503320700, "step": 9765 }, { "epoch": 0.9599764133559372, "grad_norm": 18.990753173828125, "learning_rate": 8e-05, "loss": 33.9176, "num_input_tokens_seen": 503488576, "step": 9768 }, { "epoch": 0.9602712464067222, "grad_norm": 19.933992385864258, "learning_rate": 8e-05, "loss": 35.2393, "num_input_tokens_seen": 503654248, "step": 9771 }, { "epoch": 0.9605660794575072, "grad_norm": 19.628435134887695, "learning_rate": 8e-05, "loss": 34.0539, "num_input_tokens_seen": 503801032, "step": 9774 }, { "epoch": 0.9608609125082922, "grad_norm": 24.20693016052246, "learning_rate": 8e-05, "loss": 33.1455, "num_input_tokens_seen": 503960648, "step": 9777 }, { "epoch": 0.9611557455590771, "grad_norm": 22.341066360473633, "learning_rate": 8e-05, "loss": 38.3758, "num_input_tokens_seen": 504113264, "step": 9780 }, { "epoch": 0.9614505786098622, "grad_norm": 20.44728660583496, "learning_rate": 8e-05, "loss": 34.0762, "num_input_tokens_seen": 504251244, "step": 9783 }, { "epoch": 0.9617454116606472, "grad_norm": 18.34883689880371, "learning_rate": 8e-05, "loss": 31.6663, "num_input_tokens_seen": 504408236, "step": 9786 }, { "epoch": 0.9620402447114321, "grad_norm": 19.12334442138672, "learning_rate": 8e-05, "loss": 37.1449, "num_input_tokens_seen": 504583188, "step": 9789 }, { "epoch": 0.9623350777622172, "grad_norm": 22.31780242919922, "learning_rate": 8e-05, "loss": 34.1645, "num_input_tokens_seen": 504735980, "step": 9792 }, { "epoch": 0.9626299108130021, "grad_norm": 21.41351318359375, "learning_rate": 8e-05, "loss": 35.3733, "num_input_tokens_seen": 504874156, "step": 9795 }, { "epoch": 0.9629247438637871, "grad_norm": 18.388166427612305, "learning_rate": 8e-05, "loss": 29.836, "num_input_tokens_seen": 505025504, "step": 9798 }, { "epoch": 0.9632195769145722, "grad_norm": 20.411054611206055, "learning_rate": 8e-05, "loss": 31.9745, "num_input_tokens_seen": 505190108, "step": 9801 }, { "epoch": 0.9635144099653571, "grad_norm": 25.66120147705078, "learning_rate": 8e-05, "loss": 39.5065, "num_input_tokens_seen": 505335460, "step": 9804 }, { "epoch": 0.9638092430161421, "grad_norm": 21.30845832824707, "learning_rate": 8e-05, "loss": 33.3261, "num_input_tokens_seen": 505511100, "step": 9807 }, { "epoch": 0.964104076066927, "grad_norm": 25.881977081298828, "learning_rate": 8e-05, "loss": 34.2817, "num_input_tokens_seen": 505643444, "step": 9810 }, { "epoch": 0.9643989091177121, "grad_norm": 20.899316787719727, "learning_rate": 8e-05, "loss": 34.6943, "num_input_tokens_seen": 505804368, "step": 9813 }, { "epoch": 0.9646937421684971, "grad_norm": 20.84662628173828, "learning_rate": 8e-05, "loss": 35.9952, "num_input_tokens_seen": 505949192, "step": 9816 }, { "epoch": 0.964988575219282, "grad_norm": 23.826128005981445, "learning_rate": 8e-05, "loss": 34.4256, "num_input_tokens_seen": 506116204, "step": 9819 }, { "epoch": 0.9652834082700671, "grad_norm": 21.089387893676758, "learning_rate": 8e-05, "loss": 37.0237, "num_input_tokens_seen": 506268228, "step": 9822 }, { "epoch": 0.965578241320852, "grad_norm": 20.195451736450195, "learning_rate": 8e-05, "loss": 32.2907, "num_input_tokens_seen": 506423696, "step": 9825 }, { "epoch": 0.965873074371637, "grad_norm": 20.46344757080078, "learning_rate": 8e-05, "loss": 34.8418, "num_input_tokens_seen": 506570468, "step": 9828 }, { "epoch": 0.9661679074224221, "grad_norm": 20.803773880004883, "learning_rate": 8e-05, "loss": 35.3394, "num_input_tokens_seen": 506735356, "step": 9831 }, { "epoch": 0.966462740473207, "grad_norm": 20.69495391845703, "learning_rate": 8e-05, "loss": 31.3141, "num_input_tokens_seen": 506882236, "step": 9834 }, { "epoch": 0.966757573523992, "grad_norm": 20.818252563476562, "learning_rate": 8e-05, "loss": 34.065, "num_input_tokens_seen": 507036800, "step": 9837 }, { "epoch": 0.967052406574777, "grad_norm": 19.533720016479492, "learning_rate": 8e-05, "loss": 36.8215, "num_input_tokens_seen": 507189136, "step": 9840 }, { "epoch": 0.967347239625562, "grad_norm": 24.33357810974121, "learning_rate": 8e-05, "loss": 31.5452, "num_input_tokens_seen": 507332120, "step": 9843 }, { "epoch": 0.967642072676347, "grad_norm": 21.880701065063477, "learning_rate": 8e-05, "loss": 37.1414, "num_input_tokens_seen": 507486400, "step": 9846 }, { "epoch": 0.967936905727132, "grad_norm": 23.587797164916992, "learning_rate": 8e-05, "loss": 33.8317, "num_input_tokens_seen": 507651728, "step": 9849 }, { "epoch": 0.968231738777917, "grad_norm": 16.616315841674805, "learning_rate": 8e-05, "loss": 34.2093, "num_input_tokens_seen": 507812512, "step": 9852 }, { "epoch": 0.9685265718287019, "grad_norm": 21.581829071044922, "learning_rate": 8e-05, "loss": 30.2119, "num_input_tokens_seen": 507974420, "step": 9855 }, { "epoch": 0.968821404879487, "grad_norm": 20.407085418701172, "learning_rate": 8e-05, "loss": 37.4761, "num_input_tokens_seen": 508119500, "step": 9858 }, { "epoch": 0.969116237930272, "grad_norm": 24.967605590820312, "learning_rate": 8e-05, "loss": 32.8561, "num_input_tokens_seen": 508279472, "step": 9861 }, { "epoch": 0.9694110709810569, "grad_norm": 28.922719955444336, "learning_rate": 8e-05, "loss": 31.4779, "num_input_tokens_seen": 508415424, "step": 9864 }, { "epoch": 0.969705904031842, "grad_norm": 24.99030303955078, "learning_rate": 8e-05, "loss": 35.4844, "num_input_tokens_seen": 508561032, "step": 9867 }, { "epoch": 0.9700007370826269, "grad_norm": 21.515268325805664, "learning_rate": 8e-05, "loss": 37.914, "num_input_tokens_seen": 508724348, "step": 9870 }, { "epoch": 0.9702955701334119, "grad_norm": 20.35093879699707, "learning_rate": 8e-05, "loss": 39.0067, "num_input_tokens_seen": 508882664, "step": 9873 }, { "epoch": 0.970590403184197, "grad_norm": 21.495630264282227, "learning_rate": 8e-05, "loss": 33.9999, "num_input_tokens_seen": 509022868, "step": 9876 }, { "epoch": 0.9708852362349819, "grad_norm": 22.823671340942383, "learning_rate": 8e-05, "loss": 36.8194, "num_input_tokens_seen": 509156400, "step": 9879 }, { "epoch": 0.971180069285767, "grad_norm": 19.8090877532959, "learning_rate": 8e-05, "loss": 33.599, "num_input_tokens_seen": 509318684, "step": 9882 }, { "epoch": 0.971474902336552, "grad_norm": 22.890684127807617, "learning_rate": 8e-05, "loss": 35.3154, "num_input_tokens_seen": 509481856, "step": 9885 }, { "epoch": 0.9717697353873369, "grad_norm": 20.67902374267578, "learning_rate": 8e-05, "loss": 37.2118, "num_input_tokens_seen": 509665356, "step": 9888 }, { "epoch": 0.972064568438122, "grad_norm": 25.929218292236328, "learning_rate": 8e-05, "loss": 34.125, "num_input_tokens_seen": 509812800, "step": 9891 }, { "epoch": 0.9723594014889069, "grad_norm": 22.32213592529297, "learning_rate": 8e-05, "loss": 38.672, "num_input_tokens_seen": 509972556, "step": 9894 }, { "epoch": 0.9726542345396919, "grad_norm": 17.61611557006836, "learning_rate": 8e-05, "loss": 32.8811, "num_input_tokens_seen": 510141016, "step": 9897 }, { "epoch": 0.972949067590477, "grad_norm": 22.887008666992188, "learning_rate": 8e-05, "loss": 35.0657, "num_input_tokens_seen": 510288240, "step": 9900 }, { "epoch": 0.9732439006412619, "grad_norm": 20.91583251953125, "learning_rate": 8e-05, "loss": 35.5109, "num_input_tokens_seen": 510457332, "step": 9903 }, { "epoch": 0.9735387336920469, "grad_norm": 22.92230224609375, "learning_rate": 8e-05, "loss": 35.5309, "num_input_tokens_seen": 510609764, "step": 9906 }, { "epoch": 0.9738335667428318, "grad_norm": 22.04583740234375, "learning_rate": 8e-05, "loss": 31.3168, "num_input_tokens_seen": 510767660, "step": 9909 }, { "epoch": 0.9741283997936169, "grad_norm": 20.284467697143555, "learning_rate": 8e-05, "loss": 34.3692, "num_input_tokens_seen": 510936116, "step": 9912 }, { "epoch": 0.9744232328444019, "grad_norm": 21.815916061401367, "learning_rate": 8e-05, "loss": 33.3647, "num_input_tokens_seen": 511094124, "step": 9915 }, { "epoch": 0.9747180658951868, "grad_norm": 26.54884910583496, "learning_rate": 8e-05, "loss": 34.5375, "num_input_tokens_seen": 511243880, "step": 9918 }, { "epoch": 0.9750128989459719, "grad_norm": 20.4998779296875, "learning_rate": 8e-05, "loss": 33.2815, "num_input_tokens_seen": 511410856, "step": 9921 }, { "epoch": 0.9753077319967568, "grad_norm": 19.39156723022461, "learning_rate": 8e-05, "loss": 31.9833, "num_input_tokens_seen": 511576308, "step": 9924 }, { "epoch": 0.9756025650475418, "grad_norm": 19.645227432250977, "learning_rate": 8e-05, "loss": 32.86, "num_input_tokens_seen": 511739464, "step": 9927 }, { "epoch": 0.9758973980983269, "grad_norm": 23.1904354095459, "learning_rate": 8e-05, "loss": 33.4071, "num_input_tokens_seen": 511878860, "step": 9930 }, { "epoch": 0.9761922311491118, "grad_norm": 18.9490966796875, "learning_rate": 8e-05, "loss": 36.429, "num_input_tokens_seen": 512038108, "step": 9933 }, { "epoch": 0.9764870641998968, "grad_norm": 20.765483856201172, "learning_rate": 8e-05, "loss": 34.1946, "num_input_tokens_seen": 512178532, "step": 9936 }, { "epoch": 0.9767818972506818, "grad_norm": 21.686630249023438, "learning_rate": 8e-05, "loss": 39.4583, "num_input_tokens_seen": 512327616, "step": 9939 }, { "epoch": 0.9770767303014668, "grad_norm": 19.82565689086914, "learning_rate": 8e-05, "loss": 32.1733, "num_input_tokens_seen": 512489768, "step": 9942 }, { "epoch": 0.9773715633522518, "grad_norm": 28.079265594482422, "learning_rate": 8e-05, "loss": 39.3916, "num_input_tokens_seen": 512627556, "step": 9945 }, { "epoch": 0.9776663964030368, "grad_norm": 22.157262802124023, "learning_rate": 8e-05, "loss": 31.2814, "num_input_tokens_seen": 512799820, "step": 9948 }, { "epoch": 0.9779612294538218, "grad_norm": 24.7800235748291, "learning_rate": 8e-05, "loss": 34.3267, "num_input_tokens_seen": 512947652, "step": 9951 }, { "epoch": 0.9782560625046067, "grad_norm": 21.06366729736328, "learning_rate": 8e-05, "loss": 35.091, "num_input_tokens_seen": 513105068, "step": 9954 }, { "epoch": 0.9785508955553918, "grad_norm": 21.90027618408203, "learning_rate": 8e-05, "loss": 35.5001, "num_input_tokens_seen": 513270152, "step": 9957 }, { "epoch": 0.9788457286061768, "grad_norm": 21.52861213684082, "learning_rate": 8e-05, "loss": 34.9564, "num_input_tokens_seen": 513417040, "step": 9960 }, { "epoch": 0.9791405616569617, "grad_norm": 21.92180824279785, "learning_rate": 8e-05, "loss": 34.5154, "num_input_tokens_seen": 513567416, "step": 9963 }, { "epoch": 0.9794353947077468, "grad_norm": 20.202768325805664, "learning_rate": 8e-05, "loss": 30.0621, "num_input_tokens_seen": 513723628, "step": 9966 }, { "epoch": 0.9797302277585317, "grad_norm": 21.07074546813965, "learning_rate": 8e-05, "loss": 35.3967, "num_input_tokens_seen": 513884464, "step": 9969 }, { "epoch": 0.9800250608093167, "grad_norm": 23.317293167114258, "learning_rate": 8e-05, "loss": 38.406, "num_input_tokens_seen": 514050832, "step": 9972 }, { "epoch": 0.9803198938601018, "grad_norm": 20.868349075317383, "learning_rate": 8e-05, "loss": 35.6753, "num_input_tokens_seen": 514197228, "step": 9975 }, { "epoch": 0.9806147269108867, "grad_norm": 23.616369247436523, "learning_rate": 8e-05, "loss": 35.2552, "num_input_tokens_seen": 514350780, "step": 9978 }, { "epoch": 0.9809095599616717, "grad_norm": 20.292543411254883, "learning_rate": 8e-05, "loss": 33.7011, "num_input_tokens_seen": 514503084, "step": 9981 }, { "epoch": 0.9812043930124567, "grad_norm": 25.273117065429688, "learning_rate": 8e-05, "loss": 35.5905, "num_input_tokens_seen": 514659228, "step": 9984 }, { "epoch": 0.9814992260632417, "grad_norm": 20.815622329711914, "learning_rate": 8e-05, "loss": 31.5263, "num_input_tokens_seen": 514817380, "step": 9987 }, { "epoch": 0.9817940591140267, "grad_norm": 20.29827308654785, "learning_rate": 8e-05, "loss": 35.3308, "num_input_tokens_seen": 514996984, "step": 9990 }, { "epoch": 0.9820888921648117, "grad_norm": 19.71322250366211, "learning_rate": 8e-05, "loss": 34.5728, "num_input_tokens_seen": 515142160, "step": 9993 }, { "epoch": 0.9823837252155967, "grad_norm": 20.120180130004883, "learning_rate": 8e-05, "loss": 37.7434, "num_input_tokens_seen": 515298572, "step": 9996 }, { "epoch": 0.9826785582663816, "grad_norm": 23.941017150878906, "learning_rate": 8e-05, "loss": 36.9318, "num_input_tokens_seen": 515474328, "step": 9999 }, { "epoch": 0.9827768359499767, "eval_gen_len": 30.6, "eval_loss": 2.209508180618286, "eval_rouge1": 50.3518, "eval_rouge2": 33.9831, "eval_rougeL": 46.3741, "eval_rougeLsum": 46.7798, "eval_runtime": 92.1739, "eval_samples_per_second": 2.17, "eval_steps_per_second": 0.542, "num_input_tokens_seen": 515531508, "step": 10000 }, { "epoch": 0.9829733913171667, "grad_norm": 19.42774772644043, "learning_rate": 8e-05, "loss": 34.3803, "num_input_tokens_seen": 515630088, "step": 10002 }, { "epoch": 0.9832682243679517, "grad_norm": 22.675596237182617, "learning_rate": 8e-05, "loss": 32.2009, "num_input_tokens_seen": 515809936, "step": 10005 }, { "epoch": 0.9835630574187366, "grad_norm": 17.020742416381836, "learning_rate": 8e-05, "loss": 32.3546, "num_input_tokens_seen": 515963532, "step": 10008 }, { "epoch": 0.9838578904695217, "grad_norm": 19.389602661132812, "learning_rate": 8e-05, "loss": 32.8277, "num_input_tokens_seen": 516124864, "step": 10011 }, { "epoch": 0.9841527235203066, "grad_norm": 27.274927139282227, "learning_rate": 8e-05, "loss": 37.2066, "num_input_tokens_seen": 516280980, "step": 10014 }, { "epoch": 0.9844475565710916, "grad_norm": 21.650020599365234, "learning_rate": 8e-05, "loss": 34.0361, "num_input_tokens_seen": 516413736, "step": 10017 }, { "epoch": 0.9847423896218767, "grad_norm": 23.6706485748291, "learning_rate": 8e-05, "loss": 37.021, "num_input_tokens_seen": 516578896, "step": 10020 }, { "epoch": 0.9850372226726616, "grad_norm": 19.572914123535156, "learning_rate": 8e-05, "loss": 32.311, "num_input_tokens_seen": 516759116, "step": 10023 }, { "epoch": 0.9853320557234466, "grad_norm": 21.539064407348633, "learning_rate": 8e-05, "loss": 33.711, "num_input_tokens_seen": 516912860, "step": 10026 }, { "epoch": 0.9856268887742315, "grad_norm": 29.89957618713379, "learning_rate": 8e-05, "loss": 36.039, "num_input_tokens_seen": 517066956, "step": 10029 }, { "epoch": 0.9859217218250166, "grad_norm": 40.08900833129883, "learning_rate": 8e-05, "loss": 37.2409, "num_input_tokens_seen": 517232724, "step": 10032 }, { "epoch": 0.9862165548758016, "grad_norm": 23.061819076538086, "learning_rate": 8e-05, "loss": 38.555, "num_input_tokens_seen": 517389860, "step": 10035 }, { "epoch": 0.9865113879265865, "grad_norm": 22.41363525390625, "learning_rate": 8e-05, "loss": 33.8079, "num_input_tokens_seen": 517542184, "step": 10038 }, { "epoch": 0.9868062209773716, "grad_norm": 19.55914878845215, "learning_rate": 8e-05, "loss": 35.6898, "num_input_tokens_seen": 517694116, "step": 10041 }, { "epoch": 0.9871010540281565, "grad_norm": 22.108427047729492, "learning_rate": 8e-05, "loss": 37.0617, "num_input_tokens_seen": 517869780, "step": 10044 }, { "epoch": 0.9873958870789415, "grad_norm": 22.998760223388672, "learning_rate": 8e-05, "loss": 35.7518, "num_input_tokens_seen": 518023508, "step": 10047 }, { "epoch": 0.9876907201297266, "grad_norm": 21.714473724365234, "learning_rate": 8e-05, "loss": 33.4973, "num_input_tokens_seen": 518175596, "step": 10050 }, { "epoch": 0.9879855531805115, "grad_norm": 21.371646881103516, "learning_rate": 8e-05, "loss": 31.9276, "num_input_tokens_seen": 518322968, "step": 10053 }, { "epoch": 0.9882803862312965, "grad_norm": 28.615001678466797, "learning_rate": 8e-05, "loss": 34.2391, "num_input_tokens_seen": 518484088, "step": 10056 }, { "epoch": 0.9885752192820815, "grad_norm": 20.553464889526367, "learning_rate": 8e-05, "loss": 35.4243, "num_input_tokens_seen": 518626344, "step": 10059 }, { "epoch": 0.9888700523328665, "grad_norm": 18.18429183959961, "learning_rate": 8e-05, "loss": 31.8329, "num_input_tokens_seen": 518788096, "step": 10062 }, { "epoch": 0.9891648853836515, "grad_norm": 18.766237258911133, "learning_rate": 8e-05, "loss": 29.4481, "num_input_tokens_seen": 518957216, "step": 10065 }, { "epoch": 0.9894597184344365, "grad_norm": 21.449504852294922, "learning_rate": 8e-05, "loss": 33.9893, "num_input_tokens_seen": 519103360, "step": 10068 }, { "epoch": 0.9897545514852215, "grad_norm": 21.447940826416016, "learning_rate": 8e-05, "loss": 37.1803, "num_input_tokens_seen": 519259144, "step": 10071 }, { "epoch": 0.9900493845360064, "grad_norm": 21.051067352294922, "learning_rate": 8e-05, "loss": 31.7947, "num_input_tokens_seen": 519414808, "step": 10074 }, { "epoch": 0.9903442175867915, "grad_norm": 23.860065460205078, "learning_rate": 8e-05, "loss": 37.1056, "num_input_tokens_seen": 519583132, "step": 10077 }, { "epoch": 0.9906390506375765, "grad_norm": 22.17423439025879, "learning_rate": 8e-05, "loss": 31.622, "num_input_tokens_seen": 519756728, "step": 10080 }, { "epoch": 0.9909338836883614, "grad_norm": 56.88254928588867, "learning_rate": 8e-05, "loss": 31.7862, "num_input_tokens_seen": 519898204, "step": 10083 }, { "epoch": 0.9912287167391465, "grad_norm": 21.94686508178711, "learning_rate": 8e-05, "loss": 34.7363, "num_input_tokens_seen": 520060116, "step": 10086 }, { "epoch": 0.9915235497899314, "grad_norm": 19.87834358215332, "learning_rate": 8e-05, "loss": 31.9671, "num_input_tokens_seen": 520198240, "step": 10089 }, { "epoch": 0.9918183828407164, "grad_norm": 21.034942626953125, "learning_rate": 8e-05, "loss": 35.371, "num_input_tokens_seen": 520331472, "step": 10092 }, { "epoch": 0.9921132158915015, "grad_norm": 21.12396240234375, "learning_rate": 8e-05, "loss": 35.7865, "num_input_tokens_seen": 520478780, "step": 10095 }, { "epoch": 0.9924080489422864, "grad_norm": 20.821521759033203, "learning_rate": 8e-05, "loss": 30.3265, "num_input_tokens_seen": 520624224, "step": 10098 }, { "epoch": 0.9927028819930714, "grad_norm": 22.01003074645996, "learning_rate": 8e-05, "loss": 31.763, "num_input_tokens_seen": 520782620, "step": 10101 }, { "epoch": 0.9929977150438564, "grad_norm": 22.409605026245117, "learning_rate": 8e-05, "loss": 32.1014, "num_input_tokens_seen": 520933140, "step": 10104 }, { "epoch": 0.9932925480946414, "grad_norm": 21.183134078979492, "learning_rate": 8e-05, "loss": 32.2919, "num_input_tokens_seen": 521090992, "step": 10107 }, { "epoch": 0.9935873811454264, "grad_norm": 25.62685775756836, "learning_rate": 8e-05, "loss": 34.3917, "num_input_tokens_seen": 521225684, "step": 10110 }, { "epoch": 0.9938822141962114, "grad_norm": 27.680646896362305, "learning_rate": 8e-05, "loss": 34.8568, "num_input_tokens_seen": 521374296, "step": 10113 }, { "epoch": 0.9941770472469964, "grad_norm": 19.09922981262207, "learning_rate": 8e-05, "loss": 34.4068, "num_input_tokens_seen": 521563176, "step": 10116 }, { "epoch": 0.9944718802977813, "grad_norm": 27.627317428588867, "learning_rate": 8e-05, "loss": 38.226, "num_input_tokens_seen": 521719468, "step": 10119 }, { "epoch": 0.9947667133485664, "grad_norm": 20.5487003326416, "learning_rate": 8e-05, "loss": 35.3753, "num_input_tokens_seen": 521881312, "step": 10122 }, { "epoch": 0.9950615463993514, "grad_norm": 26.295984268188477, "learning_rate": 8e-05, "loss": 33.185, "num_input_tokens_seen": 522020508, "step": 10125 }, { "epoch": 0.9953563794501363, "grad_norm": 25.220104217529297, "learning_rate": 8e-05, "loss": 33.8547, "num_input_tokens_seen": 522183960, "step": 10128 }, { "epoch": 0.9956512125009214, "grad_norm": 32.48232650756836, "learning_rate": 8e-05, "loss": 35.6518, "num_input_tokens_seen": 522335996, "step": 10131 }, { "epoch": 0.9959460455517063, "grad_norm": 21.6962947845459, "learning_rate": 8e-05, "loss": 33.5165, "num_input_tokens_seen": 522504656, "step": 10134 }, { "epoch": 0.9962408786024913, "grad_norm": 20.183027267456055, "learning_rate": 8e-05, "loss": 32.3433, "num_input_tokens_seen": 522656296, "step": 10137 }, { "epoch": 0.9965357116532764, "grad_norm": 25.173770904541016, "learning_rate": 8e-05, "loss": 36.5661, "num_input_tokens_seen": 522813752, "step": 10140 }, { "epoch": 0.9968305447040613, "grad_norm": 24.288291931152344, "learning_rate": 8e-05, "loss": 36.8847, "num_input_tokens_seen": 522965148, "step": 10143 }, { "epoch": 0.9971253777548463, "grad_norm": 20.81410026550293, "learning_rate": 8e-05, "loss": 37.2171, "num_input_tokens_seen": 523100376, "step": 10146 }, { "epoch": 0.9974202108056313, "grad_norm": 20.889305114746094, "learning_rate": 8e-05, "loss": 31.6453, "num_input_tokens_seen": 523248276, "step": 10149 }, { "epoch": 0.9977150438564163, "grad_norm": 22.11374282836914, "learning_rate": 8e-05, "loss": 37.9344, "num_input_tokens_seen": 523429220, "step": 10152 }, { "epoch": 0.9980098769072013, "grad_norm": 21.695816040039062, "learning_rate": 8e-05, "loss": 38.5086, "num_input_tokens_seen": 523578488, "step": 10155 }, { "epoch": 0.9983047099579863, "grad_norm": 20.22509002685547, "learning_rate": 8e-05, "loss": 36.4175, "num_input_tokens_seen": 523725232, "step": 10158 }, { "epoch": 0.9985995430087713, "grad_norm": 22.71636962890625, "learning_rate": 8e-05, "loss": 33.4109, "num_input_tokens_seen": 523883028, "step": 10161 }, { "epoch": 0.9988943760595562, "grad_norm": 19.05315589904785, "learning_rate": 8e-05, "loss": 34.5973, "num_input_tokens_seen": 524037912, "step": 10164 }, { "epoch": 0.9991892091103413, "grad_norm": 19.2845401763916, "learning_rate": 8e-05, "loss": 35.7759, "num_input_tokens_seen": 524190012, "step": 10167 }, { "epoch": 0.9994840421611263, "grad_norm": 23.7741641998291, "learning_rate": 8e-05, "loss": 32.828, "num_input_tokens_seen": 524345060, "step": 10170 }, { "epoch": 0.9997788752119112, "grad_norm": 20.280067443847656, "learning_rate": 8e-05, "loss": 32.2748, "num_input_tokens_seen": 524511820, "step": 10173 }, { "epoch": 0.9999754305791012, "num_input_tokens_seen": 524625736, "step": 10175, "total_flos": 1.8734435060870185e+18, "train_loss": 51.37127453965696, "train_runtime": 54286.9149, "train_samples_per_second": 11.996, "train_steps_per_second": 0.187, "train_tokens_per_second": 9664.096 } ], "logging_steps": 3, "max_steps": 10175, "num_input_tokens_seen": 524625736, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8734435060870185e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }