{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974985568597268, "eval_steps": 500, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.3905354112155957, "learning_rate": 1e-05, "loss": 1.5601, "step": 1 }, { "epoch": 0.01, "grad_norm": 3.2069311238141744, "learning_rate": 2e-05, "loss": 1.5425, "step": 2 }, { "epoch": 0.01, "grad_norm": 2.329876685543184, "learning_rate": 3e-05, "loss": 1.5518, "step": 3 }, { "epoch": 0.01, "grad_norm": 1.3279617694098214, "learning_rate": 4e-05, "loss": 1.4658, "step": 4 }, { "epoch": 0.02, "grad_norm": 1.3546782512787179, "learning_rate": 5e-05, "loss": 1.4736, "step": 5 }, { "epoch": 0.02, "grad_norm": 1.6365619709572907, "learning_rate": 6e-05, "loss": 1.4756, "step": 6 }, { "epoch": 0.02, "grad_norm": 1.2081088328094727, "learning_rate": 7e-05, "loss": 1.3752, "step": 7 }, { "epoch": 0.02, "grad_norm": 1.0574546767027917, "learning_rate": 8e-05, "loss": 1.439, "step": 8 }, { "epoch": 0.03, "grad_norm": 1.0012858470947548, "learning_rate": 9e-05, "loss": 1.4502, "step": 9 }, { "epoch": 0.03, "grad_norm": 1.0136463608511321, "learning_rate": 0.0001, "loss": 1.396, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.9854986957079499, "learning_rate": 9.999749748415981e-05, "loss": 1.4634, "step": 11 }, { "epoch": 0.04, "grad_norm": 0.9239702875144448, "learning_rate": 9.998999018714263e-05, "loss": 1.4185, "step": 12 }, { "epoch": 0.04, "grad_norm": 0.9233186885946617, "learning_rate": 9.997747886043367e-05, "loss": 1.3691, "step": 13 }, { "epoch": 0.04, "grad_norm": 0.8728080646228755, "learning_rate": 9.995996475642466e-05, "loss": 1.3569, "step": 14 }, { "epoch": 0.05, "grad_norm": 0.8307436189120798, "learning_rate": 9.99374496282885e-05, "loss": 1.3667, "step": 15 }, { "epoch": 0.05, "grad_norm": 0.7939681765489442, "learning_rate": 9.990993572980378e-05, "loss": 1.3589, "step": 16 }, { "epoch": 0.05, "grad_norm": 0.7980526587202514, "learning_rate": 9.987742581512918e-05, "loss": 1.395, "step": 17 }, { "epoch": 0.06, "grad_norm": 0.7207026961586376, "learning_rate": 9.983992313852774e-05, "loss": 1.3887, "step": 18 }, { "epoch": 0.06, "grad_norm": 0.7871310923443939, "learning_rate": 9.979743145404119e-05, "loss": 1.3062, "step": 19 }, { "epoch": 0.06, "grad_norm": 0.7246807835643384, "learning_rate": 9.974995501511404e-05, "loss": 1.4028, "step": 20 }, { "epoch": 0.06, "grad_norm": 0.7206361737972257, "learning_rate": 9.969749857416789e-05, "loss": 1.3398, "step": 21 }, { "epoch": 0.07, "grad_norm": 0.6960105391155793, "learning_rate": 9.964006738212575e-05, "loss": 1.3469, "step": 22 }, { "epoch": 0.07, "grad_norm": 0.7204455050781057, "learning_rate": 9.957766718788633e-05, "loss": 1.3765, "step": 23 }, { "epoch": 0.07, "grad_norm": 0.5812596677574942, "learning_rate": 9.951030423774859e-05, "loss": 1.3379, "step": 24 }, { "epoch": 0.08, "grad_norm": 0.675904894861765, "learning_rate": 9.943798527478651e-05, "loss": 1.3789, "step": 25 }, { "epoch": 0.08, "grad_norm": 0.6664199931252878, "learning_rate": 9.936071753817415e-05, "loss": 1.3535, "step": 26 }, { "epoch": 0.08, "grad_norm": 0.7030363325377104, "learning_rate": 9.927850876246088e-05, "loss": 1.218, "step": 27 }, { "epoch": 0.09, "grad_norm": 0.6185910243439269, "learning_rate": 9.919136717679722e-05, "loss": 1.3501, "step": 28 }, { "epoch": 0.09, "grad_norm": 0.6270496500828271, "learning_rate": 9.909930150411113e-05, "loss": 1.3628, "step": 29 }, { "epoch": 0.09, "grad_norm": 0.6021577621428597, "learning_rate": 9.900232096023477e-05, "loss": 1.3428, "step": 30 }, { "epoch": 0.1, "grad_norm": 0.6760900210464318, "learning_rate": 9.890043525298203e-05, "loss": 1.3079, "step": 31 }, { "epoch": 0.1, "grad_norm": 0.5940959503401227, "learning_rate": 9.879365458117678e-05, "loss": 1.291, "step": 32 }, { "epoch": 0.1, "grad_norm": 0.5427094594745479, "learning_rate": 9.868198963363189e-05, "loss": 1.2893, "step": 33 }, { "epoch": 0.1, "grad_norm": 0.5958301777472343, "learning_rate": 9.856545158807938e-05, "loss": 1.2363, "step": 34 }, { "epoch": 0.11, "grad_norm": 0.5850214722817281, "learning_rate": 9.844405211005146e-05, "loss": 1.3154, "step": 35 }, { "epoch": 0.11, "grad_norm": 0.6172288827652985, "learning_rate": 9.831780335171279e-05, "loss": 1.3101, "step": 36 }, { "epoch": 0.11, "grad_norm": 0.5728085013088785, "learning_rate": 9.818671795064404e-05, "loss": 1.2817, "step": 37 }, { "epoch": 0.12, "grad_norm": 0.5979780148525169, "learning_rate": 9.805080902857699e-05, "loss": 1.2493, "step": 38 }, { "epoch": 0.12, "grad_norm": 0.537951216091522, "learning_rate": 9.791009019008078e-05, "loss": 1.2454, "step": 39 }, { "epoch": 0.12, "grad_norm": 0.5879388223555754, "learning_rate": 9.776457552120033e-05, "loss": 1.2744, "step": 40 }, { "epoch": 0.13, "grad_norm": 0.5828144715679795, "learning_rate": 9.761427958804621e-05, "loss": 1.2354, "step": 41 }, { "epoch": 0.13, "grad_norm": 0.5640545987289305, "learning_rate": 9.745921743533653e-05, "loss": 1.3105, "step": 42 }, { "epoch": 0.13, "grad_norm": 0.6077402891626962, "learning_rate": 9.729940458489104e-05, "loss": 1.3188, "step": 43 }, { "epoch": 0.14, "grad_norm": 0.5649165442230879, "learning_rate": 9.713485703407731e-05, "loss": 1.2756, "step": 44 }, { "epoch": 0.14, "grad_norm": 0.5751090024359722, "learning_rate": 9.696559125420948e-05, "loss": 1.2336, "step": 45 }, { "epoch": 0.14, "grad_norm": 0.5415492855376868, "learning_rate": 9.679162418889931e-05, "loss": 1.2424, "step": 46 }, { "epoch": 0.14, "grad_norm": 0.5742691889803935, "learning_rate": 9.66129732523603e-05, "loss": 1.2363, "step": 47 }, { "epoch": 0.15, "grad_norm": 0.5763001671560306, "learning_rate": 9.642965632766436e-05, "loss": 1.2661, "step": 48 }, { "epoch": 0.15, "grad_norm": 0.5940077836695673, "learning_rate": 9.624169176495184e-05, "loss": 1.2734, "step": 49 }, { "epoch": 0.15, "grad_norm": 0.6005566973905004, "learning_rate": 9.604909837959455e-05, "loss": 1.2078, "step": 50 }, { "epoch": 0.16, "grad_norm": 0.5429670344508561, "learning_rate": 9.585189545031238e-05, "loss": 1.2246, "step": 51 }, { "epoch": 0.16, "grad_norm": 0.5871060174846529, "learning_rate": 9.565010271724352e-05, "loss": 1.3501, "step": 52 }, { "epoch": 0.16, "grad_norm": 0.5907025473048154, "learning_rate": 9.54437403799684e-05, "loss": 1.2729, "step": 53 }, { "epoch": 0.17, "grad_norm": 0.6516780485358238, "learning_rate": 9.523282909548773e-05, "loss": 1.2881, "step": 54 }, { "epoch": 0.17, "grad_norm": 0.5670548168352986, "learning_rate": 9.50173899761547e-05, "loss": 1.2056, "step": 55 }, { "epoch": 0.17, "grad_norm": 0.5390303783246289, "learning_rate": 9.47974445875617e-05, "loss": 1.2144, "step": 56 }, { "epoch": 0.18, "grad_norm": 0.5646076772019145, "learning_rate": 9.457301494638147e-05, "loss": 1.2212, "step": 57 }, { "epoch": 0.18, "grad_norm": 0.5612167041328082, "learning_rate": 9.434412351816328e-05, "loss": 1.2717, "step": 58 }, { "epoch": 0.18, "grad_norm": 0.5263144027788276, "learning_rate": 9.411079321508414e-05, "loss": 1.2112, "step": 59 }, { "epoch": 0.18, "grad_norm": 0.5356079649939604, "learning_rate": 9.387304739365523e-05, "loss": 1.2234, "step": 60 }, { "epoch": 0.19, "grad_norm": 0.5615087563747105, "learning_rate": 9.36309098523839e-05, "loss": 1.2996, "step": 61 }, { "epoch": 0.19, "grad_norm": 0.5314023973665841, "learning_rate": 9.338440482939146e-05, "loss": 1.1797, "step": 62 }, { "epoch": 0.19, "grad_norm": 0.5539257915730327, "learning_rate": 9.31335569999869e-05, "loss": 1.2395, "step": 63 }, { "epoch": 0.2, "grad_norm": 0.5648098416860655, "learning_rate": 9.287839147419686e-05, "loss": 1.2156, "step": 64 }, { "epoch": 0.2, "grad_norm": 0.5865707594533118, "learning_rate": 9.261893379425218e-05, "loss": 1.2683, "step": 65 }, { "epoch": 0.2, "grad_norm": 0.5440049015077518, "learning_rate": 9.2355209932031e-05, "loss": 1.1853, "step": 66 }, { "epoch": 0.21, "grad_norm": 0.5440912007358205, "learning_rate": 9.208724628645902e-05, "loss": 1.207, "step": 67 }, { "epoch": 0.21, "grad_norm": 0.5587040510710671, "learning_rate": 9.181506968086697e-05, "loss": 1.1768, "step": 68 }, { "epoch": 0.21, "grad_norm": 0.5192024047252568, "learning_rate": 9.153870736030548e-05, "loss": 1.2427, "step": 69 }, { "epoch": 0.22, "grad_norm": 0.528230045301502, "learning_rate": 9.125818698881798e-05, "loss": 1.2168, "step": 70 }, { "epoch": 0.22, "grad_norm": 0.5562754356167662, "learning_rate": 9.097353664667138e-05, "loss": 1.2793, "step": 71 }, { "epoch": 0.22, "grad_norm": 0.5415713371356312, "learning_rate": 9.068478482754532e-05, "loss": 1.2334, "step": 72 }, { "epoch": 0.22, "grad_norm": 0.5683526535348254, "learning_rate": 9.03919604356798e-05, "loss": 1.2534, "step": 73 }, { "epoch": 0.23, "grad_norm": 0.5396091696393609, "learning_rate": 9.0095092782982e-05, "loss": 1.2722, "step": 74 }, { "epoch": 0.23, "grad_norm": 0.513589724983156, "learning_rate": 8.979421158609206e-05, "loss": 1.2083, "step": 75 }, { "epoch": 0.23, "grad_norm": 0.5376666654796496, "learning_rate": 8.948934696340843e-05, "loss": 1.1931, "step": 76 }, { "epoch": 0.24, "grad_norm": 0.5368664737517322, "learning_rate": 8.918052943207298e-05, "loss": 1.2202, "step": 77 }, { "epoch": 0.24, "grad_norm": 0.5263413266320822, "learning_rate": 8.886778990491631e-05, "loss": 1.1553, "step": 78 }, { "epoch": 0.24, "grad_norm": 0.5604541523029579, "learning_rate": 8.85511596873632e-05, "loss": 1.2964, "step": 79 }, { "epoch": 0.25, "grad_norm": 0.5427504321680312, "learning_rate": 8.823067047429907e-05, "loss": 1.2896, "step": 80 }, { "epoch": 0.25, "grad_norm": 0.5474275345159585, "learning_rate": 8.790635434689721e-05, "loss": 1.2612, "step": 81 }, { "epoch": 0.25, "grad_norm": 0.5269654905955425, "learning_rate": 8.757824376940746e-05, "loss": 1.1821, "step": 82 }, { "epoch": 0.26, "grad_norm": 0.518380839571753, "learning_rate": 8.724637158590652e-05, "loss": 1.2393, "step": 83 }, { "epoch": 0.26, "grad_norm": 0.5158889921158187, "learning_rate": 8.691077101701024e-05, "loss": 1.2754, "step": 84 }, { "epoch": 0.26, "grad_norm": 0.4935757204910804, "learning_rate": 8.65714756565482e-05, "loss": 1.2173, "step": 85 }, { "epoch": 0.26, "grad_norm": 0.5653934280740491, "learning_rate": 8.622851946820095e-05, "loss": 1.2253, "step": 86 }, { "epoch": 0.27, "grad_norm": 0.5250362872573285, "learning_rate": 8.588193678210026e-05, "loss": 1.1707, "step": 87 }, { "epoch": 0.27, "grad_norm": 0.5446362423647451, "learning_rate": 8.553176229139261e-05, "loss": 1.1589, "step": 88 }, { "epoch": 0.27, "grad_norm": 0.5142920460404157, "learning_rate": 8.517803104876639e-05, "loss": 1.1948, "step": 89 }, { "epoch": 0.28, "grad_norm": 0.5317755291367837, "learning_rate": 8.482077846294308e-05, "loss": 1.1812, "step": 90 }, { "epoch": 0.28, "grad_norm": 0.5147982642702206, "learning_rate": 8.446004029513294e-05, "loss": 1.2266, "step": 91 }, { "epoch": 0.28, "grad_norm": 0.5096116915413985, "learning_rate": 8.409585265545509e-05, "loss": 1.179, "step": 92 }, { "epoch": 0.29, "grad_norm": 0.49910238964612424, "learning_rate": 8.372825199932304e-05, "loss": 1.2485, "step": 93 }, { "epoch": 0.29, "grad_norm": 0.5507929652198581, "learning_rate": 8.335727512379534e-05, "loss": 1.2388, "step": 94 }, { "epoch": 0.29, "grad_norm": 0.5063227228339493, "learning_rate": 8.298295916389234e-05, "loss": 1.2415, "step": 95 }, { "epoch": 0.3, "grad_norm": 0.48774944558422, "learning_rate": 8.260534158887876e-05, "loss": 1.1301, "step": 96 }, { "epoch": 0.3, "grad_norm": 0.5131731261481731, "learning_rate": 8.222446019851314e-05, "loss": 1.2158, "step": 97 }, { "epoch": 0.3, "grad_norm": 0.49068550228239954, "learning_rate": 8.184035311926396e-05, "loss": 1.1968, "step": 98 }, { "epoch": 0.3, "grad_norm": 0.5288695654598018, "learning_rate": 8.145305880049328e-05, "loss": 1.2637, "step": 99 }, { "epoch": 0.31, "grad_norm": 0.5479388159051014, "learning_rate": 8.106261601060772e-05, "loss": 1.3218, "step": 100 }, { "epoch": 0.31, "grad_norm": 0.5621161336370604, "learning_rate": 8.066906383317801e-05, "loss": 1.1729, "step": 101 }, { "epoch": 0.31, "grad_norm": 0.4924846556483893, "learning_rate": 8.027244166302642e-05, "loss": 1.1875, "step": 102 }, { "epoch": 0.32, "grad_norm": 0.5037791405351364, "learning_rate": 7.987278920228349e-05, "loss": 1.2539, "step": 103 }, { "epoch": 0.32, "grad_norm": 0.522728960991421, "learning_rate": 7.947014645641379e-05, "loss": 1.2217, "step": 104 }, { "epoch": 0.32, "grad_norm": 0.5182027559256257, "learning_rate": 7.906455373021129e-05, "loss": 1.2188, "step": 105 }, { "epoch": 0.33, "grad_norm": 0.5539472403444413, "learning_rate": 7.865605162376486e-05, "loss": 1.1509, "step": 106 }, { "epoch": 0.33, "grad_norm": 0.5439005321857965, "learning_rate": 7.824468102839419e-05, "loss": 1.251, "step": 107 }, { "epoch": 0.33, "grad_norm": 0.5093593984665626, "learning_rate": 7.783048312255653e-05, "loss": 1.229, "step": 108 }, { "epoch": 0.34, "grad_norm": 0.5145253165856254, "learning_rate": 7.741349936772469e-05, "loss": 1.1824, "step": 109 }, { "epoch": 0.34, "grad_norm": 0.5702243863746732, "learning_rate": 7.699377150423672e-05, "loss": 1.1582, "step": 110 }, { "epoch": 0.34, "grad_norm": 0.5451183103162889, "learning_rate": 7.65713415471177e-05, "loss": 1.2122, "step": 111 }, { "epoch": 0.34, "grad_norm": 0.5252426501201418, "learning_rate": 7.614625178187402e-05, "loss": 1.1833, "step": 112 }, { "epoch": 0.35, "grad_norm": 0.48749296422821287, "learning_rate": 7.571854476026048e-05, "loss": 1.2705, "step": 113 }, { "epoch": 0.35, "grad_norm": 0.502002191255964, "learning_rate": 7.528826329602099e-05, "loss": 1.2188, "step": 114 }, { "epoch": 0.35, "grad_norm": 0.48918503246260797, "learning_rate": 7.485545046060271e-05, "loss": 1.1997, "step": 115 }, { "epoch": 0.36, "grad_norm": 0.5000600132443745, "learning_rate": 7.442014957884472e-05, "loss": 1.2051, "step": 116 }, { "epoch": 0.36, "grad_norm": 0.5463372631998543, "learning_rate": 7.398240422464109e-05, "loss": 1.2214, "step": 117 }, { "epoch": 0.36, "grad_norm": 0.4961344121724833, "learning_rate": 7.354225821657914e-05, "loss": 1.208, "step": 118 }, { "epoch": 0.37, "grad_norm": 0.5899278757904003, "learning_rate": 7.309975561355312e-05, "loss": 1.146, "step": 119 }, { "epoch": 0.37, "grad_norm": 0.5023010444635025, "learning_rate": 7.265494071035401e-05, "loss": 1.1509, "step": 120 }, { "epoch": 0.37, "grad_norm": 0.5098342526132932, "learning_rate": 7.220785803323544e-05, "loss": 1.1743, "step": 121 }, { "epoch": 0.38, "grad_norm": 0.5181510224338065, "learning_rate": 7.175855233545668e-05, "loss": 1.1807, "step": 122 }, { "epoch": 0.38, "grad_norm": 0.5126009147492929, "learning_rate": 7.130706859280274e-05, "loss": 1.1875, "step": 123 }, { "epoch": 0.38, "grad_norm": 0.5694155609516423, "learning_rate": 7.085345199908235e-05, "loss": 1.1428, "step": 124 }, { "epoch": 0.38, "grad_norm": 0.5320176914945207, "learning_rate": 7.03977479616039e-05, "loss": 1.219, "step": 125 }, { "epoch": 0.39, "grad_norm": 0.5263190337396886, "learning_rate": 6.994000209663036e-05, "loss": 1.1709, "step": 126 }, { "epoch": 0.39, "grad_norm": 0.5043242059465939, "learning_rate": 6.948026022481279e-05, "loss": 1.1968, "step": 127 }, { "epoch": 0.39, "grad_norm": 0.5580562389957314, "learning_rate": 6.901856836660386e-05, "loss": 1.1494, "step": 128 }, { "epoch": 0.4, "grad_norm": 0.495782343949626, "learning_rate": 6.855497273765112e-05, "loss": 1.2119, "step": 129 }, { "epoch": 0.4, "grad_norm": 0.49510379613102157, "learning_rate": 6.808951974417078e-05, "loss": 1.2063, "step": 130 }, { "epoch": 0.4, "grad_norm": 0.5536322418977505, "learning_rate": 6.762225597830237e-05, "loss": 1.2617, "step": 131 }, { "epoch": 0.41, "grad_norm": 0.5668665640341709, "learning_rate": 6.715322821344494e-05, "loss": 1.1443, "step": 132 }, { "epoch": 0.41, "grad_norm": 0.5083973254687169, "learning_rate": 6.668248339957491e-05, "loss": 1.2588, "step": 133 }, { "epoch": 0.41, "grad_norm": 0.5188609075674584, "learning_rate": 6.621006865854644e-05, "loss": 1.1726, "step": 134 }, { "epoch": 0.42, "grad_norm": 0.5083154005353754, "learning_rate": 6.573603127937442e-05, "loss": 1.1953, "step": 135 }, { "epoch": 0.42, "grad_norm": 0.544802137981313, "learning_rate": 6.526041871350086e-05, "loss": 1.1853, "step": 136 }, { "epoch": 0.42, "grad_norm": 0.4692918589408204, "learning_rate": 6.478327857004495e-05, "loss": 1.1267, "step": 137 }, { "epoch": 0.42, "grad_norm": 0.504019443454899, "learning_rate": 6.43046586110374e-05, "loss": 1.2085, "step": 138 }, { "epoch": 0.43, "grad_norm": 0.5071221863228176, "learning_rate": 6.382460674663932e-05, "loss": 1.2026, "step": 139 }, { "epoch": 0.43, "grad_norm": 0.5146798822767533, "learning_rate": 6.334317103034652e-05, "loss": 1.1177, "step": 140 }, { "epoch": 0.43, "grad_norm": 0.4996775842843673, "learning_rate": 6.286039965417925e-05, "loss": 1.1711, "step": 141 }, { "epoch": 0.44, "grad_norm": 0.5288823443570614, "learning_rate": 6.237634094385813e-05, "loss": 1.1199, "step": 142 }, { "epoch": 0.44, "grad_norm": 0.5214584824734313, "learning_rate": 6.18910433539668e-05, "loss": 1.1199, "step": 143 }, { "epoch": 0.44, "grad_norm": 0.5287174552858058, "learning_rate": 6.140455546310148e-05, "loss": 1.2134, "step": 144 }, { "epoch": 0.45, "grad_norm": 0.5254574455344191, "learning_rate": 6.0916925969008275e-05, "loss": 1.2578, "step": 145 }, { "epoch": 0.45, "grad_norm": 0.5189177716169284, "learning_rate": 6.042820368370854e-05, "loss": 1.2236, "step": 146 }, { "epoch": 0.45, "grad_norm": 0.5680190447979175, "learning_rate": 5.993843752861266e-05, "loss": 1.1443, "step": 147 }, { "epoch": 0.46, "grad_norm": 0.471841513545942, "learning_rate": 5.944767652962309e-05, "loss": 1.1733, "step": 148 }, { "epoch": 0.46, "grad_norm": 0.5131643851734395, "learning_rate": 5.895596981222678e-05, "loss": 1.1592, "step": 149 }, { "epoch": 0.46, "grad_norm": 0.5069414300839418, "learning_rate": 5.8463366596577706e-05, "loss": 1.2537, "step": 150 }, { "epoch": 0.46, "grad_norm": 0.5239372199223642, "learning_rate": 5.796991619256985e-05, "loss": 1.1743, "step": 151 }, { "epoch": 0.47, "grad_norm": 0.5387326952860074, "learning_rate": 5.747566799490132e-05, "loss": 1.2075, "step": 152 }, { "epoch": 0.47, "grad_norm": 0.5224076525892895, "learning_rate": 5.6980671478129853e-05, "loss": 1.2515, "step": 153 }, { "epoch": 0.47, "grad_norm": 0.5609147216869792, "learning_rate": 5.648497619172042e-05, "loss": 1.1836, "step": 154 }, { "epoch": 0.48, "grad_norm": 0.5465488763290158, "learning_rate": 5.5988631755085264e-05, "loss": 1.1433, "step": 155 }, { "epoch": 0.48, "grad_norm": 0.5079660248232113, "learning_rate": 5.549168785261698e-05, "loss": 1.1812, "step": 156 }, { "epoch": 0.48, "grad_norm": 0.5717154092705009, "learning_rate": 5.499419422871506e-05, "loss": 1.2068, "step": 157 }, { "epoch": 0.49, "grad_norm": 0.5301493442832448, "learning_rate": 5.4496200682806495e-05, "loss": 1.2273, "step": 158 }, { "epoch": 0.49, "grad_norm": 0.5098890110687092, "learning_rate": 5.399775706436076e-05, "loss": 1.2134, "step": 159 }, { "epoch": 0.49, "grad_norm": 0.5365690403371399, "learning_rate": 5.3498913267899864e-05, "loss": 1.2051, "step": 160 }, { "epoch": 0.5, "grad_norm": 0.5002287628666306, "learning_rate": 5.299971922800391e-05, "loss": 1.1255, "step": 161 }, { "epoch": 0.5, "grad_norm": 0.5276270904967929, "learning_rate": 5.250022491431259e-05, "loss": 1.2124, "step": 162 }, { "epoch": 0.5, "grad_norm": 0.5224515486361312, "learning_rate": 5.200048032652318e-05, "loss": 1.2559, "step": 163 }, { "epoch": 0.5, "grad_norm": 0.49316666866194575, "learning_rate": 5.150053548938557e-05, "loss": 1.1421, "step": 164 }, { "epoch": 0.51, "grad_norm": 0.5388196993705506, "learning_rate": 5.100044044769472e-05, "loss": 1.2017, "step": 165 }, { "epoch": 0.51, "grad_norm": 0.4944161877205336, "learning_rate": 5.0500245261281175e-05, "loss": 1.1838, "step": 166 }, { "epoch": 0.51, "grad_norm": 0.5075949586376647, "learning_rate": 5e-05, "loss": 1.2698, "step": 167 }, { "epoch": 0.52, "grad_norm": 0.5288349964384436, "learning_rate": 4.949975473871884e-05, "loss": 1.2051, "step": 168 }, { "epoch": 0.52, "grad_norm": 0.5298564213314307, "learning_rate": 4.899955955230529e-05, "loss": 1.1921, "step": 169 }, { "epoch": 0.52, "grad_norm": 0.5006832164442894, "learning_rate": 4.849946451061443e-05, "loss": 1.0999, "step": 170 }, { "epoch": 0.53, "grad_norm": 0.5180365116427443, "learning_rate": 4.799951967347683e-05, "loss": 1.1248, "step": 171 }, { "epoch": 0.53, "grad_norm": 0.5085456414296031, "learning_rate": 4.749977508568742e-05, "loss": 1.0989, "step": 172 }, { "epoch": 0.53, "grad_norm": 0.5062569741436713, "learning_rate": 4.7000280771996104e-05, "loss": 1.1851, "step": 173 }, { "epoch": 0.54, "grad_norm": 0.49915526983252295, "learning_rate": 4.650108673210015e-05, "loss": 1.1565, "step": 174 }, { "epoch": 0.54, "grad_norm": 0.530737295373366, "learning_rate": 4.6002242935639254e-05, "loss": 1.1367, "step": 175 }, { "epoch": 0.54, "grad_norm": 0.5354173274123866, "learning_rate": 4.550379931719351e-05, "loss": 1.2231, "step": 176 }, { "epoch": 0.54, "grad_norm": 0.4910072525351625, "learning_rate": 4.500580577128495e-05, "loss": 1.2354, "step": 177 }, { "epoch": 0.55, "grad_norm": 0.5227332856552718, "learning_rate": 4.4508312147383036e-05, "loss": 1.2178, "step": 178 }, { "epoch": 0.55, "grad_norm": 0.5288368214749951, "learning_rate": 4.4011368244914755e-05, "loss": 1.1731, "step": 179 }, { "epoch": 0.55, "grad_norm": 0.5044952842565356, "learning_rate": 4.3515023808279586e-05, "loss": 1.1287, "step": 180 }, { "epoch": 0.56, "grad_norm": 0.5325342803705855, "learning_rate": 4.301932852187016e-05, "loss": 1.2129, "step": 181 }, { "epoch": 0.56, "grad_norm": 0.5415040662560654, "learning_rate": 4.252433200509869e-05, "loss": 1.3086, "step": 182 }, { "epoch": 0.56, "grad_norm": 0.51763854212227, "learning_rate": 4.203008380743016e-05, "loss": 1.1216, "step": 183 }, { "epoch": 0.57, "grad_norm": 0.5422430582487087, "learning_rate": 4.1536633403422306e-05, "loss": 1.1111, "step": 184 }, { "epoch": 0.57, "grad_norm": 0.5022269460427503, "learning_rate": 4.104403018777323e-05, "loss": 1.1963, "step": 185 }, { "epoch": 0.57, "grad_norm": 0.5631689643987351, "learning_rate": 4.0552323470376916e-05, "loss": 1.261, "step": 186 }, { "epoch": 0.58, "grad_norm": 0.5083595167881586, "learning_rate": 4.006156247138736e-05, "loss": 1.2529, "step": 187 }, { "epoch": 0.58, "grad_norm": 0.4982995201328196, "learning_rate": 3.9571796316291476e-05, "loss": 1.1938, "step": 188 }, { "epoch": 0.58, "grad_norm": 0.5142407248745989, "learning_rate": 3.908307403099174e-05, "loss": 1.231, "step": 189 }, { "epoch": 0.58, "grad_norm": 0.49845909814685674, "learning_rate": 3.859544453689853e-05, "loss": 1.1174, "step": 190 }, { "epoch": 0.59, "grad_norm": 0.5044393346608876, "learning_rate": 3.810895664603321e-05, "loss": 1.1626, "step": 191 }, { "epoch": 0.59, "grad_norm": 0.5028906716759755, "learning_rate": 3.762365905614187e-05, "loss": 1.1248, "step": 192 }, { "epoch": 0.59, "grad_norm": 0.5052094751619233, "learning_rate": 3.713960034582077e-05, "loss": 1.1782, "step": 193 }, { "epoch": 0.6, "grad_norm": 0.4927933528160678, "learning_rate": 3.665682896965349e-05, "loss": 1.1411, "step": 194 }, { "epoch": 0.6, "grad_norm": 0.5194837061389329, "learning_rate": 3.61753932533607e-05, "loss": 1.1194, "step": 195 }, { "epoch": 0.6, "grad_norm": 0.5003507430931593, "learning_rate": 3.5695341388962614e-05, "loss": 1.1438, "step": 196 }, { "epoch": 0.61, "grad_norm": 0.4952909851844929, "learning_rate": 3.521672142995506e-05, "loss": 1.1274, "step": 197 }, { "epoch": 0.61, "grad_norm": 0.515058338420701, "learning_rate": 3.473958128649915e-05, "loss": 1.2173, "step": 198 }, { "epoch": 0.61, "grad_norm": 0.4649441022469789, "learning_rate": 3.4263968720625594e-05, "loss": 1.0901, "step": 199 }, { "epoch": 0.62, "grad_norm": 0.5257495392411118, "learning_rate": 3.378993134145356e-05, "loss": 1.2124, "step": 200 }, { "epoch": 0.62, "grad_norm": 0.4932641665110047, "learning_rate": 3.33175166004251e-05, "loss": 1.2024, "step": 201 }, { "epoch": 0.62, "grad_norm": 0.47986279797474973, "learning_rate": 3.284677178655507e-05, "loss": 1.199, "step": 202 }, { "epoch": 0.62, "grad_norm": 0.5192526802216362, "learning_rate": 3.2377744021697644e-05, "loss": 1.1348, "step": 203 }, { "epoch": 0.63, "grad_norm": 0.49075397683891514, "learning_rate": 3.1910480255829237e-05, "loss": 1.209, "step": 204 }, { "epoch": 0.63, "grad_norm": 0.5308583566356849, "learning_rate": 3.1445027262348894e-05, "loss": 1.1843, "step": 205 }, { "epoch": 0.63, "grad_norm": 0.4754800140640908, "learning_rate": 3.098143163339615e-05, "loss": 1.0713, "step": 206 }, { "epoch": 0.64, "grad_norm": 0.48744339587851926, "learning_rate": 3.051973977518723e-05, "loss": 1.1589, "step": 207 }, { "epoch": 0.64, "grad_norm": 0.538169296667757, "learning_rate": 3.0059997903369656e-05, "loss": 1.1636, "step": 208 }, { "epoch": 0.64, "grad_norm": 0.5174008006678488, "learning_rate": 2.9602252038396093e-05, "loss": 1.0088, "step": 209 }, { "epoch": 0.65, "grad_norm": 0.5423637270068683, "learning_rate": 2.914654800091768e-05, "loss": 1.2637, "step": 210 }, { "epoch": 0.65, "grad_norm": 0.49532340168296196, "learning_rate": 2.8692931407197275e-05, "loss": 1.1211, "step": 211 }, { "epoch": 0.65, "grad_norm": 0.5223772948936137, "learning_rate": 2.824144766454333e-05, "loss": 1.1392, "step": 212 }, { "epoch": 0.66, "grad_norm": 0.5301272448221841, "learning_rate": 2.7792141966764568e-05, "loss": 1.1663, "step": 213 }, { "epoch": 0.66, "grad_norm": 0.5241453393589025, "learning_rate": 2.7345059289646008e-05, "loss": 1.1375, "step": 214 }, { "epoch": 0.66, "grad_norm": 0.4946190439071369, "learning_rate": 2.6900244386446904e-05, "loss": 1.1843, "step": 215 }, { "epoch": 0.66, "grad_norm": 0.4894503682723616, "learning_rate": 2.6457741783420886e-05, "loss": 1.1492, "step": 216 }, { "epoch": 0.67, "grad_norm": 0.4852212167430057, "learning_rate": 2.6017595775358926e-05, "loss": 1.1592, "step": 217 }, { "epoch": 0.67, "grad_norm": 0.5560815990000127, "learning_rate": 2.5579850421155293e-05, "loss": 1.1484, "step": 218 }, { "epoch": 0.67, "grad_norm": 0.5035735515547093, "learning_rate": 2.514454953939731e-05, "loss": 1.1381, "step": 219 }, { "epoch": 0.68, "grad_norm": 0.5057189594248047, "learning_rate": 2.4711736703979018e-05, "loss": 1.2036, "step": 220 }, { "epoch": 0.68, "grad_norm": 0.5116012961964514, "learning_rate": 2.428145523973952e-05, "loss": 1.2212, "step": 221 }, { "epoch": 0.68, "grad_norm": 0.48593835956939924, "learning_rate": 2.3853748218126e-05, "loss": 1.1505, "step": 222 }, { "epoch": 0.69, "grad_norm": 0.5014137656412536, "learning_rate": 2.342865845288232e-05, "loss": 1.2246, "step": 223 }, { "epoch": 0.69, "grad_norm": 0.5223070185696177, "learning_rate": 2.3006228495763295e-05, "loss": 1.158, "step": 224 }, { "epoch": 0.69, "grad_norm": 0.5624122056478932, "learning_rate": 2.258650063227533e-05, "loss": 1.0566, "step": 225 }, { "epoch": 0.7, "grad_norm": 0.5471967527021497, "learning_rate": 2.2169516877443485e-05, "loss": 1.1277, "step": 226 }, { "epoch": 0.7, "grad_norm": 0.48337168913421796, "learning_rate": 2.1755318971605826e-05, "loss": 1.1956, "step": 227 }, { "epoch": 0.7, "grad_norm": 0.5284626857313452, "learning_rate": 2.1343948376235144e-05, "loss": 1.196, "step": 228 }, { "epoch": 0.71, "grad_norm": 0.5837213719579207, "learning_rate": 2.0935446269788717e-05, "loss": 1.0967, "step": 229 }, { "epoch": 0.71, "grad_norm": 0.5165785735037434, "learning_rate": 2.052985354358622e-05, "loss": 1.2036, "step": 230 }, { "epoch": 0.71, "grad_norm": 0.5536196580249753, "learning_rate": 2.0127210797716524e-05, "loss": 1.1572, "step": 231 }, { "epoch": 0.71, "grad_norm": 0.4991948055619459, "learning_rate": 1.9727558336973595e-05, "loss": 1.2112, "step": 232 }, { "epoch": 0.72, "grad_norm": 0.5271303549814744, "learning_rate": 1.933093616682201e-05, "loss": 1.0928, "step": 233 }, { "epoch": 0.72, "grad_norm": 0.4755907436318383, "learning_rate": 1.8937383989392294e-05, "loss": 1.198, "step": 234 }, { "epoch": 0.72, "grad_norm": 0.5071149662683123, "learning_rate": 1.854694119950675e-05, "loss": 1.1602, "step": 235 }, { "epoch": 0.73, "grad_norm": 0.541369836697391, "learning_rate": 1.8159646880736036e-05, "loss": 1.1282, "step": 236 }, { "epoch": 0.73, "grad_norm": 0.5083789491431415, "learning_rate": 1.7775539801486867e-05, "loss": 1.0872, "step": 237 }, { "epoch": 0.73, "grad_norm": 0.5324743232408313, "learning_rate": 1.739465841112125e-05, "loss": 1.2222, "step": 238 }, { "epoch": 0.74, "grad_norm": 0.48387142611649014, "learning_rate": 1.701704083610768e-05, "loss": 1.158, "step": 239 }, { "epoch": 0.74, "grad_norm": 0.5141063576503123, "learning_rate": 1.664272487620466e-05, "loss": 1.1162, "step": 240 }, { "epoch": 0.74, "grad_norm": 0.5408323267729829, "learning_rate": 1.6271748000676983e-05, "loss": 1.02, "step": 241 }, { "epoch": 0.75, "grad_norm": 0.49627165191053163, "learning_rate": 1.590414734454493e-05, "loss": 1.0848, "step": 242 }, { "epoch": 0.75, "grad_norm": 0.5376377474797022, "learning_rate": 1.5539959704867085e-05, "loss": 1.2019, "step": 243 }, { "epoch": 0.75, "grad_norm": 0.49371506434647033, "learning_rate": 1.517922153705692e-05, "loss": 1.2151, "step": 244 }, { "epoch": 0.75, "grad_norm": 0.523592301439241, "learning_rate": 1.4821968951233638e-05, "loss": 1.231, "step": 245 }, { "epoch": 0.76, "grad_norm": 0.5074105674850246, "learning_rate": 1.4468237708607397e-05, "loss": 1.1519, "step": 246 }, { "epoch": 0.76, "grad_norm": 0.5066768606133761, "learning_rate": 1.4118063217899746e-05, "loss": 1.1492, "step": 247 }, { "epoch": 0.76, "grad_norm": 0.5116682098711993, "learning_rate": 1.3771480531799052e-05, "loss": 1.1477, "step": 248 }, { "epoch": 0.77, "grad_norm": 0.5352752339451914, "learning_rate": 1.342852434345181e-05, "loss": 1.0964, "step": 249 }, { "epoch": 0.77, "grad_norm": 0.5418770700535751, "learning_rate": 1.308922898298977e-05, "loss": 1.0803, "step": 250 }, { "epoch": 0.77, "grad_norm": 0.5157514248366463, "learning_rate": 1.2753628414093489e-05, "loss": 1.1614, "step": 251 }, { "epoch": 0.78, "grad_norm": 0.5351720244073139, "learning_rate": 1.2421756230592534e-05, "loss": 1.2224, "step": 252 }, { "epoch": 0.78, "grad_norm": 0.5214450914678166, "learning_rate": 1.2093645653102786e-05, "loss": 1.2302, "step": 253 }, { "epoch": 0.78, "grad_norm": 0.5409585268429792, "learning_rate": 1.1769329525700935e-05, "loss": 1.2104, "step": 254 }, { "epoch": 0.79, "grad_norm": 0.5220300809084372, "learning_rate": 1.144884031263681e-05, "loss": 1.1047, "step": 255 }, { "epoch": 0.79, "grad_norm": 0.5376568328092315, "learning_rate": 1.1132210095083694e-05, "loss": 1.2002, "step": 256 }, { "epoch": 0.79, "grad_norm": 0.5414025185087213, "learning_rate": 1.081947056792702e-05, "loss": 1.1833, "step": 257 }, { "epoch": 0.79, "grad_norm": 0.49383703911170834, "learning_rate": 1.0510653036591583e-05, "loss": 1.199, "step": 258 }, { "epoch": 0.8, "grad_norm": 0.4916712820340113, "learning_rate": 1.0205788413907952e-05, "loss": 1.1292, "step": 259 }, { "epoch": 0.8, "grad_norm": 0.5168351602558782, "learning_rate": 9.904907217018e-06, "loss": 1.2712, "step": 260 }, { "epoch": 0.8, "grad_norm": 0.4856958512300068, "learning_rate": 9.608039564320209e-06, "loss": 1.1277, "step": 261 }, { "epoch": 0.81, "grad_norm": 0.5106017400451819, "learning_rate": 9.31521517245469e-06, "loss": 1.1108, "step": 262 }, { "epoch": 0.81, "grad_norm": 0.5036900746833531, "learning_rate": 9.026463353328613e-06, "loss": 1.1748, "step": 263 }, { "epoch": 0.81, "grad_norm": 0.5033379794982934, "learning_rate": 8.741813011182014e-06, "loss": 1.1521, "step": 264 }, { "epoch": 0.82, "grad_norm": 0.5332335505700774, "learning_rate": 8.461292639694518e-06, "loss": 1.0955, "step": 265 }, { "epoch": 0.82, "grad_norm": 0.5125213937729919, "learning_rate": 8.18493031913305e-06, "loss": 1.1089, "step": 266 }, { "epoch": 0.82, "grad_norm": 0.5148264028992737, "learning_rate": 7.912753713540988e-06, "loss": 1.177, "step": 267 }, { "epoch": 0.83, "grad_norm": 0.5229543225675894, "learning_rate": 7.644790067969005e-06, "loss": 1.1646, "step": 268 }, { "epoch": 0.83, "grad_norm": 0.541964343800654, "learning_rate": 7.381066205747822e-06, "loss": 1.1187, "step": 269 }, { "epoch": 0.83, "grad_norm": 0.5266917236869202, "learning_rate": 7.1216085258031414e-06, "loss": 1.0901, "step": 270 }, { "epoch": 0.83, "grad_norm": 0.49725116144459375, "learning_rate": 6.866443000013117e-06, "loss": 1.1526, "step": 271 }, { "epoch": 0.84, "grad_norm": 0.5012370969649222, "learning_rate": 6.6155951706085405e-06, "loss": 1.1689, "step": 272 }, { "epoch": 0.84, "grad_norm": 0.5129519814796674, "learning_rate": 6.369090147616103e-06, "loss": 1.1348, "step": 273 }, { "epoch": 0.84, "grad_norm": 0.4926628244202811, "learning_rate": 6.1269526063447765e-06, "loss": 1.1538, "step": 274 }, { "epoch": 0.85, "grad_norm": 0.5420926972187566, "learning_rate": 5.889206784915862e-06, "loss": 1.1484, "step": 275 }, { "epoch": 0.85, "grad_norm": 0.4950562048422779, "learning_rate": 5.6558764818367195e-06, "loss": 1.2126, "step": 276 }, { "epoch": 0.85, "grad_norm": 0.536783110803186, "learning_rate": 5.426985053618544e-06, "loss": 1.1301, "step": 277 }, { "epoch": 0.86, "grad_norm": 0.5016583572718184, "learning_rate": 5.2025554124383095e-06, "loss": 1.1582, "step": 278 }, { "epoch": 0.86, "grad_norm": 0.5263293678305013, "learning_rate": 4.9826100238453135e-06, "loss": 1.1577, "step": 279 }, { "epoch": 0.86, "grad_norm": 0.5272007649322749, "learning_rate": 4.767170904512292e-06, "loss": 1.167, "step": 280 }, { "epoch": 0.87, "grad_norm": 0.5063050835599103, "learning_rate": 4.556259620031617e-06, "loss": 1.1846, "step": 281 }, { "epoch": 0.87, "grad_norm": 0.4910298010180289, "learning_rate": 4.349897282756487e-06, "loss": 1.1357, "step": 282 }, { "epoch": 0.87, "grad_norm": 0.5379014923521229, "learning_rate": 4.148104549687626e-06, "loss": 1.0798, "step": 283 }, { "epoch": 0.87, "grad_norm": 0.4865878079832316, "learning_rate": 3.95090162040545e-06, "loss": 1.1123, "step": 284 }, { "epoch": 0.88, "grad_norm": 0.5282174531059731, "learning_rate": 3.758308235048158e-06, "loss": 1.1538, "step": 285 }, { "epoch": 0.88, "grad_norm": 0.5010967709298223, "learning_rate": 3.570343672335641e-06, "loss": 1.1392, "step": 286 }, { "epoch": 0.88, "grad_norm": 0.5189711142846094, "learning_rate": 3.38702674763971e-06, "loss": 1.1851, "step": 287 }, { "epoch": 0.89, "grad_norm": 0.49664252429083205, "learning_rate": 3.2083758111006945e-06, "loss": 1.11, "step": 288 }, { "epoch": 0.89, "grad_norm": 0.4859267407564821, "learning_rate": 3.0344087457905346e-06, "loss": 1.1719, "step": 289 }, { "epoch": 0.89, "grad_norm": 0.5151543902141822, "learning_rate": 2.86514296592269e-06, "loss": 1.2102, "step": 290 }, { "epoch": 0.9, "grad_norm": 0.5256996348656279, "learning_rate": 2.7005954151089695e-06, "loss": 1.1709, "step": 291 }, { "epoch": 0.9, "grad_norm": 0.5102631445476464, "learning_rate": 2.54078256466348e-06, "loss": 1.2034, "step": 292 }, { "epoch": 0.9, "grad_norm": 0.5093343267813872, "learning_rate": 2.3857204119538014e-06, "loss": 1.0969, "step": 293 }, { "epoch": 0.91, "grad_norm": 0.5024898891939928, "learning_rate": 2.2354244787996748e-06, "loss": 1.1289, "step": 294 }, { "epoch": 0.91, "grad_norm": 0.5279824056783474, "learning_rate": 2.0899098099192273e-06, "loss": 1.1672, "step": 295 }, { "epoch": 0.91, "grad_norm": 0.5379712149279697, "learning_rate": 1.9491909714230204e-06, "loss": 1.1858, "step": 296 }, { "epoch": 0.91, "grad_norm": 0.5436100119596193, "learning_rate": 1.8132820493559521e-06, "loss": 1.1201, "step": 297 }, { "epoch": 0.92, "grad_norm": 0.5015550141769547, "learning_rate": 1.6821966482872264e-06, "loss": 1.1797, "step": 298 }, { "epoch": 0.92, "grad_norm": 0.5119620808666944, "learning_rate": 1.5559478899485447e-06, "loss": 1.1493, "step": 299 }, { "epoch": 0.92, "grad_norm": 0.4977128949907143, "learning_rate": 1.434548411920622e-06, "loss": 1.1821, "step": 300 }, { "epoch": 0.93, "grad_norm": 0.509981327731476, "learning_rate": 1.3180103663681165e-06, "loss": 1.1377, "step": 301 }, { "epoch": 0.93, "grad_norm": 0.5029662223988268, "learning_rate": 1.206345418823235e-06, "loss": 1.1526, "step": 302 }, { "epoch": 0.93, "grad_norm": 0.5519080967379805, "learning_rate": 1.099564747017967e-06, "loss": 1.1282, "step": 303 }, { "epoch": 0.94, "grad_norm": 0.5223711089586693, "learning_rate": 9.976790397652315e-07, "loss": 1.0811, "step": 304 }, { "epoch": 0.94, "grad_norm": 0.47043707607640073, "learning_rate": 9.006984958888742e-07, "loss": 1.1685, "step": 305 }, { "epoch": 0.94, "grad_norm": 0.5254608837175258, "learning_rate": 8.086328232027873e-07, "loss": 1.0779, "step": 306 }, { "epoch": 0.95, "grad_norm": 0.5081931596861774, "learning_rate": 7.214912375391291e-07, "loss": 1.1371, "step": 307 }, { "epoch": 0.95, "grad_norm": 0.4922428385525549, "learning_rate": 6.392824618258519e-07, "loss": 1.2107, "step": 308 }, { "epoch": 0.95, "grad_norm": 0.5274751815037492, "learning_rate": 5.620147252134889e-07, "loss": 1.1548, "step": 309 }, { "epoch": 0.95, "grad_norm": 0.4984402486182386, "learning_rate": 4.896957622514298e-07, "loss": 1.1189, "step": 310 }, { "epoch": 0.96, "grad_norm": 0.503914938251447, "learning_rate": 4.2233281211368493e-07, "loss": 1.1362, "step": 311 }, { "epoch": 0.96, "grad_norm": 0.5313082041540377, "learning_rate": 3.599326178742535e-07, "loss": 1.2146, "step": 312 }, { "epoch": 0.96, "grad_norm": 0.5301442913189116, "learning_rate": 3.025014258321135e-07, "loss": 1.1699, "step": 313 }, { "epoch": 0.97, "grad_norm": 0.5176192540816547, "learning_rate": 2.500449848859776e-07, "loss": 1.2048, "step": 314 }, { "epoch": 0.97, "grad_norm": 0.5034214543341652, "learning_rate": 2.0256854595881447e-07, "loss": 1.1338, "step": 315 }, { "epoch": 0.97, "grad_norm": 0.4795708437379455, "learning_rate": 1.6007686147225254e-07, "loss": 1.1533, "step": 316 }, { "epoch": 0.98, "grad_norm": 0.5053834258977199, "learning_rate": 1.2257418487082727e-07, "loss": 1.1648, "step": 317 }, { "epoch": 0.98, "grad_norm": 0.5276919175530018, "learning_rate": 9.006427019622176e-08, "loss": 1.2034, "step": 318 }, { "epoch": 0.98, "grad_norm": 0.5495649945557277, "learning_rate": 6.255037171150612e-08, "loss": 1.1985, "step": 319 }, { "epoch": 0.99, "grad_norm": 0.4946180370647359, "learning_rate": 4.0035243575342605e-08, "loss": 1.1738, "step": 320 }, { "epoch": 0.99, "grad_norm": 0.47883695876558374, "learning_rate": 2.2521139566328285e-08, "loss": 1.1804, "step": 321 }, { "epoch": 0.99, "grad_norm": 0.5584783758937163, "learning_rate": 1.0009812857370016e-08, "loss": 1.1968, "step": 322 }, { "epoch": 0.99, "grad_norm": 0.4815115437532202, "learning_rate": 2.502515840197006e-09, "loss": 1.1697, "step": 323 }, { "epoch": 1.0, "grad_norm": 0.5515369695100554, "learning_rate": 0.0, "loss": 1.1912, "step": 324 }, { "epoch": 1.0, "step": 324, "total_flos": 2.9498071531598643e+18, "train_loss": 1.2066333912037037, "train_runtime": 17698.3629, "train_samples_per_second": 4.697, "train_steps_per_second": 0.018 } ], "logging_steps": 1.0, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 2.9498071531598643e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }