{ "best_metric": 1.556799054145813, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 1.0, "eval_steps": 100, "global_step": 361, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002770083102493075, "grad_norm": 0.9283442497253418, "learning_rate": 5e-06, "loss": 2.05, "step": 1 }, { "epoch": 0.002770083102493075, "eval_loss": 2.1240499019622803, "eval_runtime": 95.1756, "eval_samples_per_second": 6.388, "eval_steps_per_second": 1.597, "step": 1 }, { "epoch": 0.00554016620498615, "grad_norm": 0.955880343914032, "learning_rate": 1e-05, "loss": 2.0261, "step": 2 }, { "epoch": 0.008310249307479225, "grad_norm": 0.9946913123130798, "learning_rate": 1.5e-05, "loss": 2.0557, "step": 3 }, { "epoch": 0.0110803324099723, "grad_norm": 0.9017006158828735, "learning_rate": 2e-05, "loss": 2.0127, "step": 4 }, { "epoch": 0.013850415512465374, "grad_norm": 1.0481677055358887, "learning_rate": 2.5e-05, "loss": 1.9824, "step": 5 }, { "epoch": 0.01662049861495845, "grad_norm": 0.7929118275642395, "learning_rate": 3e-05, "loss": 2.0085, "step": 6 }, { "epoch": 0.019390581717451522, "grad_norm": 0.7500401139259338, "learning_rate": 3.5e-05, "loss": 1.9579, "step": 7 }, { "epoch": 0.0221606648199446, "grad_norm": 0.6779894232749939, "learning_rate": 4e-05, "loss": 1.9187, "step": 8 }, { "epoch": 0.024930747922437674, "grad_norm": 0.644287645816803, "learning_rate": 4.5e-05, "loss": 1.9291, "step": 9 }, { "epoch": 0.027700831024930747, "grad_norm": 0.6950892806053162, "learning_rate": 5e-05, "loss": 1.9033, "step": 10 }, { "epoch": 0.030470914127423823, "grad_norm": 0.6530030369758606, "learning_rate": 5.500000000000001e-05, "loss": 1.849, "step": 11 }, { "epoch": 0.0332409972299169, "grad_norm": 0.6152698993682861, "learning_rate": 6e-05, "loss": 1.8331, "step": 12 }, { "epoch": 0.036011080332409975, "grad_norm": 0.638900637626648, "learning_rate": 6.500000000000001e-05, "loss": 1.7434, "step": 13 }, { "epoch": 0.038781163434903045, "grad_norm": 0.6423549056053162, "learning_rate": 7e-05, "loss": 1.809, "step": 14 }, { "epoch": 0.04155124653739612, "grad_norm": 0.6707618236541748, "learning_rate": 7.500000000000001e-05, "loss": 1.8022, "step": 15 }, { "epoch": 0.0443213296398892, "grad_norm": 0.6206409335136414, "learning_rate": 8e-05, "loss": 1.7494, "step": 16 }, { "epoch": 0.04709141274238227, "grad_norm": 0.5846340656280518, "learning_rate": 8.5e-05, "loss": 1.7886, "step": 17 }, { "epoch": 0.04986149584487535, "grad_norm": 0.5503649115562439, "learning_rate": 9e-05, "loss": 1.7373, "step": 18 }, { "epoch": 0.05263157894736842, "grad_norm": 0.55040043592453, "learning_rate": 9.5e-05, "loss": 1.7224, "step": 19 }, { "epoch": 0.055401662049861494, "grad_norm": 0.5535205006599426, "learning_rate": 0.0001, "loss": 1.6901, "step": 20 }, { "epoch": 0.05817174515235457, "grad_norm": 0.559658408164978, "learning_rate": 9.999787808528638e-05, "loss": 1.6755, "step": 21 }, { "epoch": 0.060941828254847646, "grad_norm": 0.5673468708992004, "learning_rate": 9.999151252124639e-05, "loss": 1.6903, "step": 22 }, { "epoch": 0.06371191135734072, "grad_norm": 0.5480718612670898, "learning_rate": 9.99809038481674e-05, "loss": 1.8207, "step": 23 }, { "epoch": 0.0664819944598338, "grad_norm": 0.5410633087158203, "learning_rate": 9.996605296647737e-05, "loss": 1.7156, "step": 24 }, { "epoch": 0.06925207756232687, "grad_norm": 0.5444090366363525, "learning_rate": 9.99469611366685e-05, "loss": 1.695, "step": 25 }, { "epoch": 0.07202216066481995, "grad_norm": 0.5443681478500366, "learning_rate": 9.992362997919016e-05, "loss": 1.6809, "step": 26 }, { "epoch": 0.07479224376731301, "grad_norm": 0.5565460920333862, "learning_rate": 9.98960614743114e-05, "loss": 1.7007, "step": 27 }, { "epoch": 0.07756232686980609, "grad_norm": 0.5411592721939087, "learning_rate": 9.986425796195287e-05, "loss": 1.7413, "step": 28 }, { "epoch": 0.08033240997229917, "grad_norm": 0.5616833567619324, "learning_rate": 9.982822214148821e-05, "loss": 1.6731, "step": 29 }, { "epoch": 0.08310249307479224, "grad_norm": 0.5728862881660461, "learning_rate": 9.978795707151492e-05, "loss": 1.7152, "step": 30 }, { "epoch": 0.08587257617728532, "grad_norm": 0.5711526274681091, "learning_rate": 9.974346616959476e-05, "loss": 1.6633, "step": 31 }, { "epoch": 0.0886426592797784, "grad_norm": 0.5852300524711609, "learning_rate": 9.969475321196373e-05, "loss": 1.6699, "step": 32 }, { "epoch": 0.09141274238227147, "grad_norm": 0.5722000598907471, "learning_rate": 9.96418223332115e-05, "loss": 1.6235, "step": 33 }, { "epoch": 0.09418282548476455, "grad_norm": 0.6049830317497253, "learning_rate": 9.958467802593046e-05, "loss": 1.7123, "step": 34 }, { "epoch": 0.09695290858725762, "grad_norm": 0.6205933690071106, "learning_rate": 9.952332514033447e-05, "loss": 1.6483, "step": 35 }, { "epoch": 0.0997229916897507, "grad_norm": 0.5876198410987854, "learning_rate": 9.94577688838472e-05, "loss": 1.6032, "step": 36 }, { "epoch": 0.10249307479224377, "grad_norm": 0.568473756313324, "learning_rate": 9.938801482065998e-05, "loss": 1.6591, "step": 37 }, { "epoch": 0.10526315789473684, "grad_norm": 0.5977758765220642, "learning_rate": 9.93140688712598e-05, "loss": 1.5954, "step": 38 }, { "epoch": 0.10803324099722991, "grad_norm": 0.6168347597122192, "learning_rate": 9.923593731192655e-05, "loss": 1.7199, "step": 39 }, { "epoch": 0.11080332409972299, "grad_norm": 0.567383348941803, "learning_rate": 9.915362677420044e-05, "loss": 1.5559, "step": 40 }, { "epoch": 0.11357340720221606, "grad_norm": 0.5804698467254639, "learning_rate": 9.906714424431913e-05, "loss": 1.595, "step": 41 }, { "epoch": 0.11634349030470914, "grad_norm": 0.6087197661399841, "learning_rate": 9.897649706262473e-05, "loss": 1.6415, "step": 42 }, { "epoch": 0.11911357340720222, "grad_norm": 0.6603733897209167, "learning_rate": 9.888169292294076e-05, "loss": 1.7308, "step": 43 }, { "epoch": 0.12188365650969529, "grad_norm": 0.597120463848114, "learning_rate": 9.87827398719192e-05, "loss": 1.651, "step": 44 }, { "epoch": 0.12465373961218837, "grad_norm": 0.563897967338562, "learning_rate": 9.867964630835743e-05, "loss": 1.5197, "step": 45 }, { "epoch": 0.12742382271468145, "grad_norm": 0.5994200706481934, "learning_rate": 9.857242098248542e-05, "loss": 1.6037, "step": 46 }, { "epoch": 0.13019390581717452, "grad_norm": 0.643892765045166, "learning_rate": 9.846107299522304e-05, "loss": 1.6028, "step": 47 }, { "epoch": 0.1329639889196676, "grad_norm": 0.6362243294715881, "learning_rate": 9.834561179740762e-05, "loss": 1.7313, "step": 48 }, { "epoch": 0.13573407202216067, "grad_norm": 0.8057113885879517, "learning_rate": 9.82260471889917e-05, "loss": 1.8693, "step": 49 }, { "epoch": 0.13850415512465375, "grad_norm": 1.2995071411132812, "learning_rate": 9.810238931821138e-05, "loss": 2.3462, "step": 50 }, { "epoch": 0.14127423822714683, "grad_norm": 0.9156427383422852, "learning_rate": 9.797464868072488e-05, "loss": 1.9039, "step": 51 }, { "epoch": 0.1440443213296399, "grad_norm": 0.7253287434577942, "learning_rate": 9.784283611872169e-05, "loss": 1.7885, "step": 52 }, { "epoch": 0.14681440443213298, "grad_norm": 0.5801952481269836, "learning_rate": 9.770696282000244e-05, "loss": 1.7586, "step": 53 }, { "epoch": 0.14958448753462603, "grad_norm": 0.48791196942329407, "learning_rate": 9.756704031702918e-05, "loss": 1.7201, "step": 54 }, { "epoch": 0.1523545706371191, "grad_norm": 0.47324320673942566, "learning_rate": 9.742308048594665e-05, "loss": 1.7283, "step": 55 }, { "epoch": 0.15512465373961218, "grad_norm": 0.44494226574897766, "learning_rate": 9.727509554557417e-05, "loss": 1.7024, "step": 56 }, { "epoch": 0.15789473684210525, "grad_norm": 0.45924824476242065, "learning_rate": 9.712309805636863e-05, "loss": 1.6947, "step": 57 }, { "epoch": 0.16066481994459833, "grad_norm": 0.5117136240005493, "learning_rate": 9.696710091935841e-05, "loss": 1.7821, "step": 58 }, { "epoch": 0.1634349030470914, "grad_norm": 0.4841713607311249, "learning_rate": 9.68071173750483e-05, "loss": 1.6528, "step": 59 }, { "epoch": 0.16620498614958448, "grad_norm": 0.4830092787742615, "learning_rate": 9.664316100229578e-05, "loss": 1.7166, "step": 60 }, { "epoch": 0.16897506925207756, "grad_norm": 0.4221424162387848, "learning_rate": 9.647524571715843e-05, "loss": 1.6441, "step": 61 }, { "epoch": 0.17174515235457063, "grad_norm": 0.4262312650680542, "learning_rate": 9.630338577171282e-05, "loss": 1.6742, "step": 62 }, { "epoch": 0.1745152354570637, "grad_norm": 0.4470296800136566, "learning_rate": 9.612759575284483e-05, "loss": 1.6873, "step": 63 }, { "epoch": 0.1772853185595568, "grad_norm": 0.43828487396240234, "learning_rate": 9.594789058101153e-05, "loss": 1.6717, "step": 64 }, { "epoch": 0.18005540166204986, "grad_norm": 0.44721511006355286, "learning_rate": 9.576428550897489e-05, "loss": 1.6437, "step": 65 }, { "epoch": 0.18282548476454294, "grad_norm": 0.4734845459461212, "learning_rate": 9.557679612050708e-05, "loss": 1.617, "step": 66 }, { "epoch": 0.18559556786703602, "grad_norm": 0.4750541150569916, "learning_rate": 9.538543832906773e-05, "loss": 1.6971, "step": 67 }, { "epoch": 0.1883656509695291, "grad_norm": 0.45159047842025757, "learning_rate": 9.519022837645337e-05, "loss": 1.6612, "step": 68 }, { "epoch": 0.19113573407202217, "grad_norm": 0.45829257369041443, "learning_rate": 9.499118283141887e-05, "loss": 1.5972, "step": 69 }, { "epoch": 0.19390581717451524, "grad_norm": 0.49154049158096313, "learning_rate": 9.478831858827104e-05, "loss": 1.6616, "step": 70 }, { "epoch": 0.19667590027700832, "grad_norm": 0.47941017150878906, "learning_rate": 9.458165286543476e-05, "loss": 1.6312, "step": 71 }, { "epoch": 0.1994459833795014, "grad_norm": 0.47044363617897034, "learning_rate": 9.437120320399158e-05, "loss": 1.6031, "step": 72 }, { "epoch": 0.20221606648199447, "grad_norm": 0.4906218349933624, "learning_rate": 9.415698746619079e-05, "loss": 1.6558, "step": 73 }, { "epoch": 0.20498614958448755, "grad_norm": 0.46305832266807556, "learning_rate": 9.393902383393347e-05, "loss": 1.626, "step": 74 }, { "epoch": 0.2077562326869806, "grad_norm": 0.4860275983810425, "learning_rate": 9.371733080722911e-05, "loss": 1.6518, "step": 75 }, { "epoch": 0.21052631578947367, "grad_norm": 0.49080756306648254, "learning_rate": 9.349192720262555e-05, "loss": 1.5951, "step": 76 }, { "epoch": 0.21329639889196675, "grad_norm": 0.512401282787323, "learning_rate": 9.326283215161178e-05, "loss": 1.5436, "step": 77 }, { "epoch": 0.21606648199445982, "grad_norm": 0.48431679606437683, "learning_rate": 9.303006509899418e-05, "loss": 1.693, "step": 78 }, { "epoch": 0.2188365650969529, "grad_norm": 0.5012890696525574, "learning_rate": 9.279364580124614e-05, "loss": 1.6496, "step": 79 }, { "epoch": 0.22160664819944598, "grad_norm": 0.5362542867660522, "learning_rate": 9.255359432483105e-05, "loss": 1.5378, "step": 80 }, { "epoch": 0.22437673130193905, "grad_norm": 0.5153225660324097, "learning_rate": 9.230993104449939e-05, "loss": 1.5482, "step": 81 }, { "epoch": 0.22714681440443213, "grad_norm": 0.5314464569091797, "learning_rate": 9.206267664155907e-05, "loss": 1.6165, "step": 82 }, { "epoch": 0.2299168975069252, "grad_norm": 0.5320023894309998, "learning_rate": 9.181185210212034e-05, "loss": 1.5757, "step": 83 }, { "epoch": 0.23268698060941828, "grad_norm": 0.542969286441803, "learning_rate": 9.155747871531444e-05, "loss": 1.5714, "step": 84 }, { "epoch": 0.23545706371191136, "grad_norm": 0.5345393419265747, "learning_rate": 9.129957807148666e-05, "loss": 1.5333, "step": 85 }, { "epoch": 0.23822714681440443, "grad_norm": 0.5255215167999268, "learning_rate": 9.103817206036382e-05, "loss": 1.5549, "step": 86 }, { "epoch": 0.2409972299168975, "grad_norm": 0.5266215801239014, "learning_rate": 9.077328286919638e-05, "loss": 1.5857, "step": 87 }, { "epoch": 0.24376731301939059, "grad_norm": 0.5361652374267578, "learning_rate": 9.050493298087523e-05, "loss": 1.5605, "step": 88 }, { "epoch": 0.24653739612188366, "grad_norm": 0.5425163507461548, "learning_rate": 9.02331451720234e-05, "loss": 1.6093, "step": 89 }, { "epoch": 0.24930747922437674, "grad_norm": 0.5160465240478516, "learning_rate": 8.995794251106295e-05, "loss": 1.5442, "step": 90 }, { "epoch": 0.2520775623268698, "grad_norm": 0.5372593998908997, "learning_rate": 8.967934835625689e-05, "loss": 1.5728, "step": 91 }, { "epoch": 0.2548476454293629, "grad_norm": 0.5655612349510193, "learning_rate": 8.939738635372664e-05, "loss": 1.5368, "step": 92 }, { "epoch": 0.25761772853185594, "grad_norm": 0.5733644366264343, "learning_rate": 8.911208043544513e-05, "loss": 1.5212, "step": 93 }, { "epoch": 0.26038781163434904, "grad_norm": 0.5658868551254272, "learning_rate": 8.882345481720533e-05, "loss": 1.5177, "step": 94 }, { "epoch": 0.2631578947368421, "grad_norm": 0.5318059325218201, "learning_rate": 8.853153399656513e-05, "loss": 1.5302, "step": 95 }, { "epoch": 0.2659279778393352, "grad_norm": 0.5699280500411987, "learning_rate": 8.823634275076791e-05, "loss": 1.544, "step": 96 }, { "epoch": 0.26869806094182824, "grad_norm": 0.6027292013168335, "learning_rate": 8.793790613463955e-05, "loss": 1.5461, "step": 97 }, { "epoch": 0.27146814404432135, "grad_norm": 0.6108666062355042, "learning_rate": 8.763624947846195e-05, "loss": 1.5956, "step": 98 }, { "epoch": 0.2742382271468144, "grad_norm": 0.7130281329154968, "learning_rate": 8.7331398385823e-05, "loss": 1.6505, "step": 99 }, { "epoch": 0.2770083102493075, "grad_norm": 1.1027400493621826, "learning_rate": 8.702337873144343e-05, "loss": 2.2798, "step": 100 }, { "epoch": 0.2770083102493075, "eval_loss": 1.6783268451690674, "eval_runtime": 96.3073, "eval_samples_per_second": 6.313, "eval_steps_per_second": 1.578, "step": 100 }, { "epoch": 0.27977839335180055, "grad_norm": 0.8103031516075134, "learning_rate": 8.671221665898073e-05, "loss": 1.781, "step": 101 }, { "epoch": 0.28254847645429365, "grad_norm": 0.6315918564796448, "learning_rate": 8.639793857881011e-05, "loss": 1.7388, "step": 102 }, { "epoch": 0.2853185595567867, "grad_norm": 0.4375033676624298, "learning_rate": 8.608057116578283e-05, "loss": 1.7528, "step": 103 }, { "epoch": 0.2880886426592798, "grad_norm": 0.43976011872291565, "learning_rate": 8.576014135696226e-05, "loss": 1.6575, "step": 104 }, { "epoch": 0.29085872576177285, "grad_norm": 0.4270864725112915, "learning_rate": 8.543667634933742e-05, "loss": 1.7506, "step": 105 }, { "epoch": 0.29362880886426596, "grad_norm": 0.4277926981449127, "learning_rate": 8.511020359751466e-05, "loss": 1.6791, "step": 106 }, { "epoch": 0.296398891966759, "grad_norm": 0.42316100001335144, "learning_rate": 8.478075081138745e-05, "loss": 1.6349, "step": 107 }, { "epoch": 0.29916897506925205, "grad_norm": 0.419150173664093, "learning_rate": 8.444834595378434e-05, "loss": 1.6907, "step": 108 }, { "epoch": 0.30193905817174516, "grad_norm": 0.44057542085647583, "learning_rate": 8.411301723809563e-05, "loss": 1.7144, "step": 109 }, { "epoch": 0.3047091412742382, "grad_norm": 0.4566152095794678, "learning_rate": 8.377479312587879e-05, "loss": 1.6322, "step": 110 }, { "epoch": 0.3074792243767313, "grad_norm": 0.4847561717033386, "learning_rate": 8.343370232444261e-05, "loss": 1.6603, "step": 111 }, { "epoch": 0.31024930747922436, "grad_norm": 0.4762137532234192, "learning_rate": 8.308977378441072e-05, "loss": 1.6872, "step": 112 }, { "epoch": 0.31301939058171746, "grad_norm": 0.44722503423690796, "learning_rate": 8.274303669726426e-05, "loss": 1.6122, "step": 113 }, { "epoch": 0.3157894736842105, "grad_norm": 0.4250723719596863, "learning_rate": 8.239352049286435e-05, "loss": 1.6256, "step": 114 }, { "epoch": 0.3185595567867036, "grad_norm": 0.45953068137168884, "learning_rate": 8.204125483695403e-05, "loss": 1.6354, "step": 115 }, { "epoch": 0.32132963988919666, "grad_norm": 0.44468310475349426, "learning_rate": 8.168626962864045e-05, "loss": 1.5903, "step": 116 }, { "epoch": 0.32409972299168976, "grad_norm": 0.4388692378997803, "learning_rate": 8.132859499785707e-05, "loss": 1.5752, "step": 117 }, { "epoch": 0.3268698060941828, "grad_norm": 0.4421890079975128, "learning_rate": 8.096826130280639e-05, "loss": 1.6525, "step": 118 }, { "epoch": 0.3296398891966759, "grad_norm": 0.45000019669532776, "learning_rate": 8.060529912738315e-05, "loss": 1.6413, "step": 119 }, { "epoch": 0.33240997229916897, "grad_norm": 0.43714025616645813, "learning_rate": 8.023973927857857e-05, "loss": 1.6296, "step": 120 }, { "epoch": 0.33518005540166207, "grad_norm": 0.4478208124637604, "learning_rate": 7.987161278386554e-05, "loss": 1.6076, "step": 121 }, { "epoch": 0.3379501385041551, "grad_norm": 0.441143661737442, "learning_rate": 7.950095088856508e-05, "loss": 1.5921, "step": 122 }, { "epoch": 0.3407202216066482, "grad_norm": 0.4647657573223114, "learning_rate": 7.912778505319436e-05, "loss": 1.5757, "step": 123 }, { "epoch": 0.34349030470914127, "grad_norm": 0.49418550729751587, "learning_rate": 7.875214695079647e-05, "loss": 1.5912, "step": 124 }, { "epoch": 0.3462603878116344, "grad_norm": 0.47427743673324585, "learning_rate": 7.837406846425204e-05, "loss": 1.5198, "step": 125 }, { "epoch": 0.3490304709141274, "grad_norm": 0.47177940607070923, "learning_rate": 7.799358168357323e-05, "loss": 1.5259, "step": 126 }, { "epoch": 0.3518005540166205, "grad_norm": 0.4780464470386505, "learning_rate": 7.761071890317994e-05, "loss": 1.58, "step": 127 }, { "epoch": 0.3545706371191136, "grad_norm": 0.48775434494018555, "learning_rate": 7.72255126191589e-05, "loss": 1.5837, "step": 128 }, { "epoch": 0.3573407202216066, "grad_norm": 0.463819682598114, "learning_rate": 7.683799552650534e-05, "loss": 1.5071, "step": 129 }, { "epoch": 0.3601108033240997, "grad_norm": 0.4649139642715454, "learning_rate": 7.644820051634812e-05, "loss": 1.5129, "step": 130 }, { "epoch": 0.3628808864265928, "grad_norm": 0.49197837710380554, "learning_rate": 7.605616067315793e-05, "loss": 1.5304, "step": 131 }, { "epoch": 0.3656509695290859, "grad_norm": 0.5007265210151672, "learning_rate": 7.56619092719392e-05, "loss": 1.5716, "step": 132 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5111303329467773, "learning_rate": 7.526547977540592e-05, "loss": 1.533, "step": 133 }, { "epoch": 0.37119113573407203, "grad_norm": 0.5242710709571838, "learning_rate": 7.486690583114136e-05, "loss": 1.6119, "step": 134 }, { "epoch": 0.3739612188365651, "grad_norm": 0.5257385969161987, "learning_rate": 7.446622126874218e-05, "loss": 1.559, "step": 135 }, { "epoch": 0.3767313019390582, "grad_norm": 0.5438200235366821, "learning_rate": 7.406346009694713e-05, "loss": 1.5974, "step": 136 }, { "epoch": 0.37950138504155123, "grad_norm": 0.5373654961585999, "learning_rate": 7.365865650075046e-05, "loss": 1.5271, "step": 137 }, { "epoch": 0.38227146814404434, "grad_norm": 0.5332836508750916, "learning_rate": 7.325184483850042e-05, "loss": 1.5001, "step": 138 }, { "epoch": 0.3850415512465374, "grad_norm": 0.5282530784606934, "learning_rate": 7.284305963898314e-05, "loss": 1.5149, "step": 139 }, { "epoch": 0.3878116343490305, "grad_norm": 0.5338895320892334, "learning_rate": 7.243233559849179e-05, "loss": 1.5146, "step": 140 }, { "epoch": 0.39058171745152354, "grad_norm": 0.5510410666465759, "learning_rate": 7.201970757788172e-05, "loss": 1.4864, "step": 141 }, { "epoch": 0.39335180055401664, "grad_norm": 0.5490543246269226, "learning_rate": 7.160521059961169e-05, "loss": 1.5124, "step": 142 }, { "epoch": 0.3961218836565097, "grad_norm": 0.5581960678100586, "learning_rate": 7.118887984477116e-05, "loss": 1.582, "step": 143 }, { "epoch": 0.3988919667590028, "grad_norm": 0.5543914437294006, "learning_rate": 7.077075065009433e-05, "loss": 1.5549, "step": 144 }, { "epoch": 0.40166204986149584, "grad_norm": 0.5982301235198975, "learning_rate": 7.03508585049608e-05, "loss": 1.5457, "step": 145 }, { "epoch": 0.40443213296398894, "grad_norm": 0.5774012207984924, "learning_rate": 6.99292390483834e-05, "loss": 1.5163, "step": 146 }, { "epoch": 0.407202216066482, "grad_norm": 0.6118898391723633, "learning_rate": 6.950592806598327e-05, "loss": 1.6121, "step": 147 }, { "epoch": 0.4099722991689751, "grad_norm": 0.6272540092468262, "learning_rate": 6.908096148695251e-05, "loss": 1.7264, "step": 148 }, { "epoch": 0.41274238227146814, "grad_norm": 0.6987607479095459, "learning_rate": 6.865437538100457e-05, "loss": 1.8212, "step": 149 }, { "epoch": 0.4155124653739612, "grad_norm": 0.9023134112358093, "learning_rate": 6.822620595531286e-05, "loss": 2.1983, "step": 150 }, { "epoch": 0.4182825484764543, "grad_norm": 0.527152955532074, "learning_rate": 6.779648955143754e-05, "loss": 1.7174, "step": 151 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5015479922294617, "learning_rate": 6.736526264224101e-05, "loss": 1.6503, "step": 152 }, { "epoch": 0.42382271468144045, "grad_norm": 0.4754060208797455, "learning_rate": 6.693256182879225e-05, "loss": 1.7529, "step": 153 }, { "epoch": 0.4265927977839335, "grad_norm": 0.4191894829273224, "learning_rate": 6.64984238372601e-05, "loss": 1.7416, "step": 154 }, { "epoch": 0.4293628808864266, "grad_norm": 0.4156198799610138, "learning_rate": 6.606288551579629e-05, "loss": 1.6428, "step": 155 }, { "epoch": 0.43213296398891965, "grad_norm": 0.42625534534454346, "learning_rate": 6.562598383140772e-05, "loss": 1.7209, "step": 156 }, { "epoch": 0.43490304709141275, "grad_norm": 0.4264881908893585, "learning_rate": 6.518775586681887e-05, "loss": 1.6075, "step": 157 }, { "epoch": 0.4376731301939058, "grad_norm": 0.4331325888633728, "learning_rate": 6.47482388173244e-05, "loss": 1.6109, "step": 158 }, { "epoch": 0.4404432132963989, "grad_norm": 0.42697060108184814, "learning_rate": 6.430746998763204e-05, "loss": 1.6069, "step": 159 }, { "epoch": 0.44321329639889195, "grad_norm": 0.44331657886505127, "learning_rate": 6.386548678869644e-05, "loss": 1.6185, "step": 160 }, { "epoch": 0.44598337950138506, "grad_norm": 0.4249429702758789, "learning_rate": 6.342232673454371e-05, "loss": 1.6654, "step": 161 }, { "epoch": 0.4487534626038781, "grad_norm": 0.4505755305290222, "learning_rate": 6.297802743908741e-05, "loss": 1.6413, "step": 162 }, { "epoch": 0.4515235457063712, "grad_norm": 0.46338891983032227, "learning_rate": 6.253262661293604e-05, "loss": 1.6393, "step": 163 }, { "epoch": 0.45429362880886426, "grad_norm": 0.4631763696670532, "learning_rate": 6.208616206019224e-05, "loss": 1.5277, "step": 164 }, { "epoch": 0.45706371191135736, "grad_norm": 0.46459320187568665, "learning_rate": 6.163867167524419e-05, "loss": 1.5505, "step": 165 }, { "epoch": 0.4598337950138504, "grad_norm": 0.454522043466568, "learning_rate": 6.119019343954914e-05, "loss": 1.5993, "step": 166 }, { "epoch": 0.4626038781163435, "grad_norm": 0.45403793454170227, "learning_rate": 6.074076541840977e-05, "loss": 1.585, "step": 167 }, { "epoch": 0.46537396121883656, "grad_norm": 0.4569069445133209, "learning_rate": 6.029042575774334e-05, "loss": 1.5865, "step": 168 }, { "epoch": 0.46814404432132967, "grad_norm": 0.4373742938041687, "learning_rate": 5.9839212680843925e-05, "loss": 1.5161, "step": 169 }, { "epoch": 0.4709141274238227, "grad_norm": 0.448476105928421, "learning_rate": 5.938716448513818e-05, "loss": 1.5271, "step": 170 }, { "epoch": 0.47368421052631576, "grad_norm": 0.458726704120636, "learning_rate": 5.8934319538934824e-05, "loss": 1.5619, "step": 171 }, { "epoch": 0.47645429362880887, "grad_norm": 0.4478129744529724, "learning_rate": 5.848071627816803e-05, "loss": 1.6222, "step": 172 }, { "epoch": 0.4792243767313019, "grad_norm": 0.4548794627189636, "learning_rate": 5.802639320313514e-05, "loss": 1.4884, "step": 173 }, { "epoch": 0.481994459833795, "grad_norm": 0.4709833562374115, "learning_rate": 5.757138887522884e-05, "loss": 1.5441, "step": 174 }, { "epoch": 0.48476454293628807, "grad_norm": 0.46005013585090637, "learning_rate": 5.7115741913664264e-05, "loss": 1.5582, "step": 175 }, { "epoch": 0.48753462603878117, "grad_norm": 0.4569703936576843, "learning_rate": 5.6659490992201094e-05, "loss": 1.5664, "step": 176 }, { "epoch": 0.4903047091412742, "grad_norm": 0.4556875228881836, "learning_rate": 5.620267483586105e-05, "loss": 1.557, "step": 177 }, { "epoch": 0.4930747922437673, "grad_norm": 0.46336084604263306, "learning_rate": 5.574533221764109e-05, "loss": 1.5718, "step": 178 }, { "epoch": 0.49584487534626037, "grad_norm": 0.4791565239429474, "learning_rate": 5.5287501955222444e-05, "loss": 1.5194, "step": 179 }, { "epoch": 0.4986149584487535, "grad_norm": 0.4609779417514801, "learning_rate": 5.482922290767589e-05, "loss": 1.5159, "step": 180 }, { "epoch": 0.5013850415512465, "grad_norm": 0.4774700999259949, "learning_rate": 5.437053397216364e-05, "loss": 1.4932, "step": 181 }, { "epoch": 0.5041551246537396, "grad_norm": 0.4876171946525574, "learning_rate": 5.39114740806377e-05, "loss": 1.554, "step": 182 }, { "epoch": 0.5069252077562327, "grad_norm": 0.5063115954399109, "learning_rate": 5.345208219653561e-05, "loss": 1.4743, "step": 183 }, { "epoch": 0.5096952908587258, "grad_norm": 0.5020216107368469, "learning_rate": 5.2992397311473316e-05, "loss": 1.478, "step": 184 }, { "epoch": 0.5124653739612188, "grad_norm": 0.49928566813468933, "learning_rate": 5.2532458441935636e-05, "loss": 1.5072, "step": 185 }, { "epoch": 0.5152354570637119, "grad_norm": 0.49790674448013306, "learning_rate": 5.2072304625964785e-05, "loss": 1.5212, "step": 186 }, { "epoch": 0.518005540166205, "grad_norm": 0.5072758793830872, "learning_rate": 5.161197491984684e-05, "loss": 1.4771, "step": 187 }, { "epoch": 0.5207756232686981, "grad_norm": 0.5330449342727661, "learning_rate": 5.11515083947969e-05, "loss": 1.5769, "step": 188 }, { "epoch": 0.5235457063711911, "grad_norm": 0.5207458734512329, "learning_rate": 5.069094413364272e-05, "loss": 1.4655, "step": 189 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5129647254943848, "learning_rate": 5.023032122750759e-05, "loss": 1.4404, "step": 190 }, { "epoch": 0.5290858725761773, "grad_norm": 0.5331078171730042, "learning_rate": 4.976967877249242e-05, "loss": 1.5448, "step": 191 }, { "epoch": 0.5318559556786704, "grad_norm": 0.5314807295799255, "learning_rate": 4.9309055866357295e-05, "loss": 1.4778, "step": 192 }, { "epoch": 0.5346260387811634, "grad_norm": 0.5341604948043823, "learning_rate": 4.884849160520311e-05, "loss": 1.4723, "step": 193 }, { "epoch": 0.5373961218836565, "grad_norm": 0.5463652014732361, "learning_rate": 4.838802508015316e-05, "loss": 1.5132, "step": 194 }, { "epoch": 0.5401662049861495, "grad_norm": 0.5638048648834229, "learning_rate": 4.792769537403523e-05, "loss": 1.4437, "step": 195 }, { "epoch": 0.5429362880886427, "grad_norm": 0.5838809609413147, "learning_rate": 4.746754155806437e-05, "loss": 1.4876, "step": 196 }, { "epoch": 0.5457063711911357, "grad_norm": 0.6096907258033752, "learning_rate": 4.7007602688526695e-05, "loss": 1.5158, "step": 197 }, { "epoch": 0.5484764542936288, "grad_norm": 0.6180377006530762, "learning_rate": 4.65479178034644e-05, "loss": 1.6507, "step": 198 }, { "epoch": 0.5512465373961218, "grad_norm": 0.6747138500213623, "learning_rate": 4.608852591936231e-05, "loss": 1.728, "step": 199 }, { "epoch": 0.554016620498615, "grad_norm": 0.8520070314407349, "learning_rate": 4.562946602783636e-05, "loss": 2.2183, "step": 200 }, { "epoch": 0.554016620498615, "eval_loss": 1.5925147533416748, "eval_runtime": 96.8511, "eval_samples_per_second": 6.278, "eval_steps_per_second": 1.569, "step": 200 }, { "epoch": 0.556786703601108, "grad_norm": 0.4485991597175598, "learning_rate": 4.517077709232411e-05, "loss": 1.7589, "step": 201 }, { "epoch": 0.5595567867036011, "grad_norm": 0.4756779372692108, "learning_rate": 4.471249804477758e-05, "loss": 1.7589, "step": 202 }, { "epoch": 0.5623268698060941, "grad_norm": 0.445344477891922, "learning_rate": 4.4254667782358924e-05, "loss": 1.6403, "step": 203 }, { "epoch": 0.5650969529085873, "grad_norm": 0.448127806186676, "learning_rate": 4.379732516413897e-05, "loss": 1.6286, "step": 204 }, { "epoch": 0.5678670360110804, "grad_norm": 0.4261086881160736, "learning_rate": 4.334050900779893e-05, "loss": 1.6222, "step": 205 }, { "epoch": 0.5706371191135734, "grad_norm": 0.422758013010025, "learning_rate": 4.288425808633575e-05, "loss": 1.5906, "step": 206 }, { "epoch": 0.5734072022160664, "grad_norm": 0.39392560720443726, "learning_rate": 4.2428611124771184e-05, "loss": 1.6972, "step": 207 }, { "epoch": 0.5761772853185596, "grad_norm": 0.40653908252716064, "learning_rate": 4.1973606796864884e-05, "loss": 1.5745, "step": 208 }, { "epoch": 0.5789473684210527, "grad_norm": 0.4077155888080597, "learning_rate": 4.151928372183198e-05, "loss": 1.5567, "step": 209 }, { "epoch": 0.5817174515235457, "grad_norm": 0.4133901596069336, "learning_rate": 4.1065680461065194e-05, "loss": 1.5616, "step": 210 }, { "epoch": 0.5844875346260388, "grad_norm": 0.4191035330295563, "learning_rate": 4.061283551486185e-05, "loss": 1.6254, "step": 211 }, { "epoch": 0.5872576177285319, "grad_norm": 0.4195918142795563, "learning_rate": 4.016078731915608e-05, "loss": 1.5852, "step": 212 }, { "epoch": 0.590027700831025, "grad_norm": 0.4277445375919342, "learning_rate": 3.970957424225666e-05, "loss": 1.5839, "step": 213 }, { "epoch": 0.592797783933518, "grad_norm": 0.4163782596588135, "learning_rate": 3.925923458159023e-05, "loss": 1.6141, "step": 214 }, { "epoch": 0.5955678670360111, "grad_norm": 0.42547765374183655, "learning_rate": 3.880980656045087e-05, "loss": 1.5977, "step": 215 }, { "epoch": 0.5983379501385041, "grad_norm": 0.4501667320728302, "learning_rate": 3.8361328324755825e-05, "loss": 1.5311, "step": 216 }, { "epoch": 0.6011080332409973, "grad_norm": 0.43693989515304565, "learning_rate": 3.791383793980776e-05, "loss": 1.5779, "step": 217 }, { "epoch": 0.6038781163434903, "grad_norm": 0.4550263285636902, "learning_rate": 3.746737338706397e-05, "loss": 1.5961, "step": 218 }, { "epoch": 0.6066481994459834, "grad_norm": 0.4572041928768158, "learning_rate": 3.70219725609126e-05, "loss": 1.592, "step": 219 }, { "epoch": 0.6094182825484764, "grad_norm": 0.4633628726005554, "learning_rate": 3.65776732654563e-05, "loss": 1.5822, "step": 220 }, { "epoch": 0.6121883656509696, "grad_norm": 0.4722716808319092, "learning_rate": 3.6134513211303556e-05, "loss": 1.5399, "step": 221 }, { "epoch": 0.6149584487534626, "grad_norm": 0.477970689535141, "learning_rate": 3.5692530012367955e-05, "loss": 1.5923, "step": 222 }, { "epoch": 0.6177285318559557, "grad_norm": 0.4831191599369049, "learning_rate": 3.5251761182675625e-05, "loss": 1.559, "step": 223 }, { "epoch": 0.6204986149584487, "grad_norm": 0.46263793110847473, "learning_rate": 3.481224413318114e-05, "loss": 1.5255, "step": 224 }, { "epoch": 0.6232686980609419, "grad_norm": 0.4732118844985962, "learning_rate": 3.4374016168592296e-05, "loss": 1.5777, "step": 225 }, { "epoch": 0.6260387811634349, "grad_norm": 0.47649428248405457, "learning_rate": 3.393711448420372e-05, "loss": 1.4918, "step": 226 }, { "epoch": 0.628808864265928, "grad_norm": 0.4683854579925537, "learning_rate": 3.3501576162739904e-05, "loss": 1.4874, "step": 227 }, { "epoch": 0.631578947368421, "grad_norm": 0.4791397750377655, "learning_rate": 3.3067438171207766e-05, "loss": 1.55, "step": 228 }, { "epoch": 0.6343490304709142, "grad_norm": 0.474380224943161, "learning_rate": 3.263473735775899e-05, "loss": 1.5044, "step": 229 }, { "epoch": 0.6371191135734072, "grad_norm": 0.4925787150859833, "learning_rate": 3.220351044856247e-05, "loss": 1.5274, "step": 230 }, { "epoch": 0.6398891966759003, "grad_norm": 0.4613766670227051, "learning_rate": 3.177379404468715e-05, "loss": 1.4617, "step": 231 }, { "epoch": 0.6426592797783933, "grad_norm": 0.4807930588722229, "learning_rate": 3.134562461899545e-05, "loss": 1.4273, "step": 232 }, { "epoch": 0.6454293628808865, "grad_norm": 0.49378934502601624, "learning_rate": 3.091903851304751e-05, "loss": 1.5487, "step": 233 }, { "epoch": 0.6481994459833795, "grad_norm": 0.5002757906913757, "learning_rate": 3.0494071934016737e-05, "loss": 1.4499, "step": 234 }, { "epoch": 0.6509695290858726, "grad_norm": 0.49733448028564453, "learning_rate": 3.0070760951616618e-05, "loss": 1.4476, "step": 235 }, { "epoch": 0.6537396121883656, "grad_norm": 0.5034710764884949, "learning_rate": 2.9649141495039223e-05, "loss": 1.5502, "step": 236 }, { "epoch": 0.6565096952908587, "grad_norm": 0.49712780117988586, "learning_rate": 2.9229249349905684e-05, "loss": 1.4659, "step": 237 }, { "epoch": 0.6592797783933518, "grad_norm": 0.5008156895637512, "learning_rate": 2.8811120155228844e-05, "loss": 1.4596, "step": 238 }, { "epoch": 0.6620498614958449, "grad_norm": 0.5352665185928345, "learning_rate": 2.8394789400388328e-05, "loss": 1.5186, "step": 239 }, { "epoch": 0.6648199445983379, "grad_norm": 0.5389817357063293, "learning_rate": 2.798029242211828e-05, "loss": 1.5449, "step": 240 }, { "epoch": 0.667590027700831, "grad_norm": 0.5315467715263367, "learning_rate": 2.7567664401508225e-05, "loss": 1.4991, "step": 241 }, { "epoch": 0.6703601108033241, "grad_norm": 0.5424023270606995, "learning_rate": 2.7156940361016864e-05, "loss": 1.528, "step": 242 }, { "epoch": 0.6731301939058172, "grad_norm": 0.5434011816978455, "learning_rate": 2.6748155161499567e-05, "loss": 1.4446, "step": 243 }, { "epoch": 0.6759002770083102, "grad_norm": 0.5452317595481873, "learning_rate": 2.634134349924956e-05, "loss": 1.36, "step": 244 }, { "epoch": 0.6786703601108033, "grad_norm": 0.5486112833023071, "learning_rate": 2.5936539903052892e-05, "loss": 1.4738, "step": 245 }, { "epoch": 0.6814404432132964, "grad_norm": 0.582291841506958, "learning_rate": 2.5533778731257824e-05, "loss": 1.4956, "step": 246 }, { "epoch": 0.6842105263157895, "grad_norm": 0.5929666757583618, "learning_rate": 2.513309416885865e-05, "loss": 1.5308, "step": 247 }, { "epoch": 0.6869806094182825, "grad_norm": 0.5847265124320984, "learning_rate": 2.4734520224594093e-05, "loss": 1.5132, "step": 248 }, { "epoch": 0.6897506925207756, "grad_norm": 0.7136350274085999, "learning_rate": 2.433809072806081e-05, "loss": 1.8617, "step": 249 }, { "epoch": 0.6925207756232687, "grad_norm": 1.0075485706329346, "learning_rate": 2.3943839326842092e-05, "loss": 2.1937, "step": 250 }, { "epoch": 0.6952908587257618, "grad_norm": 0.42398756742477417, "learning_rate": 2.3551799483651894e-05, "loss": 1.7047, "step": 251 }, { "epoch": 0.6980609418282548, "grad_norm": 0.4284769296646118, "learning_rate": 2.3162004473494657e-05, "loss": 1.6634, "step": 252 }, { "epoch": 0.7008310249307479, "grad_norm": 0.4206549823284149, "learning_rate": 2.2774487380841115e-05, "loss": 1.6303, "step": 253 }, { "epoch": 0.703601108033241, "grad_norm": 0.41631007194519043, "learning_rate": 2.2389281096820075e-05, "loss": 1.6223, "step": 254 }, { "epoch": 0.7063711911357341, "grad_norm": 0.4172951579093933, "learning_rate": 2.2006418316426775e-05, "loss": 1.6146, "step": 255 }, { "epoch": 0.7091412742382271, "grad_norm": 0.4219145178794861, "learning_rate": 2.1625931535747964e-05, "loss": 1.5774, "step": 256 }, { "epoch": 0.7119113573407202, "grad_norm": 0.41025567054748535, "learning_rate": 2.1247853049203543e-05, "loss": 1.6285, "step": 257 }, { "epoch": 0.7146814404432132, "grad_norm": 0.4165142774581909, "learning_rate": 2.087221494680563e-05, "loss": 1.6049, "step": 258 }, { "epoch": 0.7174515235457064, "grad_norm": 0.4020419418811798, "learning_rate": 2.049904911143492e-05, "loss": 1.5822, "step": 259 }, { "epoch": 0.7202216066481995, "grad_norm": 0.39877504110336304, "learning_rate": 2.012838721613447e-05, "loss": 1.5962, "step": 260 }, { "epoch": 0.7229916897506925, "grad_norm": 0.404995858669281, "learning_rate": 1.9760260721421426e-05, "loss": 1.61, "step": 261 }, { "epoch": 0.7257617728531855, "grad_norm": 0.41909393668174744, "learning_rate": 1.9394700872616855e-05, "loss": 1.5494, "step": 262 }, { "epoch": 0.7285318559556787, "grad_norm": 0.42204612493515015, "learning_rate": 1.903173869719362e-05, "loss": 1.673, "step": 263 }, { "epoch": 0.7313019390581718, "grad_norm": 0.4202198088169098, "learning_rate": 1.8671405002142918e-05, "loss": 1.5661, "step": 264 }, { "epoch": 0.7340720221606648, "grad_norm": 0.42738965153694153, "learning_rate": 1.831373037135955e-05, "loss": 1.5709, "step": 265 }, { "epoch": 0.7368421052631579, "grad_norm": 0.4346977174282074, "learning_rate": 1.7958745163045986e-05, "loss": 1.5803, "step": 266 }, { "epoch": 0.739612188365651, "grad_norm": 0.4478447437286377, "learning_rate": 1.760647950713566e-05, "loss": 1.584, "step": 267 }, { "epoch": 0.7423822714681441, "grad_norm": 0.43284082412719727, "learning_rate": 1.725696330273575e-05, "loss": 1.5911, "step": 268 }, { "epoch": 0.7451523545706371, "grad_norm": 0.4424854815006256, "learning_rate": 1.6910226215589303e-05, "loss": 1.5178, "step": 269 }, { "epoch": 0.7479224376731302, "grad_norm": 0.4423474669456482, "learning_rate": 1.656629767555739e-05, "loss": 1.5337, "step": 270 }, { "epoch": 0.7506925207756233, "grad_norm": 0.450488418340683, "learning_rate": 1.6225206874121218e-05, "loss": 1.5476, "step": 271 }, { "epoch": 0.7534626038781164, "grad_norm": 0.4583964943885803, "learning_rate": 1.5886982761904377e-05, "loss": 1.559, "step": 272 }, { "epoch": 0.7562326869806094, "grad_norm": 0.45635777711868286, "learning_rate": 1.555165404621567e-05, "loss": 1.5798, "step": 273 }, { "epoch": 0.7590027700831025, "grad_norm": 0.4589531123638153, "learning_rate": 1.5219249188612556e-05, "loss": 1.5855, "step": 274 }, { "epoch": 0.7617728531855956, "grad_norm": 0.47425007820129395, "learning_rate": 1.488979640248534e-05, "loss": 1.565, "step": 275 }, { "epoch": 0.7645429362880887, "grad_norm": 0.4853668808937073, "learning_rate": 1.4563323650662586e-05, "loss": 1.5671, "step": 276 }, { "epoch": 0.7673130193905817, "grad_norm": 0.4773860275745392, "learning_rate": 1.4239858643037751e-05, "loss": 1.562, "step": 277 }, { "epoch": 0.7700831024930748, "grad_norm": 0.5074953436851501, "learning_rate": 1.3919428834217163e-05, "loss": 1.5222, "step": 278 }, { "epoch": 0.7728531855955678, "grad_norm": 0.4980280101299286, "learning_rate": 1.36020614211899e-05, "loss": 1.5357, "step": 279 }, { "epoch": 0.775623268698061, "grad_norm": 0.4977880120277405, "learning_rate": 1.3287783341019278e-05, "loss": 1.5099, "step": 280 }, { "epoch": 0.778393351800554, "grad_norm": 0.490857869386673, "learning_rate": 1.2976621268556571e-05, "loss": 1.4592, "step": 281 }, { "epoch": 0.7811634349030471, "grad_norm": 0.4895660877227783, "learning_rate": 1.2668601614177017e-05, "loss": 1.5262, "step": 282 }, { "epoch": 0.7839335180055401, "grad_norm": 0.5010288953781128, "learning_rate": 1.2363750521538064e-05, "loss": 1.4536, "step": 283 }, { "epoch": 0.7867036011080333, "grad_norm": 0.5021648406982422, "learning_rate": 1.2062093865360458e-05, "loss": 1.4938, "step": 284 }, { "epoch": 0.7894736842105263, "grad_norm": 0.5206820368766785, "learning_rate": 1.1763657249232107e-05, "loss": 1.5172, "step": 285 }, { "epoch": 0.7922437673130194, "grad_norm": 0.5139452815055847, "learning_rate": 1.146846600343488e-05, "loss": 1.4832, "step": 286 }, { "epoch": 0.7950138504155124, "grad_norm": 0.5119965672492981, "learning_rate": 1.1176545182794674e-05, "loss": 1.5057, "step": 287 }, { "epoch": 0.7977839335180056, "grad_norm": 0.5294527411460876, "learning_rate": 1.0887919564554894e-05, "loss": 1.4764, "step": 288 }, { "epoch": 0.8005540166204986, "grad_norm": 0.5366680026054382, "learning_rate": 1.0602613646273374e-05, "loss": 1.4931, "step": 289 }, { "epoch": 0.8033240997229917, "grad_norm": 0.5404147505760193, "learning_rate": 1.032065164374313e-05, "loss": 1.4244, "step": 290 }, { "epoch": 0.8060941828254847, "grad_norm": 0.520129919052124, "learning_rate": 1.0042057488937067e-05, "loss": 1.461, "step": 291 }, { "epoch": 0.8088642659279779, "grad_norm": 0.5262720584869385, "learning_rate": 9.766854827976617e-06, "loss": 1.4333, "step": 292 }, { "epoch": 0.8116343490304709, "grad_norm": 0.5492231249809265, "learning_rate": 9.495067019124792e-06, "loss": 1.5142, "step": 293 }, { "epoch": 0.814404432132964, "grad_norm": 0.5467971563339233, "learning_rate": 9.226717130803636e-06, "loss": 1.4606, "step": 294 }, { "epoch": 0.817174515235457, "grad_norm": 0.5795299410820007, "learning_rate": 8.961827939636196e-06, "loss": 1.4897, "step": 295 }, { "epoch": 0.8199445983379502, "grad_norm": 0.5784875154495239, "learning_rate": 8.700421928513352e-06, "loss": 1.4938, "step": 296 }, { "epoch": 0.8227146814404432, "grad_norm": 0.5875300765037537, "learning_rate": 8.442521284685573e-06, "loss": 1.4066, "step": 297 }, { "epoch": 0.8254847645429363, "grad_norm": 0.6010243892669678, "learning_rate": 8.188147897879667e-06, "loss": 1.482, "step": 298 }, { "epoch": 0.8282548476454293, "grad_norm": 0.6733506321907043, "learning_rate": 7.937323358440935e-06, "loss": 1.6978, "step": 299 }, { "epoch": 0.8310249307479224, "grad_norm": 0.987700879573822, "learning_rate": 7.690068955500624e-06, "loss": 2.2272, "step": 300 }, { "epoch": 0.8310249307479224, "eval_loss": 1.556799054145813, "eval_runtime": 95.6603, "eval_samples_per_second": 6.356, "eval_steps_per_second": 1.589, "step": 300 }, { "epoch": 0.8337950138504155, "grad_norm": 0.38641664385795593, "learning_rate": 7.446405675168938e-06, "loss": 1.6862, "step": 301 }, { "epoch": 0.8365650969529086, "grad_norm": 0.386618435382843, "learning_rate": 7.206354198753862e-06, "loss": 1.6566, "step": 302 }, { "epoch": 0.8393351800554016, "grad_norm": 0.4190298318862915, "learning_rate": 6.969934901005809e-06, "loss": 1.6447, "step": 303 }, { "epoch": 0.8421052631578947, "grad_norm": 0.40949639678001404, "learning_rate": 6.7371678483882264e-06, "loss": 1.6331, "step": 304 }, { "epoch": 0.8448753462603878, "grad_norm": 0.42585617303848267, "learning_rate": 6.508072797374454e-06, "loss": 1.6439, "step": 305 }, { "epoch": 0.8476454293628809, "grad_norm": 0.41650789976119995, "learning_rate": 6.282669192770896e-06, "loss": 1.616, "step": 306 }, { "epoch": 0.850415512465374, "grad_norm": 0.4084133803844452, "learning_rate": 6.060976166066546e-06, "loss": 1.587, "step": 307 }, { "epoch": 0.853185595567867, "grad_norm": 0.4127242863178253, "learning_rate": 5.8430125338092115e-06, "loss": 1.6102, "step": 308 }, { "epoch": 0.8559556786703602, "grad_norm": 0.42294085025787354, "learning_rate": 5.628796796008434e-06, "loss": 1.6542, "step": 309 }, { "epoch": 0.8587257617728532, "grad_norm": 0.41091540455818176, "learning_rate": 5.418347134565249e-06, "loss": 1.5955, "step": 310 }, { "epoch": 0.8614958448753463, "grad_norm": 0.41265037655830383, "learning_rate": 5.211681411728969e-06, "loss": 1.6711, "step": 311 }, { "epoch": 0.8642659279778393, "grad_norm": 0.44204774498939514, "learning_rate": 5.008817168581137e-06, "loss": 1.5871, "step": 312 }, { "epoch": 0.8670360110803325, "grad_norm": 0.4217774271965027, "learning_rate": 4.809771623546627e-06, "loss": 1.5632, "step": 313 }, { "epoch": 0.8698060941828255, "grad_norm": 0.43789371848106384, "learning_rate": 4.614561670932288e-06, "loss": 1.5672, "step": 314 }, { "epoch": 0.8725761772853186, "grad_norm": 0.44473928213119507, "learning_rate": 4.423203879492943e-06, "loss": 1.5175, "step": 315 }, { "epoch": 0.8753462603878116, "grad_norm": 0.4420805871486664, "learning_rate": 4.2357144910251e-06, "loss": 1.5502, "step": 316 }, { "epoch": 0.8781163434903048, "grad_norm": 0.448369562625885, "learning_rate": 4.05210941898847e-06, "loss": 1.5517, "step": 317 }, { "epoch": 0.8808864265927978, "grad_norm": 0.44154343008995056, "learning_rate": 3.872404247155193e-06, "loss": 1.4863, "step": 318 }, { "epoch": 0.8836565096952909, "grad_norm": 0.44638606905937195, "learning_rate": 3.696614228287187e-06, "loss": 1.5786, "step": 319 }, { "epoch": 0.8864265927977839, "grad_norm": 0.43334829807281494, "learning_rate": 3.5247542828415747e-06, "loss": 1.5491, "step": 320 }, { "epoch": 0.889196675900277, "grad_norm": 0.4540192484855652, "learning_rate": 3.356838997704226e-06, "loss": 1.5194, "step": 321 }, { "epoch": 0.8919667590027701, "grad_norm": 0.4565938115119934, "learning_rate": 3.1928826249516987e-06, "loss": 1.5226, "step": 322 }, { "epoch": 0.8947368421052632, "grad_norm": 0.4570363461971283, "learning_rate": 3.0328990806415934e-06, "loss": 1.5702, "step": 323 }, { "epoch": 0.8975069252077562, "grad_norm": 0.4690124988555908, "learning_rate": 2.8769019436313715e-06, "loss": 1.5384, "step": 324 }, { "epoch": 0.9002770083102493, "grad_norm": 0.47177836298942566, "learning_rate": 2.7249044544258363e-06, "loss": 1.5368, "step": 325 }, { "epoch": 0.9030470914127424, "grad_norm": 0.4627552628517151, "learning_rate": 2.576919514053355e-06, "loss": 1.5475, "step": 326 }, { "epoch": 0.9058171745152355, "grad_norm": 0.4692405164241791, "learning_rate": 2.4329596829708144e-06, "loss": 1.4843, "step": 327 }, { "epoch": 0.9085872576177285, "grad_norm": 0.5105109214782715, "learning_rate": 2.2930371799975594e-06, "loss": 1.5008, "step": 328 }, { "epoch": 0.9113573407202216, "grad_norm": 0.48938921093940735, "learning_rate": 2.157163881278312e-06, "loss": 1.5492, "step": 329 }, { "epoch": 0.9141274238227147, "grad_norm": 0.485343337059021, "learning_rate": 2.0253513192751373e-06, "loss": 1.5183, "step": 330 }, { "epoch": 0.9168975069252078, "grad_norm": 0.4986342489719391, "learning_rate": 1.8976106817886196e-06, "loss": 1.5177, "step": 331 }, { "epoch": 0.9196675900277008, "grad_norm": 0.49586573243141174, "learning_rate": 1.7739528110083004e-06, "loss": 1.519, "step": 332 }, { "epoch": 0.9224376731301939, "grad_norm": 0.5050438642501831, "learning_rate": 1.6543882025923886e-06, "loss": 1.4329, "step": 333 }, { "epoch": 0.925207756232687, "grad_norm": 0.5025500655174255, "learning_rate": 1.5389270047769578e-06, "loss": 1.486, "step": 334 }, { "epoch": 0.9279778393351801, "grad_norm": 0.50665682554245, "learning_rate": 1.4275790175145908e-06, "loss": 1.4943, "step": 335 }, { "epoch": 0.9307479224376731, "grad_norm": 0.518688440322876, "learning_rate": 1.3203536916425841e-06, "loss": 1.526, "step": 336 }, { "epoch": 0.9335180055401662, "grad_norm": 0.5425696969032288, "learning_rate": 1.217260128080816e-06, "loss": 1.513, "step": 337 }, { "epoch": 0.9362880886426593, "grad_norm": 0.5442622303962708, "learning_rate": 1.1183070770592441e-06, "loss": 1.5735, "step": 338 }, { "epoch": 0.9390581717451524, "grad_norm": 0.5330432057380676, "learning_rate": 1.0235029373752758e-06, "loss": 1.5291, "step": 339 }, { "epoch": 0.9418282548476454, "grad_norm": 0.5193708539009094, "learning_rate": 9.32855755680867e-07, "loss": 1.418, "step": 340 }, { "epoch": 0.9445983379501385, "grad_norm": 0.5180433392524719, "learning_rate": 8.463732257995571e-07, "loss": 1.495, "step": 341 }, { "epoch": 0.9473684210526315, "grad_norm": 0.5407363772392273, "learning_rate": 7.640626880734581e-07, "loss": 1.5243, "step": 342 }, { "epoch": 0.9501385041551247, "grad_norm": 0.5722284913063049, "learning_rate": 6.859311287402081e-07, "loss": 1.4204, "step": 343 }, { "epoch": 0.9529085872576177, "grad_norm": 0.5504374504089355, "learning_rate": 6.119851793400189e-07, "loss": 1.4763, "step": 344 }, { "epoch": 0.9556786703601108, "grad_norm": 0.5620555281639099, "learning_rate": 5.422311161528193e-07, "loss": 1.4551, "step": 345 }, { "epoch": 0.9584487534626038, "grad_norm": 0.5651283860206604, "learning_rate": 4.7667485966552685e-07, "loss": 1.4817, "step": 346 }, { "epoch": 0.961218836565097, "grad_norm": 0.6057314872741699, "learning_rate": 4.153219740695435e-07, "loss": 1.5466, "step": 347 }, { "epoch": 0.96398891966759, "grad_norm": 0.6597415208816528, "learning_rate": 3.5817766678850615e-07, "loss": 1.5562, "step": 348 }, { "epoch": 0.9667590027700831, "grad_norm": 0.7001429796218872, "learning_rate": 3.052467880362675e-07, "loss": 1.899, "step": 349 }, { "epoch": 0.9695290858725761, "grad_norm": 0.8798688054084778, "learning_rate": 2.5653383040524227e-07, "loss": 1.96, "step": 350 }, { "epoch": 0.9722991689750693, "grad_norm": 0.3499889671802521, "learning_rate": 2.1204292848509555e-07, "loss": 1.6123, "step": 351 }, { "epoch": 0.9750692520775623, "grad_norm": 0.38690975308418274, "learning_rate": 1.717778585118013e-07, "loss": 1.5579, "step": 352 }, { "epoch": 0.9778393351800554, "grad_norm": 0.41460373997688293, "learning_rate": 1.3574203804713747e-07, "loss": 1.585, "step": 353 }, { "epoch": 0.9806094182825484, "grad_norm": 0.4353157877922058, "learning_rate": 1.0393852568860719e-07, "loss": 1.5611, "step": 354 }, { "epoch": 0.9833795013850416, "grad_norm": 0.450083464384079, "learning_rate": 7.637002080985168e-08, "loss": 1.5568, "step": 355 }, { "epoch": 0.9861495844875346, "grad_norm": 0.47905081510543823, "learning_rate": 5.303886333151153e-08, "loss": 1.5438, "step": 356 }, { "epoch": 0.9889196675900277, "grad_norm": 0.4843379855155945, "learning_rate": 3.394703352263551e-08, "loss": 1.4646, "step": 357 }, { "epoch": 0.9916897506925207, "grad_norm": 0.5109671950340271, "learning_rate": 1.9096151832609375e-08, "loss": 1.513, "step": 358 }, { "epoch": 0.9944598337950139, "grad_norm": 0.5229476690292358, "learning_rate": 8.487478753615997e-09, "loss": 1.4675, "step": 359 }, { "epoch": 0.997229916897507, "grad_norm": 0.5755655765533447, "learning_rate": 2.1219147136264382e-09, "loss": 1.5564, "step": 360 }, { "epoch": 1.0, "grad_norm": 0.7581580877304077, "learning_rate": 0.0, "loss": 1.8149, "step": 361 } ], "logging_steps": 1, "max_steps": 361, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.1502575448070554e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }