{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9977324263038548, "eval_steps": 500, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030234315948601664, "grad_norm": 0.6349862119317693, "learning_rate": 5.000000000000001e-07, "loss": 1.3237, "step": 1 }, { "epoch": 0.006046863189720333, "grad_norm": 0.6915137231647266, "learning_rate": 1.0000000000000002e-06, "loss": 1.3595, "step": 2 }, { "epoch": 0.009070294784580499, "grad_norm": 0.623700079073619, "learning_rate": 1.5e-06, "loss": 1.343, "step": 3 }, { "epoch": 0.012093726379440665, "grad_norm": 0.7242880491963869, "learning_rate": 2.0000000000000003e-06, "loss": 1.3527, "step": 4 }, { "epoch": 0.015117157974300832, "grad_norm": 0.6516906859598985, "learning_rate": 2.5e-06, "loss": 1.3319, "step": 5 }, { "epoch": 0.018140589569160998, "grad_norm": 0.5742747957897, "learning_rate": 3e-06, "loss": 1.342, "step": 6 }, { "epoch": 0.021164021164021163, "grad_norm": 0.557815390462239, "learning_rate": 3.5e-06, "loss": 1.3152, "step": 7 }, { "epoch": 0.02418745275888133, "grad_norm": 0.4620246107786041, "learning_rate": 4.000000000000001e-06, "loss": 1.2963, "step": 8 }, { "epoch": 0.027210884353741496, "grad_norm": 0.44763809541022137, "learning_rate": 4.5e-06, "loss": 1.2895, "step": 9 }, { "epoch": 0.030234315948601664, "grad_norm": 0.3416187088663793, "learning_rate": 5e-06, "loss": 1.2531, "step": 10 }, { "epoch": 0.03325774754346183, "grad_norm": 0.31917539621933483, "learning_rate": 4.999970800043822e-06, "loss": 1.2006, "step": 11 }, { "epoch": 0.036281179138321996, "grad_norm": 0.27239571970104204, "learning_rate": 4.9998832008573975e-06, "loss": 1.1767, "step": 12 }, { "epoch": 0.039304610733182165, "grad_norm": 0.32495241030295385, "learning_rate": 4.999737204487039e-06, "loss": 1.1951, "step": 13 }, { "epoch": 0.042328042328042326, "grad_norm": 0.31114523478470957, "learning_rate": 4.999532814343219e-06, "loss": 1.1474, "step": 14 }, { "epoch": 0.045351473922902494, "grad_norm": 0.26573282398874887, "learning_rate": 4.999270035200483e-06, "loss": 1.1684, "step": 15 }, { "epoch": 0.04837490551776266, "grad_norm": 0.27675989125666167, "learning_rate": 4.998948873197342e-06, "loss": 1.142, "step": 16 }, { "epoch": 0.05139833711262283, "grad_norm": 0.2341024474066861, "learning_rate": 4.99856933583613e-06, "loss": 1.1735, "step": 17 }, { "epoch": 0.05442176870748299, "grad_norm": 0.20679018253539813, "learning_rate": 4.998131431982826e-06, "loss": 1.0896, "step": 18 }, { "epoch": 0.05744520030234316, "grad_norm": 0.21159362728987222, "learning_rate": 4.9976351718668485e-06, "loss": 1.1191, "step": 19 }, { "epoch": 0.06046863189720333, "grad_norm": 0.19379985234830382, "learning_rate": 4.9970805670808174e-06, "loss": 1.1162, "step": 20 }, { "epoch": 0.06349206349206349, "grad_norm": 0.2039064731806591, "learning_rate": 4.9964676305802794e-06, "loss": 1.1155, "step": 21 }, { "epoch": 0.06651549508692366, "grad_norm": 0.22133580902562022, "learning_rate": 4.995796376683411e-06, "loss": 1.0603, "step": 22 }, { "epoch": 0.06953892668178382, "grad_norm": 0.24913058306438574, "learning_rate": 4.9950668210706795e-06, "loss": 1.0854, "step": 23 }, { "epoch": 0.07256235827664399, "grad_norm": 0.22434864947712013, "learning_rate": 4.994278980784478e-06, "loss": 1.0601, "step": 24 }, { "epoch": 0.07558578987150416, "grad_norm": 0.18349247230596857, "learning_rate": 4.9934328742287285e-06, "loss": 1.1042, "step": 25 }, { "epoch": 0.07860922146636433, "grad_norm": 0.1585429266996897, "learning_rate": 4.992528521168449e-06, "loss": 1.0409, "step": 26 }, { "epoch": 0.08163265306122448, "grad_norm": 0.16168593725598268, "learning_rate": 4.991565942729298e-06, "loss": 1.0341, "step": 27 }, { "epoch": 0.08465608465608465, "grad_norm": 0.19566832668054185, "learning_rate": 4.990545161397073e-06, "loss": 1.0689, "step": 28 }, { "epoch": 0.08767951625094482, "grad_norm": 0.2499930738278608, "learning_rate": 4.989466201017188e-06, "loss": 1.0096, "step": 29 }, { "epoch": 0.09070294784580499, "grad_norm": 0.2779488624344162, "learning_rate": 4.988329086794122e-06, "loss": 1.0609, "step": 30 }, { "epoch": 0.09372637944066516, "grad_norm": 0.2244846945907016, "learning_rate": 4.987133845290823e-06, "loss": 1.0366, "step": 31 }, { "epoch": 0.09674981103552532, "grad_norm": 0.17994766023159892, "learning_rate": 4.98588050442809e-06, "loss": 1.0314, "step": 32 }, { "epoch": 0.09977324263038549, "grad_norm": 0.22279237142259942, "learning_rate": 4.984569093483922e-06, "loss": 1.0445, "step": 33 }, { "epoch": 0.10279667422524566, "grad_norm": 0.2494526014297992, "learning_rate": 4.983199643092833e-06, "loss": 1.0344, "step": 34 }, { "epoch": 0.10582010582010581, "grad_norm": 0.21434458455232053, "learning_rate": 4.981772185245135e-06, "loss": 1.0421, "step": 35 }, { "epoch": 0.10884353741496598, "grad_norm": 0.18307769428152484, "learning_rate": 4.980286753286196e-06, "loss": 0.9864, "step": 36 }, { "epoch": 0.11186696900982615, "grad_norm": 0.21179293089346243, "learning_rate": 4.97874338191565e-06, "loss": 0.9842, "step": 37 }, { "epoch": 0.11489040060468632, "grad_norm": 0.23379777419897857, "learning_rate": 4.977142107186602e-06, "loss": 0.9955, "step": 38 }, { "epoch": 0.11791383219954649, "grad_norm": 0.20298340697744424, "learning_rate": 4.975482966504772e-06, "loss": 0.9957, "step": 39 }, { "epoch": 0.12093726379440665, "grad_norm": 0.22788321802784506, "learning_rate": 4.973765998627628e-06, "loss": 0.9909, "step": 40 }, { "epoch": 0.12396069538926682, "grad_norm": 0.22447377185154144, "learning_rate": 4.97199124366348e-06, "loss": 0.9995, "step": 41 }, { "epoch": 0.12698412698412698, "grad_norm": 0.19695029744427425, "learning_rate": 4.970158743070542e-06, "loss": 0.9781, "step": 42 }, { "epoch": 0.13000755857898716, "grad_norm": 0.178963231333608, "learning_rate": 4.9682685396559625e-06, "loss": 0.9779, "step": 43 }, { "epoch": 0.1330309901738473, "grad_norm": 0.1873471219218099, "learning_rate": 4.966320677574828e-06, "loss": 0.9796, "step": 44 }, { "epoch": 0.1360544217687075, "grad_norm": 0.22949932135410833, "learning_rate": 4.964315202329127e-06, "loss": 0.9965, "step": 45 }, { "epoch": 0.13907785336356765, "grad_norm": 0.2274052062281532, "learning_rate": 4.9622521607666936e-06, "loss": 0.9625, "step": 46 }, { "epoch": 0.1421012849584278, "grad_norm": 0.1806669455946557, "learning_rate": 4.960131601080104e-06, "loss": 0.9807, "step": 47 }, { "epoch": 0.14512471655328799, "grad_norm": 0.19467061044424094, "learning_rate": 4.957953572805558e-06, "loss": 0.9615, "step": 48 }, { "epoch": 0.14814814814814814, "grad_norm": 0.2731410757300855, "learning_rate": 4.9557181268217225e-06, "loss": 0.9819, "step": 49 }, { "epoch": 0.15117157974300832, "grad_norm": 0.19042718807008738, "learning_rate": 4.953425315348534e-06, "loss": 0.9547, "step": 50 }, { "epoch": 0.15419501133786848, "grad_norm": 0.16643927370098177, "learning_rate": 4.9510751919459895e-06, "loss": 0.9892, "step": 51 }, { "epoch": 0.15721844293272866, "grad_norm": 0.2524323083468839, "learning_rate": 4.94866781151289e-06, "loss": 1.0181, "step": 52 }, { "epoch": 0.1602418745275888, "grad_norm": 0.27545197371921265, "learning_rate": 4.946203230285558e-06, "loss": 0.9713, "step": 53 }, { "epoch": 0.16326530612244897, "grad_norm": 0.17013540947461778, "learning_rate": 4.943681505836523e-06, "loss": 1.0005, "step": 54 }, { "epoch": 0.16628873771730915, "grad_norm": 0.18283369295290966, "learning_rate": 4.941102697073181e-06, "loss": 0.9183, "step": 55 }, { "epoch": 0.1693121693121693, "grad_norm": 0.2189807492467087, "learning_rate": 4.938466864236413e-06, "loss": 0.9683, "step": 56 }, { "epoch": 0.17233560090702948, "grad_norm": 0.2766806847549335, "learning_rate": 4.935774068899184e-06, "loss": 0.958, "step": 57 }, { "epoch": 0.17535903250188964, "grad_norm": 0.2295270706172793, "learning_rate": 4.933024373965097e-06, "loss": 0.9399, "step": 58 }, { "epoch": 0.17838246409674982, "grad_norm": 0.20415845821236425, "learning_rate": 4.930217843666929e-06, "loss": 0.9677, "step": 59 }, { "epoch": 0.18140589569160998, "grad_norm": 0.18705886763979152, "learning_rate": 4.927354543565131e-06, "loss": 0.9453, "step": 60 }, { "epoch": 0.18442932728647016, "grad_norm": 0.25228689054978015, "learning_rate": 4.924434540546291e-06, "loss": 0.9639, "step": 61 }, { "epoch": 0.1874527588813303, "grad_norm": 0.2685784416971121, "learning_rate": 4.921457902821578e-06, "loss": 0.9561, "step": 62 }, { "epoch": 0.19047619047619047, "grad_norm": 0.24674154778238747, "learning_rate": 4.918424699925146e-06, "loss": 0.952, "step": 63 }, { "epoch": 0.19349962207105065, "grad_norm": 0.19937803912058571, "learning_rate": 4.915335002712506e-06, "loss": 0.9158, "step": 64 }, { "epoch": 0.1965230536659108, "grad_norm": 0.21943107617585558, "learning_rate": 4.912188883358879e-06, "loss": 0.9622, "step": 65 }, { "epoch": 0.19954648526077098, "grad_norm": 0.20789781104002328, "learning_rate": 4.9089864153575016e-06, "loss": 0.9432, "step": 66 }, { "epoch": 0.20256991685563114, "grad_norm": 0.21625333461538526, "learning_rate": 4.9057276735179134e-06, "loss": 0.9136, "step": 67 }, { "epoch": 0.20559334845049132, "grad_norm": 0.20774782340550482, "learning_rate": 4.902412733964212e-06, "loss": 0.9205, "step": 68 }, { "epoch": 0.20861678004535147, "grad_norm": 0.23205941698573587, "learning_rate": 4.899041674133266e-06, "loss": 0.9193, "step": 69 }, { "epoch": 0.21164021164021163, "grad_norm": 0.20096610581169602, "learning_rate": 4.895614572772916e-06, "loss": 0.9332, "step": 70 }, { "epoch": 0.2146636432350718, "grad_norm": 0.18733010074274722, "learning_rate": 4.89213150994013e-06, "loss": 0.9562, "step": 71 }, { "epoch": 0.21768707482993196, "grad_norm": 0.2131500035254074, "learning_rate": 4.888592566999134e-06, "loss": 0.978, "step": 72 }, { "epoch": 0.22071050642479215, "grad_norm": 0.25995206465303416, "learning_rate": 4.884997826619512e-06, "loss": 0.9615, "step": 73 }, { "epoch": 0.2237339380196523, "grad_norm": 0.20122899473383501, "learning_rate": 4.88134737277427e-06, "loss": 0.9223, "step": 74 }, { "epoch": 0.22675736961451248, "grad_norm": 0.20082627865414718, "learning_rate": 4.8776412907378845e-06, "loss": 0.9129, "step": 75 }, { "epoch": 0.22978080120937264, "grad_norm": 0.22559902896183986, "learning_rate": 4.873879667084301e-06, "loss": 0.9331, "step": 76 }, { "epoch": 0.2328042328042328, "grad_norm": 0.24097328648057836, "learning_rate": 4.870062589684917e-06, "loss": 0.9302, "step": 77 }, { "epoch": 0.23582766439909297, "grad_norm": 0.2191859905396367, "learning_rate": 4.866190147706525e-06, "loss": 0.906, "step": 78 }, { "epoch": 0.23885109599395313, "grad_norm": 0.1927603541449588, "learning_rate": 4.862262431609235e-06, "loss": 0.9158, "step": 79 }, { "epoch": 0.2418745275888133, "grad_norm": 0.20091846606347583, "learning_rate": 4.858279533144358e-06, "loss": 0.9241, "step": 80 }, { "epoch": 0.24489795918367346, "grad_norm": 0.19776572498006212, "learning_rate": 4.854241545352262e-06, "loss": 0.908, "step": 81 }, { "epoch": 0.24792139077853365, "grad_norm": 0.19142342325998066, "learning_rate": 4.8501485625602e-06, "loss": 0.9031, "step": 82 }, { "epoch": 0.2509448223733938, "grad_norm": 0.255824517812554, "learning_rate": 4.846000680380106e-06, "loss": 0.896, "step": 83 }, { "epoch": 0.25396825396825395, "grad_norm": 0.23838401037023174, "learning_rate": 4.841797995706362e-06, "loss": 0.9169, "step": 84 }, { "epoch": 0.25699168556311414, "grad_norm": 0.20594758086068155, "learning_rate": 4.837540606713538e-06, "loss": 0.9293, "step": 85 }, { "epoch": 0.2600151171579743, "grad_norm": 0.21813818048500913, "learning_rate": 4.833228612854088e-06, "loss": 0.9194, "step": 86 }, { "epoch": 0.26303854875283444, "grad_norm": 0.23454835369326738, "learning_rate": 4.828862114856038e-06, "loss": 0.9214, "step": 87 }, { "epoch": 0.2660619803476946, "grad_norm": 0.2204000662732641, "learning_rate": 4.824441214720629e-06, "loss": 0.907, "step": 88 }, { "epoch": 0.2690854119425548, "grad_norm": 0.2250848297991148, "learning_rate": 4.819966015719933e-06, "loss": 0.9032, "step": 89 }, { "epoch": 0.272108843537415, "grad_norm": 0.2535347696118056, "learning_rate": 4.815436622394442e-06, "loss": 0.9149, "step": 90 }, { "epoch": 0.2751322751322751, "grad_norm": 0.22450012032883543, "learning_rate": 4.810853140550625e-06, "loss": 0.9055, "step": 91 }, { "epoch": 0.2781557067271353, "grad_norm": 0.17386208282106705, "learning_rate": 4.806215677258456e-06, "loss": 0.8933, "step": 92 }, { "epoch": 0.2811791383219955, "grad_norm": 0.19053752177477154, "learning_rate": 4.801524340848917e-06, "loss": 0.8915, "step": 93 }, { "epoch": 0.2842025699168556, "grad_norm": 0.2725320545499666, "learning_rate": 4.796779240911461e-06, "loss": 0.9251, "step": 94 }, { "epoch": 0.2872260015117158, "grad_norm": 0.2386183196781376, "learning_rate": 4.791980488291457e-06, "loss": 0.8928, "step": 95 }, { "epoch": 0.29024943310657597, "grad_norm": 0.1817710733957378, "learning_rate": 4.787128195087596e-06, "loss": 0.9165, "step": 96 }, { "epoch": 0.29327286470143615, "grad_norm": 0.17308690210240787, "learning_rate": 4.782222474649279e-06, "loss": 0.887, "step": 97 }, { "epoch": 0.2962962962962963, "grad_norm": 0.2404735832702819, "learning_rate": 4.777263441573963e-06, "loss": 0.9012, "step": 98 }, { "epoch": 0.29931972789115646, "grad_norm": 0.28779677911496493, "learning_rate": 4.772251211704487e-06, "loss": 0.9016, "step": 99 }, { "epoch": 0.30234315948601664, "grad_norm": 0.15787837522906498, "learning_rate": 4.7671859021263635e-06, "loss": 0.9051, "step": 100 }, { "epoch": 0.30536659108087677, "grad_norm": 0.1575234808015298, "learning_rate": 4.762067631165049e-06, "loss": 0.8917, "step": 101 }, { "epoch": 0.30839002267573695, "grad_norm": 0.17558403452861931, "learning_rate": 4.756896518383173e-06, "loss": 0.9174, "step": 102 }, { "epoch": 0.31141345427059713, "grad_norm": 0.28974349430226604, "learning_rate": 4.751672684577747e-06, "loss": 0.8929, "step": 103 }, { "epoch": 0.3144368858654573, "grad_norm": 0.24411092218088543, "learning_rate": 4.746396251777348e-06, "loss": 0.8811, "step": 104 }, { "epoch": 0.31746031746031744, "grad_norm": 0.16801064806045637, "learning_rate": 4.74106734323926e-06, "loss": 0.8758, "step": 105 }, { "epoch": 0.3204837490551776, "grad_norm": 0.19248014461061233, "learning_rate": 4.7356860834466e-06, "loss": 0.9103, "step": 106 }, { "epoch": 0.3235071806500378, "grad_norm": 0.27209908752286666, "learning_rate": 4.730252598105407e-06, "loss": 0.8843, "step": 107 }, { "epoch": 0.32653061224489793, "grad_norm": 0.2293714752972601, "learning_rate": 4.72476701414171e-06, "loss": 0.9231, "step": 108 }, { "epoch": 0.3295540438397581, "grad_norm": 0.18392800235656956, "learning_rate": 4.7192294596985564e-06, "loss": 0.8552, "step": 109 }, { "epoch": 0.3325774754346183, "grad_norm": 0.1893627518175467, "learning_rate": 4.7136400641330245e-06, "loss": 0.8811, "step": 110 }, { "epoch": 0.3356009070294785, "grad_norm": 0.27532406651290064, "learning_rate": 4.7079989580132005e-06, "loss": 0.9032, "step": 111 }, { "epoch": 0.3386243386243386, "grad_norm": 0.21281637805817608, "learning_rate": 4.702306273115122e-06, "loss": 0.8731, "step": 112 }, { "epoch": 0.3416477702191988, "grad_norm": 0.21685692387167585, "learning_rate": 4.696562142419712e-06, "loss": 0.8713, "step": 113 }, { "epoch": 0.34467120181405897, "grad_norm": 0.27021306476550466, "learning_rate": 4.690766700109659e-06, "loss": 0.88, "step": 114 }, { "epoch": 0.3476946334089191, "grad_norm": 0.23439835580439225, "learning_rate": 4.684920081566295e-06, "loss": 0.8814, "step": 115 }, { "epoch": 0.3507180650037793, "grad_norm": 0.21025681348048122, "learning_rate": 4.679022423366424e-06, "loss": 0.8535, "step": 116 }, { "epoch": 0.35374149659863946, "grad_norm": 0.21924118290065314, "learning_rate": 4.673073863279133e-06, "loss": 0.8869, "step": 117 }, { "epoch": 0.35676492819349964, "grad_norm": 0.2875708297089177, "learning_rate": 4.667074540262577e-06, "loss": 0.8646, "step": 118 }, { "epoch": 0.35978835978835977, "grad_norm": 0.20014737080144987, "learning_rate": 4.661024594460733e-06, "loss": 0.8718, "step": 119 }, { "epoch": 0.36281179138321995, "grad_norm": 0.19119381829230253, "learning_rate": 4.654924167200124e-06, "loss": 0.8683, "step": 120 }, { "epoch": 0.36583522297808013, "grad_norm": 0.2655620248145862, "learning_rate": 4.648773400986513e-06, "loss": 0.8655, "step": 121 }, { "epoch": 0.3688586545729403, "grad_norm": 0.25081787812962225, "learning_rate": 4.6425724395015865e-06, "loss": 0.8582, "step": 122 }, { "epoch": 0.37188208616780044, "grad_norm": 0.2146047325963571, "learning_rate": 4.636321427599586e-06, "loss": 0.8893, "step": 123 }, { "epoch": 0.3749055177626606, "grad_norm": 0.2309806267470169, "learning_rate": 4.63002051130393e-06, "loss": 0.8486, "step": 124 }, { "epoch": 0.3779289493575208, "grad_norm": 0.27736367362748365, "learning_rate": 4.623669837803803e-06, "loss": 0.8687, "step": 125 }, { "epoch": 0.38095238095238093, "grad_norm": 0.2224756513405458, "learning_rate": 4.617269555450715e-06, "loss": 0.8825, "step": 126 }, { "epoch": 0.3839758125472411, "grad_norm": 0.17936830170379472, "learning_rate": 4.610819813755038e-06, "loss": 0.8546, "step": 127 }, { "epoch": 0.3869992441421013, "grad_norm": 0.18923636586433076, "learning_rate": 4.604320763382512e-06, "loss": 0.87, "step": 128 }, { "epoch": 0.3900226757369615, "grad_norm": 0.18724186374787236, "learning_rate": 4.597772556150724e-06, "loss": 0.8676, "step": 129 }, { "epoch": 0.3930461073318216, "grad_norm": 0.2914426770268331, "learning_rate": 4.591175345025567e-06, "loss": 0.8799, "step": 130 }, { "epoch": 0.3960695389266818, "grad_norm": 0.23506817928141502, "learning_rate": 4.584529284117662e-06, "loss": 0.8895, "step": 131 }, { "epoch": 0.39909297052154197, "grad_norm": 0.19429487340998514, "learning_rate": 4.5778345286787575e-06, "loss": 0.8272, "step": 132 }, { "epoch": 0.4021164021164021, "grad_norm": 0.24906142354962724, "learning_rate": 4.5710912350981066e-06, "loss": 0.8647, "step": 133 }, { "epoch": 0.4051398337112623, "grad_norm": 0.25795927507557026, "learning_rate": 4.56429956089881e-06, "loss": 0.8653, "step": 134 }, { "epoch": 0.40816326530612246, "grad_norm": 0.18224019982541997, "learning_rate": 4.5574596647341414e-06, "loss": 0.8555, "step": 135 }, { "epoch": 0.41118669690098264, "grad_norm": 0.20473182208619398, "learning_rate": 4.550571706383833e-06, "loss": 0.8664, "step": 136 }, { "epoch": 0.41421012849584277, "grad_norm": 0.22168708013084754, "learning_rate": 4.543635846750351e-06, "loss": 0.8515, "step": 137 }, { "epoch": 0.41723356009070295, "grad_norm": 0.21632029243557258, "learning_rate": 4.536652247855133e-06, "loss": 0.8619, "step": 138 }, { "epoch": 0.42025699168556313, "grad_norm": 0.1920055931208493, "learning_rate": 4.529621072834805e-06, "loss": 0.8566, "step": 139 }, { "epoch": 0.42328042328042326, "grad_norm": 0.1880614895437287, "learning_rate": 4.522542485937369e-06, "loss": 0.8243, "step": 140 }, { "epoch": 0.42630385487528344, "grad_norm": 0.25600769805101486, "learning_rate": 4.515416652518366e-06, "loss": 0.8551, "step": 141 }, { "epoch": 0.4293272864701436, "grad_norm": 0.2034314626277561, "learning_rate": 4.508243739037016e-06, "loss": 0.8603, "step": 142 }, { "epoch": 0.4323507180650038, "grad_norm": 0.23508415301120186, "learning_rate": 4.501023913052326e-06, "loss": 0.8826, "step": 143 }, { "epoch": 0.43537414965986393, "grad_norm": 0.2775448226015208, "learning_rate": 4.4937573432191766e-06, "loss": 0.8764, "step": 144 }, { "epoch": 0.4383975812547241, "grad_norm": 0.24618223106362153, "learning_rate": 4.486444199284386e-06, "loss": 0.8973, "step": 145 }, { "epoch": 0.4414210128495843, "grad_norm": 0.23424108283949535, "learning_rate": 4.47908465208274e-06, "loss": 0.8736, "step": 146 }, { "epoch": 0.4444444444444444, "grad_norm": 0.22742376996470443, "learning_rate": 4.471678873533002e-06, "loss": 0.8581, "step": 147 }, { "epoch": 0.4474678760393046, "grad_norm": 0.24653243269473768, "learning_rate": 4.464227036633901e-06, "loss": 0.8489, "step": 148 }, { "epoch": 0.4504913076341648, "grad_norm": 0.2408835452466121, "learning_rate": 4.456729315460084e-06, "loss": 0.8637, "step": 149 }, { "epoch": 0.45351473922902497, "grad_norm": 0.20149761505503935, "learning_rate": 4.449185885158056e-06, "loss": 0.8671, "step": 150 }, { "epoch": 0.4565381708238851, "grad_norm": 0.19127590785183332, "learning_rate": 4.4415969219420846e-06, "loss": 0.8792, "step": 151 }, { "epoch": 0.4595616024187453, "grad_norm": 0.22390628054581238, "learning_rate": 4.433962603090083e-06, "loss": 0.8468, "step": 152 }, { "epoch": 0.46258503401360546, "grad_norm": 0.2957253215613366, "learning_rate": 4.426283106939474e-06, "loss": 0.8268, "step": 153 }, { "epoch": 0.4656084656084656, "grad_norm": 0.20506648122584112, "learning_rate": 4.418558612883016e-06, "loss": 0.8772, "step": 154 }, { "epoch": 0.46863189720332576, "grad_norm": 0.18636265474604682, "learning_rate": 4.410789301364621e-06, "loss": 0.858, "step": 155 }, { "epoch": 0.47165532879818595, "grad_norm": 0.2674232446173923, "learning_rate": 4.402975353875134e-06, "loss": 0.8683, "step": 156 }, { "epoch": 0.47467876039304613, "grad_norm": 0.2747499333038218, "learning_rate": 4.3951169529480934e-06, "loss": 0.8439, "step": 157 }, { "epoch": 0.47770219198790626, "grad_norm": 0.18463338955505504, "learning_rate": 4.3872142821554695e-06, "loss": 0.8321, "step": 158 }, { "epoch": 0.48072562358276644, "grad_norm": 0.19683973897761153, "learning_rate": 4.379267526103374e-06, "loss": 0.8378, "step": 159 }, { "epoch": 0.4837490551776266, "grad_norm": 0.23093724944543254, "learning_rate": 4.3712768704277535e-06, "loss": 0.8342, "step": 160 }, { "epoch": 0.48677248677248675, "grad_norm": 0.25457828536678356, "learning_rate": 4.36324250179004e-06, "loss": 0.8438, "step": 161 }, { "epoch": 0.4897959183673469, "grad_norm": 0.2341347444247441, "learning_rate": 4.355164607872806e-06, "loss": 0.874, "step": 162 }, { "epoch": 0.4928193499622071, "grad_norm": 0.19832386653308293, "learning_rate": 4.347043377375369e-06, "loss": 0.8871, "step": 163 }, { "epoch": 0.4958427815570673, "grad_norm": 0.23548674821464477, "learning_rate": 4.338879000009389e-06, "loss": 0.8571, "step": 164 }, { "epoch": 0.4988662131519274, "grad_norm": 0.2564635876122362, "learning_rate": 4.3306716664944345e-06, "loss": 0.8441, "step": 165 }, { "epoch": 0.5018896447467877, "grad_norm": 0.22937827244764553, "learning_rate": 4.322421568553529e-06, "loss": 0.8435, "step": 166 }, { "epoch": 0.5049130763416477, "grad_norm": 0.20546938114609037, "learning_rate": 4.314128898908672e-06, "loss": 0.8427, "step": 167 }, { "epoch": 0.5079365079365079, "grad_norm": 0.24461216551872245, "learning_rate": 4.305793851276335e-06, "loss": 0.8488, "step": 168 }, { "epoch": 0.5109599395313681, "grad_norm": 0.2280451372713774, "learning_rate": 4.297416620362939e-06, "loss": 0.8493, "step": 169 }, { "epoch": 0.5139833711262283, "grad_norm": 0.2202142714476725, "learning_rate": 4.288997401860303e-06, "loss": 0.8514, "step": 170 }, { "epoch": 0.5170068027210885, "grad_norm": 0.2426775141297586, "learning_rate": 4.280536392441078e-06, "loss": 0.8501, "step": 171 }, { "epoch": 0.5200302343159486, "grad_norm": 0.1998543423805206, "learning_rate": 4.272033789754146e-06, "loss": 0.8313, "step": 172 }, { "epoch": 0.5230536659108088, "grad_norm": 0.1847895892138973, "learning_rate": 4.263489792420008e-06, "loss": 0.8195, "step": 173 }, { "epoch": 0.5260770975056689, "grad_norm": 0.23817124539909545, "learning_rate": 4.254904600026143e-06, "loss": 0.8581, "step": 174 }, { "epoch": 0.5291005291005291, "grad_norm": 0.2575742303999011, "learning_rate": 4.246278413122344e-06, "loss": 0.8511, "step": 175 }, { "epoch": 0.5321239606953893, "grad_norm": 0.22609359204972732, "learning_rate": 4.2376114332160325e-06, "loss": 0.843, "step": 176 }, { "epoch": 0.5351473922902494, "grad_norm": 0.22696322689045012, "learning_rate": 4.2289038627675585e-06, "loss": 0.833, "step": 177 }, { "epoch": 0.5381708238851096, "grad_norm": 0.2083064134180325, "learning_rate": 4.220155905185461e-06, "loss": 0.8707, "step": 178 }, { "epoch": 0.5411942554799698, "grad_norm": 0.2188998951871127, "learning_rate": 4.211367764821722e-06, "loss": 0.8756, "step": 179 }, { "epoch": 0.54421768707483, "grad_norm": 0.21174182945781866, "learning_rate": 4.202539646966993e-06, "loss": 0.8431, "step": 180 }, { "epoch": 0.54724111866969, "grad_norm": 0.26921219919236117, "learning_rate": 4.193671757845797e-06, "loss": 0.8346, "step": 181 }, { "epoch": 0.5502645502645502, "grad_norm": 0.2410488610748255, "learning_rate": 4.184764304611715e-06, "loss": 0.8323, "step": 182 }, { "epoch": 0.5532879818594104, "grad_norm": 0.19188924232191892, "learning_rate": 4.17581749534254e-06, "loss": 0.8275, "step": 183 }, { "epoch": 0.5563114134542706, "grad_norm": 0.24965929389660024, "learning_rate": 4.166831539035423e-06, "loss": 0.8558, "step": 184 }, { "epoch": 0.5593348450491308, "grad_norm": 0.2715497253670651, "learning_rate": 4.1578066456019885e-06, "loss": 0.8667, "step": 185 }, { "epoch": 0.562358276643991, "grad_norm": 0.19906288449082996, "learning_rate": 4.148743025863432e-06, "loss": 0.8535, "step": 186 }, { "epoch": 0.5653817082388511, "grad_norm": 0.22076525732705374, "learning_rate": 4.139640891545591e-06, "loss": 0.8296, "step": 187 }, { "epoch": 0.5684051398337112, "grad_norm": 0.25483531753570576, "learning_rate": 4.130500455274005e-06, "loss": 0.8355, "step": 188 }, { "epoch": 0.5714285714285714, "grad_norm": 0.24421069561222894, "learning_rate": 4.121321930568946e-06, "loss": 0.8357, "step": 189 }, { "epoch": 0.5744520030234316, "grad_norm": 0.20339394657166124, "learning_rate": 4.112105531840427e-06, "loss": 0.8357, "step": 190 }, { "epoch": 0.5774754346182918, "grad_norm": 0.24233770822338466, "learning_rate": 4.1028514743832e-06, "loss": 0.8313, "step": 191 }, { "epoch": 0.5804988662131519, "grad_norm": 0.2829777666494022, "learning_rate": 4.093559974371725e-06, "loss": 0.8378, "step": 192 }, { "epoch": 0.5835222978080121, "grad_norm": 0.1699407087734907, "learning_rate": 4.084231248855113e-06, "loss": 0.8208, "step": 193 }, { "epoch": 0.5865457294028723, "grad_norm": 0.17498689950665328, "learning_rate": 4.074865515752068e-06, "loss": 0.838, "step": 194 }, { "epoch": 0.5895691609977324, "grad_norm": 0.2475691965670073, "learning_rate": 4.065462993845785e-06, "loss": 0.849, "step": 195 }, { "epoch": 0.5925925925925926, "grad_norm": 0.24997313540826083, "learning_rate": 4.056023902778846e-06, "loss": 0.8229, "step": 196 }, { "epoch": 0.5956160241874527, "grad_norm": 0.19976933217581305, "learning_rate": 4.046548463048089e-06, "loss": 0.8301, "step": 197 }, { "epoch": 0.5986394557823129, "grad_norm": 0.24028559185538167, "learning_rate": 4.037036895999453e-06, "loss": 0.8462, "step": 198 }, { "epoch": 0.6016628873771731, "grad_norm": 0.27335949880058813, "learning_rate": 4.0274894238228115e-06, "loss": 0.8364, "step": 199 }, { "epoch": 0.6046863189720333, "grad_norm": 0.18909543268493909, "learning_rate": 4.017906269546778e-06, "loss": 0.8083, "step": 200 }, { "epoch": 0.6077097505668935, "grad_norm": 0.20724602824279856, "learning_rate": 4.0082876570335025e-06, "loss": 0.8193, "step": 201 }, { "epoch": 0.6107331821617535, "grad_norm": 0.26651899455610345, "learning_rate": 3.9986338109734354e-06, "loss": 0.8299, "step": 202 }, { "epoch": 0.6137566137566137, "grad_norm": 0.20515478118259406, "learning_rate": 3.988944956880082e-06, "loss": 0.8323, "step": 203 }, { "epoch": 0.6167800453514739, "grad_norm": 0.1823781343576012, "learning_rate": 3.979221321084734e-06, "loss": 0.8224, "step": 204 }, { "epoch": 0.6198034769463341, "grad_norm": 0.19460227890197035, "learning_rate": 3.969463130731183e-06, "loss": 0.8243, "step": 205 }, { "epoch": 0.6228269085411943, "grad_norm": 0.25256274653870814, "learning_rate": 3.959670613770414e-06, "loss": 0.834, "step": 206 }, { "epoch": 0.6258503401360545, "grad_norm": 0.2099371278262912, "learning_rate": 3.949843998955279e-06, "loss": 0.8001, "step": 207 }, { "epoch": 0.6288737717309146, "grad_norm": 0.18831071399800087, "learning_rate": 3.939983515835157e-06, "loss": 0.846, "step": 208 }, { "epoch": 0.6318972033257747, "grad_norm": 0.20326222391630303, "learning_rate": 3.9300893947505865e-06, "loss": 0.813, "step": 209 }, { "epoch": 0.6349206349206349, "grad_norm": 0.28946931059014386, "learning_rate": 3.92016186682789e-06, "loss": 0.8252, "step": 210 }, { "epoch": 0.6379440665154951, "grad_norm": 0.20146394091804065, "learning_rate": 3.9102011639737715e-06, "loss": 0.8273, "step": 211 }, { "epoch": 0.6409674981103552, "grad_norm": 0.16554710439809656, "learning_rate": 3.900207518869901e-06, "loss": 0.8294, "step": 212 }, { "epoch": 0.6439909297052154, "grad_norm": 0.19154551239872575, "learning_rate": 3.890181164967476e-06, "loss": 0.8331, "step": 213 }, { "epoch": 0.6470143613000756, "grad_norm": 0.2863695398034112, "learning_rate": 3.880122336481774e-06, "loss": 0.8156, "step": 214 }, { "epoch": 0.6500377928949358, "grad_norm": 0.21052777788511692, "learning_rate": 3.870031268386676e-06, "loss": 0.7963, "step": 215 }, { "epoch": 0.6530612244897959, "grad_norm": 0.1566104119157067, "learning_rate": 3.859908196409177e-06, "loss": 0.8247, "step": 216 }, { "epoch": 0.656084656084656, "grad_norm": 0.17376065755010325, "learning_rate": 3.849753357023885e-06, "loss": 0.8412, "step": 217 }, { "epoch": 0.6591080876795162, "grad_norm": 0.2775570184417396, "learning_rate": 3.839566987447492e-06, "loss": 0.8444, "step": 218 }, { "epoch": 0.6621315192743764, "grad_norm": 0.3002446727716999, "learning_rate": 3.829349325633233e-06, "loss": 0.8353, "step": 219 }, { "epoch": 0.6651549508692366, "grad_norm": 0.17501583537193782, "learning_rate": 3.819100610265332e-06, "loss": 0.8406, "step": 220 }, { "epoch": 0.6681783824640968, "grad_norm": 0.16018543435725524, "learning_rate": 3.8088210807534185e-06, "loss": 0.8143, "step": 221 }, { "epoch": 0.671201814058957, "grad_norm": 0.26632239617155334, "learning_rate": 3.7985109772269435e-06, "loss": 0.8099, "step": 222 }, { "epoch": 0.674225245653817, "grad_norm": 0.2502372675648549, "learning_rate": 3.7881705405295623e-06, "loss": 0.828, "step": 223 }, { "epoch": 0.6772486772486772, "grad_norm": 0.21825897588135384, "learning_rate": 3.777800012213514e-06, "loss": 0.8246, "step": 224 }, { "epoch": 0.6802721088435374, "grad_norm": 0.27497686942905814, "learning_rate": 3.767399634533976e-06, "loss": 0.8131, "step": 225 }, { "epoch": 0.6832955404383976, "grad_norm": 0.22856597196018685, "learning_rate": 3.756969650443408e-06, "loss": 0.8098, "step": 226 }, { "epoch": 0.6863189720332578, "grad_norm": 0.21059170940590144, "learning_rate": 3.7465103035858718e-06, "loss": 0.8187, "step": 227 }, { "epoch": 0.6893424036281179, "grad_norm": 0.2289160214691356, "learning_rate": 3.7360218382913426e-06, "loss": 0.8265, "step": 228 }, { "epoch": 0.6923658352229781, "grad_norm": 0.22771294742249917, "learning_rate": 3.7255044995700024e-06, "loss": 0.8063, "step": 229 }, { "epoch": 0.6953892668178382, "grad_norm": 0.220912987205476, "learning_rate": 3.714958533106515e-06, "loss": 0.8141, "step": 230 }, { "epoch": 0.6984126984126984, "grad_norm": 0.2331093248404988, "learning_rate": 3.7043841852542884e-06, "loss": 0.7967, "step": 231 }, { "epoch": 0.7014361300075586, "grad_norm": 0.24044315675315245, "learning_rate": 3.6937817030297164e-06, "loss": 0.8202, "step": 232 }, { "epoch": 0.7044595616024187, "grad_norm": 0.17808063026487772, "learning_rate": 3.6831513341064128e-06, "loss": 0.824, "step": 233 }, { "epoch": 0.7074829931972789, "grad_norm": 0.1686282272216412, "learning_rate": 3.672493326809422e-06, "loss": 0.8265, "step": 234 }, { "epoch": 0.7105064247921391, "grad_norm": 0.2620354561369418, "learning_rate": 3.661807930109422e-06, "loss": 0.8156, "step": 235 }, { "epoch": 0.7135298563869993, "grad_norm": 0.325482330440253, "learning_rate": 3.651095393616904e-06, "loss": 0.828, "step": 236 }, { "epoch": 0.7165532879818595, "grad_norm": 0.15080114640909387, "learning_rate": 3.6403559675763457e-06, "loss": 0.7995, "step": 237 }, { "epoch": 0.7195767195767195, "grad_norm": 0.14745127928311055, "learning_rate": 3.629589902860363e-06, "loss": 0.8087, "step": 238 }, { "epoch": 0.7226001511715797, "grad_norm": 0.2799111726866219, "learning_rate": 3.6187974509638496e-06, "loss": 0.8176, "step": 239 }, { "epoch": 0.7256235827664399, "grad_norm": 0.2502547915239206, "learning_rate": 3.607978863998104e-06, "loss": 0.8064, "step": 240 }, { "epoch": 0.7286470143613001, "grad_norm": 0.13777657856560566, "learning_rate": 3.5971343946849374e-06, "loss": 0.8178, "step": 241 }, { "epoch": 0.7316704459561603, "grad_norm": 0.1385328283480905, "learning_rate": 3.586264296350775e-06, "loss": 0.8027, "step": 242 }, { "epoch": 0.7346938775510204, "grad_norm": 0.17341004642304678, "learning_rate": 3.57536882292073e-06, "loss": 0.8096, "step": 243 }, { "epoch": 0.7377173091458806, "grad_norm": 0.3691916406878038, "learning_rate": 3.564448228912682e-06, "loss": 0.8338, "step": 244 }, { "epoch": 0.7407407407407407, "grad_norm": 0.21689653213933718, "learning_rate": 3.5535027694313233e-06, "loss": 0.7977, "step": 245 }, { "epoch": 0.7437641723356009, "grad_norm": 0.16595312089208156, "learning_rate": 3.5425327001622034e-06, "loss": 0.7987, "step": 246 }, { "epoch": 0.7467876039304611, "grad_norm": 0.21979225164562236, "learning_rate": 3.5315382773657563e-06, "loss": 0.8181, "step": 247 }, { "epoch": 0.7498110355253212, "grad_norm": 0.31450056661452935, "learning_rate": 3.520519757871313e-06, "loss": 0.8128, "step": 248 }, { "epoch": 0.7528344671201814, "grad_norm": 0.155403218509628, "learning_rate": 3.5094773990711024e-06, "loss": 0.807, "step": 249 }, { "epoch": 0.7558578987150416, "grad_norm": 0.14490425331756726, "learning_rate": 3.4984114589142388e-06, "loss": 0.7883, "step": 250 }, { "epoch": 0.7588813303099018, "grad_norm": 0.21380341083079393, "learning_rate": 3.4873221959006973e-06, "loss": 0.8162, "step": 251 }, { "epoch": 0.7619047619047619, "grad_norm": 0.35920542267660566, "learning_rate": 3.476209869075273e-06, "loss": 0.7852, "step": 252 }, { "epoch": 0.764928193499622, "grad_norm": 0.14693329979199346, "learning_rate": 3.4650747380215296e-06, "loss": 0.8164, "step": 253 }, { "epoch": 0.7679516250944822, "grad_norm": 0.2613621433404773, "learning_rate": 3.4539170628557383e-06, "loss": 0.8083, "step": 254 }, { "epoch": 0.7709750566893424, "grad_norm": 0.3665112092678806, "learning_rate": 3.442737104220801e-06, "loss": 0.8181, "step": 255 }, { "epoch": 0.7739984882842026, "grad_norm": 0.16067983638579006, "learning_rate": 3.4315351232801597e-06, "loss": 0.8162, "step": 256 }, { "epoch": 0.7770219198790628, "grad_norm": 0.24580578582443013, "learning_rate": 3.4203113817116955e-06, "loss": 0.8199, "step": 257 }, { "epoch": 0.780045351473923, "grad_norm": 0.331248956918326, "learning_rate": 3.409066141701618e-06, "loss": 0.7913, "step": 258 }, { "epoch": 0.783068783068783, "grad_norm": 0.16426278470075412, "learning_rate": 3.3977996659383396e-06, "loss": 0.8166, "step": 259 }, { "epoch": 0.7860922146636432, "grad_norm": 0.2057865252683302, "learning_rate": 3.386512217606339e-06, "loss": 0.8018, "step": 260 }, { "epoch": 0.7891156462585034, "grad_norm": 0.3793459602253602, "learning_rate": 3.3752040603800148e-06, "loss": 0.8243, "step": 261 }, { "epoch": 0.7921390778533636, "grad_norm": 0.14811638555402215, "learning_rate": 3.3638754584175222e-06, "loss": 0.8144, "step": 262 }, { "epoch": 0.7951625094482238, "grad_norm": 0.3237839618432774, "learning_rate": 3.352526676354606e-06, "loss": 0.7933, "step": 263 }, { "epoch": 0.7981859410430839, "grad_norm": 0.21169351866452582, "learning_rate": 3.3411579792984178e-06, "loss": 0.8125, "step": 264 }, { "epoch": 0.8012093726379441, "grad_norm": 0.14502913140221696, "learning_rate": 3.3297696328213215e-06, "loss": 0.7919, "step": 265 }, { "epoch": 0.8042328042328042, "grad_norm": 0.130046065883626, "learning_rate": 3.318361902954692e-06, "loss": 0.7925, "step": 266 }, { "epoch": 0.8072562358276644, "grad_norm": 0.1806023890937921, "learning_rate": 3.3069350561826997e-06, "loss": 0.7977, "step": 267 }, { "epoch": 0.8102796674225246, "grad_norm": 0.3661239179855748, "learning_rate": 3.295489359436083e-06, "loss": 0.8121, "step": 268 }, { "epoch": 0.8133030990173847, "grad_norm": 0.15684544823299335, "learning_rate": 3.2840250800859185e-06, "loss": 0.8439, "step": 269 }, { "epoch": 0.8163265306122449, "grad_norm": 0.1442117724504863, "learning_rate": 3.272542485937369e-06, "loss": 0.8205, "step": 270 }, { "epoch": 0.8193499622071051, "grad_norm": 0.1630144971387636, "learning_rate": 3.2610418452234315e-06, "loss": 0.8116, "step": 271 }, { "epoch": 0.8223733938019653, "grad_norm": 0.2272302138625313, "learning_rate": 3.249523426598669e-06, "loss": 0.7889, "step": 272 }, { "epoch": 0.8253968253968254, "grad_norm": 0.2630488611954438, "learning_rate": 3.2379874991329374e-06, "loss": 0.8101, "step": 273 }, { "epoch": 0.8284202569916855, "grad_norm": 0.1636882510390679, "learning_rate": 3.2264343323050985e-06, "loss": 0.8067, "step": 274 }, { "epoch": 0.8314436885865457, "grad_norm": 0.1800718434777349, "learning_rate": 3.214864195996723e-06, "loss": 0.8267, "step": 275 }, { "epoch": 0.8344671201814059, "grad_norm": 0.27772170659214646, "learning_rate": 3.2032773604857915e-06, "loss": 0.8021, "step": 276 }, { "epoch": 0.8374905517762661, "grad_norm": 0.2524388193093376, "learning_rate": 3.1916740964403736e-06, "loss": 0.8067, "step": 277 }, { "epoch": 0.8405139833711263, "grad_norm": 0.18970600852145528, "learning_rate": 3.1800546749123108e-06, "loss": 0.8073, "step": 278 }, { "epoch": 0.8435374149659864, "grad_norm": 0.19923073362072904, "learning_rate": 3.168419367330883e-06, "loss": 0.799, "step": 279 }, { "epoch": 0.8465608465608465, "grad_norm": 0.25436094223895794, "learning_rate": 3.1567684454964674e-06, "loss": 0.8041, "step": 280 }, { "epoch": 0.8495842781557067, "grad_norm": 0.21128266721448266, "learning_rate": 3.14510218157419e-06, "loss": 0.8113, "step": 281 }, { "epoch": 0.8526077097505669, "grad_norm": 0.22163072880133364, "learning_rate": 3.133420848087566e-06, "loss": 0.7889, "step": 282 }, { "epoch": 0.8556311413454271, "grad_norm": 0.22883591781527274, "learning_rate": 3.121724717912138e-06, "loss": 0.7917, "step": 283 }, { "epoch": 0.8586545729402872, "grad_norm": 0.2032672012417271, "learning_rate": 3.110014064269094e-06, "loss": 0.8032, "step": 284 }, { "epoch": 0.8616780045351474, "grad_norm": 0.1740199158625731, "learning_rate": 3.0982891607188948e-06, "loss": 0.7827, "step": 285 }, { "epoch": 0.8647014361300076, "grad_norm": 0.18106353392739202, "learning_rate": 3.0865502811548755e-06, "loss": 0.7896, "step": 286 }, { "epoch": 0.8677248677248677, "grad_norm": 0.2292881686201471, "learning_rate": 3.0747976997968513e-06, "loss": 0.8159, "step": 287 }, { "epoch": 0.8707482993197279, "grad_norm": 0.27476966438745903, "learning_rate": 3.0630316911847112e-06, "loss": 0.7938, "step": 288 }, { "epoch": 0.873771730914588, "grad_norm": 0.21250803524552264, "learning_rate": 3.051252530172003e-06, "loss": 0.7912, "step": 289 }, { "epoch": 0.8767951625094482, "grad_norm": 0.20109882386036412, "learning_rate": 3.039460491919516e-06, "loss": 0.8005, "step": 290 }, { "epoch": 0.8798185941043084, "grad_norm": 0.22987450725486983, "learning_rate": 3.0276558518888496e-06, "loss": 0.8081, "step": 291 }, { "epoch": 0.8828420256991686, "grad_norm": 0.20495650915854588, "learning_rate": 3.015838885835981e-06, "loss": 0.8115, "step": 292 }, { "epoch": 0.8858654572940288, "grad_norm": 0.17141615072214778, "learning_rate": 3.0040098698048232e-06, "loss": 0.7813, "step": 293 }, { "epoch": 0.8888888888888888, "grad_norm": 0.18881546824196338, "learning_rate": 2.992169080120776e-06, "loss": 0.8113, "step": 294 }, { "epoch": 0.891912320483749, "grad_norm": 0.20261508334609984, "learning_rate": 2.9803167933842712e-06, "loss": 0.7993, "step": 295 }, { "epoch": 0.8949357520786092, "grad_norm": 0.2637865639683421, "learning_rate": 2.9684532864643123e-06, "loss": 0.8025, "step": 296 }, { "epoch": 0.8979591836734694, "grad_norm": 0.20588016874386464, "learning_rate": 2.9565788364920034e-06, "loss": 0.7869, "step": 297 }, { "epoch": 0.9009826152683296, "grad_norm": 0.1838418464531271, "learning_rate": 2.944693720854081e-06, "loss": 0.7976, "step": 298 }, { "epoch": 0.9040060468631897, "grad_norm": 0.2238627689541774, "learning_rate": 2.932798217186429e-06, "loss": 0.7886, "step": 299 }, { "epoch": 0.9070294784580499, "grad_norm": 0.2223361558094008, "learning_rate": 2.920892603367596e-06, "loss": 0.8163, "step": 300 }, { "epoch": 0.91005291005291, "grad_norm": 0.1664138917818463, "learning_rate": 2.908977157512305e-06, "loss": 0.7859, "step": 301 }, { "epoch": 0.9130763416477702, "grad_norm": 0.218098712406248, "learning_rate": 2.897052157964952e-06, "loss": 0.818, "step": 302 }, { "epoch": 0.9160997732426304, "grad_norm": 0.25476932805817953, "learning_rate": 2.8851178832931076e-06, "loss": 0.7936, "step": 303 }, { "epoch": 0.9191232048374905, "grad_norm": 0.20454797870655053, "learning_rate": 2.8731746122810105e-06, "loss": 0.8009, "step": 304 }, { "epoch": 0.9221466364323507, "grad_norm": 0.2171163509058848, "learning_rate": 2.8612226239230536e-06, "loss": 0.8012, "step": 305 }, { "epoch": 0.9251700680272109, "grad_norm": 0.3201406418230194, "learning_rate": 2.8492621974172653e-06, "loss": 0.8347, "step": 306 }, { "epoch": 0.9281934996220711, "grad_norm": 0.20044446217181253, "learning_rate": 2.8372936121587895e-06, "loss": 0.8066, "step": 307 }, { "epoch": 0.9312169312169312, "grad_norm": 0.16283549638272465, "learning_rate": 2.8253171477333585e-06, "loss": 0.8049, "step": 308 }, { "epoch": 0.9342403628117913, "grad_norm": 0.20912249423273097, "learning_rate": 2.813333083910761e-06, "loss": 0.8112, "step": 309 }, { "epoch": 0.9372637944066515, "grad_norm": 0.28501513792396893, "learning_rate": 2.8013417006383078e-06, "loss": 0.8033, "step": 310 }, { "epoch": 0.9402872260015117, "grad_norm": 0.17569005132324075, "learning_rate": 2.7893432780342928e-06, "loss": 0.7905, "step": 311 }, { "epoch": 0.9433106575963719, "grad_norm": 0.1707451012967817, "learning_rate": 2.7773380963814454e-06, "loss": 0.7992, "step": 312 }, { "epoch": 0.9463340891912321, "grad_norm": 0.23658188962283105, "learning_rate": 2.76532643612039e-06, "loss": 0.7959, "step": 313 }, { "epoch": 0.9493575207860923, "grad_norm": 0.2417426081720488, "learning_rate": 2.7533085778430884e-06, "loss": 0.7719, "step": 314 }, { "epoch": 0.9523809523809523, "grad_norm": 0.21779534491141914, "learning_rate": 2.7412848022862883e-06, "loss": 0.8148, "step": 315 }, { "epoch": 0.9554043839758125, "grad_norm": 0.1937439406511132, "learning_rate": 2.729255390324966e-06, "loss": 0.8099, "step": 316 }, { "epoch": 0.9584278155706727, "grad_norm": 0.22418232835047394, "learning_rate": 2.717220622965762e-06, "loss": 0.8029, "step": 317 }, { "epoch": 0.9614512471655329, "grad_norm": 0.24163066601859826, "learning_rate": 2.7051807813404213e-06, "loss": 0.8069, "step": 318 }, { "epoch": 0.9644746787603931, "grad_norm": 0.17718761833134763, "learning_rate": 2.6931361466992225e-06, "loss": 0.7964, "step": 319 }, { "epoch": 0.9674981103552532, "grad_norm": 0.21359305838545312, "learning_rate": 2.6810870004044065e-06, "loss": 0.7777, "step": 320 }, { "epoch": 0.9705215419501134, "grad_norm": 0.2951108231827231, "learning_rate": 2.6690336239236097e-06, "loss": 0.7654, "step": 321 }, { "epoch": 0.9735449735449735, "grad_norm": 0.17887426724913263, "learning_rate": 2.6569762988232838e-06, "loss": 0.8021, "step": 322 }, { "epoch": 0.9765684051398337, "grad_norm": 0.16446650801438847, "learning_rate": 2.644915306762121e-06, "loss": 0.7996, "step": 323 }, { "epoch": 0.9795918367346939, "grad_norm": 0.18349619699553313, "learning_rate": 2.632850929484472e-06, "loss": 0.769, "step": 324 }, { "epoch": 0.982615268329554, "grad_norm": 0.23290485597057656, "learning_rate": 2.620783448813768e-06, "loss": 0.8104, "step": 325 }, { "epoch": 0.9856386999244142, "grad_norm": 0.21697778026585082, "learning_rate": 2.6087131466459344e-06, "loss": 0.7919, "step": 326 }, { "epoch": 0.9886621315192744, "grad_norm": 0.18436604515216662, "learning_rate": 2.5966403049428056e-06, "loss": 0.7819, "step": 327 }, { "epoch": 0.9916855631141346, "grad_norm": 0.1916879714375915, "learning_rate": 2.5845652057255414e-06, "loss": 0.7565, "step": 328 }, { "epoch": 0.9947089947089947, "grad_norm": 0.2338419771871179, "learning_rate": 2.572488131068037e-06, "loss": 0.8002, "step": 329 }, { "epoch": 0.9977324263038548, "grad_norm": 0.19973120898443514, "learning_rate": 2.560409363090331e-06, "loss": 0.8019, "step": 330 } ], "logging_steps": 1, "max_steps": 660, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 330, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.219445850938278e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }