{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13486176668914363, "eval_steps": 34, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00033715441672285906, "eval_loss": 2.3532843589782715, "eval_runtime": 338.8675, "eval_samples_per_second": 14.74, "eval_steps_per_second": 1.844, "step": 1 }, { "epoch": 0.0010114632501685772, "grad_norm": 0.8756303787231445, "learning_rate": 1.5e-05, "loss": 2.361, "step": 3 }, { "epoch": 0.0020229265003371545, "grad_norm": 0.9312941431999207, "learning_rate": 3e-05, "loss": 2.4681, "step": 6 }, { "epoch": 0.0030343897505057315, "grad_norm": 0.9662004709243774, "learning_rate": 4.5e-05, "loss": 2.1782, "step": 9 }, { "epoch": 0.004045853000674309, "grad_norm": 1.3127541542053223, "learning_rate": 4.999675562428437e-05, "loss": 2.2872, "step": 12 }, { "epoch": 0.0050573162508428865, "grad_norm": 1.5349361896514893, "learning_rate": 4.9979724954289244e-05, "loss": 1.8232, "step": 15 }, { "epoch": 0.006068779501011463, "grad_norm": 1.5614502429962158, "learning_rate": 4.994810682835951e-05, "loss": 1.5184, "step": 18 }, { "epoch": 0.0070802427511800405, "grad_norm": 1.413482904434204, "learning_rate": 4.990191971059033e-05, "loss": 1.2261, "step": 21 }, { "epoch": 0.008091706001348618, "grad_norm": 1.2501227855682373, "learning_rate": 4.984119057295783e-05, "loss": 1.1656, "step": 24 }, { "epoch": 0.009103169251517195, "grad_norm": 1.0704395771026611, "learning_rate": 4.976595487956823e-05, "loss": 1.0296, "step": 27 }, { "epoch": 0.010114632501685773, "grad_norm": 0.8690694570541382, "learning_rate": 4.967625656594782e-05, "loss": 0.834, "step": 30 }, { "epoch": 0.011126095751854349, "grad_norm": 0.9992819428443909, "learning_rate": 4.957214801338581e-05, "loss": 0.9255, "step": 33 }, { "epoch": 0.011463250168577209, "eval_loss": 0.8507078289985657, "eval_runtime": 341.3748, "eval_samples_per_second": 14.632, "eval_steps_per_second": 1.831, "step": 34 }, { "epoch": 0.012137559002022926, "grad_norm": 1.001940131187439, "learning_rate": 4.9453690018345144e-05, "loss": 0.8057, "step": 36 }, { "epoch": 0.013149022252191504, "grad_norm": 1.1807010173797607, "learning_rate": 4.932095175695911e-05, "loss": 0.8677, "step": 39 }, { "epoch": 0.014160485502360081, "grad_norm": 0.7869375348091125, "learning_rate": 4.917401074463441e-05, "loss": 0.6542, "step": 42 }, { "epoch": 0.015171948752528659, "grad_norm": 0.8986634612083435, "learning_rate": 4.901295279078431e-05, "loss": 0.7597, "step": 45 }, { "epoch": 0.016183412002697236, "grad_norm": 0.8910415172576904, "learning_rate": 4.883787194871841e-05, "loss": 0.7038, "step": 48 }, { "epoch": 0.017194875252865813, "grad_norm": 1.069819688796997, "learning_rate": 4.864887046071813e-05, "loss": 0.7414, "step": 51 }, { "epoch": 0.01820633850303439, "grad_norm": 0.9437199831008911, "learning_rate": 4.8446058698330115e-05, "loss": 0.7289, "step": 54 }, { "epoch": 0.01921780175320297, "grad_norm": 1.1073840856552124, "learning_rate": 4.822955509791233e-05, "loss": 0.7371, "step": 57 }, { "epoch": 0.020229265003371546, "grad_norm": 1.1307681798934937, "learning_rate": 4.799948609147061e-05, "loss": 0.6487, "step": 60 }, { "epoch": 0.02124072825354012, "grad_norm": 0.8803877234458923, "learning_rate": 4.7755986032825864e-05, "loss": 0.6114, "step": 63 }, { "epoch": 0.022252191503708697, "grad_norm": 0.9320568442344666, "learning_rate": 4.74991971191553e-05, "loss": 0.6164, "step": 66 }, { "epoch": 0.022926500337154418, "eval_loss": 0.6071863174438477, "eval_runtime": 341.6544, "eval_samples_per_second": 14.62, "eval_steps_per_second": 1.829, "step": 68 }, { "epoch": 0.023263654753877275, "grad_norm": 1.0200417041778564, "learning_rate": 4.7229269307953235e-05, "loss": 0.6376, "step": 69 }, { "epoch": 0.024275118004045852, "grad_norm": 1.1348711252212524, "learning_rate": 4.694636022946012e-05, "loss": 0.6043, "step": 72 }, { "epoch": 0.02528658125421443, "grad_norm": 1.1511644124984741, "learning_rate": 4.665063509461097e-05, "loss": 0.6967, "step": 75 }, { "epoch": 0.026298044504383007, "grad_norm": 0.999319314956665, "learning_rate": 4.6342266598556814e-05, "loss": 0.5724, "step": 78 }, { "epoch": 0.027309507754551585, "grad_norm": 1.094857931137085, "learning_rate": 4.6021434819815555e-05, "loss": 0.5304, "step": 81 }, { "epoch": 0.028320971004720162, "grad_norm": 1.1055927276611328, "learning_rate": 4.568832711511125e-05, "loss": 0.598, "step": 84 }, { "epoch": 0.02933243425488874, "grad_norm": 0.9610121846199036, "learning_rate": 4.534313800996299e-05, "loss": 0.5244, "step": 87 }, { "epoch": 0.030343897505057317, "grad_norm": 0.962637722492218, "learning_rate": 4.498606908508754e-05, "loss": 0.5391, "step": 90 }, { "epoch": 0.031355360755225894, "grad_norm": 1.200920820236206, "learning_rate": 4.46173288586818e-05, "loss": 0.5275, "step": 93 }, { "epoch": 0.03236682400539447, "grad_norm": 1.0528006553649902, "learning_rate": 4.4237132664654154e-05, "loss": 0.5063, "step": 96 }, { "epoch": 0.03337828725556305, "grad_norm": 1.1569225788116455, "learning_rate": 4.384570252687542e-05, "loss": 0.6439, "step": 99 }, { "epoch": 0.03438975050573163, "grad_norm": 1.086855411529541, "learning_rate": 4.344326702952326e-05, "loss": 0.5464, "step": 102 }, { "epoch": 0.03438975050573163, "eval_loss": 0.5225653648376465, "eval_runtime": 341.5659, "eval_samples_per_second": 14.624, "eval_steps_per_second": 1.83, "step": 102 }, { "epoch": 0.035401213755900204, "grad_norm": 1.109368085861206, "learning_rate": 4.303006118359537e-05, "loss": 0.555, "step": 105 }, { "epoch": 0.03641267700606878, "grad_norm": 1.114818811416626, "learning_rate": 4.260632628966974e-05, "loss": 0.6002, "step": 108 }, { "epoch": 0.03742414025623736, "grad_norm": 1.0815789699554443, "learning_rate": 4.217230979699188e-05, "loss": 0.4636, "step": 111 }, { "epoch": 0.03843560350640594, "grad_norm": 0.9574511051177979, "learning_rate": 4.172826515897146e-05, "loss": 0.5052, "step": 114 }, { "epoch": 0.039447066756574514, "grad_norm": 1.1621136665344238, "learning_rate": 4.12744516851726e-05, "loss": 0.604, "step": 117 }, { "epoch": 0.04045853000674309, "grad_norm": 1.1703245639801025, "learning_rate": 4.0811134389884433e-05, "loss": 0.5072, "step": 120 }, { "epoch": 0.04146999325691166, "grad_norm": 1.0807468891143799, "learning_rate": 4.0338583837360225e-05, "loss": 0.5368, "step": 123 }, { "epoch": 0.04248145650708024, "grad_norm": 1.2833844423294067, "learning_rate": 3.985707598381544e-05, "loss": 0.5251, "step": 126 }, { "epoch": 0.04349291975724882, "grad_norm": 1.172788143157959, "learning_rate": 3.9366892016277096e-05, "loss": 0.4348, "step": 129 }, { "epoch": 0.044504383007417395, "grad_norm": 1.2922419309616089, "learning_rate": 3.886831818837847e-05, "loss": 0.4397, "step": 132 }, { "epoch": 0.04551584625758597, "grad_norm": 1.223044514656067, "learning_rate": 3.8361645653195026e-05, "loss": 0.5303, "step": 135 }, { "epoch": 0.045853000674308836, "eval_loss": 0.4773218631744385, "eval_runtime": 341.7589, "eval_samples_per_second": 14.616, "eval_steps_per_second": 1.829, "step": 136 }, { "epoch": 0.04652730950775455, "grad_norm": 1.1864899396896362, "learning_rate": 3.784717029321922e-05, "loss": 0.4828, "step": 138 }, { "epoch": 0.04753877275792313, "grad_norm": 1.1543965339660645, "learning_rate": 3.732519254757344e-05, "loss": 0.4989, "step": 141 }, { "epoch": 0.048550236008091704, "grad_norm": 1.2661465406417847, "learning_rate": 3.679601723656205e-05, "loss": 0.5184, "step": 144 }, { "epoch": 0.04956169925826028, "grad_norm": 1.4297566413879395, "learning_rate": 3.625995338366492e-05, "loss": 0.518, "step": 147 }, { "epoch": 0.05057316250842886, "grad_norm": 1.1457245349884033, "learning_rate": 3.5717314035076355e-05, "loss": 0.561, "step": 150 }, { "epoch": 0.05158462575859744, "grad_norm": 1.035326600074768, "learning_rate": 3.516841607689501e-05, "loss": 0.4277, "step": 153 }, { "epoch": 0.052596089008766014, "grad_norm": 1.0127874612808228, "learning_rate": 3.461358005007128e-05, "loss": 0.5606, "step": 156 }, { "epoch": 0.05360755225893459, "grad_norm": 1.2170828580856323, "learning_rate": 3.405312996322042e-05, "loss": 0.5461, "step": 159 }, { "epoch": 0.05461901550910317, "grad_norm": 1.3591985702514648, "learning_rate": 3.348739310341068e-05, "loss": 0.5153, "step": 162 }, { "epoch": 0.05563047875927175, "grad_norm": 1.0014851093292236, "learning_rate": 3.2916699845036816e-05, "loss": 0.4194, "step": 165 }, { "epoch": 0.056641942009440324, "grad_norm": 1.1735330820083618, "learning_rate": 3.234138345689077e-05, "loss": 0.4675, "step": 168 }, { "epoch": 0.057316250842886045, "eval_loss": 0.4461934566497803, "eval_runtime": 341.8298, "eval_samples_per_second": 14.613, "eval_steps_per_second": 1.828, "step": 170 }, { "epoch": 0.0576534052596089, "grad_norm": 1.0280512571334839, "learning_rate": 3.17617799075421e-05, "loss": 0.4213, "step": 171 }, { "epoch": 0.05866486850977748, "grad_norm": 1.155824065208435, "learning_rate": 3.1178227669141744e-05, "loss": 0.384, "step": 174 }, { "epoch": 0.05967633175994606, "grad_norm": 1.2171953916549683, "learning_rate": 3.0591067519763895e-05, "loss": 0.4842, "step": 177 }, { "epoch": 0.060687795010114634, "grad_norm": 1.1826962232589722, "learning_rate": 3.0000642344401113e-05, "loss": 0.4559, "step": 180 }, { "epoch": 0.06169925826028321, "grad_norm": 1.2323546409606934, "learning_rate": 2.9407296934729227e-05, "loss": 0.4811, "step": 183 }, { "epoch": 0.06271072151045179, "grad_norm": 1.1695280075073242, "learning_rate": 2.8811377787758636e-05, "loss": 0.4234, "step": 186 }, { "epoch": 0.06372218476062036, "grad_norm": 1.3116445541381836, "learning_rate": 2.8213232903489865e-05, "loss": 0.5082, "step": 189 }, { "epoch": 0.06473364801078894, "grad_norm": 1.3884786367416382, "learning_rate": 2.761321158169134e-05, "loss": 0.4535, "step": 192 }, { "epoch": 0.06574511126095751, "grad_norm": 1.1839005947113037, "learning_rate": 2.7011664217918154e-05, "loss": 0.4841, "step": 195 }, { "epoch": 0.0667565745111261, "grad_norm": 1.329397439956665, "learning_rate": 2.6408942098890936e-05, "loss": 0.48, "step": 198 }, { "epoch": 0.06776803776129467, "grad_norm": 1.0817499160766602, "learning_rate": 2.580539719735433e-05, "loss": 0.3271, "step": 201 }, { "epoch": 0.06877950101146325, "grad_norm": 1.6627490520477295, "learning_rate": 2.5201381966534748e-05, "loss": 0.4969, "step": 204 }, { "epoch": 0.06877950101146325, "eval_loss": 0.43049055337905884, "eval_runtime": 341.7654, "eval_samples_per_second": 14.615, "eval_steps_per_second": 1.829, "step": 204 }, { "epoch": 0.06979096426163182, "grad_norm": 1.039337396621704, "learning_rate": 2.459724913431772e-05, "loss": 0.439, "step": 207 }, { "epoch": 0.07080242751180041, "grad_norm": 1.222737431526184, "learning_rate": 2.399335149726463e-05, "loss": 0.4838, "step": 210 }, { "epoch": 0.07181389076196898, "grad_norm": 1.278668999671936, "learning_rate": 2.3390041714589514e-05, "loss": 0.4612, "step": 213 }, { "epoch": 0.07282535401213756, "grad_norm": 1.1525593996047974, "learning_rate": 2.2787672102216042e-05, "loss": 0.4372, "step": 216 }, { "epoch": 0.07383681726230613, "grad_norm": 1.3022117614746094, "learning_rate": 2.2186594427034864e-05, "loss": 0.4593, "step": 219 }, { "epoch": 0.07484828051247472, "grad_norm": 1.4199026823043823, "learning_rate": 2.1587159701481716e-05, "loss": 0.455, "step": 222 }, { "epoch": 0.07585974376264329, "grad_norm": 1.3410009145736694, "learning_rate": 2.098971797855599e-05, "loss": 0.6084, "step": 225 }, { "epoch": 0.07687120701281187, "grad_norm": 1.2653465270996094, "learning_rate": 2.0394618147399713e-05, "loss": 0.497, "step": 228 }, { "epoch": 0.07788267026298044, "grad_norm": 1.2599753141403198, "learning_rate": 1.980220772955602e-05, "loss": 0.4794, "step": 231 }, { "epoch": 0.07889413351314903, "grad_norm": 1.176132321357727, "learning_rate": 1.921283267602643e-05, "loss": 0.4134, "step": 234 }, { "epoch": 0.0799055967633176, "grad_norm": 1.2982177734375, "learning_rate": 1.8626837165245165e-05, "loss": 0.4404, "step": 237 }, { "epoch": 0.08024275118004046, "eval_loss": 0.4182414412498474, "eval_runtime": 341.5144, "eval_samples_per_second": 14.626, "eval_steps_per_second": 1.83, "step": 238 }, { "epoch": 0.08091706001348618, "grad_norm": 1.521083116531372, "learning_rate": 1.8044563402088684e-05, "loss": 0.4579, "step": 240 }, { "epoch": 0.08192852326365475, "grad_norm": 1.1534286737442017, "learning_rate": 1.746635141803761e-05, "loss": 0.3893, "step": 243 }, { "epoch": 0.08293998651382332, "grad_norm": 1.179457426071167, "learning_rate": 1.6892538872607937e-05, "loss": 0.428, "step": 246 }, { "epoch": 0.08395144976399191, "grad_norm": 1.498482346534729, "learning_rate": 1.6323460856167426e-05, "loss": 0.414, "step": 249 }, { "epoch": 0.08496291301416048, "grad_norm": 1.3838918209075928, "learning_rate": 1.5759449694252226e-05, "loss": 0.4113, "step": 252 }, { "epoch": 0.08597437626432906, "grad_norm": 1.2871530055999756, "learning_rate": 1.5200834753498128e-05, "loss": 0.4945, "step": 255 }, { "epoch": 0.08698583951449763, "grad_norm": 1.1573866605758667, "learning_rate": 1.4647942249299707e-05, "loss": 0.4448, "step": 258 }, { "epoch": 0.08799730276466622, "grad_norm": 1.2284533977508545, "learning_rate": 1.4101095055309746e-05, "loss": 0.4248, "step": 261 }, { "epoch": 0.08900876601483479, "grad_norm": 1.3865326642990112, "learning_rate": 1.356061251489012e-05, "loss": 0.5, "step": 264 }, { "epoch": 0.09002022926500337, "grad_norm": 1.0498360395431519, "learning_rate": 1.302681025462424e-05, "loss": 0.3297, "step": 267 }, { "epoch": 0.09103169251517194, "grad_norm": 1.1438897848129272, "learning_rate": 1.2500000000000006e-05, "loss": 0.4251, "step": 270 }, { "epoch": 0.09170600134861767, "eval_loss": 0.409618616104126, "eval_runtime": 341.4948, "eval_samples_per_second": 14.627, "eval_steps_per_second": 1.83, "step": 272 }, { "epoch": 0.09204315576534053, "grad_norm": 1.5004656314849854, "learning_rate": 1.1980489393370938e-05, "loss": 0.4688, "step": 273 }, { "epoch": 0.0930546190155091, "grad_norm": 1.2165626287460327, "learning_rate": 1.1468581814301717e-05, "loss": 0.4862, "step": 276 }, { "epoch": 0.09406608226567768, "grad_norm": 1.4357872009277344, "learning_rate": 1.096457620240298e-05, "loss": 0.4654, "step": 279 }, { "epoch": 0.09507754551584625, "grad_norm": 1.3055099248886108, "learning_rate": 1.0468766882759094e-05, "loss": 0.4198, "step": 282 }, { "epoch": 0.09608900876601484, "grad_norm": 1.3729138374328613, "learning_rate": 9.981443394050525e-06, "loss": 0.423, "step": 285 }, { "epoch": 0.09710047201618341, "grad_norm": 1.2119735479354858, "learning_rate": 9.502890319471491e-06, "loss": 0.4302, "step": 288 }, { "epoch": 0.098111935266352, "grad_norm": 1.487874984741211, "learning_rate": 9.033387120541306e-06, "loss": 0.4843, "step": 291 }, { "epoch": 0.09912339851652056, "grad_norm": 1.3085728883743286, "learning_rate": 8.573207973906735e-06, "loss": 0.4358, "step": 294 }, { "epoch": 0.10013486176668915, "grad_norm": 1.1197856664657593, "learning_rate": 8.1226216112306e-06, "loss": 0.392, "step": 297 }, { "epoch": 0.10114632501685772, "grad_norm": 1.2706650495529175, "learning_rate": 7.681891162260015e-06, "loss": 0.4164, "step": 300 }, { "epoch": 0.1021577882670263, "grad_norm": 1.287969708442688, "learning_rate": 7.251274001166044e-06, "loss": 0.5041, "step": 303 }, { "epoch": 0.10316925151719487, "grad_norm": 1.357833743095398, "learning_rate": 6.831021596244424e-06, "loss": 0.3919, "step": 306 }, { "epoch": 0.10316925151719487, "eval_loss": 0.4043685495853424, "eval_runtime": 341.6981, "eval_samples_per_second": 14.618, "eval_steps_per_second": 1.829, "step": 306 }, { "epoch": 0.10418071476736346, "grad_norm": 1.1861456632614136, "learning_rate": 6.421379363065142e-06, "loss": 0.3585, "step": 309 }, { "epoch": 0.10519217801753203, "grad_norm": 1.4933161735534668, "learning_rate": 6.022586521156715e-06, "loss": 0.3932, "step": 312 }, { "epoch": 0.10620364126770061, "grad_norm": 1.3899950981140137, "learning_rate": 5.634875954308638e-06, "loss": 0.5755, "step": 315 }, { "epoch": 0.10721510451786918, "grad_norm": 1.18565833568573, "learning_rate": 5.258474074573877e-06, "loss": 0.3489, "step": 318 }, { "epoch": 0.10822656776803777, "grad_norm": 1.3438397645950317, "learning_rate": 4.893600690050579e-06, "loss": 0.3942, "step": 321 }, { "epoch": 0.10923803101820634, "grad_norm": 1.1274303197860718, "learning_rate": 4.540468876520323e-06, "loss": 0.3829, "step": 324 }, { "epoch": 0.11024949426837491, "grad_norm": 1.4088205099105835, "learning_rate": 4.199284853017896e-06, "loss": 0.4908, "step": 327 }, { "epoch": 0.1112609575185435, "grad_norm": 1.07282292842865, "learning_rate": 3.8702478614051355e-06, "loss": 0.4099, "step": 330 }, { "epoch": 0.11227242076871206, "grad_norm": 1.2825994491577148, "learning_rate": 3.5535500500193357e-06, "loss": 0.4394, "step": 333 }, { "epoch": 0.11328388401888065, "grad_norm": 1.3308430910110474, "learning_rate": 3.249376361464021e-06, "loss": 0.4331, "step": 336 }, { "epoch": 0.11429534726904922, "grad_norm": 1.5105128288269043, "learning_rate": 2.957904424607652e-06, "loss": 0.4308, "step": 339 }, { "epoch": 0.11463250168577209, "eval_loss": 0.4016662836074829, "eval_runtime": 341.6809, "eval_samples_per_second": 14.619, "eval_steps_per_second": 1.829, "step": 340 }, { "epoch": 0.1153068105192178, "grad_norm": 1.3175163269042969, "learning_rate": 2.679304450853401e-06, "loss": 0.391, "step": 342 }, { "epoch": 0.11631827376938637, "grad_norm": 1.3113083839416504, "learning_rate": 2.4137391347404476e-06, "loss": 0.3967, "step": 345 }, { "epoch": 0.11732973701955496, "grad_norm": 1.2265738248825073, "learning_rate": 2.1613635589349756e-06, "loss": 0.3237, "step": 348 }, { "epoch": 0.11834120026972353, "grad_norm": 1.3466339111328125, "learning_rate": 1.922325103666281e-06, "loss": 0.3573, "step": 351 }, { "epoch": 0.11935266351989211, "grad_norm": 1.2574374675750732, "learning_rate": 1.696763360660808e-06, "loss": 0.3907, "step": 354 }, { "epoch": 0.12036412677006068, "grad_norm": 1.1374329328536987, "learning_rate": 1.4848100516245717e-06, "loss": 0.3421, "step": 357 }, { "epoch": 0.12137559002022927, "grad_norm": 1.2535523176193237, "learning_rate": 1.286588951321363e-06, "loss": 0.4246, "step": 360 }, { "epoch": 0.12238705327039784, "grad_norm": 1.125602126121521, "learning_rate": 1.102215815291774e-06, "loss": 0.408, "step": 363 }, { "epoch": 0.12339851652056642, "grad_norm": 0.9282971024513245, "learning_rate": 9.317983122552332e-07, "loss": 0.3658, "step": 366 }, { "epoch": 0.124409979770735, "grad_norm": 1.376049280166626, "learning_rate": 7.754359612344859e-07, "loss": 0.4305, "step": 369 }, { "epoch": 0.12542144302090358, "grad_norm": 1.3448472023010254, "learning_rate": 6.332200734393057e-07, "loss": 0.3743, "step": 372 }, { "epoch": 0.1260957518543493, "eval_loss": 0.4007108509540558, "eval_runtime": 341.2867, "eval_samples_per_second": 14.636, "eval_steps_per_second": 1.831, "step": 374 }, { "epoch": 0.12643290627107215, "grad_norm": 1.2979270219802856, "learning_rate": 5.052336989433082e-07, "loss": 0.3374, "step": 375 }, { "epoch": 0.12744436952124072, "grad_norm": 1.2179648876190186, "learning_rate": 3.915515781850565e-07, "loss": 0.3698, "step": 378 }, { "epoch": 0.12845583277140932, "grad_norm": 1.2433198690414429, "learning_rate": 2.922400983217416e-07, "loss": 0.3986, "step": 381 }, { "epoch": 0.1294672960215779, "grad_norm": 0.9717249274253845, "learning_rate": 2.0735725446094923e-07, "loss": 0.3977, "step": 384 }, { "epoch": 0.13047875927174646, "grad_norm": 1.3990002870559692, "learning_rate": 1.3695261579316777e-07, "loss": 0.4376, "step": 387 }, { "epoch": 0.13149022252191503, "grad_norm": 1.29855477809906, "learning_rate": 8.106729664475176e-08, "loss": 0.4265, "step": 390 }, { "epoch": 0.13250168577208363, "grad_norm": 1.3467254638671875, "learning_rate": 3.9733932468333234e-08, "loss": 0.407, "step": 393 }, { "epoch": 0.1335131490222522, "grad_norm": 1.5646733045578003, "learning_rate": 1.297666078462767e-08, "loss": 0.4872, "step": 396 }, { "epoch": 0.13452461227242077, "grad_norm": 1.2246640920639038, "learning_rate": 8.111070868010995e-10, "loss": 0.4088, "step": 399 } ], "logging_steps": 3, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 34, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.592309546614784e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }