|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13486176668914363, |
|
"eval_steps": 34, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00033715441672285906, |
|
"eval_loss": 2.3532843589782715, |
|
"eval_runtime": 338.8675, |
|
"eval_samples_per_second": 14.74, |
|
"eval_steps_per_second": 1.844, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0010114632501685772, |
|
"grad_norm": 0.8756303787231445, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.361, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0020229265003371545, |
|
"grad_norm": 0.9312941431999207, |
|
"learning_rate": 3e-05, |
|
"loss": 2.4681, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0030343897505057315, |
|
"grad_norm": 0.9662004709243774, |
|
"learning_rate": 4.5e-05, |
|
"loss": 2.1782, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.004045853000674309, |
|
"grad_norm": 1.3127541542053223, |
|
"learning_rate": 4.999675562428437e-05, |
|
"loss": 2.2872, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0050573162508428865, |
|
"grad_norm": 1.5349361896514893, |
|
"learning_rate": 4.9979724954289244e-05, |
|
"loss": 1.8232, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.006068779501011463, |
|
"grad_norm": 1.5614502429962158, |
|
"learning_rate": 4.994810682835951e-05, |
|
"loss": 1.5184, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0070802427511800405, |
|
"grad_norm": 1.413482904434204, |
|
"learning_rate": 4.990191971059033e-05, |
|
"loss": 1.2261, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.008091706001348618, |
|
"grad_norm": 1.2501227855682373, |
|
"learning_rate": 4.984119057295783e-05, |
|
"loss": 1.1656, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.009103169251517195, |
|
"grad_norm": 1.0704395771026611, |
|
"learning_rate": 4.976595487956823e-05, |
|
"loss": 1.0296, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.010114632501685773, |
|
"grad_norm": 0.8690694570541382, |
|
"learning_rate": 4.967625656594782e-05, |
|
"loss": 0.834, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.011126095751854349, |
|
"grad_norm": 0.9992819428443909, |
|
"learning_rate": 4.957214801338581e-05, |
|
"loss": 0.9255, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.011463250168577209, |
|
"eval_loss": 0.8507078289985657, |
|
"eval_runtime": 341.3748, |
|
"eval_samples_per_second": 14.632, |
|
"eval_steps_per_second": 1.831, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.012137559002022926, |
|
"grad_norm": 1.001940131187439, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 0.8057, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.013149022252191504, |
|
"grad_norm": 1.1807010173797607, |
|
"learning_rate": 4.932095175695911e-05, |
|
"loss": 0.8677, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.014160485502360081, |
|
"grad_norm": 0.7869375348091125, |
|
"learning_rate": 4.917401074463441e-05, |
|
"loss": 0.6542, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.015171948752528659, |
|
"grad_norm": 0.8986634612083435, |
|
"learning_rate": 4.901295279078431e-05, |
|
"loss": 0.7597, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.016183412002697236, |
|
"grad_norm": 0.8910415172576904, |
|
"learning_rate": 4.883787194871841e-05, |
|
"loss": 0.7038, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.017194875252865813, |
|
"grad_norm": 1.069819688796997, |
|
"learning_rate": 4.864887046071813e-05, |
|
"loss": 0.7414, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.01820633850303439, |
|
"grad_norm": 0.9437199831008911, |
|
"learning_rate": 4.8446058698330115e-05, |
|
"loss": 0.7289, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.01921780175320297, |
|
"grad_norm": 1.1073840856552124, |
|
"learning_rate": 4.822955509791233e-05, |
|
"loss": 0.7371, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.020229265003371546, |
|
"grad_norm": 1.1307681798934937, |
|
"learning_rate": 4.799948609147061e-05, |
|
"loss": 0.6487, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02124072825354012, |
|
"grad_norm": 0.8803877234458923, |
|
"learning_rate": 4.7755986032825864e-05, |
|
"loss": 0.6114, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.022252191503708697, |
|
"grad_norm": 0.9320568442344666, |
|
"learning_rate": 4.74991971191553e-05, |
|
"loss": 0.6164, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.022926500337154418, |
|
"eval_loss": 0.6071863174438477, |
|
"eval_runtime": 341.6544, |
|
"eval_samples_per_second": 14.62, |
|
"eval_steps_per_second": 1.829, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.023263654753877275, |
|
"grad_norm": 1.0200417041778564, |
|
"learning_rate": 4.7229269307953235e-05, |
|
"loss": 0.6376, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.024275118004045852, |
|
"grad_norm": 1.1348711252212524, |
|
"learning_rate": 4.694636022946012e-05, |
|
"loss": 0.6043, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.02528658125421443, |
|
"grad_norm": 1.1511644124984741, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 0.6967, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.026298044504383007, |
|
"grad_norm": 0.999319314956665, |
|
"learning_rate": 4.6342266598556814e-05, |
|
"loss": 0.5724, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.027309507754551585, |
|
"grad_norm": 1.094857931137085, |
|
"learning_rate": 4.6021434819815555e-05, |
|
"loss": 0.5304, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.028320971004720162, |
|
"grad_norm": 1.1055927276611328, |
|
"learning_rate": 4.568832711511125e-05, |
|
"loss": 0.598, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.02933243425488874, |
|
"grad_norm": 0.9610121846199036, |
|
"learning_rate": 4.534313800996299e-05, |
|
"loss": 0.5244, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.030343897505057317, |
|
"grad_norm": 0.962637722492218, |
|
"learning_rate": 4.498606908508754e-05, |
|
"loss": 0.5391, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.031355360755225894, |
|
"grad_norm": 1.200920820236206, |
|
"learning_rate": 4.46173288586818e-05, |
|
"loss": 0.5275, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.03236682400539447, |
|
"grad_norm": 1.0528006553649902, |
|
"learning_rate": 4.4237132664654154e-05, |
|
"loss": 0.5063, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.03337828725556305, |
|
"grad_norm": 1.1569225788116455, |
|
"learning_rate": 4.384570252687542e-05, |
|
"loss": 0.6439, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.03438975050573163, |
|
"grad_norm": 1.086855411529541, |
|
"learning_rate": 4.344326702952326e-05, |
|
"loss": 0.5464, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.03438975050573163, |
|
"eval_loss": 0.5225653648376465, |
|
"eval_runtime": 341.5659, |
|
"eval_samples_per_second": 14.624, |
|
"eval_steps_per_second": 1.83, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.035401213755900204, |
|
"grad_norm": 1.109368085861206, |
|
"learning_rate": 4.303006118359537e-05, |
|
"loss": 0.555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03641267700606878, |
|
"grad_norm": 1.114818811416626, |
|
"learning_rate": 4.260632628966974e-05, |
|
"loss": 0.6002, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.03742414025623736, |
|
"grad_norm": 1.0815789699554443, |
|
"learning_rate": 4.217230979699188e-05, |
|
"loss": 0.4636, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.03843560350640594, |
|
"grad_norm": 0.9574511051177979, |
|
"learning_rate": 4.172826515897146e-05, |
|
"loss": 0.5052, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.039447066756574514, |
|
"grad_norm": 1.1621136665344238, |
|
"learning_rate": 4.12744516851726e-05, |
|
"loss": 0.604, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.04045853000674309, |
|
"grad_norm": 1.1703245639801025, |
|
"learning_rate": 4.0811134389884433e-05, |
|
"loss": 0.5072, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04146999325691166, |
|
"grad_norm": 1.0807468891143799, |
|
"learning_rate": 4.0338583837360225e-05, |
|
"loss": 0.5368, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.04248145650708024, |
|
"grad_norm": 1.2833844423294067, |
|
"learning_rate": 3.985707598381544e-05, |
|
"loss": 0.5251, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.04349291975724882, |
|
"grad_norm": 1.172788143157959, |
|
"learning_rate": 3.9366892016277096e-05, |
|
"loss": 0.4348, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.044504383007417395, |
|
"grad_norm": 1.2922419309616089, |
|
"learning_rate": 3.886831818837847e-05, |
|
"loss": 0.4397, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.04551584625758597, |
|
"grad_norm": 1.223044514656067, |
|
"learning_rate": 3.8361645653195026e-05, |
|
"loss": 0.5303, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.045853000674308836, |
|
"eval_loss": 0.4773218631744385, |
|
"eval_runtime": 341.7589, |
|
"eval_samples_per_second": 14.616, |
|
"eval_steps_per_second": 1.829, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.04652730950775455, |
|
"grad_norm": 1.1864899396896362, |
|
"learning_rate": 3.784717029321922e-05, |
|
"loss": 0.4828, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.04753877275792313, |
|
"grad_norm": 1.1543965339660645, |
|
"learning_rate": 3.732519254757344e-05, |
|
"loss": 0.4989, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.048550236008091704, |
|
"grad_norm": 1.2661465406417847, |
|
"learning_rate": 3.679601723656205e-05, |
|
"loss": 0.5184, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.04956169925826028, |
|
"grad_norm": 1.4297566413879395, |
|
"learning_rate": 3.625995338366492e-05, |
|
"loss": 0.518, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.05057316250842886, |
|
"grad_norm": 1.1457245349884033, |
|
"learning_rate": 3.5717314035076355e-05, |
|
"loss": 0.561, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05158462575859744, |
|
"grad_norm": 1.035326600074768, |
|
"learning_rate": 3.516841607689501e-05, |
|
"loss": 0.4277, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.052596089008766014, |
|
"grad_norm": 1.0127874612808228, |
|
"learning_rate": 3.461358005007128e-05, |
|
"loss": 0.5606, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.05360755225893459, |
|
"grad_norm": 1.2170828580856323, |
|
"learning_rate": 3.405312996322042e-05, |
|
"loss": 0.5461, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.05461901550910317, |
|
"grad_norm": 1.3591985702514648, |
|
"learning_rate": 3.348739310341068e-05, |
|
"loss": 0.5153, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.05563047875927175, |
|
"grad_norm": 1.0014851093292236, |
|
"learning_rate": 3.2916699845036816e-05, |
|
"loss": 0.4194, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.056641942009440324, |
|
"grad_norm": 1.1735330820083618, |
|
"learning_rate": 3.234138345689077e-05, |
|
"loss": 0.4675, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.057316250842886045, |
|
"eval_loss": 0.4461934566497803, |
|
"eval_runtime": 341.8298, |
|
"eval_samples_per_second": 14.613, |
|
"eval_steps_per_second": 1.828, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0576534052596089, |
|
"grad_norm": 1.0280512571334839, |
|
"learning_rate": 3.17617799075421e-05, |
|
"loss": 0.4213, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.05866486850977748, |
|
"grad_norm": 1.155824065208435, |
|
"learning_rate": 3.1178227669141744e-05, |
|
"loss": 0.384, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.05967633175994606, |
|
"grad_norm": 1.2171953916549683, |
|
"learning_rate": 3.0591067519763895e-05, |
|
"loss": 0.4842, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.060687795010114634, |
|
"grad_norm": 1.1826962232589722, |
|
"learning_rate": 3.0000642344401113e-05, |
|
"loss": 0.4559, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06169925826028321, |
|
"grad_norm": 1.2323546409606934, |
|
"learning_rate": 2.9407296934729227e-05, |
|
"loss": 0.4811, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.06271072151045179, |
|
"grad_norm": 1.1695280075073242, |
|
"learning_rate": 2.8811377787758636e-05, |
|
"loss": 0.4234, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.06372218476062036, |
|
"grad_norm": 1.3116445541381836, |
|
"learning_rate": 2.8213232903489865e-05, |
|
"loss": 0.5082, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.06473364801078894, |
|
"grad_norm": 1.3884786367416382, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 0.4535, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.06574511126095751, |
|
"grad_norm": 1.1839005947113037, |
|
"learning_rate": 2.7011664217918154e-05, |
|
"loss": 0.4841, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0667565745111261, |
|
"grad_norm": 1.329397439956665, |
|
"learning_rate": 2.6408942098890936e-05, |
|
"loss": 0.48, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.06776803776129467, |
|
"grad_norm": 1.0817499160766602, |
|
"learning_rate": 2.580539719735433e-05, |
|
"loss": 0.3271, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.06877950101146325, |
|
"grad_norm": 1.6627490520477295, |
|
"learning_rate": 2.5201381966534748e-05, |
|
"loss": 0.4969, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.06877950101146325, |
|
"eval_loss": 0.43049055337905884, |
|
"eval_runtime": 341.7654, |
|
"eval_samples_per_second": 14.615, |
|
"eval_steps_per_second": 1.829, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.06979096426163182, |
|
"grad_norm": 1.039337396621704, |
|
"learning_rate": 2.459724913431772e-05, |
|
"loss": 0.439, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.07080242751180041, |
|
"grad_norm": 1.222737431526184, |
|
"learning_rate": 2.399335149726463e-05, |
|
"loss": 0.4838, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.07181389076196898, |
|
"grad_norm": 1.278668999671936, |
|
"learning_rate": 2.3390041714589514e-05, |
|
"loss": 0.4612, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.07282535401213756, |
|
"grad_norm": 1.1525593996047974, |
|
"learning_rate": 2.2787672102216042e-05, |
|
"loss": 0.4372, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.07383681726230613, |
|
"grad_norm": 1.3022117614746094, |
|
"learning_rate": 2.2186594427034864e-05, |
|
"loss": 0.4593, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.07484828051247472, |
|
"grad_norm": 1.4199026823043823, |
|
"learning_rate": 2.1587159701481716e-05, |
|
"loss": 0.455, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.07585974376264329, |
|
"grad_norm": 1.3410009145736694, |
|
"learning_rate": 2.098971797855599e-05, |
|
"loss": 0.6084, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.07687120701281187, |
|
"grad_norm": 1.2653465270996094, |
|
"learning_rate": 2.0394618147399713e-05, |
|
"loss": 0.497, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.07788267026298044, |
|
"grad_norm": 1.2599753141403198, |
|
"learning_rate": 1.980220772955602e-05, |
|
"loss": 0.4794, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.07889413351314903, |
|
"grad_norm": 1.176132321357727, |
|
"learning_rate": 1.921283267602643e-05, |
|
"loss": 0.4134, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.0799055967633176, |
|
"grad_norm": 1.2982177734375, |
|
"learning_rate": 1.8626837165245165e-05, |
|
"loss": 0.4404, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.08024275118004046, |
|
"eval_loss": 0.4182414412498474, |
|
"eval_runtime": 341.5144, |
|
"eval_samples_per_second": 14.626, |
|
"eval_steps_per_second": 1.83, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.08091706001348618, |
|
"grad_norm": 1.521083116531372, |
|
"learning_rate": 1.8044563402088684e-05, |
|
"loss": 0.4579, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08192852326365475, |
|
"grad_norm": 1.1534286737442017, |
|
"learning_rate": 1.746635141803761e-05, |
|
"loss": 0.3893, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.08293998651382332, |
|
"grad_norm": 1.179457426071167, |
|
"learning_rate": 1.6892538872607937e-05, |
|
"loss": 0.428, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.08395144976399191, |
|
"grad_norm": 1.498482346534729, |
|
"learning_rate": 1.6323460856167426e-05, |
|
"loss": 0.414, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.08496291301416048, |
|
"grad_norm": 1.3838918209075928, |
|
"learning_rate": 1.5759449694252226e-05, |
|
"loss": 0.4113, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.08597437626432906, |
|
"grad_norm": 1.2871530055999756, |
|
"learning_rate": 1.5200834753498128e-05, |
|
"loss": 0.4945, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.08698583951449763, |
|
"grad_norm": 1.1573866605758667, |
|
"learning_rate": 1.4647942249299707e-05, |
|
"loss": 0.4448, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.08799730276466622, |
|
"grad_norm": 1.2284533977508545, |
|
"learning_rate": 1.4101095055309746e-05, |
|
"loss": 0.4248, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.08900876601483479, |
|
"grad_norm": 1.3865326642990112, |
|
"learning_rate": 1.356061251489012e-05, |
|
"loss": 0.5, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.09002022926500337, |
|
"grad_norm": 1.0498360395431519, |
|
"learning_rate": 1.302681025462424e-05, |
|
"loss": 0.3297, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.09103169251517194, |
|
"grad_norm": 1.1438897848129272, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 0.4251, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09170600134861767, |
|
"eval_loss": 0.409618616104126, |
|
"eval_runtime": 341.4948, |
|
"eval_samples_per_second": 14.627, |
|
"eval_steps_per_second": 1.83, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.09204315576534053, |
|
"grad_norm": 1.5004656314849854, |
|
"learning_rate": 1.1980489393370938e-05, |
|
"loss": 0.4688, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.0930546190155091, |
|
"grad_norm": 1.2165626287460327, |
|
"learning_rate": 1.1468581814301717e-05, |
|
"loss": 0.4862, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.09406608226567768, |
|
"grad_norm": 1.4357872009277344, |
|
"learning_rate": 1.096457620240298e-05, |
|
"loss": 0.4654, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.09507754551584625, |
|
"grad_norm": 1.3055099248886108, |
|
"learning_rate": 1.0468766882759094e-05, |
|
"loss": 0.4198, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.09608900876601484, |
|
"grad_norm": 1.3729138374328613, |
|
"learning_rate": 9.981443394050525e-06, |
|
"loss": 0.423, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.09710047201618341, |
|
"grad_norm": 1.2119735479354858, |
|
"learning_rate": 9.502890319471491e-06, |
|
"loss": 0.4302, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.098111935266352, |
|
"grad_norm": 1.487874984741211, |
|
"learning_rate": 9.033387120541306e-06, |
|
"loss": 0.4843, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.09912339851652056, |
|
"grad_norm": 1.3085728883743286, |
|
"learning_rate": 8.573207973906735e-06, |
|
"loss": 0.4358, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.10013486176668915, |
|
"grad_norm": 1.1197856664657593, |
|
"learning_rate": 8.1226216112306e-06, |
|
"loss": 0.392, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.10114632501685772, |
|
"grad_norm": 1.2706650495529175, |
|
"learning_rate": 7.681891162260015e-06, |
|
"loss": 0.4164, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1021577882670263, |
|
"grad_norm": 1.287969708442688, |
|
"learning_rate": 7.251274001166044e-06, |
|
"loss": 0.5041, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.10316925151719487, |
|
"grad_norm": 1.357833743095398, |
|
"learning_rate": 6.831021596244424e-06, |
|
"loss": 0.3919, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.10316925151719487, |
|
"eval_loss": 0.4043685495853424, |
|
"eval_runtime": 341.6981, |
|
"eval_samples_per_second": 14.618, |
|
"eval_steps_per_second": 1.829, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.10418071476736346, |
|
"grad_norm": 1.1861456632614136, |
|
"learning_rate": 6.421379363065142e-06, |
|
"loss": 0.3585, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.10519217801753203, |
|
"grad_norm": 1.4933161735534668, |
|
"learning_rate": 6.022586521156715e-06, |
|
"loss": 0.3932, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.10620364126770061, |
|
"grad_norm": 1.3899950981140137, |
|
"learning_rate": 5.634875954308638e-06, |
|
"loss": 0.5755, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.10721510451786918, |
|
"grad_norm": 1.18565833568573, |
|
"learning_rate": 5.258474074573877e-06, |
|
"loss": 0.3489, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.10822656776803777, |
|
"grad_norm": 1.3438397645950317, |
|
"learning_rate": 4.893600690050579e-06, |
|
"loss": 0.3942, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.10923803101820634, |
|
"grad_norm": 1.1274303197860718, |
|
"learning_rate": 4.540468876520323e-06, |
|
"loss": 0.3829, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.11024949426837491, |
|
"grad_norm": 1.4088205099105835, |
|
"learning_rate": 4.199284853017896e-06, |
|
"loss": 0.4908, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.1112609575185435, |
|
"grad_norm": 1.07282292842865, |
|
"learning_rate": 3.8702478614051355e-06, |
|
"loss": 0.4099, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.11227242076871206, |
|
"grad_norm": 1.2825994491577148, |
|
"learning_rate": 3.5535500500193357e-06, |
|
"loss": 0.4394, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.11328388401888065, |
|
"grad_norm": 1.3308430910110474, |
|
"learning_rate": 3.249376361464021e-06, |
|
"loss": 0.4331, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.11429534726904922, |
|
"grad_norm": 1.5105128288269043, |
|
"learning_rate": 2.957904424607652e-06, |
|
"loss": 0.4308, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.11463250168577209, |
|
"eval_loss": 0.4016662836074829, |
|
"eval_runtime": 341.6809, |
|
"eval_samples_per_second": 14.619, |
|
"eval_steps_per_second": 1.829, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1153068105192178, |
|
"grad_norm": 1.3175163269042969, |
|
"learning_rate": 2.679304450853401e-06, |
|
"loss": 0.391, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.11631827376938637, |
|
"grad_norm": 1.3113083839416504, |
|
"learning_rate": 2.4137391347404476e-06, |
|
"loss": 0.3967, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.11732973701955496, |
|
"grad_norm": 1.2265738248825073, |
|
"learning_rate": 2.1613635589349756e-06, |
|
"loss": 0.3237, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.11834120026972353, |
|
"grad_norm": 1.3466339111328125, |
|
"learning_rate": 1.922325103666281e-06, |
|
"loss": 0.3573, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.11935266351989211, |
|
"grad_norm": 1.2574374675750732, |
|
"learning_rate": 1.696763360660808e-06, |
|
"loss": 0.3907, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.12036412677006068, |
|
"grad_norm": 1.1374329328536987, |
|
"learning_rate": 1.4848100516245717e-06, |
|
"loss": 0.3421, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.12137559002022927, |
|
"grad_norm": 1.2535523176193237, |
|
"learning_rate": 1.286588951321363e-06, |
|
"loss": 0.4246, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.12238705327039784, |
|
"grad_norm": 1.125602126121521, |
|
"learning_rate": 1.102215815291774e-06, |
|
"loss": 0.408, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.12339851652056642, |
|
"grad_norm": 0.9282971024513245, |
|
"learning_rate": 9.317983122552332e-07, |
|
"loss": 0.3658, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.124409979770735, |
|
"grad_norm": 1.376049280166626, |
|
"learning_rate": 7.754359612344859e-07, |
|
"loss": 0.4305, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.12542144302090358, |
|
"grad_norm": 1.3448472023010254, |
|
"learning_rate": 6.332200734393057e-07, |
|
"loss": 0.3743, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.1260957518543493, |
|
"eval_loss": 0.4007108509540558, |
|
"eval_runtime": 341.2867, |
|
"eval_samples_per_second": 14.636, |
|
"eval_steps_per_second": 1.831, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.12643290627107215, |
|
"grad_norm": 1.2979270219802856, |
|
"learning_rate": 5.052336989433082e-07, |
|
"loss": 0.3374, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.12744436952124072, |
|
"grad_norm": 1.2179648876190186, |
|
"learning_rate": 3.915515781850565e-07, |
|
"loss": 0.3698, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.12845583277140932, |
|
"grad_norm": 1.2433198690414429, |
|
"learning_rate": 2.922400983217416e-07, |
|
"loss": 0.3986, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.1294672960215779, |
|
"grad_norm": 0.9717249274253845, |
|
"learning_rate": 2.0735725446094923e-07, |
|
"loss": 0.3977, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.13047875927174646, |
|
"grad_norm": 1.3990002870559692, |
|
"learning_rate": 1.3695261579316777e-07, |
|
"loss": 0.4376, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.13149022252191503, |
|
"grad_norm": 1.29855477809906, |
|
"learning_rate": 8.106729664475176e-08, |
|
"loss": 0.4265, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.13250168577208363, |
|
"grad_norm": 1.3467254638671875, |
|
"learning_rate": 3.9733932468333234e-08, |
|
"loss": 0.407, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.1335131490222522, |
|
"grad_norm": 1.5646733045578003, |
|
"learning_rate": 1.297666078462767e-08, |
|
"loss": 0.4872, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.13452461227242077, |
|
"grad_norm": 1.2246640920639038, |
|
"learning_rate": 8.111070868010995e-10, |
|
"loss": 0.4088, |
|
"step": 399 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 34, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.592309546614784e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|