{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13486176668914363,
"eval_steps": 34,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00033715441672285906,
"eval_loss": 2.3532843589782715,
"eval_runtime": 338.8675,
"eval_samples_per_second": 14.74,
"eval_steps_per_second": 1.844,
"step": 1
},
{
"epoch": 0.0010114632501685772,
"grad_norm": 0.8756303787231445,
"learning_rate": 1.5e-05,
"loss": 2.361,
"step": 3
},
{
"epoch": 0.0020229265003371545,
"grad_norm": 0.9312941431999207,
"learning_rate": 3e-05,
"loss": 2.4681,
"step": 6
},
{
"epoch": 0.0030343897505057315,
"grad_norm": 0.9662004709243774,
"learning_rate": 4.5e-05,
"loss": 2.1782,
"step": 9
},
{
"epoch": 0.004045853000674309,
"grad_norm": 1.3127541542053223,
"learning_rate": 4.999675562428437e-05,
"loss": 2.2872,
"step": 12
},
{
"epoch": 0.0050573162508428865,
"grad_norm": 1.5349361896514893,
"learning_rate": 4.9979724954289244e-05,
"loss": 1.8232,
"step": 15
},
{
"epoch": 0.006068779501011463,
"grad_norm": 1.5614502429962158,
"learning_rate": 4.994810682835951e-05,
"loss": 1.5184,
"step": 18
},
{
"epoch": 0.0070802427511800405,
"grad_norm": 1.413482904434204,
"learning_rate": 4.990191971059033e-05,
"loss": 1.2261,
"step": 21
},
{
"epoch": 0.008091706001348618,
"grad_norm": 1.2501227855682373,
"learning_rate": 4.984119057295783e-05,
"loss": 1.1656,
"step": 24
},
{
"epoch": 0.009103169251517195,
"grad_norm": 1.0704395771026611,
"learning_rate": 4.976595487956823e-05,
"loss": 1.0296,
"step": 27
},
{
"epoch": 0.010114632501685773,
"grad_norm": 0.8690694570541382,
"learning_rate": 4.967625656594782e-05,
"loss": 0.834,
"step": 30
},
{
"epoch": 0.011126095751854349,
"grad_norm": 0.9992819428443909,
"learning_rate": 4.957214801338581e-05,
"loss": 0.9255,
"step": 33
},
{
"epoch": 0.011463250168577209,
"eval_loss": 0.8507078289985657,
"eval_runtime": 341.3748,
"eval_samples_per_second": 14.632,
"eval_steps_per_second": 1.831,
"step": 34
},
{
"epoch": 0.012137559002022926,
"grad_norm": 1.001940131187439,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.8057,
"step": 36
},
{
"epoch": 0.013149022252191504,
"grad_norm": 1.1807010173797607,
"learning_rate": 4.932095175695911e-05,
"loss": 0.8677,
"step": 39
},
{
"epoch": 0.014160485502360081,
"grad_norm": 0.7869375348091125,
"learning_rate": 4.917401074463441e-05,
"loss": 0.6542,
"step": 42
},
{
"epoch": 0.015171948752528659,
"grad_norm": 0.8986634612083435,
"learning_rate": 4.901295279078431e-05,
"loss": 0.7597,
"step": 45
},
{
"epoch": 0.016183412002697236,
"grad_norm": 0.8910415172576904,
"learning_rate": 4.883787194871841e-05,
"loss": 0.7038,
"step": 48
},
{
"epoch": 0.017194875252865813,
"grad_norm": 1.069819688796997,
"learning_rate": 4.864887046071813e-05,
"loss": 0.7414,
"step": 51
},
{
"epoch": 0.01820633850303439,
"grad_norm": 0.9437199831008911,
"learning_rate": 4.8446058698330115e-05,
"loss": 0.7289,
"step": 54
},
{
"epoch": 0.01921780175320297,
"grad_norm": 1.1073840856552124,
"learning_rate": 4.822955509791233e-05,
"loss": 0.7371,
"step": 57
},
{
"epoch": 0.020229265003371546,
"grad_norm": 1.1307681798934937,
"learning_rate": 4.799948609147061e-05,
"loss": 0.6487,
"step": 60
},
{
"epoch": 0.02124072825354012,
"grad_norm": 0.8803877234458923,
"learning_rate": 4.7755986032825864e-05,
"loss": 0.6114,
"step": 63
},
{
"epoch": 0.022252191503708697,
"grad_norm": 0.9320568442344666,
"learning_rate": 4.74991971191553e-05,
"loss": 0.6164,
"step": 66
},
{
"epoch": 0.022926500337154418,
"eval_loss": 0.6071863174438477,
"eval_runtime": 341.6544,
"eval_samples_per_second": 14.62,
"eval_steps_per_second": 1.829,
"step": 68
},
{
"epoch": 0.023263654753877275,
"grad_norm": 1.0200417041778564,
"learning_rate": 4.7229269307953235e-05,
"loss": 0.6376,
"step": 69
},
{
"epoch": 0.024275118004045852,
"grad_norm": 1.1348711252212524,
"learning_rate": 4.694636022946012e-05,
"loss": 0.6043,
"step": 72
},
{
"epoch": 0.02528658125421443,
"grad_norm": 1.1511644124984741,
"learning_rate": 4.665063509461097e-05,
"loss": 0.6967,
"step": 75
},
{
"epoch": 0.026298044504383007,
"grad_norm": 0.999319314956665,
"learning_rate": 4.6342266598556814e-05,
"loss": 0.5724,
"step": 78
},
{
"epoch": 0.027309507754551585,
"grad_norm": 1.094857931137085,
"learning_rate": 4.6021434819815555e-05,
"loss": 0.5304,
"step": 81
},
{
"epoch": 0.028320971004720162,
"grad_norm": 1.1055927276611328,
"learning_rate": 4.568832711511125e-05,
"loss": 0.598,
"step": 84
},
{
"epoch": 0.02933243425488874,
"grad_norm": 0.9610121846199036,
"learning_rate": 4.534313800996299e-05,
"loss": 0.5244,
"step": 87
},
{
"epoch": 0.030343897505057317,
"grad_norm": 0.962637722492218,
"learning_rate": 4.498606908508754e-05,
"loss": 0.5391,
"step": 90
},
{
"epoch": 0.031355360755225894,
"grad_norm": 1.200920820236206,
"learning_rate": 4.46173288586818e-05,
"loss": 0.5275,
"step": 93
},
{
"epoch": 0.03236682400539447,
"grad_norm": 1.0528006553649902,
"learning_rate": 4.4237132664654154e-05,
"loss": 0.5063,
"step": 96
},
{
"epoch": 0.03337828725556305,
"grad_norm": 1.1569225788116455,
"learning_rate": 4.384570252687542e-05,
"loss": 0.6439,
"step": 99
},
{
"epoch": 0.03438975050573163,
"grad_norm": 1.086855411529541,
"learning_rate": 4.344326702952326e-05,
"loss": 0.5464,
"step": 102
},
{
"epoch": 0.03438975050573163,
"eval_loss": 0.5225653648376465,
"eval_runtime": 341.5659,
"eval_samples_per_second": 14.624,
"eval_steps_per_second": 1.83,
"step": 102
},
{
"epoch": 0.035401213755900204,
"grad_norm": 1.109368085861206,
"learning_rate": 4.303006118359537e-05,
"loss": 0.555,
"step": 105
},
{
"epoch": 0.03641267700606878,
"grad_norm": 1.114818811416626,
"learning_rate": 4.260632628966974e-05,
"loss": 0.6002,
"step": 108
},
{
"epoch": 0.03742414025623736,
"grad_norm": 1.0815789699554443,
"learning_rate": 4.217230979699188e-05,
"loss": 0.4636,
"step": 111
},
{
"epoch": 0.03843560350640594,
"grad_norm": 0.9574511051177979,
"learning_rate": 4.172826515897146e-05,
"loss": 0.5052,
"step": 114
},
{
"epoch": 0.039447066756574514,
"grad_norm": 1.1621136665344238,
"learning_rate": 4.12744516851726e-05,
"loss": 0.604,
"step": 117
},
{
"epoch": 0.04045853000674309,
"grad_norm": 1.1703245639801025,
"learning_rate": 4.0811134389884433e-05,
"loss": 0.5072,
"step": 120
},
{
"epoch": 0.04146999325691166,
"grad_norm": 1.0807468891143799,
"learning_rate": 4.0338583837360225e-05,
"loss": 0.5368,
"step": 123
},
{
"epoch": 0.04248145650708024,
"grad_norm": 1.2833844423294067,
"learning_rate": 3.985707598381544e-05,
"loss": 0.5251,
"step": 126
},
{
"epoch": 0.04349291975724882,
"grad_norm": 1.172788143157959,
"learning_rate": 3.9366892016277096e-05,
"loss": 0.4348,
"step": 129
},
{
"epoch": 0.044504383007417395,
"grad_norm": 1.2922419309616089,
"learning_rate": 3.886831818837847e-05,
"loss": 0.4397,
"step": 132
},
{
"epoch": 0.04551584625758597,
"grad_norm": 1.223044514656067,
"learning_rate": 3.8361645653195026e-05,
"loss": 0.5303,
"step": 135
},
{
"epoch": 0.045853000674308836,
"eval_loss": 0.4773218631744385,
"eval_runtime": 341.7589,
"eval_samples_per_second": 14.616,
"eval_steps_per_second": 1.829,
"step": 136
},
{
"epoch": 0.04652730950775455,
"grad_norm": 1.1864899396896362,
"learning_rate": 3.784717029321922e-05,
"loss": 0.4828,
"step": 138
},
{
"epoch": 0.04753877275792313,
"grad_norm": 1.1543965339660645,
"learning_rate": 3.732519254757344e-05,
"loss": 0.4989,
"step": 141
},
{
"epoch": 0.048550236008091704,
"grad_norm": 1.2661465406417847,
"learning_rate": 3.679601723656205e-05,
"loss": 0.5184,
"step": 144
},
{
"epoch": 0.04956169925826028,
"grad_norm": 1.4297566413879395,
"learning_rate": 3.625995338366492e-05,
"loss": 0.518,
"step": 147
},
{
"epoch": 0.05057316250842886,
"grad_norm": 1.1457245349884033,
"learning_rate": 3.5717314035076355e-05,
"loss": 0.561,
"step": 150
},
{
"epoch": 0.05158462575859744,
"grad_norm": 1.035326600074768,
"learning_rate": 3.516841607689501e-05,
"loss": 0.4277,
"step": 153
},
{
"epoch": 0.052596089008766014,
"grad_norm": 1.0127874612808228,
"learning_rate": 3.461358005007128e-05,
"loss": 0.5606,
"step": 156
},
{
"epoch": 0.05360755225893459,
"grad_norm": 1.2170828580856323,
"learning_rate": 3.405312996322042e-05,
"loss": 0.5461,
"step": 159
},
{
"epoch": 0.05461901550910317,
"grad_norm": 1.3591985702514648,
"learning_rate": 3.348739310341068e-05,
"loss": 0.5153,
"step": 162
},
{
"epoch": 0.05563047875927175,
"grad_norm": 1.0014851093292236,
"learning_rate": 3.2916699845036816e-05,
"loss": 0.4194,
"step": 165
},
{
"epoch": 0.056641942009440324,
"grad_norm": 1.1735330820083618,
"learning_rate": 3.234138345689077e-05,
"loss": 0.4675,
"step": 168
},
{
"epoch": 0.057316250842886045,
"eval_loss": 0.4461934566497803,
"eval_runtime": 341.8298,
"eval_samples_per_second": 14.613,
"eval_steps_per_second": 1.828,
"step": 170
},
{
"epoch": 0.0576534052596089,
"grad_norm": 1.0280512571334839,
"learning_rate": 3.17617799075421e-05,
"loss": 0.4213,
"step": 171
},
{
"epoch": 0.05866486850977748,
"grad_norm": 1.155824065208435,
"learning_rate": 3.1178227669141744e-05,
"loss": 0.384,
"step": 174
},
{
"epoch": 0.05967633175994606,
"grad_norm": 1.2171953916549683,
"learning_rate": 3.0591067519763895e-05,
"loss": 0.4842,
"step": 177
},
{
"epoch": 0.060687795010114634,
"grad_norm": 1.1826962232589722,
"learning_rate": 3.0000642344401113e-05,
"loss": 0.4559,
"step": 180
},
{
"epoch": 0.06169925826028321,
"grad_norm": 1.2323546409606934,
"learning_rate": 2.9407296934729227e-05,
"loss": 0.4811,
"step": 183
},
{
"epoch": 0.06271072151045179,
"grad_norm": 1.1695280075073242,
"learning_rate": 2.8811377787758636e-05,
"loss": 0.4234,
"step": 186
},
{
"epoch": 0.06372218476062036,
"grad_norm": 1.3116445541381836,
"learning_rate": 2.8213232903489865e-05,
"loss": 0.5082,
"step": 189
},
{
"epoch": 0.06473364801078894,
"grad_norm": 1.3884786367416382,
"learning_rate": 2.761321158169134e-05,
"loss": 0.4535,
"step": 192
},
{
"epoch": 0.06574511126095751,
"grad_norm": 1.1839005947113037,
"learning_rate": 2.7011664217918154e-05,
"loss": 0.4841,
"step": 195
},
{
"epoch": 0.0667565745111261,
"grad_norm": 1.329397439956665,
"learning_rate": 2.6408942098890936e-05,
"loss": 0.48,
"step": 198
},
{
"epoch": 0.06776803776129467,
"grad_norm": 1.0817499160766602,
"learning_rate": 2.580539719735433e-05,
"loss": 0.3271,
"step": 201
},
{
"epoch": 0.06877950101146325,
"grad_norm": 1.6627490520477295,
"learning_rate": 2.5201381966534748e-05,
"loss": 0.4969,
"step": 204
},
{
"epoch": 0.06877950101146325,
"eval_loss": 0.43049055337905884,
"eval_runtime": 341.7654,
"eval_samples_per_second": 14.615,
"eval_steps_per_second": 1.829,
"step": 204
},
{
"epoch": 0.06979096426163182,
"grad_norm": 1.039337396621704,
"learning_rate": 2.459724913431772e-05,
"loss": 0.439,
"step": 207
},
{
"epoch": 0.07080242751180041,
"grad_norm": 1.222737431526184,
"learning_rate": 2.399335149726463e-05,
"loss": 0.4838,
"step": 210
},
{
"epoch": 0.07181389076196898,
"grad_norm": 1.278668999671936,
"learning_rate": 2.3390041714589514e-05,
"loss": 0.4612,
"step": 213
},
{
"epoch": 0.07282535401213756,
"grad_norm": 1.1525593996047974,
"learning_rate": 2.2787672102216042e-05,
"loss": 0.4372,
"step": 216
},
{
"epoch": 0.07383681726230613,
"grad_norm": 1.3022117614746094,
"learning_rate": 2.2186594427034864e-05,
"loss": 0.4593,
"step": 219
},
{
"epoch": 0.07484828051247472,
"grad_norm": 1.4199026823043823,
"learning_rate": 2.1587159701481716e-05,
"loss": 0.455,
"step": 222
},
{
"epoch": 0.07585974376264329,
"grad_norm": 1.3410009145736694,
"learning_rate": 2.098971797855599e-05,
"loss": 0.6084,
"step": 225
},
{
"epoch": 0.07687120701281187,
"grad_norm": 1.2653465270996094,
"learning_rate": 2.0394618147399713e-05,
"loss": 0.497,
"step": 228
},
{
"epoch": 0.07788267026298044,
"grad_norm": 1.2599753141403198,
"learning_rate": 1.980220772955602e-05,
"loss": 0.4794,
"step": 231
},
{
"epoch": 0.07889413351314903,
"grad_norm": 1.176132321357727,
"learning_rate": 1.921283267602643e-05,
"loss": 0.4134,
"step": 234
},
{
"epoch": 0.0799055967633176,
"grad_norm": 1.2982177734375,
"learning_rate": 1.8626837165245165e-05,
"loss": 0.4404,
"step": 237
},
{
"epoch": 0.08024275118004046,
"eval_loss": 0.4182414412498474,
"eval_runtime": 341.5144,
"eval_samples_per_second": 14.626,
"eval_steps_per_second": 1.83,
"step": 238
},
{
"epoch": 0.08091706001348618,
"grad_norm": 1.521083116531372,
"learning_rate": 1.8044563402088684e-05,
"loss": 0.4579,
"step": 240
},
{
"epoch": 0.08192852326365475,
"grad_norm": 1.1534286737442017,
"learning_rate": 1.746635141803761e-05,
"loss": 0.3893,
"step": 243
},
{
"epoch": 0.08293998651382332,
"grad_norm": 1.179457426071167,
"learning_rate": 1.6892538872607937e-05,
"loss": 0.428,
"step": 246
},
{
"epoch": 0.08395144976399191,
"grad_norm": 1.498482346534729,
"learning_rate": 1.6323460856167426e-05,
"loss": 0.414,
"step": 249
},
{
"epoch": 0.08496291301416048,
"grad_norm": 1.3838918209075928,
"learning_rate": 1.5759449694252226e-05,
"loss": 0.4113,
"step": 252
},
{
"epoch": 0.08597437626432906,
"grad_norm": 1.2871530055999756,
"learning_rate": 1.5200834753498128e-05,
"loss": 0.4945,
"step": 255
},
{
"epoch": 0.08698583951449763,
"grad_norm": 1.1573866605758667,
"learning_rate": 1.4647942249299707e-05,
"loss": 0.4448,
"step": 258
},
{
"epoch": 0.08799730276466622,
"grad_norm": 1.2284533977508545,
"learning_rate": 1.4101095055309746e-05,
"loss": 0.4248,
"step": 261
},
{
"epoch": 0.08900876601483479,
"grad_norm": 1.3865326642990112,
"learning_rate": 1.356061251489012e-05,
"loss": 0.5,
"step": 264
},
{
"epoch": 0.09002022926500337,
"grad_norm": 1.0498360395431519,
"learning_rate": 1.302681025462424e-05,
"loss": 0.3297,
"step": 267
},
{
"epoch": 0.09103169251517194,
"grad_norm": 1.1438897848129272,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.4251,
"step": 270
},
{
"epoch": 0.09170600134861767,
"eval_loss": 0.409618616104126,
"eval_runtime": 341.4948,
"eval_samples_per_second": 14.627,
"eval_steps_per_second": 1.83,
"step": 272
},
{
"epoch": 0.09204315576534053,
"grad_norm": 1.5004656314849854,
"learning_rate": 1.1980489393370938e-05,
"loss": 0.4688,
"step": 273
},
{
"epoch": 0.0930546190155091,
"grad_norm": 1.2165626287460327,
"learning_rate": 1.1468581814301717e-05,
"loss": 0.4862,
"step": 276
},
{
"epoch": 0.09406608226567768,
"grad_norm": 1.4357872009277344,
"learning_rate": 1.096457620240298e-05,
"loss": 0.4654,
"step": 279
},
{
"epoch": 0.09507754551584625,
"grad_norm": 1.3055099248886108,
"learning_rate": 1.0468766882759094e-05,
"loss": 0.4198,
"step": 282
},
{
"epoch": 0.09608900876601484,
"grad_norm": 1.3729138374328613,
"learning_rate": 9.981443394050525e-06,
"loss": 0.423,
"step": 285
},
{
"epoch": 0.09710047201618341,
"grad_norm": 1.2119735479354858,
"learning_rate": 9.502890319471491e-06,
"loss": 0.4302,
"step": 288
},
{
"epoch": 0.098111935266352,
"grad_norm": 1.487874984741211,
"learning_rate": 9.033387120541306e-06,
"loss": 0.4843,
"step": 291
},
{
"epoch": 0.09912339851652056,
"grad_norm": 1.3085728883743286,
"learning_rate": 8.573207973906735e-06,
"loss": 0.4358,
"step": 294
},
{
"epoch": 0.10013486176668915,
"grad_norm": 1.1197856664657593,
"learning_rate": 8.1226216112306e-06,
"loss": 0.392,
"step": 297
},
{
"epoch": 0.10114632501685772,
"grad_norm": 1.2706650495529175,
"learning_rate": 7.681891162260015e-06,
"loss": 0.4164,
"step": 300
},
{
"epoch": 0.1021577882670263,
"grad_norm": 1.287969708442688,
"learning_rate": 7.251274001166044e-06,
"loss": 0.5041,
"step": 303
},
{
"epoch": 0.10316925151719487,
"grad_norm": 1.357833743095398,
"learning_rate": 6.831021596244424e-06,
"loss": 0.3919,
"step": 306
},
{
"epoch": 0.10316925151719487,
"eval_loss": 0.4043685495853424,
"eval_runtime": 341.6981,
"eval_samples_per_second": 14.618,
"eval_steps_per_second": 1.829,
"step": 306
},
{
"epoch": 0.10418071476736346,
"grad_norm": 1.1861456632614136,
"learning_rate": 6.421379363065142e-06,
"loss": 0.3585,
"step": 309
},
{
"epoch": 0.10519217801753203,
"grad_norm": 1.4933161735534668,
"learning_rate": 6.022586521156715e-06,
"loss": 0.3932,
"step": 312
},
{
"epoch": 0.10620364126770061,
"grad_norm": 1.3899950981140137,
"learning_rate": 5.634875954308638e-06,
"loss": 0.5755,
"step": 315
},
{
"epoch": 0.10721510451786918,
"grad_norm": 1.18565833568573,
"learning_rate": 5.258474074573877e-06,
"loss": 0.3489,
"step": 318
},
{
"epoch": 0.10822656776803777,
"grad_norm": 1.3438397645950317,
"learning_rate": 4.893600690050579e-06,
"loss": 0.3942,
"step": 321
},
{
"epoch": 0.10923803101820634,
"grad_norm": 1.1274303197860718,
"learning_rate": 4.540468876520323e-06,
"loss": 0.3829,
"step": 324
},
{
"epoch": 0.11024949426837491,
"grad_norm": 1.4088205099105835,
"learning_rate": 4.199284853017896e-06,
"loss": 0.4908,
"step": 327
},
{
"epoch": 0.1112609575185435,
"grad_norm": 1.07282292842865,
"learning_rate": 3.8702478614051355e-06,
"loss": 0.4099,
"step": 330
},
{
"epoch": 0.11227242076871206,
"grad_norm": 1.2825994491577148,
"learning_rate": 3.5535500500193357e-06,
"loss": 0.4394,
"step": 333
},
{
"epoch": 0.11328388401888065,
"grad_norm": 1.3308430910110474,
"learning_rate": 3.249376361464021e-06,
"loss": 0.4331,
"step": 336
},
{
"epoch": 0.11429534726904922,
"grad_norm": 1.5105128288269043,
"learning_rate": 2.957904424607652e-06,
"loss": 0.4308,
"step": 339
},
{
"epoch": 0.11463250168577209,
"eval_loss": 0.4016662836074829,
"eval_runtime": 341.6809,
"eval_samples_per_second": 14.619,
"eval_steps_per_second": 1.829,
"step": 340
},
{
"epoch": 0.1153068105192178,
"grad_norm": 1.3175163269042969,
"learning_rate": 2.679304450853401e-06,
"loss": 0.391,
"step": 342
},
{
"epoch": 0.11631827376938637,
"grad_norm": 1.3113083839416504,
"learning_rate": 2.4137391347404476e-06,
"loss": 0.3967,
"step": 345
},
{
"epoch": 0.11732973701955496,
"grad_norm": 1.2265738248825073,
"learning_rate": 2.1613635589349756e-06,
"loss": 0.3237,
"step": 348
},
{
"epoch": 0.11834120026972353,
"grad_norm": 1.3466339111328125,
"learning_rate": 1.922325103666281e-06,
"loss": 0.3573,
"step": 351
},
{
"epoch": 0.11935266351989211,
"grad_norm": 1.2574374675750732,
"learning_rate": 1.696763360660808e-06,
"loss": 0.3907,
"step": 354
},
{
"epoch": 0.12036412677006068,
"grad_norm": 1.1374329328536987,
"learning_rate": 1.4848100516245717e-06,
"loss": 0.3421,
"step": 357
},
{
"epoch": 0.12137559002022927,
"grad_norm": 1.2535523176193237,
"learning_rate": 1.286588951321363e-06,
"loss": 0.4246,
"step": 360
},
{
"epoch": 0.12238705327039784,
"grad_norm": 1.125602126121521,
"learning_rate": 1.102215815291774e-06,
"loss": 0.408,
"step": 363
},
{
"epoch": 0.12339851652056642,
"grad_norm": 0.9282971024513245,
"learning_rate": 9.317983122552332e-07,
"loss": 0.3658,
"step": 366
},
{
"epoch": 0.124409979770735,
"grad_norm": 1.376049280166626,
"learning_rate": 7.754359612344859e-07,
"loss": 0.4305,
"step": 369
},
{
"epoch": 0.12542144302090358,
"grad_norm": 1.3448472023010254,
"learning_rate": 6.332200734393057e-07,
"loss": 0.3743,
"step": 372
},
{
"epoch": 0.1260957518543493,
"eval_loss": 0.4007108509540558,
"eval_runtime": 341.2867,
"eval_samples_per_second": 14.636,
"eval_steps_per_second": 1.831,
"step": 374
},
{
"epoch": 0.12643290627107215,
"grad_norm": 1.2979270219802856,
"learning_rate": 5.052336989433082e-07,
"loss": 0.3374,
"step": 375
},
{
"epoch": 0.12744436952124072,
"grad_norm": 1.2179648876190186,
"learning_rate": 3.915515781850565e-07,
"loss": 0.3698,
"step": 378
},
{
"epoch": 0.12845583277140932,
"grad_norm": 1.2433198690414429,
"learning_rate": 2.922400983217416e-07,
"loss": 0.3986,
"step": 381
},
{
"epoch": 0.1294672960215779,
"grad_norm": 0.9717249274253845,
"learning_rate": 2.0735725446094923e-07,
"loss": 0.3977,
"step": 384
},
{
"epoch": 0.13047875927174646,
"grad_norm": 1.3990002870559692,
"learning_rate": 1.3695261579316777e-07,
"loss": 0.4376,
"step": 387
},
{
"epoch": 0.13149022252191503,
"grad_norm": 1.29855477809906,
"learning_rate": 8.106729664475176e-08,
"loss": 0.4265,
"step": 390
},
{
"epoch": 0.13250168577208363,
"grad_norm": 1.3467254638671875,
"learning_rate": 3.9733932468333234e-08,
"loss": 0.407,
"step": 393
},
{
"epoch": 0.1335131490222522,
"grad_norm": 1.5646733045578003,
"learning_rate": 1.297666078462767e-08,
"loss": 0.4872,
"step": 396
},
{
"epoch": 0.13452461227242077,
"grad_norm": 1.2246640920639038,
"learning_rate": 8.111070868010995e-10,
"loss": 0.4088,
"step": 399
}
],
"logging_steps": 3,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 34,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.592309546614784e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}