Qwarkstar-4B-Instruct-Preview / trainer_state.json
qingy2024's picture
Upload checkpoint 3300
21f5eae verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9786476868327402,
"eval_steps": 500,
"global_step": 3300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008896797153024911,
"grad_norm": 1.0390625,
"learning_rate": 6e-05,
"loss": 1.0103,
"step": 3
},
{
"epoch": 0.0017793594306049821,
"grad_norm": 0.85546875,
"learning_rate": 0.00012,
"loss": 0.9297,
"step": 6
},
{
"epoch": 0.0026690391459074734,
"grad_norm": 1.125,
"learning_rate": 0.00018,
"loss": 0.9383,
"step": 9
},
{
"epoch": 0.0035587188612099642,
"grad_norm": 0.8046875,
"learning_rate": 0.00019999982536383071,
"loss": 0.9338,
"step": 12
},
{
"epoch": 0.004448398576512456,
"grad_norm": 0.515625,
"learning_rate": 0.00019999890852560968,
"loss": 0.8937,
"step": 15
},
{
"epoch": 0.005338078291814947,
"grad_norm": 0.474609375,
"learning_rate": 0.00019999720583349016,
"loss": 0.8876,
"step": 18
},
{
"epoch": 0.006227758007117438,
"grad_norm": 0.400390625,
"learning_rate": 0.000199994717300853,
"loss": 0.8804,
"step": 21
},
{
"epoch": 0.0071174377224199285,
"grad_norm": 0.41796875,
"learning_rate": 0.00019999144294725462,
"loss": 0.8889,
"step": 24
},
{
"epoch": 0.00800711743772242,
"grad_norm": 0.39453125,
"learning_rate": 0.0001999873827984269,
"loss": 0.8664,
"step": 27
},
{
"epoch": 0.008896797153024912,
"grad_norm": 0.416015625,
"learning_rate": 0.00019998253688627705,
"loss": 0.876,
"step": 30
},
{
"epoch": 0.009786476868327402,
"grad_norm": 0.39453125,
"learning_rate": 0.00019997690524888734,
"loss": 0.8375,
"step": 33
},
{
"epoch": 0.010676156583629894,
"grad_norm": 0.39453125,
"learning_rate": 0.0001999704879305146,
"loss": 0.8698,
"step": 36
},
{
"epoch": 0.011565836298932384,
"grad_norm": 0.369140625,
"learning_rate": 0.0001999632849815902,
"loss": 0.8346,
"step": 39
},
{
"epoch": 0.012455516014234875,
"grad_norm": 0.37109375,
"learning_rate": 0.00019995529645871934,
"loss": 0.8573,
"step": 42
},
{
"epoch": 0.013345195729537367,
"grad_norm": 0.42578125,
"learning_rate": 0.0001999465224246809,
"loss": 0.8411,
"step": 45
},
{
"epoch": 0.014234875444839857,
"grad_norm": 0.408203125,
"learning_rate": 0.00019993696294842668,
"loss": 0.8484,
"step": 48
},
{
"epoch": 0.015124555160142349,
"grad_norm": 0.41015625,
"learning_rate": 0.000199926618105081,
"loss": 0.8172,
"step": 51
},
{
"epoch": 0.01601423487544484,
"grad_norm": 0.384765625,
"learning_rate": 0.00019991548797594015,
"loss": 0.8481,
"step": 54
},
{
"epoch": 0.016903914590747332,
"grad_norm": 0.421875,
"learning_rate": 0.0001999035726484716,
"loss": 0.8458,
"step": 57
},
{
"epoch": 0.017793594306049824,
"grad_norm": 0.390625,
"learning_rate": 0.00019989087221631343,
"loss": 0.8321,
"step": 60
},
{
"epoch": 0.018683274021352312,
"grad_norm": 0.4140625,
"learning_rate": 0.00019987738677927365,
"loss": 0.8511,
"step": 63
},
{
"epoch": 0.019572953736654804,
"grad_norm": 0.375,
"learning_rate": 0.00019986311644332915,
"loss": 0.8324,
"step": 66
},
{
"epoch": 0.020462633451957295,
"grad_norm": 0.341796875,
"learning_rate": 0.00019984806132062517,
"loss": 0.8368,
"step": 69
},
{
"epoch": 0.021352313167259787,
"grad_norm": 0.498046875,
"learning_rate": 0.00019983222152947428,
"loss": 0.8062,
"step": 72
},
{
"epoch": 0.02224199288256228,
"grad_norm": 0.484375,
"learning_rate": 0.0001998155971943555,
"loss": 0.8329,
"step": 75
},
{
"epoch": 0.023131672597864767,
"grad_norm": 0.44921875,
"learning_rate": 0.00019979818844591317,
"loss": 0.8214,
"step": 78
},
{
"epoch": 0.02402135231316726,
"grad_norm": 0.4765625,
"learning_rate": 0.00019977999542095617,
"loss": 0.8077,
"step": 81
},
{
"epoch": 0.02491103202846975,
"grad_norm": 0.38671875,
"learning_rate": 0.0001997610182624566,
"loss": 0.8004,
"step": 84
},
{
"epoch": 0.025800711743772242,
"grad_norm": 0.447265625,
"learning_rate": 0.0001997412571195489,
"loss": 0.8649,
"step": 87
},
{
"epoch": 0.026690391459074734,
"grad_norm": 0.37109375,
"learning_rate": 0.0001997207121475284,
"loss": 0.82,
"step": 90
},
{
"epoch": 0.027580071174377226,
"grad_norm": 0.40234375,
"learning_rate": 0.00019969938350785035,
"loss": 0.8353,
"step": 93
},
{
"epoch": 0.028469750889679714,
"grad_norm": 0.41015625,
"learning_rate": 0.00019967727136812856,
"loss": 0.8006,
"step": 96
},
{
"epoch": 0.029359430604982206,
"grad_norm": 0.376953125,
"learning_rate": 0.000199654375902134,
"loss": 0.8127,
"step": 99
},
{
"epoch": 0.030249110320284697,
"grad_norm": 0.37890625,
"learning_rate": 0.00019963069728979357,
"loss": 0.8029,
"step": 102
},
{
"epoch": 0.03113879003558719,
"grad_norm": 0.400390625,
"learning_rate": 0.00019960623571718862,
"loss": 0.8162,
"step": 105
},
{
"epoch": 0.03202846975088968,
"grad_norm": 0.36328125,
"learning_rate": 0.0001995809913765534,
"loss": 0.7928,
"step": 108
},
{
"epoch": 0.03291814946619217,
"grad_norm": 0.39453125,
"learning_rate": 0.00019955496446627375,
"loss": 0.8011,
"step": 111
},
{
"epoch": 0.033807829181494664,
"grad_norm": 0.36328125,
"learning_rate": 0.0001995281551908854,
"loss": 0.7874,
"step": 114
},
{
"epoch": 0.03469750889679715,
"grad_norm": 0.37890625,
"learning_rate": 0.00019950056376107238,
"loss": 0.827,
"step": 117
},
{
"epoch": 0.03558718861209965,
"grad_norm": 0.375,
"learning_rate": 0.00019947219039366537,
"loss": 0.7829,
"step": 120
},
{
"epoch": 0.036476868327402136,
"grad_norm": 0.36328125,
"learning_rate": 0.00019944303531164005,
"loss": 0.8184,
"step": 123
},
{
"epoch": 0.037366548042704624,
"grad_norm": 0.388671875,
"learning_rate": 0.00019941309874411524,
"loss": 0.8005,
"step": 126
},
{
"epoch": 0.03825622775800712,
"grad_norm": 0.357421875,
"learning_rate": 0.0001993823809263512,
"loss": 0.8199,
"step": 129
},
{
"epoch": 0.03914590747330961,
"grad_norm": 0.35546875,
"learning_rate": 0.00019935088209974773,
"loss": 0.8066,
"step": 132
},
{
"epoch": 0.0400355871886121,
"grad_norm": 0.34375,
"learning_rate": 0.0001993186025118423,
"loss": 0.7823,
"step": 135
},
{
"epoch": 0.04092526690391459,
"grad_norm": 0.345703125,
"learning_rate": 0.00019928554241630802,
"loss": 0.7799,
"step": 138
},
{
"epoch": 0.04181494661921708,
"grad_norm": 0.337890625,
"learning_rate": 0.0001992517020729519,
"loss": 0.7996,
"step": 141
},
{
"epoch": 0.042704626334519574,
"grad_norm": 0.37890625,
"learning_rate": 0.00019921708174771237,
"loss": 0.8068,
"step": 144
},
{
"epoch": 0.04359430604982206,
"grad_norm": 0.369140625,
"learning_rate": 0.00019918168171265764,
"loss": 0.8197,
"step": 147
},
{
"epoch": 0.04448398576512456,
"grad_norm": 0.390625,
"learning_rate": 0.0001991455022459833,
"loss": 0.8255,
"step": 150
},
{
"epoch": 0.045373665480427046,
"grad_norm": 0.388671875,
"learning_rate": 0.00019910854363201018,
"loss": 0.823,
"step": 153
},
{
"epoch": 0.046263345195729534,
"grad_norm": 0.396484375,
"learning_rate": 0.00019907080616118222,
"loss": 0.7878,
"step": 156
},
{
"epoch": 0.04715302491103203,
"grad_norm": 0.40625,
"learning_rate": 0.00019903229013006394,
"loss": 0.7548,
"step": 159
},
{
"epoch": 0.04804270462633452,
"grad_norm": 0.33984375,
"learning_rate": 0.00019899299584133845,
"loss": 0.7965,
"step": 162
},
{
"epoch": 0.04893238434163701,
"grad_norm": 0.400390625,
"learning_rate": 0.0001989529236038048,
"loss": 0.7848,
"step": 165
},
{
"epoch": 0.0498220640569395,
"grad_norm": 0.353515625,
"learning_rate": 0.0001989120737323757,
"loss": 0.7979,
"step": 168
},
{
"epoch": 0.050711743772241996,
"grad_norm": 0.369140625,
"learning_rate": 0.00019887044654807488,
"loss": 0.747,
"step": 171
},
{
"epoch": 0.051601423487544484,
"grad_norm": 0.380859375,
"learning_rate": 0.00019882804237803488,
"loss": 0.811,
"step": 174
},
{
"epoch": 0.05249110320284697,
"grad_norm": 0.33984375,
"learning_rate": 0.00019878486155549405,
"loss": 0.7585,
"step": 177
},
{
"epoch": 0.05338078291814947,
"grad_norm": 0.373046875,
"learning_rate": 0.0001987409044197943,
"loss": 0.7964,
"step": 180
},
{
"epoch": 0.054270462633451956,
"grad_norm": 0.365234375,
"learning_rate": 0.0001986961713163783,
"loss": 0.7923,
"step": 183
},
{
"epoch": 0.05516014234875445,
"grad_norm": 0.349609375,
"learning_rate": 0.0001986506625967867,
"loss": 0.7821,
"step": 186
},
{
"epoch": 0.05604982206405694,
"grad_norm": 0.349609375,
"learning_rate": 0.00019860437861865546,
"loss": 0.75,
"step": 189
},
{
"epoch": 0.05693950177935943,
"grad_norm": 0.373046875,
"learning_rate": 0.00019855731974571298,
"loss": 0.7976,
"step": 192
},
{
"epoch": 0.05782918149466192,
"grad_norm": 0.361328125,
"learning_rate": 0.0001985094863477773,
"loss": 0.7894,
"step": 195
},
{
"epoch": 0.05871886120996441,
"grad_norm": 0.38671875,
"learning_rate": 0.00019846087880075314,
"loss": 0.7952,
"step": 198
},
{
"epoch": 0.059608540925266906,
"grad_norm": 0.376953125,
"learning_rate": 0.00019841149748662894,
"loss": 0.8058,
"step": 201
},
{
"epoch": 0.060498220640569395,
"grad_norm": 0.36328125,
"learning_rate": 0.000198361342793474,
"loss": 0.7727,
"step": 204
},
{
"epoch": 0.06138790035587188,
"grad_norm": 0.349609375,
"learning_rate": 0.00019831041511543515,
"loss": 0.7792,
"step": 207
},
{
"epoch": 0.06227758007117438,
"grad_norm": 0.390625,
"learning_rate": 0.00019825871485273396,
"loss": 0.8123,
"step": 210
},
{
"epoch": 0.06316725978647687,
"grad_norm": 0.400390625,
"learning_rate": 0.00019820624241166334,
"loss": 0.7966,
"step": 213
},
{
"epoch": 0.06405693950177936,
"grad_norm": 0.369140625,
"learning_rate": 0.00019815299820458458,
"loss": 0.7883,
"step": 216
},
{
"epoch": 0.06494661921708185,
"grad_norm": 0.353515625,
"learning_rate": 0.00019809898264992385,
"loss": 0.7707,
"step": 219
},
{
"epoch": 0.06583629893238434,
"grad_norm": 0.353515625,
"learning_rate": 0.0001980441961721692,
"loss": 0.788,
"step": 222
},
{
"epoch": 0.06672597864768683,
"grad_norm": 0.333984375,
"learning_rate": 0.00019798863920186696,
"loss": 0.7759,
"step": 225
},
{
"epoch": 0.06761565836298933,
"grad_norm": 0.3359375,
"learning_rate": 0.0001979323121756185,
"loss": 0.7782,
"step": 228
},
{
"epoch": 0.06850533807829182,
"grad_norm": 0.3671875,
"learning_rate": 0.0001978752155360768,
"loss": 0.7417,
"step": 231
},
{
"epoch": 0.0693950177935943,
"grad_norm": 0.35546875,
"learning_rate": 0.00019781734973194293,
"loss": 0.7678,
"step": 234
},
{
"epoch": 0.07028469750889679,
"grad_norm": 0.376953125,
"learning_rate": 0.00019775871521796252,
"loss": 0.7867,
"step": 237
},
{
"epoch": 0.0711743772241993,
"grad_norm": 0.3984375,
"learning_rate": 0.00019769931245492222,
"loss": 0.807,
"step": 240
},
{
"epoch": 0.07206405693950178,
"grad_norm": 0.39453125,
"learning_rate": 0.00019763914190964609,
"loss": 0.7683,
"step": 243
},
{
"epoch": 0.07295373665480427,
"grad_norm": 0.333984375,
"learning_rate": 0.0001975782040549918,
"loss": 0.7412,
"step": 246
},
{
"epoch": 0.07384341637010676,
"grad_norm": 0.33203125,
"learning_rate": 0.0001975164993698471,
"loss": 0.7793,
"step": 249
},
{
"epoch": 0.07473309608540925,
"grad_norm": 0.37890625,
"learning_rate": 0.00019745402833912598,
"loss": 0.8096,
"step": 252
},
{
"epoch": 0.07562277580071175,
"grad_norm": 0.494140625,
"learning_rate": 0.00019739079145376484,
"loss": 0.771,
"step": 255
},
{
"epoch": 0.07651245551601424,
"grad_norm": 0.47265625,
"learning_rate": 0.0001973267892107186,
"loss": 0.776,
"step": 258
},
{
"epoch": 0.07740213523131673,
"grad_norm": 0.76171875,
"learning_rate": 0.00019726202211295686,
"loss": 0.7794,
"step": 261
},
{
"epoch": 0.07829181494661921,
"grad_norm": 0.353515625,
"learning_rate": 0.00019719649066945996,
"loss": 0.7751,
"step": 264
},
{
"epoch": 0.0791814946619217,
"grad_norm": 0.384765625,
"learning_rate": 0.000197130195395215,
"loss": 0.7957,
"step": 267
},
{
"epoch": 0.0800711743772242,
"grad_norm": 0.361328125,
"learning_rate": 0.0001970631368112115,
"loss": 0.7747,
"step": 270
},
{
"epoch": 0.0809608540925267,
"grad_norm": 0.3828125,
"learning_rate": 0.00019699531544443784,
"loss": 0.7677,
"step": 273
},
{
"epoch": 0.08185053380782918,
"grad_norm": 0.36328125,
"learning_rate": 0.00019692673182787666,
"loss": 0.7749,
"step": 276
},
{
"epoch": 0.08274021352313167,
"grad_norm": 0.43359375,
"learning_rate": 0.00019685738650050086,
"loss": 0.7795,
"step": 279
},
{
"epoch": 0.08362989323843416,
"grad_norm": 0.38671875,
"learning_rate": 0.00019678728000726935,
"loss": 0.7748,
"step": 282
},
{
"epoch": 0.08451957295373666,
"grad_norm": 0.546875,
"learning_rate": 0.0001967164128991227,
"loss": 0.7842,
"step": 285
},
{
"epoch": 0.08540925266903915,
"grad_norm": 0.39453125,
"learning_rate": 0.000196644785732979,
"loss": 0.7675,
"step": 288
},
{
"epoch": 0.08629893238434164,
"grad_norm": 0.37890625,
"learning_rate": 0.00019657239907172925,
"loss": 0.7593,
"step": 291
},
{
"epoch": 0.08718861209964412,
"grad_norm": 0.369140625,
"learning_rate": 0.00019649925348423305,
"loss": 0.7741,
"step": 294
},
{
"epoch": 0.08807829181494661,
"grad_norm": 0.3515625,
"learning_rate": 0.0001964253495453141,
"loss": 0.7842,
"step": 297
},
{
"epoch": 0.08896797153024912,
"grad_norm": 0.40625,
"learning_rate": 0.00019635068783575578,
"loss": 0.7615,
"step": 300
},
{
"epoch": 0.0898576512455516,
"grad_norm": 0.357421875,
"learning_rate": 0.0001962752689422964,
"loss": 0.7853,
"step": 303
},
{
"epoch": 0.09074733096085409,
"grad_norm": 0.326171875,
"learning_rate": 0.00019619909345762476,
"loss": 0.7412,
"step": 306
},
{
"epoch": 0.09163701067615658,
"grad_norm": 0.361328125,
"learning_rate": 0.00019612216198037542,
"loss": 0.7758,
"step": 309
},
{
"epoch": 0.09252669039145907,
"grad_norm": 0.359375,
"learning_rate": 0.00019604447511512396,
"loss": 0.7994,
"step": 312
},
{
"epoch": 0.09341637010676157,
"grad_norm": 0.357421875,
"learning_rate": 0.00019596603347238234,
"loss": 0.746,
"step": 315
},
{
"epoch": 0.09430604982206406,
"grad_norm": 0.341796875,
"learning_rate": 0.00019588683766859398,
"loss": 0.7644,
"step": 318
},
{
"epoch": 0.09519572953736655,
"grad_norm": 0.3515625,
"learning_rate": 0.000195806888326129,
"loss": 0.7574,
"step": 321
},
{
"epoch": 0.09608540925266904,
"grad_norm": 0.365234375,
"learning_rate": 0.00019572618607327925,
"loss": 0.7797,
"step": 324
},
{
"epoch": 0.09697508896797152,
"grad_norm": 0.34375,
"learning_rate": 0.00019564473154425348,
"loss": 0.7519,
"step": 327
},
{
"epoch": 0.09786476868327403,
"grad_norm": 26.375,
"learning_rate": 0.00019556252537917225,
"loss": 0.8056,
"step": 330
},
{
"epoch": 0.09875444839857651,
"grad_norm": 0.46484375,
"learning_rate": 0.000195479568224063,
"loss": 0.7589,
"step": 333
},
{
"epoch": 0.099644128113879,
"grad_norm": 0.375,
"learning_rate": 0.00019539586073085482,
"loss": 0.7666,
"step": 336
},
{
"epoch": 0.10053380782918149,
"grad_norm": 0.36328125,
"learning_rate": 0.00019531140355737354,
"loss": 0.7903,
"step": 339
},
{
"epoch": 0.10142348754448399,
"grad_norm": 0.388671875,
"learning_rate": 0.00019522619736733637,
"loss": 0.7793,
"step": 342
},
{
"epoch": 0.10231316725978648,
"grad_norm": 0.34765625,
"learning_rate": 0.00019514024283034682,
"loss": 0.7505,
"step": 345
},
{
"epoch": 0.10320284697508897,
"grad_norm": 0.388671875,
"learning_rate": 0.00019505354062188931,
"loss": 0.7642,
"step": 348
},
{
"epoch": 0.10409252669039146,
"grad_norm": 0.35546875,
"learning_rate": 0.00019496609142332397,
"loss": 0.7675,
"step": 351
},
{
"epoch": 0.10498220640569395,
"grad_norm": 0.361328125,
"learning_rate": 0.00019487789592188124,
"loss": 0.7635,
"step": 354
},
{
"epoch": 0.10587188612099645,
"grad_norm": 0.365234375,
"learning_rate": 0.00019478895481065645,
"loss": 0.7739,
"step": 357
},
{
"epoch": 0.10676156583629894,
"grad_norm": 0.33203125,
"learning_rate": 0.00019469926878860444,
"loss": 0.7302,
"step": 360
},
{
"epoch": 0.10765124555160142,
"grad_norm": 0.359375,
"learning_rate": 0.000194608838560534,
"loss": 0.7563,
"step": 363
},
{
"epoch": 0.10854092526690391,
"grad_norm": 0.341796875,
"learning_rate": 0.00019451766483710237,
"loss": 0.7473,
"step": 366
},
{
"epoch": 0.1094306049822064,
"grad_norm": 0.373046875,
"learning_rate": 0.00019442574833480962,
"loss": 0.7352,
"step": 369
},
{
"epoch": 0.1103202846975089,
"grad_norm": 0.330078125,
"learning_rate": 0.00019433308977599305,
"loss": 0.7561,
"step": 372
},
{
"epoch": 0.11120996441281139,
"grad_norm": 0.3359375,
"learning_rate": 0.0001942396898888215,
"loss": 0.7633,
"step": 375
},
{
"epoch": 0.11209964412811388,
"grad_norm": 0.33984375,
"learning_rate": 0.00019414554940728963,
"loss": 0.7221,
"step": 378
},
{
"epoch": 0.11298932384341637,
"grad_norm": 0.32421875,
"learning_rate": 0.0001940506690712122,
"loss": 0.7552,
"step": 381
},
{
"epoch": 0.11387900355871886,
"grad_norm": 0.35546875,
"learning_rate": 0.0001939550496262181,
"loss": 0.7563,
"step": 384
},
{
"epoch": 0.11476868327402136,
"grad_norm": 0.3125,
"learning_rate": 0.00019385869182374474,
"loss": 0.7643,
"step": 387
},
{
"epoch": 0.11565836298932385,
"grad_norm": 0.31640625,
"learning_rate": 0.0001937615964210319,
"loss": 0.7562,
"step": 390
},
{
"epoch": 0.11654804270462633,
"grad_norm": 0.30859375,
"learning_rate": 0.00019366376418111588,
"loss": 0.7425,
"step": 393
},
{
"epoch": 0.11743772241992882,
"grad_norm": 0.357421875,
"learning_rate": 0.0001935651958728236,
"loss": 0.7514,
"step": 396
},
{
"epoch": 0.11832740213523131,
"grad_norm": 0.328125,
"learning_rate": 0.0001934658922707664,
"loss": 0.7466,
"step": 399
},
{
"epoch": 0.11921708185053381,
"grad_norm": 0.341796875,
"learning_rate": 0.00019336585415533398,
"loss": 0.7557,
"step": 402
},
{
"epoch": 0.1201067615658363,
"grad_norm": 0.318359375,
"learning_rate": 0.00019326508231268838,
"loss": 0.7685,
"step": 405
},
{
"epoch": 0.12099644128113879,
"grad_norm": 0.4140625,
"learning_rate": 0.00019316357753475772,
"loss": 0.7632,
"step": 408
},
{
"epoch": 0.12188612099644128,
"grad_norm": 0.328125,
"learning_rate": 0.00019306134061922994,
"loss": 0.7484,
"step": 411
},
{
"epoch": 0.12277580071174377,
"grad_norm": 0.40625,
"learning_rate": 0.0001929583723695466,
"loss": 0.7623,
"step": 414
},
{
"epoch": 0.12366548042704627,
"grad_norm": 0.33984375,
"learning_rate": 0.00019285467359489653,
"loss": 0.7662,
"step": 417
},
{
"epoch": 0.12455516014234876,
"grad_norm": 0.341796875,
"learning_rate": 0.0001927502451102095,
"loss": 0.7346,
"step": 420
},
{
"epoch": 0.12544483985765126,
"grad_norm": 0.341796875,
"learning_rate": 0.00019264508773614972,
"loss": 0.7275,
"step": 423
},
{
"epoch": 0.12633451957295375,
"grad_norm": 0.34375,
"learning_rate": 0.00019253920229910965,
"loss": 0.7476,
"step": 426
},
{
"epoch": 0.12722419928825623,
"grad_norm": 0.359375,
"learning_rate": 0.00019243258963120313,
"loss": 0.7471,
"step": 429
},
{
"epoch": 0.12811387900355872,
"grad_norm": 0.37890625,
"learning_rate": 0.00019232525057025915,
"loss": 0.7575,
"step": 432
},
{
"epoch": 0.1290035587188612,
"grad_norm": 0.36328125,
"learning_rate": 0.00019221718595981507,
"loss": 0.7385,
"step": 435
},
{
"epoch": 0.1298932384341637,
"grad_norm": 0.3125,
"learning_rate": 0.00019210839664911012,
"loss": 0.7562,
"step": 438
},
{
"epoch": 0.1307829181494662,
"grad_norm": 0.34375,
"learning_rate": 0.00019199888349307872,
"loss": 0.7409,
"step": 441
},
{
"epoch": 0.13167259786476868,
"grad_norm": 0.341796875,
"learning_rate": 0.0001918886473523436,
"loss": 0.7473,
"step": 444
},
{
"epoch": 0.13256227758007116,
"grad_norm": 0.3359375,
"learning_rate": 0.00019177768909320927,
"loss": 0.7526,
"step": 447
},
{
"epoch": 0.13345195729537365,
"grad_norm": 0.361328125,
"learning_rate": 0.00019166600958765506,
"loss": 0.7723,
"step": 450
},
{
"epoch": 0.13434163701067617,
"grad_norm": 0.318359375,
"learning_rate": 0.00019155360971332826,
"loss": 0.7301,
"step": 453
},
{
"epoch": 0.13523131672597866,
"grad_norm": 0.3828125,
"learning_rate": 0.0001914404903535373,
"loss": 0.7629,
"step": 456
},
{
"epoch": 0.13612099644128114,
"grad_norm": 0.35546875,
"learning_rate": 0.00019132665239724486,
"loss": 0.7431,
"step": 459
},
{
"epoch": 0.13701067615658363,
"grad_norm": 0.30859375,
"learning_rate": 0.00019121209673906065,
"loss": 0.742,
"step": 462
},
{
"epoch": 0.13790035587188612,
"grad_norm": 0.33984375,
"learning_rate": 0.0001910968242792346,
"loss": 0.7517,
"step": 465
},
{
"epoch": 0.1387900355871886,
"grad_norm": 0.359375,
"learning_rate": 0.00019098083592364974,
"loss": 0.7371,
"step": 468
},
{
"epoch": 0.1396797153024911,
"grad_norm": 0.35546875,
"learning_rate": 0.00019086413258381506,
"loss": 0.7498,
"step": 471
},
{
"epoch": 0.14056939501779359,
"grad_norm": 0.3359375,
"learning_rate": 0.00019074671517685827,
"loss": 0.7289,
"step": 474
},
{
"epoch": 0.14145907473309607,
"grad_norm": 0.33203125,
"learning_rate": 0.00019062858462551874,
"loss": 0.7158,
"step": 477
},
{
"epoch": 0.1423487544483986,
"grad_norm": 0.357421875,
"learning_rate": 0.0001905097418581401,
"loss": 0.729,
"step": 480
},
{
"epoch": 0.14323843416370108,
"grad_norm": 0.328125,
"learning_rate": 0.00019039018780866312,
"loss": 0.7763,
"step": 483
},
{
"epoch": 0.14412811387900357,
"grad_norm": 0.34375,
"learning_rate": 0.00019026992341661817,
"loss": 0.7581,
"step": 486
},
{
"epoch": 0.14501779359430605,
"grad_norm": 0.337890625,
"learning_rate": 0.00019014894962711805,
"loss": 0.7664,
"step": 489
},
{
"epoch": 0.14590747330960854,
"grad_norm": 1.1328125,
"learning_rate": 0.00019002726739085027,
"loss": 0.7466,
"step": 492
},
{
"epoch": 0.14679715302491103,
"grad_norm": 0.33984375,
"learning_rate": 0.00018990487766406994,
"loss": 0.7463,
"step": 495
},
{
"epoch": 0.14768683274021352,
"grad_norm": 0.408203125,
"learning_rate": 0.000189781781408592,
"loss": 0.7512,
"step": 498
},
{
"epoch": 0.148576512455516,
"grad_norm": 0.3359375,
"learning_rate": 0.0001896579795917837,
"loss": 0.7208,
"step": 501
},
{
"epoch": 0.1494661921708185,
"grad_norm": 0.333984375,
"learning_rate": 0.00018953347318655705,
"loss": 0.7514,
"step": 504
},
{
"epoch": 0.15035587188612098,
"grad_norm": 0.35546875,
"learning_rate": 0.0001894082631713612,
"loss": 0.7525,
"step": 507
},
{
"epoch": 0.1512455516014235,
"grad_norm": 0.328125,
"learning_rate": 0.00018928235053017472,
"loss": 0.7335,
"step": 510
},
{
"epoch": 0.152135231316726,
"grad_norm": 0.3203125,
"learning_rate": 0.0001891557362524977,
"loss": 0.7331,
"step": 513
},
{
"epoch": 0.15302491103202848,
"grad_norm": 0.3203125,
"learning_rate": 0.00018902842133334432,
"loss": 0.7437,
"step": 516
},
{
"epoch": 0.15391459074733096,
"grad_norm": 0.33203125,
"learning_rate": 0.00018890040677323474,
"loss": 0.7029,
"step": 519
},
{
"epoch": 0.15480427046263345,
"grad_norm": 0.34765625,
"learning_rate": 0.0001887716935781873,
"loss": 0.7364,
"step": 522
},
{
"epoch": 0.15569395017793594,
"grad_norm": 0.330078125,
"learning_rate": 0.00018864228275971076,
"loss": 0.7084,
"step": 525
},
{
"epoch": 0.15658362989323843,
"grad_norm": 0.3203125,
"learning_rate": 0.00018851217533479616,
"loss": 0.7301,
"step": 528
},
{
"epoch": 0.15747330960854092,
"grad_norm": 0.302734375,
"learning_rate": 0.00018838137232590895,
"loss": 0.7352,
"step": 531
},
{
"epoch": 0.1583629893238434,
"grad_norm": 0.3125,
"learning_rate": 0.00018824987476098092,
"loss": 0.7245,
"step": 534
},
{
"epoch": 0.1592526690391459,
"grad_norm": 0.328125,
"learning_rate": 0.00018811768367340198,
"loss": 0.7509,
"step": 537
},
{
"epoch": 0.1601423487544484,
"grad_norm": 0.310546875,
"learning_rate": 0.0001879848001020124,
"loss": 0.732,
"step": 540
},
{
"epoch": 0.1610320284697509,
"grad_norm": 0.333984375,
"learning_rate": 0.00018785122509109426,
"loss": 0.7194,
"step": 543
},
{
"epoch": 0.1619217081850534,
"grad_norm": 0.359375,
"learning_rate": 0.00018771695969036344,
"loss": 0.7389,
"step": 546
},
{
"epoch": 0.16281138790035588,
"grad_norm": 0.328125,
"learning_rate": 0.00018758200495496132,
"loss": 0.7155,
"step": 549
},
{
"epoch": 0.16370106761565836,
"grad_norm": 0.30078125,
"learning_rate": 0.0001874463619454466,
"loss": 0.7561,
"step": 552
},
{
"epoch": 0.16459074733096085,
"grad_norm": 0.306640625,
"learning_rate": 0.0001873100317277867,
"loss": 0.7208,
"step": 555
},
{
"epoch": 0.16548042704626334,
"grad_norm": 0.2890625,
"learning_rate": 0.00018717301537334973,
"loss": 0.682,
"step": 558
},
{
"epoch": 0.16637010676156583,
"grad_norm": 0.33203125,
"learning_rate": 0.00018703531395889575,
"loss": 0.7293,
"step": 561
},
{
"epoch": 0.16725978647686832,
"grad_norm": 0.3203125,
"learning_rate": 0.00018689692856656853,
"loss": 0.7452,
"step": 564
},
{
"epoch": 0.16814946619217083,
"grad_norm": 0.328125,
"learning_rate": 0.00018675786028388692,
"loss": 0.73,
"step": 567
},
{
"epoch": 0.16903914590747332,
"grad_norm": 0.306640625,
"learning_rate": 0.0001866181102037364,
"loss": 0.728,
"step": 570
},
{
"epoch": 0.1699288256227758,
"grad_norm": 0.3046875,
"learning_rate": 0.00018647767942436038,
"loss": 0.7141,
"step": 573
},
{
"epoch": 0.1708185053380783,
"grad_norm": 0.296875,
"learning_rate": 0.0001863365690493517,
"loss": 0.728,
"step": 576
},
{
"epoch": 0.17170818505338079,
"grad_norm": 0.318359375,
"learning_rate": 0.00018619478018764378,
"loss": 0.7351,
"step": 579
},
{
"epoch": 0.17259786476868327,
"grad_norm": 0.310546875,
"learning_rate": 0.00018605231395350214,
"loss": 0.7291,
"step": 582
},
{
"epoch": 0.17348754448398576,
"grad_norm": 0.302734375,
"learning_rate": 0.00018590917146651544,
"loss": 0.7626,
"step": 585
},
{
"epoch": 0.17437722419928825,
"grad_norm": 0.302734375,
"learning_rate": 0.00018576535385158674,
"loss": 0.7216,
"step": 588
},
{
"epoch": 0.17526690391459074,
"grad_norm": 0.326171875,
"learning_rate": 0.00018562086223892474,
"loss": 0.7437,
"step": 591
},
{
"epoch": 0.17615658362989323,
"grad_norm": 0.349609375,
"learning_rate": 0.0001854756977640348,
"loss": 0.7354,
"step": 594
},
{
"epoch": 0.17704626334519574,
"grad_norm": 0.3046875,
"learning_rate": 0.00018532986156771008,
"loss": 0.7039,
"step": 597
},
{
"epoch": 0.17793594306049823,
"grad_norm": 0.3515625,
"learning_rate": 0.00018518335479602248,
"loss": 0.7506,
"step": 600
},
{
"epoch": 0.17882562277580072,
"grad_norm": 0.328125,
"learning_rate": 0.00018503617860031376,
"loss": 0.748,
"step": 603
},
{
"epoch": 0.1797153024911032,
"grad_norm": 0.322265625,
"learning_rate": 0.00018488833413718645,
"loss": 0.7402,
"step": 606
},
{
"epoch": 0.1806049822064057,
"grad_norm": 0.30859375,
"learning_rate": 0.00018473982256849466,
"loss": 0.7068,
"step": 609
},
{
"epoch": 0.18149466192170818,
"grad_norm": 0.3203125,
"learning_rate": 0.0001845906450613351,
"loss": 0.7265,
"step": 612
},
{
"epoch": 0.18238434163701067,
"grad_norm": 0.333984375,
"learning_rate": 0.0001844408027880378,
"loss": 0.7285,
"step": 615
},
{
"epoch": 0.18327402135231316,
"grad_norm": 0.3203125,
"learning_rate": 0.00018429029692615701,
"loss": 0.7226,
"step": 618
},
{
"epoch": 0.18416370106761565,
"grad_norm": 0.306640625,
"learning_rate": 0.00018413912865846178,
"loss": 0.7204,
"step": 621
},
{
"epoch": 0.18505338078291814,
"grad_norm": 0.3359375,
"learning_rate": 0.00018398729917292684,
"loss": 0.724,
"step": 624
},
{
"epoch": 0.18594306049822065,
"grad_norm": 0.302734375,
"learning_rate": 0.00018383480966272306,
"loss": 0.7032,
"step": 627
},
{
"epoch": 0.18683274021352314,
"grad_norm": 0.30859375,
"learning_rate": 0.00018368166132620836,
"loss": 0.7126,
"step": 630
},
{
"epoch": 0.18772241992882563,
"grad_norm": 0.328125,
"learning_rate": 0.00018352785536691804,
"loss": 0.7188,
"step": 633
},
{
"epoch": 0.18861209964412812,
"grad_norm": 0.427734375,
"learning_rate": 0.00018337339299355542,
"loss": 0.7207,
"step": 636
},
{
"epoch": 0.1895017793594306,
"grad_norm": 0.318359375,
"learning_rate": 0.00018321827541998228,
"loss": 0.7095,
"step": 639
},
{
"epoch": 0.1903914590747331,
"grad_norm": 0.283203125,
"learning_rate": 0.0001830625038652095,
"loss": 0.733,
"step": 642
},
{
"epoch": 0.19128113879003558,
"grad_norm": 0.29296875,
"learning_rate": 0.0001829060795533872,
"loss": 0.7192,
"step": 645
},
{
"epoch": 0.19217081850533807,
"grad_norm": 0.3046875,
"learning_rate": 0.00018274900371379542,
"loss": 0.7178,
"step": 648
},
{
"epoch": 0.19306049822064056,
"grad_norm": 0.30078125,
"learning_rate": 0.00018259127758083417,
"loss": 0.6983,
"step": 651
},
{
"epoch": 0.19395017793594305,
"grad_norm": 0.29296875,
"learning_rate": 0.00018243290239401404,
"loss": 0.7008,
"step": 654
},
{
"epoch": 0.19483985765124556,
"grad_norm": 0.291015625,
"learning_rate": 0.0001822738793979461,
"loss": 0.6886,
"step": 657
},
{
"epoch": 0.19572953736654805,
"grad_norm": 0.287109375,
"learning_rate": 0.0001821142098423325,
"loss": 0.6909,
"step": 660
},
{
"epoch": 0.19661921708185054,
"grad_norm": 0.302734375,
"learning_rate": 0.00018195389498195627,
"loss": 0.7114,
"step": 663
},
{
"epoch": 0.19750889679715303,
"grad_norm": 0.30078125,
"learning_rate": 0.00018179293607667178,
"loss": 0.7185,
"step": 666
},
{
"epoch": 0.19839857651245552,
"grad_norm": 0.3046875,
"learning_rate": 0.00018163133439139467,
"loss": 0.7284,
"step": 669
},
{
"epoch": 0.199288256227758,
"grad_norm": 0.31640625,
"learning_rate": 0.00018146909119609196,
"loss": 0.702,
"step": 672
},
{
"epoch": 0.2001779359430605,
"grad_norm": 0.314453125,
"learning_rate": 0.00018130620776577198,
"loss": 0.7232,
"step": 675
},
{
"epoch": 0.20106761565836298,
"grad_norm": 0.322265625,
"learning_rate": 0.00018114268538047456,
"loss": 0.7201,
"step": 678
},
{
"epoch": 0.20195729537366547,
"grad_norm": 0.31640625,
"learning_rate": 0.00018097852532526074,
"loss": 0.704,
"step": 681
},
{
"epoch": 0.20284697508896798,
"grad_norm": 0.3203125,
"learning_rate": 0.0001808137288902028,
"loss": 0.7134,
"step": 684
},
{
"epoch": 0.20373665480427047,
"grad_norm": 0.3203125,
"learning_rate": 0.0001806482973703741,
"loss": 0.7004,
"step": 687
},
{
"epoch": 0.20462633451957296,
"grad_norm": 0.3203125,
"learning_rate": 0.00018048223206583878,
"loss": 0.7124,
"step": 690
},
{
"epoch": 0.20551601423487545,
"grad_norm": 0.33203125,
"learning_rate": 0.00018031553428164186,
"loss": 0.7271,
"step": 693
},
{
"epoch": 0.20640569395017794,
"grad_norm": 0.30859375,
"learning_rate": 0.0001801482053277987,
"loss": 0.6942,
"step": 696
},
{
"epoch": 0.20729537366548043,
"grad_norm": 0.322265625,
"learning_rate": 0.00017998024651928464,
"loss": 0.7274,
"step": 699
},
{
"epoch": 0.20818505338078291,
"grad_norm": 0.328125,
"learning_rate": 0.000179811659176025,
"loss": 0.7046,
"step": 702
},
{
"epoch": 0.2090747330960854,
"grad_norm": 0.296875,
"learning_rate": 0.00017964244462288448,
"loss": 0.6981,
"step": 705
},
{
"epoch": 0.2099644128113879,
"grad_norm": 0.30859375,
"learning_rate": 0.0001794726041896567,
"loss": 0.7034,
"step": 708
},
{
"epoch": 0.21085409252669038,
"grad_norm": 0.31640625,
"learning_rate": 0.00017930213921105392,
"loss": 0.7157,
"step": 711
},
{
"epoch": 0.2117437722419929,
"grad_norm": 0.30859375,
"learning_rate": 0.00017913105102669642,
"loss": 0.7015,
"step": 714
},
{
"epoch": 0.21263345195729538,
"grad_norm": 0.333984375,
"learning_rate": 0.00017895934098110207,
"loss": 0.714,
"step": 717
},
{
"epoch": 0.21352313167259787,
"grad_norm": 0.31640625,
"learning_rate": 0.0001787870104236757,
"loss": 0.7185,
"step": 720
},
{
"epoch": 0.21441281138790036,
"grad_norm": 0.328125,
"learning_rate": 0.00017861406070869844,
"loss": 0.6969,
"step": 723
},
{
"epoch": 0.21530249110320285,
"grad_norm": 0.296875,
"learning_rate": 0.00017844049319531725,
"loss": 0.7188,
"step": 726
},
{
"epoch": 0.21619217081850534,
"grad_norm": 0.30078125,
"learning_rate": 0.00017826630924753408,
"loss": 0.7109,
"step": 729
},
{
"epoch": 0.21708185053380782,
"grad_norm": 0.32421875,
"learning_rate": 0.00017809151023419516,
"loss": 0.715,
"step": 732
},
{
"epoch": 0.2179715302491103,
"grad_norm": 0.341796875,
"learning_rate": 0.0001779160975289804,
"loss": 0.7103,
"step": 735
},
{
"epoch": 0.2188612099644128,
"grad_norm": 0.326171875,
"learning_rate": 0.00017774007251039245,
"loss": 0.7083,
"step": 738
},
{
"epoch": 0.2197508896797153,
"grad_norm": 0.32421875,
"learning_rate": 0.00017756343656174584,
"loss": 0.733,
"step": 741
},
{
"epoch": 0.2206405693950178,
"grad_norm": 0.3125,
"learning_rate": 0.00017738619107115618,
"loss": 0.7088,
"step": 744
},
{
"epoch": 0.2215302491103203,
"grad_norm": 0.322265625,
"learning_rate": 0.00017720833743152935,
"loss": 0.715,
"step": 747
},
{
"epoch": 0.22241992882562278,
"grad_norm": 0.3125,
"learning_rate": 0.0001770298770405503,
"loss": 0.707,
"step": 750
},
{
"epoch": 0.22330960854092527,
"grad_norm": 0.330078125,
"learning_rate": 0.0001768508113006723,
"loss": 0.6825,
"step": 753
},
{
"epoch": 0.22419928825622776,
"grad_norm": 0.31640625,
"learning_rate": 0.00017667114161910586,
"loss": 0.72,
"step": 756
},
{
"epoch": 0.22508896797153025,
"grad_norm": 0.31640625,
"learning_rate": 0.00017649086940780748,
"loss": 0.7141,
"step": 759
},
{
"epoch": 0.22597864768683273,
"grad_norm": 0.310546875,
"learning_rate": 0.00017630999608346886,
"loss": 0.6972,
"step": 762
},
{
"epoch": 0.22686832740213522,
"grad_norm": 0.30859375,
"learning_rate": 0.00017612852306750566,
"loss": 0.6851,
"step": 765
},
{
"epoch": 0.2277580071174377,
"grad_norm": 0.310546875,
"learning_rate": 0.00017594645178604611,
"loss": 0.7086,
"step": 768
},
{
"epoch": 0.22864768683274023,
"grad_norm": 0.302734375,
"learning_rate": 0.0001757637836699202,
"loss": 0.6971,
"step": 771
},
{
"epoch": 0.22953736654804271,
"grad_norm": 0.30078125,
"learning_rate": 0.000175580520154648,
"loss": 0.7095,
"step": 774
},
{
"epoch": 0.2304270462633452,
"grad_norm": 0.28515625,
"learning_rate": 0.0001753966626804288,
"loss": 0.6622,
"step": 777
},
{
"epoch": 0.2313167259786477,
"grad_norm": 0.3046875,
"learning_rate": 0.00017521221269212943,
"loss": 0.7095,
"step": 780
},
{
"epoch": 0.23220640569395018,
"grad_norm": 0.302734375,
"learning_rate": 0.00017502717163927315,
"loss": 0.7202,
"step": 783
},
{
"epoch": 0.23309608540925267,
"grad_norm": 0.345703125,
"learning_rate": 0.0001748415409760282,
"loss": 0.7216,
"step": 786
},
{
"epoch": 0.23398576512455516,
"grad_norm": 0.30078125,
"learning_rate": 0.00017465532216119625,
"loss": 0.6692,
"step": 789
},
{
"epoch": 0.23487544483985764,
"grad_norm": 0.3359375,
"learning_rate": 0.00017446851665820116,
"loss": 0.6887,
"step": 792
},
{
"epoch": 0.23576512455516013,
"grad_norm": 0.330078125,
"learning_rate": 0.00017428112593507723,
"loss": 0.7244,
"step": 795
},
{
"epoch": 0.23665480427046262,
"grad_norm": 0.302734375,
"learning_rate": 0.00017409315146445784,
"loss": 0.7044,
"step": 798
},
{
"epoch": 0.23754448398576514,
"grad_norm": 0.3046875,
"learning_rate": 0.00017390459472356383,
"loss": 0.7155,
"step": 801
},
{
"epoch": 0.23843416370106763,
"grad_norm": 0.314453125,
"learning_rate": 0.00017371545719419186,
"loss": 0.698,
"step": 804
},
{
"epoch": 0.2393238434163701,
"grad_norm": 0.34375,
"learning_rate": 0.00017352574036270282,
"loss": 0.6877,
"step": 807
},
{
"epoch": 0.2402135231316726,
"grad_norm": 0.310546875,
"learning_rate": 0.00017333544572001007,
"loss": 0.7019,
"step": 810
},
{
"epoch": 0.2411032028469751,
"grad_norm": 0.3046875,
"learning_rate": 0.00017314457476156782,
"loss": 0.711,
"step": 813
},
{
"epoch": 0.24199288256227758,
"grad_norm": 0.33203125,
"learning_rate": 0.00017295312898735934,
"loss": 0.7147,
"step": 816
},
{
"epoch": 0.24288256227758007,
"grad_norm": 0.380859375,
"learning_rate": 0.00017276110990188507,
"loss": 0.6981,
"step": 819
},
{
"epoch": 0.24377224199288255,
"grad_norm": 0.287109375,
"learning_rate": 0.000172568519014151,
"loss": 0.6907,
"step": 822
},
{
"epoch": 0.24466192170818504,
"grad_norm": 0.3203125,
"learning_rate": 0.00017237535783765662,
"loss": 0.7123,
"step": 825
},
{
"epoch": 0.24555160142348753,
"grad_norm": 0.306640625,
"learning_rate": 0.00017218162789038312,
"loss": 0.7002,
"step": 828
},
{
"epoch": 0.24644128113879005,
"grad_norm": 0.291015625,
"learning_rate": 0.00017198733069478153,
"loss": 0.6812,
"step": 831
},
{
"epoch": 0.24733096085409254,
"grad_norm": 0.330078125,
"learning_rate": 0.00017179246777776052,
"loss": 0.697,
"step": 834
},
{
"epoch": 0.24822064056939502,
"grad_norm": 0.3046875,
"learning_rate": 0.00017159704067067468,
"loss": 0.6925,
"step": 837
},
{
"epoch": 0.2491103202846975,
"grad_norm": 0.291015625,
"learning_rate": 0.0001714010509093123,
"loss": 0.7084,
"step": 840
},
{
"epoch": 0.25,
"grad_norm": 0.302734375,
"learning_rate": 0.00017120450003388338,
"loss": 0.6717,
"step": 843
},
{
"epoch": 0.2508896797153025,
"grad_norm": 0.298828125,
"learning_rate": 0.0001710073895890075,
"loss": 0.7053,
"step": 846
},
{
"epoch": 0.251779359430605,
"grad_norm": 0.287109375,
"learning_rate": 0.00017080972112370167,
"loss": 0.6479,
"step": 849
},
{
"epoch": 0.2526690391459075,
"grad_norm": 0.365234375,
"learning_rate": 0.00017061149619136817,
"loss": 0.6837,
"step": 852
},
{
"epoch": 0.25355871886120995,
"grad_norm": 0.341796875,
"learning_rate": 0.0001704127163497824,
"loss": 0.7102,
"step": 855
},
{
"epoch": 0.25444839857651247,
"grad_norm": 0.3203125,
"learning_rate": 0.0001702133831610805,
"loss": 0.7029,
"step": 858
},
{
"epoch": 0.25533807829181493,
"grad_norm": 0.31640625,
"learning_rate": 0.00017001349819174727,
"loss": 0.6871,
"step": 861
},
{
"epoch": 0.25622775800711745,
"grad_norm": 0.302734375,
"learning_rate": 0.00016981306301260357,
"loss": 0.6933,
"step": 864
},
{
"epoch": 0.2571174377224199,
"grad_norm": 0.3046875,
"learning_rate": 0.0001696120791987944,
"loss": 0.7173,
"step": 867
},
{
"epoch": 0.2580071174377224,
"grad_norm": 0.29296875,
"learning_rate": 0.000169410548329776,
"loss": 0.6973,
"step": 870
},
{
"epoch": 0.25889679715302494,
"grad_norm": 0.298828125,
"learning_rate": 0.0001692084719893039,
"loss": 0.6909,
"step": 873
},
{
"epoch": 0.2597864768683274,
"grad_norm": 0.28125,
"learning_rate": 0.0001690058517654203,
"loss": 0.7051,
"step": 876
},
{
"epoch": 0.2606761565836299,
"grad_norm": 0.29296875,
"learning_rate": 0.00016880268925044143,
"loss": 0.6671,
"step": 879
},
{
"epoch": 0.2615658362989324,
"grad_norm": 0.328125,
"learning_rate": 0.0001685989860409453,
"loss": 0.6904,
"step": 882
},
{
"epoch": 0.2624555160142349,
"grad_norm": 0.29296875,
"learning_rate": 0.00016839474373775892,
"loss": 0.7076,
"step": 885
},
{
"epoch": 0.26334519572953735,
"grad_norm": 0.314453125,
"learning_rate": 0.00016818996394594603,
"loss": 0.7155,
"step": 888
},
{
"epoch": 0.26423487544483987,
"grad_norm": 0.302734375,
"learning_rate": 0.00016798464827479404,
"loss": 0.6727,
"step": 891
},
{
"epoch": 0.26512455516014233,
"grad_norm": 0.28515625,
"learning_rate": 0.0001677787983378019,
"loss": 0.7061,
"step": 894
},
{
"epoch": 0.26601423487544484,
"grad_norm": 0.294921875,
"learning_rate": 0.00016757241575266694,
"loss": 0.6945,
"step": 897
},
{
"epoch": 0.2669039145907473,
"grad_norm": 0.294921875,
"learning_rate": 0.00016736550214127246,
"loss": 0.6812,
"step": 900
},
{
"epoch": 0.2677935943060498,
"grad_norm": 0.326171875,
"learning_rate": 0.0001671580591296749,
"loss": 0.7009,
"step": 903
},
{
"epoch": 0.26868327402135234,
"grad_norm": 0.34375,
"learning_rate": 0.00016695008834809107,
"loss": 0.6928,
"step": 906
},
{
"epoch": 0.2695729537366548,
"grad_norm": 0.294921875,
"learning_rate": 0.00016674159143088526,
"loss": 0.7009,
"step": 909
},
{
"epoch": 0.2704626334519573,
"grad_norm": 0.31640625,
"learning_rate": 0.00016653257001655652,
"loss": 0.6824,
"step": 912
},
{
"epoch": 0.2713523131672598,
"grad_norm": 0.30859375,
"learning_rate": 0.00016632302574772577,
"loss": 0.7052,
"step": 915
},
{
"epoch": 0.2722419928825623,
"grad_norm": 0.2890625,
"learning_rate": 0.0001661129602711227,
"loss": 0.6904,
"step": 918
},
{
"epoch": 0.27313167259786475,
"grad_norm": 0.333984375,
"learning_rate": 0.0001659023752375731,
"loss": 0.6945,
"step": 921
},
{
"epoch": 0.27402135231316727,
"grad_norm": 0.302734375,
"learning_rate": 0.00016569127230198577,
"loss": 0.6761,
"step": 924
},
{
"epoch": 0.2749110320284697,
"grad_norm": 0.27734375,
"learning_rate": 0.0001654796531233394,
"loss": 0.7051,
"step": 927
},
{
"epoch": 0.27580071174377224,
"grad_norm": 0.3203125,
"learning_rate": 0.00016526751936466974,
"loss": 0.6919,
"step": 930
},
{
"epoch": 0.27669039145907476,
"grad_norm": 0.279296875,
"learning_rate": 0.0001650548726930564,
"loss": 0.7075,
"step": 933
},
{
"epoch": 0.2775800711743772,
"grad_norm": 0.291015625,
"learning_rate": 0.00016484171477960976,
"loss": 0.6621,
"step": 936
},
{
"epoch": 0.27846975088967973,
"grad_norm": 0.314453125,
"learning_rate": 0.0001646280472994579,
"loss": 0.6956,
"step": 939
},
{
"epoch": 0.2793594306049822,
"grad_norm": 0.298828125,
"learning_rate": 0.00016441387193173336,
"loss": 0.6791,
"step": 942
},
{
"epoch": 0.2802491103202847,
"grad_norm": 0.306640625,
"learning_rate": 0.00016419919035956,
"loss": 0.6891,
"step": 945
},
{
"epoch": 0.28113879003558717,
"grad_norm": 0.314453125,
"learning_rate": 0.0001639840042700397,
"loss": 0.6948,
"step": 948
},
{
"epoch": 0.2820284697508897,
"grad_norm": 0.31640625,
"learning_rate": 0.00016376831535423923,
"loss": 0.6941,
"step": 951
},
{
"epoch": 0.28291814946619215,
"grad_norm": 0.32421875,
"learning_rate": 0.00016355212530717682,
"loss": 0.6737,
"step": 954
},
{
"epoch": 0.28380782918149466,
"grad_norm": 0.328125,
"learning_rate": 0.00016333543582780898,
"loss": 0.6966,
"step": 957
},
{
"epoch": 0.2846975088967972,
"grad_norm": 0.283203125,
"learning_rate": 0.00016311824861901694,
"loss": 0.6938,
"step": 960
},
{
"epoch": 0.28558718861209964,
"grad_norm": 0.306640625,
"learning_rate": 0.00016290056538759352,
"loss": 0.6963,
"step": 963
},
{
"epoch": 0.28647686832740216,
"grad_norm": 0.30078125,
"learning_rate": 0.00016268238784422954,
"loss": 0.6931,
"step": 966
},
{
"epoch": 0.2873665480427046,
"grad_norm": 0.298828125,
"learning_rate": 0.00016246371770350045,
"loss": 0.6698,
"step": 969
},
{
"epoch": 0.28825622775800713,
"grad_norm": 0.2890625,
"learning_rate": 0.00016224455668385282,
"loss": 0.6909,
"step": 972
},
{
"epoch": 0.2891459074733096,
"grad_norm": 0.30859375,
"learning_rate": 0.0001620249065075909,
"loss": 0.7073,
"step": 975
},
{
"epoch": 0.2900355871886121,
"grad_norm": 0.298828125,
"learning_rate": 0.00016180476890086297,
"loss": 0.6631,
"step": 978
},
{
"epoch": 0.29092526690391457,
"grad_norm": 0.3046875,
"learning_rate": 0.00016158414559364789,
"loss": 0.6814,
"step": 981
},
{
"epoch": 0.2918149466192171,
"grad_norm": 0.28515625,
"learning_rate": 0.00016136303831974146,
"loss": 0.6889,
"step": 984
},
{
"epoch": 0.29270462633451955,
"grad_norm": 0.298828125,
"learning_rate": 0.0001611414488167427,
"loss": 0.6689,
"step": 987
},
{
"epoch": 0.29359430604982206,
"grad_norm": 0.31640625,
"learning_rate": 0.00016091937882604048,
"loss": 0.6845,
"step": 990
},
{
"epoch": 0.2944839857651246,
"grad_norm": 0.298828125,
"learning_rate": 0.00016069683009279942,
"loss": 0.6819,
"step": 993
},
{
"epoch": 0.29537366548042704,
"grad_norm": 0.302734375,
"learning_rate": 0.0001604738043659466,
"loss": 0.6779,
"step": 996
},
{
"epoch": 0.29626334519572955,
"grad_norm": 0.294921875,
"learning_rate": 0.00016025030339815745,
"loss": 0.6603,
"step": 999
},
{
"epoch": 0.297153024911032,
"grad_norm": 0.41796875,
"learning_rate": 0.00016002632894584226,
"loss": 0.6968,
"step": 1002
},
{
"epoch": 0.29804270462633453,
"grad_norm": 0.294921875,
"learning_rate": 0.00015980188276913215,
"loss": 0.6903,
"step": 1005
},
{
"epoch": 0.298932384341637,
"grad_norm": 0.30078125,
"learning_rate": 0.00015957696663186546,
"loss": 0.6829,
"step": 1008
},
{
"epoch": 0.2998220640569395,
"grad_norm": 0.30078125,
"learning_rate": 0.00015935158230157367,
"loss": 0.67,
"step": 1011
},
{
"epoch": 0.30071174377224197,
"grad_norm": 0.3046875,
"learning_rate": 0.00015912573154946768,
"loss": 0.7109,
"step": 1014
},
{
"epoch": 0.3016014234875445,
"grad_norm": 0.28515625,
"learning_rate": 0.0001588994161504238,
"loss": 0.6877,
"step": 1017
},
{
"epoch": 0.302491103202847,
"grad_norm": 0.33203125,
"learning_rate": 0.00015867263788296984,
"loss": 0.6896,
"step": 1020
},
{
"epoch": 0.30338078291814946,
"grad_norm": 0.31640625,
"learning_rate": 0.00015844539852927109,
"loss": 0.7024,
"step": 1023
},
{
"epoch": 0.304270462633452,
"grad_norm": 0.306640625,
"learning_rate": 0.00015821769987511635,
"loss": 0.6913,
"step": 1026
},
{
"epoch": 0.30516014234875444,
"grad_norm": 0.3046875,
"learning_rate": 0.00015798954370990393,
"loss": 0.6893,
"step": 1029
},
{
"epoch": 0.30604982206405695,
"grad_norm": 0.296875,
"learning_rate": 0.0001577609318266275,
"loss": 0.6613,
"step": 1032
},
{
"epoch": 0.3069395017793594,
"grad_norm": 0.296875,
"learning_rate": 0.00015753186602186209,
"loss": 0.6827,
"step": 1035
},
{
"epoch": 0.30782918149466193,
"grad_norm": 0.287109375,
"learning_rate": 0.00015730234809574985,
"loss": 0.6789,
"step": 1038
},
{
"epoch": 0.3087188612099644,
"grad_norm": 0.29296875,
"learning_rate": 0.00015707237985198612,
"loss": 0.6798,
"step": 1041
},
{
"epoch": 0.3096085409252669,
"grad_norm": 0.294921875,
"learning_rate": 0.00015684196309780494,
"loss": 0.6736,
"step": 1044
},
{
"epoch": 0.3104982206405694,
"grad_norm": 0.3125,
"learning_rate": 0.0001566110996439652,
"loss": 0.6915,
"step": 1047
},
{
"epoch": 0.3113879003558719,
"grad_norm": 0.30078125,
"learning_rate": 0.0001563797913047361,
"loss": 0.6674,
"step": 1050
},
{
"epoch": 0.3122775800711744,
"grad_norm": 0.314453125,
"learning_rate": 0.00015614803989788314,
"loss": 0.6783,
"step": 1053
},
{
"epoch": 0.31316725978647686,
"grad_norm": 0.294921875,
"learning_rate": 0.00015591584724465363,
"loss": 0.6735,
"step": 1056
},
{
"epoch": 0.3140569395017794,
"grad_norm": 0.3203125,
"learning_rate": 0.00015568321516976248,
"loss": 0.6884,
"step": 1059
},
{
"epoch": 0.31494661921708184,
"grad_norm": 0.298828125,
"learning_rate": 0.00015545014550137786,
"loss": 0.6765,
"step": 1062
},
{
"epoch": 0.31583629893238435,
"grad_norm": 0.28515625,
"learning_rate": 0.00015521664007110691,
"loss": 0.6778,
"step": 1065
},
{
"epoch": 0.3167259786476868,
"grad_norm": 0.310546875,
"learning_rate": 0.00015498270071398116,
"loss": 0.6766,
"step": 1068
},
{
"epoch": 0.31761565836298933,
"grad_norm": 0.302734375,
"learning_rate": 0.00015474832926844223,
"loss": 0.6813,
"step": 1071
},
{
"epoch": 0.3185053380782918,
"grad_norm": 0.3046875,
"learning_rate": 0.00015451352757632733,
"loss": 0.6756,
"step": 1074
},
{
"epoch": 0.3193950177935943,
"grad_norm": 0.294921875,
"learning_rate": 0.0001542782974828549,
"loss": 0.6401,
"step": 1077
},
{
"epoch": 0.3202846975088968,
"grad_norm": 0.306640625,
"learning_rate": 0.00015404264083660992,
"loss": 0.6928,
"step": 1080
},
{
"epoch": 0.3211743772241993,
"grad_norm": 0.291015625,
"learning_rate": 0.00015380655948952961,
"loss": 0.6618,
"step": 1083
},
{
"epoch": 0.3220640569395018,
"grad_norm": 0.296875,
"learning_rate": 0.00015357005529688866,
"loss": 0.7026,
"step": 1086
},
{
"epoch": 0.32295373665480426,
"grad_norm": 0.287109375,
"learning_rate": 0.00015333313011728478,
"loss": 0.6659,
"step": 1089
},
{
"epoch": 0.3238434163701068,
"grad_norm": 0.291015625,
"learning_rate": 0.00015309578581262402,
"loss": 0.6674,
"step": 1092
},
{
"epoch": 0.32473309608540923,
"grad_norm": 0.302734375,
"learning_rate": 0.00015285802424810626,
"loss": 0.6893,
"step": 1095
},
{
"epoch": 0.32562277580071175,
"grad_norm": 0.3046875,
"learning_rate": 0.00015261984729221038,
"loss": 0.6829,
"step": 1098
},
{
"epoch": 0.3265124555160142,
"grad_norm": 0.3125,
"learning_rate": 0.00015238125681667973,
"loss": 0.6579,
"step": 1101
},
{
"epoch": 0.3274021352313167,
"grad_norm": 0.298828125,
"learning_rate": 0.00015214225469650726,
"loss": 0.6546,
"step": 1104
},
{
"epoch": 0.32829181494661924,
"grad_norm": 0.3046875,
"learning_rate": 0.00015190284280992107,
"loss": 0.6665,
"step": 1107
},
{
"epoch": 0.3291814946619217,
"grad_norm": 0.283203125,
"learning_rate": 0.00015166302303836927,
"loss": 0.6877,
"step": 1110
},
{
"epoch": 0.3300711743772242,
"grad_norm": 0.296875,
"learning_rate": 0.00015142279726650543,
"loss": 0.6823,
"step": 1113
},
{
"epoch": 0.3309608540925267,
"grad_norm": 0.306640625,
"learning_rate": 0.0001511821673821738,
"loss": 0.6788,
"step": 1116
},
{
"epoch": 0.3318505338078292,
"grad_norm": 0.296875,
"learning_rate": 0.0001509411352763943,
"loss": 0.6804,
"step": 1119
},
{
"epoch": 0.33274021352313166,
"grad_norm": 0.298828125,
"learning_rate": 0.00015069970284334785,
"loss": 0.6902,
"step": 1122
},
{
"epoch": 0.33362989323843417,
"grad_norm": 0.3046875,
"learning_rate": 0.00015045787198036132,
"loss": 0.6713,
"step": 1125
},
{
"epoch": 0.33451957295373663,
"grad_norm": 0.2890625,
"learning_rate": 0.0001502156445878927,
"loss": 0.6787,
"step": 1128
},
{
"epoch": 0.33540925266903915,
"grad_norm": 0.29296875,
"learning_rate": 0.00014997302256951624,
"loss": 0.6657,
"step": 1131
},
{
"epoch": 0.33629893238434166,
"grad_norm": 0.296875,
"learning_rate": 0.00014973000783190726,
"loss": 0.657,
"step": 1134
},
{
"epoch": 0.3371886120996441,
"grad_norm": 0.296875,
"learning_rate": 0.00014948660228482745,
"loss": 0.6838,
"step": 1137
},
{
"epoch": 0.33807829181494664,
"grad_norm": 0.298828125,
"learning_rate": 0.00014924280784110963,
"loss": 0.6536,
"step": 1140
},
{
"epoch": 0.3389679715302491,
"grad_norm": 0.294921875,
"learning_rate": 0.00014899862641664288,
"loss": 0.6654,
"step": 1143
},
{
"epoch": 0.3398576512455516,
"grad_norm": 0.30078125,
"learning_rate": 0.0001487540599303574,
"loss": 0.6894,
"step": 1146
},
{
"epoch": 0.3407473309608541,
"grad_norm": 0.296875,
"learning_rate": 0.0001485091103042094,
"loss": 0.6429,
"step": 1149
},
{
"epoch": 0.3416370106761566,
"grad_norm": 0.294921875,
"learning_rate": 0.0001482637794631661,
"loss": 0.6806,
"step": 1152
},
{
"epoch": 0.34252669039145905,
"grad_norm": 0.298828125,
"learning_rate": 0.00014801806933519048,
"loss": 0.67,
"step": 1155
},
{
"epoch": 0.34341637010676157,
"grad_norm": 0.294921875,
"learning_rate": 0.0001477719818512263,
"loss": 0.6683,
"step": 1158
},
{
"epoch": 0.34430604982206403,
"grad_norm": 0.326171875,
"learning_rate": 0.00014752551894518272,
"loss": 0.6468,
"step": 1161
},
{
"epoch": 0.34519572953736655,
"grad_norm": 0.3046875,
"learning_rate": 0.00014727868255391924,
"loss": 0.6664,
"step": 1164
},
{
"epoch": 0.34608540925266906,
"grad_norm": 0.314453125,
"learning_rate": 0.00014703147461723041,
"loss": 0.6905,
"step": 1167
},
{
"epoch": 0.3469750889679715,
"grad_norm": 0.296875,
"learning_rate": 0.00014678389707783071,
"loss": 0.6777,
"step": 1170
},
{
"epoch": 0.34786476868327404,
"grad_norm": 0.27734375,
"learning_rate": 0.00014653595188133904,
"loss": 0.656,
"step": 1173
},
{
"epoch": 0.3487544483985765,
"grad_norm": 0.294921875,
"learning_rate": 0.0001462876409762637,
"loss": 0.6772,
"step": 1176
},
{
"epoch": 0.349644128113879,
"grad_norm": 0.2890625,
"learning_rate": 0.00014603896631398692,
"loss": 0.667,
"step": 1179
},
{
"epoch": 0.3505338078291815,
"grad_norm": 0.296875,
"learning_rate": 0.00014578992984874955,
"loss": 0.6635,
"step": 1182
},
{
"epoch": 0.351423487544484,
"grad_norm": 0.3046875,
"learning_rate": 0.00014554053353763575,
"loss": 0.6726,
"step": 1185
},
{
"epoch": 0.35231316725978645,
"grad_norm": 0.298828125,
"learning_rate": 0.00014529077934055752,
"loss": 0.67,
"step": 1188
},
{
"epoch": 0.35320284697508897,
"grad_norm": 0.279296875,
"learning_rate": 0.00014504066922023934,
"loss": 0.6716,
"step": 1191
},
{
"epoch": 0.3540925266903915,
"grad_norm": 0.3046875,
"learning_rate": 0.00014479020514220284,
"loss": 0.6786,
"step": 1194
},
{
"epoch": 0.35498220640569395,
"grad_norm": 0.298828125,
"learning_rate": 0.00014453938907475124,
"loss": 0.6636,
"step": 1197
},
{
"epoch": 0.35587188612099646,
"grad_norm": 0.30859375,
"learning_rate": 0.00014428822298895387,
"loss": 0.6316,
"step": 1200
},
{
"epoch": 0.3567615658362989,
"grad_norm": 0.3046875,
"learning_rate": 0.00014403670885863073,
"loss": 0.6849,
"step": 1203
},
{
"epoch": 0.35765124555160144,
"grad_norm": 0.3046875,
"learning_rate": 0.00014378484866033704,
"loss": 0.6701,
"step": 1206
},
{
"epoch": 0.3585409252669039,
"grad_norm": 0.2890625,
"learning_rate": 0.00014353264437334758,
"loss": 0.629,
"step": 1209
},
{
"epoch": 0.3594306049822064,
"grad_norm": 0.310546875,
"learning_rate": 0.00014328009797964113,
"loss": 0.6799,
"step": 1212
},
{
"epoch": 0.3603202846975089,
"grad_norm": 0.291015625,
"learning_rate": 0.00014302721146388514,
"loss": 0.6566,
"step": 1215
},
{
"epoch": 0.3612099644128114,
"grad_norm": 0.298828125,
"learning_rate": 0.00014277398681341983,
"loss": 0.6607,
"step": 1218
},
{
"epoch": 0.3620996441281139,
"grad_norm": 0.28125,
"learning_rate": 0.0001425204260182426,
"loss": 0.6603,
"step": 1221
},
{
"epoch": 0.36298932384341637,
"grad_norm": 0.28125,
"learning_rate": 0.00014226653107099273,
"loss": 0.6862,
"step": 1224
},
{
"epoch": 0.3638790035587189,
"grad_norm": 0.2890625,
"learning_rate": 0.00014201230396693526,
"loss": 0.6771,
"step": 1227
},
{
"epoch": 0.36476868327402134,
"grad_norm": 0.30859375,
"learning_rate": 0.00014175774670394562,
"loss": 0.6682,
"step": 1230
},
{
"epoch": 0.36565836298932386,
"grad_norm": 0.291015625,
"learning_rate": 0.0001415028612824938,
"loss": 0.664,
"step": 1233
},
{
"epoch": 0.3665480427046263,
"grad_norm": 0.3046875,
"learning_rate": 0.00014124764970562873,
"loss": 0.6675,
"step": 1236
},
{
"epoch": 0.36743772241992884,
"grad_norm": 0.2890625,
"learning_rate": 0.0001409921139789624,
"loss": 0.6531,
"step": 1239
},
{
"epoch": 0.3683274021352313,
"grad_norm": 0.283203125,
"learning_rate": 0.00014073625611065423,
"loss": 0.6503,
"step": 1242
},
{
"epoch": 0.3692170818505338,
"grad_norm": 0.30078125,
"learning_rate": 0.00014048007811139513,
"loss": 0.6663,
"step": 1245
},
{
"epoch": 0.3701067615658363,
"grad_norm": 0.29296875,
"learning_rate": 0.00014022358199439192,
"loss": 0.6692,
"step": 1248
},
{
"epoch": 0.3709964412811388,
"grad_norm": 0.283203125,
"learning_rate": 0.0001399667697753513,
"loss": 0.636,
"step": 1251
},
{
"epoch": 0.3718861209964413,
"grad_norm": 0.302734375,
"learning_rate": 0.00013970964347246418,
"loss": 0.6724,
"step": 1254
},
{
"epoch": 0.37277580071174377,
"grad_norm": 0.2890625,
"learning_rate": 0.0001394522051063897,
"loss": 0.6496,
"step": 1257
},
{
"epoch": 0.3736654804270463,
"grad_norm": 0.28515625,
"learning_rate": 0.00013919445670023932,
"loss": 0.681,
"step": 1260
},
{
"epoch": 0.37455516014234874,
"grad_norm": 0.291015625,
"learning_rate": 0.00013893640027956106,
"loss": 0.665,
"step": 1263
},
{
"epoch": 0.37544483985765126,
"grad_norm": 0.287109375,
"learning_rate": 0.00013867803787232348,
"loss": 0.6562,
"step": 1266
},
{
"epoch": 0.3763345195729537,
"grad_norm": 0.287109375,
"learning_rate": 0.0001384193715088999,
"loss": 0.676,
"step": 1269
},
{
"epoch": 0.37722419928825623,
"grad_norm": 0.291015625,
"learning_rate": 0.00013816040322205207,
"loss": 0.6744,
"step": 1272
},
{
"epoch": 0.3781138790035587,
"grad_norm": 0.291015625,
"learning_rate": 0.00013790113504691463,
"loss": 0.661,
"step": 1275
},
{
"epoch": 0.3790035587188612,
"grad_norm": 0.2890625,
"learning_rate": 0.00013764156902097891,
"loss": 0.6443,
"step": 1278
},
{
"epoch": 0.3798932384341637,
"grad_norm": 0.28515625,
"learning_rate": 0.00013738170718407687,
"loss": 0.6613,
"step": 1281
},
{
"epoch": 0.3807829181494662,
"grad_norm": 0.279296875,
"learning_rate": 0.0001371215515783652,
"loss": 0.6629,
"step": 1284
},
{
"epoch": 0.3816725978647687,
"grad_norm": 0.29296875,
"learning_rate": 0.00013686110424830923,
"loss": 0.6735,
"step": 1287
},
{
"epoch": 0.38256227758007116,
"grad_norm": 0.294921875,
"learning_rate": 0.00013660036724066668,
"loss": 0.6597,
"step": 1290
},
{
"epoch": 0.3834519572953737,
"grad_norm": 0.29296875,
"learning_rate": 0.00013633934260447192,
"loss": 0.6712,
"step": 1293
},
{
"epoch": 0.38434163701067614,
"grad_norm": 0.275390625,
"learning_rate": 0.00013607803239101964,
"loss": 0.6494,
"step": 1296
},
{
"epoch": 0.38523131672597866,
"grad_norm": 0.28515625,
"learning_rate": 0.00013581643865384873,
"loss": 0.6474,
"step": 1299
},
{
"epoch": 0.3861209964412811,
"grad_norm": 0.26953125,
"learning_rate": 0.0001355545634487262,
"loss": 0.6386,
"step": 1302
},
{
"epoch": 0.38701067615658363,
"grad_norm": 0.27734375,
"learning_rate": 0.000135292408833631,
"loss": 0.6541,
"step": 1305
},
{
"epoch": 0.3879003558718861,
"grad_norm": 0.298828125,
"learning_rate": 0.00013502997686873797,
"loss": 0.6444,
"step": 1308
},
{
"epoch": 0.3887900355871886,
"grad_norm": 0.287109375,
"learning_rate": 0.00013476726961640133,
"loss": 0.6295,
"step": 1311
},
{
"epoch": 0.3896797153024911,
"grad_norm": 0.30859375,
"learning_rate": 0.0001345042891411389,
"loss": 0.6551,
"step": 1314
},
{
"epoch": 0.3905693950177936,
"grad_norm": 0.30078125,
"learning_rate": 0.0001342410375096155,
"loss": 0.6757,
"step": 1317
},
{
"epoch": 0.3914590747330961,
"grad_norm": 0.28515625,
"learning_rate": 0.00013397751679062692,
"loss": 0.6516,
"step": 1320
},
{
"epoch": 0.39234875444839856,
"grad_norm": 0.27734375,
"learning_rate": 0.00013371372905508362,
"loss": 0.6425,
"step": 1323
},
{
"epoch": 0.3932384341637011,
"grad_norm": 0.27734375,
"learning_rate": 0.00013344967637599444,
"loss": 0.642,
"step": 1326
},
{
"epoch": 0.39412811387900354,
"grad_norm": 0.283203125,
"learning_rate": 0.00013318536082845026,
"loss": 0.6309,
"step": 1329
},
{
"epoch": 0.39501779359430605,
"grad_norm": 0.283203125,
"learning_rate": 0.0001329207844896078,
"loss": 0.6477,
"step": 1332
},
{
"epoch": 0.3959074733096085,
"grad_norm": 0.283203125,
"learning_rate": 0.00013265594943867327,
"loss": 0.6476,
"step": 1335
},
{
"epoch": 0.39679715302491103,
"grad_norm": 0.2890625,
"learning_rate": 0.00013239085775688592,
"loss": 0.646,
"step": 1338
},
{
"epoch": 0.39768683274021355,
"grad_norm": 0.30078125,
"learning_rate": 0.00013212551152750178,
"loss": 0.6614,
"step": 1341
},
{
"epoch": 0.398576512455516,
"grad_norm": 0.28125,
"learning_rate": 0.00013185991283577738,
"loss": 0.6477,
"step": 1344
},
{
"epoch": 0.3994661921708185,
"grad_norm": 0.279296875,
"learning_rate": 0.00013159406376895313,
"loss": 0.6417,
"step": 1347
},
{
"epoch": 0.400355871886121,
"grad_norm": 0.287109375,
"learning_rate": 0.00013132796641623703,
"loss": 0.6358,
"step": 1350
},
{
"epoch": 0.4012455516014235,
"grad_norm": 0.283203125,
"learning_rate": 0.00013106162286878842,
"loss": 0.652,
"step": 1353
},
{
"epoch": 0.40213523131672596,
"grad_norm": 0.2734375,
"learning_rate": 0.00013079503521970127,
"loss": 0.6511,
"step": 1356
},
{
"epoch": 0.4030249110320285,
"grad_norm": 0.291015625,
"learning_rate": 0.00013052820556398785,
"loss": 0.6546,
"step": 1359
},
{
"epoch": 0.40391459074733094,
"grad_norm": 0.296875,
"learning_rate": 0.0001302611359985623,
"loss": 0.6564,
"step": 1362
},
{
"epoch": 0.40480427046263345,
"grad_norm": 0.26953125,
"learning_rate": 0.00012999382862222415,
"loss": 0.6609,
"step": 1365
},
{
"epoch": 0.40569395017793597,
"grad_norm": 0.275390625,
"learning_rate": 0.00012972628553564177,
"loss": 0.6508,
"step": 1368
},
{
"epoch": 0.40658362989323843,
"grad_norm": 0.275390625,
"learning_rate": 0.0001294585088413358,
"loss": 0.652,
"step": 1371
},
{
"epoch": 0.40747330960854095,
"grad_norm": 0.279296875,
"learning_rate": 0.00012919050064366295,
"loss": 0.6492,
"step": 1374
},
{
"epoch": 0.4083629893238434,
"grad_norm": 0.267578125,
"learning_rate": 0.00012892226304879893,
"loss": 0.6136,
"step": 1377
},
{
"epoch": 0.4092526690391459,
"grad_norm": 0.279296875,
"learning_rate": 0.00012865379816472242,
"loss": 0.6543,
"step": 1380
},
{
"epoch": 0.4101423487544484,
"grad_norm": 0.26953125,
"learning_rate": 0.0001283851081011982,
"loss": 0.6568,
"step": 1383
},
{
"epoch": 0.4110320284697509,
"grad_norm": 0.28515625,
"learning_rate": 0.00012811619496976066,
"loss": 0.6457,
"step": 1386
},
{
"epoch": 0.41192170818505336,
"grad_norm": 0.27734375,
"learning_rate": 0.00012784706088369714,
"loss": 0.6653,
"step": 1389
},
{
"epoch": 0.4128113879003559,
"grad_norm": 0.275390625,
"learning_rate": 0.0001275777079580315,
"loss": 0.6545,
"step": 1392
},
{
"epoch": 0.41370106761565834,
"grad_norm": 0.2734375,
"learning_rate": 0.00012730813830950732,
"loss": 0.6465,
"step": 1395
},
{
"epoch": 0.41459074733096085,
"grad_norm": 0.28515625,
"learning_rate": 0.00012703835405657122,
"loss": 0.6304,
"step": 1398
},
{
"epoch": 0.41548042704626337,
"grad_norm": 0.33203125,
"learning_rate": 0.0001267683573193565,
"loss": 0.6628,
"step": 1401
},
{
"epoch": 0.41637010676156583,
"grad_norm": 0.283203125,
"learning_rate": 0.0001264981502196662,
"loss": 0.6328,
"step": 1404
},
{
"epoch": 0.41725978647686834,
"grad_norm": 0.298828125,
"learning_rate": 0.00012622773488095643,
"loss": 0.6621,
"step": 1407
},
{
"epoch": 0.4181494661921708,
"grad_norm": 0.287109375,
"learning_rate": 0.00012595711342831996,
"loss": 0.6672,
"step": 1410
},
{
"epoch": 0.4190391459074733,
"grad_norm": 0.283203125,
"learning_rate": 0.00012568628798846924,
"loss": 0.6496,
"step": 1413
},
{
"epoch": 0.4199288256227758,
"grad_norm": 0.2890625,
"learning_rate": 0.00012541526068971973,
"loss": 0.6318,
"step": 1416
},
{
"epoch": 0.4208185053380783,
"grad_norm": 0.283203125,
"learning_rate": 0.0001251440336619733,
"loss": 0.6328,
"step": 1419
},
{
"epoch": 0.42170818505338076,
"grad_norm": 0.27734375,
"learning_rate": 0.00012487260903670135,
"loss": 0.6476,
"step": 1422
},
{
"epoch": 0.4225978647686833,
"grad_norm": 0.27734375,
"learning_rate": 0.00012460098894692822,
"loss": 0.6404,
"step": 1425
},
{
"epoch": 0.4234875444839858,
"grad_norm": 0.279296875,
"learning_rate": 0.0001243291755272142,
"loss": 0.6592,
"step": 1428
},
{
"epoch": 0.42437722419928825,
"grad_norm": 0.2734375,
"learning_rate": 0.000124057170913639,
"loss": 0.6489,
"step": 1431
},
{
"epoch": 0.42526690391459077,
"grad_norm": 0.306640625,
"learning_rate": 0.00012378497724378483,
"loss": 0.6669,
"step": 1434
},
{
"epoch": 0.4261565836298932,
"grad_norm": 0.291015625,
"learning_rate": 0.00012351259665671958,
"loss": 0.6477,
"step": 1437
},
{
"epoch": 0.42704626334519574,
"grad_norm": 0.27734375,
"learning_rate": 0.00012324003129298005,
"loss": 0.6202,
"step": 1440
},
{
"epoch": 0.4279359430604982,
"grad_norm": 0.28515625,
"learning_rate": 0.00012296728329455524,
"loss": 0.6655,
"step": 1443
},
{
"epoch": 0.4288256227758007,
"grad_norm": 0.29296875,
"learning_rate": 0.00012269435480486923,
"loss": 0.6526,
"step": 1446
},
{
"epoch": 0.4297153024911032,
"grad_norm": 0.2890625,
"learning_rate": 0.0001224212479687646,
"loss": 0.6619,
"step": 1449
},
{
"epoch": 0.4306049822064057,
"grad_norm": 0.318359375,
"learning_rate": 0.00012214796493248563,
"loss": 0.6718,
"step": 1452
},
{
"epoch": 0.4314946619217082,
"grad_norm": 0.302734375,
"learning_rate": 0.00012187450784366101,
"loss": 0.6414,
"step": 1455
},
{
"epoch": 0.43238434163701067,
"grad_norm": 0.447265625,
"learning_rate": 0.00012160087885128745,
"loss": 0.6567,
"step": 1458
},
{
"epoch": 0.4332740213523132,
"grad_norm": 0.28515625,
"learning_rate": 0.00012132708010571252,
"loss": 0.6345,
"step": 1461
},
{
"epoch": 0.43416370106761565,
"grad_norm": 0.302734375,
"learning_rate": 0.00012105311375861785,
"loss": 0.6611,
"step": 1464
},
{
"epoch": 0.43505338078291816,
"grad_norm": 0.279296875,
"learning_rate": 0.00012077898196300208,
"loss": 0.652,
"step": 1467
},
{
"epoch": 0.4359430604982206,
"grad_norm": 0.28125,
"learning_rate": 0.00012050468687316419,
"loss": 0.6373,
"step": 1470
},
{
"epoch": 0.43683274021352314,
"grad_norm": 0.2890625,
"learning_rate": 0.00012023023064468637,
"loss": 0.6323,
"step": 1473
},
{
"epoch": 0.4377224199288256,
"grad_norm": 0.283203125,
"learning_rate": 0.00011995561543441709,
"loss": 0.6321,
"step": 1476
},
{
"epoch": 0.4386120996441281,
"grad_norm": 0.27734375,
"learning_rate": 0.00011968084340045425,
"loss": 0.6388,
"step": 1479
},
{
"epoch": 0.4395017793594306,
"grad_norm": 0.275390625,
"learning_rate": 0.0001194059167021282,
"loss": 0.653,
"step": 1482
},
{
"epoch": 0.4403914590747331,
"grad_norm": 0.3046875,
"learning_rate": 0.00011913083749998464,
"loss": 0.6604,
"step": 1485
},
{
"epoch": 0.4412811387900356,
"grad_norm": 0.2890625,
"learning_rate": 0.00011885560795576783,
"loss": 0.662,
"step": 1488
},
{
"epoch": 0.44217081850533807,
"grad_norm": 0.3046875,
"learning_rate": 0.0001185802302324035,
"loss": 0.6724,
"step": 1491
},
{
"epoch": 0.4430604982206406,
"grad_norm": 0.310546875,
"learning_rate": 0.00011830470649398182,
"loss": 0.6296,
"step": 1494
},
{
"epoch": 0.44395017793594305,
"grad_norm": 0.27734375,
"learning_rate": 0.00011802903890574046,
"loss": 0.6475,
"step": 1497
},
{
"epoch": 0.44483985765124556,
"grad_norm": 0.27734375,
"learning_rate": 0.00011775322963404756,
"loss": 0.6665,
"step": 1500
},
{
"epoch": 0.445729537366548,
"grad_norm": 0.279296875,
"learning_rate": 0.0001174772808463847,
"loss": 0.6361,
"step": 1503
},
{
"epoch": 0.44661921708185054,
"grad_norm": 0.259765625,
"learning_rate": 0.00011720119471132984,
"loss": 0.6302,
"step": 1506
},
{
"epoch": 0.447508896797153,
"grad_norm": 0.287109375,
"learning_rate": 0.00011692497339854031,
"loss": 0.6552,
"step": 1509
},
{
"epoch": 0.4483985765124555,
"grad_norm": 0.2734375,
"learning_rate": 0.00011664861907873584,
"loss": 0.6436,
"step": 1512
},
{
"epoch": 0.44928825622775803,
"grad_norm": 0.27734375,
"learning_rate": 0.00011637213392368118,
"loss": 0.6381,
"step": 1515
},
{
"epoch": 0.4501779359430605,
"grad_norm": 0.275390625,
"learning_rate": 0.00011609552010616954,
"loss": 0.659,
"step": 1518
},
{
"epoch": 0.451067615658363,
"grad_norm": 0.259765625,
"learning_rate": 0.00011581877980000508,
"loss": 0.6412,
"step": 1521
},
{
"epoch": 0.45195729537366547,
"grad_norm": 0.26953125,
"learning_rate": 0.00011554191517998598,
"loss": 0.637,
"step": 1524
},
{
"epoch": 0.452846975088968,
"grad_norm": 0.376953125,
"learning_rate": 0.00011526492842188745,
"loss": 0.64,
"step": 1527
},
{
"epoch": 0.45373665480427045,
"grad_norm": 0.27734375,
"learning_rate": 0.00011498782170244449,
"loss": 0.6525,
"step": 1530
},
{
"epoch": 0.45462633451957296,
"grad_norm": 0.267578125,
"learning_rate": 0.00011471059719933479,
"loss": 0.6248,
"step": 1533
},
{
"epoch": 0.4555160142348754,
"grad_norm": 0.375,
"learning_rate": 0.00011443325709116171,
"loss": 0.6334,
"step": 1536
},
{
"epoch": 0.45640569395017794,
"grad_norm": 0.2734375,
"learning_rate": 0.00011415580355743707,
"loss": 0.6381,
"step": 1539
},
{
"epoch": 0.45729537366548045,
"grad_norm": 0.26953125,
"learning_rate": 0.00011387823877856411,
"loss": 0.6367,
"step": 1542
},
{
"epoch": 0.4581850533807829,
"grad_norm": 0.267578125,
"learning_rate": 0.00011360056493582028,
"loss": 0.6443,
"step": 1545
},
{
"epoch": 0.45907473309608543,
"grad_norm": 0.27734375,
"learning_rate": 0.00011332278421134005,
"loss": 0.6342,
"step": 1548
},
{
"epoch": 0.4599644128113879,
"grad_norm": 0.27734375,
"learning_rate": 0.00011304489878809803,
"loss": 0.6385,
"step": 1551
},
{
"epoch": 0.4608540925266904,
"grad_norm": 0.2734375,
"learning_rate": 0.00011276691084989134,
"loss": 0.6102,
"step": 1554
},
{
"epoch": 0.46174377224199287,
"grad_norm": 0.279296875,
"learning_rate": 0.00011248882258132299,
"loss": 0.6285,
"step": 1557
},
{
"epoch": 0.4626334519572954,
"grad_norm": 0.267578125,
"learning_rate": 0.00011221063616778425,
"loss": 0.6504,
"step": 1560
},
{
"epoch": 0.46352313167259784,
"grad_norm": 0.27734375,
"learning_rate": 0.00011193235379543778,
"loss": 0.647,
"step": 1563
},
{
"epoch": 0.46441281138790036,
"grad_norm": 0.279296875,
"learning_rate": 0.00011165397765120033,
"loss": 0.645,
"step": 1566
},
{
"epoch": 0.4653024911032028,
"grad_norm": 0.30859375,
"learning_rate": 0.00011137550992272561,
"loss": 0.6311,
"step": 1569
},
{
"epoch": 0.46619217081850534,
"grad_norm": 0.28125,
"learning_rate": 0.0001110969527983869,
"loss": 0.6348,
"step": 1572
},
{
"epoch": 0.46708185053380785,
"grad_norm": 0.28515625,
"learning_rate": 0.00011081830846726021,
"loss": 0.6244,
"step": 1575
},
{
"epoch": 0.4679715302491103,
"grad_norm": 0.291015625,
"learning_rate": 0.00011053957911910671,
"loss": 0.6336,
"step": 1578
},
{
"epoch": 0.46886120996441283,
"grad_norm": 0.291015625,
"learning_rate": 0.0001102607669443558,
"loss": 0.6211,
"step": 1581
},
{
"epoch": 0.4697508896797153,
"grad_norm": 0.283203125,
"learning_rate": 0.00010998187413408774,
"loss": 0.6337,
"step": 1584
},
{
"epoch": 0.4706405693950178,
"grad_norm": 0.267578125,
"learning_rate": 0.00010970290288001644,
"loss": 0.6451,
"step": 1587
},
{
"epoch": 0.47153024911032027,
"grad_norm": 0.275390625,
"learning_rate": 0.00010942385537447236,
"loss": 0.6466,
"step": 1590
},
{
"epoch": 0.4724199288256228,
"grad_norm": 0.28125,
"learning_rate": 0.00010914473381038508,
"loss": 0.6324,
"step": 1593
},
{
"epoch": 0.47330960854092524,
"grad_norm": 0.279296875,
"learning_rate": 0.00010886554038126625,
"loss": 0.615,
"step": 1596
},
{
"epoch": 0.47419928825622776,
"grad_norm": 0.26171875,
"learning_rate": 0.00010858627728119226,
"loss": 0.6385,
"step": 1599
},
{
"epoch": 0.4750889679715303,
"grad_norm": 0.265625,
"learning_rate": 0.00010830694670478705,
"loss": 0.6253,
"step": 1602
},
{
"epoch": 0.47597864768683273,
"grad_norm": 0.267578125,
"learning_rate": 0.00010802755084720479,
"loss": 0.6188,
"step": 1605
},
{
"epoch": 0.47686832740213525,
"grad_norm": 0.2890625,
"learning_rate": 0.0001077480919041127,
"loss": 0.6179,
"step": 1608
},
{
"epoch": 0.4777580071174377,
"grad_norm": 0.271484375,
"learning_rate": 0.00010746857207167372,
"loss": 0.6427,
"step": 1611
},
{
"epoch": 0.4786476868327402,
"grad_norm": 0.271484375,
"learning_rate": 0.00010718899354652931,
"loss": 0.6096,
"step": 1614
},
{
"epoch": 0.4795373665480427,
"grad_norm": 0.48046875,
"learning_rate": 0.00010690935852578225,
"loss": 0.648,
"step": 1617
},
{
"epoch": 0.4804270462633452,
"grad_norm": 0.287109375,
"learning_rate": 0.00010662966920697919,
"loss": 0.619,
"step": 1620
},
{
"epoch": 0.48131672597864766,
"grad_norm": 0.271484375,
"learning_rate": 0.00010634992778809357,
"loss": 0.6368,
"step": 1623
},
{
"epoch": 0.4822064056939502,
"grad_norm": 0.279296875,
"learning_rate": 0.00010607013646750818,
"loss": 0.623,
"step": 1626
},
{
"epoch": 0.4830960854092527,
"grad_norm": 0.2734375,
"learning_rate": 0.00010579029744399809,
"loss": 0.6212,
"step": 1629
},
{
"epoch": 0.48398576512455516,
"grad_norm": 0.2734375,
"learning_rate": 0.00010551041291671311,
"loss": 0.6343,
"step": 1632
},
{
"epoch": 0.48487544483985767,
"grad_norm": 0.275390625,
"learning_rate": 0.00010523048508516075,
"loss": 0.634,
"step": 1635
},
{
"epoch": 0.48576512455516013,
"grad_norm": 0.275390625,
"learning_rate": 0.00010495051614918881,
"loss": 0.5907,
"step": 1638
},
{
"epoch": 0.48665480427046265,
"grad_norm": 0.267578125,
"learning_rate": 0.00010467050830896808,
"loss": 0.6122,
"step": 1641
},
{
"epoch": 0.4875444839857651,
"grad_norm": 0.2734375,
"learning_rate": 0.0001043904637649751,
"loss": 0.6175,
"step": 1644
},
{
"epoch": 0.4884341637010676,
"grad_norm": 0.27734375,
"learning_rate": 0.00010411038471797488,
"loss": 0.6488,
"step": 1647
},
{
"epoch": 0.4893238434163701,
"grad_norm": 0.29296875,
"learning_rate": 0.00010383027336900355,
"loss": 0.6245,
"step": 1650
},
{
"epoch": 0.4902135231316726,
"grad_norm": 0.265625,
"learning_rate": 0.00010355013191935108,
"loss": 0.6203,
"step": 1653
},
{
"epoch": 0.49110320284697506,
"grad_norm": 0.265625,
"learning_rate": 0.000103269962570544,
"loss": 0.6193,
"step": 1656
},
{
"epoch": 0.4919928825622776,
"grad_norm": 0.26953125,
"learning_rate": 0.00010298976752432812,
"loss": 0.6403,
"step": 1659
},
{
"epoch": 0.4928825622775801,
"grad_norm": 0.267578125,
"learning_rate": 0.00010270954898265112,
"loss": 0.6292,
"step": 1662
},
{
"epoch": 0.49377224199288255,
"grad_norm": 0.2890625,
"learning_rate": 0.00010242930914764541,
"loss": 0.6404,
"step": 1665
},
{
"epoch": 0.49466192170818507,
"grad_norm": 0.28125,
"learning_rate": 0.0001021490502216107,
"loss": 0.6359,
"step": 1668
},
{
"epoch": 0.49555160142348753,
"grad_norm": 0.27734375,
"learning_rate": 0.0001018687744069967,
"loss": 0.6442,
"step": 1671
},
{
"epoch": 0.49644128113879005,
"grad_norm": 0.287109375,
"learning_rate": 0.00010158848390638587,
"loss": 0.6515,
"step": 1674
},
{
"epoch": 0.4973309608540925,
"grad_norm": 0.27734375,
"learning_rate": 0.00010130818092247607,
"loss": 0.6115,
"step": 1677
},
{
"epoch": 0.498220640569395,
"grad_norm": 0.283203125,
"learning_rate": 0.0001010278676580633,
"loss": 0.6434,
"step": 1680
},
{
"epoch": 0.4991103202846975,
"grad_norm": 0.271484375,
"learning_rate": 0.00010074754631602428,
"loss": 0.6234,
"step": 1683
},
{
"epoch": 0.5,
"grad_norm": 0.29296875,
"learning_rate": 0.00010046721909929928,
"loss": 0.6571,
"step": 1686
},
{
"epoch": 0.5008896797153025,
"grad_norm": 0.279296875,
"learning_rate": 0.00010018688821087474,
"loss": 0.6454,
"step": 1689
},
{
"epoch": 0.501779359430605,
"grad_norm": 0.275390625,
"learning_rate": 9.990655585376584e-05,
"loss": 0.5973,
"step": 1692
},
{
"epoch": 0.5026690391459074,
"grad_norm": 0.271484375,
"learning_rate": 9.962622423099942e-05,
"loss": 0.6094,
"step": 1695
},
{
"epoch": 0.50355871886121,
"grad_norm": 0.28125,
"learning_rate": 9.934589554559653e-05,
"loss": 0.642,
"step": 1698
},
{
"epoch": 0.5044483985765125,
"grad_norm": 0.2578125,
"learning_rate": 9.906557200055508e-05,
"loss": 0.6283,
"step": 1701
},
{
"epoch": 0.505338078291815,
"grad_norm": 0.27734375,
"learning_rate": 9.878525579883265e-05,
"loss": 0.6277,
"step": 1704
},
{
"epoch": 0.5062277580071174,
"grad_norm": 0.279296875,
"learning_rate": 9.850494914332908e-05,
"loss": 0.627,
"step": 1707
},
{
"epoch": 0.5071174377224199,
"grad_norm": 0.2890625,
"learning_rate": 9.822465423686917e-05,
"loss": 0.6435,
"step": 1710
},
{
"epoch": 0.5080071174377224,
"grad_norm": 0.291015625,
"learning_rate": 9.794437328218546e-05,
"loss": 0.6537,
"step": 1713
},
{
"epoch": 0.5088967971530249,
"grad_norm": 0.287109375,
"learning_rate": 9.766410848190077e-05,
"loss": 0.6366,
"step": 1716
},
{
"epoch": 0.5097864768683275,
"grad_norm": 0.265625,
"learning_rate": 9.738386203851101e-05,
"loss": 0.625,
"step": 1719
},
{
"epoch": 0.5106761565836299,
"grad_norm": 0.263671875,
"learning_rate": 9.710363615436776e-05,
"loss": 0.6372,
"step": 1722
},
{
"epoch": 0.5115658362989324,
"grad_norm": 0.294921875,
"learning_rate": 9.682343303166117e-05,
"loss": 0.6188,
"step": 1725
},
{
"epoch": 0.5124555160142349,
"grad_norm": 0.271484375,
"learning_rate": 9.654325487240243e-05,
"loss": 0.6223,
"step": 1728
},
{
"epoch": 0.5133451957295374,
"grad_norm": 0.263671875,
"learning_rate": 9.626310387840648e-05,
"loss": 0.6096,
"step": 1731
},
{
"epoch": 0.5142348754448398,
"grad_norm": 0.2578125,
"learning_rate": 9.598298225127498e-05,
"loss": 0.615,
"step": 1734
},
{
"epoch": 0.5151245551601423,
"grad_norm": 0.2734375,
"learning_rate": 9.570289219237858e-05,
"loss": 0.6377,
"step": 1737
},
{
"epoch": 0.5160142348754448,
"grad_norm": 0.26953125,
"learning_rate": 9.542283590284002e-05,
"loss": 0.6191,
"step": 1740
},
{
"epoch": 0.5169039145907474,
"grad_norm": 0.2734375,
"learning_rate": 9.514281558351653e-05,
"loss": 0.625,
"step": 1743
},
{
"epoch": 0.5177935943060499,
"grad_norm": 0.283203125,
"learning_rate": 9.486283343498277e-05,
"loss": 0.6202,
"step": 1746
},
{
"epoch": 0.5186832740213523,
"grad_norm": 0.2734375,
"learning_rate": 9.458289165751339e-05,
"loss": 0.613,
"step": 1749
},
{
"epoch": 0.5195729537366548,
"grad_norm": 0.28125,
"learning_rate": 9.430299245106573e-05,
"loss": 0.6306,
"step": 1752
},
{
"epoch": 0.5204626334519573,
"grad_norm": 0.255859375,
"learning_rate": 9.402313801526267e-05,
"loss": 0.6162,
"step": 1755
},
{
"epoch": 0.5213523131672598,
"grad_norm": 0.279296875,
"learning_rate": 9.37433305493752e-05,
"loss": 0.6472,
"step": 1758
},
{
"epoch": 0.5222419928825622,
"grad_norm": 0.265625,
"learning_rate": 9.346357225230519e-05,
"loss": 0.6347,
"step": 1761
},
{
"epoch": 0.5231316725978647,
"grad_norm": 0.271484375,
"learning_rate": 9.318386532256807e-05,
"loss": 0.6155,
"step": 1764
},
{
"epoch": 0.5240213523131673,
"grad_norm": 0.265625,
"learning_rate": 9.290421195827572e-05,
"loss": 0.6112,
"step": 1767
},
{
"epoch": 0.5249110320284698,
"grad_norm": 0.275390625,
"learning_rate": 9.262461435711898e-05,
"loss": 0.6252,
"step": 1770
},
{
"epoch": 0.5258007117437722,
"grad_norm": 0.283203125,
"learning_rate": 9.234507471635043e-05,
"loss": 0.6241,
"step": 1773
},
{
"epoch": 0.5266903914590747,
"grad_norm": 0.291015625,
"learning_rate": 9.206559523276731e-05,
"loss": 0.6346,
"step": 1776
},
{
"epoch": 0.5275800711743772,
"grad_norm": 0.28125,
"learning_rate": 9.178617810269388e-05,
"loss": 0.6489,
"step": 1779
},
{
"epoch": 0.5284697508896797,
"grad_norm": 0.263671875,
"learning_rate": 9.150682552196462e-05,
"loss": 0.6247,
"step": 1782
},
{
"epoch": 0.5293594306049823,
"grad_norm": 0.259765625,
"learning_rate": 9.12275396859066e-05,
"loss": 0.5991,
"step": 1785
},
{
"epoch": 0.5302491103202847,
"grad_norm": 0.26953125,
"learning_rate": 9.094832278932238e-05,
"loss": 0.6174,
"step": 1788
},
{
"epoch": 0.5311387900355872,
"grad_norm": 0.27734375,
"learning_rate": 9.066917702647284e-05,
"loss": 0.6204,
"step": 1791
},
{
"epoch": 0.5320284697508897,
"grad_norm": 0.26953125,
"learning_rate": 9.039010459105974e-05,
"loss": 0.584,
"step": 1794
},
{
"epoch": 0.5329181494661922,
"grad_norm": 0.27734375,
"learning_rate": 9.011110767620865e-05,
"loss": 0.6179,
"step": 1797
},
{
"epoch": 0.5338078291814946,
"grad_norm": 0.267578125,
"learning_rate": 8.983218847445157e-05,
"loss": 0.5993,
"step": 1800
},
{
"epoch": 0.5346975088967971,
"grad_norm": 0.271484375,
"learning_rate": 8.955334917770993e-05,
"loss": 0.6324,
"step": 1803
},
{
"epoch": 0.5355871886120996,
"grad_norm": 0.265625,
"learning_rate": 8.927459197727712e-05,
"loss": 0.6048,
"step": 1806
},
{
"epoch": 0.5364768683274022,
"grad_norm": 0.279296875,
"learning_rate": 8.899591906380131e-05,
"loss": 0.6138,
"step": 1809
},
{
"epoch": 0.5373665480427047,
"grad_norm": 0.271484375,
"learning_rate": 8.871733262726846e-05,
"loss": 0.626,
"step": 1812
},
{
"epoch": 0.5382562277580071,
"grad_norm": 0.275390625,
"learning_rate": 8.843883485698474e-05,
"loss": 0.6169,
"step": 1815
},
{
"epoch": 0.5391459074733096,
"grad_norm": 0.265625,
"learning_rate": 8.81604279415597e-05,
"loss": 0.6043,
"step": 1818
},
{
"epoch": 0.5400355871886121,
"grad_norm": 0.2734375,
"learning_rate": 8.788211406888872e-05,
"loss": 0.6216,
"step": 1821
},
{
"epoch": 0.5409252669039146,
"grad_norm": 0.283203125,
"learning_rate": 8.76038954261362e-05,
"loss": 0.6365,
"step": 1824
},
{
"epoch": 0.541814946619217,
"grad_norm": 0.271484375,
"learning_rate": 8.732577419971801e-05,
"loss": 0.6226,
"step": 1827
},
{
"epoch": 0.5427046263345195,
"grad_norm": 0.265625,
"learning_rate": 8.704775257528448e-05,
"loss": 0.6179,
"step": 1830
},
{
"epoch": 0.5435943060498221,
"grad_norm": 0.27734375,
"learning_rate": 8.676983273770327e-05,
"loss": 0.6191,
"step": 1833
},
{
"epoch": 0.5444839857651246,
"grad_norm": 0.2890625,
"learning_rate": 8.649201687104209e-05,
"loss": 0.6243,
"step": 1836
},
{
"epoch": 0.5453736654804271,
"grad_norm": 0.271484375,
"learning_rate": 8.621430715855155e-05,
"loss": 0.628,
"step": 1839
},
{
"epoch": 0.5462633451957295,
"grad_norm": 0.263671875,
"learning_rate": 8.593670578264814e-05,
"loss": 0.6108,
"step": 1842
},
{
"epoch": 0.547153024911032,
"grad_norm": 0.259765625,
"learning_rate": 8.565921492489686e-05,
"loss": 0.609,
"step": 1845
},
{
"epoch": 0.5480427046263345,
"grad_norm": 0.26953125,
"learning_rate": 8.538183676599426e-05,
"loss": 0.6135,
"step": 1848
},
{
"epoch": 0.548932384341637,
"grad_norm": 0.26953125,
"learning_rate": 8.510457348575115e-05,
"loss": 0.623,
"step": 1851
},
{
"epoch": 0.5498220640569395,
"grad_norm": 0.28515625,
"learning_rate": 8.482742726307569e-05,
"loss": 0.6345,
"step": 1854
},
{
"epoch": 0.550711743772242,
"grad_norm": 0.271484375,
"learning_rate": 8.4550400275956e-05,
"loss": 0.6167,
"step": 1857
},
{
"epoch": 0.5516014234875445,
"grad_norm": 0.287109375,
"learning_rate": 8.427349470144319e-05,
"loss": 0.6199,
"step": 1860
},
{
"epoch": 0.552491103202847,
"grad_norm": 0.267578125,
"learning_rate": 8.399671271563438e-05,
"loss": 0.589,
"step": 1863
},
{
"epoch": 0.5533807829181495,
"grad_norm": 0.2734375,
"learning_rate": 8.372005649365519e-05,
"loss": 0.6183,
"step": 1866
},
{
"epoch": 0.5542704626334519,
"grad_norm": 0.2734375,
"learning_rate": 8.344352820964317e-05,
"loss": 0.6555,
"step": 1869
},
{
"epoch": 0.5551601423487544,
"grad_norm": 0.26953125,
"learning_rate": 8.316713003673028e-05,
"loss": 0.6172,
"step": 1872
},
{
"epoch": 0.556049822064057,
"grad_norm": 0.26953125,
"learning_rate": 8.289086414702609e-05,
"loss": 0.6328,
"step": 1875
},
{
"epoch": 0.5569395017793595,
"grad_norm": 0.27734375,
"learning_rate": 8.261473271160046e-05,
"loss": 0.6314,
"step": 1878
},
{
"epoch": 0.5578291814946619,
"grad_norm": 0.26953125,
"learning_rate": 8.233873790046684e-05,
"loss": 0.6392,
"step": 1881
},
{
"epoch": 0.5587188612099644,
"grad_norm": 0.2734375,
"learning_rate": 8.206288188256486e-05,
"loss": 0.6297,
"step": 1884
},
{
"epoch": 0.5596085409252669,
"grad_norm": 0.26953125,
"learning_rate": 8.178716682574339e-05,
"loss": 0.5915,
"step": 1887
},
{
"epoch": 0.5604982206405694,
"grad_norm": 0.279296875,
"learning_rate": 8.15115948967437e-05,
"loss": 0.625,
"step": 1890
},
{
"epoch": 0.5613879003558719,
"grad_norm": 0.279296875,
"learning_rate": 8.12361682611821e-05,
"loss": 0.6307,
"step": 1893
},
{
"epoch": 0.5622775800711743,
"grad_norm": 0.26953125,
"learning_rate": 8.096088908353315e-05,
"loss": 0.6018,
"step": 1896
},
{
"epoch": 0.5631672597864769,
"grad_norm": 0.287109375,
"learning_rate": 8.068575952711272e-05,
"loss": 0.6045,
"step": 1899
},
{
"epoch": 0.5640569395017794,
"grad_norm": 0.271484375,
"learning_rate": 8.041078175406064e-05,
"loss": 0.5921,
"step": 1902
},
{
"epoch": 0.5649466192170819,
"grad_norm": 0.271484375,
"learning_rate": 8.013595792532412e-05,
"loss": 0.6339,
"step": 1905
},
{
"epoch": 0.5658362989323843,
"grad_norm": 0.2734375,
"learning_rate": 7.986129020064044e-05,
"loss": 0.6147,
"step": 1908
},
{
"epoch": 0.5667259786476868,
"grad_norm": 0.259765625,
"learning_rate": 7.958678073852025e-05,
"loss": 0.5981,
"step": 1911
},
{
"epoch": 0.5676156583629893,
"grad_norm": 0.275390625,
"learning_rate": 7.931243169623037e-05,
"loss": 0.6201,
"step": 1914
},
{
"epoch": 0.5685053380782918,
"grad_norm": 0.263671875,
"learning_rate": 7.903824522977695e-05,
"loss": 0.5997,
"step": 1917
},
{
"epoch": 0.5693950177935944,
"grad_norm": 0.28125,
"learning_rate": 7.876422349388862e-05,
"loss": 0.6216,
"step": 1920
},
{
"epoch": 0.5702846975088968,
"grad_norm": 0.27734375,
"learning_rate": 7.849036864199931e-05,
"loss": 0.6365,
"step": 1923
},
{
"epoch": 0.5711743772241993,
"grad_norm": 0.291015625,
"learning_rate": 7.821668282623158e-05,
"loss": 0.5978,
"step": 1926
},
{
"epoch": 0.5720640569395018,
"grad_norm": 0.26953125,
"learning_rate": 7.79431681973795e-05,
"loss": 0.605,
"step": 1929
},
{
"epoch": 0.5729537366548043,
"grad_norm": 0.26953125,
"learning_rate": 7.766982690489199e-05,
"loss": 0.6106,
"step": 1932
},
{
"epoch": 0.5738434163701067,
"grad_norm": 0.283203125,
"learning_rate": 7.739666109685563e-05,
"loss": 0.6312,
"step": 1935
},
{
"epoch": 0.5747330960854092,
"grad_norm": 0.265625,
"learning_rate": 7.7123672919978e-05,
"loss": 0.6069,
"step": 1938
},
{
"epoch": 0.5756227758007118,
"grad_norm": 0.259765625,
"learning_rate": 7.685086451957084e-05,
"loss": 0.6266,
"step": 1941
},
{
"epoch": 0.5765124555160143,
"grad_norm": 0.275390625,
"learning_rate": 7.657823803953288e-05,
"loss": 0.5883,
"step": 1944
},
{
"epoch": 0.5774021352313167,
"grad_norm": 0.26171875,
"learning_rate": 7.63057956223334e-05,
"loss": 0.6277,
"step": 1947
},
{
"epoch": 0.5782918149466192,
"grad_norm": 0.271484375,
"learning_rate": 7.60335394089951e-05,
"loss": 0.6281,
"step": 1950
},
{
"epoch": 0.5791814946619217,
"grad_norm": 0.26171875,
"learning_rate": 7.576147153907742e-05,
"loss": 0.6225,
"step": 1953
},
{
"epoch": 0.5800711743772242,
"grad_norm": 0.259765625,
"learning_rate": 7.54895941506596e-05,
"loss": 0.6119,
"step": 1956
},
{
"epoch": 0.5809608540925267,
"grad_norm": 0.2890625,
"learning_rate": 7.521790938032408e-05,
"loss": 0.6066,
"step": 1959
},
{
"epoch": 0.5818505338078291,
"grad_norm": 0.267578125,
"learning_rate": 7.494641936313953e-05,
"loss": 0.5946,
"step": 1962
},
{
"epoch": 0.5827402135231317,
"grad_norm": 0.26953125,
"learning_rate": 7.467512623264403e-05,
"loss": 0.6134,
"step": 1965
},
{
"epoch": 0.5836298932384342,
"grad_norm": 0.267578125,
"learning_rate": 7.440403212082862e-05,
"loss": 0.6004,
"step": 1968
},
{
"epoch": 0.5845195729537367,
"grad_norm": 0.283203125,
"learning_rate": 7.41331391581201e-05,
"loss": 0.613,
"step": 1971
},
{
"epoch": 0.5854092526690391,
"grad_norm": 0.2734375,
"learning_rate": 7.386244947336462e-05,
"loss": 0.6162,
"step": 1974
},
{
"epoch": 0.5862989323843416,
"grad_norm": 0.287109375,
"learning_rate": 7.359196519381092e-05,
"loss": 0.6304,
"step": 1977
},
{
"epoch": 0.5871886120996441,
"grad_norm": 0.275390625,
"learning_rate": 7.33216884450934e-05,
"loss": 0.6367,
"step": 1980
},
{
"epoch": 0.5880782918149466,
"grad_norm": 0.26171875,
"learning_rate": 7.305162135121561e-05,
"loss": 0.5892,
"step": 1983
},
{
"epoch": 0.5889679715302492,
"grad_norm": 0.255859375,
"learning_rate": 7.278176603453347e-05,
"loss": 0.6313,
"step": 1986
},
{
"epoch": 0.5898576512455516,
"grad_norm": 0.263671875,
"learning_rate": 7.251212461573873e-05,
"loss": 0.6132,
"step": 1989
},
{
"epoch": 0.5907473309608541,
"grad_norm": 0.267578125,
"learning_rate": 7.224269921384206e-05,
"loss": 0.6081,
"step": 1992
},
{
"epoch": 0.5916370106761566,
"grad_norm": 0.259765625,
"learning_rate": 7.197349194615656e-05,
"loss": 0.5778,
"step": 1995
},
{
"epoch": 0.5925266903914591,
"grad_norm": 0.271484375,
"learning_rate": 7.170450492828125e-05,
"loss": 0.6191,
"step": 1998
},
{
"epoch": 0.5934163701067615,
"grad_norm": 0.27734375,
"learning_rate": 7.143574027408408e-05,
"loss": 0.6019,
"step": 2001
},
{
"epoch": 0.594306049822064,
"grad_norm": 0.28125,
"learning_rate": 7.116720009568564e-05,
"loss": 0.6255,
"step": 2004
},
{
"epoch": 0.5951957295373665,
"grad_norm": 0.265625,
"learning_rate": 7.08988865034424e-05,
"loss": 0.5998,
"step": 2007
},
{
"epoch": 0.5960854092526691,
"grad_norm": 0.259765625,
"learning_rate": 7.063080160593025e-05,
"loss": 0.6059,
"step": 2010
},
{
"epoch": 0.5969750889679716,
"grad_norm": 0.263671875,
"learning_rate": 7.036294750992775e-05,
"loss": 0.6082,
"step": 2013
},
{
"epoch": 0.597864768683274,
"grad_norm": 0.263671875,
"learning_rate": 7.009532632039975e-05,
"loss": 0.6038,
"step": 2016
},
{
"epoch": 0.5987544483985765,
"grad_norm": 0.271484375,
"learning_rate": 6.982794014048077e-05,
"loss": 0.6042,
"step": 2019
},
{
"epoch": 0.599644128113879,
"grad_norm": 0.2734375,
"learning_rate": 6.956079107145845e-05,
"loss": 0.6194,
"step": 2022
},
{
"epoch": 0.6005338078291815,
"grad_norm": 0.265625,
"learning_rate": 6.92938812127571e-05,
"loss": 0.6125,
"step": 2025
},
{
"epoch": 0.6014234875444839,
"grad_norm": 0.265625,
"learning_rate": 6.902721266192111e-05,
"loss": 0.6206,
"step": 2028
},
{
"epoch": 0.6023131672597865,
"grad_norm": 0.267578125,
"learning_rate": 6.876078751459856e-05,
"loss": 0.5901,
"step": 2031
},
{
"epoch": 0.603202846975089,
"grad_norm": 0.27734375,
"learning_rate": 6.849460786452475e-05,
"loss": 0.6007,
"step": 2034
},
{
"epoch": 0.6040925266903915,
"grad_norm": 0.283203125,
"learning_rate": 6.822867580350563e-05,
"loss": 0.6419,
"step": 2037
},
{
"epoch": 0.604982206405694,
"grad_norm": 0.275390625,
"learning_rate": 6.79629934214015e-05,
"loss": 0.6086,
"step": 2040
},
{
"epoch": 0.6058718861209964,
"grad_norm": 0.271484375,
"learning_rate": 6.769756280611046e-05,
"loss": 0.6324,
"step": 2043
},
{
"epoch": 0.6067615658362989,
"grad_norm": 0.271484375,
"learning_rate": 6.743238604355219e-05,
"loss": 0.5897,
"step": 2046
},
{
"epoch": 0.6076512455516014,
"grad_norm": 0.259765625,
"learning_rate": 6.716746521765131e-05,
"loss": 0.598,
"step": 2049
},
{
"epoch": 0.608540925266904,
"grad_norm": 0.259765625,
"learning_rate": 6.690280241032116e-05,
"loss": 0.6165,
"step": 2052
},
{
"epoch": 0.6094306049822064,
"grad_norm": 0.265625,
"learning_rate": 6.663839970144751e-05,
"loss": 0.607,
"step": 2055
},
{
"epoch": 0.6103202846975089,
"grad_norm": 0.27734375,
"learning_rate": 6.637425916887198e-05,
"loss": 0.6024,
"step": 2058
},
{
"epoch": 0.6112099644128114,
"grad_norm": 0.26953125,
"learning_rate": 6.611038288837593e-05,
"loss": 0.6074,
"step": 2061
},
{
"epoch": 0.6120996441281139,
"grad_norm": 0.263671875,
"learning_rate": 6.584677293366396e-05,
"loss": 0.6156,
"step": 2064
},
{
"epoch": 0.6129893238434164,
"grad_norm": 0.287109375,
"learning_rate": 6.558343137634788e-05,
"loss": 0.6304,
"step": 2067
},
{
"epoch": 0.6138790035587188,
"grad_norm": 0.263671875,
"learning_rate": 6.532036028593011e-05,
"loss": 0.5998,
"step": 2070
},
{
"epoch": 0.6147686832740213,
"grad_norm": 0.263671875,
"learning_rate": 6.505756172978765e-05,
"loss": 0.6092,
"step": 2073
},
{
"epoch": 0.6156583629893239,
"grad_norm": 0.265625,
"learning_rate": 6.479503777315577e-05,
"loss": 0.6055,
"step": 2076
},
{
"epoch": 0.6165480427046264,
"grad_norm": 0.2734375,
"learning_rate": 6.453279047911169e-05,
"loss": 0.6009,
"step": 2079
},
{
"epoch": 0.6174377224199288,
"grad_norm": 0.267578125,
"learning_rate": 6.427082190855854e-05,
"loss": 0.6202,
"step": 2082
},
{
"epoch": 0.6183274021352313,
"grad_norm": 0.275390625,
"learning_rate": 6.400913412020895e-05,
"loss": 0.6211,
"step": 2085
},
{
"epoch": 0.6192170818505338,
"grad_norm": 0.267578125,
"learning_rate": 6.374772917056908e-05,
"loss": 0.6151,
"step": 2088
},
{
"epoch": 0.6201067615658363,
"grad_norm": 0.255859375,
"learning_rate": 6.34866091139224e-05,
"loss": 0.6111,
"step": 2091
},
{
"epoch": 0.6209964412811388,
"grad_norm": 0.25,
"learning_rate": 6.322577600231332e-05,
"loss": 0.6109,
"step": 2094
},
{
"epoch": 0.6218861209964412,
"grad_norm": 0.267578125,
"learning_rate": 6.296523188553153e-05,
"loss": 0.5829,
"step": 2097
},
{
"epoch": 0.6227758007117438,
"grad_norm": 0.259765625,
"learning_rate": 6.270497881109541e-05,
"loss": 0.6002,
"step": 2100
},
{
"epoch": 0.6236654804270463,
"grad_norm": 0.265625,
"learning_rate": 6.244501882423621e-05,
"loss": 0.6095,
"step": 2103
},
{
"epoch": 0.6245551601423488,
"grad_norm": 0.26171875,
"learning_rate": 6.21853539678819e-05,
"loss": 0.6049,
"step": 2106
},
{
"epoch": 0.6254448398576512,
"grad_norm": 0.25,
"learning_rate": 6.192598628264121e-05,
"loss": 0.604,
"step": 2109
},
{
"epoch": 0.6263345195729537,
"grad_norm": 0.2578125,
"learning_rate": 6.166691780678743e-05,
"loss": 0.6092,
"step": 2112
},
{
"epoch": 0.6272241992882562,
"grad_norm": 0.267578125,
"learning_rate": 6.140815057624248e-05,
"loss": 0.6099,
"step": 2115
},
{
"epoch": 0.6281138790035588,
"grad_norm": 0.259765625,
"learning_rate": 6.114968662456093e-05,
"loss": 0.5978,
"step": 2118
},
{
"epoch": 0.6290035587188612,
"grad_norm": 0.271484375,
"learning_rate": 6.089152798291398e-05,
"loss": 0.5994,
"step": 2121
},
{
"epoch": 0.6298932384341637,
"grad_norm": 0.2734375,
"learning_rate": 6.063367668007356e-05,
"loss": 0.5946,
"step": 2124
},
{
"epoch": 0.6307829181494662,
"grad_norm": 0.265625,
"learning_rate": 6.0376134742396276e-05,
"loss": 0.6131,
"step": 2127
},
{
"epoch": 0.6316725978647687,
"grad_norm": 0.2734375,
"learning_rate": 6.011890419380756e-05,
"loss": 0.6259,
"step": 2130
},
{
"epoch": 0.6325622775800712,
"grad_norm": 0.28125,
"learning_rate": 5.986198705578583e-05,
"loss": 0.6079,
"step": 2133
},
{
"epoch": 0.6334519572953736,
"grad_norm": 0.248046875,
"learning_rate": 5.960538534734641e-05,
"loss": 0.5792,
"step": 2136
},
{
"epoch": 0.6343416370106761,
"grad_norm": 0.255859375,
"learning_rate": 5.934910108502587e-05,
"loss": 0.5913,
"step": 2139
},
{
"epoch": 0.6352313167259787,
"grad_norm": 0.271484375,
"learning_rate": 5.909313628286601e-05,
"loss": 0.6215,
"step": 2142
},
{
"epoch": 0.6361209964412812,
"grad_norm": 0.265625,
"learning_rate": 5.8837492952398234e-05,
"loss": 0.5994,
"step": 2145
},
{
"epoch": 0.6370106761565836,
"grad_norm": 0.275390625,
"learning_rate": 5.8582173102627524e-05,
"loss": 0.5984,
"step": 2148
},
{
"epoch": 0.6379003558718861,
"grad_norm": 0.259765625,
"learning_rate": 5.8327178740016744e-05,
"loss": 0.612,
"step": 2151
},
{
"epoch": 0.6387900355871886,
"grad_norm": 0.2578125,
"learning_rate": 5.8072511868470945e-05,
"loss": 0.596,
"step": 2154
},
{
"epoch": 0.6396797153024911,
"grad_norm": 0.2578125,
"learning_rate": 5.781817448932145e-05,
"loss": 0.6327,
"step": 2157
},
{
"epoch": 0.6405693950177936,
"grad_norm": 0.265625,
"learning_rate": 5.756416860131036e-05,
"loss": 0.6162,
"step": 2160
},
{
"epoch": 0.641459074733096,
"grad_norm": 0.26953125,
"learning_rate": 5.731049620057457e-05,
"loss": 0.6125,
"step": 2163
},
{
"epoch": 0.6423487544483986,
"grad_norm": 0.2578125,
"learning_rate": 5.705715928063031e-05,
"loss": 0.6053,
"step": 2166
},
{
"epoch": 0.6432384341637011,
"grad_norm": 0.25390625,
"learning_rate": 5.6804159832357426e-05,
"loss": 0.5802,
"step": 2169
},
{
"epoch": 0.6441281138790036,
"grad_norm": 0.26953125,
"learning_rate": 5.655149984398359e-05,
"loss": 0.6088,
"step": 2172
},
{
"epoch": 0.645017793594306,
"grad_norm": 0.28125,
"learning_rate": 5.629918130106886e-05,
"loss": 0.6359,
"step": 2175
},
{
"epoch": 0.6459074733096085,
"grad_norm": 0.267578125,
"learning_rate": 5.6047206186489934e-05,
"loss": 0.5961,
"step": 2178
},
{
"epoch": 0.646797153024911,
"grad_norm": 0.265625,
"learning_rate": 5.5795576480424774e-05,
"loss": 0.5898,
"step": 2181
},
{
"epoch": 0.6476868327402135,
"grad_norm": 0.27734375,
"learning_rate": 5.554429416033673e-05,
"loss": 0.6315,
"step": 2184
},
{
"epoch": 0.6485765124555161,
"grad_norm": 0.259765625,
"learning_rate": 5.5293361200959314e-05,
"loss": 0.6186,
"step": 2187
},
{
"epoch": 0.6494661921708185,
"grad_norm": 0.259765625,
"learning_rate": 5.504277957428052e-05,
"loss": 0.6337,
"step": 2190
},
{
"epoch": 0.650355871886121,
"grad_norm": 0.267578125,
"learning_rate": 5.4792551249527314e-05,
"loss": 0.6123,
"step": 2193
},
{
"epoch": 0.6512455516014235,
"grad_norm": 0.267578125,
"learning_rate": 5.454267819315015e-05,
"loss": 0.6191,
"step": 2196
},
{
"epoch": 0.652135231316726,
"grad_norm": 0.275390625,
"learning_rate": 5.429316236880764e-05,
"loss": 0.6093,
"step": 2199
},
{
"epoch": 0.6530249110320284,
"grad_norm": 0.2734375,
"learning_rate": 5.4044005737351044e-05,
"loss": 0.6005,
"step": 2202
},
{
"epoch": 0.6539145907473309,
"grad_norm": 0.2734375,
"learning_rate": 5.379521025680878e-05,
"loss": 0.6154,
"step": 2205
},
{
"epoch": 0.6548042704626335,
"grad_norm": 0.265625,
"learning_rate": 5.3546777882371254e-05,
"loss": 0.6227,
"step": 2208
},
{
"epoch": 0.655693950177936,
"grad_norm": 0.263671875,
"learning_rate": 5.329871056637524e-05,
"loss": 0.6065,
"step": 2211
},
{
"epoch": 0.6565836298932385,
"grad_norm": 0.263671875,
"learning_rate": 5.305101025828863e-05,
"loss": 0.6088,
"step": 2214
},
{
"epoch": 0.6574733096085409,
"grad_norm": 0.275390625,
"learning_rate": 5.280367890469529e-05,
"loss": 0.5995,
"step": 2217
},
{
"epoch": 0.6583629893238434,
"grad_norm": 0.271484375,
"learning_rate": 5.255671844927944e-05,
"loss": 0.6172,
"step": 2220
},
{
"epoch": 0.6592526690391459,
"grad_norm": 0.275390625,
"learning_rate": 5.231013083281067e-05,
"loss": 0.603,
"step": 2223
},
{
"epoch": 0.6601423487544484,
"grad_norm": 0.265625,
"learning_rate": 5.2063917993128554e-05,
"loss": 0.6181,
"step": 2226
},
{
"epoch": 0.6610320284697508,
"grad_norm": 0.271484375,
"learning_rate": 5.1818081865127386e-05,
"loss": 0.6015,
"step": 2229
},
{
"epoch": 0.6619217081850534,
"grad_norm": 0.279296875,
"learning_rate": 5.157262438074104e-05,
"loss": 0.6155,
"step": 2232
},
{
"epoch": 0.6628113879003559,
"grad_norm": 0.259765625,
"learning_rate": 5.132754746892776e-05,
"loss": 0.5928,
"step": 2235
},
{
"epoch": 0.6637010676156584,
"grad_norm": 0.25390625,
"learning_rate": 5.1082853055655076e-05,
"loss": 0.6114,
"step": 2238
},
{
"epoch": 0.6645907473309609,
"grad_norm": 0.275390625,
"learning_rate": 5.0838543063884515e-05,
"loss": 0.6169,
"step": 2241
},
{
"epoch": 0.6654804270462633,
"grad_norm": 0.267578125,
"learning_rate": 5.059461941355666e-05,
"loss": 0.5962,
"step": 2244
},
{
"epoch": 0.6663701067615658,
"grad_norm": 0.28125,
"learning_rate": 5.035108402157598e-05,
"loss": 0.6391,
"step": 2247
},
{
"epoch": 0.6672597864768683,
"grad_norm": 0.275390625,
"learning_rate": 5.0107938801795695e-05,
"loss": 0.6067,
"step": 2250
},
{
"epoch": 0.6681494661921709,
"grad_norm": 0.263671875,
"learning_rate": 4.986518566500287e-05,
"loss": 0.6251,
"step": 2253
},
{
"epoch": 0.6690391459074733,
"grad_norm": 0.26953125,
"learning_rate": 4.962282651890325e-05,
"loss": 0.5799,
"step": 2256
},
{
"epoch": 0.6699288256227758,
"grad_norm": 0.259765625,
"learning_rate": 4.938086326810651e-05,
"loss": 0.6,
"step": 2259
},
{
"epoch": 0.6708185053380783,
"grad_norm": 0.2578125,
"learning_rate": 4.913929781411098e-05,
"loss": 0.5815,
"step": 2262
},
{
"epoch": 0.6717081850533808,
"grad_norm": 0.263671875,
"learning_rate": 4.889813205528895e-05,
"loss": 0.5998,
"step": 2265
},
{
"epoch": 0.6725978647686833,
"grad_norm": 0.259765625,
"learning_rate": 4.865736788687164e-05,
"loss": 0.6064,
"step": 2268
},
{
"epoch": 0.6734875444839857,
"grad_norm": 0.25390625,
"learning_rate": 4.8417007200934294e-05,
"loss": 0.5976,
"step": 2271
},
{
"epoch": 0.6743772241992882,
"grad_norm": 0.26171875,
"learning_rate": 4.8177051886381344e-05,
"loss": 0.5906,
"step": 2274
},
{
"epoch": 0.6752669039145908,
"grad_norm": 0.27734375,
"learning_rate": 4.793750382893151e-05,
"loss": 0.6096,
"step": 2277
},
{
"epoch": 0.6761565836298933,
"grad_norm": 0.263671875,
"learning_rate": 4.769836491110314e-05,
"loss": 0.6067,
"step": 2280
},
{
"epoch": 0.6770462633451957,
"grad_norm": 0.271484375,
"learning_rate": 4.74596370121993e-05,
"loss": 0.6047,
"step": 2283
},
{
"epoch": 0.6779359430604982,
"grad_norm": 0.259765625,
"learning_rate": 4.7221322008292915e-05,
"loss": 0.5876,
"step": 2286
},
{
"epoch": 0.6788256227758007,
"grad_norm": 0.259765625,
"learning_rate": 4.698342177221219e-05,
"loss": 0.6129,
"step": 2289
},
{
"epoch": 0.6797153024911032,
"grad_norm": 0.26171875,
"learning_rate": 4.674593817352575e-05,
"loss": 0.6128,
"step": 2292
},
{
"epoch": 0.6806049822064056,
"grad_norm": 0.26953125,
"learning_rate": 4.650887307852818e-05,
"loss": 0.6152,
"step": 2295
},
{
"epoch": 0.6814946619217082,
"grad_norm": 0.26171875,
"learning_rate": 4.627222835022502e-05,
"loss": 0.6167,
"step": 2298
},
{
"epoch": 0.6823843416370107,
"grad_norm": 0.2578125,
"learning_rate": 4.603600584831844e-05,
"loss": 0.6155,
"step": 2301
},
{
"epoch": 0.6832740213523132,
"grad_norm": 0.265625,
"learning_rate": 4.580020742919246e-05,
"loss": 0.6212,
"step": 2304
},
{
"epoch": 0.6841637010676157,
"grad_norm": 0.2578125,
"learning_rate": 4.556483494589836e-05,
"loss": 0.6115,
"step": 2307
},
{
"epoch": 0.6850533807829181,
"grad_norm": 0.267578125,
"learning_rate": 4.532989024814015e-05,
"loss": 0.5939,
"step": 2310
},
{
"epoch": 0.6859430604982206,
"grad_norm": 0.26171875,
"learning_rate": 4.5095375182260016e-05,
"loss": 0.5834,
"step": 2313
},
{
"epoch": 0.6868327402135231,
"grad_norm": 0.26171875,
"learning_rate": 4.486129159122393e-05,
"loss": 0.5859,
"step": 2316
},
{
"epoch": 0.6877224199288257,
"grad_norm": 0.26953125,
"learning_rate": 4.462764131460694e-05,
"loss": 0.6076,
"step": 2319
},
{
"epoch": 0.6886120996441281,
"grad_norm": 0.263671875,
"learning_rate": 4.439442618857891e-05,
"loss": 0.5954,
"step": 2322
},
{
"epoch": 0.6895017793594306,
"grad_norm": 0.26171875,
"learning_rate": 4.416164804589005e-05,
"loss": 0.6072,
"step": 2325
},
{
"epoch": 0.6903914590747331,
"grad_norm": 0.265625,
"learning_rate": 4.39293087158564e-05,
"loss": 0.6083,
"step": 2328
},
{
"epoch": 0.6912811387900356,
"grad_norm": 0.25390625,
"learning_rate": 4.369741002434556e-05,
"loss": 0.5948,
"step": 2331
},
{
"epoch": 0.6921708185053381,
"grad_norm": 0.251953125,
"learning_rate": 4.346595379376232e-05,
"loss": 0.6148,
"step": 2334
},
{
"epoch": 0.6930604982206405,
"grad_norm": 0.267578125,
"learning_rate": 4.323494184303435e-05,
"loss": 0.6134,
"step": 2337
},
{
"epoch": 0.693950177935943,
"grad_norm": 0.24609375,
"learning_rate": 4.3004375987597946e-05,
"loss": 0.5801,
"step": 2340
},
{
"epoch": 0.6948398576512456,
"grad_norm": 0.263671875,
"learning_rate": 4.277425803938356e-05,
"loss": 0.615,
"step": 2343
},
{
"epoch": 0.6957295373665481,
"grad_norm": 0.271484375,
"learning_rate": 4.254458980680188e-05,
"loss": 0.6239,
"step": 2346
},
{
"epoch": 0.6966192170818505,
"grad_norm": 0.279296875,
"learning_rate": 4.2315373094729316e-05,
"loss": 0.5997,
"step": 2349
},
{
"epoch": 0.697508896797153,
"grad_norm": 0.267578125,
"learning_rate": 4.2086609704494015e-05,
"loss": 0.5897,
"step": 2352
},
{
"epoch": 0.6983985765124555,
"grad_norm": 0.26171875,
"learning_rate": 4.1858301433861566e-05,
"loss": 0.5926,
"step": 2355
},
{
"epoch": 0.699288256227758,
"grad_norm": 0.263671875,
"learning_rate": 4.163045007702104e-05,
"loss": 0.5991,
"step": 2358
},
{
"epoch": 0.7001779359430605,
"grad_norm": 0.263671875,
"learning_rate": 4.14030574245708e-05,
"loss": 0.6193,
"step": 2361
},
{
"epoch": 0.701067615658363,
"grad_norm": 0.2734375,
"learning_rate": 4.117612526350428e-05,
"loss": 0.6146,
"step": 2364
},
{
"epoch": 0.7019572953736655,
"grad_norm": 0.26171875,
"learning_rate": 4.09496553771963e-05,
"loss": 0.6032,
"step": 2367
},
{
"epoch": 0.702846975088968,
"grad_norm": 0.263671875,
"learning_rate": 4.0723649545388575e-05,
"loss": 0.5999,
"step": 2370
},
{
"epoch": 0.7037366548042705,
"grad_norm": 0.267578125,
"learning_rate": 4.0498109544176245e-05,
"loss": 0.5979,
"step": 2373
},
{
"epoch": 0.7046263345195729,
"grad_norm": 0.251953125,
"learning_rate": 4.0273037145993454e-05,
"loss": 0.6016,
"step": 2376
},
{
"epoch": 0.7055160142348754,
"grad_norm": 0.26171875,
"learning_rate": 4.0048434119599765e-05,
"loss": 0.6017,
"step": 2379
},
{
"epoch": 0.7064056939501779,
"grad_norm": 0.267578125,
"learning_rate": 3.982430223006613e-05,
"loss": 0.5984,
"step": 2382
},
{
"epoch": 0.7072953736654805,
"grad_norm": 0.287109375,
"learning_rate": 3.960064323876093e-05,
"loss": 0.5982,
"step": 2385
},
{
"epoch": 0.708185053380783,
"grad_norm": 0.2578125,
"learning_rate": 3.937745890333623e-05,
"loss": 0.5984,
"step": 2388
},
{
"epoch": 0.7090747330960854,
"grad_norm": 0.267578125,
"learning_rate": 3.915475097771396e-05,
"loss": 0.6207,
"step": 2391
},
{
"epoch": 0.7099644128113879,
"grad_norm": 0.26953125,
"learning_rate": 3.8932521212072206e-05,
"loss": 0.6029,
"step": 2394
},
{
"epoch": 0.7108540925266904,
"grad_norm": 0.255859375,
"learning_rate": 3.871077135283123e-05,
"loss": 0.5887,
"step": 2397
},
{
"epoch": 0.7117437722419929,
"grad_norm": 0.267578125,
"learning_rate": 3.8489503142640016e-05,
"loss": 0.6058,
"step": 2400
},
{
"epoch": 0.7126334519572953,
"grad_norm": 0.26171875,
"learning_rate": 3.826871832036242e-05,
"loss": 0.606,
"step": 2403
},
{
"epoch": 0.7135231316725978,
"grad_norm": 0.2578125,
"learning_rate": 3.804841862106347e-05,
"loss": 0.5846,
"step": 2406
},
{
"epoch": 0.7144128113879004,
"grad_norm": 0.287109375,
"learning_rate": 3.782860577599585e-05,
"loss": 0.6075,
"step": 2409
},
{
"epoch": 0.7153024911032029,
"grad_norm": 0.26953125,
"learning_rate": 3.7609281512586203e-05,
"loss": 0.6038,
"step": 2412
},
{
"epoch": 0.7161921708185054,
"grad_norm": 0.267578125,
"learning_rate": 3.739044755442162e-05,
"loss": 0.6201,
"step": 2415
},
{
"epoch": 0.7170818505338078,
"grad_norm": 0.259765625,
"learning_rate": 3.717210562123613e-05,
"loss": 0.6062,
"step": 2418
},
{
"epoch": 0.7179715302491103,
"grad_norm": 0.2734375,
"learning_rate": 3.695425742889698e-05,
"loss": 0.6108,
"step": 2421
},
{
"epoch": 0.7188612099644128,
"grad_norm": 0.26171875,
"learning_rate": 3.6736904689391417e-05,
"loss": 0.6307,
"step": 2424
},
{
"epoch": 0.7197508896797153,
"grad_norm": 0.271484375,
"learning_rate": 3.6520049110813035e-05,
"loss": 0.6057,
"step": 2427
},
{
"epoch": 0.7206405693950177,
"grad_norm": 0.2734375,
"learning_rate": 3.6303692397348455e-05,
"loss": 0.6262,
"step": 2430
},
{
"epoch": 0.7215302491103203,
"grad_norm": 0.2578125,
"learning_rate": 3.6087836249263875e-05,
"loss": 0.599,
"step": 2433
},
{
"epoch": 0.7224199288256228,
"grad_norm": 0.255859375,
"learning_rate": 3.58724823628918e-05,
"loss": 0.5879,
"step": 2436
},
{
"epoch": 0.7233096085409253,
"grad_norm": 0.25390625,
"learning_rate": 3.5657632430617635e-05,
"loss": 0.6095,
"step": 2439
},
{
"epoch": 0.7241992882562278,
"grad_norm": 0.267578125,
"learning_rate": 3.5443288140866316e-05,
"loss": 0.593,
"step": 2442
},
{
"epoch": 0.7250889679715302,
"grad_norm": 0.2578125,
"learning_rate": 3.522945117808929e-05,
"loss": 0.5932,
"step": 2445
},
{
"epoch": 0.7259786476868327,
"grad_norm": 0.263671875,
"learning_rate": 3.501612322275086e-05,
"loss": 0.6149,
"step": 2448
},
{
"epoch": 0.7268683274021353,
"grad_norm": 0.263671875,
"learning_rate": 3.48033059513155e-05,
"loss": 0.591,
"step": 2451
},
{
"epoch": 0.7277580071174378,
"grad_norm": 0.259765625,
"learning_rate": 3.45910010362342e-05,
"loss": 0.5986,
"step": 2454
},
{
"epoch": 0.7286476868327402,
"grad_norm": 0.265625,
"learning_rate": 3.437921014593167e-05,
"loss": 0.5983,
"step": 2457
},
{
"epoch": 0.7295373665480427,
"grad_norm": 0.2734375,
"learning_rate": 3.416793494479308e-05,
"loss": 0.6305,
"step": 2460
},
{
"epoch": 0.7304270462633452,
"grad_norm": 0.2578125,
"learning_rate": 3.3957177093150915e-05,
"loss": 0.6212,
"step": 2463
},
{
"epoch": 0.7313167259786477,
"grad_norm": 0.275390625,
"learning_rate": 3.374693824727204e-05,
"loss": 0.6132,
"step": 2466
},
{
"epoch": 0.7322064056939501,
"grad_norm": 0.251953125,
"learning_rate": 3.353722005934463e-05,
"loss": 0.5886,
"step": 2469
},
{
"epoch": 0.7330960854092526,
"grad_norm": 0.296875,
"learning_rate": 3.332802417746527e-05,
"loss": 0.6087,
"step": 2472
},
{
"epoch": 0.7339857651245552,
"grad_norm": 0.267578125,
"learning_rate": 3.311935224562591e-05,
"loss": 0.6045,
"step": 2475
},
{
"epoch": 0.7348754448398577,
"grad_norm": 0.259765625,
"learning_rate": 3.291120590370091e-05,
"loss": 0.5976,
"step": 2478
},
{
"epoch": 0.7357651245551602,
"grad_norm": 0.267578125,
"learning_rate": 3.270358678743434e-05,
"loss": 0.6191,
"step": 2481
},
{
"epoch": 0.7366548042704626,
"grad_norm": 0.25390625,
"learning_rate": 3.249649652842687e-05,
"loss": 0.5864,
"step": 2484
},
{
"epoch": 0.7375444839857651,
"grad_norm": 0.265625,
"learning_rate": 3.228993675412315e-05,
"loss": 0.6063,
"step": 2487
},
{
"epoch": 0.7384341637010676,
"grad_norm": 0.259765625,
"learning_rate": 3.20839090877989e-05,
"loss": 0.5998,
"step": 2490
},
{
"epoch": 0.7393238434163701,
"grad_norm": 0.248046875,
"learning_rate": 3.187841514854829e-05,
"loss": 0.5807,
"step": 2493
},
{
"epoch": 0.7402135231316725,
"grad_norm": 0.263671875,
"learning_rate": 3.1673456551271086e-05,
"loss": 0.5983,
"step": 2496
},
{
"epoch": 0.7411032028469751,
"grad_norm": 0.259765625,
"learning_rate": 3.1469034906659946e-05,
"loss": 0.6053,
"step": 2499
},
{
"epoch": 0.7419928825622776,
"grad_norm": 0.267578125,
"learning_rate": 3.126515182118793e-05,
"loss": 0.5994,
"step": 2502
},
{
"epoch": 0.7428825622775801,
"grad_norm": 0.263671875,
"learning_rate": 3.106180889709567e-05,
"loss": 0.5969,
"step": 2505
},
{
"epoch": 0.7437722419928826,
"grad_norm": 0.259765625,
"learning_rate": 3.0859007732378896e-05,
"loss": 0.5936,
"step": 2508
},
{
"epoch": 0.744661921708185,
"grad_norm": 0.251953125,
"learning_rate": 3.065674992077584e-05,
"loss": 0.5717,
"step": 2511
},
{
"epoch": 0.7455516014234875,
"grad_norm": 0.2578125,
"learning_rate": 3.0455037051754777e-05,
"loss": 0.6061,
"step": 2514
},
{
"epoch": 0.74644128113879,
"grad_norm": 0.251953125,
"learning_rate": 3.0253870710501475e-05,
"loss": 0.5914,
"step": 2517
},
{
"epoch": 0.7473309608540926,
"grad_norm": 0.251953125,
"learning_rate": 3.005325247790668e-05,
"loss": 0.6067,
"step": 2520
},
{
"epoch": 0.748220640569395,
"grad_norm": 0.271484375,
"learning_rate": 2.9853183930553853e-05,
"loss": 0.5909,
"step": 2523
},
{
"epoch": 0.7491103202846975,
"grad_norm": 0.25,
"learning_rate": 2.965366664070661e-05,
"loss": 0.5847,
"step": 2526
},
{
"epoch": 0.75,
"grad_norm": 0.26953125,
"learning_rate": 2.9454702176296423e-05,
"loss": 0.5907,
"step": 2529
},
{
"epoch": 0.7508896797153025,
"grad_norm": 0.26953125,
"learning_rate": 2.925629210091043e-05,
"loss": 0.606,
"step": 2532
},
{
"epoch": 0.751779359430605,
"grad_norm": 0.267578125,
"learning_rate": 2.9058437973778896e-05,
"loss": 0.6055,
"step": 2535
},
{
"epoch": 0.7526690391459074,
"grad_norm": 0.2578125,
"learning_rate": 2.886114134976322e-05,
"loss": 0.5993,
"step": 2538
},
{
"epoch": 0.75355871886121,
"grad_norm": 0.271484375,
"learning_rate": 2.866440377934352e-05,
"loss": 0.6098,
"step": 2541
},
{
"epoch": 0.7544483985765125,
"grad_norm": 0.26171875,
"learning_rate": 2.8468226808606522e-05,
"loss": 0.584,
"step": 2544
},
{
"epoch": 0.755338078291815,
"grad_norm": 0.255859375,
"learning_rate": 2.827261197923341e-05,
"loss": 0.5949,
"step": 2547
},
{
"epoch": 0.7562277580071174,
"grad_norm": 0.271484375,
"learning_rate": 2.8077560828487748e-05,
"loss": 0.5698,
"step": 2550
},
{
"epoch": 0.7571174377224199,
"grad_norm": 0.26171875,
"learning_rate": 2.7883074889203363e-05,
"loss": 0.612,
"step": 2553
},
{
"epoch": 0.7580071174377224,
"grad_norm": 0.26953125,
"learning_rate": 2.7689155689772217e-05,
"loss": 0.5951,
"step": 2556
},
{
"epoch": 0.7588967971530249,
"grad_norm": 0.259765625,
"learning_rate": 2.7495804754132602e-05,
"loss": 0.5841,
"step": 2559
},
{
"epoch": 0.7597864768683275,
"grad_norm": 0.26953125,
"learning_rate": 2.7303023601756928e-05,
"loss": 0.5978,
"step": 2562
},
{
"epoch": 0.7606761565836299,
"grad_norm": 0.251953125,
"learning_rate": 2.711081374763993e-05,
"loss": 0.5994,
"step": 2565
},
{
"epoch": 0.7615658362989324,
"grad_norm": 0.263671875,
"learning_rate": 2.6919176702286698e-05,
"loss": 0.6014,
"step": 2568
},
{
"epoch": 0.7624555160142349,
"grad_norm": 0.271484375,
"learning_rate": 2.6728113971700908e-05,
"loss": 0.5958,
"step": 2571
},
{
"epoch": 0.7633451957295374,
"grad_norm": 0.28125,
"learning_rate": 2.653762705737287e-05,
"loss": 0.6242,
"step": 2574
},
{
"epoch": 0.7642348754448398,
"grad_norm": 0.26171875,
"learning_rate": 2.634771745626772e-05,
"loss": 0.616,
"step": 2577
},
{
"epoch": 0.7651245551601423,
"grad_norm": 0.25390625,
"learning_rate": 2.6158386660813806e-05,
"loss": 0.5959,
"step": 2580
},
{
"epoch": 0.7660142348754448,
"grad_norm": 0.26953125,
"learning_rate": 2.5969636158890775e-05,
"loss": 0.5971,
"step": 2583
},
{
"epoch": 0.7669039145907474,
"grad_norm": 0.26171875,
"learning_rate": 2.5781467433817973e-05,
"loss": 0.593,
"step": 2586
},
{
"epoch": 0.7677935943060499,
"grad_norm": 0.2578125,
"learning_rate": 2.5593881964342857e-05,
"loss": 0.5841,
"step": 2589
},
{
"epoch": 0.7686832740213523,
"grad_norm": 0.251953125,
"learning_rate": 2.5406881224629174e-05,
"loss": 0.6111,
"step": 2592
},
{
"epoch": 0.7695729537366548,
"grad_norm": 0.263671875,
"learning_rate": 2.5220466684245646e-05,
"loss": 0.5758,
"step": 2595
},
{
"epoch": 0.7704626334519573,
"grad_norm": 0.263671875,
"learning_rate": 2.5034639808154114e-05,
"loss": 0.6276,
"step": 2598
},
{
"epoch": 0.7713523131672598,
"grad_norm": 0.2578125,
"learning_rate": 2.4849402056698334e-05,
"loss": 0.6062,
"step": 2601
},
{
"epoch": 0.7722419928825622,
"grad_norm": 0.263671875,
"learning_rate": 2.4664754885592268e-05,
"loss": 0.5881,
"step": 2604
},
{
"epoch": 0.7731316725978647,
"grad_norm": 0.2578125,
"learning_rate": 2.4480699745908707e-05,
"loss": 0.6124,
"step": 2607
},
{
"epoch": 0.7740213523131673,
"grad_norm": 0.259765625,
"learning_rate": 2.4297238084067985e-05,
"loss": 0.5779,
"step": 2610
},
{
"epoch": 0.7749110320284698,
"grad_norm": 0.263671875,
"learning_rate": 2.4114371341826415e-05,
"loss": 0.6019,
"step": 2613
},
{
"epoch": 0.7758007117437722,
"grad_norm": 0.259765625,
"learning_rate": 2.3932100956265148e-05,
"loss": 0.6087,
"step": 2616
},
{
"epoch": 0.7766903914590747,
"grad_norm": 0.265625,
"learning_rate": 2.375042835977872e-05,
"loss": 0.5983,
"step": 2619
},
{
"epoch": 0.7775800711743772,
"grad_norm": 0.283203125,
"learning_rate": 2.3569354980063906e-05,
"loss": 0.6024,
"step": 2622
},
{
"epoch": 0.7784697508896797,
"grad_norm": 0.25,
"learning_rate": 2.3388882240108423e-05,
"loss": 0.6039,
"step": 2625
},
{
"epoch": 0.7793594306049823,
"grad_norm": 0.26953125,
"learning_rate": 2.3209011558179826e-05,
"loss": 0.5958,
"step": 2628
},
{
"epoch": 0.7802491103202847,
"grad_norm": 0.26171875,
"learning_rate": 2.3029744347814365e-05,
"loss": 0.5979,
"step": 2631
},
{
"epoch": 0.7811387900355872,
"grad_norm": 0.25390625,
"learning_rate": 2.2851082017805703e-05,
"loss": 0.5918,
"step": 2634
},
{
"epoch": 0.7820284697508897,
"grad_norm": 0.26171875,
"learning_rate": 2.2673025972194106e-05,
"loss": 0.5906,
"step": 2637
},
{
"epoch": 0.7829181494661922,
"grad_norm": 0.255859375,
"learning_rate": 2.2495577610255203e-05,
"loss": 0.5857,
"step": 2640
},
{
"epoch": 0.7838078291814946,
"grad_norm": 0.26171875,
"learning_rate": 2.2318738326489074e-05,
"loss": 0.602,
"step": 2643
},
{
"epoch": 0.7846975088967971,
"grad_norm": 0.26171875,
"learning_rate": 2.2142509510609277e-05,
"loss": 0.5846,
"step": 2646
},
{
"epoch": 0.7855871886120996,
"grad_norm": 0.265625,
"learning_rate": 2.196689254753196e-05,
"loss": 0.5983,
"step": 2649
},
{
"epoch": 0.7864768683274022,
"grad_norm": 0.263671875,
"learning_rate": 2.179188881736498e-05,
"loss": 0.5753,
"step": 2652
},
{
"epoch": 0.7873665480427047,
"grad_norm": 0.2578125,
"learning_rate": 2.1617499695396924e-05,
"loss": 0.605,
"step": 2655
},
{
"epoch": 0.7882562277580071,
"grad_norm": 0.265625,
"learning_rate": 2.1443726552086528e-05,
"loss": 0.5982,
"step": 2658
},
{
"epoch": 0.7891459074733096,
"grad_norm": 0.25390625,
"learning_rate": 2.1270570753051668e-05,
"loss": 0.5972,
"step": 2661
},
{
"epoch": 0.7900355871886121,
"grad_norm": 0.25390625,
"learning_rate": 2.109803365905879e-05,
"loss": 0.5869,
"step": 2664
},
{
"epoch": 0.7909252669039146,
"grad_norm": 0.275390625,
"learning_rate": 2.0926116626012205e-05,
"loss": 0.5984,
"step": 2667
},
{
"epoch": 0.791814946619217,
"grad_norm": 0.259765625,
"learning_rate": 2.0754821004943336e-05,
"loss": 0.6054,
"step": 2670
},
{
"epoch": 0.7927046263345195,
"grad_norm": 0.255859375,
"learning_rate": 2.0584148142000225e-05,
"loss": 0.5809,
"step": 2673
},
{
"epoch": 0.7935943060498221,
"grad_norm": 0.267578125,
"learning_rate": 2.0414099378436813e-05,
"loss": 0.6019,
"step": 2676
},
{
"epoch": 0.7944839857651246,
"grad_norm": 0.259765625,
"learning_rate": 2.0244676050602572e-05,
"loss": 0.5837,
"step": 2679
},
{
"epoch": 0.7953736654804271,
"grad_norm": 0.267578125,
"learning_rate": 2.0075879489931847e-05,
"loss": 0.6226,
"step": 2682
},
{
"epoch": 0.7962633451957295,
"grad_norm": 0.271484375,
"learning_rate": 1.990771102293344e-05,
"loss": 0.6255,
"step": 2685
},
{
"epoch": 0.797153024911032,
"grad_norm": 0.25390625,
"learning_rate": 1.9740171971180278e-05,
"loss": 0.5958,
"step": 2688
},
{
"epoch": 0.7980427046263345,
"grad_norm": 0.25,
"learning_rate": 1.9573263651298836e-05,
"loss": 0.5902,
"step": 2691
},
{
"epoch": 0.798932384341637,
"grad_norm": 0.25,
"learning_rate": 1.940698737495904e-05,
"loss": 0.6078,
"step": 2694
},
{
"epoch": 0.7998220640569395,
"grad_norm": 0.259765625,
"learning_rate": 1.9241344448863696e-05,
"loss": 0.6009,
"step": 2697
},
{
"epoch": 0.800711743772242,
"grad_norm": 0.26953125,
"learning_rate": 1.9076336174738473e-05,
"loss": 0.5657,
"step": 2700
},
{
"epoch": 0.8016014234875445,
"grad_norm": 0.251953125,
"learning_rate": 1.891196384932139e-05,
"loss": 0.5824,
"step": 2703
},
{
"epoch": 0.802491103202847,
"grad_norm": 0.2578125,
"learning_rate": 1.8748228764352914e-05,
"loss": 0.5945,
"step": 2706
},
{
"epoch": 0.8033807829181495,
"grad_norm": 0.25390625,
"learning_rate": 1.858513220656567e-05,
"loss": 0.5806,
"step": 2709
},
{
"epoch": 0.8042704626334519,
"grad_norm": 0.255859375,
"learning_rate": 1.8422675457674254e-05,
"loss": 0.5963,
"step": 2712
},
{
"epoch": 0.8051601423487544,
"grad_norm": 0.251953125,
"learning_rate": 1.8260859794365338e-05,
"loss": 0.5849,
"step": 2715
},
{
"epoch": 0.806049822064057,
"grad_norm": 0.275390625,
"learning_rate": 1.809968648828748e-05,
"loss": 0.6066,
"step": 2718
},
{
"epoch": 0.8069395017793595,
"grad_norm": 0.25,
"learning_rate": 1.7939156806041203e-05,
"loss": 0.5969,
"step": 2721
},
{
"epoch": 0.8078291814946619,
"grad_norm": 0.259765625,
"learning_rate": 1.777927200916907e-05,
"loss": 0.5832,
"step": 2724
},
{
"epoch": 0.8087188612099644,
"grad_norm": 0.259765625,
"learning_rate": 1.762003335414566e-05,
"loss": 0.5965,
"step": 2727
},
{
"epoch": 0.8096085409252669,
"grad_norm": 0.25390625,
"learning_rate": 1.7461442092367862e-05,
"loss": 0.5792,
"step": 2730
},
{
"epoch": 0.8104982206405694,
"grad_norm": 0.267578125,
"learning_rate": 1.7303499470144846e-05,
"loss": 0.5943,
"step": 2733
},
{
"epoch": 0.8113879003558719,
"grad_norm": 0.255859375,
"learning_rate": 1.7146206728688463e-05,
"loss": 0.6036,
"step": 2736
},
{
"epoch": 0.8122775800711743,
"grad_norm": 0.25390625,
"learning_rate": 1.6989565104103312e-05,
"loss": 0.5755,
"step": 2739
},
{
"epoch": 0.8131672597864769,
"grad_norm": 0.265625,
"learning_rate": 1.6833575827377134e-05,
"loss": 0.6149,
"step": 2742
},
{
"epoch": 0.8140569395017794,
"grad_norm": 0.26171875,
"learning_rate": 1.6678240124371157e-05,
"loss": 0.5944,
"step": 2745
},
{
"epoch": 0.8149466192170819,
"grad_norm": 0.263671875,
"learning_rate": 1.6523559215810337e-05,
"loss": 0.5925,
"step": 2748
},
{
"epoch": 0.8158362989323843,
"grad_norm": 0.26171875,
"learning_rate": 1.636953431727395e-05,
"loss": 0.5936,
"step": 2751
},
{
"epoch": 0.8167259786476868,
"grad_norm": 0.259765625,
"learning_rate": 1.6216166639185803e-05,
"loss": 0.5973,
"step": 2754
},
{
"epoch": 0.8176156583629893,
"grad_norm": 0.263671875,
"learning_rate": 1.6063457386805004e-05,
"loss": 0.6125,
"step": 2757
},
{
"epoch": 0.8185053380782918,
"grad_norm": 0.259765625,
"learning_rate": 1.5911407760216235e-05,
"loss": 0.6036,
"step": 2760
},
{
"epoch": 0.8193950177935944,
"grad_norm": 0.25,
"learning_rate": 1.576001895432042e-05,
"loss": 0.597,
"step": 2763
},
{
"epoch": 0.8202846975088968,
"grad_norm": 0.251953125,
"learning_rate": 1.5609292158825438e-05,
"loss": 0.5872,
"step": 2766
},
{
"epoch": 0.8211743772241993,
"grad_norm": 0.2578125,
"learning_rate": 1.545922855823656e-05,
"loss": 0.6095,
"step": 2769
},
{
"epoch": 0.8220640569395018,
"grad_norm": 0.255859375,
"learning_rate": 1.530982933184737e-05,
"loss": 0.603,
"step": 2772
},
{
"epoch": 0.8229537366548043,
"grad_norm": 0.263671875,
"learning_rate": 1.5161095653730273e-05,
"loss": 0.5938,
"step": 2775
},
{
"epoch": 0.8238434163701067,
"grad_norm": 0.26953125,
"learning_rate": 1.5013028692727481e-05,
"loss": 0.6032,
"step": 2778
},
{
"epoch": 0.8247330960854092,
"grad_norm": 0.2578125,
"learning_rate": 1.4865629612441656e-05,
"loss": 0.5877,
"step": 2781
},
{
"epoch": 0.8256227758007118,
"grad_norm": 0.25390625,
"learning_rate": 1.471889957122684e-05,
"loss": 0.6057,
"step": 2784
},
{
"epoch": 0.8265124555160143,
"grad_norm": 0.259765625,
"learning_rate": 1.457283972217941e-05,
"loss": 0.6011,
"step": 2787
},
{
"epoch": 0.8274021352313167,
"grad_norm": 0.263671875,
"learning_rate": 1.4427451213128873e-05,
"loss": 0.6009,
"step": 2790
},
{
"epoch": 0.8282918149466192,
"grad_norm": 0.27734375,
"learning_rate": 1.4282735186629014e-05,
"loss": 0.6263,
"step": 2793
},
{
"epoch": 0.8291814946619217,
"grad_norm": 0.2578125,
"learning_rate": 1.4138692779948748e-05,
"loss": 0.5978,
"step": 2796
},
{
"epoch": 0.8300711743772242,
"grad_norm": 0.24609375,
"learning_rate": 1.3995325125063274e-05,
"loss": 0.5968,
"step": 2799
},
{
"epoch": 0.8309608540925267,
"grad_norm": 0.26171875,
"learning_rate": 1.3852633348645262e-05,
"loss": 0.6074,
"step": 2802
},
{
"epoch": 0.8318505338078291,
"grad_norm": 0.25390625,
"learning_rate": 1.3710618572055767e-05,
"loss": 0.608,
"step": 2805
},
{
"epoch": 0.8327402135231317,
"grad_norm": 0.263671875,
"learning_rate": 1.3569281911335684e-05,
"loss": 0.5896,
"step": 2808
},
{
"epoch": 0.8336298932384342,
"grad_norm": 0.265625,
"learning_rate": 1.3428624477196761e-05,
"loss": 0.6042,
"step": 2811
},
{
"epoch": 0.8345195729537367,
"grad_norm": 0.255859375,
"learning_rate": 1.328864737501302e-05,
"loss": 0.6092,
"step": 2814
},
{
"epoch": 0.8354092526690391,
"grad_norm": 0.25,
"learning_rate": 1.3149351704811962e-05,
"loss": 0.6081,
"step": 2817
},
{
"epoch": 0.8362989323843416,
"grad_norm": 0.2734375,
"learning_rate": 1.3010738561265979e-05,
"loss": 0.5918,
"step": 2820
},
{
"epoch": 0.8371886120996441,
"grad_norm": 0.259765625,
"learning_rate": 1.2872809033683798e-05,
"loss": 0.595,
"step": 2823
},
{
"epoch": 0.8380782918149466,
"grad_norm": 0.267578125,
"learning_rate": 1.2735564206001749e-05,
"loss": 0.5856,
"step": 2826
},
{
"epoch": 0.8389679715302492,
"grad_norm": 0.248046875,
"learning_rate": 1.2599005156775512e-05,
"loss": 0.5715,
"step": 2829
},
{
"epoch": 0.8398576512455516,
"grad_norm": 0.255859375,
"learning_rate": 1.2463132959171341e-05,
"loss": 0.607,
"step": 2832
},
{
"epoch": 0.8407473309608541,
"grad_norm": 0.259765625,
"learning_rate": 1.2327948680957924e-05,
"loss": 0.5798,
"step": 2835
},
{
"epoch": 0.8416370106761566,
"grad_norm": 0.251953125,
"learning_rate": 1.2193453384497722e-05,
"loss": 0.6194,
"step": 2838
},
{
"epoch": 0.8425266903914591,
"grad_norm": 0.263671875,
"learning_rate": 1.205964812673881e-05,
"loss": 0.597,
"step": 2841
},
{
"epoch": 0.8434163701067615,
"grad_norm": 0.27734375,
"learning_rate": 1.192653395920652e-05,
"loss": 0.6119,
"step": 2844
},
{
"epoch": 0.844306049822064,
"grad_norm": 0.259765625,
"learning_rate": 1.179411192799511e-05,
"loss": 0.5891,
"step": 2847
},
{
"epoch": 0.8451957295373665,
"grad_norm": 0.251953125,
"learning_rate": 1.1662383073759685e-05,
"loss": 0.6034,
"step": 2850
},
{
"epoch": 0.8460854092526691,
"grad_norm": 0.25390625,
"learning_rate": 1.1531348431707823e-05,
"loss": 0.5686,
"step": 2853
},
{
"epoch": 0.8469750889679716,
"grad_norm": 0.263671875,
"learning_rate": 1.1401009031591658e-05,
"loss": 0.5882,
"step": 2856
},
{
"epoch": 0.847864768683274,
"grad_norm": 0.255859375,
"learning_rate": 1.1271365897699615e-05,
"loss": 0.5835,
"step": 2859
},
{
"epoch": 0.8487544483985765,
"grad_norm": 0.26171875,
"learning_rate": 1.114242004884839e-05,
"loss": 0.6015,
"step": 2862
},
{
"epoch": 0.849644128113879,
"grad_norm": 0.265625,
"learning_rate": 1.1014172498375086e-05,
"loss": 0.6059,
"step": 2865
},
{
"epoch": 0.8505338078291815,
"grad_norm": 0.25390625,
"learning_rate": 1.088662425412903e-05,
"loss": 0.5979,
"step": 2868
},
{
"epoch": 0.8514234875444839,
"grad_norm": 0.26171875,
"learning_rate": 1.0759776318464043e-05,
"loss": 0.6005,
"step": 2871
},
{
"epoch": 0.8523131672597865,
"grad_norm": 0.271484375,
"learning_rate": 1.0633629688230452e-05,
"loss": 0.595,
"step": 2874
},
{
"epoch": 0.853202846975089,
"grad_norm": 0.2578125,
"learning_rate": 1.0508185354767264e-05,
"loss": 0.5866,
"step": 2877
},
{
"epoch": 0.8540925266903915,
"grad_norm": 0.251953125,
"learning_rate": 1.0383444303894452e-05,
"loss": 0.6049,
"step": 2880
},
{
"epoch": 0.854982206405694,
"grad_norm": 0.271484375,
"learning_rate": 1.0259407515905094e-05,
"loss": 0.6029,
"step": 2883
},
{
"epoch": 0.8558718861209964,
"grad_norm": 0.255859375,
"learning_rate": 1.0136075965557811e-05,
"loss": 0.6028,
"step": 2886
},
{
"epoch": 0.8567615658362989,
"grad_norm": 0.2490234375,
"learning_rate": 1.0013450622068921e-05,
"loss": 0.6034,
"step": 2889
},
{
"epoch": 0.8576512455516014,
"grad_norm": 0.24609375,
"learning_rate": 9.891532449105045e-06,
"loss": 0.6008,
"step": 2892
},
{
"epoch": 0.858540925266904,
"grad_norm": 0.251953125,
"learning_rate": 9.770322404775323e-06,
"loss": 0.5886,
"step": 2895
},
{
"epoch": 0.8594306049822064,
"grad_norm": 0.2578125,
"learning_rate": 9.649821441623986e-06,
"loss": 0.5991,
"step": 2898
},
{
"epoch": 0.8603202846975089,
"grad_norm": 0.263671875,
"learning_rate": 9.530030506622934e-06,
"loss": 0.6189,
"step": 2901
},
{
"epoch": 0.8612099644128114,
"grad_norm": 0.259765625,
"learning_rate": 9.410950541164143e-06,
"loss": 0.6034,
"step": 2904
},
{
"epoch": 0.8620996441281139,
"grad_norm": 0.2490234375,
"learning_rate": 9.292582481052403e-06,
"loss": 0.5953,
"step": 2907
},
{
"epoch": 0.8629893238434164,
"grad_norm": 0.26171875,
"learning_rate": 9.174927256497844e-06,
"loss": 0.5974,
"step": 2910
},
{
"epoch": 0.8638790035587188,
"grad_norm": 0.255859375,
"learning_rate": 9.05798579210878e-06,
"loss": 0.6074,
"step": 2913
},
{
"epoch": 0.8647686832740213,
"grad_norm": 0.2578125,
"learning_rate": 8.941759006884265e-06,
"loss": 0.5955,
"step": 2916
},
{
"epoch": 0.8656583629893239,
"grad_norm": 0.271484375,
"learning_rate": 8.826247814206967e-06,
"loss": 0.6182,
"step": 2919
},
{
"epoch": 0.8665480427046264,
"grad_norm": 0.275390625,
"learning_rate": 8.711453121836066e-06,
"loss": 0.5978,
"step": 2922
},
{
"epoch": 0.8674377224199288,
"grad_norm": 0.25,
"learning_rate": 8.597375831899913e-06,
"loss": 0.5886,
"step": 2925
},
{
"epoch": 0.8683274021352313,
"grad_norm": 0.259765625,
"learning_rate": 8.484016840889176e-06,
"loss": 0.6046,
"step": 2928
},
{
"epoch": 0.8692170818505338,
"grad_norm": 0.2578125,
"learning_rate": 8.371377039649586e-06,
"loss": 0.6104,
"step": 2931
},
{
"epoch": 0.8701067615658363,
"grad_norm": 0.26953125,
"learning_rate": 8.259457313375096e-06,
"loss": 0.6054,
"step": 2934
},
{
"epoch": 0.8709964412811388,
"grad_norm": 0.2734375,
"learning_rate": 8.14825854160085e-06,
"loss": 0.5899,
"step": 2937
},
{
"epoch": 0.8718861209964412,
"grad_norm": 0.2578125,
"learning_rate": 8.037781598196225e-06,
"loss": 0.5991,
"step": 2940
},
{
"epoch": 0.8727758007117438,
"grad_norm": 0.259765625,
"learning_rate": 7.928027351358114e-06,
"loss": 0.5856,
"step": 2943
},
{
"epoch": 0.8736654804270463,
"grad_norm": 0.263671875,
"learning_rate": 7.818996663603917e-06,
"loss": 0.6008,
"step": 2946
},
{
"epoch": 0.8745551601423488,
"grad_norm": 0.251953125,
"learning_rate": 7.71069039176493e-06,
"loss": 0.5707,
"step": 2949
},
{
"epoch": 0.8754448398576512,
"grad_norm": 0.271484375,
"learning_rate": 7.603109386979501e-06,
"loss": 0.588,
"step": 2952
},
{
"epoch": 0.8763345195729537,
"grad_norm": 0.306640625,
"learning_rate": 7.496254494686339e-06,
"loss": 0.6,
"step": 2955
},
{
"epoch": 0.8772241992882562,
"grad_norm": 0.2470703125,
"learning_rate": 7.390126554617982e-06,
"loss": 0.5762,
"step": 2958
},
{
"epoch": 0.8781138790035588,
"grad_norm": 0.265625,
"learning_rate": 7.284726400794073e-06,
"loss": 0.5991,
"step": 2961
},
{
"epoch": 0.8790035587188612,
"grad_norm": 0.271484375,
"learning_rate": 7.180054861514885e-06,
"loss": 0.5988,
"step": 2964
},
{
"epoch": 0.8798932384341637,
"grad_norm": 0.271484375,
"learning_rate": 7.076112759354736e-06,
"loss": 0.5755,
"step": 2967
},
{
"epoch": 0.8807829181494662,
"grad_norm": 0.2734375,
"learning_rate": 6.972900911155655e-06,
"loss": 0.609,
"step": 2970
},
{
"epoch": 0.8816725978647687,
"grad_norm": 0.267578125,
"learning_rate": 6.8704201280207935e-06,
"loss": 0.5757,
"step": 2973
},
{
"epoch": 0.8825622775800712,
"grad_norm": 0.259765625,
"learning_rate": 6.7686712153081645e-06,
"loss": 0.6276,
"step": 2976
},
{
"epoch": 0.8834519572953736,
"grad_norm": 0.271484375,
"learning_rate": 6.667654972624315e-06,
"loss": 0.6059,
"step": 2979
},
{
"epoch": 0.8843416370106761,
"grad_norm": 0.25390625,
"learning_rate": 6.567372193817966e-06,
"loss": 0.5917,
"step": 2982
},
{
"epoch": 0.8852313167259787,
"grad_norm": 0.259765625,
"learning_rate": 6.467823666973871e-06,
"loss": 0.5702,
"step": 2985
},
{
"epoch": 0.8861209964412812,
"grad_norm": 0.267578125,
"learning_rate": 6.369010174406531e-06,
"loss": 0.607,
"step": 2988
},
{
"epoch": 0.8870106761565836,
"grad_norm": 0.255859375,
"learning_rate": 6.270932492654125e-06,
"loss": 0.5965,
"step": 2991
},
{
"epoch": 0.8879003558718861,
"grad_norm": 0.2734375,
"learning_rate": 6.173591392472333e-06,
"loss": 0.587,
"step": 2994
},
{
"epoch": 0.8887900355871886,
"grad_norm": 0.2578125,
"learning_rate": 6.076987638828335e-06,
"loss": 0.6043,
"step": 2997
},
{
"epoch": 0.8896797153024911,
"grad_norm": 0.259765625,
"learning_rate": 5.981121990894789e-06,
"loss": 0.5967,
"step": 3000
},
{
"epoch": 0.8905693950177936,
"grad_norm": 0.25390625,
"learning_rate": 5.885995202043848e-06,
"loss": 0.584,
"step": 3003
},
{
"epoch": 0.891459074733096,
"grad_norm": 0.251953125,
"learning_rate": 5.791608019841244e-06,
"loss": 0.5836,
"step": 3006
},
{
"epoch": 0.8923487544483986,
"grad_norm": 0.267578125,
"learning_rate": 5.697961186040435e-06,
"loss": 0.6029,
"step": 3009
},
{
"epoch": 0.8932384341637011,
"grad_norm": 0.255859375,
"learning_rate": 5.605055436576745e-06,
"loss": 0.588,
"step": 3012
},
{
"epoch": 0.8941281138790036,
"grad_norm": 0.26171875,
"learning_rate": 5.51289150156159e-06,
"loss": 0.596,
"step": 3015
},
{
"epoch": 0.895017793594306,
"grad_norm": 0.2578125,
"learning_rate": 5.421470105276749e-06,
"loss": 0.5913,
"step": 3018
},
{
"epoch": 0.8959074733096085,
"grad_norm": 0.2578125,
"learning_rate": 5.33079196616868e-06,
"loss": 0.5799,
"step": 3021
},
{
"epoch": 0.896797153024911,
"grad_norm": 0.25390625,
"learning_rate": 5.240857796842846e-06,
"loss": 0.6145,
"step": 3024
},
{
"epoch": 0.8976868327402135,
"grad_norm": 0.25390625,
"learning_rate": 5.151668304058132e-06,
"loss": 0.6146,
"step": 3027
},
{
"epoch": 0.8985765124555161,
"grad_norm": 0.26171875,
"learning_rate": 5.0632241887213275e-06,
"loss": 0.6113,
"step": 3030
},
{
"epoch": 0.8994661921708185,
"grad_norm": 0.267578125,
"learning_rate": 4.975526145881515e-06,
"loss": 0.5878,
"step": 3033
},
{
"epoch": 0.900355871886121,
"grad_norm": 0.26171875,
"learning_rate": 4.888574864724715e-06,
"loss": 0.6173,
"step": 3036
},
{
"epoch": 0.9012455516014235,
"grad_norm": 0.2490234375,
"learning_rate": 4.8023710285683975e-06,
"loss": 0.5758,
"step": 3039
},
{
"epoch": 0.902135231316726,
"grad_norm": 0.259765625,
"learning_rate": 4.716915314856196e-06,
"loss": 0.5931,
"step": 3042
},
{
"epoch": 0.9030249110320284,
"grad_norm": 0.251953125,
"learning_rate": 4.6322083951524705e-06,
"loss": 0.5857,
"step": 3045
},
{
"epoch": 0.9039145907473309,
"grad_norm": 0.25390625,
"learning_rate": 4.548250935137144e-06,
"loss": 0.5957,
"step": 3048
},
{
"epoch": 0.9048042704626335,
"grad_norm": 0.26171875,
"learning_rate": 4.465043594600382e-06,
"loss": 0.6066,
"step": 3051
},
{
"epoch": 0.905693950177936,
"grad_norm": 0.25390625,
"learning_rate": 4.382587027437435e-06,
"loss": 0.5993,
"step": 3054
},
{
"epoch": 0.9065836298932385,
"grad_norm": 0.255859375,
"learning_rate": 4.300881881643537e-06,
"loss": 0.5769,
"step": 3057
},
{
"epoch": 0.9074733096085409,
"grad_norm": 0.255859375,
"learning_rate": 4.219928799308759e-06,
"loss": 0.5997,
"step": 3060
},
{
"epoch": 0.9083629893238434,
"grad_norm": 0.251953125,
"learning_rate": 4.139728416613031e-06,
"loss": 0.598,
"step": 3063
},
{
"epoch": 0.9092526690391459,
"grad_norm": 0.255859375,
"learning_rate": 4.0602813638210165e-06,
"loss": 0.5925,
"step": 3066
},
{
"epoch": 0.9101423487544484,
"grad_norm": 0.275390625,
"learning_rate": 3.981588265277337e-06,
"loss": 0.6136,
"step": 3069
},
{
"epoch": 0.9110320284697508,
"grad_norm": 0.27734375,
"learning_rate": 3.903649739401494e-06,
"loss": 0.5873,
"step": 3072
},
{
"epoch": 0.9119217081850534,
"grad_norm": 0.26953125,
"learning_rate": 3.826466398683126e-06,
"loss": 0.6177,
"step": 3075
},
{
"epoch": 0.9128113879003559,
"grad_norm": 0.271484375,
"learning_rate": 3.750038849677162e-06,
"loss": 0.5871,
"step": 3078
},
{
"epoch": 0.9137010676156584,
"grad_norm": 0.255859375,
"learning_rate": 3.6743676929989924e-06,
"loss": 0.5864,
"step": 3081
},
{
"epoch": 0.9145907473309609,
"grad_norm": 0.2578125,
"learning_rate": 3.5994535233198846e-06,
"loss": 0.5827,
"step": 3084
},
{
"epoch": 0.9154804270462633,
"grad_norm": 0.255859375,
"learning_rate": 3.525296929362165e-06,
"loss": 0.6044,
"step": 3087
},
{
"epoch": 0.9163701067615658,
"grad_norm": 0.263671875,
"learning_rate": 3.45189849389469e-06,
"loss": 0.6116,
"step": 3090
},
{
"epoch": 0.9172597864768683,
"grad_norm": 0.2578125,
"learning_rate": 3.3792587937282128e-06,
"loss": 0.5705,
"step": 3093
},
{
"epoch": 0.9181494661921709,
"grad_norm": 0.26171875,
"learning_rate": 3.30737839971087e-06,
"loss": 0.6156,
"step": 3096
},
{
"epoch": 0.9190391459074733,
"grad_norm": 0.2451171875,
"learning_rate": 3.236257876723725e-06,
"loss": 0.5991,
"step": 3099
},
{
"epoch": 0.9199288256227758,
"grad_norm": 0.259765625,
"learning_rate": 3.165897783676275e-06,
"loss": 0.5901,
"step": 3102
},
{
"epoch": 0.9208185053380783,
"grad_norm": 0.27734375,
"learning_rate": 3.0962986735020738e-06,
"loss": 0.6183,
"step": 3105
},
{
"epoch": 0.9217081850533808,
"grad_norm": 0.255859375,
"learning_rate": 3.027461093154449e-06,
"loss": 0.5892,
"step": 3108
},
{
"epoch": 0.9225978647686833,
"grad_norm": 0.2578125,
"learning_rate": 2.959385583602081e-06,
"loss": 0.6269,
"step": 3111
},
{
"epoch": 0.9234875444839857,
"grad_norm": 0.26171875,
"learning_rate": 2.8920726798248643e-06,
"loss": 0.5946,
"step": 3114
},
{
"epoch": 0.9243772241992882,
"grad_norm": 0.259765625,
"learning_rate": 2.8255229108096527e-06,
"loss": 0.6192,
"step": 3117
},
{
"epoch": 0.9252669039145908,
"grad_norm": 0.255859375,
"learning_rate": 2.7597367995461086e-06,
"loss": 0.6153,
"step": 3120
},
{
"epoch": 0.9261565836298933,
"grad_norm": 0.265625,
"learning_rate": 2.694714863022585e-06,
"loss": 0.5831,
"step": 3123
},
{
"epoch": 0.9270462633451957,
"grad_norm": 0.267578125,
"learning_rate": 2.6304576122221035e-06,
"loss": 0.5898,
"step": 3126
},
{
"epoch": 0.9279359430604982,
"grad_norm": 0.2490234375,
"learning_rate": 2.566965552118272e-06,
"loss": 0.6098,
"step": 3129
},
{
"epoch": 0.9288256227758007,
"grad_norm": 0.251953125,
"learning_rate": 2.504239181671353e-06,
"loss": 0.5932,
"step": 3132
},
{
"epoch": 0.9297153024911032,
"grad_norm": 0.259765625,
"learning_rate": 2.4422789938243763e-06,
"loss": 0.5877,
"step": 3135
},
{
"epoch": 0.9306049822064056,
"grad_norm": 0.279296875,
"learning_rate": 2.381085475499201e-06,
"loss": 0.5755,
"step": 3138
},
{
"epoch": 0.9314946619217082,
"grad_norm": 0.263671875,
"learning_rate": 2.3206591075927376e-06,
"loss": 0.5875,
"step": 3141
},
{
"epoch": 0.9323843416370107,
"grad_norm": 0.265625,
"learning_rate": 2.2610003649731092e-06,
"loss": 0.6113,
"step": 3144
},
{
"epoch": 0.9332740213523132,
"grad_norm": 0.2578125,
"learning_rate": 2.2021097164760085e-06,
"loss": 0.6035,
"step": 3147
},
{
"epoch": 0.9341637010676157,
"grad_norm": 0.26171875,
"learning_rate": 2.143987624900945e-06,
"loss": 0.5813,
"step": 3150
},
{
"epoch": 0.9350533807829181,
"grad_norm": 0.259765625,
"learning_rate": 2.0866345470076044e-06,
"loss": 0.589,
"step": 3153
},
{
"epoch": 0.9359430604982206,
"grad_norm": 0.279296875,
"learning_rate": 2.0300509335123283e-06,
"loss": 0.5971,
"step": 3156
},
{
"epoch": 0.9368327402135231,
"grad_norm": 0.255859375,
"learning_rate": 1.974237229084497e-06,
"loss": 0.5808,
"step": 3159
},
{
"epoch": 0.9377224199288257,
"grad_norm": 0.263671875,
"learning_rate": 1.9191938723430615e-06,
"loss": 0.6167,
"step": 3162
},
{
"epoch": 0.9386120996441281,
"grad_norm": 0.2578125,
"learning_rate": 1.8649212958531282e-06,
"loss": 0.6088,
"step": 3165
},
{
"epoch": 0.9395017793594306,
"grad_norm": 0.259765625,
"learning_rate": 1.8114199261224928e-06,
"loss": 0.5884,
"step": 3168
},
{
"epoch": 0.9403914590747331,
"grad_norm": 0.265625,
"learning_rate": 1.7586901835983437e-06,
"loss": 0.6122,
"step": 3171
},
{
"epoch": 0.9412811387900356,
"grad_norm": 0.259765625,
"learning_rate": 1.7067324826639419e-06,
"loss": 0.6036,
"step": 3174
},
{
"epoch": 0.9421708185053381,
"grad_norm": 0.2578125,
"learning_rate": 1.655547231635368e-06,
"loss": 0.598,
"step": 3177
},
{
"epoch": 0.9430604982206405,
"grad_norm": 0.26171875,
"learning_rate": 1.6051348327583037e-06,
"loss": 0.6078,
"step": 3180
},
{
"epoch": 0.943950177935943,
"grad_norm": 0.251953125,
"learning_rate": 1.5554956822048661e-06,
"loss": 0.5955,
"step": 3183
},
{
"epoch": 0.9448398576512456,
"grad_norm": 0.271484375,
"learning_rate": 1.5066301700705331e-06,
"loss": 0.589,
"step": 3186
},
{
"epoch": 0.9457295373665481,
"grad_norm": 0.263671875,
"learning_rate": 1.4585386803710021e-06,
"loss": 0.6035,
"step": 3189
},
{
"epoch": 0.9466192170818505,
"grad_norm": 0.265625,
"learning_rate": 1.411221591039269e-06,
"loss": 0.6396,
"step": 3192
},
{
"epoch": 0.947508896797153,
"grad_norm": 0.263671875,
"learning_rate": 1.3646792739225533e-06,
"loss": 0.577,
"step": 3195
},
{
"epoch": 0.9483985765124555,
"grad_norm": 0.2578125,
"learning_rate": 1.3189120947794897e-06,
"loss": 0.5983,
"step": 3198
},
{
"epoch": 0.949288256227758,
"grad_norm": 0.263671875,
"learning_rate": 1.273920413277152e-06,
"loss": 0.6093,
"step": 3201
},
{
"epoch": 0.9501779359430605,
"grad_norm": 0.263671875,
"learning_rate": 1.2297045829882892e-06,
"loss": 0.5966,
"step": 3204
},
{
"epoch": 0.951067615658363,
"grad_norm": 0.265625,
"learning_rate": 1.186264951388516e-06,
"loss": 0.6228,
"step": 3207
},
{
"epoch": 0.9519572953736655,
"grad_norm": 0.26953125,
"learning_rate": 1.1436018598535937e-06,
"loss": 0.6083,
"step": 3210
},
{
"epoch": 0.952846975088968,
"grad_norm": 0.25390625,
"learning_rate": 1.1017156436567532e-06,
"loss": 0.5806,
"step": 3213
},
{
"epoch": 0.9537366548042705,
"grad_norm": 0.267578125,
"learning_rate": 1.0606066319660435e-06,
"loss": 0.579,
"step": 3216
},
{
"epoch": 0.9546263345195729,
"grad_norm": 0.263671875,
"learning_rate": 1.020275147841765e-06,
"loss": 0.6053,
"step": 3219
},
{
"epoch": 0.9555160142348754,
"grad_norm": 0.26171875,
"learning_rate": 9.807215082339394e-07,
"loss": 0.595,
"step": 3222
},
{
"epoch": 0.9564056939501779,
"grad_norm": 0.25,
"learning_rate": 9.41946023979745e-07,
"loss": 0.5857,
"step": 3225
},
{
"epoch": 0.9572953736654805,
"grad_norm": 0.265625,
"learning_rate": 9.039489998011852e-07,
"loss": 0.6189,
"step": 3228
},
{
"epoch": 0.958185053380783,
"grad_norm": 0.26171875,
"learning_rate": 8.66730734302601e-07,
"loss": 0.5837,
"step": 3231
},
{
"epoch": 0.9590747330960854,
"grad_norm": 0.25,
"learning_rate": 8.302915199683737e-07,
"loss": 0.5827,
"step": 3234
},
{
"epoch": 0.9599644128113879,
"grad_norm": 0.259765625,
"learning_rate": 7.94631643160626e-07,
"loss": 0.6043,
"step": 3237
},
{
"epoch": 0.9608540925266904,
"grad_norm": 0.26171875,
"learning_rate": 7.597513841169468e-07,
"loss": 0.5621,
"step": 3240
},
{
"epoch": 0.9617437722419929,
"grad_norm": 0.26953125,
"learning_rate": 7.256510169482034e-07,
"loss": 0.5886,
"step": 3243
},
{
"epoch": 0.9626334519572953,
"grad_norm": 0.279296875,
"learning_rate": 6.923308096363879e-07,
"loss": 0.6205,
"step": 3246
},
{
"epoch": 0.9635231316725978,
"grad_norm": 0.267578125,
"learning_rate": 6.597910240324967e-07,
"loss": 0.6038,
"step": 3249
},
{
"epoch": 0.9644128113879004,
"grad_norm": 0.26171875,
"learning_rate": 6.280319158544989e-07,
"loss": 0.6301,
"step": 3252
},
{
"epoch": 0.9653024911032029,
"grad_norm": 0.265625,
"learning_rate": 5.970537346853156e-07,
"loss": 0.6007,
"step": 3255
},
{
"epoch": 0.9661921708185054,
"grad_norm": 0.2470703125,
"learning_rate": 5.668567239708323e-07,
"loss": 0.5789,
"step": 3258
},
{
"epoch": 0.9670818505338078,
"grad_norm": 0.265625,
"learning_rate": 5.374411210180341e-07,
"loss": 0.5964,
"step": 3261
},
{
"epoch": 0.9679715302491103,
"grad_norm": 0.25390625,
"learning_rate": 5.088071569931185e-07,
"loss": 0.5953,
"step": 3264
},
{
"epoch": 0.9688612099644128,
"grad_norm": 0.2431640625,
"learning_rate": 4.809550569196519e-07,
"loss": 0.5877,
"step": 3267
},
{
"epoch": 0.9697508896797153,
"grad_norm": 0.263671875,
"learning_rate": 4.5388503967683793e-07,
"loss": 0.5923,
"step": 3270
},
{
"epoch": 0.9706405693950177,
"grad_norm": 0.255859375,
"learning_rate": 4.275973179977855e-07,
"loss": 0.5958,
"step": 3273
},
{
"epoch": 0.9715302491103203,
"grad_norm": 0.271484375,
"learning_rate": 4.0209209846783224e-07,
"loss": 0.5977,
"step": 3276
},
{
"epoch": 0.9724199288256228,
"grad_norm": 0.265625,
"learning_rate": 3.773695815229239e-07,
"loss": 0.592,
"step": 3279
},
{
"epoch": 0.9733096085409253,
"grad_norm": 0.248046875,
"learning_rate": 3.534299614480596e-07,
"loss": 0.5702,
"step": 3282
},
{
"epoch": 0.9741992882562278,
"grad_norm": 0.255859375,
"learning_rate": 3.3027342637572676e-07,
"loss": 0.5893,
"step": 3285
},
{
"epoch": 0.9750889679715302,
"grad_norm": 0.251953125,
"learning_rate": 3.079001582844354e-07,
"loss": 0.6177,
"step": 3288
},
{
"epoch": 0.9759786476868327,
"grad_norm": 0.341796875,
"learning_rate": 2.8631033299730825e-07,
"loss": 0.6178,
"step": 3291
},
{
"epoch": 0.9768683274021353,
"grad_norm": 0.255859375,
"learning_rate": 2.655041201806707e-07,
"loss": 0.5924,
"step": 3294
},
{
"epoch": 0.9777580071174378,
"grad_norm": 0.259765625,
"learning_rate": 2.454816833427631e-07,
"loss": 0.6021,
"step": 3297
},
{
"epoch": 0.9786476868327402,
"grad_norm": 0.2578125,
"learning_rate": 2.2624317983239718e-07,
"loss": 0.6131,
"step": 3300
}
],
"logging_steps": 3,
"max_steps": 3372,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1603485806523056e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}