smollm_1_7B_tulu3 / trainer_state.json
yakazimir's picture
Model save
717d56a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.99979354483518,
"eval_steps": 200,
"global_step": 3632,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00027527355309338655,
"grad_norm": 47.970261255811366,
"learning_rate": 2.7472527472527476e-08,
"loss": 2.9286,
"step": 1
},
{
"epoch": 0.0013763677654669328,
"grad_norm": 197.0659220078,
"learning_rate": 1.3736263736263737e-07,
"loss": 3.3156,
"step": 5
},
{
"epoch": 0.0027527355309338655,
"grad_norm": 155.2306778030855,
"learning_rate": 2.7472527472527475e-07,
"loss": 3.1021,
"step": 10
},
{
"epoch": 0.004129103296400798,
"grad_norm": 40.67122345119242,
"learning_rate": 4.120879120879121e-07,
"loss": 2.9314,
"step": 15
},
{
"epoch": 0.005505471061867731,
"grad_norm": 59.59101654287074,
"learning_rate": 5.494505494505495e-07,
"loss": 3.0831,
"step": 20
},
{
"epoch": 0.006881838827334664,
"grad_norm": 105.45909774967997,
"learning_rate": 6.868131868131869e-07,
"loss": 3.0525,
"step": 25
},
{
"epoch": 0.008258206592801597,
"grad_norm": 34.27227049204782,
"learning_rate": 8.241758241758242e-07,
"loss": 3.1182,
"step": 30
},
{
"epoch": 0.00963457435826853,
"grad_norm": 127.02820304897055,
"learning_rate": 9.615384615384617e-07,
"loss": 2.9352,
"step": 35
},
{
"epoch": 0.011010942123735462,
"grad_norm": 64.04507215334816,
"learning_rate": 1.098901098901099e-06,
"loss": 2.9873,
"step": 40
},
{
"epoch": 0.012387309889202395,
"grad_norm": 23.999491442433914,
"learning_rate": 1.2362637362637365e-06,
"loss": 2.667,
"step": 45
},
{
"epoch": 0.013763677654669328,
"grad_norm": 34.50645980021284,
"learning_rate": 1.3736263736263738e-06,
"loss": 2.6091,
"step": 50
},
{
"epoch": 0.01514004542013626,
"grad_norm": 46.64865828445111,
"learning_rate": 1.510989010989011e-06,
"loss": 2.4922,
"step": 55
},
{
"epoch": 0.016516413185603193,
"grad_norm": 21.789171957916576,
"learning_rate": 1.6483516483516484e-06,
"loss": 2.4031,
"step": 60
},
{
"epoch": 0.017892780951070126,
"grad_norm": 38.24383608079909,
"learning_rate": 1.7857142857142859e-06,
"loss": 2.3918,
"step": 65
},
{
"epoch": 0.01926914871653706,
"grad_norm": 15.503348119198089,
"learning_rate": 1.9230769230769234e-06,
"loss": 2.2549,
"step": 70
},
{
"epoch": 0.02064551648200399,
"grad_norm": 12.557848365372337,
"learning_rate": 2.0604395604395607e-06,
"loss": 2.254,
"step": 75
},
{
"epoch": 0.022021884247470924,
"grad_norm": 13.411610689976131,
"learning_rate": 2.197802197802198e-06,
"loss": 2.2895,
"step": 80
},
{
"epoch": 0.023398252012937857,
"grad_norm": 11.512956200014314,
"learning_rate": 2.3351648351648353e-06,
"loss": 1.9875,
"step": 85
},
{
"epoch": 0.02477461977840479,
"grad_norm": 10.834444585244098,
"learning_rate": 2.472527472527473e-06,
"loss": 1.9547,
"step": 90
},
{
"epoch": 0.026150987543871723,
"grad_norm": 10.311358732225472,
"learning_rate": 2.6098901098901103e-06,
"loss": 1.9957,
"step": 95
},
{
"epoch": 0.027527355309338655,
"grad_norm": 8.779165691485517,
"learning_rate": 2.7472527472527476e-06,
"loss": 1.8935,
"step": 100
},
{
"epoch": 0.028903723074805588,
"grad_norm": 7.066737844316085,
"learning_rate": 2.8846153846153845e-06,
"loss": 1.7438,
"step": 105
},
{
"epoch": 0.03028009084027252,
"grad_norm": 7.678771227526979,
"learning_rate": 3.021978021978022e-06,
"loss": 1.6582,
"step": 110
},
{
"epoch": 0.03165645860573946,
"grad_norm": 6.076422896577125,
"learning_rate": 3.1593406593406595e-06,
"loss": 1.6652,
"step": 115
},
{
"epoch": 0.033032826371206386,
"grad_norm": 6.807996481097765,
"learning_rate": 3.2967032967032968e-06,
"loss": 1.6854,
"step": 120
},
{
"epoch": 0.03440919413667332,
"grad_norm": 5.621259165131704,
"learning_rate": 3.4340659340659345e-06,
"loss": 1.6461,
"step": 125
},
{
"epoch": 0.03578556190214025,
"grad_norm": 4.18571348105954,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.5098,
"step": 130
},
{
"epoch": 0.03716192966760719,
"grad_norm": 4.80488403899634,
"learning_rate": 3.708791208791209e-06,
"loss": 1.5253,
"step": 135
},
{
"epoch": 0.03853829743307412,
"grad_norm": 5.012572093969651,
"learning_rate": 3.846153846153847e-06,
"loss": 1.5984,
"step": 140
},
{
"epoch": 0.039914665198541054,
"grad_norm": 4.552751574433519,
"learning_rate": 3.983516483516483e-06,
"loss": 1.5606,
"step": 145
},
{
"epoch": 0.04129103296400798,
"grad_norm": 4.3599791639751535,
"learning_rate": 4.120879120879121e-06,
"loss": 1.4992,
"step": 150
},
{
"epoch": 0.04266740072947492,
"grad_norm": 3.488042700729171,
"learning_rate": 4.258241758241759e-06,
"loss": 1.4261,
"step": 155
},
{
"epoch": 0.04404376849494185,
"grad_norm": 3.8333617793891577,
"learning_rate": 4.395604395604396e-06,
"loss": 1.4617,
"step": 160
},
{
"epoch": 0.045420136260408785,
"grad_norm": 3.0548962932241848,
"learning_rate": 4.532967032967033e-06,
"loss": 1.4781,
"step": 165
},
{
"epoch": 0.046796504025875714,
"grad_norm": 5.5346794342223,
"learning_rate": 4.6703296703296706e-06,
"loss": 1.4395,
"step": 170
},
{
"epoch": 0.04817287179134265,
"grad_norm": 3.685964787103759,
"learning_rate": 4.807692307692308e-06,
"loss": 1.3802,
"step": 175
},
{
"epoch": 0.04954923955680958,
"grad_norm": 4.058757049675143,
"learning_rate": 4.945054945054946e-06,
"loss": 1.4061,
"step": 180
},
{
"epoch": 0.050925607322276516,
"grad_norm": 2.9156243613759965,
"learning_rate": 4.999990671457219e-06,
"loss": 1.3884,
"step": 185
},
{
"epoch": 0.052301975087743445,
"grad_norm": 2.8044219675737225,
"learning_rate": 4.999933663947887e-06,
"loss": 1.3079,
"step": 190
},
{
"epoch": 0.05367834285321038,
"grad_norm": 2.5580737825571354,
"learning_rate": 4.999824832633327e-06,
"loss": 1.2732,
"step": 195
},
{
"epoch": 0.05505471061867731,
"grad_norm": 2.5474355608388106,
"learning_rate": 4.999664179769621e-06,
"loss": 1.3348,
"step": 200
},
{
"epoch": 0.05505471061867731,
"eval_loss": 1.2703925371170044,
"eval_runtime": 37.5835,
"eval_samples_per_second": 133.037,
"eval_steps_per_second": 2.102,
"step": 200
},
{
"epoch": 0.05643107838414425,
"grad_norm": 2.5400210842014985,
"learning_rate": 4.999451708687114e-06,
"loss": 1.2628,
"step": 205
},
{
"epoch": 0.057807446149611176,
"grad_norm": 2.9134202135126803,
"learning_rate": 4.999187423790347e-06,
"loss": 1.3461,
"step": 210
},
{
"epoch": 0.05918381391507811,
"grad_norm": 2.979982911881612,
"learning_rate": 4.9988713305579665e-06,
"loss": 1.279,
"step": 215
},
{
"epoch": 0.06056018168054504,
"grad_norm": 2.3341235353360754,
"learning_rate": 4.998503435542605e-06,
"loss": 1.2791,
"step": 220
},
{
"epoch": 0.06193654944601198,
"grad_norm": 2.553595077194149,
"learning_rate": 4.9980837463707545e-06,
"loss": 1.2591,
"step": 225
},
{
"epoch": 0.06331291721147891,
"grad_norm": 2.1559374955549675,
"learning_rate": 4.997612271742601e-06,
"loss": 1.2184,
"step": 230
},
{
"epoch": 0.06468928497694584,
"grad_norm": 2.143686811503016,
"learning_rate": 4.9970890214318494e-06,
"loss": 1.2676,
"step": 235
},
{
"epoch": 0.06606565274241277,
"grad_norm": 1.75568515745194,
"learning_rate": 4.996514006285514e-06,
"loss": 1.2013,
"step": 240
},
{
"epoch": 0.0674420205078797,
"grad_norm": 2.4736934108040027,
"learning_rate": 4.995887238223703e-06,
"loss": 1.2801,
"step": 245
},
{
"epoch": 0.06881838827334665,
"grad_norm": 1.9847695711171895,
"learning_rate": 4.99520873023936e-06,
"loss": 1.2026,
"step": 250
},
{
"epoch": 0.07019475603881357,
"grad_norm": 2.3258055796641592,
"learning_rate": 4.994478496398007e-06,
"loss": 1.2115,
"step": 255
},
{
"epoch": 0.0715711238042805,
"grad_norm": 1.8207346260417532,
"learning_rate": 4.993696551837444e-06,
"loss": 1.2521,
"step": 260
},
{
"epoch": 0.07294749156974743,
"grad_norm": 1.901002680506845,
"learning_rate": 4.9928629127674375e-06,
"loss": 1.1437,
"step": 265
},
{
"epoch": 0.07432385933521438,
"grad_norm": 2.1411204432050925,
"learning_rate": 4.991977596469385e-06,
"loss": 1.1638,
"step": 270
},
{
"epoch": 0.0757002271006813,
"grad_norm": 1.6695878470807146,
"learning_rate": 4.991040621295959e-06,
"loss": 1.1406,
"step": 275
},
{
"epoch": 0.07707659486614823,
"grad_norm": 1.9670297948191877,
"learning_rate": 4.990052006670722e-06,
"loss": 1.1152,
"step": 280
},
{
"epoch": 0.07845296263161516,
"grad_norm": 1.8164273632468562,
"learning_rate": 4.989011773087725e-06,
"loss": 1.154,
"step": 285
},
{
"epoch": 0.07982933039708211,
"grad_norm": 1.4090788234057618,
"learning_rate": 4.9879199421110865e-06,
"loss": 1.0789,
"step": 290
},
{
"epoch": 0.08120569816254904,
"grad_norm": 1.5318431537024289,
"learning_rate": 4.9867765363745426e-06,
"loss": 1.156,
"step": 295
},
{
"epoch": 0.08258206592801597,
"grad_norm": 1.5662411802833258,
"learning_rate": 4.9855815795809735e-06,
"loss": 1.2253,
"step": 300
},
{
"epoch": 0.0839584336934829,
"grad_norm": 1.487588263963499,
"learning_rate": 4.984335096501922e-06,
"loss": 1.1697,
"step": 305
},
{
"epoch": 0.08533480145894984,
"grad_norm": 1.5795690517983507,
"learning_rate": 4.983037112977072e-06,
"loss": 1.1747,
"step": 310
},
{
"epoch": 0.08671116922441677,
"grad_norm": 1.5801325920576266,
"learning_rate": 4.981687655913716e-06,
"loss": 1.1812,
"step": 315
},
{
"epoch": 0.0880875369898837,
"grad_norm": 1.5881269944242014,
"learning_rate": 4.980286753286196e-06,
"loss": 1.1158,
"step": 320
},
{
"epoch": 0.08946390475535063,
"grad_norm": 1.3649187456591194,
"learning_rate": 4.978834434135323e-06,
"loss": 1.0911,
"step": 325
},
{
"epoch": 0.09084027252081757,
"grad_norm": 1.3003473744310392,
"learning_rate": 4.977330728567778e-06,
"loss": 1.0947,
"step": 330
},
{
"epoch": 0.0922166402862845,
"grad_norm": 2.0679161756500832,
"learning_rate": 4.975775667755489e-06,
"loss": 1.1364,
"step": 335
},
{
"epoch": 0.09359300805175143,
"grad_norm": 1.5739073785267514,
"learning_rate": 4.974169283934976e-06,
"loss": 1.172,
"step": 340
},
{
"epoch": 0.09496937581721836,
"grad_norm": 1.7213917188161827,
"learning_rate": 4.972511610406693e-06,
"loss": 1.1608,
"step": 345
},
{
"epoch": 0.0963457435826853,
"grad_norm": 1.4537784803209515,
"learning_rate": 4.970802681534331e-06,
"loss": 1.1647,
"step": 350
},
{
"epoch": 0.09772211134815223,
"grad_norm": 1.3731655227058528,
"learning_rate": 4.969042532744109e-06,
"loss": 1.0853,
"step": 355
},
{
"epoch": 0.09909847911361916,
"grad_norm": 1.2363319744461196,
"learning_rate": 4.967231200524037e-06,
"loss": 1.0066,
"step": 360
},
{
"epoch": 0.10047484687908609,
"grad_norm": 1.5759080996624,
"learning_rate": 4.965368722423166e-06,
"loss": 1.1516,
"step": 365
},
{
"epoch": 0.10185121464455303,
"grad_norm": 1.3507882575141734,
"learning_rate": 4.9634551370507985e-06,
"loss": 1.1073,
"step": 370
},
{
"epoch": 0.10322758241001996,
"grad_norm": 1.277730572175811,
"learning_rate": 4.961490484075698e-06,
"loss": 1.1298,
"step": 375
},
{
"epoch": 0.10460395017548689,
"grad_norm": 1.4535170267483315,
"learning_rate": 4.9594748042252635e-06,
"loss": 1.1084,
"step": 380
},
{
"epoch": 0.10598031794095382,
"grad_norm": 1.6186734387061241,
"learning_rate": 4.957408139284682e-06,
"loss": 1.1102,
"step": 385
},
{
"epoch": 0.10735668570642076,
"grad_norm": 1.132537102297945,
"learning_rate": 4.9552905320960685e-06,
"loss": 1.065,
"step": 390
},
{
"epoch": 0.10873305347188769,
"grad_norm": 1.3026196221634236,
"learning_rate": 4.9531220265575714e-06,
"loss": 1.1021,
"step": 395
},
{
"epoch": 0.11010942123735462,
"grad_norm": 1.3077867242379508,
"learning_rate": 4.950902667622468e-06,
"loss": 1.0411,
"step": 400
},
{
"epoch": 0.11010942123735462,
"eval_loss": 1.0434951782226562,
"eval_runtime": 37.5694,
"eval_samples_per_second": 133.087,
"eval_steps_per_second": 2.103,
"step": 400
},
{
"epoch": 0.11148578900282155,
"grad_norm": 1.6148360944145506,
"learning_rate": 4.948632501298228e-06,
"loss": 1.0545,
"step": 405
},
{
"epoch": 0.1128621567682885,
"grad_norm": 1.316099506488652,
"learning_rate": 4.9463115746455656e-06,
"loss": 1.0593,
"step": 410
},
{
"epoch": 0.11423852453375542,
"grad_norm": 1.4454673179124138,
"learning_rate": 4.943939935777455e-06,
"loss": 1.0217,
"step": 415
},
{
"epoch": 0.11561489229922235,
"grad_norm": 1.2512294045462884,
"learning_rate": 4.941517633858142e-06,
"loss": 1.1085,
"step": 420
},
{
"epoch": 0.11699126006468928,
"grad_norm": 1.0101964296414214,
"learning_rate": 4.93904471910212e-06,
"loss": 1.0003,
"step": 425
},
{
"epoch": 0.11836762783015622,
"grad_norm": 1.1944857398005146,
"learning_rate": 4.936521242773091e-06,
"loss": 1.1296,
"step": 430
},
{
"epoch": 0.11974399559562315,
"grad_norm": 1.2974835007262053,
"learning_rate": 4.933947257182901e-06,
"loss": 1.1402,
"step": 435
},
{
"epoch": 0.12112036336109008,
"grad_norm": 1.6118257673114658,
"learning_rate": 4.931322815690457e-06,
"loss": 1.0763,
"step": 440
},
{
"epoch": 0.12249673112655701,
"grad_norm": 1.224531658843626,
"learning_rate": 4.92864797270062e-06,
"loss": 1.1138,
"step": 445
},
{
"epoch": 0.12387309889202396,
"grad_norm": 1.1659984082065094,
"learning_rate": 4.925922783663079e-06,
"loss": 1.0189,
"step": 450
},
{
"epoch": 0.12524946665749087,
"grad_norm": 8.709307393389583,
"learning_rate": 4.923147305071199e-06,
"loss": 1.0822,
"step": 455
},
{
"epoch": 0.12662583442295783,
"grad_norm": 1.1509719961863358,
"learning_rate": 4.9203215944608515e-06,
"loss": 1.0373,
"step": 460
},
{
"epoch": 0.12800220218842476,
"grad_norm": 1.0234613424480405,
"learning_rate": 4.917445710409221e-06,
"loss": 1.037,
"step": 465
},
{
"epoch": 0.1293785699538917,
"grad_norm": 1.1018659479000739,
"learning_rate": 4.914519712533592e-06,
"loss": 1.088,
"step": 470
},
{
"epoch": 0.13075493771935862,
"grad_norm": 0.9721625793315171,
"learning_rate": 4.911543661490111e-06,
"loss": 1.05,
"step": 475
},
{
"epoch": 0.13213130548482555,
"grad_norm": 0.9710335336779444,
"learning_rate": 4.908517618972532e-06,
"loss": 1.0123,
"step": 480
},
{
"epoch": 0.13350767325029247,
"grad_norm": 1.5166549245825005,
"learning_rate": 4.905441647710932e-06,
"loss": 1.05,
"step": 485
},
{
"epoch": 0.1348840410157594,
"grad_norm": 0.8785700675801744,
"learning_rate": 4.90231581147042e-06,
"loss": 1.032,
"step": 490
},
{
"epoch": 0.13626040878122633,
"grad_norm": 1.5060433665040545,
"learning_rate": 4.899140175049806e-06,
"loss": 1.0196,
"step": 495
},
{
"epoch": 0.1376367765466933,
"grad_norm": 1.0487460758379659,
"learning_rate": 4.895914804280262e-06,
"loss": 1.089,
"step": 500
},
{
"epoch": 0.13901314431216022,
"grad_norm": 1.3023717508397745,
"learning_rate": 4.892639766023957e-06,
"loss": 1.022,
"step": 505
},
{
"epoch": 0.14038951207762715,
"grad_norm": 1.0972179140074843,
"learning_rate": 4.889315128172669e-06,
"loss": 1.0049,
"step": 510
},
{
"epoch": 0.14176587984309408,
"grad_norm": 0.8796434076141977,
"learning_rate": 4.885940959646383e-06,
"loss": 0.9685,
"step": 515
},
{
"epoch": 0.143142247608561,
"grad_norm": 0.9834949701746455,
"learning_rate": 4.882517330391854e-06,
"loss": 1.0246,
"step": 520
},
{
"epoch": 0.14451861537402794,
"grad_norm": 1.138039557393203,
"learning_rate": 4.879044311381164e-06,
"loss": 1.1077,
"step": 525
},
{
"epoch": 0.14589498313949487,
"grad_norm": 1.085162105922894,
"learning_rate": 4.875521974610247e-06,
"loss": 1.0675,
"step": 530
},
{
"epoch": 0.1472713509049618,
"grad_norm": 0.871775704668988,
"learning_rate": 4.8719503930973995e-06,
"loss": 1.019,
"step": 535
},
{
"epoch": 0.14864771867042875,
"grad_norm": 0.9961249615759942,
"learning_rate": 4.868329640881764e-06,
"loss": 1.0749,
"step": 540
},
{
"epoch": 0.15002408643589568,
"grad_norm": 1.1040648660006491,
"learning_rate": 4.864659793021795e-06,
"loss": 1.0435,
"step": 545
},
{
"epoch": 0.1514004542013626,
"grad_norm": 0.9631823018123004,
"learning_rate": 4.860940925593703e-06,
"loss": 0.9509,
"step": 550
},
{
"epoch": 0.15277682196682954,
"grad_norm": 1.0645503004444963,
"learning_rate": 4.8571731156898785e-06,
"loss": 1.0142,
"step": 555
},
{
"epoch": 0.15415318973229647,
"grad_norm": 1.0599242806058304,
"learning_rate": 4.8533564414172915e-06,
"loss": 0.9796,
"step": 560
},
{
"epoch": 0.1555295574977634,
"grad_norm": 1.0803032824855876,
"learning_rate": 4.849490981895877e-06,
"loss": 0.9508,
"step": 565
},
{
"epoch": 0.15690592526323033,
"grad_norm": 0.8429127532715366,
"learning_rate": 4.845576817256888e-06,
"loss": 0.9975,
"step": 570
},
{
"epoch": 0.15828229302869726,
"grad_norm": 0.8648413729480695,
"learning_rate": 4.841614028641241e-06,
"loss": 1.0446,
"step": 575
},
{
"epoch": 0.15965866079416421,
"grad_norm": 0.766164996990341,
"learning_rate": 4.83760269819783e-06,
"loss": 1.0017,
"step": 580
},
{
"epoch": 0.16103502855963114,
"grad_norm": 1.1710332971275654,
"learning_rate": 4.833542909081824e-06,
"loss": 0.977,
"step": 585
},
{
"epoch": 0.16241139632509807,
"grad_norm": 0.8805076425003365,
"learning_rate": 4.829434745452944e-06,
"loss": 1.0282,
"step": 590
},
{
"epoch": 0.163787764090565,
"grad_norm": 0.9917312981640594,
"learning_rate": 4.82527829247372e-06,
"loss": 1.0007,
"step": 595
},
{
"epoch": 0.16516413185603193,
"grad_norm": 0.9418229318080311,
"learning_rate": 4.821073636307719e-06,
"loss": 1.0483,
"step": 600
},
{
"epoch": 0.16516413185603193,
"eval_loss": 0.9694015383720398,
"eval_runtime": 37.5636,
"eval_samples_per_second": 133.108,
"eval_steps_per_second": 2.103,
"step": 600
},
{
"epoch": 0.16654049962149886,
"grad_norm": 1.151675240339901,
"learning_rate": 4.81682086411777e-06,
"loss": 1.0735,
"step": 605
},
{
"epoch": 0.1679168673869658,
"grad_norm": 0.8785348232118754,
"learning_rate": 4.812520064064146e-06,
"loss": 0.9803,
"step": 610
},
{
"epoch": 0.16929323515243272,
"grad_norm": 0.7903326312503011,
"learning_rate": 4.8081713253027415e-06,
"loss": 1.0074,
"step": 615
},
{
"epoch": 0.17066960291789968,
"grad_norm": 1.1239070075461262,
"learning_rate": 4.803774737983226e-06,
"loss": 0.9774,
"step": 620
},
{
"epoch": 0.1720459706833666,
"grad_norm": 0.8575748601821118,
"learning_rate": 4.799330393247173e-06,
"loss": 0.9554,
"step": 625
},
{
"epoch": 0.17342233844883354,
"grad_norm": 0.7681124502085931,
"learning_rate": 4.7948383832261665e-06,
"loss": 0.9925,
"step": 630
},
{
"epoch": 0.17479870621430046,
"grad_norm": 0.9533830559317458,
"learning_rate": 4.790298801039901e-06,
"loss": 0.9942,
"step": 635
},
{
"epoch": 0.1761750739797674,
"grad_norm": 0.9451172948449857,
"learning_rate": 4.785711740794241e-06,
"loss": 1.0296,
"step": 640
},
{
"epoch": 0.17755144174523432,
"grad_norm": 0.9522113767917231,
"learning_rate": 4.781077297579278e-06,
"loss": 0.9792,
"step": 645
},
{
"epoch": 0.17892780951070125,
"grad_norm": 0.671435197412033,
"learning_rate": 4.776395567467353e-06,
"loss": 0.967,
"step": 650
},
{
"epoch": 0.18030417727616818,
"grad_norm": 0.775668527281842,
"learning_rate": 4.7716666475110686e-06,
"loss": 1.0187,
"step": 655
},
{
"epoch": 0.18168054504163514,
"grad_norm": 0.8387050001267089,
"learning_rate": 4.766890635741278e-06,
"loss": 1.0319,
"step": 660
},
{
"epoch": 0.18305691280710207,
"grad_norm": 0.824748710334662,
"learning_rate": 4.762067631165049e-06,
"loss": 0.9293,
"step": 665
},
{
"epoch": 0.184433280572569,
"grad_norm": 0.9799352793752129,
"learning_rate": 4.757197733763615e-06,
"loss": 0.9157,
"step": 670
},
{
"epoch": 0.18580964833803593,
"grad_norm": 0.8362870057769561,
"learning_rate": 4.7522810444903004e-06,
"loss": 0.949,
"step": 675
},
{
"epoch": 0.18718601610350286,
"grad_norm": 0.9321760596367203,
"learning_rate": 4.7473176652684276e-06,
"loss": 0.9901,
"step": 680
},
{
"epoch": 0.18856238386896979,
"grad_norm": 0.7717121636862594,
"learning_rate": 4.742307698989207e-06,
"loss": 1.0114,
"step": 685
},
{
"epoch": 0.18993875163443671,
"grad_norm": 1.0300652464671185,
"learning_rate": 4.7372512495096005e-06,
"loss": 1.0247,
"step": 690
},
{
"epoch": 0.19131511939990364,
"grad_norm": 0.7755223484828447,
"learning_rate": 4.732148421650171e-06,
"loss": 0.9337,
"step": 695
},
{
"epoch": 0.1926914871653706,
"grad_norm": 1.0381026277362817,
"learning_rate": 4.7269993211929086e-06,
"loss": 0.9709,
"step": 700
},
{
"epoch": 0.19406785493083753,
"grad_norm": 0.7702377174214647,
"learning_rate": 4.721804054879036e-06,
"loss": 0.9726,
"step": 705
},
{
"epoch": 0.19544422269630446,
"grad_norm": 0.9053621913087226,
"learning_rate": 4.7165627304068e-06,
"loss": 0.953,
"step": 710
},
{
"epoch": 0.1968205904617714,
"grad_norm": 1.003117563447582,
"learning_rate": 4.711275456429235e-06,
"loss": 0.9849,
"step": 715
},
{
"epoch": 0.19819695822723832,
"grad_norm": 0.9099542081572461,
"learning_rate": 4.70594234255191e-06,
"loss": 0.9901,
"step": 720
},
{
"epoch": 0.19957332599270525,
"grad_norm": 1.0432447959538196,
"learning_rate": 4.700563499330664e-06,
"loss": 0.9535,
"step": 725
},
{
"epoch": 0.20094969375817218,
"grad_norm": 0.8237571322857683,
"learning_rate": 4.695139038269303e-06,
"loss": 0.9535,
"step": 730
},
{
"epoch": 0.2023260615236391,
"grad_norm": 0.9877142384272304,
"learning_rate": 4.689669071817296e-06,
"loss": 0.9509,
"step": 735
},
{
"epoch": 0.20370242928910606,
"grad_norm": 0.681443730837864,
"learning_rate": 4.684153713367442e-06,
"loss": 0.917,
"step": 740
},
{
"epoch": 0.205078797054573,
"grad_norm": 1.0414452790533368,
"learning_rate": 4.678593077253521e-06,
"loss": 0.9662,
"step": 745
},
{
"epoch": 0.20645516482003992,
"grad_norm": 0.7632284159752581,
"learning_rate": 4.672987278747919e-06,
"loss": 0.9588,
"step": 750
},
{
"epoch": 0.20783153258550685,
"grad_norm": 0.7404164880344434,
"learning_rate": 4.667336434059246e-06,
"loss": 0.9426,
"step": 755
},
{
"epoch": 0.20920790035097378,
"grad_norm": 0.7658934106898954,
"learning_rate": 4.661640660329918e-06,
"loss": 0.9787,
"step": 760
},
{
"epoch": 0.2105842681164407,
"grad_norm": 0.7794575989249981,
"learning_rate": 4.655900075633736e-06,
"loss": 0.9341,
"step": 765
},
{
"epoch": 0.21196063588190764,
"grad_norm": 0.681238788669416,
"learning_rate": 4.650114798973434e-06,
"loss": 0.9734,
"step": 770
},
{
"epoch": 0.21333700364737457,
"grad_norm": 1.0301580037782345,
"learning_rate": 4.644284950278217e-06,
"loss": 0.9438,
"step": 775
},
{
"epoch": 0.21471337141284152,
"grad_norm": 1.3078635807263586,
"learning_rate": 4.638410650401267e-06,
"loss": 0.9335,
"step": 780
},
{
"epoch": 0.21608973917830845,
"grad_norm": 0.730305470715918,
"learning_rate": 4.632492021117245e-06,
"loss": 0.9164,
"step": 785
},
{
"epoch": 0.21746610694377538,
"grad_norm": 1.1199081512447784,
"learning_rate": 4.626529185119763e-06,
"loss": 0.9451,
"step": 790
},
{
"epoch": 0.2188424747092423,
"grad_norm": 1.0679579821267784,
"learning_rate": 4.620522266018841e-06,
"loss": 0.9914,
"step": 795
},
{
"epoch": 0.22021884247470924,
"grad_norm": 0.6094802947859647,
"learning_rate": 4.614471388338346e-06,
"loss": 0.8801,
"step": 800
},
{
"epoch": 0.22021884247470924,
"eval_loss": 0.9227399230003357,
"eval_runtime": 37.5694,
"eval_samples_per_second": 133.087,
"eval_steps_per_second": 2.103,
"step": 800
},
{
"epoch": 0.22159521024017617,
"grad_norm": 0.8418029822067341,
"learning_rate": 4.60837667751341e-06,
"loss": 0.8924,
"step": 805
},
{
"epoch": 0.2229715780056431,
"grad_norm": 0.822305082880161,
"learning_rate": 4.602238259887825e-06,
"loss": 0.9395,
"step": 810
},
{
"epoch": 0.22434794577111003,
"grad_norm": 0.6380622331066487,
"learning_rate": 4.596056262711434e-06,
"loss": 0.9366,
"step": 815
},
{
"epoch": 0.225724313536577,
"grad_norm": 0.816860005626295,
"learning_rate": 4.5898308141374835e-06,
"loss": 0.9472,
"step": 820
},
{
"epoch": 0.22710068130204392,
"grad_norm": 0.6422659239711008,
"learning_rate": 4.583562043219972e-06,
"loss": 0.9558,
"step": 825
},
{
"epoch": 0.22847704906751085,
"grad_norm": 0.7575538911525173,
"learning_rate": 4.577250079910973e-06,
"loss": 0.933,
"step": 830
},
{
"epoch": 0.22985341683297777,
"grad_norm": 0.72595045850496,
"learning_rate": 4.57089505505794e-06,
"loss": 0.9754,
"step": 835
},
{
"epoch": 0.2312297845984447,
"grad_norm": 0.9692773154744749,
"learning_rate": 4.564497100400998e-06,
"loss": 0.9833,
"step": 840
},
{
"epoch": 0.23260615236391163,
"grad_norm": 0.8045647953099883,
"learning_rate": 4.558056348570209e-06,
"loss": 0.918,
"step": 845
},
{
"epoch": 0.23398252012937856,
"grad_norm": 0.8493707730777622,
"learning_rate": 4.551572933082823e-06,
"loss": 0.9389,
"step": 850
},
{
"epoch": 0.2353588878948455,
"grad_norm": 0.7008145118663581,
"learning_rate": 4.545046988340509e-06,
"loss": 0.8909,
"step": 855
},
{
"epoch": 0.23673525566031245,
"grad_norm": 0.9052644910175894,
"learning_rate": 4.538478649626575e-06,
"loss": 0.9574,
"step": 860
},
{
"epoch": 0.23811162342577938,
"grad_norm": 0.785016020446548,
"learning_rate": 4.531868053103153e-06,
"loss": 1.0396,
"step": 865
},
{
"epoch": 0.2394879911912463,
"grad_norm": 0.8758044752350663,
"learning_rate": 4.52521533580839e-06,
"loss": 0.8471,
"step": 870
},
{
"epoch": 0.24086435895671324,
"grad_norm": 0.7552178534994997,
"learning_rate": 4.518520635653594e-06,
"loss": 0.973,
"step": 875
},
{
"epoch": 0.24224072672218017,
"grad_norm": 0.6477878269303148,
"learning_rate": 4.5117840914203805e-06,
"loss": 0.93,
"step": 880
},
{
"epoch": 0.2436170944876471,
"grad_norm": 0.7740483767304198,
"learning_rate": 4.5050058427578e-06,
"loss": 0.8919,
"step": 885
},
{
"epoch": 0.24499346225311402,
"grad_norm": 0.5175079894302275,
"learning_rate": 4.498186030179434e-06,
"loss": 0.9334,
"step": 890
},
{
"epoch": 0.24636983001858095,
"grad_norm": 0.7470864740423165,
"learning_rate": 4.491324795060491e-06,
"loss": 0.9059,
"step": 895
},
{
"epoch": 0.2477461977840479,
"grad_norm": 0.5912054028857261,
"learning_rate": 4.4844222796348705e-06,
"loss": 0.9406,
"step": 900
},
{
"epoch": 0.24912256554951484,
"grad_norm": 0.7632257634965951,
"learning_rate": 4.477478626992214e-06,
"loss": 0.9365,
"step": 905
},
{
"epoch": 0.25049893331498174,
"grad_norm": 0.938994254636935,
"learning_rate": 4.47049398107494e-06,
"loss": 0.8971,
"step": 910
},
{
"epoch": 0.2518753010804487,
"grad_norm": 0.5444477069055039,
"learning_rate": 4.4634684866752665e-06,
"loss": 0.9098,
"step": 915
},
{
"epoch": 0.25325166884591566,
"grad_norm": 0.9370323101958166,
"learning_rate": 4.456402289432196e-06,
"loss": 0.988,
"step": 920
},
{
"epoch": 0.25462803661138256,
"grad_norm": 0.656323325746701,
"learning_rate": 4.44929553582851e-06,
"loss": 0.9647,
"step": 925
},
{
"epoch": 0.2560044043768495,
"grad_norm": 0.8566812639937329,
"learning_rate": 4.442148373187722e-06,
"loss": 0.9587,
"step": 930
},
{
"epoch": 0.2573807721423164,
"grad_norm": 0.623784946912734,
"learning_rate": 4.434960949671028e-06,
"loss": 0.8996,
"step": 935
},
{
"epoch": 0.2587571399077834,
"grad_norm": 0.556551458145588,
"learning_rate": 4.427733414274238e-06,
"loss": 0.8582,
"step": 940
},
{
"epoch": 0.2601335076732503,
"grad_norm": 0.6165188495979379,
"learning_rate": 4.420465916824681e-06,
"loss": 0.9263,
"step": 945
},
{
"epoch": 0.26150987543871723,
"grad_norm": 0.5388586078056022,
"learning_rate": 4.413158607978104e-06,
"loss": 0.8803,
"step": 950
},
{
"epoch": 0.26288624320418413,
"grad_norm": 0.6313491960276545,
"learning_rate": 4.405811639215547e-06,
"loss": 0.9321,
"step": 955
},
{
"epoch": 0.2642626109696511,
"grad_norm": 0.7175326284846463,
"learning_rate": 4.398425162840202e-06,
"loss": 0.921,
"step": 960
},
{
"epoch": 0.26563897873511805,
"grad_norm": 0.7423069055487866,
"learning_rate": 4.390999331974257e-06,
"loss": 0.9461,
"step": 965
},
{
"epoch": 0.26701534650058495,
"grad_norm": 0.8331321575994248,
"learning_rate": 4.383534300555722e-06,
"loss": 0.962,
"step": 970
},
{
"epoch": 0.2683917142660519,
"grad_norm": 0.7921869343480575,
"learning_rate": 4.376030223335237e-06,
"loss": 0.8739,
"step": 975
},
{
"epoch": 0.2697680820315188,
"grad_norm": 0.8214762995889284,
"learning_rate": 4.368487255872864e-06,
"loss": 0.9187,
"step": 980
},
{
"epoch": 0.27114444979698576,
"grad_norm": 0.5484449386313469,
"learning_rate": 4.360905554534864e-06,
"loss": 0.8698,
"step": 985
},
{
"epoch": 0.27252081756245267,
"grad_norm": 0.7491545097860447,
"learning_rate": 4.35328527649045e-06,
"loss": 0.865,
"step": 990
},
{
"epoch": 0.2738971853279196,
"grad_norm": 0.7804679526519547,
"learning_rate": 4.3456265797085375e-06,
"loss": 0.9351,
"step": 995
},
{
"epoch": 0.2752735530933866,
"grad_norm": 0.5938120679599327,
"learning_rate": 4.3379296229544635e-06,
"loss": 0.8996,
"step": 1000
},
{
"epoch": 0.2752735530933866,
"eval_loss": 0.8887820243835449,
"eval_runtime": 37.567,
"eval_samples_per_second": 133.095,
"eval_steps_per_second": 2.103,
"step": 1000
},
{
"epoch": 0.2766499208588535,
"grad_norm": 0.5199466760472583,
"learning_rate": 4.330194565786696e-06,
"loss": 0.9159,
"step": 1005
},
{
"epoch": 0.27802628862432044,
"grad_norm": 0.5983605277568362,
"learning_rate": 4.322421568553529e-06,
"loss": 0.9187,
"step": 1010
},
{
"epoch": 0.27940265638978734,
"grad_norm": 0.6616302612956438,
"learning_rate": 4.314610792389757e-06,
"loss": 0.958,
"step": 1015
},
{
"epoch": 0.2807790241552543,
"grad_norm": 0.4539525897659009,
"learning_rate": 4.30676239921333e-06,
"loss": 0.8607,
"step": 1020
},
{
"epoch": 0.2821553919207212,
"grad_norm": 0.665268425928804,
"learning_rate": 4.298876551722007e-06,
"loss": 0.8738,
"step": 1025
},
{
"epoch": 0.28353175968618816,
"grad_norm": 0.5882893400505045,
"learning_rate": 4.290953413389977e-06,
"loss": 0.8947,
"step": 1030
},
{
"epoch": 0.28490812745165506,
"grad_norm": 0.5736299385687077,
"learning_rate": 4.282993148464467e-06,
"loss": 0.9378,
"step": 1035
},
{
"epoch": 0.286284495217122,
"grad_norm": 0.7145463884016067,
"learning_rate": 4.2749959219623434e-06,
"loss": 0.9029,
"step": 1040
},
{
"epoch": 0.28766086298258897,
"grad_norm": 0.5441170060136393,
"learning_rate": 4.266961899666689e-06,
"loss": 0.9119,
"step": 1045
},
{
"epoch": 0.2890372307480559,
"grad_norm": 0.7902074547549074,
"learning_rate": 4.2588912481233666e-06,
"loss": 0.9143,
"step": 1050
},
{
"epoch": 0.29041359851352283,
"grad_norm": 0.6019716832259343,
"learning_rate": 4.250784134637564e-06,
"loss": 0.8692,
"step": 1055
},
{
"epoch": 0.29178996627898973,
"grad_norm": 0.660623434550896,
"learning_rate": 4.242640727270329e-06,
"loss": 0.935,
"step": 1060
},
{
"epoch": 0.2931663340444567,
"grad_norm": 0.5545068926773359,
"learning_rate": 4.234461194835083e-06,
"loss": 0.9124,
"step": 1065
},
{
"epoch": 0.2945427018099236,
"grad_norm": 0.6474889256130907,
"learning_rate": 4.2262457068941245e-06,
"loss": 0.9003,
"step": 1070
},
{
"epoch": 0.29591906957539055,
"grad_norm": 0.8379201054192673,
"learning_rate": 4.217994433755112e-06,
"loss": 0.8946,
"step": 1075
},
{
"epoch": 0.2972954373408575,
"grad_norm": 0.5968271233441237,
"learning_rate": 4.209707546467531e-06,
"loss": 0.906,
"step": 1080
},
{
"epoch": 0.2986718051063244,
"grad_norm": 0.5971995063991833,
"learning_rate": 4.201385216819155e-06,
"loss": 0.9148,
"step": 1085
},
{
"epoch": 0.30004817287179136,
"grad_norm": 0.6868895981296808,
"learning_rate": 4.193027617332476e-06,
"loss": 0.8785,
"step": 1090
},
{
"epoch": 0.30142454063725826,
"grad_norm": 0.5652277468935024,
"learning_rate": 4.184634921261136e-06,
"loss": 0.9108,
"step": 1095
},
{
"epoch": 0.3028009084027252,
"grad_norm": 0.5880419121074434,
"learning_rate": 4.176207302586329e-06,
"loss": 0.8955,
"step": 1100
},
{
"epoch": 0.3041772761681921,
"grad_norm": 0.7043218507759722,
"learning_rate": 4.1677449360132e-06,
"loss": 0.9431,
"step": 1105
},
{
"epoch": 0.3055536439336591,
"grad_norm": 0.615367412623154,
"learning_rate": 4.159247996967216e-06,
"loss": 0.9234,
"step": 1110
},
{
"epoch": 0.306930011699126,
"grad_norm": 0.7427039646128769,
"learning_rate": 4.150716661590538e-06,
"loss": 0.8887,
"step": 1115
},
{
"epoch": 0.30830637946459294,
"grad_norm": 0.7310077258792612,
"learning_rate": 4.142151106738364e-06,
"loss": 0.8959,
"step": 1120
},
{
"epoch": 0.3096827472300599,
"grad_norm": 0.5946178494604679,
"learning_rate": 4.133551509975264e-06,
"loss": 0.8957,
"step": 1125
},
{
"epoch": 0.3110591149955268,
"grad_norm": 0.4040536498559825,
"learning_rate": 4.124918049571499e-06,
"loss": 0.8815,
"step": 1130
},
{
"epoch": 0.31243548276099375,
"grad_norm": 0.510452525326286,
"learning_rate": 4.1162509044993264e-06,
"loss": 0.8413,
"step": 1135
},
{
"epoch": 0.31381185052646066,
"grad_norm": 1.095591544246511,
"learning_rate": 4.107550254429289e-06,
"loss": 0.8945,
"step": 1140
},
{
"epoch": 0.3151882182919276,
"grad_norm": 0.6346832255515471,
"learning_rate": 4.09881627972649e-06,
"loss": 0.8879,
"step": 1145
},
{
"epoch": 0.3165645860573945,
"grad_norm": 0.6680761432639554,
"learning_rate": 4.090049161446855e-06,
"loss": 0.9161,
"step": 1150
},
{
"epoch": 0.31794095382286147,
"grad_norm": 0.696714218990553,
"learning_rate": 4.081249081333381e-06,
"loss": 0.9182,
"step": 1155
},
{
"epoch": 0.31931732158832843,
"grad_norm": 1.0720208443645471,
"learning_rate": 4.07241622181236e-06,
"loss": 0.9112,
"step": 1160
},
{
"epoch": 0.32069368935379533,
"grad_norm": 0.6189592447847784,
"learning_rate": 4.063550765989609e-06,
"loss": 0.9185,
"step": 1165
},
{
"epoch": 0.3220700571192623,
"grad_norm": 0.7117706954107574,
"learning_rate": 4.054652897646666e-06,
"loss": 0.8858,
"step": 1170
},
{
"epoch": 0.3234464248847292,
"grad_norm": 0.6362209727557948,
"learning_rate": 4.0457228012369855e-06,
"loss": 0.8753,
"step": 1175
},
{
"epoch": 0.32482279265019615,
"grad_norm": 0.817603348475789,
"learning_rate": 4.036760661882109e-06,
"loss": 0.8376,
"step": 1180
},
{
"epoch": 0.32619916041566305,
"grad_norm": 0.7216789765837882,
"learning_rate": 4.027766665367833e-06,
"loss": 0.9097,
"step": 1185
},
{
"epoch": 0.32757552818113,
"grad_norm": 0.7604270143327692,
"learning_rate": 4.0187409981403525e-06,
"loss": 0.8924,
"step": 1190
},
{
"epoch": 0.3289518959465969,
"grad_norm": 0.5264426392037226,
"learning_rate": 4.009683847302401e-06,
"loss": 0.8908,
"step": 1195
},
{
"epoch": 0.33032826371206386,
"grad_norm": 0.6167807303641967,
"learning_rate": 4.00059540060937e-06,
"loss": 0.8682,
"step": 1200
},
{
"epoch": 0.33032826371206386,
"eval_loss": 0.8648103475570679,
"eval_runtime": 37.566,
"eval_samples_per_second": 133.099,
"eval_steps_per_second": 2.103,
"step": 1200
},
{
"epoch": 0.3317046314775308,
"grad_norm": 0.47612017363637654,
"learning_rate": 3.991475846465415e-06,
"loss": 0.8904,
"step": 1205
},
{
"epoch": 0.3330809992429977,
"grad_norm": 0.724998856357164,
"learning_rate": 3.982325373919549e-06,
"loss": 0.9,
"step": 1210
},
{
"epoch": 0.3344573670084647,
"grad_norm": 0.4223709120139991,
"learning_rate": 3.973144172661731e-06,
"loss": 0.8838,
"step": 1215
},
{
"epoch": 0.3358337347739316,
"grad_norm": 0.5379099635161511,
"learning_rate": 3.963932433018924e-06,
"loss": 0.9138,
"step": 1220
},
{
"epoch": 0.33721010253939854,
"grad_norm": 0.6682983804147051,
"learning_rate": 3.954690345951156e-06,
"loss": 0.8771,
"step": 1225
},
{
"epoch": 0.33858647030486544,
"grad_norm": 0.7454287773417096,
"learning_rate": 3.945418103047558e-06,
"loss": 0.8805,
"step": 1230
},
{
"epoch": 0.3399628380703324,
"grad_norm": 0.7063066134977648,
"learning_rate": 3.936115896522395e-06,
"loss": 0.8563,
"step": 1235
},
{
"epoch": 0.34133920583579935,
"grad_norm": 0.6645576290102445,
"learning_rate": 3.92678391921108e-06,
"loss": 0.9031,
"step": 1240
},
{
"epoch": 0.34271557360126625,
"grad_norm": 0.37691206208242306,
"learning_rate": 3.917422364566175e-06,
"loss": 0.8369,
"step": 1245
},
{
"epoch": 0.3440919413667332,
"grad_norm": 0.41960300904843606,
"learning_rate": 3.908031426653383e-06,
"loss": 0.9235,
"step": 1250
},
{
"epoch": 0.3454683091322001,
"grad_norm": 0.5867963483036928,
"learning_rate": 3.898611300147525e-06,
"loss": 0.8511,
"step": 1255
},
{
"epoch": 0.34684467689766707,
"grad_norm": 0.7099329212172958,
"learning_rate": 3.889162180328504e-06,
"loss": 0.9318,
"step": 1260
},
{
"epoch": 0.34822104466313397,
"grad_norm": 0.615124184951399,
"learning_rate": 3.879684263077255e-06,
"loss": 0.8774,
"step": 1265
},
{
"epoch": 0.34959741242860093,
"grad_norm": 0.5769333758225725,
"learning_rate": 3.870177744871686e-06,
"loss": 0.8878,
"step": 1270
},
{
"epoch": 0.35097378019406783,
"grad_norm": 0.7797713242735278,
"learning_rate": 3.860642822782605e-06,
"loss": 0.8559,
"step": 1275
},
{
"epoch": 0.3523501479595348,
"grad_norm": 0.5476182116265297,
"learning_rate": 3.851079694469636e-06,
"loss": 0.8503,
"step": 1280
},
{
"epoch": 0.35372651572500174,
"grad_norm": 0.5593250703560667,
"learning_rate": 3.841488558177118e-06,
"loss": 0.8666,
"step": 1285
},
{
"epoch": 0.35510288349046865,
"grad_norm": 0.5174533433417139,
"learning_rate": 3.831869612729999e-06,
"loss": 0.88,
"step": 1290
},
{
"epoch": 0.3564792512559356,
"grad_norm": 0.5259246199350045,
"learning_rate": 3.822223057529712e-06,
"loss": 0.8522,
"step": 1295
},
{
"epoch": 0.3578556190214025,
"grad_norm": 0.9561871466566957,
"learning_rate": 3.8125490925500426e-06,
"loss": 0.8947,
"step": 1300
},
{
"epoch": 0.35923198678686946,
"grad_norm": 0.5703094998753527,
"learning_rate": 3.8028479183329816e-06,
"loss": 0.8721,
"step": 1305
},
{
"epoch": 0.36060835455233636,
"grad_norm": 0.7959291859516844,
"learning_rate": 3.793119735984572e-06,
"loss": 0.903,
"step": 1310
},
{
"epoch": 0.3619847223178033,
"grad_norm": 0.6289787758596613,
"learning_rate": 3.7833647471707345e-06,
"loss": 0.8642,
"step": 1315
},
{
"epoch": 0.3633610900832703,
"grad_norm": 0.5823311703497488,
"learning_rate": 3.773583154113092e-06,
"loss": 0.8812,
"step": 1320
},
{
"epoch": 0.3647374578487372,
"grad_norm": 0.5458593624626218,
"learning_rate": 3.7637751595847734e-06,
"loss": 0.8848,
"step": 1325
},
{
"epoch": 0.36611382561420414,
"grad_norm": 0.5422663450938033,
"learning_rate": 3.7539409669062138e-06,
"loss": 0.8546,
"step": 1330
},
{
"epoch": 0.36749019337967104,
"grad_norm": 0.5470752416021339,
"learning_rate": 3.744080779940937e-06,
"loss": 0.8803,
"step": 1335
},
{
"epoch": 0.368866561145138,
"grad_norm": 0.6784499815024887,
"learning_rate": 3.7341948030913293e-06,
"loss": 0.8431,
"step": 1340
},
{
"epoch": 0.3702429289106049,
"grad_norm": 0.5173768738501396,
"learning_rate": 3.7242832412944047e-06,
"loss": 0.923,
"step": 1345
},
{
"epoch": 0.37161929667607185,
"grad_norm": 0.5139718864020647,
"learning_rate": 3.714346300017555e-06,
"loss": 0.925,
"step": 1350
},
{
"epoch": 0.37299566444153875,
"grad_norm": 0.5535987985827241,
"learning_rate": 3.7043841852542884e-06,
"loss": 0.816,
"step": 1355
},
{
"epoch": 0.3743720322070057,
"grad_norm": 0.6220599394236812,
"learning_rate": 3.6943971035199642e-06,
"loss": 0.8975,
"step": 1360
},
{
"epoch": 0.37574839997247267,
"grad_norm": 0.6424558337857292,
"learning_rate": 3.684385261847506e-06,
"loss": 0.8696,
"step": 1365
},
{
"epoch": 0.37712476773793957,
"grad_norm": 0.6193220187952029,
"learning_rate": 3.674348867783115e-06,
"loss": 0.9187,
"step": 1370
},
{
"epoch": 0.3785011355034065,
"grad_norm": 0.7948870412022867,
"learning_rate": 3.6642881293819643e-06,
"loss": 0.8794,
"step": 1375
},
{
"epoch": 0.37987750326887343,
"grad_norm": 0.4277830946620168,
"learning_rate": 3.654203255203886e-06,
"loss": 0.8369,
"step": 1380
},
{
"epoch": 0.3812538710343404,
"grad_norm": 0.5346386829006233,
"learning_rate": 3.6440944543090505e-06,
"loss": 0.8175,
"step": 1385
},
{
"epoch": 0.3826302387998073,
"grad_norm": 0.5567956320600921,
"learning_rate": 3.633961936253628e-06,
"loss": 0.9047,
"step": 1390
},
{
"epoch": 0.38400660656527424,
"grad_norm": 0.48101573175427903,
"learning_rate": 3.623805911085452e-06,
"loss": 0.8312,
"step": 1395
},
{
"epoch": 0.3853829743307412,
"grad_norm": 0.5357232714738431,
"learning_rate": 3.613626589339653e-06,
"loss": 0.8757,
"step": 1400
},
{
"epoch": 0.3853829743307412,
"eval_loss": 0.846756637096405,
"eval_runtime": 37.5777,
"eval_samples_per_second": 133.058,
"eval_steps_per_second": 2.102,
"step": 1400
},
{
"epoch": 0.3867593420962081,
"grad_norm": 0.4727877531161341,
"learning_rate": 3.6034241820343086e-06,
"loss": 0.8599,
"step": 1405
},
{
"epoch": 0.38813570986167506,
"grad_norm": 0.4454834707158075,
"learning_rate": 3.5931989006660567e-06,
"loss": 0.9158,
"step": 1410
},
{
"epoch": 0.38951207762714196,
"grad_norm": 0.6273941979012451,
"learning_rate": 3.582950957205718e-06,
"loss": 0.8325,
"step": 1415
},
{
"epoch": 0.3908884453926089,
"grad_norm": 0.4418868761207735,
"learning_rate": 3.5726805640939e-06,
"loss": 0.8455,
"step": 1420
},
{
"epoch": 0.3922648131580758,
"grad_norm": 0.459223577643327,
"learning_rate": 3.562387934236593e-06,
"loss": 0.8554,
"step": 1425
},
{
"epoch": 0.3936411809235428,
"grad_norm": 0.6308918701832376,
"learning_rate": 3.552073281000757e-06,
"loss": 0.905,
"step": 1430
},
{
"epoch": 0.3950175486890097,
"grad_norm": 0.5375942052449636,
"learning_rate": 3.541736818209897e-06,
"loss": 0.8989,
"step": 1435
},
{
"epoch": 0.39639391645447664,
"grad_norm": 0.45898548069155615,
"learning_rate": 3.5313787601396328e-06,
"loss": 0.8568,
"step": 1440
},
{
"epoch": 0.3977702842199436,
"grad_norm": 0.6621205621192929,
"learning_rate": 3.5209993215132556e-06,
"loss": 0.8988,
"step": 1445
},
{
"epoch": 0.3991466519854105,
"grad_norm": 0.5810221477718872,
"learning_rate": 3.510598717497276e-06,
"loss": 0.8574,
"step": 1450
},
{
"epoch": 0.40052301975087745,
"grad_norm": 0.49897461838543894,
"learning_rate": 3.5001771636969677e-06,
"loss": 0.8677,
"step": 1455
},
{
"epoch": 0.40189938751634435,
"grad_norm": 0.343456686019272,
"learning_rate": 3.4897348761518913e-06,
"loss": 0.8568,
"step": 1460
},
{
"epoch": 0.4032757552818113,
"grad_norm": 0.5890854774221135,
"learning_rate": 3.4792720713314223e-06,
"loss": 0.8084,
"step": 1465
},
{
"epoch": 0.4046521230472782,
"grad_norm": 0.5050249705187615,
"learning_rate": 3.4687889661302577e-06,
"loss": 0.822,
"step": 1470
},
{
"epoch": 0.40602849081274517,
"grad_norm": 0.6261040635433304,
"learning_rate": 3.458285777863926e-06,
"loss": 0.8983,
"step": 1475
},
{
"epoch": 0.4074048585782121,
"grad_norm": 0.6179599241399432,
"learning_rate": 3.4477627242642782e-06,
"loss": 0.8186,
"step": 1480
},
{
"epoch": 0.408781226343679,
"grad_norm": 0.4118673045686313,
"learning_rate": 3.4372200234749735e-06,
"loss": 0.8005,
"step": 1485
},
{
"epoch": 0.410157594109146,
"grad_norm": 0.5523708195732413,
"learning_rate": 3.4266578940469605e-06,
"loss": 0.8231,
"step": 1490
},
{
"epoch": 0.4115339618746129,
"grad_norm": 0.6262009345044368,
"learning_rate": 3.416076554933944e-06,
"loss": 0.8134,
"step": 1495
},
{
"epoch": 0.41291032964007984,
"grad_norm": 0.5855645975272682,
"learning_rate": 3.4054762254878477e-06,
"loss": 0.8583,
"step": 1500
},
{
"epoch": 0.41428669740554674,
"grad_norm": 0.5148476566821822,
"learning_rate": 3.394857125454267e-06,
"loss": 0.8362,
"step": 1505
},
{
"epoch": 0.4156630651710137,
"grad_norm": 0.5372672904747088,
"learning_rate": 3.3842194749679086e-06,
"loss": 0.8381,
"step": 1510
},
{
"epoch": 0.4170394329364806,
"grad_norm": 0.6533529330850166,
"learning_rate": 3.373563494548037e-06,
"loss": 0.8884,
"step": 1515
},
{
"epoch": 0.41841580070194756,
"grad_norm": 0.7855188602038081,
"learning_rate": 3.3628894050938945e-06,
"loss": 0.8554,
"step": 1520
},
{
"epoch": 0.4197921684674145,
"grad_norm": 0.7164029448250899,
"learning_rate": 3.352197427880126e-06,
"loss": 0.8902,
"step": 1525
},
{
"epoch": 0.4211685362328814,
"grad_norm": 0.5654733251747366,
"learning_rate": 3.3414877845521904e-06,
"loss": 0.8858,
"step": 1530
},
{
"epoch": 0.4225449039983484,
"grad_norm": 0.7161901522114019,
"learning_rate": 3.3307606971217665e-06,
"loss": 0.8793,
"step": 1535
},
{
"epoch": 0.4239212717638153,
"grad_norm": 0.37921076195825976,
"learning_rate": 3.320016387962151e-06,
"loss": 0.8133,
"step": 1540
},
{
"epoch": 0.42529763952928223,
"grad_norm": 0.57460942583143,
"learning_rate": 3.309255079803647e-06,
"loss": 0.8308,
"step": 1545
},
{
"epoch": 0.42667400729474914,
"grad_norm": 0.6237565822649527,
"learning_rate": 3.29847699572895e-06,
"loss": 0.9122,
"step": 1550
},
{
"epoch": 0.4280503750602161,
"grad_norm": 0.6604400360810607,
"learning_rate": 3.2876823591685214e-06,
"loss": 0.7869,
"step": 1555
},
{
"epoch": 0.42942674282568305,
"grad_norm": 0.7964686696906034,
"learning_rate": 3.276871393895954e-06,
"loss": 0.8302,
"step": 1560
},
{
"epoch": 0.43080311059114995,
"grad_norm": 0.7353573467891227,
"learning_rate": 3.2660443240233387e-06,
"loss": 0.8878,
"step": 1565
},
{
"epoch": 0.4321794783566169,
"grad_norm": 0.45353882448336647,
"learning_rate": 3.2552013739966147e-06,
"loss": 0.8555,
"step": 1570
},
{
"epoch": 0.4335558461220838,
"grad_norm": 0.600983398689409,
"learning_rate": 3.24434276859092e-06,
"loss": 0.8003,
"step": 1575
},
{
"epoch": 0.43493221388755077,
"grad_norm": 0.33468339832513494,
"learning_rate": 3.233468732905927e-06,
"loss": 0.7919,
"step": 1580
},
{
"epoch": 0.43630858165301767,
"grad_norm": 0.664686119618901,
"learning_rate": 3.222579492361179e-06,
"loss": 0.8585,
"step": 1585
},
{
"epoch": 0.4376849494184846,
"grad_norm": 0.580736295078617,
"learning_rate": 3.21167527269142e-06,
"loss": 0.8537,
"step": 1590
},
{
"epoch": 0.4390613171839515,
"grad_norm": 0.4795429342164793,
"learning_rate": 3.2007562999419094e-06,
"loss": 0.8691,
"step": 1595
},
{
"epoch": 0.4404376849494185,
"grad_norm": 0.5533419912378857,
"learning_rate": 3.189822800463742e-06,
"loss": 0.8441,
"step": 1600
},
{
"epoch": 0.4404376849494185,
"eval_loss": 0.8311466574668884,
"eval_runtime": 37.5777,
"eval_samples_per_second": 133.058,
"eval_steps_per_second": 2.102,
"step": 1600
},
{
"epoch": 0.44181405271488544,
"grad_norm": 0.44901727418791887,
"learning_rate": 3.1788750009091473e-06,
"loss": 0.8785,
"step": 1605
},
{
"epoch": 0.44319042048035234,
"grad_norm": 0.7054761131009049,
"learning_rate": 3.167913128226803e-06,
"loss": 0.8442,
"step": 1610
},
{
"epoch": 0.4445667882458193,
"grad_norm": 0.6338208711460487,
"learning_rate": 3.156937409657119e-06,
"loss": 0.8968,
"step": 1615
},
{
"epoch": 0.4459431560112862,
"grad_norm": 0.6103638077202322,
"learning_rate": 3.145948072727535e-06,
"loss": 0.8823,
"step": 1620
},
{
"epoch": 0.44731952377675316,
"grad_norm": 0.5814180130346099,
"learning_rate": 3.134945345247797e-06,
"loss": 0.8224,
"step": 1625
},
{
"epoch": 0.44869589154222006,
"grad_norm": 0.6704683844032799,
"learning_rate": 3.123929455305239e-06,
"loss": 0.8797,
"step": 1630
},
{
"epoch": 0.450072259307687,
"grad_norm": 0.4798892663438211,
"learning_rate": 3.1129006312600558e-06,
"loss": 0.8386,
"step": 1635
},
{
"epoch": 0.451448627073154,
"grad_norm": 0.5497411331632864,
"learning_rate": 3.101859101740565e-06,
"loss": 0.858,
"step": 1640
},
{
"epoch": 0.4528249948386209,
"grad_norm": 0.5856897243913328,
"learning_rate": 3.09080509563847e-06,
"loss": 0.8904,
"step": 1645
},
{
"epoch": 0.45420136260408783,
"grad_norm": 0.47574657706442885,
"learning_rate": 3.079738842104115e-06,
"loss": 0.831,
"step": 1650
},
{
"epoch": 0.45557773036955473,
"grad_norm": 0.7356917244396622,
"learning_rate": 3.0686605705417337e-06,
"loss": 0.8638,
"step": 1655
},
{
"epoch": 0.4569540981350217,
"grad_norm": 0.4609027088459466,
"learning_rate": 3.057570510604696e-06,
"loss": 0.8342,
"step": 1660
},
{
"epoch": 0.4583304659004886,
"grad_norm": 0.649733338307217,
"learning_rate": 3.0464688921907436e-06,
"loss": 0.844,
"step": 1665
},
{
"epoch": 0.45970683366595555,
"grad_norm": 0.6452557057054518,
"learning_rate": 3.035355945437228e-06,
"loss": 0.901,
"step": 1670
},
{
"epoch": 0.46108320143142245,
"grad_norm": 0.48053889170013253,
"learning_rate": 3.0242319007163373e-06,
"loss": 0.8237,
"step": 1675
},
{
"epoch": 0.4624595691968894,
"grad_norm": 0.5156070687409189,
"learning_rate": 3.01309698863032e-06,
"loss": 0.7832,
"step": 1680
},
{
"epoch": 0.46383593696235637,
"grad_norm": 0.6487171872307684,
"learning_rate": 3.001951440006708e-06,
"loss": 0.8302,
"step": 1685
},
{
"epoch": 0.46521230472782327,
"grad_norm": 0.4088236693992105,
"learning_rate": 2.9907954858935277e-06,
"loss": 0.7978,
"step": 1690
},
{
"epoch": 0.4665886724932902,
"grad_norm": 0.6166137125554756,
"learning_rate": 2.9796293575545143e-06,
"loss": 0.8327,
"step": 1695
},
{
"epoch": 0.4679650402587571,
"grad_norm": 0.5324800896120733,
"learning_rate": 2.9684532864643123e-06,
"loss": 0.8497,
"step": 1700
},
{
"epoch": 0.4693414080242241,
"grad_norm": 0.5330587838888724,
"learning_rate": 2.957267504303682e-06,
"loss": 0.8318,
"step": 1705
},
{
"epoch": 0.470717775789691,
"grad_norm": 0.3066813832965278,
"learning_rate": 2.946072242954695e-06,
"loss": 0.7959,
"step": 1710
},
{
"epoch": 0.47209414355515794,
"grad_norm": 0.4174047766855664,
"learning_rate": 2.934867734495927e-06,
"loss": 0.8157,
"step": 1715
},
{
"epoch": 0.4734705113206249,
"grad_norm": 0.5513234180318742,
"learning_rate": 2.9236542111976468e-06,
"loss": 0.8657,
"step": 1720
},
{
"epoch": 0.4748468790860918,
"grad_norm": 0.36186756893473565,
"learning_rate": 2.9124319055170012e-06,
"loss": 0.8108,
"step": 1725
},
{
"epoch": 0.47622324685155876,
"grad_norm": 0.48483961573426937,
"learning_rate": 2.9012010500931966e-06,
"loss": 0.8532,
"step": 1730
},
{
"epoch": 0.47759961461702566,
"grad_norm": 0.656595154954433,
"learning_rate": 2.8899618777426763e-06,
"loss": 0.8186,
"step": 1735
},
{
"epoch": 0.4789759823824926,
"grad_norm": 0.45682292991009654,
"learning_rate": 2.878714621454294e-06,
"loss": 0.8507,
"step": 1740
},
{
"epoch": 0.4803523501479595,
"grad_norm": 0.4456324138614082,
"learning_rate": 2.867459514384485e-06,
"loss": 0.8809,
"step": 1745
},
{
"epoch": 0.4817287179134265,
"grad_norm": 0.5388546054688201,
"learning_rate": 2.856196789852429e-06,
"loss": 0.8236,
"step": 1750
},
{
"epoch": 0.4831050856788934,
"grad_norm": 0.660462229832185,
"learning_rate": 2.84492668133522e-06,
"loss": 0.8338,
"step": 1755
},
{
"epoch": 0.48448145344436033,
"grad_norm": 0.5278146557932175,
"learning_rate": 2.833649422463019e-06,
"loss": 0.814,
"step": 1760
},
{
"epoch": 0.4858578212098273,
"grad_norm": 0.5470942669154103,
"learning_rate": 2.8223652470142184e-06,
"loss": 0.8183,
"step": 1765
},
{
"epoch": 0.4872341889752942,
"grad_norm": 0.505713995433775,
"learning_rate": 2.8110743889105874e-06,
"loss": 0.8387,
"step": 1770
},
{
"epoch": 0.48861055674076115,
"grad_norm": 0.4875452733503727,
"learning_rate": 2.79977708221243e-06,
"loss": 0.7981,
"step": 1775
},
{
"epoch": 0.48998692450622805,
"grad_norm": 0.43916802590802495,
"learning_rate": 2.7884735611137288e-06,
"loss": 0.8532,
"step": 1780
},
{
"epoch": 0.491363292271695,
"grad_norm": 0.33369528684284744,
"learning_rate": 2.777164059937292e-06,
"loss": 0.8408,
"step": 1785
},
{
"epoch": 0.4927396600371619,
"grad_norm": 0.7202736685154576,
"learning_rate": 2.765848813129895e-06,
"loss": 0.8532,
"step": 1790
},
{
"epoch": 0.49411602780262887,
"grad_norm": 0.42511937449619386,
"learning_rate": 2.7545280552574204e-06,
"loss": 0.8224,
"step": 1795
},
{
"epoch": 0.4954923955680958,
"grad_norm": 0.545135520242412,
"learning_rate": 2.7432020209999956e-06,
"loss": 0.8197,
"step": 1800
},
{
"epoch": 0.4954923955680958,
"eval_loss": 0.8205735087394714,
"eval_runtime": 37.5781,
"eval_samples_per_second": 133.056,
"eval_steps_per_second": 2.102,
"step": 1800
},
{
"epoch": 0.4968687633335627,
"grad_norm": 0.48066576255326976,
"learning_rate": 2.7318709451471288e-06,
"loss": 0.8239,
"step": 1805
},
{
"epoch": 0.4982451310990297,
"grad_norm": 0.7847734357041681,
"learning_rate": 2.7205350625928383e-06,
"loss": 0.9108,
"step": 1810
},
{
"epoch": 0.4996214988644966,
"grad_norm": 0.7016473795626382,
"learning_rate": 2.70919460833079e-06,
"loss": 0.8367,
"step": 1815
},
{
"epoch": 0.5009978666299635,
"grad_norm": 0.6178351624669732,
"learning_rate": 2.697849817449415e-06,
"loss": 0.8282,
"step": 1820
},
{
"epoch": 0.5023742343954305,
"grad_norm": 0.4091504823861622,
"learning_rate": 2.6865009251270506e-06,
"loss": 0.8526,
"step": 1825
},
{
"epoch": 0.5037506021608974,
"grad_norm": 0.6183413041352451,
"learning_rate": 2.6751481666270513e-06,
"loss": 0.8473,
"step": 1830
},
{
"epoch": 0.5051269699263643,
"grad_norm": 0.606173243841225,
"learning_rate": 2.6637917772929213e-06,
"loss": 0.8567,
"step": 1835
},
{
"epoch": 0.5065033376918313,
"grad_norm": 0.6132486268789019,
"learning_rate": 2.65243199254343e-06,
"loss": 0.8325,
"step": 1840
},
{
"epoch": 0.5078797054572982,
"grad_norm": 0.4854570121858536,
"learning_rate": 2.6410690478677353e-06,
"loss": 0.7892,
"step": 1845
},
{
"epoch": 0.5092560732227651,
"grad_norm": 0.37412561464355937,
"learning_rate": 2.6297031788205004e-06,
"loss": 0.8094,
"step": 1850
},
{
"epoch": 0.510632440988232,
"grad_norm": 0.6203401035441314,
"learning_rate": 2.618334621017009e-06,
"loss": 0.822,
"step": 1855
},
{
"epoch": 0.512008808753699,
"grad_norm": 0.653767694616689,
"learning_rate": 2.6069636101282862e-06,
"loss": 0.8367,
"step": 1860
},
{
"epoch": 0.5133851765191659,
"grad_norm": 0.6245289959361092,
"learning_rate": 2.595590381876209e-06,
"loss": 0.8328,
"step": 1865
},
{
"epoch": 0.5147615442846328,
"grad_norm": 0.4457307365521416,
"learning_rate": 2.584215172028618e-06,
"loss": 0.8312,
"step": 1870
},
{
"epoch": 0.5161379120500997,
"grad_norm": 0.4815075541618031,
"learning_rate": 2.572838216394434e-06,
"loss": 0.8686,
"step": 1875
},
{
"epoch": 0.5175142798155667,
"grad_norm": 0.5702137712078795,
"learning_rate": 2.561459750818769e-06,
"loss": 0.8347,
"step": 1880
},
{
"epoch": 0.5188906475810336,
"grad_norm": 0.48523555192405865,
"learning_rate": 2.5500800111780357e-06,
"loss": 0.8036,
"step": 1885
},
{
"epoch": 0.5202670153465005,
"grad_norm": 0.3430136628045643,
"learning_rate": 2.5386992333750565e-06,
"loss": 0.8291,
"step": 1890
},
{
"epoch": 0.5216433831119676,
"grad_norm": 0.470576052419782,
"learning_rate": 2.5273176533341777e-06,
"loss": 0.77,
"step": 1895
},
{
"epoch": 0.5230197508774345,
"grad_norm": 0.4479209182378322,
"learning_rate": 2.5159355069963744e-06,
"loss": 0.8091,
"step": 1900
},
{
"epoch": 0.5243961186429014,
"grad_norm": 0.5763895685381267,
"learning_rate": 2.5045530303143604e-06,
"loss": 0.863,
"step": 1905
},
{
"epoch": 0.5257724864083683,
"grad_norm": 0.6080627036981748,
"learning_rate": 2.4931704592477e-06,
"loss": 0.8713,
"step": 1910
},
{
"epoch": 0.5271488541738353,
"grad_norm": 0.5799585453878151,
"learning_rate": 2.4817880297579134e-06,
"loss": 0.7895,
"step": 1915
},
{
"epoch": 0.5285252219393022,
"grad_norm": 0.6516081569612333,
"learning_rate": 2.4704059778035823e-06,
"loss": 0.8062,
"step": 1920
},
{
"epoch": 0.5299015897047691,
"grad_norm": 0.5284447836149975,
"learning_rate": 2.459024539335467e-06,
"loss": 0.8549,
"step": 1925
},
{
"epoch": 0.5312779574702361,
"grad_norm": 0.5029144016269022,
"learning_rate": 2.447643950291608e-06,
"loss": 0.8279,
"step": 1930
},
{
"epoch": 0.532654325235703,
"grad_norm": 0.6658834516119427,
"learning_rate": 2.4362644465924367e-06,
"loss": 0.8335,
"step": 1935
},
{
"epoch": 0.5340306930011699,
"grad_norm": 0.5953435916222078,
"learning_rate": 2.4248862641358865e-06,
"loss": 0.7918,
"step": 1940
},
{
"epoch": 0.5354070607666368,
"grad_norm": 0.37670098774520216,
"learning_rate": 2.4135096387925e-06,
"loss": 0.8638,
"step": 1945
},
{
"epoch": 0.5367834285321038,
"grad_norm": 0.511664128571869,
"learning_rate": 2.4021348064005417e-06,
"loss": 0.8377,
"step": 1950
},
{
"epoch": 0.5381597962975707,
"grad_norm": 0.42161706759354106,
"learning_rate": 2.3907620027611083e-06,
"loss": 0.83,
"step": 1955
},
{
"epoch": 0.5395361640630376,
"grad_norm": 0.3040178118533278,
"learning_rate": 2.3793914636332394e-06,
"loss": 0.8746,
"step": 1960
},
{
"epoch": 0.5409125318285046,
"grad_norm": 0.689149159117422,
"learning_rate": 2.3680234247290305e-06,
"loss": 0.8247,
"step": 1965
},
{
"epoch": 0.5422888995939715,
"grad_norm": 0.44356157035761207,
"learning_rate": 2.3566581217087496e-06,
"loss": 0.8277,
"step": 1970
},
{
"epoch": 0.5436652673594384,
"grad_norm": 0.5918309406851234,
"learning_rate": 2.3452957901759486e-06,
"loss": 0.8025,
"step": 1975
},
{
"epoch": 0.5450416351249053,
"grad_norm": 0.4997809391017281,
"learning_rate": 2.333936665672579e-06,
"loss": 0.835,
"step": 1980
},
{
"epoch": 0.5464180028903723,
"grad_norm": 0.49766628433257887,
"learning_rate": 2.3225809836741118e-06,
"loss": 0.7756,
"step": 1985
},
{
"epoch": 0.5477943706558392,
"grad_norm": 0.40407922865092843,
"learning_rate": 2.3112289795846537e-06,
"loss": 0.7967,
"step": 1990
},
{
"epoch": 0.5491707384213061,
"grad_norm": 0.6308739524640976,
"learning_rate": 2.2998808887320697e-06,
"loss": 0.781,
"step": 1995
},
{
"epoch": 0.5505471061867732,
"grad_norm": 0.5743558152124677,
"learning_rate": 2.2885369463631003e-06,
"loss": 0.7807,
"step": 2000
},
{
"epoch": 0.5505471061867732,
"eval_loss": 0.8089554905891418,
"eval_runtime": 37.5623,
"eval_samples_per_second": 133.112,
"eval_steps_per_second": 2.103,
"step": 2000
},
{
"epoch": 0.5519234739522401,
"grad_norm": 0.5814715096200206,
"learning_rate": 2.277197387638491e-06,
"loss": 0.8329,
"step": 2005
},
{
"epoch": 0.553299841717707,
"grad_norm": 0.5479788170588639,
"learning_rate": 2.265862447628111e-06,
"loss": 0.8742,
"step": 2010
},
{
"epoch": 0.5546762094831739,
"grad_norm": 0.3062291452983948,
"learning_rate": 2.254532361306085e-06,
"loss": 0.7671,
"step": 2015
},
{
"epoch": 0.5560525772486409,
"grad_norm": 0.5763488126780687,
"learning_rate": 2.2432073635459196e-06,
"loss": 0.8437,
"step": 2020
},
{
"epoch": 0.5574289450141078,
"grad_norm": 0.668939310724625,
"learning_rate": 2.2318876891156356e-06,
"loss": 0.8973,
"step": 2025
},
{
"epoch": 0.5588053127795747,
"grad_norm": 0.37776371512529605,
"learning_rate": 2.2205735726729023e-06,
"loss": 0.8345,
"step": 2030
},
{
"epoch": 0.5601816805450416,
"grad_norm": 0.41280705089503705,
"learning_rate": 2.2092652487601675e-06,
"loss": 0.8323,
"step": 2035
},
{
"epoch": 0.5615580483105086,
"grad_norm": 0.4632509161576438,
"learning_rate": 2.1979629517998027e-06,
"loss": 0.8282,
"step": 2040
},
{
"epoch": 0.5629344160759755,
"grad_norm": 0.7200713467674419,
"learning_rate": 2.186666916089239e-06,
"loss": 0.8379,
"step": 2045
},
{
"epoch": 0.5643107838414424,
"grad_norm": 0.5178576662970729,
"learning_rate": 2.1753773757961137e-06,
"loss": 0.8261,
"step": 2050
},
{
"epoch": 0.5656871516069094,
"grad_norm": 0.5610321336212082,
"learning_rate": 2.1640945649534096e-06,
"loss": 0.8309,
"step": 2055
},
{
"epoch": 0.5670635193723763,
"grad_norm": 0.5892891551665735,
"learning_rate": 2.1528187174546093e-06,
"loss": 0.8297,
"step": 2060
},
{
"epoch": 0.5684398871378432,
"grad_norm": 0.4486760776076786,
"learning_rate": 2.141550067048846e-06,
"loss": 0.8389,
"step": 2065
},
{
"epoch": 0.5698162549033101,
"grad_norm": 0.7907237469372685,
"learning_rate": 2.1302888473360566e-06,
"loss": 0.8321,
"step": 2070
},
{
"epoch": 0.5711926226687771,
"grad_norm": 0.566706200199042,
"learning_rate": 2.119035291762136e-06,
"loss": 0.8212,
"step": 2075
},
{
"epoch": 0.572568990434244,
"grad_norm": 0.5263130247632009,
"learning_rate": 2.1077896336141043e-06,
"loss": 0.8042,
"step": 2080
},
{
"epoch": 0.5739453581997109,
"grad_norm": 0.5308066511740118,
"learning_rate": 2.096552106015266e-06,
"loss": 0.815,
"step": 2085
},
{
"epoch": 0.5753217259651779,
"grad_norm": 0.6957541746253699,
"learning_rate": 2.0853229419203808e-06,
"loss": 0.8261,
"step": 2090
},
{
"epoch": 0.5766980937306448,
"grad_norm": 0.5966904887807215,
"learning_rate": 2.0741023741108276e-06,
"loss": 0.827,
"step": 2095
},
{
"epoch": 0.5780744614961117,
"grad_norm": 0.4539597770375397,
"learning_rate": 2.0628906351897885e-06,
"loss": 0.8182,
"step": 2100
},
{
"epoch": 0.5794508292615786,
"grad_norm": 0.6834026151474581,
"learning_rate": 2.0516879575774203e-06,
"loss": 0.8303,
"step": 2105
},
{
"epoch": 0.5808271970270457,
"grad_norm": 0.5239200524956313,
"learning_rate": 2.040494573506038e-06,
"loss": 0.7741,
"step": 2110
},
{
"epoch": 0.5822035647925126,
"grad_norm": 0.45753694417604984,
"learning_rate": 2.0293107150153006e-06,
"loss": 0.8397,
"step": 2115
},
{
"epoch": 0.5835799325579795,
"grad_norm": 0.46224504028065294,
"learning_rate": 2.018136613947401e-06,
"loss": 0.8244,
"step": 2120
},
{
"epoch": 0.5849563003234465,
"grad_norm": 0.4341362951474634,
"learning_rate": 2.0069725019422624e-06,
"loss": 0.8009,
"step": 2125
},
{
"epoch": 0.5863326680889134,
"grad_norm": 0.4900348861826574,
"learning_rate": 1.9958186104327317e-06,
"loss": 0.8483,
"step": 2130
},
{
"epoch": 0.5877090358543803,
"grad_norm": 0.6257220799254032,
"learning_rate": 1.9846751706397832e-06,
"loss": 0.8405,
"step": 2135
},
{
"epoch": 0.5890854036198472,
"grad_norm": 0.522323526016304,
"learning_rate": 1.9735424135677283e-06,
"loss": 0.8322,
"step": 2140
},
{
"epoch": 0.5904617713853142,
"grad_norm": 0.6328347166830222,
"learning_rate": 1.9624205699994256e-06,
"loss": 0.8607,
"step": 2145
},
{
"epoch": 0.5918381391507811,
"grad_norm": 0.52757076025723,
"learning_rate": 1.951309870491494e-06,
"loss": 0.8003,
"step": 2150
},
{
"epoch": 0.593214506916248,
"grad_norm": 0.4530698938004871,
"learning_rate": 1.9402105453695356e-06,
"loss": 0.843,
"step": 2155
},
{
"epoch": 0.594590874681715,
"grad_norm": 0.5275287498766006,
"learning_rate": 1.9291228247233607e-06,
"loss": 0.7959,
"step": 2160
},
{
"epoch": 0.5959672424471819,
"grad_norm": 0.28183154522813586,
"learning_rate": 1.9180469384022203e-06,
"loss": 0.7799,
"step": 2165
},
{
"epoch": 0.5973436102126488,
"grad_norm": 0.5336877667445091,
"learning_rate": 1.9069831160100338e-06,
"loss": 0.7979,
"step": 2170
},
{
"epoch": 0.5987199779781157,
"grad_norm": 0.6532051021851727,
"learning_rate": 1.8959315869006405e-06,
"loss": 0.8359,
"step": 2175
},
{
"epoch": 0.6000963457435827,
"grad_norm": 0.5307514597676389,
"learning_rate": 1.8848925801730344e-06,
"loss": 0.8937,
"step": 2180
},
{
"epoch": 0.6014727135090496,
"grad_norm": 0.4042895440437914,
"learning_rate": 1.8738663246666234e-06,
"loss": 0.8252,
"step": 2185
},
{
"epoch": 0.6028490812745165,
"grad_norm": 0.44894176165695887,
"learning_rate": 1.8628530489564771e-06,
"loss": 0.7835,
"step": 2190
},
{
"epoch": 0.6042254490399834,
"grad_norm": 0.4991539092087521,
"learning_rate": 1.8518529813485973e-06,
"loss": 0.814,
"step": 2195
},
{
"epoch": 0.6056018168054504,
"grad_norm": 0.3709613327877024,
"learning_rate": 1.8408663498751788e-06,
"loss": 0.7757,
"step": 2200
},
{
"epoch": 0.6056018168054504,
"eval_loss": 0.8015441298484802,
"eval_runtime": 37.566,
"eval_samples_per_second": 133.099,
"eval_steps_per_second": 2.103,
"step": 2200
},
{
"epoch": 0.6069781845709173,
"grad_norm": 0.4683158255043096,
"learning_rate": 1.829893382289886e-06,
"loss": 0.8097,
"step": 2205
},
{
"epoch": 0.6083545523363842,
"grad_norm": 0.5627576336667334,
"learning_rate": 1.818934306063126e-06,
"loss": 0.806,
"step": 2210
},
{
"epoch": 0.6097309201018513,
"grad_norm": 0.5488618788671827,
"learning_rate": 1.8079893483773413e-06,
"loss": 0.8185,
"step": 2215
},
{
"epoch": 0.6111072878673182,
"grad_norm": 0.44918917109293605,
"learning_rate": 1.7970587361222946e-06,
"loss": 0.8271,
"step": 2220
},
{
"epoch": 0.6124836556327851,
"grad_norm": 0.32979924519766196,
"learning_rate": 1.786142695890367e-06,
"loss": 0.7828,
"step": 2225
},
{
"epoch": 0.613860023398252,
"grad_norm": 0.35131817470857274,
"learning_rate": 1.7752414539718582e-06,
"loss": 0.8191,
"step": 2230
},
{
"epoch": 0.615236391163719,
"grad_norm": 0.4656658318567486,
"learning_rate": 1.7643552363503009e-06,
"loss": 0.8358,
"step": 2235
},
{
"epoch": 0.6166127589291859,
"grad_norm": 0.570964532177242,
"learning_rate": 1.7534842686977721e-06,
"loss": 0.8596,
"step": 2240
},
{
"epoch": 0.6179891266946528,
"grad_norm": 0.34212198519333364,
"learning_rate": 1.742628776370216e-06,
"loss": 0.818,
"step": 2245
},
{
"epoch": 0.6193654944601198,
"grad_norm": 0.48535112289382754,
"learning_rate": 1.7317889844027707e-06,
"loss": 0.8623,
"step": 2250
},
{
"epoch": 0.6207418622255867,
"grad_norm": 0.49166416763933013,
"learning_rate": 1.7209651175051056e-06,
"loss": 0.8468,
"step": 2255
},
{
"epoch": 0.6221182299910536,
"grad_norm": 0.4583562391030405,
"learning_rate": 1.7101574000567633e-06,
"loss": 0.822,
"step": 2260
},
{
"epoch": 0.6234945977565205,
"grad_norm": 0.46794730423054154,
"learning_rate": 1.6993660561025072e-06,
"loss": 0.8562,
"step": 2265
},
{
"epoch": 0.6248709655219875,
"grad_norm": 0.6099197600468227,
"learning_rate": 1.6885913093476741e-06,
"loss": 0.8078,
"step": 2270
},
{
"epoch": 0.6262473332874544,
"grad_norm": 0.5249915623303978,
"learning_rate": 1.677833383153542e-06,
"loss": 0.8219,
"step": 2275
},
{
"epoch": 0.6276237010529213,
"grad_norm": 0.4377376254057894,
"learning_rate": 1.6670925005326977e-06,
"loss": 0.8179,
"step": 2280
},
{
"epoch": 0.6290000688183883,
"grad_norm": 0.42122297813103243,
"learning_rate": 1.6563688841444137e-06,
"loss": 0.8418,
"step": 2285
},
{
"epoch": 0.6303764365838552,
"grad_norm": 0.5683008577226283,
"learning_rate": 1.6456627562900296e-06,
"loss": 0.7891,
"step": 2290
},
{
"epoch": 0.6317528043493221,
"grad_norm": 0.6037662603976885,
"learning_rate": 1.63497433890835e-06,
"loss": 0.832,
"step": 2295
},
{
"epoch": 0.633129172114789,
"grad_norm": 0.5247870050565184,
"learning_rate": 1.6243038535710365e-06,
"loss": 0.8076,
"step": 2300
},
{
"epoch": 0.634505539880256,
"grad_norm": 0.41197503482402553,
"learning_rate": 1.6136515214780227e-06,
"loss": 0.7596,
"step": 2305
},
{
"epoch": 0.6358819076457229,
"grad_norm": 0.4791099245291028,
"learning_rate": 1.603017563452919e-06,
"loss": 0.8107,
"step": 2310
},
{
"epoch": 0.6372582754111898,
"grad_norm": 0.43592237438522047,
"learning_rate": 1.592402199938443e-06,
"loss": 0.8185,
"step": 2315
},
{
"epoch": 0.6386346431766569,
"grad_norm": 0.5940623980499528,
"learning_rate": 1.5818056509918478e-06,
"loss": 0.8004,
"step": 2320
},
{
"epoch": 0.6400110109421238,
"grad_norm": 0.5250401511659805,
"learning_rate": 1.5712281362803561e-06,
"loss": 0.802,
"step": 2325
},
{
"epoch": 0.6413873787075907,
"grad_norm": 0.5915893010847437,
"learning_rate": 1.5606698750766108e-06,
"loss": 0.8642,
"step": 2330
},
{
"epoch": 0.6427637464730576,
"grad_norm": 0.46778493168617225,
"learning_rate": 1.550131086254129e-06,
"loss": 0.8092,
"step": 2335
},
{
"epoch": 0.6441401142385246,
"grad_norm": 0.491189650305899,
"learning_rate": 1.5396119882827651e-06,
"loss": 0.8026,
"step": 2340
},
{
"epoch": 0.6455164820039915,
"grad_norm": 0.43582541174722744,
"learning_rate": 1.5291127992241766e-06,
"loss": 0.8141,
"step": 2345
},
{
"epoch": 0.6468928497694584,
"grad_norm": 0.6142246346723809,
"learning_rate": 1.5186337367273105e-06,
"loss": 0.8008,
"step": 2350
},
{
"epoch": 0.6482692175349253,
"grad_norm": 0.35462943795501145,
"learning_rate": 1.5081750180238891e-06,
"loss": 0.7667,
"step": 2355
},
{
"epoch": 0.6496455853003923,
"grad_norm": 0.41180209198737644,
"learning_rate": 1.4977368599239061e-06,
"loss": 0.8028,
"step": 2360
},
{
"epoch": 0.6510219530658592,
"grad_norm": 0.7189067391117583,
"learning_rate": 1.487319478811131e-06,
"loss": 0.8339,
"step": 2365
},
{
"epoch": 0.6523983208313261,
"grad_norm": 0.3806482298690338,
"learning_rate": 1.4769230906386272e-06,
"loss": 0.8151,
"step": 2370
},
{
"epoch": 0.6537746885967931,
"grad_norm": 0.4983576378974423,
"learning_rate": 1.4665479109242696e-06,
"loss": 0.7939,
"step": 2375
},
{
"epoch": 0.65515105636226,
"grad_norm": 0.33177024863231713,
"learning_rate": 1.4561941547462855e-06,
"loss": 0.8009,
"step": 2380
},
{
"epoch": 0.6565274241277269,
"grad_norm": 0.538297674107413,
"learning_rate": 1.4458620367387838e-06,
"loss": 0.8025,
"step": 2385
},
{
"epoch": 0.6579037918931938,
"grad_norm": 0.4901494849475784,
"learning_rate": 1.4355517710873184e-06,
"loss": 0.7845,
"step": 2390
},
{
"epoch": 0.6592801596586608,
"grad_norm": 0.5486200463100078,
"learning_rate": 1.4252635715244394e-06,
"loss": 0.8208,
"step": 2395
},
{
"epoch": 0.6606565274241277,
"grad_norm": 0.4897253532401449,
"learning_rate": 1.4149976513252677e-06,
"loss": 0.7818,
"step": 2400
},
{
"epoch": 0.6606565274241277,
"eval_loss": 0.7957330942153931,
"eval_runtime": 37.5749,
"eval_samples_per_second": 133.068,
"eval_steps_per_second": 2.102,
"step": 2400
},
{
"epoch": 0.6620328951895946,
"grad_norm": 0.46111910295697045,
"learning_rate": 1.4047542233030683e-06,
"loss": 0.8258,
"step": 2405
},
{
"epoch": 0.6634092629550616,
"grad_norm": 0.5784849194355406,
"learning_rate": 1.3945334998048425e-06,
"loss": 0.8157,
"step": 2410
},
{
"epoch": 0.6647856307205285,
"grad_norm": 0.5455404395159691,
"learning_rate": 1.3843356927069266e-06,
"loss": 0.8155,
"step": 2415
},
{
"epoch": 0.6661619984859954,
"grad_norm": 0.4060432121547348,
"learning_rate": 1.3741610134105984e-06,
"loss": 0.7862,
"step": 2420
},
{
"epoch": 0.6675383662514623,
"grad_norm": 0.5175461237539263,
"learning_rate": 1.3640096728376922e-06,
"loss": 0.796,
"step": 2425
},
{
"epoch": 0.6689147340169294,
"grad_norm": 0.5579421958353933,
"learning_rate": 1.353881881426231e-06,
"loss": 0.8159,
"step": 2430
},
{
"epoch": 0.6702911017823963,
"grad_norm": 0.3938550567000424,
"learning_rate": 1.3437778491260626e-06,
"loss": 0.7888,
"step": 2435
},
{
"epoch": 0.6716674695478632,
"grad_norm": 0.3621816668127261,
"learning_rate": 1.3336977853945055e-06,
"loss": 0.7831,
"step": 2440
},
{
"epoch": 0.6730438373133302,
"grad_norm": 0.6046109892992472,
"learning_rate": 1.3236418991920065e-06,
"loss": 0.7899,
"step": 2445
},
{
"epoch": 0.6744202050787971,
"grad_norm": 0.52626524074172,
"learning_rate": 1.3136103989778138e-06,
"loss": 0.7591,
"step": 2450
},
{
"epoch": 0.675796572844264,
"grad_norm": 0.5540405574756208,
"learning_rate": 1.303603492705649e-06,
"loss": 0.8,
"step": 2455
},
{
"epoch": 0.6771729406097309,
"grad_norm": 0.3755183771069488,
"learning_rate": 1.2936213878194031e-06,
"loss": 0.819,
"step": 2460
},
{
"epoch": 0.6785493083751979,
"grad_norm": 0.585894250903544,
"learning_rate": 1.2836642912488287e-06,
"loss": 0.8327,
"step": 2465
},
{
"epoch": 0.6799256761406648,
"grad_norm": 0.5007223315080989,
"learning_rate": 1.2737324094052569e-06,
"loss": 0.8055,
"step": 2470
},
{
"epoch": 0.6813020439061317,
"grad_norm": 0.4543771383459902,
"learning_rate": 1.2638259481773164e-06,
"loss": 0.7892,
"step": 2475
},
{
"epoch": 0.6826784116715987,
"grad_norm": 0.45325256663521224,
"learning_rate": 1.2539451129266603e-06,
"loss": 0.7904,
"step": 2480
},
{
"epoch": 0.6840547794370656,
"grad_norm": 0.42755072695822427,
"learning_rate": 1.244090108483718e-06,
"loss": 0.8696,
"step": 2485
},
{
"epoch": 0.6854311472025325,
"grad_norm": 0.5749380186174909,
"learning_rate": 1.2342611391434424e-06,
"loss": 0.7695,
"step": 2490
},
{
"epoch": 0.6868075149679994,
"grad_norm": 0.47696517661731086,
"learning_rate": 1.2244584086610783e-06,
"loss": 0.8061,
"step": 2495
},
{
"epoch": 0.6881838827334664,
"grad_norm": 0.47628535730003985,
"learning_rate": 1.2146821202479347e-06,
"loss": 0.8252,
"step": 2500
},
{
"epoch": 0.6895602504989333,
"grad_norm": 0.4918566967586663,
"learning_rate": 1.204932476567175e-06,
"loss": 0.8306,
"step": 2505
},
{
"epoch": 0.6909366182644002,
"grad_norm": 0.45157773288125097,
"learning_rate": 1.1952096797296167e-06,
"loss": 0.7911,
"step": 2510
},
{
"epoch": 0.6923129860298671,
"grad_norm": 0.5449035155568759,
"learning_rate": 1.1855139312895412e-06,
"loss": 0.8297,
"step": 2515
},
{
"epoch": 0.6936893537953341,
"grad_norm": 0.539632313493914,
"learning_rate": 1.175845432240511e-06,
"loss": 0.7938,
"step": 2520
},
{
"epoch": 0.695065721560801,
"grad_norm": 0.267579658107954,
"learning_rate": 1.16620438301121e-06,
"loss": 0.8326,
"step": 2525
},
{
"epoch": 0.6964420893262679,
"grad_norm": 0.45094192311072506,
"learning_rate": 1.1565909834612843e-06,
"loss": 0.8183,
"step": 2530
},
{
"epoch": 0.697818457091735,
"grad_norm": 0.45018310356306585,
"learning_rate": 1.1470054328772015e-06,
"loss": 0.8312,
"step": 2535
},
{
"epoch": 0.6991948248572019,
"grad_norm": 0.5424670154260764,
"learning_rate": 1.1374479299681144e-06,
"loss": 0.8547,
"step": 2540
},
{
"epoch": 0.7005711926226688,
"grad_norm": 0.3929689914659013,
"learning_rate": 1.12791867286175e-06,
"loss": 0.7631,
"step": 2545
},
{
"epoch": 0.7019475603881357,
"grad_norm": 0.5189958974232449,
"learning_rate": 1.1184178591002936e-06,
"loss": 0.7974,
"step": 2550
},
{
"epoch": 0.7033239281536027,
"grad_norm": 0.4940843745776494,
"learning_rate": 1.1089456856363023e-06,
"loss": 0.7859,
"step": 2555
},
{
"epoch": 0.7047002959190696,
"grad_norm": 0.3903233975311801,
"learning_rate": 1.0995023488286132e-06,
"loss": 0.7555,
"step": 2560
},
{
"epoch": 0.7060766636845365,
"grad_norm": 0.44367526110003797,
"learning_rate": 1.090088044438281e-06,
"loss": 0.83,
"step": 2565
},
{
"epoch": 0.7074530314500035,
"grad_norm": 0.3625988730276094,
"learning_rate": 1.0807029676245146e-06,
"loss": 0.772,
"step": 2570
},
{
"epoch": 0.7088293992154704,
"grad_norm": 0.4378410267181844,
"learning_rate": 1.0713473129406342e-06,
"loss": 0.7913,
"step": 2575
},
{
"epoch": 0.7102057669809373,
"grad_norm": 0.5659553149700242,
"learning_rate": 1.062021274330035e-06,
"loss": 0.8333,
"step": 2580
},
{
"epoch": 0.7115821347464042,
"grad_norm": 0.480279896191206,
"learning_rate": 1.0527250451221714e-06,
"loss": 0.7924,
"step": 2585
},
{
"epoch": 0.7129585025118712,
"grad_norm": 0.5794313371061417,
"learning_rate": 1.043458818028546e-06,
"loss": 0.8025,
"step": 2590
},
{
"epoch": 0.7143348702773381,
"grad_norm": 0.577692103807798,
"learning_rate": 1.0342227851387132e-06,
"loss": 0.8102,
"step": 2595
},
{
"epoch": 0.715711238042805,
"grad_norm": 0.7048819923650593,
"learning_rate": 1.0250171379163035e-06,
"loss": 0.8235,
"step": 2600
},
{
"epoch": 0.715711238042805,
"eval_loss": 0.7914655208587646,
"eval_runtime": 37.5636,
"eval_samples_per_second": 133.108,
"eval_steps_per_second": 2.103,
"step": 2600
},
{
"epoch": 0.717087605808272,
"grad_norm": 0.5135405438687206,
"learning_rate": 1.0158420671950458e-06,
"loss": 0.8354,
"step": 2605
},
{
"epoch": 0.7184639735737389,
"grad_norm": 0.49088601989364183,
"learning_rate": 1.0066977631748192e-06,
"loss": 0.8243,
"step": 2610
},
{
"epoch": 0.7198403413392058,
"grad_norm": 0.43340747956612413,
"learning_rate": 9.975844154177068e-07,
"loss": 0.8082,
"step": 2615
},
{
"epoch": 0.7212167091046727,
"grad_norm": 0.4929677260928928,
"learning_rate": 9.88502212844063e-07,
"loss": 0.8259,
"step": 2620
},
{
"epoch": 0.7225930768701397,
"grad_norm": 0.5110702548169729,
"learning_rate": 9.794513437286039e-07,
"loss": 0.8231,
"step": 2625
},
{
"epoch": 0.7239694446356066,
"grad_norm": 0.43992378048573655,
"learning_rate": 9.704319956964997e-07,
"loss": 0.7803,
"step": 2630
},
{
"epoch": 0.7253458124010735,
"grad_norm": 0.3073181934194982,
"learning_rate": 9.61444355719484e-07,
"loss": 0.7606,
"step": 2635
},
{
"epoch": 0.7267221801665406,
"grad_norm": 0.5324585928575014,
"learning_rate": 9.524886101119846e-07,
"loss": 0.8537,
"step": 2640
},
{
"epoch": 0.7280985479320075,
"grad_norm": 0.45921892305675543,
"learning_rate": 9.435649445272516e-07,
"loss": 0.8069,
"step": 2645
},
{
"epoch": 0.7294749156974744,
"grad_norm": 0.48054038898023743,
"learning_rate": 9.346735439535182e-07,
"loss": 0.8097,
"step": 2650
},
{
"epoch": 0.7308512834629413,
"grad_norm": 0.4380511249962659,
"learning_rate": 9.25814592710158e-07,
"loss": 0.7914,
"step": 2655
},
{
"epoch": 0.7322276512284083,
"grad_norm": 0.337588637619449,
"learning_rate": 9.16988274443871e-07,
"loss": 0.7967,
"step": 2660
},
{
"epoch": 0.7336040189938752,
"grad_norm": 0.4171603673327812,
"learning_rate": 9.08194772124871e-07,
"loss": 0.7909,
"step": 2665
},
{
"epoch": 0.7349803867593421,
"grad_norm": 0.5340434480071784,
"learning_rate": 8.994342680430971e-07,
"loss": 0.7702,
"step": 2670
},
{
"epoch": 0.736356754524809,
"grad_norm": 0.45146796076237466,
"learning_rate": 8.907069438044283e-07,
"loss": 0.8057,
"step": 2675
},
{
"epoch": 0.737733122290276,
"grad_norm": 0.4216880628061979,
"learning_rate": 8.820129803269272e-07,
"loss": 0.8074,
"step": 2680
},
{
"epoch": 0.7391094900557429,
"grad_norm": 0.5979240606977037,
"learning_rate": 8.733525578370849e-07,
"loss": 0.8162,
"step": 2685
},
{
"epoch": 0.7404858578212098,
"grad_norm": 0.490528351797546,
"learning_rate": 8.647258558660829e-07,
"loss": 0.8103,
"step": 2690
},
{
"epoch": 0.7418622255866768,
"grad_norm": 0.6084011328958213,
"learning_rate": 8.561330532460765e-07,
"loss": 0.8821,
"step": 2695
},
{
"epoch": 0.7432385933521437,
"grad_norm": 0.3812636095028781,
"learning_rate": 8.47574328106483e-07,
"loss": 0.7365,
"step": 2700
},
{
"epoch": 0.7446149611176106,
"grad_norm": 0.4418699971833215,
"learning_rate": 8.390498578702924e-07,
"loss": 0.8175,
"step": 2705
},
{
"epoch": 0.7459913288830775,
"grad_norm": 0.3665034778827065,
"learning_rate": 8.305598192503892e-07,
"loss": 0.7635,
"step": 2710
},
{
"epoch": 0.7473676966485445,
"grad_norm": 0.39085090708364434,
"learning_rate": 8.22104388245884e-07,
"loss": 0.7682,
"step": 2715
},
{
"epoch": 0.7487440644140114,
"grad_norm": 0.515141732505318,
"learning_rate": 8.136837401384734e-07,
"loss": 0.8256,
"step": 2720
},
{
"epoch": 0.7501204321794783,
"grad_norm": 0.4873699702775256,
"learning_rate": 8.052980494887996e-07,
"loss": 0.8079,
"step": 2725
},
{
"epoch": 0.7514967999449453,
"grad_norm": 0.3657787697288285,
"learning_rate": 7.969474901328359e-07,
"loss": 0.78,
"step": 2730
},
{
"epoch": 0.7528731677104122,
"grad_norm": 0.514544078198768,
"learning_rate": 7.886322351782782e-07,
"loss": 0.821,
"step": 2735
},
{
"epoch": 0.7542495354758791,
"grad_norm": 0.31173151058089976,
"learning_rate": 7.803524570009638e-07,
"loss": 0.793,
"step": 2740
},
{
"epoch": 0.755625903241346,
"grad_norm": 0.42241508592582616,
"learning_rate": 7.7210832724129e-07,
"loss": 0.7798,
"step": 2745
},
{
"epoch": 0.757002271006813,
"grad_norm": 0.3071764331948748,
"learning_rate": 7.63900016800663e-07,
"loss": 0.7698,
"step": 2750
},
{
"epoch": 0.75837863877228,
"grad_norm": 0.5439311482897147,
"learning_rate": 7.55727695837949e-07,
"loss": 0.8452,
"step": 2755
},
{
"epoch": 0.7597550065377469,
"grad_norm": 0.3563361621522225,
"learning_rate": 7.475915337659517e-07,
"loss": 0.7901,
"step": 2760
},
{
"epoch": 0.7611313743032139,
"grad_norm": 0.5754448075856651,
"learning_rate": 7.394916992478982e-07,
"loss": 0.7638,
"step": 2765
},
{
"epoch": 0.7625077420686808,
"grad_norm": 0.511147390789392,
"learning_rate": 7.314283601939432e-07,
"loss": 0.7966,
"step": 2770
},
{
"epoch": 0.7638841098341477,
"grad_norm": 0.5082660532801647,
"learning_rate": 7.234016837576855e-07,
"loss": 0.7977,
"step": 2775
},
{
"epoch": 0.7652604775996146,
"grad_norm": 0.4037699385261041,
"learning_rate": 7.154118363327076e-07,
"loss": 0.8714,
"step": 2780
},
{
"epoch": 0.7666368453650816,
"grad_norm": 0.4667815625876788,
"learning_rate": 7.074589835491236e-07,
"loss": 0.797,
"step": 2785
},
{
"epoch": 0.7680132131305485,
"grad_norm": 0.4480832075891221,
"learning_rate": 6.995432902701452e-07,
"loss": 0.8327,
"step": 2790
},
{
"epoch": 0.7693895808960154,
"grad_norm": 0.42380996565690365,
"learning_rate": 6.916649205886639e-07,
"loss": 0.7462,
"step": 2795
},
{
"epoch": 0.7707659486614824,
"grad_norm": 0.33453237656086077,
"learning_rate": 6.838240378238528e-07,
"loss": 0.7854,
"step": 2800
},
{
"epoch": 0.7707659486614824,
"eval_loss": 0.788250744342804,
"eval_runtime": 37.5666,
"eval_samples_per_second": 133.097,
"eval_steps_per_second": 2.103,
"step": 2800
},
{
"epoch": 0.7721423164269493,
"grad_norm": 0.7936762263185101,
"learning_rate": 6.760208045177777e-07,
"loss": 0.8265,
"step": 2805
},
{
"epoch": 0.7735186841924162,
"grad_norm": 0.5449658294351013,
"learning_rate": 6.68255382432027e-07,
"loss": 0.8289,
"step": 2810
},
{
"epoch": 0.7748950519578831,
"grad_norm": 0.5162826155892138,
"learning_rate": 6.605279325443615e-07,
"loss": 0.7767,
"step": 2815
},
{
"epoch": 0.7762714197233501,
"grad_norm": 0.36993986629866255,
"learning_rate": 6.528386150453747e-07,
"loss": 0.7914,
"step": 2820
},
{
"epoch": 0.777647787488817,
"grad_norm": 0.4025106600408673,
"learning_rate": 6.451875893351742e-07,
"loss": 0.8094,
"step": 2825
},
{
"epoch": 0.7790241552542839,
"grad_norm": 0.3803752081384692,
"learning_rate": 6.375750140200729e-07,
"loss": 0.7834,
"step": 2830
},
{
"epoch": 0.7804005230197508,
"grad_norm": 0.4647690834948723,
"learning_rate": 6.300010469093085e-07,
"loss": 0.7677,
"step": 2835
},
{
"epoch": 0.7817768907852178,
"grad_norm": 0.4360851526308311,
"learning_rate": 6.224658450117638e-07,
"loss": 0.8241,
"step": 2840
},
{
"epoch": 0.7831532585506847,
"grad_norm": 0.40784724834132835,
"learning_rate": 6.149695645327197e-07,
"loss": 0.7794,
"step": 2845
},
{
"epoch": 0.7845296263161516,
"grad_norm": 0.3733748290573589,
"learning_rate": 6.075123608706093e-07,
"loss": 0.7934,
"step": 2850
},
{
"epoch": 0.7859059940816187,
"grad_norm": 0.46257419791313253,
"learning_rate": 6.000943886138039e-07,
"loss": 0.8197,
"step": 2855
},
{
"epoch": 0.7872823618470856,
"grad_norm": 0.3846998032798624,
"learning_rate": 5.927158015374032e-07,
"loss": 0.7601,
"step": 2860
},
{
"epoch": 0.7886587296125525,
"grad_norm": 0.32739463317156486,
"learning_rate": 5.853767526000506e-07,
"loss": 0.7976,
"step": 2865
},
{
"epoch": 0.7900350973780194,
"grad_norm": 0.38672592567017716,
"learning_rate": 5.780773939407586e-07,
"loss": 0.8075,
"step": 2870
},
{
"epoch": 0.7914114651434864,
"grad_norm": 0.4386867822178414,
"learning_rate": 5.708178768757594e-07,
"loss": 0.8151,
"step": 2875
},
{
"epoch": 0.7927878329089533,
"grad_norm": 0.3283397967295564,
"learning_rate": 5.635983518953664e-07,
"loss": 0.8467,
"step": 2880
},
{
"epoch": 0.7941642006744202,
"grad_norm": 0.4837636826148133,
"learning_rate": 5.564189686608528e-07,
"loss": 0.829,
"step": 2885
},
{
"epoch": 0.7955405684398872,
"grad_norm": 0.7164604908817186,
"learning_rate": 5.492798760013504e-07,
"loss": 0.8363,
"step": 2890
},
{
"epoch": 0.7969169362053541,
"grad_norm": 0.5246720628848824,
"learning_rate": 5.421812219107652e-07,
"loss": 0.7728,
"step": 2895
},
{
"epoch": 0.798293303970821,
"grad_norm": 0.48145772973666245,
"learning_rate": 5.351231535447096e-07,
"loss": 0.8351,
"step": 2900
},
{
"epoch": 0.7996696717362879,
"grad_norm": 0.6428159033099584,
"learning_rate": 5.2810581721745e-07,
"loss": 0.7936,
"step": 2905
},
{
"epoch": 0.8010460395017549,
"grad_norm": 0.42644687973421025,
"learning_rate": 5.211293583988736e-07,
"loss": 0.7612,
"step": 2910
},
{
"epoch": 0.8024224072672218,
"grad_norm": 0.49844315374917036,
"learning_rate": 5.141939217114761e-07,
"loss": 0.9081,
"step": 2915
},
{
"epoch": 0.8037987750326887,
"grad_norm": 0.45961744850293057,
"learning_rate": 5.072996509273597e-07,
"loss": 0.7703,
"step": 2920
},
{
"epoch": 0.8051751427981557,
"grad_norm": 0.46944692269351923,
"learning_rate": 5.004466889652568e-07,
"loss": 0.8183,
"step": 2925
},
{
"epoch": 0.8065515105636226,
"grad_norm": 0.43054493887650536,
"learning_rate": 4.93635177887562e-07,
"loss": 0.8182,
"step": 2930
},
{
"epoch": 0.8079278783290895,
"grad_norm": 0.40134994197247525,
"learning_rate": 4.86865258897391e-07,
"loss": 0.7662,
"step": 2935
},
{
"epoch": 0.8093042460945564,
"grad_norm": 0.3679569701658888,
"learning_rate": 4.801370723356533e-07,
"loss": 0.7397,
"step": 2940
},
{
"epoch": 0.8106806138600234,
"grad_norm": 0.4996662512653364,
"learning_rate": 4.7345075767814277e-07,
"loss": 0.7655,
"step": 2945
},
{
"epoch": 0.8120569816254903,
"grad_norm": 0.33008573990481305,
"learning_rate": 4.668064535326433e-07,
"loss": 0.7733,
"step": 2950
},
{
"epoch": 0.8134333493909572,
"grad_norm": 0.5216225381160015,
"learning_rate": 4.602042976360596e-07,
"loss": 0.8131,
"step": 2955
},
{
"epoch": 0.8148097171564243,
"grad_norm": 0.3009799921127595,
"learning_rate": 4.536444268515608e-07,
"loss": 0.761,
"step": 2960
},
{
"epoch": 0.8161860849218912,
"grad_norm": 0.4808517772368335,
"learning_rate": 4.4712697716573994e-07,
"loss": 0.7887,
"step": 2965
},
{
"epoch": 0.817562452687358,
"grad_norm": 0.31056324010954034,
"learning_rate": 4.406520836858003e-07,
"loss": 0.7373,
"step": 2970
},
{
"epoch": 0.818938820452825,
"grad_norm": 0.4026317051978566,
"learning_rate": 4.342198806367512e-07,
"loss": 0.8102,
"step": 2975
},
{
"epoch": 0.820315188218292,
"grad_norm": 0.571522212386199,
"learning_rate": 4.2783050135862454e-07,
"loss": 0.8232,
"step": 2980
},
{
"epoch": 0.8216915559837589,
"grad_norm": 0.6020021292977741,
"learning_rate": 4.2148407830371553e-07,
"loss": 0.8423,
"step": 2985
},
{
"epoch": 0.8230679237492258,
"grad_norm": 0.31515971826344896,
"learning_rate": 4.1518074303383006e-07,
"loss": 0.7556,
"step": 2990
},
{
"epoch": 0.8244442915146927,
"grad_norm": 0.4964415865676406,
"learning_rate": 4.0892062621756436e-07,
"loss": 0.8106,
"step": 2995
},
{
"epoch": 0.8258206592801597,
"grad_norm": 0.5295609606861289,
"learning_rate": 4.027038576275921e-07,
"loss": 0.7958,
"step": 3000
},
{
"epoch": 0.8258206592801597,
"eval_loss": 0.7862712144851685,
"eval_runtime": 37.5789,
"eval_samples_per_second": 133.054,
"eval_steps_per_second": 2.102,
"step": 3000
},
{
"epoch": 0.8271970270456266,
"grad_norm": 0.5235293567756518,
"learning_rate": 3.9653056613797315e-07,
"loss": 0.8119,
"step": 3005
},
{
"epoch": 0.8285733948110935,
"grad_norm": 0.46096657495011545,
"learning_rate": 3.904008797214867e-07,
"loss": 0.7939,
"step": 3010
},
{
"epoch": 0.8299497625765605,
"grad_norm": 0.4583761081561358,
"learning_rate": 3.8431492544697384e-07,
"loss": 0.8206,
"step": 3015
},
{
"epoch": 0.8313261303420274,
"grad_norm": 0.4307989854498663,
"learning_rate": 3.7827282947670686e-07,
"loss": 0.805,
"step": 3020
},
{
"epoch": 0.8327024981074943,
"grad_norm": 0.6458058398139984,
"learning_rate": 3.722747170637703e-07,
"loss": 0.8272,
"step": 3025
},
{
"epoch": 0.8340788658729612,
"grad_norm": 0.27246801971613765,
"learning_rate": 3.663207125494667e-07,
"loss": 0.7188,
"step": 3030
},
{
"epoch": 0.8354552336384282,
"grad_norm": 0.31316968826402736,
"learning_rate": 3.604109393607397e-07,
"loss": 0.7771,
"step": 3035
},
{
"epoch": 0.8368316014038951,
"grad_norm": 0.46558859230251337,
"learning_rate": 3.545455200076148e-07,
"loss": 0.7697,
"step": 3040
},
{
"epoch": 0.838207969169362,
"grad_norm": 0.4510507655849259,
"learning_rate": 3.4872457608065706e-07,
"loss": 0.7729,
"step": 3045
},
{
"epoch": 0.839584336934829,
"grad_norm": 0.43470367010584865,
"learning_rate": 3.4294822824845447e-07,
"loss": 0.8024,
"step": 3050
},
{
"epoch": 0.8409607047002959,
"grad_norm": 0.5457490936555786,
"learning_rate": 3.3721659625511466e-07,
"loss": 0.8288,
"step": 3055
},
{
"epoch": 0.8423370724657628,
"grad_norm": 0.4460830059047153,
"learning_rate": 3.315297989177829e-07,
"loss": 0.7704,
"step": 3060
},
{
"epoch": 0.8437134402312297,
"grad_norm": 0.4017748109392074,
"learning_rate": 3.2588795412417715e-07,
"loss": 0.8081,
"step": 3065
},
{
"epoch": 0.8450898079966968,
"grad_norm": 0.43509526046571056,
"learning_rate": 3.20291178830148e-07,
"loss": 0.7696,
"step": 3070
},
{
"epoch": 0.8464661757621637,
"grad_norm": 0.4968191608476648,
"learning_rate": 3.1473958905725023e-07,
"loss": 0.8007,
"step": 3075
},
{
"epoch": 0.8478425435276306,
"grad_norm": 0.35779440235114646,
"learning_rate": 3.092332998903416e-07,
"loss": 0.7844,
"step": 3080
},
{
"epoch": 0.8492189112930976,
"grad_norm": 0.41824405953159766,
"learning_rate": 3.0377242547519224e-07,
"loss": 0.8119,
"step": 3085
},
{
"epoch": 0.8505952790585645,
"grad_norm": 0.3627881737322641,
"learning_rate": 2.983570790161236e-07,
"loss": 0.7926,
"step": 3090
},
{
"epoch": 0.8519716468240314,
"grad_norm": 0.6401049982386239,
"learning_rate": 2.9298737277365875e-07,
"loss": 0.7957,
"step": 3095
},
{
"epoch": 0.8533480145894983,
"grad_norm": 0.4518026617163562,
"learning_rate": 2.8766341806219565e-07,
"loss": 0.8071,
"step": 3100
},
{
"epoch": 0.8547243823549653,
"grad_norm": 0.431837948798665,
"learning_rate": 2.823853252476988e-07,
"loss": 0.8007,
"step": 3105
},
{
"epoch": 0.8561007501204322,
"grad_norm": 0.347884381355328,
"learning_rate": 2.771532037454136e-07,
"loss": 0.8173,
"step": 3110
},
{
"epoch": 0.8574771178858991,
"grad_norm": 0.4330221328925689,
"learning_rate": 2.719671620175968e-07,
"loss": 0.7266,
"step": 3115
},
{
"epoch": 0.8588534856513661,
"grad_norm": 0.3637332745725778,
"learning_rate": 2.6682730757126627e-07,
"loss": 0.8076,
"step": 3120
},
{
"epoch": 0.860229853416833,
"grad_norm": 0.6109148107559905,
"learning_rate": 2.6173374695597693e-07,
"loss": 0.8339,
"step": 3125
},
{
"epoch": 0.8616062211822999,
"grad_norm": 0.5076827526837399,
"learning_rate": 2.566865857616066e-07,
"loss": 0.8432,
"step": 3130
},
{
"epoch": 0.8629825889477668,
"grad_norm": 0.5457670411835718,
"learning_rate": 2.5168592861617216e-07,
"loss": 0.7928,
"step": 3135
},
{
"epoch": 0.8643589567132338,
"grad_norm": 0.41906024544255965,
"learning_rate": 2.4673187918365593e-07,
"loss": 0.7741,
"step": 3140
},
{
"epoch": 0.8657353244787007,
"grad_norm": 0.5161588655724126,
"learning_rate": 2.4182454016186046e-07,
"loss": 0.8115,
"step": 3145
},
{
"epoch": 0.8671116922441676,
"grad_norm": 0.3816011931932575,
"learning_rate": 2.3696401328027806e-07,
"loss": 0.7693,
"step": 3150
},
{
"epoch": 0.8684880600096345,
"grad_norm": 0.4654707569379619,
"learning_rate": 2.3215039929798205e-07,
"loss": 0.8122,
"step": 3155
},
{
"epoch": 0.8698644277751015,
"grad_norm": 0.306667830180016,
"learning_rate": 2.2738379800153641e-07,
"loss": 0.7612,
"step": 3160
},
{
"epoch": 0.8712407955405684,
"grad_norm": 0.46545229964609797,
"learning_rate": 2.226643082029309e-07,
"loss": 0.7892,
"step": 3165
},
{
"epoch": 0.8726171633060353,
"grad_norm": 0.1953075510516031,
"learning_rate": 2.1799202773752943e-07,
"loss": 0.7521,
"step": 3170
},
{
"epoch": 0.8739935310715023,
"grad_norm": 0.381786701995952,
"learning_rate": 2.1336705346204301e-07,
"loss": 0.8512,
"step": 3175
},
{
"epoch": 0.8753698988369693,
"grad_norm": 0.43933009705251314,
"learning_rate": 2.087894812525218e-07,
"loss": 0.8025,
"step": 3180
},
{
"epoch": 0.8767462666024362,
"grad_norm": 0.44005358531346456,
"learning_rate": 2.042594060023681e-07,
"loss": 0.7756,
"step": 3185
},
{
"epoch": 0.878122634367903,
"grad_norm": 0.49824718850948474,
"learning_rate": 1.9977692162036876e-07,
"loss": 0.7978,
"step": 3190
},
{
"epoch": 0.8794990021333701,
"grad_norm": 0.3227198829491034,
"learning_rate": 1.95342121028749e-07,
"loss": 0.7738,
"step": 3195
},
{
"epoch": 0.880875369898837,
"grad_norm": 0.34555908654114686,
"learning_rate": 1.9095509616124385e-07,
"loss": 0.8192,
"step": 3200
},
{
"epoch": 0.880875369898837,
"eval_loss": 0.7828695774078369,
"eval_runtime": 37.5732,
"eval_samples_per_second": 133.074,
"eval_steps_per_second": 2.103,
"step": 3200
},
{
"epoch": 0.8822517376643039,
"grad_norm": 0.4058193409462779,
"learning_rate": 1.866159379611965e-07,
"loss": 0.7827,
"step": 3205
},
{
"epoch": 0.8836281054297709,
"grad_norm": 0.634249713668319,
"learning_rate": 1.8232473637966874e-07,
"loss": 0.8316,
"step": 3210
},
{
"epoch": 0.8850044731952378,
"grad_norm": 0.47086197421634446,
"learning_rate": 1.7808158037357997e-07,
"loss": 0.8106,
"step": 3215
},
{
"epoch": 0.8863808409607047,
"grad_norm": 0.4923463916845965,
"learning_rate": 1.7388655790385928e-07,
"loss": 0.7618,
"step": 3220
},
{
"epoch": 0.8877572087261716,
"grad_norm": 0.5706429066921409,
"learning_rate": 1.6973975593362557e-07,
"loss": 0.8026,
"step": 3225
},
{
"epoch": 0.8891335764916386,
"grad_norm": 0.40654338758220415,
"learning_rate": 1.656412604263824e-07,
"loss": 0.805,
"step": 3230
},
{
"epoch": 0.8905099442571055,
"grad_norm": 0.36239810433911784,
"learning_rate": 1.615911563442385e-07,
"loss": 0.7901,
"step": 3235
},
{
"epoch": 0.8918863120225724,
"grad_norm": 0.3902839217508603,
"learning_rate": 1.5758952764614254e-07,
"loss": 0.772,
"step": 3240
},
{
"epoch": 0.8932626797880394,
"grad_norm": 0.4730030214965539,
"learning_rate": 1.536364572861465e-07,
"loss": 0.7981,
"step": 3245
},
{
"epoch": 0.8946390475535063,
"grad_norm": 0.5222497792049411,
"learning_rate": 1.4973202721168452e-07,
"loss": 0.774,
"step": 3250
},
{
"epoch": 0.8960154153189732,
"grad_norm": 0.457253030971323,
"learning_rate": 1.4587631836187362e-07,
"loss": 0.7762,
"step": 3255
},
{
"epoch": 0.8973917830844401,
"grad_norm": 0.35084396615645064,
"learning_rate": 1.420694106658363e-07,
"loss": 0.7956,
"step": 3260
},
{
"epoch": 0.8987681508499071,
"grad_norm": 0.46947234334867594,
"learning_rate": 1.3831138304104374e-07,
"loss": 0.7488,
"step": 3265
},
{
"epoch": 0.900144518615374,
"grad_norm": 0.4211703728724711,
"learning_rate": 1.3460231339168018e-07,
"loss": 0.7594,
"step": 3270
},
{
"epoch": 0.9015208863808409,
"grad_norm": 0.4318480909616356,
"learning_rate": 1.3094227860702636e-07,
"loss": 0.7535,
"step": 3275
},
{
"epoch": 0.902897254146308,
"grad_norm": 0.5304843144531105,
"learning_rate": 1.2733135455986755e-07,
"loss": 0.7631,
"step": 3280
},
{
"epoch": 0.9042736219117748,
"grad_norm": 0.39712712425481755,
"learning_rate": 1.237696161049201e-07,
"loss": 0.7967,
"step": 3285
},
{
"epoch": 0.9056499896772418,
"grad_norm": 0.3607138196755753,
"learning_rate": 1.2025713707727954e-07,
"loss": 0.7673,
"step": 3290
},
{
"epoch": 0.9070263574427087,
"grad_norm": 0.36526857232256044,
"learning_rate": 1.1679399029088878e-07,
"loss": 0.8021,
"step": 3295
},
{
"epoch": 0.9084027252081757,
"grad_norm": 0.4307690645765424,
"learning_rate": 1.1338024753703076e-07,
"loss": 0.7855,
"step": 3300
},
{
"epoch": 0.9097790929736426,
"grad_norm": 0.5370242540693829,
"learning_rate": 1.1001597958283927e-07,
"loss": 0.7942,
"step": 3305
},
{
"epoch": 0.9111554607391095,
"grad_norm": 0.762967780858564,
"learning_rate": 1.067012561698319e-07,
"loss": 0.7809,
"step": 3310
},
{
"epoch": 0.9125318285045764,
"grad_norm": 0.38146623134983393,
"learning_rate": 1.0343614601246388e-07,
"loss": 0.8512,
"step": 3315
},
{
"epoch": 0.9139081962700434,
"grad_norm": 0.483635357463634,
"learning_rate": 1.0022071679670426e-07,
"loss": 0.8334,
"step": 3320
},
{
"epoch": 0.9152845640355103,
"grad_norm": 0.45987532559150957,
"learning_rate": 9.705503517863286e-08,
"loss": 0.7624,
"step": 3325
},
{
"epoch": 0.9166609318009772,
"grad_norm": 0.5146306345364831,
"learning_rate": 9.393916678305831e-08,
"loss": 0.781,
"step": 3330
},
{
"epoch": 0.9180372995664442,
"grad_norm": 0.4353941113364763,
"learning_rate": 9.087317620215642e-08,
"loss": 0.7926,
"step": 3335
},
{
"epoch": 0.9194136673319111,
"grad_norm": 0.5238188017998141,
"learning_rate": 8.78571269941339e-08,
"loss": 0.7944,
"step": 3340
},
{
"epoch": 0.920790035097378,
"grad_norm": 0.39784081936662746,
"learning_rate": 8.48910816819079e-08,
"loss": 0.7456,
"step": 3345
},
{
"epoch": 0.9221664028628449,
"grad_norm": 0.4021648902631367,
"learning_rate": 8.197510175181279e-08,
"loss": 0.7972,
"step": 3350
},
{
"epoch": 0.9235427706283119,
"grad_norm": 0.4535198430400062,
"learning_rate": 7.910924765232169e-08,
"loss": 0.7716,
"step": 3355
},
{
"epoch": 0.9249191383937788,
"grad_norm": 0.4679043443981073,
"learning_rate": 7.629357879279764e-08,
"loss": 0.8151,
"step": 3360
},
{
"epoch": 0.9262955061592457,
"grad_norm": 0.4138512577922224,
"learning_rate": 7.352815354225856e-08,
"loss": 0.7778,
"step": 3365
},
{
"epoch": 0.9276718739247127,
"grad_norm": 0.4959390023966221,
"learning_rate": 7.08130292281703e-08,
"loss": 0.7654,
"step": 3370
},
{
"epoch": 0.9290482416901796,
"grad_norm": 0.3635744148121797,
"learning_rate": 6.8148262135255e-08,
"loss": 0.7713,
"step": 3375
},
{
"epoch": 0.9304246094556465,
"grad_norm": 0.3880051004435765,
"learning_rate": 6.553390750432709e-08,
"loss": 0.797,
"step": 3380
},
{
"epoch": 0.9318009772211134,
"grad_norm": 0.376848176957449,
"learning_rate": 6.297001953114696e-08,
"loss": 0.7915,
"step": 3385
},
{
"epoch": 0.9331773449865804,
"grad_norm": 0.5229520565025577,
"learning_rate": 6.045665136529683e-08,
"loss": 0.7831,
"step": 3390
},
{
"epoch": 0.9345537127520473,
"grad_norm": 0.3749002422429637,
"learning_rate": 5.799385510908029e-08,
"loss": 0.813,
"step": 3395
},
{
"epoch": 0.9359300805175143,
"grad_norm": 0.538377010361361,
"learning_rate": 5.558168181644147e-08,
"loss": 0.765,
"step": 3400
},
{
"epoch": 0.9359300805175143,
"eval_loss": 0.7824124693870544,
"eval_runtime": 37.5729,
"eval_samples_per_second": 133.075,
"eval_steps_per_second": 2.103,
"step": 3400
},
{
"epoch": 0.9373064482829813,
"grad_norm": 0.40542132743231607,
"learning_rate": 5.3220181491906997e-08,
"loss": 0.7939,
"step": 3405
},
{
"epoch": 0.9386828160484482,
"grad_norm": 0.45653448883349285,
"learning_rate": 5.0909403089548504e-08,
"loss": 0.7683,
"step": 3410
},
{
"epoch": 0.9400591838139151,
"grad_norm": 0.43498449283812496,
"learning_rate": 4.864939451196926e-08,
"loss": 0.7706,
"step": 3415
},
{
"epoch": 0.941435551579382,
"grad_norm": 0.4632642432067583,
"learning_rate": 4.6440202609309983e-08,
"loss": 0.847,
"step": 3420
},
{
"epoch": 0.942811919344849,
"grad_norm": 0.398573024905695,
"learning_rate": 4.428187317827848e-08,
"loss": 0.8004,
"step": 3425
},
{
"epoch": 0.9441882871103159,
"grad_norm": 0.3520961675944394,
"learning_rate": 4.217445096119932e-08,
"loss": 0.7768,
"step": 3430
},
{
"epoch": 0.9455646548757828,
"grad_norm": 0.2659907213915567,
"learning_rate": 4.011797964508707e-08,
"loss": 0.8068,
"step": 3435
},
{
"epoch": 0.9469410226412498,
"grad_norm": 0.35721378611449023,
"learning_rate": 3.8112501860740893e-08,
"loss": 0.7761,
"step": 3440
},
{
"epoch": 0.9483173904067167,
"grad_norm": 0.5662605277154725,
"learning_rate": 3.615805918185999e-08,
"loss": 0.7956,
"step": 3445
},
{
"epoch": 0.9496937581721836,
"grad_norm": 0.4708997292827784,
"learning_rate": 3.4254692124181256e-08,
"loss": 0.781,
"step": 3450
},
{
"epoch": 0.9510701259376505,
"grad_norm": 0.3463983685723411,
"learning_rate": 3.240244014464211e-08,
"loss": 0.8038,
"step": 3455
},
{
"epoch": 0.9524464937031175,
"grad_norm": 0.30914041030379646,
"learning_rate": 3.060134164055928e-08,
"loss": 0.7855,
"step": 3460
},
{
"epoch": 0.9538228614685844,
"grad_norm": 0.4094929093409069,
"learning_rate": 2.885143394883466e-08,
"loss": 0.7922,
"step": 3465
},
{
"epoch": 0.9551992292340513,
"grad_norm": 0.3550357942305495,
"learning_rate": 2.7152753345181248e-08,
"loss": 0.7488,
"step": 3470
},
{
"epoch": 0.9565755969995182,
"grad_norm": 0.4969121322892503,
"learning_rate": 2.5505335043370105e-08,
"loss": 0.8235,
"step": 3475
},
{
"epoch": 0.9579519647649852,
"grad_norm": 0.5594870015775546,
"learning_rate": 2.3909213194501513e-08,
"loss": 0.8019,
"step": 3480
},
{
"epoch": 0.9593283325304521,
"grad_norm": 0.3982458733337497,
"learning_rate": 2.2364420886297202e-08,
"loss": 0.7931,
"step": 3485
},
{
"epoch": 0.960704700295919,
"grad_norm": 0.40600940068470787,
"learning_rate": 2.087099014241256e-08,
"loss": 0.7751,
"step": 3490
},
{
"epoch": 0.962081068061386,
"grad_norm": 0.5688667017637077,
"learning_rate": 1.9428951921774687e-08,
"loss": 0.8253,
"step": 3495
},
{
"epoch": 0.963457435826853,
"grad_norm": 0.41507791635586866,
"learning_rate": 1.8038336117940368e-08,
"loss": 0.7615,
"step": 3500
},
{
"epoch": 0.9648338035923198,
"grad_norm": 0.3974167729618538,
"learning_rate": 1.6699171558474946e-08,
"loss": 0.7943,
"step": 3505
},
{
"epoch": 0.9662101713577868,
"grad_norm": 0.6940727253100508,
"learning_rate": 1.541148600435721e-08,
"loss": 0.8198,
"step": 3510
},
{
"epoch": 0.9675865391232538,
"grad_norm": 0.46568158788586206,
"learning_rate": 1.4175306149400715e-08,
"loss": 0.8164,
"step": 3515
},
{
"epoch": 0.9689629068887207,
"grad_norm": 0.5038186258746858,
"learning_rate": 1.2990657619703361e-08,
"loss": 0.755,
"step": 3520
},
{
"epoch": 0.9703392746541876,
"grad_norm": 0.44963940045587836,
"learning_rate": 1.1857564973114798e-08,
"loss": 0.8276,
"step": 3525
},
{
"epoch": 0.9717156424196546,
"grad_norm": 0.3869453345256143,
"learning_rate": 1.0776051698727363e-08,
"loss": 0.7643,
"step": 3530
},
{
"epoch": 0.9730920101851215,
"grad_norm": 0.4625561038516912,
"learning_rate": 9.746140216388978e-09,
"loss": 0.7961,
"step": 3535
},
{
"epoch": 0.9744683779505884,
"grad_norm": 0.43657476353887914,
"learning_rate": 8.767851876239075e-09,
"loss": 0.785,
"step": 3540
},
{
"epoch": 0.9758447457160553,
"grad_norm": 0.36886304870486564,
"learning_rate": 7.841206958265901e-09,
"loss": 0.8109,
"step": 3545
},
{
"epoch": 0.9772211134815223,
"grad_norm": 0.48365435933116596,
"learning_rate": 6.9662246718849025e-09,
"loss": 0.805,
"step": 3550
},
{
"epoch": 0.9785974812469892,
"grad_norm": 0.5676220156571874,
"learning_rate": 6.142923155542379e-09,
"loss": 0.8249,
"step": 3555
},
{
"epoch": 0.9799738490124561,
"grad_norm": 0.5777045577708868,
"learning_rate": 5.371319476338288e-09,
"loss": 0.8371,
"step": 3560
},
{
"epoch": 0.9813502167779231,
"grad_norm": 0.4285639278318182,
"learning_rate": 4.651429629672077e-09,
"loss": 0.8493,
"step": 3565
},
{
"epoch": 0.98272658454339,
"grad_norm": 0.5788587404914071,
"learning_rate": 3.9832685389123995e-09,
"loss": 0.8533,
"step": 3570
},
{
"epoch": 0.9841029523088569,
"grad_norm": 0.504857407057922,
"learning_rate": 3.3668500550870787e-09,
"loss": 0.8482,
"step": 3575
},
{
"epoch": 0.9854793200743238,
"grad_norm": 0.48362838745693276,
"learning_rate": 2.8021869565958427e-09,
"loss": 0.7877,
"step": 3580
},
{
"epoch": 0.9868556878397908,
"grad_norm": 0.4381509362003844,
"learning_rate": 2.289290948944978e-09,
"loss": 0.8337,
"step": 3585
},
{
"epoch": 0.9882320556052577,
"grad_norm": 0.35883074278050137,
"learning_rate": 1.8281726645061338e-09,
"loss": 0.8103,
"step": 3590
},
{
"epoch": 0.9896084233707246,
"grad_norm": 0.30791028488533356,
"learning_rate": 1.4188416622945566e-09,
"loss": 0.7772,
"step": 3595
},
{
"epoch": 0.9909847911361916,
"grad_norm": 0.519345784734355,
"learning_rate": 1.0613064277711916e-09,
"loss": 0.7939,
"step": 3600
},
{
"epoch": 0.9909847911361916,
"eval_loss": 0.7823675870895386,
"eval_runtime": 37.5664,
"eval_samples_per_second": 133.098,
"eval_steps_per_second": 2.103,
"step": 3600
},
{
"epoch": 0.9923611589016585,
"grad_norm": 0.5511524759279022,
"learning_rate": 7.555743726675446e-10,
"loss": 0.7623,
"step": 3605
},
{
"epoch": 0.9937375266671254,
"grad_norm": 0.46879838306977284,
"learning_rate": 5.01651834831085e-10,
"loss": 0.7906,
"step": 3610
},
{
"epoch": 0.9951138944325923,
"grad_norm": 0.5200935954838051,
"learning_rate": 2.9954407809423823e-10,
"loss": 0.788,
"step": 3615
},
{
"epoch": 0.9964902621980594,
"grad_norm": 0.47789258781015914,
"learning_rate": 1.4925529216558432e-10,
"loss": 0.7885,
"step": 3620
},
{
"epoch": 0.9978666299635263,
"grad_norm": 0.2942434423794945,
"learning_rate": 5.078859254242785e-11,
"loss": 0.8001,
"step": 3625
},
{
"epoch": 0.9992429977289932,
"grad_norm": 0.49341815424912977,
"learning_rate": 4.1460204466825526e-12,
"loss": 0.8235,
"step": 3630
},
{
"epoch": 0.99979354483518,
"step": 3632,
"total_flos": 1258708224212992.0,
"train_loss": 0.935843440076328,
"train_runtime": 26049.5256,
"train_samples_per_second": 35.699,
"train_steps_per_second": 0.139
}
],
"logging_steps": 5,
"max_steps": 3632,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1258708224212992.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}