{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998984083982391,
"eval_steps": 500,
"global_step": 4428,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006772773450728074,
"grad_norm": 11.0625,
"learning_rate": 4.514672686230248e-08,
"loss": 2.4477,
"step": 1
},
{
"epoch": 0.0033863867253640365,
"grad_norm": 11.75,
"learning_rate": 2.2573363431151243e-07,
"loss": 2.4743,
"step": 5
},
{
"epoch": 0.006772773450728073,
"grad_norm": 12.625,
"learning_rate": 4.5146726862302486e-07,
"loss": 2.4675,
"step": 10
},
{
"epoch": 0.01015916017609211,
"grad_norm": 11.8125,
"learning_rate": 6.772009029345373e-07,
"loss": 2.4607,
"step": 15
},
{
"epoch": 0.013545546901456146,
"grad_norm": 11.9375,
"learning_rate": 9.029345372460497e-07,
"loss": 2.4413,
"step": 20
},
{
"epoch": 0.016931933626820182,
"grad_norm": 11.4375,
"learning_rate": 1.1286681715575621e-06,
"loss": 2.4661,
"step": 25
},
{
"epoch": 0.02031832035218422,
"grad_norm": 10.6875,
"learning_rate": 1.3544018058690746e-06,
"loss": 2.4284,
"step": 30
},
{
"epoch": 0.023704707077548254,
"grad_norm": 9.0625,
"learning_rate": 1.5801354401805871e-06,
"loss": 2.4384,
"step": 35
},
{
"epoch": 0.027091093802912292,
"grad_norm": 7.53125,
"learning_rate": 1.8058690744920994e-06,
"loss": 2.4398,
"step": 40
},
{
"epoch": 0.03047748052827633,
"grad_norm": 5.40625,
"learning_rate": 2.0316027088036117e-06,
"loss": 2.4001,
"step": 45
},
{
"epoch": 0.033863867253640365,
"grad_norm": 3.75,
"learning_rate": 2.2573363431151243e-06,
"loss": 2.3842,
"step": 50
},
{
"epoch": 0.0372502539790044,
"grad_norm": 3.03125,
"learning_rate": 2.4830699774266368e-06,
"loss": 2.3908,
"step": 55
},
{
"epoch": 0.04063664070436844,
"grad_norm": 2.609375,
"learning_rate": 2.7088036117381493e-06,
"loss": 2.3855,
"step": 60
},
{
"epoch": 0.04402302742973248,
"grad_norm": 2.484375,
"learning_rate": 2.9345372460496618e-06,
"loss": 2.3649,
"step": 65
},
{
"epoch": 0.04740941415509651,
"grad_norm": 2.28125,
"learning_rate": 3.1602708803611743e-06,
"loss": 2.3336,
"step": 70
},
{
"epoch": 0.05079580088046055,
"grad_norm": 1.9140625,
"learning_rate": 3.386004514672687e-06,
"loss": 2.3418,
"step": 75
},
{
"epoch": 0.054182187605824585,
"grad_norm": 1.765625,
"learning_rate": 3.611738148984199e-06,
"loss": 2.3162,
"step": 80
},
{
"epoch": 0.05756857433118862,
"grad_norm": 1.796875,
"learning_rate": 3.837471783295712e-06,
"loss": 2.274,
"step": 85
},
{
"epoch": 0.06095496105655266,
"grad_norm": 1.6875,
"learning_rate": 4.0632054176072235e-06,
"loss": 2.2926,
"step": 90
},
{
"epoch": 0.06434134778191669,
"grad_norm": 1.859375,
"learning_rate": 4.288939051918736e-06,
"loss": 2.2671,
"step": 95
},
{
"epoch": 0.06772773450728073,
"grad_norm": 1.578125,
"learning_rate": 4.5146726862302485e-06,
"loss": 2.255,
"step": 100
},
{
"epoch": 0.07111412123264477,
"grad_norm": 1.546875,
"learning_rate": 4.740406320541761e-06,
"loss": 2.2431,
"step": 105
},
{
"epoch": 0.0745005079580088,
"grad_norm": 1.4765625,
"learning_rate": 4.9661399548532735e-06,
"loss": 2.2486,
"step": 110
},
{
"epoch": 0.07788689468337284,
"grad_norm": 1.5,
"learning_rate": 5.191873589164786e-06,
"loss": 2.249,
"step": 115
},
{
"epoch": 0.08127328140873688,
"grad_norm": 1.3984375,
"learning_rate": 5.4176072234762986e-06,
"loss": 2.2278,
"step": 120
},
{
"epoch": 0.08465966813410092,
"grad_norm": 1.375,
"learning_rate": 5.643340857787811e-06,
"loss": 2.2095,
"step": 125
},
{
"epoch": 0.08804605485946496,
"grad_norm": 1.375,
"learning_rate": 5.8690744920993236e-06,
"loss": 2.2445,
"step": 130
},
{
"epoch": 0.091432441584829,
"grad_norm": 1.390625,
"learning_rate": 6.094808126410836e-06,
"loss": 2.2414,
"step": 135
},
{
"epoch": 0.09481882831019302,
"grad_norm": 1.3671875,
"learning_rate": 6.320541760722349e-06,
"loss": 2.1974,
"step": 140
},
{
"epoch": 0.09820521503555706,
"grad_norm": 1.3359375,
"learning_rate": 6.546275395033861e-06,
"loss": 2.1979,
"step": 145
},
{
"epoch": 0.1015916017609211,
"grad_norm": 1.3203125,
"learning_rate": 6.772009029345374e-06,
"loss": 2.1945,
"step": 150
},
{
"epoch": 0.10497798848628513,
"grad_norm": 1.3828125,
"learning_rate": 6.997742663656886e-06,
"loss": 2.2061,
"step": 155
},
{
"epoch": 0.10836437521164917,
"grad_norm": 1.328125,
"learning_rate": 7.223476297968398e-06,
"loss": 2.2234,
"step": 160
},
{
"epoch": 0.11175076193701321,
"grad_norm": 1.34375,
"learning_rate": 7.44920993227991e-06,
"loss": 2.1885,
"step": 165
},
{
"epoch": 0.11513714866237724,
"grad_norm": 1.3046875,
"learning_rate": 7.674943566591424e-06,
"loss": 2.187,
"step": 170
},
{
"epoch": 0.11852353538774128,
"grad_norm": 1.40625,
"learning_rate": 7.900677200902936e-06,
"loss": 2.2022,
"step": 175
},
{
"epoch": 0.12190992211310532,
"grad_norm": 1.359375,
"learning_rate": 8.126410835214447e-06,
"loss": 2.2013,
"step": 180
},
{
"epoch": 0.12529630883846934,
"grad_norm": 1.3515625,
"learning_rate": 8.35214446952596e-06,
"loss": 2.1984,
"step": 185
},
{
"epoch": 0.12868269556383338,
"grad_norm": 1.3125,
"learning_rate": 8.577878103837472e-06,
"loss": 2.1861,
"step": 190
},
{
"epoch": 0.13206908228919742,
"grad_norm": 1.3359375,
"learning_rate": 8.803611738148985e-06,
"loss": 2.2115,
"step": 195
},
{
"epoch": 0.13545546901456146,
"grad_norm": 1.3359375,
"learning_rate": 9.029345372460497e-06,
"loss": 2.1902,
"step": 200
},
{
"epoch": 0.1388418557399255,
"grad_norm": 1.2890625,
"learning_rate": 9.25507900677201e-06,
"loss": 2.2061,
"step": 205
},
{
"epoch": 0.14222824246528953,
"grad_norm": 1.3046875,
"learning_rate": 9.480812641083522e-06,
"loss": 2.1917,
"step": 210
},
{
"epoch": 0.14561462919065357,
"grad_norm": 1.3046875,
"learning_rate": 9.706546275395035e-06,
"loss": 2.1859,
"step": 215
},
{
"epoch": 0.1490010159160176,
"grad_norm": 1.265625,
"learning_rate": 9.932279909706547e-06,
"loss": 2.1786,
"step": 220
},
{
"epoch": 0.15238740264138165,
"grad_norm": 1.3515625,
"learning_rate": 1.015801354401806e-05,
"loss": 2.1978,
"step": 225
},
{
"epoch": 0.15577378936674569,
"grad_norm": 1.2890625,
"learning_rate": 1.0383747178329572e-05,
"loss": 2.1843,
"step": 230
},
{
"epoch": 0.15916017609210972,
"grad_norm": 1.296875,
"learning_rate": 1.0609480812641085e-05,
"loss": 2.1768,
"step": 235
},
{
"epoch": 0.16254656281747376,
"grad_norm": 1.3515625,
"learning_rate": 1.0835214446952597e-05,
"loss": 2.1826,
"step": 240
},
{
"epoch": 0.1659329495428378,
"grad_norm": 1.2734375,
"learning_rate": 1.106094808126411e-05,
"loss": 2.1822,
"step": 245
},
{
"epoch": 0.16931933626820184,
"grad_norm": 1.265625,
"learning_rate": 1.1286681715575622e-05,
"loss": 2.1865,
"step": 250
},
{
"epoch": 0.17270572299356587,
"grad_norm": 1.2421875,
"learning_rate": 1.1512415349887135e-05,
"loss": 2.1902,
"step": 255
},
{
"epoch": 0.1760921097189299,
"grad_norm": 1.3125,
"learning_rate": 1.1738148984198647e-05,
"loss": 2.1975,
"step": 260
},
{
"epoch": 0.17947849644429395,
"grad_norm": 1.2890625,
"learning_rate": 1.1963882618510158e-05,
"loss": 2.1951,
"step": 265
},
{
"epoch": 0.182864883169658,
"grad_norm": 1.3203125,
"learning_rate": 1.2189616252821672e-05,
"loss": 2.1777,
"step": 270
},
{
"epoch": 0.186251269895022,
"grad_norm": 1.296875,
"learning_rate": 1.2415349887133183e-05,
"loss": 2.1964,
"step": 275
},
{
"epoch": 0.18963765662038604,
"grad_norm": 1.265625,
"learning_rate": 1.2641083521444697e-05,
"loss": 2.1692,
"step": 280
},
{
"epoch": 0.19302404334575007,
"grad_norm": 1.296875,
"learning_rate": 1.2866817155756208e-05,
"loss": 2.1907,
"step": 285
},
{
"epoch": 0.1964104300711141,
"grad_norm": 1.328125,
"learning_rate": 1.3092550790067722e-05,
"loss": 2.17,
"step": 290
},
{
"epoch": 0.19979681679647815,
"grad_norm": 1.2578125,
"learning_rate": 1.3318284424379233e-05,
"loss": 2.1558,
"step": 295
},
{
"epoch": 0.2031832035218422,
"grad_norm": 1.3125,
"learning_rate": 1.3544018058690747e-05,
"loss": 2.184,
"step": 300
},
{
"epoch": 0.20656959024720623,
"grad_norm": 1.2734375,
"learning_rate": 1.3769751693002258e-05,
"loss": 2.1542,
"step": 305
},
{
"epoch": 0.20995597697257026,
"grad_norm": 1.265625,
"learning_rate": 1.3995485327313772e-05,
"loss": 2.2053,
"step": 310
},
{
"epoch": 0.2133423636979343,
"grad_norm": 1.2734375,
"learning_rate": 1.4221218961625283e-05,
"loss": 2.1536,
"step": 315
},
{
"epoch": 0.21672875042329834,
"grad_norm": 1.25,
"learning_rate": 1.4446952595936796e-05,
"loss": 2.1703,
"step": 320
},
{
"epoch": 0.22011513714866238,
"grad_norm": 1.3046875,
"learning_rate": 1.4672686230248308e-05,
"loss": 2.1635,
"step": 325
},
{
"epoch": 0.22350152387402641,
"grad_norm": 1.2734375,
"learning_rate": 1.489841986455982e-05,
"loss": 2.167,
"step": 330
},
{
"epoch": 0.22688791059939045,
"grad_norm": 1.2421875,
"learning_rate": 1.5124153498871333e-05,
"loss": 2.193,
"step": 335
},
{
"epoch": 0.2302742973247545,
"grad_norm": 1.328125,
"learning_rate": 1.5349887133182847e-05,
"loss": 2.1613,
"step": 340
},
{
"epoch": 0.23366068405011853,
"grad_norm": 1.2109375,
"learning_rate": 1.5575620767494356e-05,
"loss": 2.1805,
"step": 345
},
{
"epoch": 0.23704707077548257,
"grad_norm": 1.265625,
"learning_rate": 1.5801354401805872e-05,
"loss": 2.1765,
"step": 350
},
{
"epoch": 0.2404334575008466,
"grad_norm": 1.21875,
"learning_rate": 1.602708803611738e-05,
"loss": 2.1734,
"step": 355
},
{
"epoch": 0.24381984422621064,
"grad_norm": 1.265625,
"learning_rate": 1.6252821670428894e-05,
"loss": 2.1681,
"step": 360
},
{
"epoch": 0.24720623095157468,
"grad_norm": 1.2421875,
"learning_rate": 1.6478555304740406e-05,
"loss": 2.1865,
"step": 365
},
{
"epoch": 0.2505926176769387,
"grad_norm": 1.25,
"learning_rate": 1.670428893905192e-05,
"loss": 2.1665,
"step": 370
},
{
"epoch": 0.25397900440230275,
"grad_norm": 1.1875,
"learning_rate": 1.693002257336343e-05,
"loss": 2.1621,
"step": 375
},
{
"epoch": 0.25736539112766676,
"grad_norm": 1.2421875,
"learning_rate": 1.7155756207674944e-05,
"loss": 2.186,
"step": 380
},
{
"epoch": 0.26075177785303083,
"grad_norm": 1.2890625,
"learning_rate": 1.7381489841986457e-05,
"loss": 2.1651,
"step": 385
},
{
"epoch": 0.26413816457839484,
"grad_norm": 1.2421875,
"learning_rate": 1.760722347629797e-05,
"loss": 2.1786,
"step": 390
},
{
"epoch": 0.2675245513037589,
"grad_norm": 1.25,
"learning_rate": 1.783295711060948e-05,
"loss": 2.1472,
"step": 395
},
{
"epoch": 0.2709109380291229,
"grad_norm": 1.265625,
"learning_rate": 1.8058690744920994e-05,
"loss": 2.1712,
"step": 400
},
{
"epoch": 0.274297324754487,
"grad_norm": 1.2109375,
"learning_rate": 1.8284424379232507e-05,
"loss": 2.1611,
"step": 405
},
{
"epoch": 0.277683711479851,
"grad_norm": 1.265625,
"learning_rate": 1.851015801354402e-05,
"loss": 2.1727,
"step": 410
},
{
"epoch": 0.28107009820521506,
"grad_norm": 1.21875,
"learning_rate": 1.873589164785553e-05,
"loss": 2.1499,
"step": 415
},
{
"epoch": 0.28445648493057907,
"grad_norm": 1.2265625,
"learning_rate": 1.8961625282167044e-05,
"loss": 2.1616,
"step": 420
},
{
"epoch": 0.28784287165594313,
"grad_norm": 1.203125,
"learning_rate": 1.9187358916478557e-05,
"loss": 2.1646,
"step": 425
},
{
"epoch": 0.29122925838130714,
"grad_norm": 1.234375,
"learning_rate": 1.941309255079007e-05,
"loss": 2.1476,
"step": 430
},
{
"epoch": 0.2946156451066712,
"grad_norm": 1.28125,
"learning_rate": 1.963882618510158e-05,
"loss": 2.1641,
"step": 435
},
{
"epoch": 0.2980020318320352,
"grad_norm": 1.234375,
"learning_rate": 1.9864559819413094e-05,
"loss": 2.1577,
"step": 440
},
{
"epoch": 0.30138841855739923,
"grad_norm": 1.2265625,
"learning_rate": 1.999998756994645e-05,
"loss": 2.1505,
"step": 445
},
{
"epoch": 0.3047748052827633,
"grad_norm": 1.2109375,
"learning_rate": 1.99998477321989e-05,
"loss": 2.1605,
"step": 450
},
{
"epoch": 0.3081611920081273,
"grad_norm": 1.265625,
"learning_rate": 1.9999552521316842e-05,
"loss": 2.1373,
"step": 455
},
{
"epoch": 0.31154757873349137,
"grad_norm": 1.2265625,
"learning_rate": 1.9999101941887125e-05,
"loss": 2.1473,
"step": 460
},
{
"epoch": 0.3149339654588554,
"grad_norm": 1.234375,
"learning_rate": 1.9998496000910653e-05,
"loss": 2.1594,
"step": 465
},
{
"epoch": 0.31832035218421945,
"grad_norm": 1.2890625,
"learning_rate": 1.999773470780227e-05,
"loss": 2.1588,
"step": 470
},
{
"epoch": 0.32170673890958346,
"grad_norm": 1.2734375,
"learning_rate": 1.999681807439059e-05,
"loss": 2.1672,
"step": 475
},
{
"epoch": 0.3250931256349475,
"grad_norm": 1.203125,
"learning_rate": 1.9995746114917866e-05,
"loss": 2.1631,
"step": 480
},
{
"epoch": 0.32847951236031153,
"grad_norm": 1.234375,
"learning_rate": 1.9994518846039715e-05,
"loss": 2.156,
"step": 485
},
{
"epoch": 0.3318658990856756,
"grad_norm": 1.2265625,
"learning_rate": 1.9993136286824894e-05,
"loss": 2.1593,
"step": 490
},
{
"epoch": 0.3352522858110396,
"grad_norm": 1.1640625,
"learning_rate": 1.999159845875498e-05,
"loss": 2.174,
"step": 495
},
{
"epoch": 0.3386386725364037,
"grad_norm": 1.21875,
"learning_rate": 1.9989905385724052e-05,
"loss": 2.1444,
"step": 500
},
{
"epoch": 0.3420250592617677,
"grad_norm": 1.2421875,
"learning_rate": 1.9988057094038325e-05,
"loss": 2.1395,
"step": 505
},
{
"epoch": 0.34541144598713175,
"grad_norm": 1.2109375,
"learning_rate": 1.9986053612415717e-05,
"loss": 2.1486,
"step": 510
},
{
"epoch": 0.34879783271249576,
"grad_norm": 1.2109375,
"learning_rate": 1.9983894971985425e-05,
"loss": 2.1498,
"step": 515
},
{
"epoch": 0.3521842194378598,
"grad_norm": 1.234375,
"learning_rate": 1.9981581206287437e-05,
"loss": 2.1265,
"step": 520
},
{
"epoch": 0.35557060616322383,
"grad_norm": 1.234375,
"learning_rate": 1.9979112351271996e-05,
"loss": 2.1298,
"step": 525
},
{
"epoch": 0.3589569928885879,
"grad_norm": 1.2578125,
"learning_rate": 1.9976488445299066e-05,
"loss": 2.1563,
"step": 530
},
{
"epoch": 0.3623433796139519,
"grad_norm": 1.1875,
"learning_rate": 1.9973709529137717e-05,
"loss": 2.1582,
"step": 535
},
{
"epoch": 0.365729766339316,
"grad_norm": 1.1953125,
"learning_rate": 1.9970775645965493e-05,
"loss": 2.1549,
"step": 540
},
{
"epoch": 0.36911615306468,
"grad_norm": 1.2265625,
"learning_rate": 1.9967686841367757e-05,
"loss": 2.1804,
"step": 545
},
{
"epoch": 0.372502539790044,
"grad_norm": 1.171875,
"learning_rate": 1.996444316333696e-05,
"loss": 2.153,
"step": 550
},
{
"epoch": 0.37588892651540806,
"grad_norm": 1.1640625,
"learning_rate": 1.9961044662271922e-05,
"loss": 2.162,
"step": 555
},
{
"epoch": 0.37927531324077207,
"grad_norm": 1.21875,
"learning_rate": 1.9957491390977007e-05,
"loss": 2.164,
"step": 560
},
{
"epoch": 0.38266169996613614,
"grad_norm": 1.2265625,
"learning_rate": 1.9953783404661356e-05,
"loss": 2.1532,
"step": 565
},
{
"epoch": 0.38604808669150015,
"grad_norm": 1.2109375,
"learning_rate": 1.994992076093799e-05,
"loss": 2.1603,
"step": 570
},
{
"epoch": 0.3894344734168642,
"grad_norm": 1.21875,
"learning_rate": 1.9945903519822927e-05,
"loss": 2.158,
"step": 575
},
{
"epoch": 0.3928208601422282,
"grad_norm": 1.2421875,
"learning_rate": 1.9941731743734258e-05,
"loss": 2.1451,
"step": 580
},
{
"epoch": 0.3962072468675923,
"grad_norm": 1.2109375,
"learning_rate": 1.993740549749115e-05,
"loss": 2.1554,
"step": 585
},
{
"epoch": 0.3995936335929563,
"grad_norm": 1.1875,
"learning_rate": 1.9932924848312888e-05,
"loss": 2.1331,
"step": 590
},
{
"epoch": 0.40298002031832036,
"grad_norm": 1.1953125,
"learning_rate": 1.992828986581777e-05,
"loss": 2.1476,
"step": 595
},
{
"epoch": 0.4063664070436844,
"grad_norm": 1.21875,
"learning_rate": 1.992350062202207e-05,
"loss": 2.1458,
"step": 600
},
{
"epoch": 0.40975279376904844,
"grad_norm": 1.203125,
"learning_rate": 1.991855719133891e-05,
"loss": 2.1569,
"step": 605
},
{
"epoch": 0.41313918049441245,
"grad_norm": 1.203125,
"learning_rate": 1.9913459650577082e-05,
"loss": 2.1378,
"step": 610
},
{
"epoch": 0.4165255672197765,
"grad_norm": 1.1875,
"learning_rate": 1.990820807893989e-05,
"loss": 2.1378,
"step": 615
},
{
"epoch": 0.4199119539451405,
"grad_norm": 1.2109375,
"learning_rate": 1.9902802558023883e-05,
"loss": 2.1436,
"step": 620
},
{
"epoch": 0.4232983406705046,
"grad_norm": 1.21875,
"learning_rate": 1.989724317181762e-05,
"loss": 2.1556,
"step": 625
},
{
"epoch": 0.4266847273958686,
"grad_norm": 1.1796875,
"learning_rate": 1.9891530006700335e-05,
"loss": 2.152,
"step": 630
},
{
"epoch": 0.43007111412123267,
"grad_norm": 1.2421875,
"learning_rate": 1.988566315144062e-05,
"loss": 2.1539,
"step": 635
},
{
"epoch": 0.4334575008465967,
"grad_norm": 1.234375,
"learning_rate": 1.9879642697195035e-05,
"loss": 2.1624,
"step": 640
},
{
"epoch": 0.43684388757196074,
"grad_norm": 1.1796875,
"learning_rate": 1.987346873750669e-05,
"loss": 2.1513,
"step": 645
},
{
"epoch": 0.44023027429732475,
"grad_norm": 1.234375,
"learning_rate": 1.9867141368303783e-05,
"loss": 2.1344,
"step": 650
},
{
"epoch": 0.4436166610226888,
"grad_norm": 1.2109375,
"learning_rate": 1.9860660687898142e-05,
"loss": 2.16,
"step": 655
},
{
"epoch": 0.44700304774805283,
"grad_norm": 1.203125,
"learning_rate": 1.9854026796983652e-05,
"loss": 2.1398,
"step": 660
},
{
"epoch": 0.45038943447341684,
"grad_norm": 1.1953125,
"learning_rate": 1.9847239798634734e-05,
"loss": 2.1383,
"step": 665
},
{
"epoch": 0.4537758211987809,
"grad_norm": 1.234375,
"learning_rate": 1.9840299798304708e-05,
"loss": 2.1298,
"step": 670
},
{
"epoch": 0.4571622079241449,
"grad_norm": 1.2265625,
"learning_rate": 1.9833206903824177e-05,
"loss": 2.1569,
"step": 675
},
{
"epoch": 0.460548594649509,
"grad_norm": 1.2109375,
"learning_rate": 1.982596122539935e-05,
"loss": 2.134,
"step": 680
},
{
"epoch": 0.463934981374873,
"grad_norm": 1.1796875,
"learning_rate": 1.981856287561031e-05,
"loss": 2.1341,
"step": 685
},
{
"epoch": 0.46732136810023706,
"grad_norm": 1.21875,
"learning_rate": 1.9811011969409297e-05,
"loss": 2.1496,
"step": 690
},
{
"epoch": 0.47070775482560107,
"grad_norm": 1.2109375,
"learning_rate": 1.9803308624118882e-05,
"loss": 2.1695,
"step": 695
},
{
"epoch": 0.47409414155096513,
"grad_norm": 1.2109375,
"learning_rate": 1.9795452959430187e-05,
"loss": 2.1361,
"step": 700
},
{
"epoch": 0.47748052827632914,
"grad_norm": 1.2109375,
"learning_rate": 1.978744509740099e-05,
"loss": 2.165,
"step": 705
},
{
"epoch": 0.4808669150016932,
"grad_norm": 1.2109375,
"learning_rate": 1.9779285162453853e-05,
"loss": 2.1356,
"step": 710
},
{
"epoch": 0.4842533017270572,
"grad_norm": 1.2109375,
"learning_rate": 1.9770973281374168e-05,
"loss": 2.1442,
"step": 715
},
{
"epoch": 0.4876396884524213,
"grad_norm": 1.1875,
"learning_rate": 1.9762509583308206e-05,
"loss": 2.1332,
"step": 720
},
{
"epoch": 0.4910260751777853,
"grad_norm": 1.1953125,
"learning_rate": 1.97538941997611e-05,
"loss": 2.1472,
"step": 725
},
{
"epoch": 0.49441246190314936,
"grad_norm": 1.1953125,
"learning_rate": 1.9745127264594802e-05,
"loss": 2.1405,
"step": 730
},
{
"epoch": 0.49779884862851337,
"grad_norm": 1.25,
"learning_rate": 1.9736208914026004e-05,
"loss": 2.1393,
"step": 735
},
{
"epoch": 0.5011852353538774,
"grad_norm": 1.203125,
"learning_rate": 1.9727139286624026e-05,
"loss": 2.1532,
"step": 740
},
{
"epoch": 0.5045716220792414,
"grad_norm": 1.2109375,
"learning_rate": 1.9717918523308656e-05,
"loss": 2.1491,
"step": 745
},
{
"epoch": 0.5079580088046055,
"grad_norm": 1.1875,
"learning_rate": 1.9708546767347972e-05,
"loss": 2.1705,
"step": 750
},
{
"epoch": 0.5113443955299696,
"grad_norm": 1.15625,
"learning_rate": 1.9699024164356092e-05,
"loss": 2.1159,
"step": 755
},
{
"epoch": 0.5147307822553335,
"grad_norm": 1.1796875,
"learning_rate": 1.9689350862290943e-05,
"loss": 2.1381,
"step": 760
},
{
"epoch": 0.5181171689806976,
"grad_norm": 1.1796875,
"learning_rate": 1.9679527011451937e-05,
"loss": 2.1408,
"step": 765
},
{
"epoch": 0.5215035557060617,
"grad_norm": 1.203125,
"learning_rate": 1.9669552764477644e-05,
"loss": 2.1505,
"step": 770
},
{
"epoch": 0.5248899424314256,
"grad_norm": 1.203125,
"learning_rate": 1.9659428276343435e-05,
"loss": 2.1325,
"step": 775
},
{
"epoch": 0.5282763291567897,
"grad_norm": 1.203125,
"learning_rate": 1.9649153704359042e-05,
"loss": 2.1494,
"step": 780
},
{
"epoch": 0.5316627158821537,
"grad_norm": 1.171875,
"learning_rate": 1.963872920816615e-05,
"loss": 2.1513,
"step": 785
},
{
"epoch": 0.5350491026075178,
"grad_norm": 1.1640625,
"learning_rate": 1.9628154949735882e-05,
"loss": 2.1416,
"step": 790
},
{
"epoch": 0.5384354893328818,
"grad_norm": 1.1953125,
"learning_rate": 1.9617431093366312e-05,
"loss": 2.1302,
"step": 795
},
{
"epoch": 0.5418218760582458,
"grad_norm": 1.21875,
"learning_rate": 1.9606557805679893e-05,
"loss": 2.1431,
"step": 800
},
{
"epoch": 0.5452082627836099,
"grad_norm": 1.2265625,
"learning_rate": 1.9595535255620877e-05,
"loss": 2.1397,
"step": 805
},
{
"epoch": 0.548594649508974,
"grad_norm": 1.171875,
"learning_rate": 1.958436361445269e-05,
"loss": 2.1399,
"step": 810
},
{
"epoch": 0.5519810362343379,
"grad_norm": 1.21875,
"learning_rate": 1.957304305575526e-05,
"loss": 2.1521,
"step": 815
},
{
"epoch": 0.555367422959702,
"grad_norm": 1.2109375,
"learning_rate": 1.9561573755422338e-05,
"loss": 2.1551,
"step": 820
},
{
"epoch": 0.558753809685066,
"grad_norm": 1.21875,
"learning_rate": 1.954995589165875e-05,
"loss": 2.1325,
"step": 825
},
{
"epoch": 0.5621401964104301,
"grad_norm": 1.2578125,
"learning_rate": 1.953818964497764e-05,
"loss": 2.1375,
"step": 830
},
{
"epoch": 0.5655265831357941,
"grad_norm": 1.1875,
"learning_rate": 1.9526275198197647e-05,
"loss": 2.1424,
"step": 835
},
{
"epoch": 0.5689129698611581,
"grad_norm": 1.1796875,
"learning_rate": 1.9514212736440094e-05,
"loss": 2.1309,
"step": 840
},
{
"epoch": 0.5722993565865222,
"grad_norm": 1.1953125,
"learning_rate": 1.9502002447126073e-05,
"loss": 2.1355,
"step": 845
},
{
"epoch": 0.5756857433118863,
"grad_norm": 1.1796875,
"learning_rate": 1.948964451997357e-05,
"loss": 2.1404,
"step": 850
},
{
"epoch": 0.5790721300372502,
"grad_norm": 1.1875,
"learning_rate": 1.9477139146994505e-05,
"loss": 2.1129,
"step": 855
},
{
"epoch": 0.5824585167626143,
"grad_norm": 1.1796875,
"learning_rate": 1.946448652249172e-05,
"loss": 2.1403,
"step": 860
},
{
"epoch": 0.5858449034879784,
"grad_norm": 1.2109375,
"learning_rate": 1.9451686843056013e-05,
"loss": 2.1333,
"step": 865
},
{
"epoch": 0.5892312902133424,
"grad_norm": 1.25,
"learning_rate": 1.943874030756304e-05,
"loss": 2.144,
"step": 870
},
{
"epoch": 0.5926176769387064,
"grad_norm": 1.1953125,
"learning_rate": 1.9425647117170245e-05,
"loss": 2.1423,
"step": 875
},
{
"epoch": 0.5960040636640704,
"grad_norm": 1.2109375,
"learning_rate": 1.9412407475313727e-05,
"loss": 2.1338,
"step": 880
},
{
"epoch": 0.5993904503894345,
"grad_norm": 1.1484375,
"learning_rate": 1.939902158770509e-05,
"loss": 2.1294,
"step": 885
},
{
"epoch": 0.6027768371147985,
"grad_norm": 1.1875,
"learning_rate": 1.938548966232822e-05,
"loss": 2.1167,
"step": 890
},
{
"epoch": 0.6061632238401625,
"grad_norm": 1.203125,
"learning_rate": 1.9371811909436097e-05,
"loss": 2.1691,
"step": 895
},
{
"epoch": 0.6095496105655266,
"grad_norm": 1.1875,
"learning_rate": 1.9357988541547486e-05,
"loss": 2.1158,
"step": 900
},
{
"epoch": 0.6129359972908907,
"grad_norm": 1.1640625,
"learning_rate": 1.9344019773443655e-05,
"loss": 2.1318,
"step": 905
},
{
"epoch": 0.6163223840162546,
"grad_norm": 1.15625,
"learning_rate": 1.9329905822165042e-05,
"loss": 2.1414,
"step": 910
},
{
"epoch": 0.6197087707416187,
"grad_norm": 1.1484375,
"learning_rate": 1.931564690700787e-05,
"loss": 2.1453,
"step": 915
},
{
"epoch": 0.6230951574669827,
"grad_norm": 1.1875,
"learning_rate": 1.9301243249520746e-05,
"loss": 2.133,
"step": 920
},
{
"epoch": 0.6264815441923468,
"grad_norm": 1.1640625,
"learning_rate": 1.928669507350122e-05,
"loss": 2.123,
"step": 925
},
{
"epoch": 0.6298679309177108,
"grad_norm": 1.1796875,
"learning_rate": 1.92720026049923e-05,
"loss": 2.1289,
"step": 930
},
{
"epoch": 0.6332543176430748,
"grad_norm": 1.171875,
"learning_rate": 1.9257166072278957e-05,
"loss": 2.1385,
"step": 935
},
{
"epoch": 0.6366407043684389,
"grad_norm": 1.21875,
"learning_rate": 1.924218570588456e-05,
"loss": 2.1414,
"step": 940
},
{
"epoch": 0.640027091093803,
"grad_norm": 1.1484375,
"learning_rate": 1.9227061738567293e-05,
"loss": 2.1418,
"step": 945
},
{
"epoch": 0.6434134778191669,
"grad_norm": 1.1640625,
"learning_rate": 1.9211794405316566e-05,
"loss": 2.1372,
"step": 950
},
{
"epoch": 0.646799864544531,
"grad_norm": 1.21875,
"learning_rate": 1.919638394334933e-05,
"loss": 2.122,
"step": 955
},
{
"epoch": 0.650186251269895,
"grad_norm": 1.2265625,
"learning_rate": 1.91808305921064e-05,
"loss": 2.1357,
"step": 960
},
{
"epoch": 0.6535726379952591,
"grad_norm": 1.203125,
"learning_rate": 1.916513459324876e-05,
"loss": 2.1205,
"step": 965
},
{
"epoch": 0.6569590247206231,
"grad_norm": 1.1953125,
"learning_rate": 1.9149296190653776e-05,
"loss": 2.1248,
"step": 970
},
{
"epoch": 0.6603454114459871,
"grad_norm": 1.203125,
"learning_rate": 1.9133315630411423e-05,
"loss": 2.1463,
"step": 975
},
{
"epoch": 0.6637317981713512,
"grad_norm": 1.203125,
"learning_rate": 1.9117193160820445e-05,
"loss": 2.1155,
"step": 980
},
{
"epoch": 0.6671181848967151,
"grad_norm": 1.1796875,
"learning_rate": 1.9100929032384527e-05,
"loss": 2.107,
"step": 985
},
{
"epoch": 0.6705045716220792,
"grad_norm": 1.171875,
"learning_rate": 1.908452349780838e-05,
"loss": 2.1478,
"step": 990
},
{
"epoch": 0.6738909583474433,
"grad_norm": 1.1796875,
"learning_rate": 1.906797681199382e-05,
"loss": 2.1388,
"step": 995
},
{
"epoch": 0.6772773450728073,
"grad_norm": 1.1875,
"learning_rate": 1.9051289232035794e-05,
"loss": 2.1359,
"step": 1000
},
{
"epoch": 0.6806637317981713,
"grad_norm": 1.203125,
"learning_rate": 1.9034461017218424e-05,
"loss": 2.149,
"step": 1005
},
{
"epoch": 0.6840501185235354,
"grad_norm": 1.1875,
"learning_rate": 1.9017492429010933e-05,
"loss": 2.1402,
"step": 1010
},
{
"epoch": 0.6874365052488994,
"grad_norm": 1.1875,
"learning_rate": 1.900038373106361e-05,
"loss": 2.1211,
"step": 1015
},
{
"epoch": 0.6908228919742635,
"grad_norm": 1.1875,
"learning_rate": 1.8983135189203708e-05,
"loss": 2.1127,
"step": 1020
},
{
"epoch": 0.6942092786996275,
"grad_norm": 1.203125,
"learning_rate": 1.8965747071431307e-05,
"loss": 2.1432,
"step": 1025
},
{
"epoch": 0.6975956654249915,
"grad_norm": 1.171875,
"learning_rate": 1.8948219647915157e-05,
"loss": 2.1353,
"step": 1030
},
{
"epoch": 0.7009820521503556,
"grad_norm": 1.203125,
"learning_rate": 1.893055319098848e-05,
"loss": 2.1372,
"step": 1035
},
{
"epoch": 0.7043684388757196,
"grad_norm": 1.1796875,
"learning_rate": 1.891274797514473e-05,
"loss": 2.1385,
"step": 1040
},
{
"epoch": 0.7077548256010836,
"grad_norm": 1.15625,
"learning_rate": 1.8894804277033345e-05,
"loss": 2.1252,
"step": 1045
},
{
"epoch": 0.7111412123264477,
"grad_norm": 1.1953125,
"learning_rate": 1.8876722375455426e-05,
"loss": 2.1302,
"step": 1050
},
{
"epoch": 0.7145275990518117,
"grad_norm": 1.1875,
"learning_rate": 1.885850255135943e-05,
"loss": 2.1437,
"step": 1055
},
{
"epoch": 0.7179139857771758,
"grad_norm": 1.15625,
"learning_rate": 1.8840145087836777e-05,
"loss": 2.1213,
"step": 1060
},
{
"epoch": 0.7213003725025398,
"grad_norm": 1.1640625,
"learning_rate": 1.8821650270117485e-05,
"loss": 2.1354,
"step": 1065
},
{
"epoch": 0.7246867592279038,
"grad_norm": 1.203125,
"learning_rate": 1.8803018385565707e-05,
"loss": 2.1425,
"step": 1070
},
{
"epoch": 0.7280731459532679,
"grad_norm": 1.1875,
"learning_rate": 1.8784249723675273e-05,
"loss": 2.1342,
"step": 1075
},
{
"epoch": 0.731459532678632,
"grad_norm": 1.2265625,
"learning_rate": 1.8765344576065222e-05,
"loss": 2.1274,
"step": 1080
},
{
"epoch": 0.7348459194039959,
"grad_norm": 1.1875,
"learning_rate": 1.8746303236475212e-05,
"loss": 2.1635,
"step": 1085
},
{
"epoch": 0.73823230612936,
"grad_norm": 1.2421875,
"learning_rate": 1.8727126000761025e-05,
"loss": 2.1549,
"step": 1090
},
{
"epoch": 0.741618692854724,
"grad_norm": 1.1953125,
"learning_rate": 1.8707813166889915e-05,
"loss": 2.1232,
"step": 1095
},
{
"epoch": 0.745005079580088,
"grad_norm": 1.2109375,
"learning_rate": 1.8688365034936002e-05,
"loss": 2.1266,
"step": 1100
},
{
"epoch": 0.7483914663054521,
"grad_norm": 1.171875,
"learning_rate": 1.8668781907075618e-05,
"loss": 2.1122,
"step": 1105
},
{
"epoch": 0.7517778530308161,
"grad_norm": 1.1484375,
"learning_rate": 1.864906408758258e-05,
"loss": 2.1109,
"step": 1110
},
{
"epoch": 0.7551642397561802,
"grad_norm": 1.2265625,
"learning_rate": 1.8629211882823502e-05,
"loss": 2.1381,
"step": 1115
},
{
"epoch": 0.7585506264815441,
"grad_norm": 1.2109375,
"learning_rate": 1.860922560125301e-05,
"loss": 2.1412,
"step": 1120
},
{
"epoch": 0.7619370132069082,
"grad_norm": 1.1953125,
"learning_rate": 1.858910555340895e-05,
"loss": 2.1333,
"step": 1125
},
{
"epoch": 0.7653233999322723,
"grad_norm": 1.1953125,
"learning_rate": 1.8568852051907575e-05,
"loss": 2.1583,
"step": 1130
},
{
"epoch": 0.7687097866576363,
"grad_norm": 1.1484375,
"learning_rate": 1.854846541143868e-05,
"loss": 2.122,
"step": 1135
},
{
"epoch": 0.7720961733830003,
"grad_norm": 1.171875,
"learning_rate": 1.8527945948760702e-05,
"loss": 2.1293,
"step": 1140
},
{
"epoch": 0.7754825601083644,
"grad_norm": 1.5546875,
"learning_rate": 1.850729398269583e-05,
"loss": 2.1142,
"step": 1145
},
{
"epoch": 0.7788689468337284,
"grad_norm": 1.21875,
"learning_rate": 1.8486509834125013e-05,
"loss": 2.148,
"step": 1150
},
{
"epoch": 0.7822553335590925,
"grad_norm": 1.21875,
"learning_rate": 1.8465593825983005e-05,
"loss": 2.145,
"step": 1155
},
{
"epoch": 0.7856417202844564,
"grad_norm": 1.1796875,
"learning_rate": 1.8444546283253325e-05,
"loss": 2.1206,
"step": 1160
},
{
"epoch": 0.7890281070098205,
"grad_norm": 1.203125,
"learning_rate": 1.8423367532963224e-05,
"loss": 2.1323,
"step": 1165
},
{
"epoch": 0.7924144937351846,
"grad_norm": 1.2421875,
"learning_rate": 1.840205790417859e-05,
"loss": 2.1299,
"step": 1170
},
{
"epoch": 0.7958008804605486,
"grad_norm": 1.2421875,
"learning_rate": 1.838061772799885e-05,
"loss": 2.1289,
"step": 1175
},
{
"epoch": 0.7991872671859126,
"grad_norm": 1.203125,
"learning_rate": 1.8359047337551815e-05,
"loss": 2.1564,
"step": 1180
},
{
"epoch": 0.8025736539112767,
"grad_norm": 1.1796875,
"learning_rate": 1.8337347067988506e-05,
"loss": 2.1398,
"step": 1185
},
{
"epoch": 0.8059600406366407,
"grad_norm": 1.1875,
"learning_rate": 1.8315517256477942e-05,
"loss": 2.1273,
"step": 1190
},
{
"epoch": 0.8093464273620048,
"grad_norm": 1.2265625,
"learning_rate": 1.8293558242201914e-05,
"loss": 2.1186,
"step": 1195
},
{
"epoch": 0.8127328140873687,
"grad_norm": 1.1640625,
"learning_rate": 1.827147036634971e-05,
"loss": 2.1423,
"step": 1200
},
{
"epoch": 0.8161192008127328,
"grad_norm": 1.15625,
"learning_rate": 1.8249253972112805e-05,
"loss": 2.1372,
"step": 1205
},
{
"epoch": 0.8195055875380969,
"grad_norm": 1.1796875,
"learning_rate": 1.8226909404679526e-05,
"loss": 2.1415,
"step": 1210
},
{
"epoch": 0.8228919742634608,
"grad_norm": 1.203125,
"learning_rate": 1.820443701122972e-05,
"loss": 2.1409,
"step": 1215
},
{
"epoch": 0.8262783609888249,
"grad_norm": 1.1484375,
"learning_rate": 1.8181837140929322e-05,
"loss": 2.1275,
"step": 1220
},
{
"epoch": 0.829664747714189,
"grad_norm": 1.1953125,
"learning_rate": 1.8159110144924943e-05,
"loss": 2.1377,
"step": 1225
},
{
"epoch": 0.833051134439553,
"grad_norm": 1.1640625,
"learning_rate": 1.8136256376338427e-05,
"loss": 2.1274,
"step": 1230
},
{
"epoch": 0.836437521164917,
"grad_norm": 1.15625,
"learning_rate": 1.811327619026134e-05,
"loss": 2.1449,
"step": 1235
},
{
"epoch": 0.839823907890281,
"grad_norm": 1.1875,
"learning_rate": 1.8090169943749477e-05,
"loss": 2.1263,
"step": 1240
},
{
"epoch": 0.8432102946156451,
"grad_norm": 1.1640625,
"learning_rate": 1.8066937995817296e-05,
"loss": 2.1207,
"step": 1245
},
{
"epoch": 0.8465966813410092,
"grad_norm": 1.1796875,
"learning_rate": 1.8043580707432356e-05,
"loss": 2.1066,
"step": 1250
},
{
"epoch": 0.8499830680663731,
"grad_norm": 1.1953125,
"learning_rate": 1.802009844150969e-05,
"loss": 2.1421,
"step": 1255
},
{
"epoch": 0.8533694547917372,
"grad_norm": 1.2109375,
"learning_rate": 1.7996491562906186e-05,
"loss": 2.1265,
"step": 1260
},
{
"epoch": 0.8567558415171013,
"grad_norm": 1.140625,
"learning_rate": 1.797276043841489e-05,
"loss": 2.1343,
"step": 1265
},
{
"epoch": 0.8601422282424653,
"grad_norm": 1.2109375,
"learning_rate": 1.7948905436759346e-05,
"loss": 2.134,
"step": 1270
},
{
"epoch": 0.8635286149678293,
"grad_norm": 1.1640625,
"learning_rate": 1.7924926928587834e-05,
"loss": 2.1073,
"step": 1275
},
{
"epoch": 0.8669150016931934,
"grad_norm": 1.1875,
"learning_rate": 1.7900825286467622e-05,
"loss": 2.1149,
"step": 1280
},
{
"epoch": 0.8703013884185574,
"grad_norm": 1.1796875,
"learning_rate": 1.787660088487918e-05,
"loss": 2.1335,
"step": 1285
},
{
"epoch": 0.8736877751439215,
"grad_norm": 1.203125,
"learning_rate": 1.7852254100210363e-05,
"loss": 2.1261,
"step": 1290
},
{
"epoch": 0.8770741618692854,
"grad_norm": 1.203125,
"learning_rate": 1.7827785310750552e-05,
"loss": 2.1274,
"step": 1295
},
{
"epoch": 0.8804605485946495,
"grad_norm": 1.15625,
"learning_rate": 1.780319489668479e-05,
"loss": 2.1213,
"step": 1300
},
{
"epoch": 0.8838469353200136,
"grad_norm": 1.1875,
"learning_rate": 1.7778483240087873e-05,
"loss": 2.1304,
"step": 1305
},
{
"epoch": 0.8872333220453776,
"grad_norm": 1.171875,
"learning_rate": 1.7753650724918388e-05,
"loss": 2.1386,
"step": 1310
},
{
"epoch": 0.8906197087707416,
"grad_norm": 1.1875,
"learning_rate": 1.7728697737012794e-05,
"loss": 2.128,
"step": 1315
},
{
"epoch": 0.8940060954961057,
"grad_norm": 1.1484375,
"learning_rate": 1.770362466407938e-05,
"loss": 2.1301,
"step": 1320
},
{
"epoch": 0.8973924822214697,
"grad_norm": 1.140625,
"learning_rate": 1.7678431895692278e-05,
"loss": 2.1275,
"step": 1325
},
{
"epoch": 0.9007788689468337,
"grad_norm": 1.203125,
"learning_rate": 1.7653119823285383e-05,
"loss": 2.1423,
"step": 1330
},
{
"epoch": 0.9041652556721977,
"grad_norm": 1.15625,
"learning_rate": 1.7627688840146286e-05,
"loss": 2.1195,
"step": 1335
},
{
"epoch": 0.9075516423975618,
"grad_norm": 1.2109375,
"learning_rate": 1.760213934141017e-05,
"loss": 2.1369,
"step": 1340
},
{
"epoch": 0.9109380291229259,
"grad_norm": 1.1875,
"learning_rate": 1.7576471724053634e-05,
"loss": 2.1259,
"step": 1345
},
{
"epoch": 0.9143244158482898,
"grad_norm": 1.2109375,
"learning_rate": 1.7550686386888583e-05,
"loss": 2.1294,
"step": 1350
},
{
"epoch": 0.9177108025736539,
"grad_norm": 1.1640625,
"learning_rate": 1.752478373055597e-05,
"loss": 2.1275,
"step": 1355
},
{
"epoch": 0.921097189299018,
"grad_norm": 1.1953125,
"learning_rate": 1.7498764157519625e-05,
"loss": 2.1376,
"step": 1360
},
{
"epoch": 0.924483576024382,
"grad_norm": 1.1953125,
"learning_rate": 1.7472628072059955e-05,
"loss": 2.1415,
"step": 1365
},
{
"epoch": 0.927869962749746,
"grad_norm": 1.1875,
"learning_rate": 1.744637588026771e-05,
"loss": 2.1356,
"step": 1370
},
{
"epoch": 0.93125634947511,
"grad_norm": 1.1875,
"learning_rate": 1.742000799003762e-05,
"loss": 2.1287,
"step": 1375
},
{
"epoch": 0.9346427362004741,
"grad_norm": 1.1796875,
"learning_rate": 1.7393524811062105e-05,
"loss": 2.1163,
"step": 1380
},
{
"epoch": 0.9380291229258382,
"grad_norm": 1.21875,
"learning_rate": 1.736692675482488e-05,
"loss": 2.1505,
"step": 1385
},
{
"epoch": 0.9414155096512021,
"grad_norm": 1.1953125,
"learning_rate": 1.734021423459458e-05,
"loss": 2.1286,
"step": 1390
},
{
"epoch": 0.9448018963765662,
"grad_norm": 1.1484375,
"learning_rate": 1.731338766541832e-05,
"loss": 2.1295,
"step": 1395
},
{
"epoch": 0.9481882831019303,
"grad_norm": 1.2109375,
"learning_rate": 1.7286447464115267e-05,
"loss": 2.1351,
"step": 1400
},
{
"epoch": 0.9515746698272943,
"grad_norm": 1.1953125,
"learning_rate": 1.7259394049270138e-05,
"loss": 2.1251,
"step": 1405
},
{
"epoch": 0.9549610565526583,
"grad_norm": 1.1640625,
"learning_rate": 1.7232227841226724e-05,
"loss": 2.1313,
"step": 1410
},
{
"epoch": 0.9583474432780223,
"grad_norm": 1.171875,
"learning_rate": 1.7204949262081344e-05,
"loss": 2.1242,
"step": 1415
},
{
"epoch": 0.9617338300033864,
"grad_norm": 1.1796875,
"learning_rate": 1.7177558735676285e-05,
"loss": 2.1422,
"step": 1420
},
{
"epoch": 0.9651202167287504,
"grad_norm": 1.1796875,
"learning_rate": 1.7150056687593218e-05,
"loss": 2.1116,
"step": 1425
},
{
"epoch": 0.9685066034541144,
"grad_norm": 1.1953125,
"learning_rate": 1.7122443545146598e-05,
"loss": 2.1154,
"step": 1430
},
{
"epoch": 0.9718929901794785,
"grad_norm": 1.234375,
"learning_rate": 1.7094719737377007e-05,
"loss": 2.1348,
"step": 1435
},
{
"epoch": 0.9752793769048426,
"grad_norm": 1.1953125,
"learning_rate": 1.7066885695044482e-05,
"loss": 2.1122,
"step": 1440
},
{
"epoch": 0.9786657636302065,
"grad_norm": 1.1640625,
"learning_rate": 1.703894185062187e-05,
"loss": 2.1258,
"step": 1445
},
{
"epoch": 0.9820521503555706,
"grad_norm": 1.1953125,
"learning_rate": 1.7010888638288037e-05,
"loss": 2.1304,
"step": 1450
},
{
"epoch": 0.9854385370809347,
"grad_norm": 1.1640625,
"learning_rate": 1.6982726493921175e-05,
"loss": 2.118,
"step": 1455
},
{
"epoch": 0.9888249238062987,
"grad_norm": 1.1953125,
"learning_rate": 1.6954455855092014e-05,
"loss": 2.1361,
"step": 1460
},
{
"epoch": 0.9922113105316627,
"grad_norm": 1.171875,
"learning_rate": 1.692607716105702e-05,
"loss": 2.1223,
"step": 1465
},
{
"epoch": 0.9955976972570267,
"grad_norm": 1.203125,
"learning_rate": 1.689759085275157e-05,
"loss": 2.1329,
"step": 1470
},
{
"epoch": 0.9989840839823908,
"grad_norm": 1.15625,
"learning_rate": 1.6868997372783106e-05,
"loss": 2.1208,
"step": 1475
},
{
"epoch": 0.9996613613274636,
"eval_loss": 2.124814987182617,
"eval_runtime": 85.1985,
"eval_samples_per_second": 15.317,
"eval_steps_per_second": 1.925,
"step": 1476
},
{
"epoch": 1.0023704707077548,
"grad_norm": 1.15625,
"learning_rate": 1.6840297165424254e-05,
"loss": 2.0853,
"step": 1480
},
{
"epoch": 1.0057568574331188,
"grad_norm": 1.1640625,
"learning_rate": 1.6811490676605916e-05,
"loss": 2.0976,
"step": 1485
},
{
"epoch": 1.0091432441584829,
"grad_norm": 1.203125,
"learning_rate": 1.6782578353910363e-05,
"loss": 2.1058,
"step": 1490
},
{
"epoch": 1.012529630883847,
"grad_norm": 1.21875,
"learning_rate": 1.675356064656425e-05,
"loss": 2.1074,
"step": 1495
},
{
"epoch": 1.015916017609211,
"grad_norm": 1.1953125,
"learning_rate": 1.6724438005431656e-05,
"loss": 2.0834,
"step": 1500
},
{
"epoch": 1.019302404334575,
"grad_norm": 1.1875,
"learning_rate": 1.6695210883007077e-05,
"loss": 2.0947,
"step": 1505
},
{
"epoch": 1.0226887910599392,
"grad_norm": 1.171875,
"learning_rate": 1.666587973340839e-05,
"loss": 2.0953,
"step": 1510
},
{
"epoch": 1.026075177785303,
"grad_norm": 1.15625,
"learning_rate": 1.66364450123698e-05,
"loss": 2.0828,
"step": 1515
},
{
"epoch": 1.029461564510667,
"grad_norm": 1.1953125,
"learning_rate": 1.6606907177234757e-05,
"loss": 2.0834,
"step": 1520
},
{
"epoch": 1.0328479512360311,
"grad_norm": 1.1640625,
"learning_rate": 1.657726668694885e-05,
"loss": 2.1086,
"step": 1525
},
{
"epoch": 1.0362343379613952,
"grad_norm": 1.1640625,
"learning_rate": 1.654752400205269e-05,
"loss": 2.1209,
"step": 1530
},
{
"epoch": 1.0396207246867593,
"grad_norm": 1.21875,
"learning_rate": 1.6517679584674716e-05,
"loss": 2.106,
"step": 1535
},
{
"epoch": 1.0430071114121233,
"grad_norm": 1.2109375,
"learning_rate": 1.6487733898524068e-05,
"loss": 2.0924,
"step": 1540
},
{
"epoch": 1.0463934981374874,
"grad_norm": 1.2109375,
"learning_rate": 1.645768740888334e-05,
"loss": 2.0973,
"step": 1545
},
{
"epoch": 1.0497798848628512,
"grad_norm": 1.1953125,
"learning_rate": 1.6427540582601368e-05,
"loss": 2.0975,
"step": 1550
},
{
"epoch": 1.0531662715882153,
"grad_norm": 1.1796875,
"learning_rate": 1.6397293888085975e-05,
"loss": 2.0917,
"step": 1555
},
{
"epoch": 1.0565526583135794,
"grad_norm": 1.21875,
"learning_rate": 1.636694779529669e-05,
"loss": 2.1072,
"step": 1560
},
{
"epoch": 1.0599390450389434,
"grad_norm": 1.203125,
"learning_rate": 1.6336502775737443e-05,
"loss": 2.0714,
"step": 1565
},
{
"epoch": 1.0633254317643075,
"grad_norm": 1.2109375,
"learning_rate": 1.6305959302449255e-05,
"loss": 2.0936,
"step": 1570
},
{
"epoch": 1.0667118184896716,
"grad_norm": 1.203125,
"learning_rate": 1.6275317850002875e-05,
"loss": 2.0924,
"step": 1575
},
{
"epoch": 1.0700982052150356,
"grad_norm": 1.1875,
"learning_rate": 1.624457889449139e-05,
"loss": 2.1,
"step": 1580
},
{
"epoch": 1.0734845919403997,
"grad_norm": 1.21875,
"learning_rate": 1.621374291352287e-05,
"loss": 2.1331,
"step": 1585
},
{
"epoch": 1.0768709786657635,
"grad_norm": 1.1875,
"learning_rate": 1.618281038621291e-05,
"loss": 2.0948,
"step": 1590
},
{
"epoch": 1.0802573653911276,
"grad_norm": 1.21875,
"learning_rate": 1.6151781793177198e-05,
"loss": 2.1174,
"step": 1595
},
{
"epoch": 1.0836437521164917,
"grad_norm": 1.203125,
"learning_rate": 1.6120657616524053e-05,
"loss": 2.0937,
"step": 1600
},
{
"epoch": 1.0870301388418557,
"grad_norm": 1.1953125,
"learning_rate": 1.608943833984693e-05,
"loss": 2.0901,
"step": 1605
},
{
"epoch": 1.0904165255672198,
"grad_norm": 1.2265625,
"learning_rate": 1.6058124448216897e-05,
"loss": 2.0892,
"step": 1610
},
{
"epoch": 1.0938029122925839,
"grad_norm": 1.1953125,
"learning_rate": 1.602671642817511e-05,
"loss": 2.0988,
"step": 1615
},
{
"epoch": 1.097189299017948,
"grad_norm": 1.2109375,
"learning_rate": 1.5995214767725267e-05,
"loss": 2.09,
"step": 1620
},
{
"epoch": 1.100575685743312,
"grad_norm": 1.1875,
"learning_rate": 1.5963619956325977e-05,
"loss": 2.1143,
"step": 1625
},
{
"epoch": 1.1039620724686758,
"grad_norm": 1.2265625,
"learning_rate": 1.593193248488321e-05,
"loss": 2.0987,
"step": 1630
},
{
"epoch": 1.10734845919404,
"grad_norm": 1.1796875,
"learning_rate": 1.5900152845742638e-05,
"loss": 2.0955,
"step": 1635
},
{
"epoch": 1.110734845919404,
"grad_norm": 1.2109375,
"learning_rate": 1.5868281532681995e-05,
"loss": 2.1032,
"step": 1640
},
{
"epoch": 1.114121232644768,
"grad_norm": 1.203125,
"learning_rate": 1.5836319040903402e-05,
"loss": 2.0949,
"step": 1645
},
{
"epoch": 1.117507619370132,
"grad_norm": 1.1875,
"learning_rate": 1.5804265867025674e-05,
"loss": 2.0914,
"step": 1650
},
{
"epoch": 1.1208940060954962,
"grad_norm": 1.1953125,
"learning_rate": 1.5772122509076604e-05,
"loss": 2.1035,
"step": 1655
},
{
"epoch": 1.1242803928208602,
"grad_norm": 1.1953125,
"learning_rate": 1.5739889466485227e-05,
"loss": 2.1001,
"step": 1660
},
{
"epoch": 1.127666779546224,
"grad_norm": 1.1953125,
"learning_rate": 1.5707567240074052e-05,
"loss": 2.1013,
"step": 1665
},
{
"epoch": 1.1310531662715881,
"grad_norm": 1.2421875,
"learning_rate": 1.5675156332051293e-05,
"loss": 2.0902,
"step": 1670
},
{
"epoch": 1.1344395529969522,
"grad_norm": 1.1953125,
"learning_rate": 1.5642657246003048e-05,
"loss": 2.1021,
"step": 1675
},
{
"epoch": 1.1378259397223163,
"grad_norm": 1.2265625,
"learning_rate": 1.5610070486885498e-05,
"loss": 2.1041,
"step": 1680
},
{
"epoch": 1.1412123264476803,
"grad_norm": 1.171875,
"learning_rate": 1.5577396561017043e-05,
"loss": 2.1088,
"step": 1685
},
{
"epoch": 1.1445987131730444,
"grad_norm": 1.1875,
"learning_rate": 1.554463597607044e-05,
"loss": 2.1084,
"step": 1690
},
{
"epoch": 1.1479850998984085,
"grad_norm": 1.1953125,
"learning_rate": 1.5511789241064922e-05,
"loss": 2.1016,
"step": 1695
},
{
"epoch": 1.1513714866237725,
"grad_norm": 1.1796875,
"learning_rate": 1.5478856866358266e-05,
"loss": 2.0917,
"step": 1700
},
{
"epoch": 1.1547578733491364,
"grad_norm": 1.25,
"learning_rate": 1.5445839363638905e-05,
"loss": 2.1034,
"step": 1705
},
{
"epoch": 1.1581442600745004,
"grad_norm": 1.234375,
"learning_rate": 1.5412737245917927e-05,
"loss": 2.0877,
"step": 1710
},
{
"epoch": 1.1615306467998645,
"grad_norm": 1.2109375,
"learning_rate": 1.5379551027521154e-05,
"loss": 2.0906,
"step": 1715
},
{
"epoch": 1.1649170335252286,
"grad_norm": 1.171875,
"learning_rate": 1.5346281224081113e-05,
"loss": 2.0846,
"step": 1720
},
{
"epoch": 1.1683034202505926,
"grad_norm": 1.2578125,
"learning_rate": 1.531292835252903e-05,
"loss": 2.102,
"step": 1725
},
{
"epoch": 1.1716898069759567,
"grad_norm": 1.2265625,
"learning_rate": 1.527949293108683e-05,
"loss": 2.0851,
"step": 1730
},
{
"epoch": 1.1750761937013208,
"grad_norm": 1.234375,
"learning_rate": 1.524597547925903e-05,
"loss": 2.1029,
"step": 1735
},
{
"epoch": 1.1784625804266846,
"grad_norm": 1.2109375,
"learning_rate": 1.5212376517824727e-05,
"loss": 2.0947,
"step": 1740
},
{
"epoch": 1.1818489671520487,
"grad_norm": 1.2265625,
"learning_rate": 1.5178696568829465e-05,
"loss": 2.1251,
"step": 1745
},
{
"epoch": 1.1852353538774127,
"grad_norm": 1.2109375,
"learning_rate": 1.5144936155577136e-05,
"loss": 2.0804,
"step": 1750
},
{
"epoch": 1.1886217406027768,
"grad_norm": 1.203125,
"learning_rate": 1.5111095802621844e-05,
"loss": 2.0978,
"step": 1755
},
{
"epoch": 1.1920081273281409,
"grad_norm": 1.234375,
"learning_rate": 1.5077176035759772e-05,
"loss": 2.0717,
"step": 1760
},
{
"epoch": 1.195394514053505,
"grad_norm": 1.2109375,
"learning_rate": 1.5043177382020994e-05,
"loss": 2.0926,
"step": 1765
},
{
"epoch": 1.198780900778869,
"grad_norm": 1.21875,
"learning_rate": 1.5009100369661294e-05,
"loss": 2.1096,
"step": 1770
},
{
"epoch": 1.202167287504233,
"grad_norm": 1.21875,
"learning_rate": 1.4974945528153956e-05,
"loss": 2.0994,
"step": 1775
},
{
"epoch": 1.2055536742295971,
"grad_norm": 1.21875,
"learning_rate": 1.494071338818154e-05,
"loss": 2.0904,
"step": 1780
},
{
"epoch": 1.208940060954961,
"grad_norm": 1.21875,
"learning_rate": 1.4906404481627633e-05,
"loss": 2.093,
"step": 1785
},
{
"epoch": 1.212326447680325,
"grad_norm": 1.203125,
"learning_rate": 1.4872019341568585e-05,
"loss": 2.0988,
"step": 1790
},
{
"epoch": 1.2157128344056891,
"grad_norm": 1.1875,
"learning_rate": 1.483755850226523e-05,
"loss": 2.1117,
"step": 1795
},
{
"epoch": 1.2190992211310532,
"grad_norm": 1.234375,
"learning_rate": 1.4803022499154589e-05,
"loss": 2.0897,
"step": 1800
},
{
"epoch": 1.2224856078564172,
"grad_norm": 1.1953125,
"learning_rate": 1.4768411868841534e-05,
"loss": 2.0905,
"step": 1805
},
{
"epoch": 1.2258719945817813,
"grad_norm": 1.2578125,
"learning_rate": 1.4733727149090467e-05,
"loss": 2.1207,
"step": 1810
},
{
"epoch": 1.2292583813071452,
"grad_norm": 1.21875,
"learning_rate": 1.4698968878816952e-05,
"loss": 2.0997,
"step": 1815
},
{
"epoch": 1.2326447680325092,
"grad_norm": 1.234375,
"learning_rate": 1.466413759807936e-05,
"loss": 2.1071,
"step": 1820
},
{
"epoch": 1.2360311547578733,
"grad_norm": 1.25,
"learning_rate": 1.4629233848070452e-05,
"loss": 2.1019,
"step": 1825
},
{
"epoch": 1.2394175414832374,
"grad_norm": 1.2421875,
"learning_rate": 1.4594258171108997e-05,
"loss": 2.0857,
"step": 1830
},
{
"epoch": 1.2428039282086014,
"grad_norm": 1.1875,
"learning_rate": 1.4559211110631317e-05,
"loss": 2.1097,
"step": 1835
},
{
"epoch": 1.2461903149339655,
"grad_norm": 1.203125,
"learning_rate": 1.4524093211182874e-05,
"loss": 2.1162,
"step": 1840
},
{
"epoch": 1.2495767016593295,
"grad_norm": 1.1953125,
"learning_rate": 1.4488905018409792e-05,
"loss": 2.0866,
"step": 1845
},
{
"epoch": 1.2529630883846936,
"grad_norm": 1.1953125,
"learning_rate": 1.4453647079050366e-05,
"loss": 2.1043,
"step": 1850
},
{
"epoch": 1.2563494751100577,
"grad_norm": 1.1875,
"learning_rate": 1.4418319940926601e-05,
"loss": 2.0843,
"step": 1855
},
{
"epoch": 1.2597358618354215,
"grad_norm": 1.21875,
"learning_rate": 1.438292415293567e-05,
"loss": 2.1041,
"step": 1860
},
{
"epoch": 1.2631222485607856,
"grad_norm": 1.171875,
"learning_rate": 1.43474602650414e-05,
"loss": 2.0916,
"step": 1865
},
{
"epoch": 1.2665086352861497,
"grad_norm": 1.234375,
"learning_rate": 1.4311928828265724e-05,
"loss": 2.0774,
"step": 1870
},
{
"epoch": 1.2698950220115137,
"grad_norm": 1.2109375,
"learning_rate": 1.4276330394680122e-05,
"loss": 2.1185,
"step": 1875
},
{
"epoch": 1.2732814087368778,
"grad_norm": 1.2265625,
"learning_rate": 1.4240665517397031e-05,
"loss": 2.1146,
"step": 1880
},
{
"epoch": 1.2766677954622419,
"grad_norm": 1.2109375,
"learning_rate": 1.4204934750561275e-05,
"loss": 2.0987,
"step": 1885
},
{
"epoch": 1.2800541821876057,
"grad_norm": 1.2109375,
"learning_rate": 1.4169138649341422e-05,
"loss": 2.0938,
"step": 1890
},
{
"epoch": 1.2834405689129698,
"grad_norm": 1.2265625,
"learning_rate": 1.4133277769921192e-05,
"loss": 2.1163,
"step": 1895
},
{
"epoch": 1.2868269556383338,
"grad_norm": 1.203125,
"learning_rate": 1.4097352669490791e-05,
"loss": 2.0889,
"step": 1900
},
{
"epoch": 1.290213342363698,
"grad_norm": 1.171875,
"learning_rate": 1.4061363906238268e-05,
"loss": 2.103,
"step": 1905
},
{
"epoch": 1.293599729089062,
"grad_norm": 1.1875,
"learning_rate": 1.4025312039340829e-05,
"loss": 2.104,
"step": 1910
},
{
"epoch": 1.296986115814426,
"grad_norm": 1.1640625,
"learning_rate": 1.398919762895616e-05,
"loss": 2.1108,
"step": 1915
},
{
"epoch": 1.30037250253979,
"grad_norm": 1.1796875,
"learning_rate": 1.395302123621372e-05,
"loss": 2.0843,
"step": 1920
},
{
"epoch": 1.3037588892651542,
"grad_norm": 1.1953125,
"learning_rate": 1.391678342320602e-05,
"loss": 2.0992,
"step": 1925
},
{
"epoch": 1.3071452759905182,
"grad_norm": 1.1796875,
"learning_rate": 1.3880484752979898e-05,
"loss": 2.1083,
"step": 1930
},
{
"epoch": 1.3105316627158823,
"grad_norm": 1.2265625,
"learning_rate": 1.3844125789527755e-05,
"loss": 2.09,
"step": 1935
},
{
"epoch": 1.3139180494412461,
"grad_norm": 1.21875,
"learning_rate": 1.3807707097778806e-05,
"loss": 2.0936,
"step": 1940
},
{
"epoch": 1.3173044361666102,
"grad_norm": 1.203125,
"learning_rate": 1.3771229243590296e-05,
"loss": 2.1146,
"step": 1945
},
{
"epoch": 1.3206908228919743,
"grad_norm": 1.21875,
"learning_rate": 1.3734692793738709e-05,
"loss": 2.1093,
"step": 1950
},
{
"epoch": 1.3240772096173383,
"grad_norm": 1.2109375,
"learning_rate": 1.3698098315910961e-05,
"loss": 2.104,
"step": 1955
},
{
"epoch": 1.3274635963427024,
"grad_norm": 1.21875,
"learning_rate": 1.3661446378695588e-05,
"loss": 2.0981,
"step": 1960
},
{
"epoch": 1.3308499830680665,
"grad_norm": 1.1796875,
"learning_rate": 1.3624737551573896e-05,
"loss": 2.1017,
"step": 1965
},
{
"epoch": 1.3342363697934303,
"grad_norm": 1.171875,
"learning_rate": 1.358797240491112e-05,
"loss": 2.0747,
"step": 1970
},
{
"epoch": 1.3376227565187944,
"grad_norm": 1.15625,
"learning_rate": 1.3551151509947572e-05,
"loss": 2.0771,
"step": 1975
},
{
"epoch": 1.3410091432441584,
"grad_norm": 1.1875,
"learning_rate": 1.3514275438789745e-05,
"loss": 2.1104,
"step": 1980
},
{
"epoch": 1.3443955299695225,
"grad_norm": 1.2265625,
"learning_rate": 1.3477344764401438e-05,
"loss": 2.0789,
"step": 1985
},
{
"epoch": 1.3477819166948866,
"grad_norm": 1.2109375,
"learning_rate": 1.344036006059486e-05,
"loss": 2.0989,
"step": 1990
},
{
"epoch": 1.3511683034202506,
"grad_norm": 1.1796875,
"learning_rate": 1.3403321902021691e-05,
"loss": 2.0889,
"step": 1995
},
{
"epoch": 1.3545546901456147,
"grad_norm": 1.2109375,
"learning_rate": 1.3366230864164173e-05,
"loss": 2.091,
"step": 2000
},
{
"epoch": 1.3579410768709788,
"grad_norm": 1.1640625,
"learning_rate": 1.3329087523326168e-05,
"loss": 2.0969,
"step": 2005
},
{
"epoch": 1.3613274635963428,
"grad_norm": 1.25,
"learning_rate": 1.3291892456624185e-05,
"loss": 2.0938,
"step": 2010
},
{
"epoch": 1.3647138503217067,
"grad_norm": 1.234375,
"learning_rate": 1.3254646241978439e-05,
"loss": 2.1058,
"step": 2015
},
{
"epoch": 1.3681002370470707,
"grad_norm": 1.2109375,
"learning_rate": 1.3217349458103855e-05,
"loss": 2.0882,
"step": 2020
},
{
"epoch": 1.3714866237724348,
"grad_norm": 1.2421875,
"learning_rate": 1.3180002684501075e-05,
"loss": 2.1114,
"step": 2025
},
{
"epoch": 1.3748730104977989,
"grad_norm": 1.21875,
"learning_rate": 1.3142606501447464e-05,
"loss": 2.0993,
"step": 2030
},
{
"epoch": 1.378259397223163,
"grad_norm": 1.2109375,
"learning_rate": 1.3105161489988087e-05,
"loss": 2.0827,
"step": 2035
},
{
"epoch": 1.381645783948527,
"grad_norm": 1.21875,
"learning_rate": 1.3067668231926685e-05,
"loss": 2.1031,
"step": 2040
},
{
"epoch": 1.3850321706738908,
"grad_norm": 1.1875,
"learning_rate": 1.3030127309816632e-05,
"loss": 2.1086,
"step": 2045
},
{
"epoch": 1.388418557399255,
"grad_norm": 1.1953125,
"learning_rate": 1.2992539306951878e-05,
"loss": 2.0985,
"step": 2050
},
{
"epoch": 1.391804944124619,
"grad_norm": 1.203125,
"learning_rate": 1.2954904807357905e-05,
"loss": 2.0981,
"step": 2055
},
{
"epoch": 1.395191330849983,
"grad_norm": 1.21875,
"learning_rate": 1.2917224395782627e-05,
"loss": 2.0995,
"step": 2060
},
{
"epoch": 1.398577717575347,
"grad_norm": 1.1875,
"learning_rate": 1.2879498657687335e-05,
"loss": 2.1045,
"step": 2065
},
{
"epoch": 1.4019641043007112,
"grad_norm": 1.2109375,
"learning_rate": 1.284172817923756e-05,
"loss": 2.1057,
"step": 2070
},
{
"epoch": 1.4053504910260752,
"grad_norm": 1.2265625,
"learning_rate": 1.280391354729401e-05,
"loss": 2.1083,
"step": 2075
},
{
"epoch": 1.4087368777514393,
"grad_norm": 1.1875,
"learning_rate": 1.2766055349403415e-05,
"loss": 2.0772,
"step": 2080
},
{
"epoch": 1.4121232644768034,
"grad_norm": 1.1796875,
"learning_rate": 1.272815417378942e-05,
"loss": 2.0991,
"step": 2085
},
{
"epoch": 1.4155096512021672,
"grad_norm": 1.171875,
"learning_rate": 1.2690210609343439e-05,
"loss": 2.0951,
"step": 2090
},
{
"epoch": 1.4188960379275313,
"grad_norm": 1.1953125,
"learning_rate": 1.2652225245615503e-05,
"loss": 2.0951,
"step": 2095
},
{
"epoch": 1.4222824246528953,
"grad_norm": 1.1953125,
"learning_rate": 1.2614198672805105e-05,
"loss": 2.0926,
"step": 2100
},
{
"epoch": 1.4256688113782594,
"grad_norm": 1.171875,
"learning_rate": 1.2576131481752018e-05,
"loss": 2.0907,
"step": 2105
},
{
"epoch": 1.4290551981036235,
"grad_norm": 1.1875,
"learning_rate": 1.2538024263927128e-05,
"loss": 2.0992,
"step": 2110
},
{
"epoch": 1.4324415848289875,
"grad_norm": 1.203125,
"learning_rate": 1.2499877611423241e-05,
"loss": 2.0954,
"step": 2115
},
{
"epoch": 1.4358279715543514,
"grad_norm": 1.21875,
"learning_rate": 1.2461692116945878e-05,
"loss": 2.1015,
"step": 2120
},
{
"epoch": 1.4392143582797154,
"grad_norm": 1.2421875,
"learning_rate": 1.242346837380407e-05,
"loss": 2.086,
"step": 2125
},
{
"epoch": 1.4426007450050795,
"grad_norm": 1.1875,
"learning_rate": 1.238520697590114e-05,
"loss": 2.0897,
"step": 2130
},
{
"epoch": 1.4459871317304436,
"grad_norm": 1.2265625,
"learning_rate": 1.2346908517725472e-05,
"loss": 2.09,
"step": 2135
},
{
"epoch": 1.4493735184558076,
"grad_norm": 1.171875,
"learning_rate": 1.2308573594341275e-05,
"loss": 2.0932,
"step": 2140
},
{
"epoch": 1.4527599051811717,
"grad_norm": 1.1875,
"learning_rate": 1.227020280137934e-05,
"loss": 2.0823,
"step": 2145
},
{
"epoch": 1.4561462919065358,
"grad_norm": 1.2421875,
"learning_rate": 1.2231796735027781e-05,
"loss": 2.0954,
"step": 2150
},
{
"epoch": 1.4595326786318998,
"grad_norm": 1.2109375,
"learning_rate": 1.2193355992022778e-05,
"loss": 2.0976,
"step": 2155
},
{
"epoch": 1.462919065357264,
"grad_norm": 1.1953125,
"learning_rate": 1.21548811696393e-05,
"loss": 2.0933,
"step": 2160
},
{
"epoch": 1.466305452082628,
"grad_norm": 1.1953125,
"learning_rate": 1.2116372865681824e-05,
"loss": 2.097,
"step": 2165
},
{
"epoch": 1.4696918388079918,
"grad_norm": 1.2421875,
"learning_rate": 1.2077831678475047e-05,
"loss": 2.1055,
"step": 2170
},
{
"epoch": 1.4730782255333559,
"grad_norm": 1.1875,
"learning_rate": 1.2039258206854594e-05,
"loss": 2.0978,
"step": 2175
},
{
"epoch": 1.47646461225872,
"grad_norm": 1.203125,
"learning_rate": 1.2000653050157713e-05,
"loss": 2.0939,
"step": 2180
},
{
"epoch": 1.479850998984084,
"grad_norm": 1.2109375,
"learning_rate": 1.1962016808213958e-05,
"loss": 2.0923,
"step": 2185
},
{
"epoch": 1.483237385709448,
"grad_norm": 1.203125,
"learning_rate": 1.1923350081335871e-05,
"loss": 2.0995,
"step": 2190
},
{
"epoch": 1.486623772434812,
"grad_norm": 1.203125,
"learning_rate": 1.1884653470309655e-05,
"loss": 2.0904,
"step": 2195
},
{
"epoch": 1.490010159160176,
"grad_norm": 1.21875,
"learning_rate": 1.1845927576385838e-05,
"loss": 2.1105,
"step": 2200
},
{
"epoch": 1.49339654588554,
"grad_norm": 1.21875,
"learning_rate": 1.1807173001269938e-05,
"loss": 2.1011,
"step": 2205
},
{
"epoch": 1.4967829326109041,
"grad_norm": 1.1484375,
"learning_rate": 1.17683903471131e-05,
"loss": 2.1078,
"step": 2210
},
{
"epoch": 1.5001693193362682,
"grad_norm": 1.203125,
"learning_rate": 1.1729580216502765e-05,
"loss": 2.0945,
"step": 2215
},
{
"epoch": 1.5035557060616322,
"grad_norm": 1.1796875,
"learning_rate": 1.1690743212453265e-05,
"loss": 2.0874,
"step": 2220
},
{
"epoch": 1.5069420927869963,
"grad_norm": 1.21875,
"learning_rate": 1.1651879938396505e-05,
"loss": 2.1066,
"step": 2225
},
{
"epoch": 1.5103284795123604,
"grad_norm": 1.234375,
"learning_rate": 1.1612990998172544e-05,
"loss": 2.0968,
"step": 2230
},
{
"epoch": 1.5137148662377244,
"grad_norm": 1.203125,
"learning_rate": 1.1574076996020238e-05,
"loss": 2.0896,
"step": 2235
},
{
"epoch": 1.5171012529630885,
"grad_norm": 1.203125,
"learning_rate": 1.1535138536567845e-05,
"loss": 2.0805,
"step": 2240
},
{
"epoch": 1.5204876396884526,
"grad_norm": 1.1953125,
"learning_rate": 1.1496176224823623e-05,
"loss": 2.105,
"step": 2245
},
{
"epoch": 1.5238740264138164,
"grad_norm": 1.21875,
"learning_rate": 1.1457190666166443e-05,
"loss": 2.076,
"step": 2250
},
{
"epoch": 1.5272604131391805,
"grad_norm": 1.1875,
"learning_rate": 1.1418182466336372e-05,
"loss": 2.0935,
"step": 2255
},
{
"epoch": 1.5306467998645445,
"grad_norm": 1.203125,
"learning_rate": 1.1379152231425268e-05,
"loss": 2.064,
"step": 2260
},
{
"epoch": 1.5340331865899086,
"grad_norm": 1.203125,
"learning_rate": 1.1340100567867358e-05,
"loss": 2.1166,
"step": 2265
},
{
"epoch": 1.5374195733152725,
"grad_norm": 1.2109375,
"learning_rate": 1.1301028082429823e-05,
"loss": 2.1109,
"step": 2270
},
{
"epoch": 1.5408059600406365,
"grad_norm": 1.1953125,
"learning_rate": 1.1261935382203353e-05,
"loss": 2.0978,
"step": 2275
},
{
"epoch": 1.5441923467660006,
"grad_norm": 1.234375,
"learning_rate": 1.1222823074592737e-05,
"loss": 2.0733,
"step": 2280
},
{
"epoch": 1.5475787334913647,
"grad_norm": 1.2109375,
"learning_rate": 1.1183691767307412e-05,
"loss": 2.1078,
"step": 2285
},
{
"epoch": 1.5509651202167287,
"grad_norm": 1.2421875,
"learning_rate": 1.1144542068352026e-05,
"loss": 2.0953,
"step": 2290
},
{
"epoch": 1.5543515069420928,
"grad_norm": 1.234375,
"learning_rate": 1.1105374586016984e-05,
"loss": 2.1037,
"step": 2295
},
{
"epoch": 1.5577378936674569,
"grad_norm": 1.25,
"learning_rate": 1.1066189928868996e-05,
"loss": 2.1059,
"step": 2300
},
{
"epoch": 1.561124280392821,
"grad_norm": 1.171875,
"learning_rate": 1.102698870574164e-05,
"loss": 2.0744,
"step": 2305
},
{
"epoch": 1.564510667118185,
"grad_norm": 1.2109375,
"learning_rate": 1.0987771525725882e-05,
"loss": 2.0996,
"step": 2310
},
{
"epoch": 1.567897053843549,
"grad_norm": 1.1796875,
"learning_rate": 1.0948538998160614e-05,
"loss": 2.0866,
"step": 2315
},
{
"epoch": 1.5712834405689131,
"grad_norm": 1.1953125,
"learning_rate": 1.0909291732623202e-05,
"loss": 2.0929,
"step": 2320
},
{
"epoch": 1.574669827294277,
"grad_norm": 1.203125,
"learning_rate": 1.0870030338919995e-05,
"loss": 2.097,
"step": 2325
},
{
"epoch": 1.578056214019641,
"grad_norm": 1.203125,
"learning_rate": 1.0830755427076865e-05,
"loss": 2.0992,
"step": 2330
},
{
"epoch": 1.581442600745005,
"grad_norm": 1.2109375,
"learning_rate": 1.0791467607329713e-05,
"loss": 2.0996,
"step": 2335
},
{
"epoch": 1.5848289874703692,
"grad_norm": 1.1640625,
"learning_rate": 1.0752167490115012e-05,
"loss": 2.1022,
"step": 2340
},
{
"epoch": 1.588215374195733,
"grad_norm": 1.1875,
"learning_rate": 1.0712855686060293e-05,
"loss": 2.0803,
"step": 2345
},
{
"epoch": 1.591601760921097,
"grad_norm": 1.2265625,
"learning_rate": 1.0673532805974687e-05,
"loss": 2.0929,
"step": 2350
},
{
"epoch": 1.5949881476464611,
"grad_norm": 1.203125,
"learning_rate": 1.0634199460839403e-05,
"loss": 2.0993,
"step": 2355
},
{
"epoch": 1.5983745343718252,
"grad_norm": 1.2265625,
"learning_rate": 1.0594856261798264e-05,
"loss": 2.1005,
"step": 2360
},
{
"epoch": 1.6017609210971893,
"grad_norm": 1.1953125,
"learning_rate": 1.055550382014819e-05,
"loss": 2.074,
"step": 2365
},
{
"epoch": 1.6051473078225533,
"grad_norm": 1.2265625,
"learning_rate": 1.0516142747329715e-05,
"loss": 2.0856,
"step": 2370
},
{
"epoch": 1.6085336945479174,
"grad_norm": 1.2109375,
"learning_rate": 1.0476773654917476e-05,
"loss": 2.0722,
"step": 2375
},
{
"epoch": 1.6119200812732815,
"grad_norm": 1.2109375,
"learning_rate": 1.0437397154610717e-05,
"loss": 2.106,
"step": 2380
},
{
"epoch": 1.6153064679986455,
"grad_norm": 1.203125,
"learning_rate": 1.0398013858223784e-05,
"loss": 2.0995,
"step": 2385
},
{
"epoch": 1.6186928547240096,
"grad_norm": 1.21875,
"learning_rate": 1.035862437767661e-05,
"loss": 2.114,
"step": 2390
},
{
"epoch": 1.6220792414493737,
"grad_norm": 1.1953125,
"learning_rate": 1.0319229324985228e-05,
"loss": 2.118,
"step": 2395
},
{
"epoch": 1.6254656281747377,
"grad_norm": 1.1953125,
"learning_rate": 1.027982931225223e-05,
"loss": 2.0772,
"step": 2400
},
{
"epoch": 1.6288520149001016,
"grad_norm": 1.2109375,
"learning_rate": 1.0240424951657295e-05,
"loss": 2.1139,
"step": 2405
},
{
"epoch": 1.6322384016254656,
"grad_norm": 1.21875,
"learning_rate": 1.020101685544764e-05,
"loss": 2.0946,
"step": 2410
},
{
"epoch": 1.6356247883508297,
"grad_norm": 1.2109375,
"learning_rate": 1.0161605635928538e-05,
"loss": 2.1095,
"step": 2415
},
{
"epoch": 1.6390111750761935,
"grad_norm": 1.2265625,
"learning_rate": 1.012219190545378e-05,
"loss": 2.1012,
"step": 2420
},
{
"epoch": 1.6423975618015576,
"grad_norm": 1.171875,
"learning_rate": 1.0082776276416177e-05,
"loss": 2.1011,
"step": 2425
},
{
"epoch": 1.6457839485269217,
"grad_norm": 1.2265625,
"learning_rate": 1.0043359361238036e-05,
"loss": 2.0969,
"step": 2430
},
{
"epoch": 1.6491703352522857,
"grad_norm": 1.203125,
"learning_rate": 1.0003941772361651e-05,
"loss": 2.0733,
"step": 2435
},
{
"epoch": 1.6525567219776498,
"grad_norm": 1.1953125,
"learning_rate": 9.96452412223978e-06,
"loss": 2.095,
"step": 2440
},
{
"epoch": 1.6559431087030139,
"grad_norm": 1.1953125,
"learning_rate": 9.925107023326137e-06,
"loss": 2.0997,
"step": 2445
},
{
"epoch": 1.659329495428378,
"grad_norm": 1.1796875,
"learning_rate": 9.885691088065866e-06,
"loss": 2.1065,
"step": 2450
},
{
"epoch": 1.662715882153742,
"grad_norm": 1.265625,
"learning_rate": 9.846276928886032e-06,
"loss": 2.1,
"step": 2455
},
{
"epoch": 1.666102268879106,
"grad_norm": 1.1796875,
"learning_rate": 9.80686515818611e-06,
"loss": 2.0979,
"step": 2460
},
{
"epoch": 1.6694886556044701,
"grad_norm": 1.2109375,
"learning_rate": 9.767456388328454e-06,
"loss": 2.0939,
"step": 2465
},
{
"epoch": 1.6728750423298342,
"grad_norm": 1.1796875,
"learning_rate": 9.728051231628802e-06,
"loss": 2.0998,
"step": 2470
},
{
"epoch": 1.6762614290551983,
"grad_norm": 1.25,
"learning_rate": 9.688650300346749e-06,
"loss": 2.0978,
"step": 2475
},
{
"epoch": 1.679647815780562,
"grad_norm": 1.234375,
"learning_rate": 9.649254206676226e-06,
"loss": 2.0911,
"step": 2480
},
{
"epoch": 1.6830342025059262,
"grad_norm": 1.2109375,
"learning_rate": 9.609863562736023e-06,
"loss": 2.0819,
"step": 2485
},
{
"epoch": 1.6864205892312902,
"grad_norm": 1.1953125,
"learning_rate": 9.570478980560233e-06,
"loss": 2.1249,
"step": 2490
},
{
"epoch": 1.6898069759566543,
"grad_norm": 1.1640625,
"learning_rate": 9.531101072088779e-06,
"loss": 2.0797,
"step": 2495
},
{
"epoch": 1.6931933626820181,
"grad_norm": 1.1875,
"learning_rate": 9.491730449157878e-06,
"loss": 2.0914,
"step": 2500
},
{
"epoch": 1.6965797494073822,
"grad_norm": 1.203125,
"learning_rate": 9.452367723490553e-06,
"loss": 2.1069,
"step": 2505
},
{
"epoch": 1.6999661361327463,
"grad_norm": 1.1875,
"learning_rate": 9.41301350668713e-06,
"loss": 2.1065,
"step": 2510
},
{
"epoch": 1.7033525228581103,
"grad_norm": 1.234375,
"learning_rate": 9.373668410215717e-06,
"loss": 2.0948,
"step": 2515
},
{
"epoch": 1.7067389095834744,
"grad_norm": 1.2109375,
"learning_rate": 9.334333045402721e-06,
"loss": 2.1243,
"step": 2520
},
{
"epoch": 1.7101252963088385,
"grad_norm": 1.25,
"learning_rate": 9.29500802342334e-06,
"loss": 2.09,
"step": 2525
},
{
"epoch": 1.7135116830342025,
"grad_norm": 1.203125,
"learning_rate": 9.255693955292072e-06,
"loss": 2.0846,
"step": 2530
},
{
"epoch": 1.7168980697595666,
"grad_norm": 1.1796875,
"learning_rate": 9.216391451853218e-06,
"loss": 2.0937,
"step": 2535
},
{
"epoch": 1.7202844564849307,
"grad_norm": 1.2109375,
"learning_rate": 9.177101123771387e-06,
"loss": 2.0945,
"step": 2540
},
{
"epoch": 1.7236708432102947,
"grad_norm": 1.21875,
"learning_rate": 9.137823581522024e-06,
"loss": 2.0997,
"step": 2545
},
{
"epoch": 1.7270572299356588,
"grad_norm": 1.21875,
"learning_rate": 9.098559435381903e-06,
"loss": 2.0878,
"step": 2550
},
{
"epoch": 1.7304436166610226,
"grad_norm": 1.203125,
"learning_rate": 9.059309295419665e-06,
"loss": 2.0783,
"step": 2555
},
{
"epoch": 1.7338300033863867,
"grad_norm": 1.203125,
"learning_rate": 9.020073771486319e-06,
"loss": 2.0842,
"step": 2560
},
{
"epoch": 1.7372163901117508,
"grad_norm": 1.2109375,
"learning_rate": 8.980853473205776e-06,
"loss": 2.1148,
"step": 2565
},
{
"epoch": 1.7406027768371148,
"grad_norm": 1.2109375,
"learning_rate": 8.941649009965393e-06,
"loss": 2.1094,
"step": 2570
},
{
"epoch": 1.7439891635624787,
"grad_norm": 1.1875,
"learning_rate": 8.902460990906474e-06,
"loss": 2.0912,
"step": 2575
},
{
"epoch": 1.7473755502878427,
"grad_norm": 1.2265625,
"learning_rate": 8.863290024914828e-06,
"loss": 2.0935,
"step": 2580
},
{
"epoch": 1.7507619370132068,
"grad_norm": 1.2109375,
"learning_rate": 8.8241367206113e-06,
"loss": 2.0992,
"step": 2585
},
{
"epoch": 1.7541483237385709,
"grad_norm": 1.21875,
"learning_rate": 8.78500168634231e-06,
"loss": 2.0929,
"step": 2590
},
{
"epoch": 1.757534710463935,
"grad_norm": 1.1796875,
"learning_rate": 8.745885530170418e-06,
"loss": 2.1136,
"step": 2595
},
{
"epoch": 1.760921097189299,
"grad_norm": 1.1640625,
"learning_rate": 8.70678885986485e-06,
"loss": 2.0984,
"step": 2600
},
{
"epoch": 1.764307483914663,
"grad_norm": 1.203125,
"learning_rate": 8.667712282892084e-06,
"loss": 2.089,
"step": 2605
},
{
"epoch": 1.7676938706400271,
"grad_norm": 1.1796875,
"learning_rate": 8.628656406406388e-06,
"loss": 2.0945,
"step": 2610
},
{
"epoch": 1.7710802573653912,
"grad_norm": 1.1875,
"learning_rate": 8.589621837240393e-06,
"loss": 2.0941,
"step": 2615
},
{
"epoch": 1.7744666440907553,
"grad_norm": 1.234375,
"learning_rate": 8.550609181895679e-06,
"loss": 2.1016,
"step": 2620
},
{
"epoch": 1.7778530308161193,
"grad_norm": 1.1953125,
"learning_rate": 8.511619046533319e-06,
"loss": 2.082,
"step": 2625
},
{
"epoch": 1.7812394175414832,
"grad_norm": 1.21875,
"learning_rate": 8.472652036964504e-06,
"loss": 2.0971,
"step": 2630
},
{
"epoch": 1.7846258042668472,
"grad_norm": 1.2109375,
"learning_rate": 8.433708758641085e-06,
"loss": 2.1035,
"step": 2635
},
{
"epoch": 1.7880121909922113,
"grad_norm": 1.2734375,
"learning_rate": 8.394789816646207e-06,
"loss": 2.1157,
"step": 2640
},
{
"epoch": 1.7913985777175754,
"grad_norm": 1.1796875,
"learning_rate": 8.355895815684875e-06,
"loss": 2.0962,
"step": 2645
},
{
"epoch": 1.7947849644429392,
"grad_norm": 1.1953125,
"learning_rate": 8.31702736007457e-06,
"loss": 2.1044,
"step": 2650
},
{
"epoch": 1.7981713511683033,
"grad_norm": 1.2109375,
"learning_rate": 8.278185053735874e-06,
"loss": 2.0846,
"step": 2655
},
{
"epoch": 1.8015577378936674,
"grad_norm": 1.1953125,
"learning_rate": 8.239369500183057e-06,
"loss": 2.0979,
"step": 2660
},
{
"epoch": 1.8049441246190314,
"grad_norm": 1.1953125,
"learning_rate": 8.200581302514733e-06,
"loss": 2.1079,
"step": 2665
},
{
"epoch": 1.8083305113443955,
"grad_norm": 1.265625,
"learning_rate": 8.161821063404458e-06,
"loss": 2.1269,
"step": 2670
},
{
"epoch": 1.8117168980697596,
"grad_norm": 1.2265625,
"learning_rate": 8.123089385091388e-06,
"loss": 2.1159,
"step": 2675
},
{
"epoch": 1.8151032847951236,
"grad_norm": 1.21875,
"learning_rate": 8.084386869370917e-06,
"loss": 2.1139,
"step": 2680
},
{
"epoch": 1.8184896715204877,
"grad_norm": 1.1953125,
"learning_rate": 8.04571411758531e-06,
"loss": 2.105,
"step": 2685
},
{
"epoch": 1.8218760582458517,
"grad_norm": 1.1875,
"learning_rate": 8.007071730614395e-06,
"loss": 2.0957,
"step": 2690
},
{
"epoch": 1.8252624449712158,
"grad_norm": 1.1796875,
"learning_rate": 7.968460308866187e-06,
"loss": 2.1123,
"step": 2695
},
{
"epoch": 1.8286488316965799,
"grad_norm": 1.2109375,
"learning_rate": 7.92988045226758e-06,
"loss": 2.1086,
"step": 2700
},
{
"epoch": 1.832035218421944,
"grad_norm": 1.1796875,
"learning_rate": 7.891332760255033e-06,
"loss": 2.0779,
"step": 2705
},
{
"epoch": 1.8354216051473078,
"grad_norm": 1.234375,
"learning_rate": 7.852817831765235e-06,
"loss": 2.0877,
"step": 2710
},
{
"epoch": 1.8388079918726719,
"grad_norm": 1.1875,
"learning_rate": 7.814336265225819e-06,
"loss": 2.0794,
"step": 2715
},
{
"epoch": 1.842194378598036,
"grad_norm": 1.1640625,
"learning_rate": 7.775888658546046e-06,
"loss": 2.098,
"step": 2720
},
{
"epoch": 1.8455807653234,
"grad_norm": 1.1796875,
"learning_rate": 7.737475609107528e-06,
"loss": 2.0884,
"step": 2725
},
{
"epoch": 1.8489671520487638,
"grad_norm": 1.1953125,
"learning_rate": 7.69909771375495e-06,
"loss": 2.1047,
"step": 2730
},
{
"epoch": 1.852353538774128,
"grad_norm": 1.1796875,
"learning_rate": 7.660755568786771e-06,
"loss": 2.0949,
"step": 2735
},
{
"epoch": 1.855739925499492,
"grad_norm": 1.1953125,
"learning_rate": 7.6224497699459965e-06,
"loss": 2.0708,
"step": 2740
},
{
"epoch": 1.859126312224856,
"grad_norm": 1.265625,
"learning_rate": 7.584180912410888e-06,
"loss": 2.0953,
"step": 2745
},
{
"epoch": 1.86251269895022,
"grad_norm": 1.1796875,
"learning_rate": 7.545949590785737e-06,
"loss": 2.0715,
"step": 2750
},
{
"epoch": 1.8658990856755842,
"grad_norm": 1.21875,
"learning_rate": 7.507756399091614e-06,
"loss": 2.0796,
"step": 2755
},
{
"epoch": 1.8692854724009482,
"grad_norm": 1.2109375,
"learning_rate": 7.469601930757142e-06,
"loss": 2.0941,
"step": 2760
},
{
"epoch": 1.8726718591263123,
"grad_norm": 1.203125,
"learning_rate": 7.431486778609291e-06,
"loss": 2.0884,
"step": 2765
},
{
"epoch": 1.8760582458516764,
"grad_norm": 1.2265625,
"learning_rate": 7.393411534864139e-06,
"loss": 2.1029,
"step": 2770
},
{
"epoch": 1.8794446325770404,
"grad_norm": 1.1875,
"learning_rate": 7.355376791117692e-06,
"loss": 2.0997,
"step": 2775
},
{
"epoch": 1.8828310193024045,
"grad_norm": 1.1953125,
"learning_rate": 7.3173831383366855e-06,
"loss": 2.0989,
"step": 2780
},
{
"epoch": 1.8862174060277683,
"grad_norm": 1.2109375,
"learning_rate": 7.279431166849394e-06,
"loss": 2.1009,
"step": 2785
},
{
"epoch": 1.8896037927531324,
"grad_norm": 1.1875,
"learning_rate": 7.241521466336485e-06,
"loss": 2.1078,
"step": 2790
},
{
"epoch": 1.8929901794784965,
"grad_norm": 1.203125,
"learning_rate": 7.203654625821815e-06,
"loss": 2.0946,
"step": 2795
},
{
"epoch": 1.8963765662038605,
"grad_norm": 1.15625,
"learning_rate": 7.165831233663324e-06,
"loss": 2.0832,
"step": 2800
},
{
"epoch": 1.8997629529292244,
"grad_norm": 1.21875,
"learning_rate": 7.1280518775438555e-06,
"loss": 2.0737,
"step": 2805
},
{
"epoch": 1.9031493396545884,
"grad_norm": 1.203125,
"learning_rate": 7.090317144462045e-06,
"loss": 2.1022,
"step": 2810
},
{
"epoch": 1.9065357263799525,
"grad_norm": 1.1875,
"learning_rate": 7.052627620723196e-06,
"loss": 2.1055,
"step": 2815
},
{
"epoch": 1.9099221131053166,
"grad_norm": 1.234375,
"learning_rate": 7.014983891930167e-06,
"loss": 2.1058,
"step": 2820
},
{
"epoch": 1.9133084998306806,
"grad_norm": 1.1953125,
"learning_rate": 6.977386542974285e-06,
"loss": 2.0923,
"step": 2825
},
{
"epoch": 1.9166948865560447,
"grad_norm": 1.2265625,
"learning_rate": 6.939836158026236e-06,
"loss": 2.1055,
"step": 2830
},
{
"epoch": 1.9200812732814088,
"grad_norm": 1.1953125,
"learning_rate": 6.9023333205270024e-06,
"loss": 2.113,
"step": 2835
},
{
"epoch": 1.9234676600067728,
"grad_norm": 1.2265625,
"learning_rate": 6.864878613178805e-06,
"loss": 2.103,
"step": 2840
},
{
"epoch": 1.926854046732137,
"grad_norm": 1.2109375,
"learning_rate": 6.8274726179360285e-06,
"loss": 2.0934,
"step": 2845
},
{
"epoch": 1.930240433457501,
"grad_norm": 1.2109375,
"learning_rate": 6.790115915996199e-06,
"loss": 2.0839,
"step": 2850
},
{
"epoch": 1.933626820182865,
"grad_norm": 1.234375,
"learning_rate": 6.752809087790934e-06,
"loss": 2.0942,
"step": 2855
},
{
"epoch": 1.9370132069082289,
"grad_norm": 1.2265625,
"learning_rate": 6.715552712976956e-06,
"loss": 2.0843,
"step": 2860
},
{
"epoch": 1.940399593633593,
"grad_norm": 1.203125,
"learning_rate": 6.678347370427047e-06,
"loss": 2.0934,
"step": 2865
},
{
"epoch": 1.943785980358957,
"grad_norm": 1.1796875,
"learning_rate": 6.641193638221075e-06,
"loss": 2.0906,
"step": 2870
},
{
"epoch": 1.947172367084321,
"grad_norm": 1.2109375,
"learning_rate": 6.604092093637018e-06,
"loss": 2.1081,
"step": 2875
},
{
"epoch": 1.950558753809685,
"grad_norm": 1.203125,
"learning_rate": 6.567043313141976e-06,
"loss": 2.0758,
"step": 2880
},
{
"epoch": 1.953945140535049,
"grad_norm": 1.1953125,
"learning_rate": 6.5300478723832385e-06,
"loss": 2.0903,
"step": 2885
},
{
"epoch": 1.957331527260413,
"grad_norm": 1.15625,
"learning_rate": 6.493106346179312e-06,
"loss": 2.1073,
"step": 2890
},
{
"epoch": 1.960717913985777,
"grad_norm": 1.203125,
"learning_rate": 6.456219308511005e-06,
"loss": 2.088,
"step": 2895
},
{
"epoch": 1.9641043007111412,
"grad_norm": 1.1875,
"learning_rate": 6.4193873325125186e-06,
"loss": 2.0892,
"step": 2900
},
{
"epoch": 1.9674906874365052,
"grad_norm": 1.234375,
"learning_rate": 6.382610990462519e-06,
"loss": 2.1027,
"step": 2905
},
{
"epoch": 1.9708770741618693,
"grad_norm": 1.2421875,
"learning_rate": 6.3458908537752616e-06,
"loss": 2.0747,
"step": 2910
},
{
"epoch": 1.9742634608872334,
"grad_norm": 1.203125,
"learning_rate": 6.309227492991708e-06,
"loss": 2.1045,
"step": 2915
},
{
"epoch": 1.9776498476125974,
"grad_norm": 1.203125,
"learning_rate": 6.272621477770655e-06,
"loss": 2.1038,
"step": 2920
},
{
"epoch": 1.9810362343379615,
"grad_norm": 1.1875,
"learning_rate": 6.236073376879907e-06,
"loss": 2.0945,
"step": 2925
},
{
"epoch": 1.9844226210633256,
"grad_norm": 1.1875,
"learning_rate": 6.1995837581874e-06,
"loss": 2.0771,
"step": 2930
},
{
"epoch": 1.9878090077886896,
"grad_norm": 1.203125,
"learning_rate": 6.1631531886524175e-06,
"loss": 2.0948,
"step": 2935
},
{
"epoch": 1.9911953945140535,
"grad_norm": 1.1875,
"learning_rate": 6.126782234316752e-06,
"loss": 2.0919,
"step": 2940
},
{
"epoch": 1.9945817812394175,
"grad_norm": 1.1953125,
"learning_rate": 6.090471460295928e-06,
"loss": 2.0973,
"step": 2945
},
{
"epoch": 1.9979681679647816,
"grad_norm": 1.1640625,
"learning_rate": 6.054221430770416e-06,
"loss": 2.0925,
"step": 2950
},
{
"epoch": 2.0,
"eval_loss": 2.1174328327178955,
"eval_runtime": 85.4558,
"eval_samples_per_second": 15.271,
"eval_steps_per_second": 1.919,
"step": 2953
},
{
"epoch": 2.0013545546901454,
"grad_norm": 1.2265625,
"learning_rate": 6.018032708976857e-06,
"loss": 2.0953,
"step": 2955
},
{
"epoch": 2.0047409414155095,
"grad_norm": 1.171875,
"learning_rate": 5.981905857199335e-06,
"loss": 2.0947,
"step": 2960
},
{
"epoch": 2.0081273281408736,
"grad_norm": 1.21875,
"learning_rate": 5.945841436760612e-06,
"loss": 2.1016,
"step": 2965
},
{
"epoch": 2.0115137148662376,
"grad_norm": 1.1953125,
"learning_rate": 5.909840008013428e-06,
"loss": 2.0855,
"step": 2970
},
{
"epoch": 2.0149001015916017,
"grad_norm": 1.2109375,
"learning_rate": 5.8739021303317825e-06,
"loss": 2.0817,
"step": 2975
},
{
"epoch": 2.0182864883169658,
"grad_norm": 1.2109375,
"learning_rate": 5.838028362102246e-06,
"loss": 2.0723,
"step": 2980
},
{
"epoch": 2.02167287504233,
"grad_norm": 1.1875,
"learning_rate": 5.80221926071529e-06,
"loss": 2.0643,
"step": 2985
},
{
"epoch": 2.025059261767694,
"grad_norm": 1.203125,
"learning_rate": 5.7664753825566155e-06,
"loss": 2.0711,
"step": 2990
},
{
"epoch": 2.028445648493058,
"grad_norm": 1.2265625,
"learning_rate": 5.7307972829985234e-06,
"loss": 2.1014,
"step": 2995
},
{
"epoch": 2.031832035218422,
"grad_norm": 1.1953125,
"learning_rate": 5.69518551639127e-06,
"loss": 2.0799,
"step": 3000
},
{
"epoch": 2.035218421943786,
"grad_norm": 1.171875,
"learning_rate": 5.659640636054453e-06,
"loss": 2.0754,
"step": 3005
},
{
"epoch": 2.03860480866915,
"grad_norm": 1.203125,
"learning_rate": 5.624163194268441e-06,
"loss": 2.0847,
"step": 3010
},
{
"epoch": 2.0419911953945142,
"grad_norm": 1.1640625,
"learning_rate": 5.588753742265751e-06,
"loss": 2.082,
"step": 3015
},
{
"epoch": 2.0453775821198783,
"grad_norm": 1.203125,
"learning_rate": 5.553412830222523e-06,
"loss": 2.0866,
"step": 3020
},
{
"epoch": 2.048763968845242,
"grad_norm": 1.2109375,
"learning_rate": 5.518141007249944e-06,
"loss": 2.1161,
"step": 3025
},
{
"epoch": 2.052150355570606,
"grad_norm": 1.1953125,
"learning_rate": 5.482938821385722e-06,
"loss": 2.073,
"step": 3030
},
{
"epoch": 2.05553674229597,
"grad_norm": 1.25,
"learning_rate": 5.44780681958559e-06,
"loss": 2.0779,
"step": 3035
},
{
"epoch": 2.058923129021334,
"grad_norm": 1.2109375,
"learning_rate": 5.412745547714777e-06,
"loss": 2.0771,
"step": 3040
},
{
"epoch": 2.062309515746698,
"grad_norm": 1.1953125,
"learning_rate": 5.377755550539559e-06,
"loss": 2.0847,
"step": 3045
},
{
"epoch": 2.0656959024720623,
"grad_norm": 1.2265625,
"learning_rate": 5.3428373717187575e-06,
"loss": 2.1026,
"step": 3050
},
{
"epoch": 2.0690822891974263,
"grad_norm": 1.2109375,
"learning_rate": 5.307991553795328e-06,
"loss": 2.1031,
"step": 3055
},
{
"epoch": 2.0724686759227904,
"grad_norm": 1.1875,
"learning_rate": 5.2732186381879115e-06,
"loss": 2.0797,
"step": 3060
},
{
"epoch": 2.0758550626481544,
"grad_norm": 1.21875,
"learning_rate": 5.238519165182415e-06,
"loss": 2.0848,
"step": 3065
},
{
"epoch": 2.0792414493735185,
"grad_norm": 1.203125,
"learning_rate": 5.203893673923649e-06,
"loss": 2.0678,
"step": 3070
},
{
"epoch": 2.0826278360988826,
"grad_norm": 1.21875,
"learning_rate": 5.169342702406907e-06,
"loss": 2.0913,
"step": 3075
},
{
"epoch": 2.0860142228242466,
"grad_norm": 1.171875,
"learning_rate": 5.1348667874696455e-06,
"loss": 2.0952,
"step": 3080
},
{
"epoch": 2.0894006095496107,
"grad_norm": 1.2421875,
"learning_rate": 5.100466464783113e-06,
"loss": 2.0562,
"step": 3085
},
{
"epoch": 2.0927869962749748,
"grad_norm": 1.1953125,
"learning_rate": 5.066142268844043e-06,
"loss": 2.0847,
"step": 3090
},
{
"epoch": 2.096173383000339,
"grad_norm": 1.21875,
"learning_rate": 5.0318947329663545e-06,
"loss": 2.1097,
"step": 3095
},
{
"epoch": 2.0995597697257025,
"grad_norm": 1.1796875,
"learning_rate": 4.997724389272841e-06,
"loss": 2.0795,
"step": 3100
},
{
"epoch": 2.1029461564510665,
"grad_norm": 1.1953125,
"learning_rate": 4.963631768686937e-06,
"loss": 2.0798,
"step": 3105
},
{
"epoch": 2.1063325431764306,
"grad_norm": 1.203125,
"learning_rate": 4.929617400924435e-06,
"loss": 2.1103,
"step": 3110
},
{
"epoch": 2.1097189299017947,
"grad_norm": 1.2265625,
"learning_rate": 4.895681814485277e-06,
"loss": 2.0683,
"step": 3115
},
{
"epoch": 2.1131053166271587,
"grad_norm": 1.203125,
"learning_rate": 4.861825536645335e-06,
"loss": 2.0845,
"step": 3120
},
{
"epoch": 2.116491703352523,
"grad_norm": 1.1875,
"learning_rate": 4.828049093448216e-06,
"loss": 2.0827,
"step": 3125
},
{
"epoch": 2.119878090077887,
"grad_norm": 1.2265625,
"learning_rate": 4.794353009697105e-06,
"loss": 2.0843,
"step": 3130
},
{
"epoch": 2.123264476803251,
"grad_norm": 1.21875,
"learning_rate": 4.760737808946585e-06,
"loss": 2.0966,
"step": 3135
},
{
"epoch": 2.126650863528615,
"grad_norm": 1.234375,
"learning_rate": 4.727204013494515e-06,
"loss": 2.1099,
"step": 3140
},
{
"epoch": 2.130037250253979,
"grad_norm": 1.1875,
"learning_rate": 4.693752144373929e-06,
"loss": 2.0737,
"step": 3145
},
{
"epoch": 2.133423636979343,
"grad_norm": 1.2265625,
"learning_rate": 4.660382721344909e-06,
"loss": 2.0899,
"step": 3150
},
{
"epoch": 2.136810023704707,
"grad_norm": 1.234375,
"learning_rate": 4.627096262886542e-06,
"loss": 2.0803,
"step": 3155
},
{
"epoch": 2.1401964104300712,
"grad_norm": 1.234375,
"learning_rate": 4.59389328618884e-06,
"loss": 2.0913,
"step": 3160
},
{
"epoch": 2.1435827971554353,
"grad_norm": 1.203125,
"learning_rate": 4.5607743071447085e-06,
"loss": 2.0812,
"step": 3165
},
{
"epoch": 2.1469691838807994,
"grad_norm": 1.2109375,
"learning_rate": 4.52773984034195e-06,
"loss": 2.0924,
"step": 3170
},
{
"epoch": 2.1503555706061634,
"grad_norm": 1.2421875,
"learning_rate": 4.494790399055242e-06,
"loss": 2.1026,
"step": 3175
},
{
"epoch": 2.153741957331527,
"grad_norm": 1.203125,
"learning_rate": 4.461926495238178e-06,
"loss": 2.0782,
"step": 3180
},
{
"epoch": 2.157128344056891,
"grad_norm": 1.2265625,
"learning_rate": 4.429148639515304e-06,
"loss": 2.0978,
"step": 3185
},
{
"epoch": 2.160514730782255,
"grad_norm": 1.2109375,
"learning_rate": 4.3964573411741994e-06,
"loss": 2.0553,
"step": 3190
},
{
"epoch": 2.1639011175076193,
"grad_norm": 1.234375,
"learning_rate": 4.363853108157545e-06,
"loss": 2.0731,
"step": 3195
},
{
"epoch": 2.1672875042329833,
"grad_norm": 1.1875,
"learning_rate": 4.33133644705524e-06,
"loss": 2.0734,
"step": 3200
},
{
"epoch": 2.1706738909583474,
"grad_norm": 1.234375,
"learning_rate": 4.29890786309654e-06,
"loss": 2.1203,
"step": 3205
},
{
"epoch": 2.1740602776837115,
"grad_norm": 1.25,
"learning_rate": 4.266567860142184e-06,
"loss": 2.0883,
"step": 3210
},
{
"epoch": 2.1774466644090755,
"grad_norm": 1.2109375,
"learning_rate": 4.234316940676592e-06,
"loss": 2.0996,
"step": 3215
},
{
"epoch": 2.1808330511344396,
"grad_norm": 1.21875,
"learning_rate": 4.202155605800035e-06,
"loss": 2.0974,
"step": 3220
},
{
"epoch": 2.1842194378598037,
"grad_norm": 1.21875,
"learning_rate": 4.170084355220862e-06,
"loss": 2.0774,
"step": 3225
},
{
"epoch": 2.1876058245851677,
"grad_norm": 1.25,
"learning_rate": 4.13810368724774e-06,
"loss": 2.0841,
"step": 3230
},
{
"epoch": 2.190992211310532,
"grad_norm": 1.2109375,
"learning_rate": 4.1062140987818934e-06,
"loss": 2.1023,
"step": 3235
},
{
"epoch": 2.194378598035896,
"grad_norm": 1.21875,
"learning_rate": 4.074416085309399e-06,
"loss": 2.0651,
"step": 3240
},
{
"epoch": 2.19776498476126,
"grad_norm": 1.1875,
"learning_rate": 4.042710140893482e-06,
"loss": 2.0787,
"step": 3245
},
{
"epoch": 2.201151371486624,
"grad_norm": 1.203125,
"learning_rate": 4.011096758166837e-06,
"loss": 2.066,
"step": 3250
},
{
"epoch": 2.2045377582119876,
"grad_norm": 1.21875,
"learning_rate": 3.979576428323988e-06,
"loss": 2.1087,
"step": 3255
},
{
"epoch": 2.2079241449373517,
"grad_norm": 1.203125,
"learning_rate": 3.948149641113628e-06,
"loss": 2.0741,
"step": 3260
},
{
"epoch": 2.2113105316627157,
"grad_norm": 1.203125,
"learning_rate": 3.916816884831044e-06,
"loss": 2.0685,
"step": 3265
},
{
"epoch": 2.21469691838808,
"grad_norm": 1.203125,
"learning_rate": 3.8855786463105015e-06,
"loss": 2.0564,
"step": 3270
},
{
"epoch": 2.218083305113444,
"grad_norm": 1.203125,
"learning_rate": 3.854435410917689e-06,
"loss": 2.0938,
"step": 3275
},
{
"epoch": 2.221469691838808,
"grad_norm": 1.234375,
"learning_rate": 3.823387662542196e-06,
"loss": 2.08,
"step": 3280
},
{
"epoch": 2.224856078564172,
"grad_norm": 1.234375,
"learning_rate": 3.7924358835899556e-06,
"loss": 2.0755,
"step": 3285
},
{
"epoch": 2.228242465289536,
"grad_norm": 1.2265625,
"learning_rate": 3.76158055497579e-06,
"loss": 2.0729,
"step": 3290
},
{
"epoch": 2.2316288520149,
"grad_norm": 1.2421875,
"learning_rate": 3.730822156115904e-06,
"loss": 2.0929,
"step": 3295
},
{
"epoch": 2.235015238740264,
"grad_norm": 1.25,
"learning_rate": 3.7001611649204685e-06,
"loss": 2.1104,
"step": 3300
},
{
"epoch": 2.2384016254656283,
"grad_norm": 1.234375,
"learning_rate": 3.6695980577861614e-06,
"loss": 2.0777,
"step": 3305
},
{
"epoch": 2.2417880121909923,
"grad_norm": 1.234375,
"learning_rate": 3.6391333095887917e-06,
"loss": 2.0583,
"step": 3310
},
{
"epoch": 2.2451743989163564,
"grad_norm": 1.203125,
"learning_rate": 3.6087673936759084e-06,
"loss": 2.0685,
"step": 3315
},
{
"epoch": 2.2485607856417205,
"grad_norm": 1.2109375,
"learning_rate": 3.578500781859444e-06,
"loss": 2.0925,
"step": 3320
},
{
"epoch": 2.2519471723670845,
"grad_norm": 1.203125,
"learning_rate": 3.548333944408404e-06,
"loss": 2.0788,
"step": 3325
},
{
"epoch": 2.255333559092448,
"grad_norm": 1.21875,
"learning_rate": 3.518267350041529e-06,
"loss": 2.0847,
"step": 3330
},
{
"epoch": 2.258719945817812,
"grad_norm": 1.2109375,
"learning_rate": 3.488301465920031e-06,
"loss": 2.0857,
"step": 3335
},
{
"epoch": 2.2621063325431763,
"grad_norm": 1.234375,
"learning_rate": 3.458436757640341e-06,
"loss": 2.0855,
"step": 3340
},
{
"epoch": 2.2654927192685403,
"grad_norm": 1.2421875,
"learning_rate": 3.428673689226851e-06,
"loss": 2.1017,
"step": 3345
},
{
"epoch": 2.2688791059939044,
"grad_norm": 1.21875,
"learning_rate": 3.3990127231247337e-06,
"loss": 2.0951,
"step": 3350
},
{
"epoch": 2.2722654927192685,
"grad_norm": 1.234375,
"learning_rate": 3.3694543201927288e-06,
"loss": 2.0723,
"step": 3355
},
{
"epoch": 2.2756518794446325,
"grad_norm": 1.2421875,
"learning_rate": 3.3399989396959988e-06,
"loss": 2.071,
"step": 3360
},
{
"epoch": 2.2790382661699966,
"grad_norm": 1.2109375,
"learning_rate": 3.3106470392989977e-06,
"loss": 2.0926,
"step": 3365
},
{
"epoch": 2.2824246528953607,
"grad_norm": 1.1953125,
"learning_rate": 3.2813990750583434e-06,
"loss": 2.0823,
"step": 3370
},
{
"epoch": 2.2858110396207247,
"grad_norm": 1.21875,
"learning_rate": 3.2522555014157454e-06,
"loss": 2.0888,
"step": 3375
},
{
"epoch": 2.289197426346089,
"grad_norm": 1.1796875,
"learning_rate": 3.2232167711909368e-06,
"loss": 2.0763,
"step": 3380
},
{
"epoch": 2.292583813071453,
"grad_norm": 1.1875,
"learning_rate": 3.194283335574637e-06,
"loss": 2.0701,
"step": 3385
},
{
"epoch": 2.295970199796817,
"grad_norm": 1.2265625,
"learning_rate": 3.1654556441215577e-06,
"loss": 2.0877,
"step": 3390
},
{
"epoch": 2.299356586522181,
"grad_norm": 1.234375,
"learning_rate": 3.1367341447433897e-06,
"loss": 2.0859,
"step": 3395
},
{
"epoch": 2.302742973247545,
"grad_norm": 1.2265625,
"learning_rate": 3.1081192837018736e-06,
"loss": 2.0796,
"step": 3400
},
{
"epoch": 2.3061293599729087,
"grad_norm": 1.1953125,
"learning_rate": 3.0796115056018383e-06,
"loss": 2.0888,
"step": 3405
},
{
"epoch": 2.3095157466982728,
"grad_norm": 1.203125,
"learning_rate": 3.0512112533843185e-06,
"loss": 2.0851,
"step": 3410
},
{
"epoch": 2.312902133423637,
"grad_norm": 1.21875,
"learning_rate": 3.0229189683196513e-06,
"loss": 2.0888,
"step": 3415
},
{
"epoch": 2.316288520149001,
"grad_norm": 1.1875,
"learning_rate": 2.994735090000629e-06,
"loss": 2.0817,
"step": 3420
},
{
"epoch": 2.319674906874365,
"grad_norm": 1.2265625,
"learning_rate": 2.9666600563356773e-06,
"loss": 2.0871,
"step": 3425
},
{
"epoch": 2.323061293599729,
"grad_norm": 1.203125,
"learning_rate": 2.938694303542032e-06,
"loss": 2.0758,
"step": 3430
},
{
"epoch": 2.326447680325093,
"grad_norm": 1.25,
"learning_rate": 2.910838266138979e-06,
"loss": 2.08,
"step": 3435
},
{
"epoch": 2.329834067050457,
"grad_norm": 1.2109375,
"learning_rate": 2.8830923769410903e-06,
"loss": 2.0799,
"step": 3440
},
{
"epoch": 2.333220453775821,
"grad_norm": 1.2421875,
"learning_rate": 2.855457067051507e-06,
"loss": 2.0702,
"step": 3445
},
{
"epoch": 2.3366068405011853,
"grad_norm": 1.21875,
"learning_rate": 2.8279327658552447e-06,
"loss": 2.0697,
"step": 3450
},
{
"epoch": 2.3399932272265493,
"grad_norm": 1.203125,
"learning_rate": 2.800519901012504e-06,
"loss": 2.083,
"step": 3455
},
{
"epoch": 2.3433796139519134,
"grad_norm": 1.1953125,
"learning_rate": 2.77321889845205e-06,
"loss": 2.0727,
"step": 3460
},
{
"epoch": 2.3467660006772775,
"grad_norm": 1.171875,
"learning_rate": 2.746030182364574e-06,
"loss": 2.0728,
"step": 3465
},
{
"epoch": 2.3501523874026415,
"grad_norm": 1.203125,
"learning_rate": 2.718954175196109e-06,
"loss": 2.0772,
"step": 3470
},
{
"epoch": 2.3535387741280056,
"grad_norm": 1.25,
"learning_rate": 2.6919912976414787e-06,
"loss": 2.0827,
"step": 3475
},
{
"epoch": 2.3569251608533692,
"grad_norm": 1.2109375,
"learning_rate": 2.6651419686377335e-06,
"loss": 2.0773,
"step": 3480
},
{
"epoch": 2.3603115475787337,
"grad_norm": 1.171875,
"learning_rate": 2.6384066053576742e-06,
"loss": 2.0858,
"step": 3485
},
{
"epoch": 2.3636979343040974,
"grad_norm": 1.1796875,
"learning_rate": 2.611785623203341e-06,
"loss": 2.1062,
"step": 3490
},
{
"epoch": 2.3670843210294614,
"grad_norm": 1.21875,
"learning_rate": 2.585279435799578e-06,
"loss": 2.0904,
"step": 3495
},
{
"epoch": 2.3704707077548255,
"grad_norm": 1.21875,
"learning_rate": 2.558888454987598e-06,
"loss": 2.0853,
"step": 3500
},
{
"epoch": 2.3738570944801896,
"grad_norm": 1.2421875,
"learning_rate": 2.532613090818582e-06,
"loss": 2.0966,
"step": 3505
},
{
"epoch": 2.3772434812055536,
"grad_norm": 1.21875,
"learning_rate": 2.506453751547322e-06,
"loss": 2.1083,
"step": 3510
},
{
"epoch": 2.3806298679309177,
"grad_norm": 1.21875,
"learning_rate": 2.4804108436258545e-06,
"loss": 2.0822,
"step": 3515
},
{
"epoch": 2.3840162546562818,
"grad_norm": 1.21875,
"learning_rate": 2.454484771697171e-06,
"loss": 2.0813,
"step": 3520
},
{
"epoch": 2.387402641381646,
"grad_norm": 1.203125,
"learning_rate": 2.428675938588908e-06,
"loss": 2.0685,
"step": 3525
},
{
"epoch": 2.39078902810701,
"grad_norm": 1.1796875,
"learning_rate": 2.4029847453070987e-06,
"loss": 2.0774,
"step": 3530
},
{
"epoch": 2.394175414832374,
"grad_norm": 1.1875,
"learning_rate": 2.37741159102995e-06,
"loss": 2.0857,
"step": 3535
},
{
"epoch": 2.397561801557738,
"grad_norm": 1.203125,
"learning_rate": 2.3519568731016205e-06,
"loss": 2.0696,
"step": 3540
},
{
"epoch": 2.400948188283102,
"grad_norm": 1.2109375,
"learning_rate": 2.326620987026069e-06,
"loss": 2.0851,
"step": 3545
},
{
"epoch": 2.404334575008466,
"grad_norm": 1.1796875,
"learning_rate": 2.301404326460891e-06,
"loss": 2.1071,
"step": 3550
},
{
"epoch": 2.4077209617338298,
"grad_norm": 1.1875,
"learning_rate": 2.276307283211209e-06,
"loss": 2.0909,
"step": 3555
},
{
"epoch": 2.4111073484591943,
"grad_norm": 1.171875,
"learning_rate": 2.2513302472235922e-06,
"loss": 2.09,
"step": 3560
},
{
"epoch": 2.414493735184558,
"grad_norm": 1.2265625,
"learning_rate": 2.2264736065799863e-06,
"loss": 2.0854,
"step": 3565
},
{
"epoch": 2.417880121909922,
"grad_norm": 1.203125,
"learning_rate": 2.201737747491688e-06,
"loss": 2.0796,
"step": 3570
},
{
"epoch": 2.421266508635286,
"grad_norm": 1.2109375,
"learning_rate": 2.177123054293345e-06,
"loss": 2.0866,
"step": 3575
},
{
"epoch": 2.42465289536065,
"grad_norm": 1.2265625,
"learning_rate": 2.152629909436984e-06,
"loss": 2.0796,
"step": 3580
},
{
"epoch": 2.428039282086014,
"grad_norm": 1.21875,
"learning_rate": 2.128258693486073e-06,
"loss": 2.092,
"step": 3585
},
{
"epoch": 2.4314256688113782,
"grad_norm": 1.21875,
"learning_rate": 2.1040097851095933e-06,
"loss": 2.1183,
"step": 3590
},
{
"epoch": 2.4348120555367423,
"grad_norm": 1.234375,
"learning_rate": 2.079883561076178e-06,
"loss": 2.071,
"step": 3595
},
{
"epoch": 2.4381984422621064,
"grad_norm": 1.2265625,
"learning_rate": 2.0558803962482375e-06,
"loss": 2.1,
"step": 3600
},
{
"epoch": 2.4415848289874704,
"grad_norm": 1.2421875,
"learning_rate": 2.0320006635761435e-06,
"loss": 2.0839,
"step": 3605
},
{
"epoch": 2.4449712157128345,
"grad_norm": 1.1875,
"learning_rate": 2.008244734092444e-06,
"loss": 2.0761,
"step": 3610
},
{
"epoch": 2.4483576024381986,
"grad_norm": 1.203125,
"learning_rate": 1.984612976906075e-06,
"loss": 2.0696,
"step": 3615
},
{
"epoch": 2.4517439891635626,
"grad_norm": 1.203125,
"learning_rate": 1.9611057591966522e-06,
"loss": 2.0966,
"step": 3620
},
{
"epoch": 2.4551303758889267,
"grad_norm": 1.2109375,
"learning_rate": 1.9377234462087447e-06,
"loss": 2.0985,
"step": 3625
},
{
"epoch": 2.4585167626142903,
"grad_norm": 1.203125,
"learning_rate": 1.91446640124621e-06,
"loss": 2.0828,
"step": 3630
},
{
"epoch": 2.461903149339655,
"grad_norm": 1.2421875,
"learning_rate": 1.8913349856665486e-06,
"loss": 2.0791,
"step": 3635
},
{
"epoch": 2.4652895360650184,
"grad_norm": 1.234375,
"learning_rate": 1.8683295588752847e-06,
"loss": 2.0866,
"step": 3640
},
{
"epoch": 2.4686759227903825,
"grad_norm": 1.21875,
"learning_rate": 1.845450478320393e-06,
"loss": 2.1008,
"step": 3645
},
{
"epoch": 2.4720623095157466,
"grad_norm": 1.1875,
"learning_rate": 1.822698099486727e-06,
"loss": 2.0725,
"step": 3650
},
{
"epoch": 2.4754486962411106,
"grad_norm": 1.21875,
"learning_rate": 1.8000727758905168e-06,
"loss": 2.0797,
"step": 3655
},
{
"epoch": 2.4788350829664747,
"grad_norm": 1.1796875,
"learning_rate": 1.7775748590738584e-06,
"loss": 2.0912,
"step": 3660
},
{
"epoch": 2.4822214696918388,
"grad_norm": 1.203125,
"learning_rate": 1.7552046985992555e-06,
"loss": 2.0789,
"step": 3665
},
{
"epoch": 2.485607856417203,
"grad_norm": 1.234375,
"learning_rate": 1.7329626420442026e-06,
"loss": 2.0996,
"step": 3670
},
{
"epoch": 2.488994243142567,
"grad_norm": 1.2421875,
"learning_rate": 1.7108490349957607e-06,
"loss": 2.0803,
"step": 3675
},
{
"epoch": 2.492380629867931,
"grad_norm": 1.2109375,
"learning_rate": 1.6888642210452112e-06,
"loss": 2.0879,
"step": 3680
},
{
"epoch": 2.495767016593295,
"grad_norm": 1.203125,
"learning_rate": 1.6670085417826975e-06,
"loss": 2.0896,
"step": 3685
},
{
"epoch": 2.499153403318659,
"grad_norm": 1.2265625,
"learning_rate": 1.6452823367919314e-06,
"loss": 2.076,
"step": 3690
},
{
"epoch": 2.502539790044023,
"grad_norm": 1.203125,
"learning_rate": 1.6236859436449082e-06,
"loss": 2.1018,
"step": 3695
},
{
"epoch": 2.5059261767693872,
"grad_norm": 1.2421875,
"learning_rate": 1.6022196978966653e-06,
"loss": 2.0709,
"step": 3700
},
{
"epoch": 2.509312563494751,
"grad_norm": 1.1953125,
"learning_rate": 1.5808839330800751e-06,
"loss": 2.0698,
"step": 3705
},
{
"epoch": 2.5126989502201154,
"grad_norm": 1.1875,
"learning_rate": 1.5596789807006496e-06,
"loss": 2.0919,
"step": 3710
},
{
"epoch": 2.516085336945479,
"grad_norm": 1.2421875,
"learning_rate": 1.5386051702313932e-06,
"loss": 2.1077,
"step": 3715
},
{
"epoch": 2.519471723670843,
"grad_norm": 1.2109375,
"learning_rate": 1.5176628291076967e-06,
"loss": 2.0902,
"step": 3720
},
{
"epoch": 2.522858110396207,
"grad_norm": 1.2265625,
"learning_rate": 1.4968522827222277e-06,
"loss": 2.0845,
"step": 3725
},
{
"epoch": 2.526244497121571,
"grad_norm": 1.2265625,
"learning_rate": 1.4761738544198966e-06,
"loss": 2.0843,
"step": 3730
},
{
"epoch": 2.5296308838469352,
"grad_norm": 1.171875,
"learning_rate": 1.4556278654928146e-06,
"loss": 2.0749,
"step": 3735
},
{
"epoch": 2.5330172705722993,
"grad_norm": 1.2265625,
"learning_rate": 1.435214635175316e-06,
"loss": 2.0794,
"step": 3740
},
{
"epoch": 2.5364036572976634,
"grad_norm": 1.2578125,
"learning_rate": 1.4149344806389863e-06,
"loss": 2.0812,
"step": 3745
},
{
"epoch": 2.5397900440230274,
"grad_norm": 1.1953125,
"learning_rate": 1.394787716987741e-06,
"loss": 2.0753,
"step": 3750
},
{
"epoch": 2.5431764307483915,
"grad_norm": 1.2265625,
"learning_rate": 1.3747746572529275e-06,
"loss": 2.0816,
"step": 3755
},
{
"epoch": 2.5465628174737556,
"grad_norm": 1.203125,
"learning_rate": 1.3548956123884649e-06,
"loss": 2.0848,
"step": 3760
},
{
"epoch": 2.5499492041991196,
"grad_norm": 1.1875,
"learning_rate": 1.3351508912660084e-06,
"loss": 2.0692,
"step": 3765
},
{
"epoch": 2.5533355909244837,
"grad_norm": 1.21875,
"learning_rate": 1.3155408006701476e-06,
"loss": 2.0906,
"step": 3770
},
{
"epoch": 2.5567219776498478,
"grad_norm": 1.1953125,
"learning_rate": 1.296065645293646e-06,
"loss": 2.0738,
"step": 3775
},
{
"epoch": 2.5601083643752114,
"grad_norm": 1.25,
"learning_rate": 1.2767257277327083e-06,
"loss": 2.0964,
"step": 3780
},
{
"epoch": 2.563494751100576,
"grad_norm": 1.21875,
"learning_rate": 1.2575213484822669e-06,
"loss": 2.0804,
"step": 3785
},
{
"epoch": 2.5668811378259395,
"grad_norm": 1.1796875,
"learning_rate": 1.2384528059313306e-06,
"loss": 2.0697,
"step": 3790
},
{
"epoch": 2.570267524551304,
"grad_norm": 1.2109375,
"learning_rate": 1.21952039635833e-06,
"loss": 2.0894,
"step": 3795
},
{
"epoch": 2.5736539112766676,
"grad_norm": 1.21875,
"learning_rate": 1.200724413926525e-06,
"loss": 2.0812,
"step": 3800
},
{
"epoch": 2.5770402980020317,
"grad_norm": 1.234375,
"learning_rate": 1.1820651506794368e-06,
"loss": 2.1039,
"step": 3805
},
{
"epoch": 2.580426684727396,
"grad_norm": 1.2109375,
"learning_rate": 1.1635428965362982e-06,
"loss": 2.0739,
"step": 3810
},
{
"epoch": 2.58381307145276,
"grad_norm": 1.1875,
"learning_rate": 1.1451579392875633e-06,
"loss": 2.0805,
"step": 3815
},
{
"epoch": 2.587199458178124,
"grad_norm": 1.203125,
"learning_rate": 1.1269105645904232e-06,
"loss": 2.0979,
"step": 3820
},
{
"epoch": 2.590585844903488,
"grad_norm": 1.2265625,
"learning_rate": 1.1088010559643758e-06,
"loss": 2.086,
"step": 3825
},
{
"epoch": 2.593972231628852,
"grad_norm": 1.21875,
"learning_rate": 1.090829694786818e-06,
"loss": 2.0914,
"step": 3830
},
{
"epoch": 2.597358618354216,
"grad_norm": 1.2421875,
"learning_rate": 1.0729967602886703e-06,
"loss": 2.0993,
"step": 3835
},
{
"epoch": 2.60074500507958,
"grad_norm": 1.2109375,
"learning_rate": 1.0553025295500484e-06,
"loss": 2.0861,
"step": 3840
},
{
"epoch": 2.6041313918049442,
"grad_norm": 1.234375,
"learning_rate": 1.037747277495945e-06,
"loss": 2.0837,
"step": 3845
},
{
"epoch": 2.6075177785303083,
"grad_norm": 1.2109375,
"learning_rate": 1.0203312768919716e-06,
"loss": 2.0849,
"step": 3850
},
{
"epoch": 2.610904165255672,
"grad_norm": 1.21875,
"learning_rate": 1.003054798340105e-06,
"loss": 2.0779,
"step": 3855
},
{
"epoch": 2.6142905519810364,
"grad_norm": 1.234375,
"learning_rate": 9.859181102744963e-07,
"loss": 2.0924,
"step": 3860
},
{
"epoch": 2.6176769387064,
"grad_norm": 1.1953125,
"learning_rate": 9.689214789572943e-07,
"loss": 2.0715,
"step": 3865
},
{
"epoch": 2.6210633254317646,
"grad_norm": 1.1953125,
"learning_rate": 9.520651684745064e-07,
"loss": 2.0824,
"step": 3870
},
{
"epoch": 2.624449712157128,
"grad_norm": 1.2109375,
"learning_rate": 9.353494407319052e-07,
"loss": 2.0809,
"step": 3875
},
{
"epoch": 2.6278360988824923,
"grad_norm": 1.203125,
"learning_rate": 9.187745554509431e-07,
"loss": 2.0817,
"step": 3880
},
{
"epoch": 2.6312224856078563,
"grad_norm": 1.2109375,
"learning_rate": 9.023407701647291e-07,
"loss": 2.0802,
"step": 3885
},
{
"epoch": 2.6346088723332204,
"grad_norm": 1.21875,
"learning_rate": 8.860483402140263e-07,
"loss": 2.0896,
"step": 3890
},
{
"epoch": 2.6379952590585845,
"grad_norm": 1.2109375,
"learning_rate": 8.698975187432779e-07,
"loss": 2.0717,
"step": 3895
},
{
"epoch": 2.6413816457839485,
"grad_norm": 1.1484375,
"learning_rate": 8.538885566966837e-07,
"loss": 2.0819,
"step": 3900
},
{
"epoch": 2.6447680325093126,
"grad_norm": 1.1953125,
"learning_rate": 8.380217028142912e-07,
"loss": 2.0907,
"step": 3905
},
{
"epoch": 2.6481544192346766,
"grad_norm": 1.234375,
"learning_rate": 8.222972036281351e-07,
"loss": 2.0789,
"step": 3910
},
{
"epoch": 2.6515408059600407,
"grad_norm": 1.2109375,
"learning_rate": 8.067153034584108e-07,
"loss": 2.0913,
"step": 3915
},
{
"epoch": 2.654927192685405,
"grad_norm": 1.171875,
"learning_rate": 7.912762444096689e-07,
"loss": 2.0671,
"step": 3920
},
{
"epoch": 2.658313579410769,
"grad_norm": 1.1796875,
"learning_rate": 7.759802663670657e-07,
"loss": 2.0727,
"step": 3925
},
{
"epoch": 2.661699966136133,
"grad_norm": 1.21875,
"learning_rate": 7.608276069926224e-07,
"loss": 2.0865,
"step": 3930
},
{
"epoch": 2.665086352861497,
"grad_norm": 1.25,
"learning_rate": 7.458185017215425e-07,
"loss": 2.0759,
"step": 3935
},
{
"epoch": 2.6684727395868606,
"grad_norm": 1.203125,
"learning_rate": 7.309531837585526e-07,
"loss": 2.103,
"step": 3940
},
{
"epoch": 2.671859126312225,
"grad_norm": 1.1953125,
"learning_rate": 7.162318840742732e-07,
"loss": 2.1004,
"step": 3945
},
{
"epoch": 2.6752455130375887,
"grad_norm": 1.171875,
"learning_rate": 7.016548314016336e-07,
"loss": 2.0904,
"step": 3950
},
{
"epoch": 2.678631899762953,
"grad_norm": 1.25,
"learning_rate": 6.87222252232318e-07,
"loss": 2.1003,
"step": 3955
},
{
"epoch": 2.682018286488317,
"grad_norm": 1.2109375,
"learning_rate": 6.729343708132507e-07,
"loss": 2.0847,
"step": 3960
},
{
"epoch": 2.685404673213681,
"grad_norm": 1.203125,
"learning_rate": 6.587914091431036e-07,
"loss": 2.0695,
"step": 3965
},
{
"epoch": 2.688791059939045,
"grad_norm": 1.21875,
"learning_rate": 6.447935869688482e-07,
"loss": 2.0567,
"step": 3970
},
{
"epoch": 2.692177446664409,
"grad_norm": 1.21875,
"learning_rate": 6.309411217823503e-07,
"loss": 2.0951,
"step": 3975
},
{
"epoch": 2.695563833389773,
"grad_norm": 1.1953125,
"learning_rate": 6.172342288169786e-07,
"loss": 2.0876,
"step": 3980
},
{
"epoch": 2.698950220115137,
"grad_norm": 1.1875,
"learning_rate": 6.036731210442715e-07,
"loss": 2.0966,
"step": 3985
},
{
"epoch": 2.7023366068405013,
"grad_norm": 1.1953125,
"learning_rate": 5.902580091706167e-07,
"loss": 2.1063,
"step": 3990
},
{
"epoch": 2.7057229935658653,
"grad_norm": 1.21875,
"learning_rate": 5.769891016339868e-07,
"loss": 2.0848,
"step": 3995
},
{
"epoch": 2.7091093802912294,
"grad_norm": 1.25,
"learning_rate": 5.63866604600698e-07,
"loss": 2.0966,
"step": 4000
},
{
"epoch": 2.7124957670165935,
"grad_norm": 1.2578125,
"learning_rate": 5.508907219622029e-07,
"loss": 2.0774,
"step": 4005
},
{
"epoch": 2.7158821537419575,
"grad_norm": 1.1875,
"learning_rate": 5.380616553319273e-07,
"loss": 2.0928,
"step": 4010
},
{
"epoch": 2.719268540467321,
"grad_norm": 1.203125,
"learning_rate": 5.253796040421366e-07,
"loss": 2.0798,
"step": 4015
},
{
"epoch": 2.7226549271926856,
"grad_norm": 1.234375,
"learning_rate": 5.128447651408386e-07,
"loss": 2.0824,
"step": 4020
},
{
"epoch": 2.7260413139180493,
"grad_norm": 1.1875,
"learning_rate": 5.004573333887175e-07,
"loss": 2.0768,
"step": 4025
},
{
"epoch": 2.7294277006434133,
"grad_norm": 1.171875,
"learning_rate": 4.882175012561141e-07,
"loss": 2.0848,
"step": 4030
},
{
"epoch": 2.7328140873687774,
"grad_norm": 1.1953125,
"learning_rate": 4.761254589200359e-07,
"loss": 2.0625,
"step": 4035
},
{
"epoch": 2.7362004740941415,
"grad_norm": 1.21875,
"learning_rate": 4.641813942611917e-07,
"loss": 2.1149,
"step": 4040
},
{
"epoch": 2.7395868608195055,
"grad_norm": 1.1796875,
"learning_rate": 4.523854928610849e-07,
"loss": 2.0874,
"step": 4045
},
{
"epoch": 2.7429732475448696,
"grad_norm": 1.2578125,
"learning_rate": 4.407379379991261e-07,
"loss": 2.0857,
"step": 4050
},
{
"epoch": 2.7463596342702337,
"grad_norm": 1.2265625,
"learning_rate": 4.292389106497785e-07,
"loss": 2.0922,
"step": 4055
},
{
"epoch": 2.7497460209955977,
"grad_norm": 1.1796875,
"learning_rate": 4.178885894797569e-07,
"loss": 2.0767,
"step": 4060
},
{
"epoch": 2.753132407720962,
"grad_norm": 1.21875,
"learning_rate": 4.066871508452441e-07,
"loss": 2.0927,
"step": 4065
},
{
"epoch": 2.756518794446326,
"grad_norm": 1.1796875,
"learning_rate": 3.9563476878915374e-07,
"loss": 2.0975,
"step": 4070
},
{
"epoch": 2.75990518117169,
"grad_norm": 1.203125,
"learning_rate": 3.847316150384239e-07,
"loss": 2.0766,
"step": 4075
},
{
"epoch": 2.763291567897054,
"grad_norm": 1.1953125,
"learning_rate": 3.7397785900135317e-07,
"loss": 2.0697,
"step": 4080
},
{
"epoch": 2.766677954622418,
"grad_norm": 1.203125,
"learning_rate": 3.633736677649624e-07,
"loss": 2.0817,
"step": 4085
},
{
"epoch": 2.7700643413477817,
"grad_norm": 1.2578125,
"learning_rate": 3.529192060924036e-07,
"loss": 2.0901,
"step": 4090
},
{
"epoch": 2.773450728073146,
"grad_norm": 1.203125,
"learning_rate": 3.426146364203997e-07,
"loss": 2.0796,
"step": 4095
},
{
"epoch": 2.77683711479851,
"grad_norm": 1.2109375,
"learning_rate": 3.324601188567167e-07,
"loss": 2.0835,
"step": 4100
},
{
"epoch": 2.780223501523874,
"grad_norm": 1.234375,
"learning_rate": 3.2245581117767875e-07,
"loss": 2.1009,
"step": 4105
},
{
"epoch": 2.783609888249238,
"grad_norm": 1.203125,
"learning_rate": 3.126018688257182e-07,
"loss": 2.0807,
"step": 4110
},
{
"epoch": 2.786996274974602,
"grad_norm": 1.1875,
"learning_rate": 3.028984449069561e-07,
"loss": 2.0664,
"step": 4115
},
{
"epoch": 2.790382661699966,
"grad_norm": 1.1953125,
"learning_rate": 2.9334569018882874e-07,
"loss": 2.0727,
"step": 4120
},
{
"epoch": 2.79376904842533,
"grad_norm": 1.2265625,
"learning_rate": 2.839437530977385e-07,
"loss": 2.0744,
"step": 4125
},
{
"epoch": 2.797155435150694,
"grad_norm": 1.2421875,
"learning_rate": 2.7469277971675536e-07,
"loss": 2.0825,
"step": 4130
},
{
"epoch": 2.8005418218760583,
"grad_norm": 1.2265625,
"learning_rate": 2.6559291378334016e-07,
"loss": 2.0826,
"step": 4135
},
{
"epoch": 2.8039282086014223,
"grad_norm": 1.2265625,
"learning_rate": 2.566442966871152e-07,
"loss": 2.0688,
"step": 4140
},
{
"epoch": 2.8073145953267864,
"grad_norm": 1.1953125,
"learning_rate": 2.4784706746766694e-07,
"loss": 2.0766,
"step": 4145
},
{
"epoch": 2.8107009820521505,
"grad_norm": 1.21875,
"learning_rate": 2.3920136281238347e-07,
"loss": 2.0842,
"step": 4150
},
{
"epoch": 2.8140873687775145,
"grad_norm": 1.1875,
"learning_rate": 2.3070731705433392e-07,
"loss": 2.0715,
"step": 4155
},
{
"epoch": 2.8174737555028786,
"grad_norm": 1.2109375,
"learning_rate": 2.2236506217018007e-07,
"loss": 2.0785,
"step": 4160
},
{
"epoch": 2.820860142228242,
"grad_norm": 1.21875,
"learning_rate": 2.1417472777812365e-07,
"loss": 2.0964,
"step": 4165
},
{
"epoch": 2.8242465289536067,
"grad_norm": 1.2421875,
"learning_rate": 2.0613644113589571e-07,
"loss": 2.096,
"step": 4170
},
{
"epoch": 2.8276329156789703,
"grad_norm": 1.21875,
"learning_rate": 1.982503271387759e-07,
"loss": 2.0975,
"step": 4175
},
{
"epoch": 2.8310193024043344,
"grad_norm": 1.203125,
"learning_rate": 1.9051650831765745e-07,
"loss": 2.0841,
"step": 4180
},
{
"epoch": 2.8344056891296985,
"grad_norm": 1.2265625,
"learning_rate": 1.829351048371364e-07,
"loss": 2.0861,
"step": 4185
},
{
"epoch": 2.8377920758550625,
"grad_norm": 1.234375,
"learning_rate": 1.7550623449364645e-07,
"loss": 2.098,
"step": 4190
},
{
"epoch": 2.8411784625804266,
"grad_norm": 1.21875,
"learning_rate": 1.6823001271363493e-07,
"loss": 2.0681,
"step": 4195
},
{
"epoch": 2.8445648493057907,
"grad_norm": 1.2109375,
"learning_rate": 1.611065525517619e-07,
"loss": 2.1106,
"step": 4200
},
{
"epoch": 2.8479512360311547,
"grad_norm": 1.2265625,
"learning_rate": 1.5413596468914384e-07,
"loss": 2.0908,
"step": 4205
},
{
"epoch": 2.851337622756519,
"grad_norm": 1.1875,
"learning_rate": 1.4731835743164057e-07,
"loss": 2.0829,
"step": 4210
},
{
"epoch": 2.854724009481883,
"grad_norm": 1.2265625,
"learning_rate": 1.4065383670816446e-07,
"loss": 2.084,
"step": 4215
},
{
"epoch": 2.858110396207247,
"grad_norm": 1.2578125,
"learning_rate": 1.3414250606904046e-07,
"loss": 2.0741,
"step": 4220
},
{
"epoch": 2.861496782932611,
"grad_norm": 1.2109375,
"learning_rate": 1.2778446668439415e-07,
"loss": 2.0881,
"step": 4225
},
{
"epoch": 2.864883169657975,
"grad_norm": 1.2578125,
"learning_rate": 1.2157981734258084e-07,
"loss": 2.0815,
"step": 4230
},
{
"epoch": 2.868269556383339,
"grad_norm": 1.203125,
"learning_rate": 1.1552865444865002e-07,
"loss": 2.0903,
"step": 4235
},
{
"epoch": 2.8716559431087028,
"grad_norm": 1.2109375,
"learning_rate": 1.0963107202284773e-07,
"loss": 2.1015,
"step": 4240
},
{
"epoch": 2.8750423298340673,
"grad_norm": 1.1953125,
"learning_rate": 1.0388716169915547e-07,
"loss": 2.0921,
"step": 4245
},
{
"epoch": 2.878428716559431,
"grad_norm": 1.2109375,
"learning_rate": 9.829701272386804e-08,
"loss": 2.0922,
"step": 4250
},
{
"epoch": 2.8818151032847954,
"grad_norm": 1.2421875,
"learning_rate": 9.286071195420576e-08,
"loss": 2.0977,
"step": 4255
},
{
"epoch": 2.885201490010159,
"grad_norm": 1.1953125,
"learning_rate": 8.757834385696329e-08,
"loss": 2.0848,
"step": 4260
},
{
"epoch": 2.888587876735523,
"grad_norm": 1.2265625,
"learning_rate": 8.244999050719738e-08,
"loss": 2.0834,
"step": 4265
},
{
"epoch": 2.891974263460887,
"grad_norm": 1.1640625,
"learning_rate": 7.747573158695676e-08,
"loss": 2.0504,
"step": 4270
},
{
"epoch": 2.895360650186251,
"grad_norm": 1.1875,
"learning_rate": 7.265564438403872e-08,
"loss": 2.0784,
"step": 4275
},
{
"epoch": 2.8987470369116153,
"grad_norm": 1.1875,
"learning_rate": 6.798980379078779e-08,
"loss": 2.0662,
"step": 4280
},
{
"epoch": 2.9021334236369793,
"grad_norm": 1.25,
"learning_rate": 6.347828230293563e-08,
"loss": 2.088,
"step": 4285
},
{
"epoch": 2.9055198103623434,
"grad_norm": 1.2265625,
"learning_rate": 5.912115001847407e-08,
"loss": 2.1124,
"step": 4290
},
{
"epoch": 2.9089061970877075,
"grad_norm": 1.203125,
"learning_rate": 5.491847463656497e-08,
"loss": 2.0721,
"step": 4295
},
{
"epoch": 2.9122925838130715,
"grad_norm": 1.203125,
"learning_rate": 5.08703214564843e-08,
"loss": 2.0933,
"step": 4300
},
{
"epoch": 2.9156789705384356,
"grad_norm": 1.2734375,
"learning_rate": 4.697675337661856e-08,
"loss": 2.0805,
"step": 4305
},
{
"epoch": 2.9190653572637997,
"grad_norm": 1.1953125,
"learning_rate": 4.323783089347222e-08,
"loss": 2.0628,
"step": 4310
},
{
"epoch": 2.9224517439891633,
"grad_norm": 1.15625,
"learning_rate": 3.965361210074514e-08,
"loss": 2.0946,
"step": 4315
},
{
"epoch": 2.925838130714528,
"grad_norm": 1.171875,
"learning_rate": 3.6224152688412174e-08,
"loss": 2.0809,
"step": 4320
},
{
"epoch": 2.9292245174398914,
"grad_norm": 1.1796875,
"learning_rate": 3.2949505941872736e-08,
"loss": 2.083,
"step": 4325
},
{
"epoch": 2.932610904165256,
"grad_norm": 1.2109375,
"learning_rate": 2.9829722741113734e-08,
"loss": 2.0766,
"step": 4330
},
{
"epoch": 2.9359972908906196,
"grad_norm": 1.1875,
"learning_rate": 2.6864851559923465e-08,
"loss": 2.0876,
"step": 4335
},
{
"epoch": 2.9393836776159836,
"grad_norm": 1.203125,
"learning_rate": 2.4054938465135626e-08,
"loss": 2.0948,
"step": 4340
},
{
"epoch": 2.9427700643413477,
"grad_norm": 1.21875,
"learning_rate": 2.140002711591538e-08,
"loss": 2.0879,
"step": 4345
},
{
"epoch": 2.9461564510667118,
"grad_norm": 1.2421875,
"learning_rate": 1.8900158763081045e-08,
"loss": 2.0865,
"step": 4350
},
{
"epoch": 2.949542837792076,
"grad_norm": 1.21875,
"learning_rate": 1.6555372248462376e-08,
"loss": 2.0778,
"step": 4355
},
{
"epoch": 2.95292922451744,
"grad_norm": 1.21875,
"learning_rate": 1.4365704004295489e-08,
"loss": 2.0722,
"step": 4360
},
{
"epoch": 2.956315611242804,
"grad_norm": 1.1953125,
"learning_rate": 1.2331188052663312e-08,
"loss": 2.089,
"step": 4365
},
{
"epoch": 2.959701997968168,
"grad_norm": 1.21875,
"learning_rate": 1.0451856004957128e-08,
"loss": 2.0723,
"step": 4370
},
{
"epoch": 2.963088384693532,
"grad_norm": 1.203125,
"learning_rate": 8.727737061393626e-09,
"loss": 2.0725,
"step": 4375
},
{
"epoch": 2.966474771418896,
"grad_norm": 1.1953125,
"learning_rate": 7.15885801055638e-09,
"loss": 2.0768,
"step": 4380
},
{
"epoch": 2.96986115814426,
"grad_norm": 1.203125,
"learning_rate": 5.745243228982844e-09,
"loss": 2.0641,
"step": 4385
},
{
"epoch": 2.973247544869624,
"grad_norm": 1.2109375,
"learning_rate": 4.4869146807824396e-09,
"loss": 2.0904,
"step": 4390
},
{
"epoch": 2.9766339315949883,
"grad_norm": 1.203125,
"learning_rate": 3.3838919172990426e-09,
"loss": 2.1051,
"step": 4395
},
{
"epoch": 2.980020318320352,
"grad_norm": 1.1875,
"learning_rate": 2.4361920768045668e-09,
"loss": 2.0645,
"step": 4400
},
{
"epoch": 2.9834067050457165,
"grad_norm": 1.2265625,
"learning_rate": 1.6438298842302858e-09,
"loss": 2.0866,
"step": 4405
},
{
"epoch": 2.98679309177108,
"grad_norm": 1.2265625,
"learning_rate": 1.0068176509447913e-09,
"loss": 2.0859,
"step": 4410
},
{
"epoch": 2.990179478496444,
"grad_norm": 1.1875,
"learning_rate": 5.251652745585922e-10,
"loss": 2.0636,
"step": 4415
},
{
"epoch": 2.9935658652218082,
"grad_norm": 1.1953125,
"learning_rate": 1.9888023876757368e-10,
"loss": 2.0931,
"step": 4420
},
{
"epoch": 2.9969522519471723,
"grad_norm": 1.1953125,
"learning_rate": 2.796761324419528e-11,
"loss": 2.0766,
"step": 4425
},
{
"epoch": 2.998984083982391,
"eval_loss": 2.1175546646118164,
"eval_runtime": 85.4567,
"eval_samples_per_second": 15.271,
"eval_steps_per_second": 1.919,
"step": 4428
},
{
"epoch": 2.998984083982391,
"step": 4428,
"total_flos": 9.004989021620797e+17,
"train_loss": 2.11554208852842,
"train_runtime": 19880.5774,
"train_samples_per_second": 3.564,
"train_steps_per_second": 0.223
}
],
"logging_steps": 5,
"max_steps": 4428,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.004989021620797e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}