{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996613613274636, "eval_steps": 500, "global_step": 1476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006772773450728074, "grad_norm": 11.0625, "learning_rate": 2.702702702702703e-07, "loss": 2.4477, "step": 1 }, { "epoch": 0.0033863867253640365, "grad_norm": 11.8125, "learning_rate": 1.3513513513513515e-06, "loss": 2.4737, "step": 5 }, { "epoch": 0.006772773450728073, "grad_norm": 11.75, "learning_rate": 2.702702702702703e-06, "loss": 2.457, "step": 10 }, { "epoch": 0.01015916017609211, "grad_norm": 5.46875, "learning_rate": 4.0540540540540545e-06, "loss": 2.4211, "step": 15 }, { "epoch": 0.013545546901456146, "grad_norm": 2.875, "learning_rate": 5.405405405405406e-06, "loss": 2.3635, "step": 20 }, { "epoch": 0.016931933626820182, "grad_norm": 2.109375, "learning_rate": 6.7567567567567575e-06, "loss": 2.3577, "step": 25 }, { "epoch": 0.02031832035218422, "grad_norm": 1.7265625, "learning_rate": 8.108108108108109e-06, "loss": 2.2991, "step": 30 }, { "epoch": 0.023704707077548254, "grad_norm": 1.671875, "learning_rate": 9.45945945945946e-06, "loss": 2.2859, "step": 35 }, { "epoch": 0.027091093802912292, "grad_norm": 1.7421875, "learning_rate": 1.0810810810810812e-05, "loss": 2.2701, "step": 40 }, { "epoch": 0.03047748052827633, "grad_norm": 1.375, "learning_rate": 1.2162162162162164e-05, "loss": 2.2274, "step": 45 }, { "epoch": 0.033863867253640365, "grad_norm": 1.3203125, "learning_rate": 1.3513513513513515e-05, "loss": 2.2163, "step": 50 }, { "epoch": 0.0372502539790044, "grad_norm": 1.3046875, "learning_rate": 1.4864864864864865e-05, "loss": 2.2257, "step": 55 }, { "epoch": 0.04063664070436844, "grad_norm": 1.3046875, "learning_rate": 1.6216216216216218e-05, "loss": 2.2233, "step": 60 }, { "epoch": 0.04402302742973248, "grad_norm": 1.28125, "learning_rate": 1.756756756756757e-05, "loss": 2.2121, "step": 65 }, { "epoch": 0.04740941415509651, "grad_norm": 1.2421875, "learning_rate": 1.891891891891892e-05, "loss": 2.1914, "step": 70 }, { "epoch": 0.05079580088046055, "grad_norm": 1.2734375, "learning_rate": 2.0270270270270273e-05, "loss": 2.2119, "step": 75 }, { "epoch": 0.054182187605824585, "grad_norm": 1.21875, "learning_rate": 2.1621621621621624e-05, "loss": 2.1958, "step": 80 }, { "epoch": 0.05756857433118862, "grad_norm": 1.28125, "learning_rate": 2.2972972972972976e-05, "loss": 2.1629, "step": 85 }, { "epoch": 0.06095496105655266, "grad_norm": 1.21875, "learning_rate": 2.4324324324324327e-05, "loss": 2.1922, "step": 90 }, { "epoch": 0.06434134778191669, "grad_norm": 1.21875, "learning_rate": 2.567567567567568e-05, "loss": 2.1729, "step": 95 }, { "epoch": 0.06772773450728073, "grad_norm": 1.3203125, "learning_rate": 2.702702702702703e-05, "loss": 2.1695, "step": 100 }, { "epoch": 0.07111412123264477, "grad_norm": 1.21875, "learning_rate": 2.8378378378378378e-05, "loss": 2.1656, "step": 105 }, { "epoch": 0.0745005079580088, "grad_norm": 1.265625, "learning_rate": 2.972972972972973e-05, "loss": 2.1775, "step": 110 }, { "epoch": 0.07788689468337284, "grad_norm": 1.265625, "learning_rate": 3.108108108108108e-05, "loss": 2.1819, "step": 115 }, { "epoch": 0.08127328140873688, "grad_norm": 1.2421875, "learning_rate": 3.2432432432432436e-05, "loss": 2.1626, "step": 120 }, { "epoch": 0.08465966813410092, "grad_norm": 1.1953125, "learning_rate": 3.378378378378379e-05, "loss": 2.1499, "step": 125 }, { "epoch": 0.08804605485946496, "grad_norm": 1.2734375, "learning_rate": 3.513513513513514e-05, "loss": 2.191, "step": 130 }, { "epoch": 0.091432441584829, "grad_norm": 1.234375, "learning_rate": 3.648648648648649e-05, "loss": 2.1903, "step": 135 }, { "epoch": 0.09481882831019302, "grad_norm": 1.265625, "learning_rate": 3.783783783783784e-05, "loss": 2.1469, "step": 140 }, { "epoch": 0.09820521503555706, "grad_norm": 1.2265625, "learning_rate": 3.918918918918919e-05, "loss": 2.1473, "step": 145 }, { "epoch": 0.1015916017609211, "grad_norm": 1.2109375, "learning_rate": 3.9999776147073465e-05, "loss": 2.148, "step": 150 }, { "epoch": 0.10497798848628513, "grad_norm": 1.234375, "learning_rate": 3.999725785919791e-05, "loss": 2.1608, "step": 155 }, { "epoch": 0.10836437521164917, "grad_norm": 1.2109375, "learning_rate": 3.9991941820788456e-05, "loss": 2.1793, "step": 160 }, { "epoch": 0.11175076193701321, "grad_norm": 1.2265625, "learning_rate": 3.998382877559453e-05, "loss": 2.1449, "step": 165 }, { "epoch": 0.11513714866237724, "grad_norm": 1.1953125, "learning_rate": 3.99729198586856e-05, "loss": 2.146, "step": 170 }, { "epoch": 0.11852353538774128, "grad_norm": 1.2109375, "learning_rate": 3.995921659629232e-05, "loss": 2.1624, "step": 175 }, { "epoch": 0.12190992211310532, "grad_norm": 1.2265625, "learning_rate": 3.9942720905593045e-05, "loss": 2.1611, "step": 180 }, { "epoch": 0.12529630883846934, "grad_norm": 1.171875, "learning_rate": 3.992343509444555e-05, "loss": 2.1585, "step": 185 }, { "epoch": 0.12868269556383338, "grad_norm": 1.2578125, "learning_rate": 3.9901361861064215e-05, "loss": 2.1458, "step": 190 }, { "epoch": 0.13206908228919742, "grad_norm": 1.2421875, "learning_rate": 3.987650429364247e-05, "loss": 2.1729, "step": 195 }, { "epoch": 0.13545546901456146, "grad_norm": 1.2109375, "learning_rate": 3.984886586992077e-05, "loss": 2.1515, "step": 200 }, { "epoch": 0.1388418557399255, "grad_norm": 1.140625, "learning_rate": 3.9818450456700014e-05, "loss": 2.1697, "step": 205 }, { "epoch": 0.14222824246528953, "grad_norm": 1.1796875, "learning_rate": 3.978526230930056e-05, "loss": 2.155, "step": 210 }, { "epoch": 0.14561462919065357, "grad_norm": 1.1796875, "learning_rate": 3.97493060709669e-05, "loss": 2.1506, "step": 215 }, { "epoch": 0.1490010159160176, "grad_norm": 1.15625, "learning_rate": 3.971058677221799e-05, "loss": 2.1427, "step": 220 }, { "epoch": 0.15238740264138165, "grad_norm": 1.2578125, "learning_rate": 3.966910983014349e-05, "loss": 2.1601, "step": 225 }, { "epoch": 0.15577378936674569, "grad_norm": 1.1953125, "learning_rate": 3.962488104764586e-05, "loss": 2.1503, "step": 230 }, { "epoch": 0.15916017609210972, "grad_norm": 1.171875, "learning_rate": 3.95779066126285e-05, "loss": 2.1404, "step": 235 }, { "epoch": 0.16254656281747376, "grad_norm": 1.1875, "learning_rate": 3.952819309713002e-05, "loss": 2.1468, "step": 240 }, { "epoch": 0.1659329495428378, "grad_norm": 1.1875, "learning_rate": 3.947574745640475e-05, "loss": 2.1482, "step": 245 }, { "epoch": 0.16931933626820184, "grad_norm": 1.171875, "learning_rate": 3.942057702794969e-05, "loss": 2.1544, "step": 250 }, { "epoch": 0.17270572299356587, "grad_norm": 1.1796875, "learning_rate": 3.9362689530477915e-05, "loss": 2.1575, "step": 255 }, { "epoch": 0.1760921097189299, "grad_norm": 1.1953125, "learning_rate": 3.930209306283867e-05, "loss": 2.1607, "step": 260 }, { "epoch": 0.17947849644429395, "grad_norm": 1.1796875, "learning_rate": 3.923879610288432e-05, "loss": 2.1627, "step": 265 }, { "epoch": 0.182864883169658, "grad_norm": 1.203125, "learning_rate": 3.917280750628421e-05, "loss": 2.1424, "step": 270 }, { "epoch": 0.186251269895022, "grad_norm": 1.1640625, "learning_rate": 3.9104136505285716e-05, "loss": 2.1607, "step": 275 }, { "epoch": 0.18963765662038604, "grad_norm": 1.1875, "learning_rate": 3.903279270742259e-05, "loss": 2.136, "step": 280 }, { "epoch": 0.19302404334575007, "grad_norm": 1.1953125, "learning_rate": 3.8958786094170784e-05, "loss": 2.1558, "step": 285 }, { "epoch": 0.1964104300711141, "grad_norm": 1.1953125, "learning_rate": 3.8882127019552006e-05, "loss": 2.1368, "step": 290 }, { "epoch": 0.19979681679647815, "grad_norm": 1.171875, "learning_rate": 3.880282620868507e-05, "loss": 2.1243, "step": 295 }, { "epoch": 0.2031832035218422, "grad_norm": 1.1796875, "learning_rate": 3.872089475628546e-05, "loss": 2.1518, "step": 300 }, { "epoch": 0.20656959024720623, "grad_norm": 1.15625, "learning_rate": 3.863634412511302e-05, "loss": 2.12, "step": 305 }, { "epoch": 0.20995597697257026, "grad_norm": 1.1484375, "learning_rate": 3.8549186144368304e-05, "loss": 2.172, "step": 310 }, { "epoch": 0.2133423636979343, "grad_norm": 1.15625, "learning_rate": 3.845943300803754e-05, "loss": 2.1216, "step": 315 }, { "epoch": 0.21672875042329834, "grad_norm": 1.140625, "learning_rate": 3.8367097273186644e-05, "loss": 2.1387, "step": 320 }, { "epoch": 0.22011513714866238, "grad_norm": 1.1875, "learning_rate": 3.827219185820441e-05, "loss": 2.1338, "step": 325 }, { "epoch": 0.22350152387402641, "grad_norm": 1.1796875, "learning_rate": 3.817473004099508e-05, "loss": 2.134, "step": 330 }, { "epoch": 0.22688791059939045, "grad_norm": 1.1640625, "learning_rate": 3.807472545712076e-05, "loss": 2.1619, "step": 335 }, { "epoch": 0.2302742973247545, "grad_norm": 1.1640625, "learning_rate": 3.797219209789365e-05, "loss": 2.129, "step": 340 }, { "epoch": 0.23366068405011853, "grad_norm": 1.1484375, "learning_rate": 3.786714430841858e-05, "loss": 2.1493, "step": 345 }, { "epoch": 0.23704707077548257, "grad_norm": 1.1796875, "learning_rate": 3.7759596785586066e-05, "loss": 2.1453, "step": 350 }, { "epoch": 0.2404334575008466, "grad_norm": 1.1328125, "learning_rate": 3.7649564576016076e-05, "loss": 2.1411, "step": 355 }, { "epoch": 0.24381984422621064, "grad_norm": 1.2109375, "learning_rate": 3.7537063073952904e-05, "loss": 2.1367, "step": 360 }, { "epoch": 0.24720623095157468, "grad_norm": 1.1953125, "learning_rate": 3.742210801911147e-05, "loss": 2.1542, "step": 365 }, { "epoch": 0.2505926176769387, "grad_norm": 1.2109375, "learning_rate": 3.730471549447516e-05, "loss": 2.1327, "step": 370 }, { "epoch": 0.25397900440230275, "grad_norm": 1.1328125, "learning_rate": 3.7184901924045745e-05, "loss": 2.1263, "step": 375 }, { "epoch": 0.25736539112766676, "grad_norm": 1.15625, "learning_rate": 3.7062684070545535e-05, "loss": 2.1543, "step": 380 }, { "epoch": 0.26075177785303083, "grad_norm": 1.2109375, "learning_rate": 3.69380790330722e-05, "loss": 2.1336, "step": 385 }, { "epoch": 0.26413816457839484, "grad_norm": 1.1796875, "learning_rate": 3.6811104244706424e-05, "loss": 2.148, "step": 390 }, { "epoch": 0.2675245513037589, "grad_norm": 1.203125, "learning_rate": 3.668177747007297e-05, "loss": 2.1161, "step": 395 }, { "epoch": 0.2709109380291229, "grad_norm": 1.1875, "learning_rate": 3.6550116802855244e-05, "loss": 2.1428, "step": 400 }, { "epoch": 0.274297324754487, "grad_norm": 1.171875, "learning_rate": 3.6416140663263885e-05, "loss": 2.1307, "step": 405 }, { "epoch": 0.277683711479851, "grad_norm": 1.1640625, "learning_rate": 3.627986779545967e-05, "loss": 2.1407, "step": 410 }, { "epoch": 0.28107009820521506, "grad_norm": 1.125, "learning_rate": 3.614131726493102e-05, "loss": 2.1198, "step": 415 }, { "epoch": 0.28445648493057907, "grad_norm": 1.171875, "learning_rate": 3.600050845582669e-05, "loss": 2.1339, "step": 420 }, { "epoch": 0.28784287165594313, "grad_norm": 1.09375, "learning_rate": 3.5857461068243744e-05, "loss": 2.1346, "step": 425 }, { "epoch": 0.29122925838130714, "grad_norm": 1.125, "learning_rate": 3.5712195115471394e-05, "loss": 2.1184, "step": 430 }, { "epoch": 0.2946156451066712, "grad_norm": 1.1953125, "learning_rate": 3.5564730921191e-05, "loss": 2.1343, "step": 435 }, { "epoch": 0.2980020318320352, "grad_norm": 1.1328125, "learning_rate": 3.5415089116632665e-05, "loss": 2.1285, "step": 440 }, { "epoch": 0.30138841855739923, "grad_norm": 1.171875, "learning_rate": 3.526329063768878e-05, "loss": 2.1209, "step": 445 }, { "epoch": 0.3047748052827633, "grad_norm": 1.125, "learning_rate": 3.510935672198495e-05, "loss": 2.1302, "step": 450 }, { "epoch": 0.3081611920081273, "grad_norm": 1.171875, "learning_rate": 3.495330890590871e-05, "loss": 2.1075, "step": 455 }, { "epoch": 0.31154757873349137, "grad_norm": 1.140625, "learning_rate": 3.4795169021596417e-05, "loss": 2.1173, "step": 460 }, { "epoch": 0.3149339654588554, "grad_norm": 1.1640625, "learning_rate": 3.463495919387885e-05, "loss": 2.1284, "step": 465 }, { "epoch": 0.31832035218421945, "grad_norm": 1.1640625, "learning_rate": 3.447270183718574e-05, "loss": 2.1303, "step": 470 }, { "epoch": 0.32170673890958346, "grad_norm": 1.1796875, "learning_rate": 3.430841965240983e-05, "loss": 2.1401, "step": 475 }, { "epoch": 0.3250931256349475, "grad_norm": 1.140625, "learning_rate": 3.4142135623730954e-05, "loss": 2.1344, "step": 480 }, { "epoch": 0.32847951236031153, "grad_norm": 1.140625, "learning_rate": 3.397387301540028e-05, "loss": 2.1269, "step": 485 }, { "epoch": 0.3318658990856756, "grad_norm": 1.1171875, "learning_rate": 3.380365536848558e-05, "loss": 2.1308, "step": 490 }, { "epoch": 0.3352522858110396, "grad_norm": 1.0859375, "learning_rate": 3.363150649757763e-05, "loss": 2.1456, "step": 495 }, { "epoch": 0.3386386725364037, "grad_norm": 1.1640625, "learning_rate": 3.345745048745838e-05, "loss": 2.1163, "step": 500 }, { "epoch": 0.3420250592617677, "grad_norm": 1.1328125, "learning_rate": 3.328151168973139e-05, "loss": 2.1115, "step": 505 }, { "epoch": 0.34541144598713175, "grad_norm": 1.1640625, "learning_rate": 3.3103714719414814e-05, "loss": 2.1203, "step": 510 }, { "epoch": 0.34879783271249576, "grad_norm": 1.15625, "learning_rate": 3.2924084451497606e-05, "loss": 2.1226, "step": 515 }, { "epoch": 0.3521842194378598, "grad_norm": 1.1328125, "learning_rate": 3.274264601745938e-05, "loss": 2.1006, "step": 520 }, { "epoch": 0.35557060616322383, "grad_norm": 1.140625, "learning_rate": 3.25594248017543e-05, "loss": 2.1031, "step": 525 }, { "epoch": 0.3589569928885879, "grad_norm": 1.171875, "learning_rate": 3.237444643825965e-05, "loss": 2.128, "step": 530 }, { "epoch": 0.3623433796139519, "grad_norm": 1.109375, "learning_rate": 3.21877368066895e-05, "loss": 2.1303, "step": 535 }, { "epoch": 0.365729766339316, "grad_norm": 1.1484375, "learning_rate": 3.199932202897391e-05, "loss": 2.1269, "step": 540 }, { "epoch": 0.36911615306468, "grad_norm": 1.140625, "learning_rate": 3.1809228465604345e-05, "loss": 2.1526, "step": 545 }, { "epoch": 0.372502539790044, "grad_norm": 1.1015625, "learning_rate": 3.1617482711945616e-05, "loss": 2.1275, "step": 550 }, { "epoch": 0.37588892651540806, "grad_norm": 1.109375, "learning_rate": 3.1424111594515085e-05, "loss": 2.1329, "step": 555 }, { "epoch": 0.37927531324077207, "grad_norm": 1.1328125, "learning_rate": 3.1229142167229366e-05, "loss": 2.1363, "step": 560 }, { "epoch": 0.38266169996613614, "grad_norm": 1.140625, "learning_rate": 3.1032601707619375e-05, "loss": 2.1249, "step": 565 }, { "epoch": 0.38604808669150015, "grad_norm": 1.125, "learning_rate": 3.083451771301399e-05, "loss": 2.1323, "step": 570 }, { "epoch": 0.3894344734168642, "grad_norm": 1.1171875, "learning_rate": 3.0634917896692965e-05, "loss": 2.1328, "step": 575 }, { "epoch": 0.3928208601422282, "grad_norm": 1.15625, "learning_rate": 3.0433830184009694e-05, "loss": 2.1177, "step": 580 }, { "epoch": 0.3962072468675923, "grad_norm": 1.1484375, "learning_rate": 3.023128270848427e-05, "loss": 2.1293, "step": 585 }, { "epoch": 0.3995936335929563, "grad_norm": 1.09375, "learning_rate": 3.002730380786737e-05, "loss": 2.1073, "step": 590 }, { "epoch": 0.40298002031832036, "grad_norm": 1.125, "learning_rate": 2.982192202017568e-05, "loss": 2.1186, "step": 595 }, { "epoch": 0.4063664070436844, "grad_norm": 1.140625, "learning_rate": 2.9615166079699178e-05, "loss": 2.1191, "step": 600 }, { "epoch": 0.40975279376904844, "grad_norm": 1.125, "learning_rate": 2.9407064912981035e-05, "loss": 2.1293, "step": 605 }, { "epoch": 0.41313918049441245, "grad_norm": 1.125, "learning_rate": 2.91976476347706e-05, "loss": 2.1106, "step": 610 }, { "epoch": 0.4165255672197765, "grad_norm": 1.109375, "learning_rate": 2.898694354395006e-05, "loss": 2.1094, "step": 615 }, { "epoch": 0.4199119539451405, "grad_norm": 1.1484375, "learning_rate": 2.8774982119435307e-05, "loss": 2.1175, "step": 620 }, { "epoch": 0.4232983406705046, "grad_norm": 1.1484375, "learning_rate": 2.856179301605165e-05, "loss": 2.1291, "step": 625 }, { "epoch": 0.4266847273958686, "grad_norm": 1.109375, "learning_rate": 2.8347406060384934e-05, "loss": 2.1246, "step": 630 }, { "epoch": 0.43007111412123267, "grad_norm": 1.1640625, "learning_rate": 2.8131851246608533e-05, "loss": 2.1263, "step": 635 }, { "epoch": 0.4334575008465967, "grad_norm": 1.1328125, "learning_rate": 2.7915158732287042e-05, "loss": 2.1355, "step": 640 }, { "epoch": 0.43684388757196074, "grad_norm": 1.109375, "learning_rate": 2.7697358834156974e-05, "loss": 2.124, "step": 645 }, { "epoch": 0.44023027429732475, "grad_norm": 1.1640625, "learning_rate": 2.747848202388528e-05, "loss": 2.107, "step": 650 }, { "epoch": 0.4436166610226888, "grad_norm": 1.1328125, "learning_rate": 2.725855892380613e-05, "loss": 2.1343, "step": 655 }, { "epoch": 0.44700304774805283, "grad_norm": 1.140625, "learning_rate": 2.703762030263666e-05, "loss": 2.1117, "step": 660 }, { "epoch": 0.45038943447341684, "grad_norm": 1.1015625, "learning_rate": 2.6815697071172245e-05, "loss": 2.1107, "step": 665 }, { "epoch": 0.4537758211987809, "grad_norm": 1.1640625, "learning_rate": 2.6592820277961805e-05, "loss": 2.1012, "step": 670 }, { "epoch": 0.4571622079241449, "grad_norm": 1.140625, "learning_rate": 2.6369021104963976e-05, "loss": 2.1291, "step": 675 }, { "epoch": 0.460548594649509, "grad_norm": 1.1484375, "learning_rate": 2.61443308631845e-05, "loss": 2.1062, "step": 680 }, { "epoch": 0.463934981374873, "grad_norm": 1.109375, "learning_rate": 2.591878098829563e-05, "loss": 2.1077, "step": 685 }, { "epoch": 0.46732136810023706, "grad_norm": 1.140625, "learning_rate": 2.5692403036238038e-05, "loss": 2.1231, "step": 690 }, { "epoch": 0.47070775482560107, "grad_norm": 1.1328125, "learning_rate": 2.546522867880598e-05, "loss": 2.1441, "step": 695 }, { "epoch": 0.47409414155096513, "grad_norm": 1.140625, "learning_rate": 2.5237289699216136e-05, "loss": 2.109, "step": 700 }, { "epoch": 0.47748052827632914, "grad_norm": 1.140625, "learning_rate": 2.5008617987660975e-05, "loss": 2.1384, "step": 705 }, { "epoch": 0.4808669150016932, "grad_norm": 1.125, "learning_rate": 2.4779245536847074e-05, "loss": 2.1083, "step": 710 }, { "epoch": 0.4842533017270572, "grad_norm": 1.1328125, "learning_rate": 2.4549204437519137e-05, "loss": 2.1149, "step": 715 }, { "epoch": 0.4876396884524213, "grad_norm": 1.09375, "learning_rate": 2.4318526873970278e-05, "loss": 2.1059, "step": 720 }, { "epoch": 0.4910260751777853, "grad_norm": 1.125, "learning_rate": 2.4087245119539227e-05, "loss": 2.1218, "step": 725 }, { "epoch": 0.49441246190314936, "grad_norm": 1.109375, "learning_rate": 2.385539153209508e-05, "loss": 2.112, "step": 730 }, { "epoch": 0.49779884862851337, "grad_norm": 1.15625, "learning_rate": 2.3622998549510197e-05, "loss": 2.1128, "step": 735 }, { "epoch": 0.5011852353538774, "grad_norm": 1.1328125, "learning_rate": 2.3390098685121938e-05, "loss": 2.1269, "step": 740 }, { "epoch": 0.5045716220792414, "grad_norm": 1.1484375, "learning_rate": 2.3156724523183832e-05, "loss": 2.1233, "step": 745 }, { "epoch": 0.5079580088046055, "grad_norm": 1.109375, "learning_rate": 2.2922908714306814e-05, "loss": 2.1443, "step": 750 }, { "epoch": 0.5113443955299696, "grad_norm": 1.078125, "learning_rate": 2.268868397089119e-05, "loss": 2.0891, "step": 755 }, { "epoch": 0.5147307822553335, "grad_norm": 1.1015625, "learning_rate": 2.2454083062549957e-05, "loss": 2.112, "step": 760 }, { "epoch": 0.5181171689806976, "grad_norm": 1.1171875, "learning_rate": 2.2219138811524102e-05, "loss": 2.1152, "step": 765 }, { "epoch": 0.5215035557060617, "grad_norm": 1.1328125, "learning_rate": 2.1983884088090553e-05, "loss": 2.1255, "step": 770 }, { "epoch": 0.5248899424314256, "grad_norm": 1.109375, "learning_rate": 2.1748351805963422e-05, "loss": 2.1058, "step": 775 }, { "epoch": 0.5282763291567897, "grad_norm": 1.125, "learning_rate": 2.151257491768914e-05, "loss": 2.1231, "step": 780 }, { "epoch": 0.5316627158821537, "grad_norm": 1.109375, "learning_rate": 2.127658641003617e-05, "loss": 2.1258, "step": 785 }, { "epoch": 0.5350491026075178, "grad_norm": 1.09375, "learning_rate": 2.1040419299379958e-05, "loss": 2.1146, "step": 790 }, { "epoch": 0.5384354893328818, "grad_norm": 1.1171875, "learning_rate": 2.0804106627083708e-05, "loss": 2.1032, "step": 795 }, { "epoch": 0.5418218760582458, "grad_norm": 1.1328125, "learning_rate": 2.0567681454875664e-05, "loss": 2.117, "step": 800 }, { "epoch": 0.5452082627836099, "grad_norm": 1.125, "learning_rate": 2.0331176860223575e-05, "loss": 2.1131, "step": 805 }, { "epoch": 0.548594649508974, "grad_norm": 1.09375, "learning_rate": 2.009462593170691e-05, "loss": 2.1134, "step": 810 }, { "epoch": 0.5519810362343379, "grad_norm": 1.1328125, "learning_rate": 1.985806176438756e-05, "loss": 2.126, "step": 815 }, { "epoch": 0.555367422959702, "grad_norm": 1.1171875, "learning_rate": 1.9621517455179627e-05, "loss": 2.1293, "step": 820 }, { "epoch": 0.558753809685066, "grad_norm": 1.140625, "learning_rate": 1.93850260982189e-05, "loss": 2.1059, "step": 825 }, { "epoch": 0.5621401964104301, "grad_norm": 1.15625, "learning_rate": 1.9148620780232842e-05, "loss": 2.1121, "step": 830 }, { "epoch": 0.5655265831357941, "grad_norm": 1.1171875, "learning_rate": 1.891233457591142e-05, "loss": 2.1178, "step": 835 }, { "epoch": 0.5689129698611581, "grad_norm": 1.109375, "learning_rate": 1.8676200543279864e-05, "loss": 2.105, "step": 840 }, { "epoch": 0.5722993565865222, "grad_norm": 1.109375, "learning_rate": 1.8440251719073533e-05, "loss": 2.1097, "step": 845 }, { "epoch": 0.5756857433118863, "grad_norm": 1.1015625, "learning_rate": 1.8204521114115872e-05, "loss": 2.1136, "step": 850 }, { "epoch": 0.5790721300372502, "grad_norm": 1.1015625, "learning_rate": 1.7969041708700028e-05, "loss": 2.087, "step": 855 }, { "epoch": 0.5824585167626143, "grad_norm": 1.109375, "learning_rate": 1.7733846447974606e-05, "loss": 2.115, "step": 860 }, { "epoch": 0.5858449034879784, "grad_norm": 1.140625, "learning_rate": 1.7498968237334484e-05, "loss": 2.1076, "step": 865 }, { "epoch": 0.5892312902133424, "grad_norm": 1.15625, "learning_rate": 1.7264439937817112e-05, "loss": 2.1182, "step": 870 }, { "epoch": 0.5926176769387064, "grad_norm": 1.1171875, "learning_rate": 1.7030294361505007e-05, "loss": 2.1157, "step": 875 }, { "epoch": 0.5960040636640704, "grad_norm": 1.125, "learning_rate": 1.6796564266935158e-05, "loss": 2.1072, "step": 880 }, { "epoch": 0.5993904503894345, "grad_norm": 1.09375, "learning_rate": 1.6563282354515857e-05, "loss": 2.1034, "step": 885 }, { "epoch": 0.6027768371147985, "grad_norm": 1.078125, "learning_rate": 1.6330481261951726e-05, "loss": 2.0906, "step": 890 }, { "epoch": 0.6061632238401625, "grad_norm": 1.1328125, "learning_rate": 1.609819355967744e-05, "loss": 2.1431, "step": 895 }, { "epoch": 0.6095496105655266, "grad_norm": 1.1015625, "learning_rate": 1.586645174630094e-05, "loss": 2.0905, "step": 900 }, { "epoch": 0.6129359972908907, "grad_norm": 1.0859375, "learning_rate": 1.563528824405666e-05, "loss": 2.1059, "step": 905 }, { "epoch": 0.6163223840162546, "grad_norm": 1.0703125, "learning_rate": 1.5404735394269403e-05, "loss": 2.1163, "step": 910 }, { "epoch": 0.6197087707416187, "grad_norm": 1.0703125, "learning_rate": 1.5174825452829615e-05, "loss": 2.1211, "step": 915 }, { "epoch": 0.6230951574669827, "grad_norm": 1.09375, "learning_rate": 1.4945590585680539e-05, "loss": 2.1069, "step": 920 }, { "epoch": 0.6264815441923468, "grad_norm": 1.09375, "learning_rate": 1.4717062864318004e-05, "loss": 2.0974, "step": 925 }, { "epoch": 0.6298679309177108, "grad_norm": 1.09375, "learning_rate": 1.4489274261303387e-05, "loss": 2.1034, "step": 930 }, { "epoch": 0.6332543176430748, "grad_norm": 1.1015625, "learning_rate": 1.4262256645790469e-05, "loss": 2.1126, "step": 935 }, { "epoch": 0.6366407043684389, "grad_norm": 1.109375, "learning_rate": 1.4036041779066696e-05, "loss": 2.1133, "step": 940 }, { "epoch": 0.640027091093803, "grad_norm": 1.0625, "learning_rate": 1.3810661310109565e-05, "loss": 2.1165, "step": 945 }, { "epoch": 0.6434134778191669, "grad_norm": 1.0859375, "learning_rate": 1.3586146771158746e-05, "loss": 2.1116, "step": 950 }, { "epoch": 0.646799864544531, "grad_norm": 1.1328125, "learning_rate": 1.3362529573304473e-05, "loss": 2.0953, "step": 955 }, { "epoch": 0.650186251269895, "grad_norm": 1.125, "learning_rate": 1.3139841002092949e-05, "loss": 2.1109, "step": 960 }, { "epoch": 0.6535726379952591, "grad_norm": 1.109375, "learning_rate": 1.2918112213149292e-05, "loss": 2.0961, "step": 965 }, { "epoch": 0.6569590247206231, "grad_norm": 1.1015625, "learning_rate": 1.2697374227818609e-05, "loss": 2.1, "step": 970 }, { "epoch": 0.6603454114459871, "grad_norm": 1.1171875, "learning_rate": 1.2477657928825977e-05, "loss": 2.122, "step": 975 }, { "epoch": 0.6637317981713512, "grad_norm": 1.125, "learning_rate": 1.2258994055955658e-05, "loss": 2.0896, "step": 980 }, { "epoch": 0.6671181848967151, "grad_norm": 1.1015625, "learning_rate": 1.2041413201750473e-05, "loss": 2.0827, "step": 985 }, { "epoch": 0.6705045716220792, "grad_norm": 1.0703125, "learning_rate": 1.1824945807231642e-05, "loss": 2.1213, "step": 990 }, { "epoch": 0.6738909583474433, "grad_norm": 1.0859375, "learning_rate": 1.1609622157639913e-05, "loss": 2.113, "step": 995 }, { "epoch": 0.6772773450728073, "grad_norm": 1.125, "learning_rate": 1.139547237819846e-05, "loss": 2.1118, "step": 1000 }, { "epoch": 0.6806637317981713, "grad_norm": 1.109375, "learning_rate": 1.1182526429898118e-05, "loss": 2.1255, "step": 1005 }, { "epoch": 0.6840501185235354, "grad_norm": 1.1328125, "learning_rate": 1.0970814105305689e-05, "loss": 2.116, "step": 1010 }, { "epoch": 0.6874365052488994, "grad_norm": 1.1015625, "learning_rate": 1.0760365024395745e-05, "loss": 2.0958, "step": 1015 }, { "epoch": 0.6908228919742635, "grad_norm": 1.1015625, "learning_rate": 1.0551208630406587e-05, "loss": 2.0891, "step": 1020 }, { "epoch": 0.6942092786996275, "grad_norm": 1.1328125, "learning_rate": 1.0343374185720927e-05, "loss": 2.1199, "step": 1025 }, { "epoch": 0.6975956654249915, "grad_norm": 1.109375, "learning_rate": 1.0136890767771923e-05, "loss": 2.1115, "step": 1030 }, { "epoch": 0.7009820521503556, "grad_norm": 1.109375, "learning_rate": 9.931787264975021e-06, "loss": 2.1147, "step": 1035 }, { "epoch": 0.7043684388757196, "grad_norm": 1.1015625, "learning_rate": 9.72809237268628e-06, "loss": 2.1142, "step": 1040 }, { "epoch": 0.7077548256010836, "grad_norm": 1.09375, "learning_rate": 9.525834589187701e-06, "loss": 2.1012, "step": 1045 }, { "epoch": 0.7111412123264477, "grad_norm": 1.1015625, "learning_rate": 9.325042211700111e-06, "loss": 2.1048, "step": 1050 }, { "epoch": 0.7145275990518117, "grad_norm": 1.1171875, "learning_rate": 9.125743332424213e-06, "loss": 2.1197, "step": 1055 }, { "epoch": 0.7179139857771758, "grad_norm": 1.078125, "learning_rate": 8.92796583461031e-06, "loss": 2.0983, "step": 1060 }, { "epoch": 0.7213003725025398, "grad_norm": 1.1171875, "learning_rate": 8.731737388657198e-06, "loss": 2.1128, "step": 1065 }, { "epoch": 0.7246867592279038, "grad_norm": 1.1015625, "learning_rate": 8.537085448240951e-06, "loss": 2.1188, "step": 1070 }, { "epoch": 0.7280731459532679, "grad_norm": 1.1171875, "learning_rate": 8.344037246473919e-06, "loss": 2.1098, "step": 1075 }, { "epoch": 0.731459532678632, "grad_norm": 1.1171875, "learning_rate": 8.152619792094674e-06, "loss": 2.1052, "step": 1080 }, { "epoch": 0.7348459194039959, "grad_norm": 1.125, "learning_rate": 7.962859865689282e-06, "loss": 2.1407, "step": 1085 }, { "epoch": 0.73823230612936, "grad_norm": 1.15625, "learning_rate": 7.77478401594453e-06, "loss": 2.1314, "step": 1090 }, { "epoch": 0.741618692854724, "grad_norm": 1.1015625, "learning_rate": 7.588418555933581e-06, "loss": 2.101, "step": 1095 }, { "epoch": 0.745005079580088, "grad_norm": 1.109375, "learning_rate": 7.403789559434573e-06, "loss": 2.1036, "step": 1100 }, { "epoch": 0.7483914663054521, "grad_norm": 1.09375, "learning_rate": 7.220922857282804e-06, "loss": 2.0901, "step": 1105 }, { "epoch": 0.7517778530308161, "grad_norm": 1.09375, "learning_rate": 7.039844033756713e-06, "loss": 2.088, "step": 1110 }, { "epoch": 0.7551642397561802, "grad_norm": 1.140625, "learning_rate": 6.860578422998563e-06, "loss": 2.1146, "step": 1115 }, { "epoch": 0.7585506264815441, "grad_norm": 1.1328125, "learning_rate": 6.683151105469956e-06, "loss": 2.1177, "step": 1120 }, { "epoch": 0.7619370132069082, "grad_norm": 1.125, "learning_rate": 6.50758690444297e-06, "loss": 2.1109, "step": 1125 }, { "epoch": 0.7653233999322723, "grad_norm": 1.1171875, "learning_rate": 6.333910382527175e-06, "loss": 2.1356, "step": 1130 }, { "epoch": 0.7687097866576363, "grad_norm": 1.09375, "learning_rate": 6.1621458382331444e-06, "loss": 2.1003, "step": 1135 }, { "epoch": 0.7720961733830003, "grad_norm": 1.1015625, "learning_rate": 5.9923173025729895e-06, "loss": 2.1078, "step": 1140 }, { "epoch": 0.7754825601083644, "grad_norm": 1.140625, "learning_rate": 5.824448535698195e-06, "loss": 2.0915, "step": 1145 }, { "epoch": 0.7788689468337284, "grad_norm": 1.140625, "learning_rate": 5.658563023575478e-06, "loss": 2.1269, "step": 1150 }, { "epoch": 0.7822553335590925, "grad_norm": 1.09375, "learning_rate": 5.494683974700878e-06, "loss": 2.1221, "step": 1155 }, { "epoch": 0.7856417202844564, "grad_norm": 1.1015625, "learning_rate": 5.332834316852786e-06, "loss": 2.0986, "step": 1160 }, { "epoch": 0.7890281070098205, "grad_norm": 1.125, "learning_rate": 5.173036693884164e-06, "loss": 2.1099, "step": 1165 }, { "epoch": 0.7924144937351846, "grad_norm": 1.1484375, "learning_rate": 5.015313462554527e-06, "loss": 2.1086, "step": 1170 }, { "epoch": 0.7958008804605486, "grad_norm": 1.125, "learning_rate": 4.859686689402099e-06, "loss": 2.1064, "step": 1175 }, { "epoch": 0.7991872671859126, "grad_norm": 1.125, "learning_rate": 4.70617814765651e-06, "loss": 2.1341, "step": 1180 }, { "epoch": 0.8025736539112767, "grad_norm": 1.1015625, "learning_rate": 4.554809314192629e-06, "loss": 2.1181, "step": 1185 }, { "epoch": 0.8059600406366407, "grad_norm": 1.1171875, "learning_rate": 4.405601366525776e-06, "loss": 2.1068, "step": 1190 }, { "epoch": 0.8093464273620048, "grad_norm": 1.1328125, "learning_rate": 4.258575179848847e-06, "loss": 2.0982, "step": 1195 }, { "epoch": 0.8127328140873687, "grad_norm": 1.09375, "learning_rate": 4.113751324111748e-06, "loss": 2.1218, "step": 1200 }, { "epoch": 0.8161192008127328, "grad_norm": 1.0859375, "learning_rate": 3.971150061143492e-06, "loss": 2.1166, "step": 1205 }, { "epoch": 0.8195055875380969, "grad_norm": 1.1015625, "learning_rate": 3.830791341817468e-06, "loss": 2.1201, "step": 1210 }, { "epoch": 0.8228919742634608, "grad_norm": 1.109375, "learning_rate": 3.6926948032601663e-06, "loss": 2.1185, "step": 1215 }, { "epoch": 0.8262783609888249, "grad_norm": 1.0703125, "learning_rate": 3.5568797661038004e-06, "loss": 2.1061, "step": 1220 }, { "epoch": 0.829664747714189, "grad_norm": 1.125, "learning_rate": 3.4233652317832424e-06, "loss": 2.1188, "step": 1225 }, { "epoch": 0.833051134439553, "grad_norm": 1.1015625, "learning_rate": 3.292169879877569e-06, "loss": 2.1077, "step": 1230 }, { "epoch": 0.836437521164917, "grad_norm": 1.09375, "learning_rate": 3.1633120654966932e-06, "loss": 2.124, "step": 1235 }, { "epoch": 0.839823907890281, "grad_norm": 1.109375, "learning_rate": 3.0368098167133376e-06, "loss": 2.1057, "step": 1240 }, { "epoch": 0.8432102946156451, "grad_norm": 1.09375, "learning_rate": 2.9126808320408016e-06, "loss": 2.1011, "step": 1245 }, { "epoch": 0.8465966813410092, "grad_norm": 1.09375, "learning_rate": 2.7909424779567996e-06, "loss": 2.0859, "step": 1250 }, { "epoch": 0.8499830680663731, "grad_norm": 1.1015625, "learning_rate": 2.6716117864738024e-06, "loss": 2.1224, "step": 1255 }, { "epoch": 0.8533694547917372, "grad_norm": 1.140625, "learning_rate": 2.5547054527561253e-06, "loss": 2.108, "step": 1260 }, { "epoch": 0.8567558415171013, "grad_norm": 1.0859375, "learning_rate": 2.4402398327841658e-06, "loss": 2.1146, "step": 1265 }, { "epoch": 0.8601422282424653, "grad_norm": 1.125, "learning_rate": 2.3282309410661007e-06, "loss": 2.1147, "step": 1270 }, { "epoch": 0.8635286149678293, "grad_norm": 1.0859375, "learning_rate": 2.2186944483973273e-06, "loss": 2.087, "step": 1275 }, { "epoch": 0.8669150016931934, "grad_norm": 1.125, "learning_rate": 2.1116456796680376e-06, "loss": 2.0946, "step": 1280 }, { "epoch": 0.8703013884185574, "grad_norm": 1.09375, "learning_rate": 2.0070996117191676e-06, "loss": 2.115, "step": 1285 }, { "epoch": 0.8736877751439215, "grad_norm": 1.125, "learning_rate": 1.9050708712470012e-06, "loss": 2.1077, "step": 1290 }, { "epoch": 0.8770741618692854, "grad_norm": 1.1171875, "learning_rate": 1.8055737327568378e-06, "loss": 2.1076, "step": 1295 }, { "epoch": 0.8804605485946495, "grad_norm": 1.0859375, "learning_rate": 1.7086221165658544e-06, "loss": 2.102, "step": 1300 }, { "epoch": 0.8838469353200136, "grad_norm": 1.1171875, "learning_rate": 1.614229586855609e-06, "loss": 2.1115, "step": 1305 }, { "epoch": 0.8872333220453776, "grad_norm": 1.09375, "learning_rate": 1.5224093497742654e-06, "loss": 2.1187, "step": 1310 }, { "epoch": 0.8906197087707416, "grad_norm": 1.1015625, "learning_rate": 1.4331742515890091e-06, "loss": 2.1099, "step": 1315 }, { "epoch": 0.8940060954961057, "grad_norm": 1.078125, "learning_rate": 1.3465367768887471e-06, "loss": 2.11, "step": 1320 }, { "epoch": 0.8973924822214697, "grad_norm": 1.0625, "learning_rate": 1.2625090468374212e-06, "loss": 2.1088, "step": 1325 }, { "epoch": 0.9007788689468337, "grad_norm": 1.1171875, "learning_rate": 1.1811028174782102e-06, "loss": 2.1239, "step": 1330 }, { "epoch": 0.9041652556721977, "grad_norm": 1.078125, "learning_rate": 1.1023294780887394e-06, "loss": 2.1018, "step": 1335 }, { "epoch": 0.9075516423975618, "grad_norm": 1.1328125, "learning_rate": 1.0262000495876735e-06, "loss": 2.12, "step": 1340 }, { "epoch": 0.9109380291229259, "grad_norm": 1.09375, "learning_rate": 9.527251829927997e-07, "loss": 2.1071, "step": 1345 }, { "epoch": 0.9143244158482898, "grad_norm": 1.09375, "learning_rate": 8.819151579308949e-07, "loss": 2.1099, "step": 1350 }, { "epoch": 0.9177108025736539, "grad_norm": 1.109375, "learning_rate": 8.137798811995235e-07, "loss": 2.1087, "step": 1355 }, { "epoch": 0.921097189299018, "grad_norm": 1.109375, "learning_rate": 7.483288853810067e-07, "loss": 2.1193, "step": 1360 }, { "epoch": 0.924483576024382, "grad_norm": 1.1171875, "learning_rate": 6.85571327508765e-07, "loss": 2.1228, "step": 1365 }, { "epoch": 0.927869962749746, "grad_norm": 1.1015625, "learning_rate": 6.255159877861782e-07, "loss": 2.1164, "step": 1370 }, { "epoch": 0.93125634947511, "grad_norm": 1.109375, "learning_rate": 5.681712683581775e-07, "loss": 2.1116, "step": 1375 }, { "epoch": 0.9346427362004741, "grad_norm": 1.1015625, "learning_rate": 5.135451921357337e-07, "loss": 2.0996, "step": 1380 }, { "epoch": 0.9380291229258382, "grad_norm": 1.140625, "learning_rate": 4.616454016733851e-07, "loss": 2.1326, "step": 1385 }, { "epoch": 0.9414155096512021, "grad_norm": 1.1171875, "learning_rate": 4.1247915810001205e-07, "loss": 2.1102, "step": 1390 }, { "epoch": 0.9448018963765662, "grad_norm": 1.0546875, "learning_rate": 3.660533401029387e-07, "loss": 2.1122, "step": 1395 }, { "epoch": 0.9481882831019303, "grad_norm": 1.1484375, "learning_rate": 3.2237444296557173e-07, "loss": 2.1179, "step": 1400 }, { "epoch": 0.9515746698272943, "grad_norm": 1.125, "learning_rate": 2.81448577658654e-07, "loss": 2.1075, "step": 1405 }, { "epoch": 0.9549610565526583, "grad_norm": 1.1015625, "learning_rate": 2.432814699853081e-07, "loss": 2.1126, "step": 1410 }, { "epoch": 0.9583474432780223, "grad_norm": 1.0859375, "learning_rate": 2.078784597799599e-07, "loss": 2.106, "step": 1415 }, { "epoch": 0.9617338300033864, "grad_norm": 1.109375, "learning_rate": 1.752445001612535e-07, "loss": 2.1234, "step": 1420 }, { "epoch": 0.9651202167287504, "grad_norm": 1.09375, "learning_rate": 1.4538415683907236e-07, "loss": 2.0946, "step": 1425 }, { "epoch": 0.9685066034541144, "grad_norm": 1.1171875, "learning_rate": 1.1830160747577924e-07, "loss": 2.0985, "step": 1430 }, { "epoch": 0.9718929901794785, "grad_norm": 1.1953125, "learning_rate": 9.400064110172358e-08, "loss": 2.1185, "step": 1435 }, { "epoch": 0.9752793769048426, "grad_norm": 1.109375, "learning_rate": 7.248465758513457e-08, "loss": 2.0938, "step": 1440 }, { "epoch": 0.9786657636302065, "grad_norm": 1.09375, "learning_rate": 5.375666715645489e-08, "loss": 2.109, "step": 1445 }, { "epoch": 0.9820521503555706, "grad_norm": 1.1015625, "learning_rate": 3.781928998718654e-08, "loss": 2.1146, "step": 1450 }, { "epoch": 0.9854385370809347, "grad_norm": 1.09375, "learning_rate": 2.4674755823321794e-08, "loss": 2.1029, "step": 1455 }, { "epoch": 0.9888249238062987, "grad_norm": 1.1015625, "learning_rate": 1.4324903673370583e-08, "loss": 2.1195, "step": 1460 }, { "epoch": 0.9922113105316627, "grad_norm": 1.0859375, "learning_rate": 6.771181551088468e-09, "loss": 2.1055, "step": 1465 }, { "epoch": 0.9955976972570267, "grad_norm": 1.125, "learning_rate": 2.014646272876508e-09, "loss": 2.1176, "step": 1470 }, { "epoch": 0.9989840839823908, "grad_norm": 1.078125, "learning_rate": 5.596330993284937e-11, "loss": 2.1044, "step": 1475 }, { "epoch": 0.9996613613274636, "eval_loss": 2.1087405681610107, "eval_runtime": 85.1679, "eval_samples_per_second": 15.323, "eval_steps_per_second": 1.926, "step": 1476 }, { "epoch": 0.9996613613274636, "step": 1476, "total_flos": 3.002086743311647e+17, "train_loss": 2.1322910175090883, "train_runtime": 6591.5154, "train_samples_per_second": 3.583, "train_steps_per_second": 0.224 } ], "logging_steps": 5, "max_steps": 1476, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.002086743311647e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }