{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995721009841677, "eval_steps": 876, "global_step": 1752, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "eval_loss": 2.8475096225738525, "eval_runtime": 103.0812, "eval_samples_per_second": 110.845, "eval_steps_per_second": 13.863, "step": 1 }, { "epoch": 0.01, "grad_norm": 8.440493443386991, "learning_rate": 2.0000000000000003e-06, "loss": 2.8492, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.9490346173447717, "learning_rate": 4.000000000000001e-06, "loss": 2.5819, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.600282990359268, "learning_rate": 6e-06, "loss": 2.368, "step": 30 }, { "epoch": 0.02, "grad_norm": 1.5128342355183866, "learning_rate": 8.000000000000001e-06, "loss": 2.285, "step": 40 }, { "epoch": 0.03, "grad_norm": 1.374859750679287, "learning_rate": 1e-05, "loss": 2.2456, "step": 50 }, { "epoch": 0.03, "grad_norm": 1.5897286646420046, "learning_rate": 1.2e-05, "loss": 2.2245, "step": 60 }, { "epoch": 0.04, "grad_norm": 1.5989674455070406, "learning_rate": 1.4e-05, "loss": 2.2195, "step": 70 }, { "epoch": 0.05, "grad_norm": 1.5945958733352381, "learning_rate": 1.6000000000000003e-05, "loss": 2.1831, "step": 80 }, { "epoch": 0.05, "grad_norm": 1.694220003324143, "learning_rate": 1.8e-05, "loss": 2.1767, "step": 90 }, { "epoch": 0.06, "grad_norm": 1.4554411206177154, "learning_rate": 2e-05, "loss": 2.1882, "step": 100 }, { "epoch": 0.06, "grad_norm": 1.512788547176528, "learning_rate": 1.9998191841174705e-05, "loss": 2.1663, "step": 110 }, { "epoch": 0.07, "grad_norm": 1.4884413661778526, "learning_rate": 1.999276801858648e-05, "loss": 2.1768, "step": 120 }, { "epoch": 0.07, "grad_norm": 1.3566762491802051, "learning_rate": 1.998373049366187e-05, "loss": 2.1667, "step": 130 }, { "epoch": 0.08, "grad_norm": 1.3202292242087335, "learning_rate": 1.9971082534656958e-05, "loss": 2.1643, "step": 140 }, { "epoch": 0.09, "grad_norm": 1.5085528727448396, "learning_rate": 1.995482871547548e-05, "loss": 2.1487, "step": 150 }, { "epoch": 0.09, "grad_norm": 1.5895534610050568, "learning_rate": 1.9934974914014765e-05, "loss": 2.1724, "step": 160 }, { "epoch": 0.1, "grad_norm": 1.3805999898228127, "learning_rate": 1.9911528310040073e-05, "loss": 2.1537, "step": 170 }, { "epoch": 0.1, "grad_norm": 1.4290062832106343, "learning_rate": 1.9884497382588185e-05, "loss": 2.1519, "step": 180 }, { "epoch": 0.11, "grad_norm": 1.3924244336208988, "learning_rate": 1.985389190690111e-05, "loss": 2.1671, "step": 190 }, { "epoch": 0.11, "grad_norm": 1.746424181546391, "learning_rate": 1.9819722950891034e-05, "loss": 2.1265, "step": 200 }, { "epoch": 0.12, "grad_norm": 1.487352260526166, "learning_rate": 1.9782002871137835e-05, "loss": 2.1444, "step": 210 }, { "epoch": 0.13, "grad_norm": 1.8190326103740864, "learning_rate": 1.974074530842053e-05, "loss": 2.1355, "step": 220 }, { "epoch": 0.13, "grad_norm": 1.714020487946669, "learning_rate": 1.9695965182784347e-05, "loss": 2.1142, "step": 230 }, { "epoch": 0.14, "grad_norm": 1.5433895344149444, "learning_rate": 1.9647678688145163e-05, "loss": 2.1351, "step": 240 }, { "epoch": 0.14, "grad_norm": 1.512497351400695, "learning_rate": 1.9595903286433256e-05, "loss": 2.1321, "step": 250 }, { "epoch": 0.15, "grad_norm": 1.5487574537580002, "learning_rate": 1.9540657701278536e-05, "loss": 2.1423, "step": 260 }, { "epoch": 0.15, "grad_norm": 1.4010705826153367, "learning_rate": 1.948196191123948e-05, "loss": 2.1352, "step": 270 }, { "epoch": 0.16, "grad_norm": 1.6179557586451234, "learning_rate": 1.9419837142578228e-05, "loss": 2.1248, "step": 280 }, { "epoch": 0.17, "grad_norm": 1.4561046353587423, "learning_rate": 1.9354305861584542e-05, "loss": 2.1301, "step": 290 }, { "epoch": 0.17, "grad_norm": 1.2669388875042749, "learning_rate": 1.928539176645122e-05, "loss": 2.1379, "step": 300 }, { "epoch": 0.18, "grad_norm": 1.341753008560977, "learning_rate": 1.921311977870413e-05, "loss": 2.1439, "step": 310 }, { "epoch": 0.18, "grad_norm": 1.3365022464568894, "learning_rate": 1.9137516034189768e-05, "loss": 2.1262, "step": 320 }, { "epoch": 0.19, "grad_norm": 1.2734810511458647, "learning_rate": 1.9058607873623697e-05, "loss": 2.1266, "step": 330 }, { "epoch": 0.19, "grad_norm": 1.3720854684658654, "learning_rate": 1.897642383270331e-05, "loss": 2.1343, "step": 340 }, { "epoch": 0.2, "grad_norm": 1.121677779284525, "learning_rate": 1.8890993631788384e-05, "loss": 2.1352, "step": 350 }, { "epoch": 0.21, "grad_norm": 1.189239436645775, "learning_rate": 1.880234816515326e-05, "loss": 2.0924, "step": 360 }, { "epoch": 0.21, "grad_norm": 1.3089885528086298, "learning_rate": 1.8710519489814503e-05, "loss": 2.1121, "step": 370 }, { "epoch": 0.22, "grad_norm": 1.2381050831578697, "learning_rate": 1.8615540813938063e-05, "loss": 2.1225, "step": 380 }, { "epoch": 0.22, "grad_norm": 1.3316405801410958, "learning_rate": 1.851744648483014e-05, "loss": 2.1203, "step": 390 }, { "epoch": 0.23, "grad_norm": 1.1917580911147747, "learning_rate": 1.84162719765161e-05, "loss": 2.1079, "step": 400 }, { "epoch": 0.23, "grad_norm": 1.0924602070988052, "learning_rate": 1.831205387691198e-05, "loss": 2.111, "step": 410 }, { "epoch": 0.24, "grad_norm": 1.4364738066604248, "learning_rate": 1.8204829874593083e-05, "loss": 2.0951, "step": 420 }, { "epoch": 0.25, "grad_norm": 1.1895141642997293, "learning_rate": 1.809463874516462e-05, "loss": 2.104, "step": 430 }, { "epoch": 0.25, "grad_norm": 1.0767130864233465, "learning_rate": 1.798152033723923e-05, "loss": 2.0951, "step": 440 }, { "epoch": 0.26, "grad_norm": 1.2427794223386772, "learning_rate": 1.786551555802643e-05, "loss": 2.1186, "step": 450 }, { "epoch": 0.26, "grad_norm": 1.2682001982680597, "learning_rate": 1.774666635853927e-05, "loss": 2.0969, "step": 460 }, { "epoch": 0.27, "grad_norm": 1.3411561694518368, "learning_rate": 1.762501571842355e-05, "loss": 2.0851, "step": 470 }, { "epoch": 0.27, "grad_norm": 1.1660818376097224, "learning_rate": 1.7500607630414973e-05, "loss": 2.0842, "step": 480 }, { "epoch": 0.28, "grad_norm": 26.782646263514767, "learning_rate": 1.7373487084429988e-05, "loss": 2.109, "step": 490 }, { "epoch": 0.29, "grad_norm": 1.0725118823204605, "learning_rate": 1.7243700051296016e-05, "loss": 2.1033, "step": 500 }, { "epoch": 0.29, "grad_norm": 1.1473544876658683, "learning_rate": 1.7111293466126938e-05, "loss": 2.1067, "step": 510 }, { "epoch": 0.3, "grad_norm": 1.1233164996213143, "learning_rate": 1.6976315211349848e-05, "loss": 2.1005, "step": 520 }, { "epoch": 0.3, "grad_norm": 1.6171587137595458, "learning_rate": 1.6838814099389268e-05, "loss": 2.1043, "step": 530 }, { "epoch": 0.31, "grad_norm": 1.372813395267995, "learning_rate": 1.669883985501501e-05, "loss": 2.1002, "step": 540 }, { "epoch": 0.31, "grad_norm": 1.1127836389394197, "learning_rate": 1.6556443097360136e-05, "loss": 2.0851, "step": 550 }, { "epoch": 0.32, "grad_norm": 2.717610258588088, "learning_rate": 1.641167532161545e-05, "loss": 2.0845, "step": 560 }, { "epoch": 0.33, "grad_norm": 1.5653769104194137, "learning_rate": 1.6264588880407218e-05, "loss": 2.0724, "step": 570 }, { "epoch": 0.33, "grad_norm": 1.2316818487469028, "learning_rate": 1.6115236964864798e-05, "loss": 2.078, "step": 580 }, { "epoch": 0.34, "grad_norm": 1.1213742181427604, "learning_rate": 1.5963673585385016e-05, "loss": 2.0748, "step": 590 }, { "epoch": 0.34, "grad_norm": 1.1502465686154286, "learning_rate": 1.580995355210031e-05, "loss": 2.093, "step": 600 }, { "epoch": 0.35, "grad_norm": 1.0993269956575817, "learning_rate": 1.565413245505765e-05, "loss": 2.0444, "step": 610 }, { "epoch": 0.35, "grad_norm": 1.2751286594255482, "learning_rate": 1.5496266644115386e-05, "loss": 2.0861, "step": 620 }, { "epoch": 0.36, "grad_norm": 2.446189093106462, "learning_rate": 1.5336413208565373e-05, "loss": 2.0779, "step": 630 }, { "epoch": 0.37, "grad_norm": 1.2051699362872939, "learning_rate": 1.5174629956487659e-05, "loss": 2.0606, "step": 640 }, { "epoch": 0.37, "grad_norm": 1.1973773050996088, "learning_rate": 1.5010975393845257e-05, "loss": 2.053, "step": 650 }, { "epoch": 0.38, "grad_norm": 1.1884541045802726, "learning_rate": 1.4845508703326504e-05, "loss": 2.0756, "step": 660 }, { "epoch": 0.38, "grad_norm": 1.073704568611228, "learning_rate": 1.4678289722942757e-05, "loss": 2.1002, "step": 670 }, { "epoch": 0.39, "grad_norm": 1.2085026626189295, "learning_rate": 1.4509378924389044e-05, "loss": 2.0443, "step": 680 }, { "epoch": 0.39, "grad_norm": 1.120124358877359, "learning_rate": 1.4338837391175582e-05, "loss": 2.0861, "step": 690 }, { "epoch": 0.4, "grad_norm": 1.1688083159685203, "learning_rate": 1.4166726796538044e-05, "loss": 2.0521, "step": 700 }, { "epoch": 0.41, "grad_norm": 1.1839022965360089, "learning_rate": 1.3993109381134553e-05, "loss": 2.0825, "step": 710 }, { "epoch": 0.41, "grad_norm": 1.204413246457808, "learning_rate": 1.3818047930537491e-05, "loss": 2.0638, "step": 720 }, { "epoch": 0.42, "grad_norm": 1.0984747546501181, "learning_rate": 1.3641605752528225e-05, "loss": 2.0668, "step": 730 }, { "epoch": 0.42, "grad_norm": 1.2424062933194955, "learning_rate": 1.3463846654203021e-05, "loss": 2.0704, "step": 740 }, { "epoch": 0.43, "grad_norm": 1.2287388164827397, "learning_rate": 1.3284834918898362e-05, "loss": 2.0604, "step": 750 }, { "epoch": 0.43, "grad_norm": 1.2727536904439012, "learning_rate": 1.3104635282944054e-05, "loss": 2.062, "step": 760 }, { "epoch": 0.44, "grad_norm": 1.2445619190644972, "learning_rate": 1.2923312912252509e-05, "loss": 2.0675, "step": 770 }, { "epoch": 0.45, "grad_norm": 1.209175703891303, "learning_rate": 1.2740933378752685e-05, "loss": 2.0813, "step": 780 }, { "epoch": 0.45, "grad_norm": 1.05801221856849, "learning_rate": 1.2557562636677195e-05, "loss": 2.0291, "step": 790 }, { "epoch": 0.46, "grad_norm": 1.2744525431893252, "learning_rate": 1.2373266998711152e-05, "loss": 2.07, "step": 800 }, { "epoch": 0.46, "grad_norm": 1.1119947967102999, "learning_rate": 1.2188113112011407e-05, "loss": 2.0593, "step": 810 }, { "epoch": 0.47, "grad_norm": 1.0106196828749796, "learning_rate": 1.2002167934104815e-05, "loss": 2.0631, "step": 820 }, { "epoch": 0.47, "grad_norm": 1.0010665987533642, "learning_rate": 1.1815498708674266e-05, "loss": 2.0714, "step": 830 }, { "epoch": 0.48, "grad_norm": 1.2516027979222362, "learning_rate": 1.162817294124124e-05, "loss": 2.0464, "step": 840 }, { "epoch": 0.48, "grad_norm": 1.0156082708830316, "learning_rate": 1.144025837475365e-05, "loss": 2.0557, "step": 850 }, { "epoch": 0.49, "grad_norm": 1.1250221739850519, "learning_rate": 1.1251822965087856e-05, "loss": 2.0496, "step": 860 }, { "epoch": 0.5, "grad_norm": 1.2672537523919118, "learning_rate": 1.1062934856473655e-05, "loss": 2.0448, "step": 870 }, { "epoch": 0.5, "eval_loss": 2.0418317317962646, "eval_runtime": 103.7763, "eval_samples_per_second": 110.102, "eval_steps_per_second": 13.77, "step": 876 }, { "epoch": 0.5, "grad_norm": 1.1236822586139523, "learning_rate": 1.0873662356851164e-05, "loss": 2.0371, "step": 880 }, { "epoch": 0.51, "grad_norm": 0.992435580318823, "learning_rate": 1.0684073913168502e-05, "loss": 2.0444, "step": 890 }, { "epoch": 0.51, "grad_norm": 1.102811565572559, "learning_rate": 1.0494238086629184e-05, "loss": 2.0619, "step": 900 }, { "epoch": 0.52, "grad_norm": 1.1541097046897795, "learning_rate": 1.0304223527898244e-05, "loss": 2.0344, "step": 910 }, { "epoch": 0.52, "grad_norm": 1.0366360190903774, "learning_rate": 1.0114098952275935e-05, "loss": 2.0665, "step": 920 }, { "epoch": 0.53, "grad_norm": 1.0161399852210633, "learning_rate": 9.923933114848125e-06, "loss": 2.036, "step": 930 }, { "epoch": 0.54, "grad_norm": 1.24836412296511, "learning_rate": 9.733794785622254e-06, "loss": 2.0575, "step": 940 }, { "epoch": 0.54, "grad_norm": 1.1159767378128298, "learning_rate": 9.543752724657924e-06, "loss": 2.0264, "step": 950 }, { "epoch": 0.55, "grad_norm": 1.089755633213397, "learning_rate": 9.353875657201084e-06, "loss": 2.0253, "step": 960 }, { "epoch": 0.55, "grad_norm": 1.106234215385007, "learning_rate": 9.164232248830777e-06, "loss": 2.0188, "step": 970 }, { "epoch": 0.56, "grad_norm": 1.0275698738246675, "learning_rate": 8.974891080627504e-06, "loss": 2.0551, "step": 980 }, { "epoch": 0.56, "grad_norm": 1.1622793874021287, "learning_rate": 8.785920624372122e-06, "loss": 2.036, "step": 990 }, { "epoch": 0.57, "grad_norm": 1.1956825372131208, "learning_rate": 8.597389217784268e-06, "loss": 2.0121, "step": 1000 }, { "epoch": 0.58, "grad_norm": 1.1603630656961061, "learning_rate": 8.409365039809282e-06, "loss": 2.0522, "step": 1010 }, { "epoch": 0.58, "grad_norm": 0.9807193257053537, "learning_rate": 8.221916085962511e-06, "loss": 2.0383, "step": 1020 }, { "epoch": 0.59, "grad_norm": 1.1576541284187967, "learning_rate": 8.03511014374e-06, "loss": 2.0338, "step": 1030 }, { "epoch": 0.59, "grad_norm": 1.058077247137416, "learning_rate": 7.849014768104354e-06, "loss": 2.0087, "step": 1040 }, { "epoch": 0.6, "grad_norm": 1.0492342153111835, "learning_rate": 7.663697257054736e-06, "loss": 2.0375, "step": 1050 }, { "epoch": 0.6, "grad_norm": 1.0607146652851884, "learning_rate": 7.479224627289765e-06, "loss": 2.027, "step": 1060 }, { "epoch": 0.61, "grad_norm": 1.0606337357914948, "learning_rate": 7.295663589972139e-06, "loss": 2.0304, "step": 1070 }, { "epoch": 0.62, "grad_norm": 1.2458990385743096, "learning_rate": 7.113080526603793e-06, "loss": 2.0247, "step": 1080 }, { "epoch": 0.62, "grad_norm": 1.0520000264708274, "learning_rate": 6.93154146502019e-06, "loss": 2.0347, "step": 1090 }, { "epoch": 0.63, "grad_norm": 1.0190764685112967, "learning_rate": 6.7511120555126055e-06, "loss": 2.0245, "step": 1100 }, { "epoch": 0.63, "grad_norm": 1.0288266074715982, "learning_rate": 6.571857547086864e-06, "loss": 2.0269, "step": 1110 }, { "epoch": 0.64, "grad_norm": 1.0406522209538531, "learning_rate": 6.393842763867248e-06, "loss": 2.0148, "step": 1120 }, { "epoch": 0.64, "grad_norm": 0.9707392990243359, "learning_rate": 6.2171320816540144e-06, "loss": 2.0242, "step": 1130 }, { "epoch": 0.65, "grad_norm": 1.0459965944310712, "learning_rate": 6.041789404643078e-06, "loss": 2.0217, "step": 1140 }, { "epoch": 0.66, "grad_norm": 1.1477415350002944, "learning_rate": 5.867878142316221e-06, "loss": 2.0396, "step": 1150 }, { "epoch": 0.66, "grad_norm": 1.02016191198618, "learning_rate": 5.695461186510194e-06, "loss": 2.01, "step": 1160 }, { "epoch": 0.67, "grad_norm": 1.0555922529805126, "learning_rate": 5.524600888673058e-06, "loss": 2.0279, "step": 1170 }, { "epoch": 0.67, "grad_norm": 1.047057359776084, "learning_rate": 5.355359037315893e-06, "loss": 2.0288, "step": 1180 }, { "epoch": 0.68, "grad_norm": 1.2084733042646434, "learning_rate": 5.187796835668137e-06, "loss": 2.0069, "step": 1190 }, { "epoch": 0.68, "grad_norm": 1.0679386080767506, "learning_rate": 5.021974879544522e-06, "loss": 2.0239, "step": 1200 }, { "epoch": 0.69, "grad_norm": 1.1309906953604523, "learning_rate": 4.857953135431723e-06, "loss": 1.9917, "step": 1210 }, { "epoch": 0.7, "grad_norm": 1.0606942502452434, "learning_rate": 4.695790918802577e-06, "loss": 2.0103, "step": 1220 }, { "epoch": 0.7, "grad_norm": 1.222543190695613, "learning_rate": 4.535546872665707e-06, "loss": 2.0211, "step": 1230 }, { "epoch": 0.71, "grad_norm": 1.1395641397720997, "learning_rate": 4.377278946358363e-06, "loss": 2.0281, "step": 1240 }, { "epoch": 0.71, "grad_norm": 1.2221683190418635, "learning_rate": 4.2210443745900806e-06, "loss": 1.9907, "step": 1250 }, { "epoch": 0.72, "grad_norm": 1.080522060394201, "learning_rate": 4.066899656744816e-06, "loss": 1.9982, "step": 1260 }, { "epoch": 0.72, "grad_norm": 0.9711133864519806, "learning_rate": 3.914900536448959e-06, "loss": 1.9983, "step": 1270 }, { "epoch": 0.73, "grad_norm": 1.1380541178528216, "learning_rate": 3.7651019814126656e-06, "loss": 2.0097, "step": 1280 }, { "epoch": 0.74, "grad_norm": 0.9633220950938465, "learning_rate": 3.617558163551802e-06, "loss": 1.986, "step": 1290 }, { "epoch": 0.74, "grad_norm": 1.0500879776289904, "learning_rate": 3.4723224393976353e-06, "loss": 2.0258, "step": 1300 }, { "epoch": 0.75, "grad_norm": 1.064203126844793, "learning_rate": 3.329447330801455e-06, "loss": 2.03, "step": 1310 }, { "epoch": 0.75, "grad_norm": 0.9922208622268893, "learning_rate": 3.1889845059409552e-06, "loss": 2.0059, "step": 1320 }, { "epoch": 0.76, "grad_norm": 1.153139782016341, "learning_rate": 3.0509847606354215e-06, "loss": 2.024, "step": 1330 }, { "epoch": 0.76, "grad_norm": 1.4479646282996403, "learning_rate": 2.91549799997632e-06, "loss": 1.996, "step": 1340 }, { "epoch": 0.77, "grad_norm": 1.0338025335111782, "learning_rate": 2.782573220280055e-06, "loss": 2.0027, "step": 1350 }, { "epoch": 0.78, "grad_norm": 1.063732584831313, "learning_rate": 2.6522584913693295e-06, "loss": 1.999, "step": 1360 }, { "epoch": 0.78, "grad_norm": 1.18151388005813, "learning_rate": 2.5246009391895665e-06, "loss": 2.0197, "step": 1370 }, { "epoch": 0.79, "grad_norm": 0.96078239213874, "learning_rate": 2.3996467287666914e-06, "loss": 1.9811, "step": 1380 }, { "epoch": 0.79, "grad_norm": 1.2378184274393051, "learning_rate": 2.277441047512361e-06, "loss": 2.0001, "step": 1390 }, { "epoch": 0.8, "grad_norm": 1.0558611234794726, "learning_rate": 2.1580280888828e-06, "loss": 1.9939, "step": 1400 }, { "epoch": 0.8, "grad_norm": 0.9611056812337401, "learning_rate": 2.041451036397002e-06, "loss": 1.9958, "step": 1410 }, { "epoch": 0.81, "grad_norm": 1.031950356244068, "learning_rate": 1.9277520480202205e-06, "loss": 2.0299, "step": 1420 }, { "epoch": 0.82, "grad_norm": 1.2064256312461334, "learning_rate": 1.81697224091831e-06, "loss": 1.9985, "step": 1430 }, { "epoch": 0.82, "grad_norm": 1.06230601717853, "learning_rate": 1.7091516765884464e-06, "loss": 2.0173, "step": 1440 }, { "epoch": 0.83, "grad_norm": 1.2012632430707513, "learning_rate": 1.6043293463716202e-06, "loss": 2.0259, "step": 1450 }, { "epoch": 0.83, "grad_norm": 1.0228988400529624, "learning_rate": 1.5025431573521209e-06, "loss": 2.0066, "step": 1460 }, { "epoch": 0.84, "grad_norm": 1.0275759044730708, "learning_rate": 1.4038299186491444e-06, "loss": 2.0044, "step": 1470 }, { "epoch": 0.84, "grad_norm": 1.0956109756099108, "learning_rate": 1.308225328105439e-06, "loss": 1.9951, "step": 1480 }, { "epoch": 0.85, "grad_norm": 1.5121064882610806, "learning_rate": 1.215763959377827e-06, "loss": 1.9667, "step": 1490 }, { "epoch": 0.86, "grad_norm": 0.9788828191357646, "learning_rate": 1.1264792494342858e-06, "loss": 2.0037, "step": 1500 }, { "epoch": 0.86, "grad_norm": 0.9967966044408001, "learning_rate": 1.0404034864620605e-06, "loss": 1.9866, "step": 1510 }, { "epoch": 0.87, "grad_norm": 1.0099926855076669, "learning_rate": 9.575677981912457e-07, "loss": 1.9836, "step": 1520 }, { "epoch": 0.87, "grad_norm": 1.002362361634915, "learning_rate": 8.780021406380012e-07, "loss": 1.9983, "step": 1530 }, { "epoch": 0.88, "grad_norm": 0.9435464306765974, "learning_rate": 8.017352872715078e-07, "loss": 1.9797, "step": 1540 }, { "epoch": 0.88, "grad_norm": 0.999977176523449, "learning_rate": 7.287948186085614e-07, "loss": 2.0142, "step": 1550 }, { "epoch": 0.89, "grad_norm": 1.152097320833946, "learning_rate": 6.592071122395849e-07, "loss": 2.0097, "step": 1560 }, { "epoch": 0.9, "grad_norm": 1.0857203698752966, "learning_rate": 5.929973332896677e-07, "loss": 2.0183, "step": 1570 }, { "epoch": 0.9, "grad_norm": 1.0543333030471254, "learning_rate": 5.301894253180295e-07, "loss": 1.985, "step": 1580 }, { "epoch": 0.91, "grad_norm": 1.0212489054938816, "learning_rate": 4.708061016592924e-07, "loss": 1.9888, "step": 1590 }, { "epoch": 0.91, "grad_norm": 1.2687381408479979, "learning_rate": 4.1486883720960436e-07, "loss": 1.997, "step": 1600 }, { "epoch": 0.92, "grad_norm": 1.0473866672283625, "learning_rate": 3.6239786066064264e-07, "loss": 2.0062, "step": 1610 }, { "epoch": 0.92, "grad_norm": 0.9105990124075911, "learning_rate": 3.1341214718426885e-07, "loss": 1.999, "step": 1620 }, { "epoch": 0.93, "grad_norm": 1.0250683907923996, "learning_rate": 2.6792941157051446e-07, "loss": 1.9727, "step": 1630 }, { "epoch": 0.94, "grad_norm": 0.9011974189916996, "learning_rate": 2.2596610182133328e-07, "loss": 1.998, "step": 1640 }, { "epoch": 0.94, "grad_norm": 1.0603603261183756, "learning_rate": 1.8753739320250153e-07, "loss": 2.0138, "step": 1650 }, { "epoch": 0.95, "grad_norm": 0.9978845922018089, "learning_rate": 1.5265718275574658e-07, "loss": 1.9968, "step": 1660 }, { "epoch": 0.95, "grad_norm": 1.092063001361788, "learning_rate": 1.2133808427313486e-07, "loss": 2.011, "step": 1670 }, { "epoch": 0.96, "grad_norm": 1.0628189984992609, "learning_rate": 9.359142373553287e-08, "loss": 2.0037, "step": 1680 }, { "epoch": 0.96, "grad_norm": 0.9584594600500118, "learning_rate": 6.942723521676465e-08, "loss": 1.9823, "step": 1690 }, { "epoch": 0.97, "grad_norm": 1.0728091649423657, "learning_rate": 4.88542572549755e-08, "loss": 1.9732, "step": 1700 }, { "epoch": 0.98, "grad_norm": 1.0209005535735403, "learning_rate": 3.187992969249876e-08, "loss": 2.0, "step": 1710 }, { "epoch": 0.98, "grad_norm": 1.0849633622627997, "learning_rate": 1.851039098537122e-08, "loss": 1.9958, "step": 1720 }, { "epoch": 0.99, "grad_norm": 0.9962734868875971, "learning_rate": 8.750475983472228e-09, "loss": 1.9992, "step": 1730 }, { "epoch": 0.99, "grad_norm": 1.083369781103426, "learning_rate": 2.6037141820933752e-09, "loss": 1.9757, "step": 1740 }, { "epoch": 1.0, "grad_norm": 0.9782925803672103, "learning_rate": 7.232844555282725e-11, "loss": 1.9782, "step": 1750 }, { "epoch": 1.0, "eval_loss": 1.987204909324646, "eval_runtime": 103.685, "eval_samples_per_second": 110.199, "eval_steps_per_second": 13.782, "step": 1752 } ], "logging_steps": 10, "max_steps": 1752, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 876, "total_flos": 366806984294400.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }