|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9995721009841677, |
|
"eval_steps": 876, |
|
"global_step": 1752, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 2.8475096225738525, |
|
"eval_runtime": 103.0812, |
|
"eval_samples_per_second": 110.845, |
|
"eval_steps_per_second": 13.863, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 8.440493443386991, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.8492, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9490346173447717, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.5819, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.600282990359268, |
|
"learning_rate": 6e-06, |
|
"loss": 2.368, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5128342355183866, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.285, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.374859750679287, |
|
"learning_rate": 1e-05, |
|
"loss": 2.2456, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.5897286646420046, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.2245, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5989674455070406, |
|
"learning_rate": 1.4e-05, |
|
"loss": 2.2195, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5945958733352381, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.1831, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.694220003324143, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.1767, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.4554411206177154, |
|
"learning_rate": 2e-05, |
|
"loss": 2.1882, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.512788547176528, |
|
"learning_rate": 1.9998191841174705e-05, |
|
"loss": 2.1663, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4884413661778526, |
|
"learning_rate": 1.999276801858648e-05, |
|
"loss": 2.1768, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.3566762491802051, |
|
"learning_rate": 1.998373049366187e-05, |
|
"loss": 2.1667, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3202292242087335, |
|
"learning_rate": 1.9971082534656958e-05, |
|
"loss": 2.1643, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5085528727448396, |
|
"learning_rate": 1.995482871547548e-05, |
|
"loss": 2.1487, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5895534610050568, |
|
"learning_rate": 1.9934974914014765e-05, |
|
"loss": 2.1724, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3805999898228127, |
|
"learning_rate": 1.9911528310040073e-05, |
|
"loss": 2.1537, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4290062832106343, |
|
"learning_rate": 1.9884497382588185e-05, |
|
"loss": 2.1519, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3924244336208988, |
|
"learning_rate": 1.985389190690111e-05, |
|
"loss": 2.1671, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.746424181546391, |
|
"learning_rate": 1.9819722950891034e-05, |
|
"loss": 2.1265, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.487352260526166, |
|
"learning_rate": 1.9782002871137835e-05, |
|
"loss": 2.1444, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.8190326103740864, |
|
"learning_rate": 1.974074530842053e-05, |
|
"loss": 2.1355, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.714020487946669, |
|
"learning_rate": 1.9695965182784347e-05, |
|
"loss": 2.1142, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5433895344149444, |
|
"learning_rate": 1.9647678688145163e-05, |
|
"loss": 2.1351, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.512497351400695, |
|
"learning_rate": 1.9595903286433256e-05, |
|
"loss": 2.1321, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.5487574537580002, |
|
"learning_rate": 1.9540657701278536e-05, |
|
"loss": 2.1423, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.4010705826153367, |
|
"learning_rate": 1.948196191123948e-05, |
|
"loss": 2.1352, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.6179557586451234, |
|
"learning_rate": 1.9419837142578228e-05, |
|
"loss": 2.1248, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.4561046353587423, |
|
"learning_rate": 1.9354305861584542e-05, |
|
"loss": 2.1301, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2669388875042749, |
|
"learning_rate": 1.928539176645122e-05, |
|
"loss": 2.1379, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.341753008560977, |
|
"learning_rate": 1.921311977870413e-05, |
|
"loss": 2.1439, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.3365022464568894, |
|
"learning_rate": 1.9137516034189768e-05, |
|
"loss": 2.1262, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.2734810511458647, |
|
"learning_rate": 1.9058607873623697e-05, |
|
"loss": 2.1266, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.3720854684658654, |
|
"learning_rate": 1.897642383270331e-05, |
|
"loss": 2.1343, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.121677779284525, |
|
"learning_rate": 1.8890993631788384e-05, |
|
"loss": 2.1352, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.189239436645775, |
|
"learning_rate": 1.880234816515326e-05, |
|
"loss": 2.0924, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.3089885528086298, |
|
"learning_rate": 1.8710519489814503e-05, |
|
"loss": 2.1121, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2381050831578697, |
|
"learning_rate": 1.8615540813938063e-05, |
|
"loss": 2.1225, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3316405801410958, |
|
"learning_rate": 1.851744648483014e-05, |
|
"loss": 2.1203, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1917580911147747, |
|
"learning_rate": 1.84162719765161e-05, |
|
"loss": 2.1079, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0924602070988052, |
|
"learning_rate": 1.831205387691198e-05, |
|
"loss": 2.111, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4364738066604248, |
|
"learning_rate": 1.8204829874593083e-05, |
|
"loss": 2.0951, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1895141642997293, |
|
"learning_rate": 1.809463874516462e-05, |
|
"loss": 2.104, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.0767130864233465, |
|
"learning_rate": 1.798152033723923e-05, |
|
"loss": 2.0951, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.2427794223386772, |
|
"learning_rate": 1.786551555802643e-05, |
|
"loss": 2.1186, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.2682001982680597, |
|
"learning_rate": 1.774666635853927e-05, |
|
"loss": 2.0969, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.3411561694518368, |
|
"learning_rate": 1.762501571842355e-05, |
|
"loss": 2.0851, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1660818376097224, |
|
"learning_rate": 1.7500607630414973e-05, |
|
"loss": 2.0842, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 26.782646263514767, |
|
"learning_rate": 1.7373487084429988e-05, |
|
"loss": 2.109, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.0725118823204605, |
|
"learning_rate": 1.7243700051296016e-05, |
|
"loss": 2.1033, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1473544876658683, |
|
"learning_rate": 1.7111293466126938e-05, |
|
"loss": 2.1067, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.1233164996213143, |
|
"learning_rate": 1.6976315211349848e-05, |
|
"loss": 2.1005, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.6171587137595458, |
|
"learning_rate": 1.6838814099389268e-05, |
|
"loss": 2.1043, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.372813395267995, |
|
"learning_rate": 1.669883985501501e-05, |
|
"loss": 2.1002, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.1127836389394197, |
|
"learning_rate": 1.6556443097360136e-05, |
|
"loss": 2.0851, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.717610258588088, |
|
"learning_rate": 1.641167532161545e-05, |
|
"loss": 2.0845, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.5653769104194137, |
|
"learning_rate": 1.6264588880407218e-05, |
|
"loss": 2.0724, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2316818487469028, |
|
"learning_rate": 1.6115236964864798e-05, |
|
"loss": 2.078, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1213742181427604, |
|
"learning_rate": 1.5963673585385016e-05, |
|
"loss": 2.0748, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1502465686154286, |
|
"learning_rate": 1.580995355210031e-05, |
|
"loss": 2.093, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0993269956575817, |
|
"learning_rate": 1.565413245505765e-05, |
|
"loss": 2.0444, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.2751286594255482, |
|
"learning_rate": 1.5496266644115386e-05, |
|
"loss": 2.0861, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.446189093106462, |
|
"learning_rate": 1.5336413208565373e-05, |
|
"loss": 2.0779, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.2051699362872939, |
|
"learning_rate": 1.5174629956487659e-05, |
|
"loss": 2.0606, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1973773050996088, |
|
"learning_rate": 1.5010975393845257e-05, |
|
"loss": 2.053, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.1884541045802726, |
|
"learning_rate": 1.4845508703326504e-05, |
|
"loss": 2.0756, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.073704568611228, |
|
"learning_rate": 1.4678289722942757e-05, |
|
"loss": 2.1002, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.2085026626189295, |
|
"learning_rate": 1.4509378924389044e-05, |
|
"loss": 2.0443, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.120124358877359, |
|
"learning_rate": 1.4338837391175582e-05, |
|
"loss": 2.0861, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1688083159685203, |
|
"learning_rate": 1.4166726796538044e-05, |
|
"loss": 2.0521, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.1839022965360089, |
|
"learning_rate": 1.3993109381134553e-05, |
|
"loss": 2.0825, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.204413246457808, |
|
"learning_rate": 1.3818047930537491e-05, |
|
"loss": 2.0638, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.0984747546501181, |
|
"learning_rate": 1.3641605752528225e-05, |
|
"loss": 2.0668, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.2424062933194955, |
|
"learning_rate": 1.3463846654203021e-05, |
|
"loss": 2.0704, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2287388164827397, |
|
"learning_rate": 1.3284834918898362e-05, |
|
"loss": 2.0604, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2727536904439012, |
|
"learning_rate": 1.3104635282944054e-05, |
|
"loss": 2.062, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.2445619190644972, |
|
"learning_rate": 1.2923312912252509e-05, |
|
"loss": 2.0675, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.209175703891303, |
|
"learning_rate": 1.2740933378752685e-05, |
|
"loss": 2.0813, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.05801221856849, |
|
"learning_rate": 1.2557562636677195e-05, |
|
"loss": 2.0291, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2744525431893252, |
|
"learning_rate": 1.2373266998711152e-05, |
|
"loss": 2.07, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1119947967102999, |
|
"learning_rate": 1.2188113112011407e-05, |
|
"loss": 2.0593, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.0106196828749796, |
|
"learning_rate": 1.2002167934104815e-05, |
|
"loss": 2.0631, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.0010665987533642, |
|
"learning_rate": 1.1815498708674266e-05, |
|
"loss": 2.0714, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.2516027979222362, |
|
"learning_rate": 1.162817294124124e-05, |
|
"loss": 2.0464, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0156082708830316, |
|
"learning_rate": 1.144025837475365e-05, |
|
"loss": 2.0557, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1250221739850519, |
|
"learning_rate": 1.1251822965087856e-05, |
|
"loss": 2.0496, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.2672537523919118, |
|
"learning_rate": 1.1062934856473655e-05, |
|
"loss": 2.0448, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 2.0418317317962646, |
|
"eval_runtime": 103.7763, |
|
"eval_samples_per_second": 110.102, |
|
"eval_steps_per_second": 13.77, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.1236822586139523, |
|
"learning_rate": 1.0873662356851164e-05, |
|
"loss": 2.0371, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.992435580318823, |
|
"learning_rate": 1.0684073913168502e-05, |
|
"loss": 2.0444, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.102811565572559, |
|
"learning_rate": 1.0494238086629184e-05, |
|
"loss": 2.0619, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1541097046897795, |
|
"learning_rate": 1.0304223527898244e-05, |
|
"loss": 2.0344, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.0366360190903774, |
|
"learning_rate": 1.0114098952275935e-05, |
|
"loss": 2.0665, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0161399852210633, |
|
"learning_rate": 9.923933114848125e-06, |
|
"loss": 2.036, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.24836412296511, |
|
"learning_rate": 9.733794785622254e-06, |
|
"loss": 2.0575, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1159767378128298, |
|
"learning_rate": 9.543752724657924e-06, |
|
"loss": 2.0264, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.089755633213397, |
|
"learning_rate": 9.353875657201084e-06, |
|
"loss": 2.0253, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.106234215385007, |
|
"learning_rate": 9.164232248830777e-06, |
|
"loss": 2.0188, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0275698738246675, |
|
"learning_rate": 8.974891080627504e-06, |
|
"loss": 2.0551, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1622793874021287, |
|
"learning_rate": 8.785920624372122e-06, |
|
"loss": 2.036, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1956825372131208, |
|
"learning_rate": 8.597389217784268e-06, |
|
"loss": 2.0121, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1603630656961061, |
|
"learning_rate": 8.409365039809282e-06, |
|
"loss": 2.0522, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.9807193257053537, |
|
"learning_rate": 8.221916085962511e-06, |
|
"loss": 2.0383, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1576541284187967, |
|
"learning_rate": 8.03511014374e-06, |
|
"loss": 2.0338, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.058077247137416, |
|
"learning_rate": 7.849014768104354e-06, |
|
"loss": 2.0087, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0492342153111835, |
|
"learning_rate": 7.663697257054736e-06, |
|
"loss": 2.0375, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0607146652851884, |
|
"learning_rate": 7.479224627289765e-06, |
|
"loss": 2.027, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.0606337357914948, |
|
"learning_rate": 7.295663589972139e-06, |
|
"loss": 2.0304, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.2458990385743096, |
|
"learning_rate": 7.113080526603793e-06, |
|
"loss": 2.0247, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0520000264708274, |
|
"learning_rate": 6.93154146502019e-06, |
|
"loss": 2.0347, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0190764685112967, |
|
"learning_rate": 6.7511120555126055e-06, |
|
"loss": 2.0245, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0288266074715982, |
|
"learning_rate": 6.571857547086864e-06, |
|
"loss": 2.0269, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0406522209538531, |
|
"learning_rate": 6.393842763867248e-06, |
|
"loss": 2.0148, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9707392990243359, |
|
"learning_rate": 6.2171320816540144e-06, |
|
"loss": 2.0242, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0459965944310712, |
|
"learning_rate": 6.041789404643078e-06, |
|
"loss": 2.0217, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1477415350002944, |
|
"learning_rate": 5.867878142316221e-06, |
|
"loss": 2.0396, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.02016191198618, |
|
"learning_rate": 5.695461186510194e-06, |
|
"loss": 2.01, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0555922529805126, |
|
"learning_rate": 5.524600888673058e-06, |
|
"loss": 2.0279, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.047057359776084, |
|
"learning_rate": 5.355359037315893e-06, |
|
"loss": 2.0288, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.2084733042646434, |
|
"learning_rate": 5.187796835668137e-06, |
|
"loss": 2.0069, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0679386080767506, |
|
"learning_rate": 5.021974879544522e-06, |
|
"loss": 2.0239, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1309906953604523, |
|
"learning_rate": 4.857953135431723e-06, |
|
"loss": 1.9917, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0606942502452434, |
|
"learning_rate": 4.695790918802577e-06, |
|
"loss": 2.0103, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.222543190695613, |
|
"learning_rate": 4.535546872665707e-06, |
|
"loss": 2.0211, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1395641397720997, |
|
"learning_rate": 4.377278946358363e-06, |
|
"loss": 2.0281, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.2221683190418635, |
|
"learning_rate": 4.2210443745900806e-06, |
|
"loss": 1.9907, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.080522060394201, |
|
"learning_rate": 4.066899656744816e-06, |
|
"loss": 1.9982, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.9711133864519806, |
|
"learning_rate": 3.914900536448959e-06, |
|
"loss": 1.9983, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1380541178528216, |
|
"learning_rate": 3.7651019814126656e-06, |
|
"loss": 2.0097, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.9633220950938465, |
|
"learning_rate": 3.617558163551802e-06, |
|
"loss": 1.986, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.0500879776289904, |
|
"learning_rate": 3.4723224393976353e-06, |
|
"loss": 2.0258, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.064203126844793, |
|
"learning_rate": 3.329447330801455e-06, |
|
"loss": 2.03, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.9922208622268893, |
|
"learning_rate": 3.1889845059409552e-06, |
|
"loss": 2.0059, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.153139782016341, |
|
"learning_rate": 3.0509847606354215e-06, |
|
"loss": 2.024, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4479646282996403, |
|
"learning_rate": 2.91549799997632e-06, |
|
"loss": 1.996, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0338025335111782, |
|
"learning_rate": 2.782573220280055e-06, |
|
"loss": 2.0027, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.063732584831313, |
|
"learning_rate": 2.6522584913693295e-06, |
|
"loss": 1.999, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.18151388005813, |
|
"learning_rate": 2.5246009391895665e-06, |
|
"loss": 2.0197, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.96078239213874, |
|
"learning_rate": 2.3996467287666914e-06, |
|
"loss": 1.9811, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.2378184274393051, |
|
"learning_rate": 2.277441047512361e-06, |
|
"loss": 2.0001, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0558611234794726, |
|
"learning_rate": 2.1580280888828e-06, |
|
"loss": 1.9939, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9611056812337401, |
|
"learning_rate": 2.041451036397002e-06, |
|
"loss": 1.9958, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.031950356244068, |
|
"learning_rate": 1.9277520480202205e-06, |
|
"loss": 2.0299, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.2064256312461334, |
|
"learning_rate": 1.81697224091831e-06, |
|
"loss": 1.9985, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.06230601717853, |
|
"learning_rate": 1.7091516765884464e-06, |
|
"loss": 2.0173, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2012632430707513, |
|
"learning_rate": 1.6043293463716202e-06, |
|
"loss": 2.0259, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.0228988400529624, |
|
"learning_rate": 1.5025431573521209e-06, |
|
"loss": 2.0066, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0275759044730708, |
|
"learning_rate": 1.4038299186491444e-06, |
|
"loss": 2.0044, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0956109756099108, |
|
"learning_rate": 1.308225328105439e-06, |
|
"loss": 1.9951, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.5121064882610806, |
|
"learning_rate": 1.215763959377827e-06, |
|
"loss": 1.9667, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9788828191357646, |
|
"learning_rate": 1.1264792494342858e-06, |
|
"loss": 2.0037, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9967966044408001, |
|
"learning_rate": 1.0404034864620605e-06, |
|
"loss": 1.9866, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0099926855076669, |
|
"learning_rate": 9.575677981912457e-07, |
|
"loss": 1.9836, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.002362361634915, |
|
"learning_rate": 8.780021406380012e-07, |
|
"loss": 1.9983, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.9435464306765974, |
|
"learning_rate": 8.017352872715078e-07, |
|
"loss": 1.9797, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.999977176523449, |
|
"learning_rate": 7.287948186085614e-07, |
|
"loss": 2.0142, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.152097320833946, |
|
"learning_rate": 6.592071122395849e-07, |
|
"loss": 2.0097, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.0857203698752966, |
|
"learning_rate": 5.929973332896677e-07, |
|
"loss": 2.0183, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.0543333030471254, |
|
"learning_rate": 5.301894253180295e-07, |
|
"loss": 1.985, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0212489054938816, |
|
"learning_rate": 4.708061016592924e-07, |
|
"loss": 1.9888, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.2687381408479979, |
|
"learning_rate": 4.1486883720960436e-07, |
|
"loss": 1.997, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0473866672283625, |
|
"learning_rate": 3.6239786066064264e-07, |
|
"loss": 2.0062, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.9105990124075911, |
|
"learning_rate": 3.1341214718426885e-07, |
|
"loss": 1.999, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0250683907923996, |
|
"learning_rate": 2.6792941157051446e-07, |
|
"loss": 1.9727, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.9011974189916996, |
|
"learning_rate": 2.2596610182133328e-07, |
|
"loss": 1.998, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0603603261183756, |
|
"learning_rate": 1.8753739320250153e-07, |
|
"loss": 2.0138, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.9978845922018089, |
|
"learning_rate": 1.5265718275574658e-07, |
|
"loss": 1.9968, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.092063001361788, |
|
"learning_rate": 1.2133808427313486e-07, |
|
"loss": 2.011, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0628189984992609, |
|
"learning_rate": 9.359142373553287e-08, |
|
"loss": 2.0037, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9584594600500118, |
|
"learning_rate": 6.942723521676465e-08, |
|
"loss": 1.9823, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.0728091649423657, |
|
"learning_rate": 4.88542572549755e-08, |
|
"loss": 1.9732, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.0209005535735403, |
|
"learning_rate": 3.187992969249876e-08, |
|
"loss": 2.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.0849633622627997, |
|
"learning_rate": 1.851039098537122e-08, |
|
"loss": 1.9958, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.9962734868875971, |
|
"learning_rate": 8.750475983472228e-09, |
|
"loss": 1.9992, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.083369781103426, |
|
"learning_rate": 2.6037141820933752e-09, |
|
"loss": 1.9757, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9782925803672103, |
|
"learning_rate": 7.232844555282725e-11, |
|
"loss": 1.9782, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.987204909324646, |
|
"eval_runtime": 103.685, |
|
"eval_samples_per_second": 110.199, |
|
"eval_steps_per_second": 13.782, |
|
"step": 1752 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1752, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 876, |
|
"total_flos": 366806984294400.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|