|
{ |
|
"best_metric": 1.7600704893538932e-07, |
|
"best_model_checkpoint": "Models/t5-base-class-gen/checkpoint-32000", |
|
"epoch": 14.925373134328359, |
|
"eval_steps": 100, |
|
"global_step": 32000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04664179104477612, |
|
"grad_norm": 1.517059087753296, |
|
"learning_rate": 3.9878109452736323e-05, |
|
"loss": 0.427, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04664179104477612, |
|
"eval_loss": 0.030949920415878296, |
|
"eval_runtime": 0.1617, |
|
"eval_samples_per_second": 185.512, |
|
"eval_steps_per_second": 24.735, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09328358208955224, |
|
"grad_norm": 0.39070969820022583, |
|
"learning_rate": 3.9753731343283585e-05, |
|
"loss": 0.0403, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09328358208955224, |
|
"eval_loss": 0.0011001491220667958, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.821, |
|
"eval_steps_per_second": 24.643, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13992537313432835, |
|
"grad_norm": 1.9688999652862549, |
|
"learning_rate": 3.962935323383085e-05, |
|
"loss": 0.0212, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13992537313432835, |
|
"eval_loss": 0.0012941204477101564, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.715, |
|
"eval_steps_per_second": 24.629, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"grad_norm": 0.4089069068431854, |
|
"learning_rate": 3.950497512437811e-05, |
|
"loss": 0.0121, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"eval_loss": 0.0006173434085212648, |
|
"eval_runtime": 0.1662, |
|
"eval_samples_per_second": 180.537, |
|
"eval_steps_per_second": 24.072, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2332089552238806, |
|
"grad_norm": 0.029543137177824974, |
|
"learning_rate": 3.938059701492538e-05, |
|
"loss": 0.0124, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2332089552238806, |
|
"eval_loss": 0.0004312974342610687, |
|
"eval_runtime": 0.1619, |
|
"eval_samples_per_second": 185.304, |
|
"eval_steps_per_second": 24.707, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2798507462686567, |
|
"grad_norm": 0.03992057591676712, |
|
"learning_rate": 3.925621890547264e-05, |
|
"loss": 0.0078, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2798507462686567, |
|
"eval_loss": 0.0006017092382535338, |
|
"eval_runtime": 0.1668, |
|
"eval_samples_per_second": 179.838, |
|
"eval_steps_per_second": 23.978, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.32649253731343286, |
|
"grad_norm": 0.058661218732595444, |
|
"learning_rate": 3.9131840796019907e-05, |
|
"loss": 0.0052, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32649253731343286, |
|
"eval_loss": 0.0009563259081915021, |
|
"eval_runtime": 0.1598, |
|
"eval_samples_per_second": 187.691, |
|
"eval_steps_per_second": 25.025, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 0.01660950854420662, |
|
"learning_rate": 3.900746268656717e-05, |
|
"loss": 0.0073, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"eval_loss": 0.002205046359449625, |
|
"eval_runtime": 0.1627, |
|
"eval_samples_per_second": 184.382, |
|
"eval_steps_per_second": 24.584, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4197761194029851, |
|
"grad_norm": 0.7414250373840332, |
|
"learning_rate": 3.888308457711443e-05, |
|
"loss": 0.0054, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4197761194029851, |
|
"eval_loss": 0.0007016424206085503, |
|
"eval_runtime": 0.1586, |
|
"eval_samples_per_second": 189.134, |
|
"eval_steps_per_second": 25.218, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4664179104477612, |
|
"grad_norm": 0.02844785712659359, |
|
"learning_rate": 3.875870646766169e-05, |
|
"loss": 0.0038, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4664179104477612, |
|
"eval_loss": 2.6049870939459652e-05, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.806, |
|
"eval_steps_per_second": 24.907, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5130597014925373, |
|
"grad_norm": 0.014462646096944809, |
|
"learning_rate": 3.863432835820896e-05, |
|
"loss": 0.0055, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5130597014925373, |
|
"eval_loss": 2.936022610811051e-05, |
|
"eval_runtime": 0.1625, |
|
"eval_samples_per_second": 184.588, |
|
"eval_steps_per_second": 24.612, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"grad_norm": 0.3922411799430847, |
|
"learning_rate": 3.850995024875622e-05, |
|
"loss": 0.0045, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"eval_loss": 3.228507557651028e-05, |
|
"eval_runtime": 0.16, |
|
"eval_samples_per_second": 187.448, |
|
"eval_steps_per_second": 24.993, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6063432835820896, |
|
"grad_norm": 0.09573910385370255, |
|
"learning_rate": 3.838557213930348e-05, |
|
"loss": 0.0039, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6063432835820896, |
|
"eval_loss": 3.271787500125356e-05, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.312, |
|
"eval_steps_per_second": 24.575, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6529850746268657, |
|
"grad_norm": 0.07145769894123077, |
|
"learning_rate": 3.8261194029850745e-05, |
|
"loss": 0.0022, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6529850746268657, |
|
"eval_loss": 2.1901514628552832e-05, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.627, |
|
"eval_steps_per_second": 24.217, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6996268656716418, |
|
"grad_norm": 0.0054560168646276, |
|
"learning_rate": 3.813681592039801e-05, |
|
"loss": 0.003, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6996268656716418, |
|
"eval_loss": 1.5640718629583716e-05, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.135, |
|
"eval_steps_per_second": 24.551, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.011433558538556099, |
|
"learning_rate": 3.8012437810945275e-05, |
|
"loss": 0.0028, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"eval_loss": 2.3460554075427353e-05, |
|
"eval_runtime": 0.1627, |
|
"eval_samples_per_second": 184.379, |
|
"eval_steps_per_second": 24.584, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.792910447761194, |
|
"grad_norm": 3.918342113494873, |
|
"learning_rate": 3.788805970149254e-05, |
|
"loss": 0.0024, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.792910447761194, |
|
"eval_loss": 1.9451523257885128e-05, |
|
"eval_runtime": 0.1594, |
|
"eval_samples_per_second": 188.168, |
|
"eval_steps_per_second": 25.089, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8395522388059702, |
|
"grad_norm": 0.07180186361074448, |
|
"learning_rate": 3.7763681592039805e-05, |
|
"loss": 0.0008, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8395522388059702, |
|
"eval_loss": 1.5172809071373194e-05, |
|
"eval_runtime": 0.1619, |
|
"eval_samples_per_second": 185.284, |
|
"eval_steps_per_second": 24.705, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8861940298507462, |
|
"grad_norm": 0.05330910533666611, |
|
"learning_rate": 3.7639303482587066e-05, |
|
"loss": 0.0025, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8861940298507462, |
|
"eval_loss": 1.201580380438827e-05, |
|
"eval_runtime": 0.162, |
|
"eval_samples_per_second": 185.15, |
|
"eval_steps_per_second": 24.687, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"grad_norm": 0.05660516396164894, |
|
"learning_rate": 3.751492537313433e-05, |
|
"loss": 0.0019, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"eval_loss": 1.2361979315755889e-05, |
|
"eval_runtime": 0.1669, |
|
"eval_samples_per_second": 179.773, |
|
"eval_steps_per_second": 23.97, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9794776119402985, |
|
"grad_norm": 0.007570169400423765, |
|
"learning_rate": 3.7390547263681596e-05, |
|
"loss": 0.0025, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9794776119402985, |
|
"eval_loss": 1.094327490136493e-05, |
|
"eval_runtime": 0.1611, |
|
"eval_samples_per_second": 186.167, |
|
"eval_steps_per_second": 24.822, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0261194029850746, |
|
"grad_norm": 0.28353750705718994, |
|
"learning_rate": 3.726616915422886e-05, |
|
"loss": 0.0011, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0261194029850746, |
|
"eval_loss": 1.550199340272229e-05, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.156, |
|
"eval_steps_per_second": 24.821, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0727611940298507, |
|
"grad_norm": 0.09195005893707275, |
|
"learning_rate": 3.7141791044776126e-05, |
|
"loss": 0.0012, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0727611940298507, |
|
"eval_loss": 9.91811066342052e-06, |
|
"eval_runtime": 0.1665, |
|
"eval_samples_per_second": 180.181, |
|
"eval_steps_per_second": 24.024, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.0037090331315994263, |
|
"learning_rate": 3.701741293532339e-05, |
|
"loss": 0.0012, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"eval_loss": 1.1438472029112745e-05, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.44, |
|
"eval_steps_per_second": 24.859, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.166044776119403, |
|
"grad_norm": 0.023447873070836067, |
|
"learning_rate": 3.689303482587065e-05, |
|
"loss": 0.0015, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.166044776119403, |
|
"eval_loss": 1.1437785360612907e-05, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.123, |
|
"eval_steps_per_second": 24.283, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.212686567164179, |
|
"grad_norm": 0.5180063843727112, |
|
"learning_rate": 3.676865671641791e-05, |
|
"loss": 0.0018, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.212686567164179, |
|
"eval_loss": 2.4738228603382595e-05, |
|
"eval_runtime": 0.1685, |
|
"eval_samples_per_second": 178.021, |
|
"eval_steps_per_second": 23.736, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2593283582089552, |
|
"grad_norm": 0.01776309497654438, |
|
"learning_rate": 3.664427860696518e-05, |
|
"loss": 0.0009, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2593283582089552, |
|
"eval_loss": 4.052605800097808e-05, |
|
"eval_runtime": 0.1601, |
|
"eval_samples_per_second": 187.399, |
|
"eval_steps_per_second": 24.987, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"grad_norm": 0.003942739684134722, |
|
"learning_rate": 3.651990049751244e-05, |
|
"loss": 0.0008, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"eval_loss": 1.8365864889346994e-05, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.233, |
|
"eval_steps_per_second": 24.964, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3526119402985075, |
|
"grad_norm": 0.02459954284131527, |
|
"learning_rate": 3.63955223880597e-05, |
|
"loss": 0.0012, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3526119402985075, |
|
"eval_loss": 1.1862516657856759e-05, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.179, |
|
"eval_steps_per_second": 24.557, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3992537313432836, |
|
"grad_norm": 0.002258304273709655, |
|
"learning_rate": 3.6271144278606964e-05, |
|
"loss": 0.0013, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3992537313432836, |
|
"eval_loss": 8.7456610344816e-06, |
|
"eval_runtime": 0.1649, |
|
"eval_samples_per_second": 181.877, |
|
"eval_steps_per_second": 24.25, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.4458955223880596, |
|
"grad_norm": 0.0027893888764083385, |
|
"learning_rate": 3.614676616915423e-05, |
|
"loss": 0.001, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4458955223880596, |
|
"eval_loss": 1.060960585164139e-05, |
|
"eval_runtime": 0.1717, |
|
"eval_samples_per_second": 174.725, |
|
"eval_steps_per_second": 23.297, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.19958974421024323, |
|
"learning_rate": 3.6022388059701494e-05, |
|
"loss": 0.0005, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"eval_loss": 7.3110695666400716e-06, |
|
"eval_runtime": 0.1703, |
|
"eval_samples_per_second": 176.161, |
|
"eval_steps_per_second": 23.488, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.539179104477612, |
|
"grad_norm": 0.009552941657602787, |
|
"learning_rate": 3.589800995024876e-05, |
|
"loss": 0.0007, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.539179104477612, |
|
"eval_loss": 7.919575182313565e-06, |
|
"eval_runtime": 0.162, |
|
"eval_samples_per_second": 185.238, |
|
"eval_steps_per_second": 24.698, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.585820895522388, |
|
"grad_norm": 0.0036544003523886204, |
|
"learning_rate": 3.5773631840796024e-05, |
|
"loss": 0.001, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.585820895522388, |
|
"eval_loss": 9.348523235530593e-06, |
|
"eval_runtime": 0.1653, |
|
"eval_samples_per_second": 181.53, |
|
"eval_steps_per_second": 24.204, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.6324626865671643, |
|
"grad_norm": 0.013017863966524601, |
|
"learning_rate": 3.5649253731343286e-05, |
|
"loss": 0.0008, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6324626865671643, |
|
"eval_loss": 7.1847543949843384e-06, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.276, |
|
"eval_steps_per_second": 24.97, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"grad_norm": 0.0018999130697920918, |
|
"learning_rate": 3.552487562189055e-05, |
|
"loss": 0.0025, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"eval_loss": 8.392209565499797e-06, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.495, |
|
"eval_steps_per_second": 24.866, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.7257462686567164, |
|
"grad_norm": 0.0020983312278985977, |
|
"learning_rate": 3.5400497512437816e-05, |
|
"loss": 0.0017, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7257462686567164, |
|
"eval_loss": 1.0126370398211293e-05, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.676, |
|
"eval_steps_per_second": 24.623, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7723880597014925, |
|
"grad_norm": 0.026340099051594734, |
|
"learning_rate": 3.527611940298508e-05, |
|
"loss": 0.0005, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.7723880597014925, |
|
"eval_loss": 1.1981122952420264e-05, |
|
"eval_runtime": 0.1676, |
|
"eval_samples_per_second": 179.033, |
|
"eval_steps_per_second": 23.871, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.8190298507462686, |
|
"grad_norm": 0.014922079630196095, |
|
"learning_rate": 3.515174129353234e-05, |
|
"loss": 0.0006, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8190298507462686, |
|
"eval_loss": 9.199016858474351e-06, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 185.942, |
|
"eval_steps_per_second": 24.792, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"grad_norm": 0.27961060404777527, |
|
"learning_rate": 3.50273631840796e-05, |
|
"loss": 0.0007, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"eval_loss": 4.940530743624549e-06, |
|
"eval_runtime": 0.1625, |
|
"eval_samples_per_second": 184.614, |
|
"eval_steps_per_second": 24.615, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.912313432835821, |
|
"grad_norm": 0.017754705622792244, |
|
"learning_rate": 3.490298507462687e-05, |
|
"loss": 0.0007, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.912313432835821, |
|
"eval_loss": 5.068028713139938e-06, |
|
"eval_runtime": 0.16, |
|
"eval_samples_per_second": 187.528, |
|
"eval_steps_per_second": 25.004, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9589552238805972, |
|
"grad_norm": 0.27114221453666687, |
|
"learning_rate": 3.477860696517413e-05, |
|
"loss": 0.0004, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9589552238805972, |
|
"eval_loss": 4.581762823363533e-06, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.179, |
|
"eval_steps_per_second": 24.557, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.0055970149253732, |
|
"grad_norm": 0.012334506027400494, |
|
"learning_rate": 3.46542288557214e-05, |
|
"loss": 0.002, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0055970149253732, |
|
"eval_loss": 6.550972102559172e-06, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.884, |
|
"eval_steps_per_second": 24.651, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0522388059701493, |
|
"grad_norm": 0.06736209243535995, |
|
"learning_rate": 3.452985074626866e-05, |
|
"loss": 0.0004, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0522388059701493, |
|
"eval_loss": 5.742116627516225e-06, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.421, |
|
"eval_steps_per_second": 24.456, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0988805970149254, |
|
"grad_norm": 0.006035651080310345, |
|
"learning_rate": 3.440547263681592e-05, |
|
"loss": 0.0014, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.0988805970149254, |
|
"eval_loss": 7.217912752821576e-06, |
|
"eval_runtime": 0.1633, |
|
"eval_samples_per_second": 183.679, |
|
"eval_steps_per_second": 24.491, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.1455223880597014, |
|
"grad_norm": 0.0010893335565924644, |
|
"learning_rate": 3.4281094527363184e-05, |
|
"loss": 0.0003, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.1455223880597014, |
|
"eval_loss": 5.789845090475865e-06, |
|
"eval_runtime": 0.1605, |
|
"eval_samples_per_second": 186.889, |
|
"eval_steps_per_second": 24.919, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.1921641791044775, |
|
"grad_norm": 0.0023283003829419613, |
|
"learning_rate": 3.415671641791045e-05, |
|
"loss": 0.0003, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.1921641791044775, |
|
"eval_loss": 4.947921297571156e-06, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.294, |
|
"eval_steps_per_second": 24.973, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.2388059701492535, |
|
"grad_norm": 0.01192499604076147, |
|
"learning_rate": 3.4032338308457714e-05, |
|
"loss": 0.0007, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.2388059701492535, |
|
"eval_loss": 5.415991836343892e-06, |
|
"eval_runtime": 0.1607, |
|
"eval_samples_per_second": 186.643, |
|
"eval_steps_per_second": 24.886, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.28544776119403, |
|
"grad_norm": 0.001801644335500896, |
|
"learning_rate": 3.390796019900498e-05, |
|
"loss": 0.0003, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.28544776119403, |
|
"eval_loss": 4.487563728616806e-06, |
|
"eval_runtime": 0.1615, |
|
"eval_samples_per_second": 185.729, |
|
"eval_steps_per_second": 24.764, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.332089552238806, |
|
"grad_norm": 0.0010308738565072417, |
|
"learning_rate": 3.3783582089552244e-05, |
|
"loss": 0.0009, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.332089552238806, |
|
"eval_loss": 6.067215053917607e-06, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.262, |
|
"eval_steps_per_second": 24.568, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.378731343283582, |
|
"grad_norm": 0.0012472744565457106, |
|
"learning_rate": 3.3659203980099505e-05, |
|
"loss": 0.0003, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.378731343283582, |
|
"eval_loss": 1.881633943412453e-05, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.836, |
|
"eval_steps_per_second": 24.911, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.425373134328358, |
|
"grad_norm": 0.7578136920928955, |
|
"learning_rate": 3.353482587064677e-05, |
|
"loss": 0.0009, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.425373134328358, |
|
"eval_loss": 1.3828990631736815e-05, |
|
"eval_runtime": 0.1631, |
|
"eval_samples_per_second": 183.952, |
|
"eval_steps_per_second": 24.527, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.4720149253731343, |
|
"grad_norm": 0.1774366796016693, |
|
"learning_rate": 3.3410447761194035e-05, |
|
"loss": 0.0003, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.4720149253731343, |
|
"eval_loss": 9.651944310462568e-06, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.83, |
|
"eval_steps_per_second": 24.644, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.5186567164179103, |
|
"grad_norm": 0.020136339589953423, |
|
"learning_rate": 3.32860696517413e-05, |
|
"loss": 0.0006, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5186567164179103, |
|
"eval_loss": 9.613087968318723e-06, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.441, |
|
"eval_steps_per_second": 24.859, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5652985074626864, |
|
"grad_norm": 0.0021144032943993807, |
|
"learning_rate": 3.316169154228856e-05, |
|
"loss": 0.0002, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.5652985074626864, |
|
"eval_loss": 6.973440122237662e-06, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 186.02, |
|
"eval_steps_per_second": 24.803, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"grad_norm": 0.0018188412068411708, |
|
"learning_rate": 3.303731343283582e-05, |
|
"loss": 0.0002, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"eval_loss": 5.028930445405422e-06, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.338, |
|
"eval_steps_per_second": 24.445, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.658582089552239, |
|
"grad_norm": 0.0019931041169911623, |
|
"learning_rate": 3.291293532338309e-05, |
|
"loss": 0.0005, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.658582089552239, |
|
"eval_loss": 4.870566499448614e-06, |
|
"eval_runtime": 0.1657, |
|
"eval_samples_per_second": 180.997, |
|
"eval_steps_per_second": 24.133, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.705223880597015, |
|
"grad_norm": 0.0018326346762478352, |
|
"learning_rate": 3.278855721393035e-05, |
|
"loss": 0.0002, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.705223880597015, |
|
"eval_loss": 3.6711526263388805e-06, |
|
"eval_runtime": 0.1741, |
|
"eval_samples_per_second": 172.304, |
|
"eval_steps_per_second": 22.974, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.751865671641791, |
|
"grad_norm": 0.003958791960030794, |
|
"learning_rate": 3.266417910447762e-05, |
|
"loss": 0.0008, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.751865671641791, |
|
"eval_loss": 8.262709343398456e-06, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.052, |
|
"eval_steps_per_second": 24.674, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.798507462686567, |
|
"grad_norm": 0.001310827792622149, |
|
"learning_rate": 3.253980099502488e-05, |
|
"loss": 0.0002, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.798507462686567, |
|
"eval_loss": 5.826313099532854e-06, |
|
"eval_runtime": 0.1597, |
|
"eval_samples_per_second": 187.8, |
|
"eval_steps_per_second": 25.04, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.845149253731343, |
|
"grad_norm": 0.0018846031744033098, |
|
"learning_rate": 3.241542288557214e-05, |
|
"loss": 0.0006, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.845149253731343, |
|
"eval_loss": 1.0719989404606167e-05, |
|
"eval_runtime": 0.1635, |
|
"eval_samples_per_second": 183.451, |
|
"eval_steps_per_second": 24.46, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.8917910447761193, |
|
"grad_norm": 0.0027365258429199457, |
|
"learning_rate": 3.22910447761194e-05, |
|
"loss": 0.0003, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.8917910447761193, |
|
"eval_loss": 4.158847787039122e-06, |
|
"eval_runtime": 0.1603, |
|
"eval_samples_per_second": 187.178, |
|
"eval_steps_per_second": 24.957, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.9384328358208958, |
|
"grad_norm": 0.003453996032476425, |
|
"learning_rate": 3.216666666666667e-05, |
|
"loss": 0.001, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.9384328358208958, |
|
"eval_loss": 1.029476788971806e-05, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.748, |
|
"eval_steps_per_second": 24.9, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"grad_norm": 0.036285314708948135, |
|
"learning_rate": 3.204228855721393e-05, |
|
"loss": 0.0006, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"eval_loss": 5.606885224551661e-06, |
|
"eval_runtime": 0.16, |
|
"eval_samples_per_second": 187.553, |
|
"eval_steps_per_second": 25.007, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.031716417910448, |
|
"grad_norm": 0.46069106459617615, |
|
"learning_rate": 3.1917910447761195e-05, |
|
"loss": 0.0014, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.031716417910448, |
|
"eval_loss": 6.717081760143628e-06, |
|
"eval_runtime": 0.1597, |
|
"eval_samples_per_second": 187.883, |
|
"eval_steps_per_second": 25.051, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.078358208955224, |
|
"grad_norm": 0.0014442165847867727, |
|
"learning_rate": 3.1793532338308456e-05, |
|
"loss": 0.0004, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.078358208955224, |
|
"eval_loss": 2.4730301447561942e-05, |
|
"eval_runtime": 0.1633, |
|
"eval_samples_per_second": 183.726, |
|
"eval_steps_per_second": 24.497, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.0008062911801971495, |
|
"learning_rate": 3.1670398009950254e-05, |
|
"loss": 0.0013, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 3.047320433324785e-06, |
|
"eval_runtime": 0.1643, |
|
"eval_samples_per_second": 182.624, |
|
"eval_steps_per_second": 24.35, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.171641791044776, |
|
"grad_norm": 0.0008228031219914556, |
|
"learning_rate": 3.1546019900497516e-05, |
|
"loss": 0.0007, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.171641791044776, |
|
"eval_loss": 3.1468273391510593e-06, |
|
"eval_runtime": 0.1684, |
|
"eval_samples_per_second": 178.135, |
|
"eval_steps_per_second": 23.751, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.218283582089552, |
|
"grad_norm": 0.006291732657700777, |
|
"learning_rate": 3.142164179104478e-05, |
|
"loss": 0.0006, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.218283582089552, |
|
"eval_loss": 3.600493528210791e-06, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.264, |
|
"eval_steps_per_second": 24.435, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.264925373134328, |
|
"grad_norm": 0.007644134573638439, |
|
"learning_rate": 3.129726368159204e-05, |
|
"loss": 0.0004, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.264925373134328, |
|
"eval_loss": 3.303811354271602e-06, |
|
"eval_runtime": 0.1708, |
|
"eval_samples_per_second": 175.635, |
|
"eval_steps_per_second": 23.418, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.3115671641791042, |
|
"grad_norm": 0.005915221758186817, |
|
"learning_rate": 3.117288557213931e-05, |
|
"loss": 0.0005, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3115671641791042, |
|
"eval_loss": 2.879673274946981e-06, |
|
"eval_runtime": 0.1654, |
|
"eval_samples_per_second": 181.433, |
|
"eval_steps_per_second": 24.191, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"grad_norm": 0.0019528436241671443, |
|
"learning_rate": 3.104850746268657e-05, |
|
"loss": 0.0006, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"eval_loss": 3.403750042707543e-06, |
|
"eval_runtime": 0.1608, |
|
"eval_samples_per_second": 186.522, |
|
"eval_steps_per_second": 24.87, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.404850746268657, |
|
"grad_norm": 0.004577342886477709, |
|
"learning_rate": 3.092412935323384e-05, |
|
"loss": 0.0003, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.404850746268657, |
|
"eval_loss": 6.347925591398962e-06, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.155, |
|
"eval_steps_per_second": 24.821, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.451492537313433, |
|
"grad_norm": 0.0017618165584281087, |
|
"learning_rate": 3.079975124378109e-05, |
|
"loss": 0.0003, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.451492537313433, |
|
"eval_loss": 4.75058095616987e-06, |
|
"eval_runtime": 0.1761, |
|
"eval_samples_per_second": 170.387, |
|
"eval_steps_per_second": 22.718, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.498134328358209, |
|
"grad_norm": 0.29626068472862244, |
|
"learning_rate": 3.067537313432836e-05, |
|
"loss": 0.0002, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.498134328358209, |
|
"eval_loss": 3.206900828445214e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.83, |
|
"eval_steps_per_second": 24.911, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.544776119402985, |
|
"grad_norm": 0.0010133940959349275, |
|
"learning_rate": 3.055099502487562e-05, |
|
"loss": 0.0002, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.544776119402985, |
|
"eval_loss": 5.057888301962521e-06, |
|
"eval_runtime": 0.1667, |
|
"eval_samples_per_second": 179.969, |
|
"eval_steps_per_second": 23.996, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.591417910447761, |
|
"grad_norm": 0.001219356432557106, |
|
"learning_rate": 3.0426616915422887e-05, |
|
"loss": 0.0006, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.591417910447761, |
|
"eval_loss": 4.3034724512835965e-06, |
|
"eval_runtime": 0.1589, |
|
"eval_samples_per_second": 188.793, |
|
"eval_steps_per_second": 25.172, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.638059701492537, |
|
"grad_norm": 0.004916150122880936, |
|
"learning_rate": 3.030223880597015e-05, |
|
"loss": 0.0005, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.638059701492537, |
|
"eval_loss": 7.532180006819544e-06, |
|
"eval_runtime": 0.1593, |
|
"eval_samples_per_second": 188.302, |
|
"eval_steps_per_second": 25.107, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.6847014925373136, |
|
"grad_norm": 0.0009036241099238396, |
|
"learning_rate": 3.0177860696517417e-05, |
|
"loss": 0.0002, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.6847014925373136, |
|
"eval_loss": 7.954057764436584e-06, |
|
"eval_runtime": 0.1617, |
|
"eval_samples_per_second": 185.49, |
|
"eval_steps_per_second": 24.732, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"grad_norm": 0.0004865360097028315, |
|
"learning_rate": 3.005348258706468e-05, |
|
"loss": 0.0002, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"eval_loss": 5.304416390572442e-06, |
|
"eval_runtime": 0.1601, |
|
"eval_samples_per_second": 187.374, |
|
"eval_steps_per_second": 24.983, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.7779850746268657, |
|
"grad_norm": 0.0006289943703450263, |
|
"learning_rate": 2.9929104477611944e-05, |
|
"loss": 0.001, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.7779850746268657, |
|
"eval_loss": 6.098490302974824e-06, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.228, |
|
"eval_steps_per_second": 24.964, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.824626865671642, |
|
"grad_norm": 0.0017220574663951993, |
|
"learning_rate": 2.9804726368159205e-05, |
|
"loss": 0.0003, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.824626865671642, |
|
"eval_loss": 6.202932127052918e-06, |
|
"eval_runtime": 0.1592, |
|
"eval_samples_per_second": 188.401, |
|
"eval_steps_per_second": 25.12, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.871268656716418, |
|
"grad_norm": 0.10363404452800751, |
|
"learning_rate": 2.968034825870647e-05, |
|
"loss": 0.0004, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.871268656716418, |
|
"eval_loss": 3.6653259485319722e-06, |
|
"eval_runtime": 0.162, |
|
"eval_samples_per_second": 185.235, |
|
"eval_steps_per_second": 24.698, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.917910447761194, |
|
"grad_norm": 0.001283775782212615, |
|
"learning_rate": 2.9555970149253732e-05, |
|
"loss": 0.0003, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.917910447761194, |
|
"eval_loss": 3.4666393275983864e-06, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.777, |
|
"eval_steps_per_second": 24.637, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.96455223880597, |
|
"grad_norm": 0.0013221842236816883, |
|
"learning_rate": 2.9431592039800997e-05, |
|
"loss": 0.0001, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.96455223880597, |
|
"eval_loss": 2.620423856569687e-06, |
|
"eval_runtime": 0.1605, |
|
"eval_samples_per_second": 186.903, |
|
"eval_steps_per_second": 24.92, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.0111940298507465, |
|
"grad_norm": 0.0007219673716463149, |
|
"learning_rate": 2.930721393034826e-05, |
|
"loss": 0.0004, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.0111940298507465, |
|
"eval_loss": 2.7292528557154583e-06, |
|
"eval_runtime": 0.1831, |
|
"eval_samples_per_second": 163.862, |
|
"eval_steps_per_second": 21.848, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.057835820895522, |
|
"grad_norm": 0.13019587099552155, |
|
"learning_rate": 2.9182835820895527e-05, |
|
"loss": 0.0002, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 4.057835820895522, |
|
"eval_loss": 2.8391550586093217e-06, |
|
"eval_runtime": 0.1605, |
|
"eval_samples_per_second": 186.893, |
|
"eval_steps_per_second": 24.919, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 4.104477611940299, |
|
"grad_norm": 0.0006854361272417009, |
|
"learning_rate": 2.9058457711442788e-05, |
|
"loss": 0.0002, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.104477611940299, |
|
"eval_loss": 2.3670136215514503e-06, |
|
"eval_runtime": 0.1616, |
|
"eval_samples_per_second": 185.674, |
|
"eval_steps_per_second": 24.757, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.151119402985074, |
|
"grad_norm": 0.0259992815554142, |
|
"learning_rate": 2.8934079601990053e-05, |
|
"loss": 0.0001, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.151119402985074, |
|
"eval_loss": 1.925168362504337e-06, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.21, |
|
"eval_steps_per_second": 24.961, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.197761194029851, |
|
"grad_norm": 0.002843194641172886, |
|
"learning_rate": 2.8809701492537315e-05, |
|
"loss": 0.0006, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.197761194029851, |
|
"eval_loss": 1.060922113538254e-05, |
|
"eval_runtime": 0.1591, |
|
"eval_samples_per_second": 188.552, |
|
"eval_steps_per_second": 25.14, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.244402985074627, |
|
"grad_norm": 0.001723111025057733, |
|
"learning_rate": 2.868532338308458e-05, |
|
"loss": 0.0004, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.244402985074627, |
|
"eval_loss": 6.985505024204031e-06, |
|
"eval_runtime": 0.1598, |
|
"eval_samples_per_second": 187.765, |
|
"eval_steps_per_second": 25.035, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.291044776119403, |
|
"grad_norm": 0.0005880256649106741, |
|
"learning_rate": 2.856094527363184e-05, |
|
"loss": 0.0003, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.291044776119403, |
|
"eval_loss": 3.5329176171217114e-06, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.85, |
|
"eval_steps_per_second": 24.513, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.337686567164179, |
|
"grad_norm": 0.0004977516946382821, |
|
"learning_rate": 2.8436567164179106e-05, |
|
"loss": 0.0004, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.337686567164179, |
|
"eval_loss": 4.645138687919825e-06, |
|
"eval_runtime": 0.1619, |
|
"eval_samples_per_second": 185.244, |
|
"eval_steps_per_second": 24.699, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.384328358208955, |
|
"grad_norm": 0.0014300497714430094, |
|
"learning_rate": 2.8312189054726368e-05, |
|
"loss": 0.0002, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.384328358208955, |
|
"eval_loss": 3.881290922436165e-06, |
|
"eval_runtime": 0.1601, |
|
"eval_samples_per_second": 187.344, |
|
"eval_steps_per_second": 24.979, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.4309701492537314, |
|
"grad_norm": 0.0009422674193046987, |
|
"learning_rate": 2.8187810945273636e-05, |
|
"loss": 0.0003, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.4309701492537314, |
|
"eval_loss": 2.567733190517174e-06, |
|
"eval_runtime": 0.1599, |
|
"eval_samples_per_second": 187.663, |
|
"eval_steps_per_second": 25.022, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"grad_norm": 0.0015334930503740907, |
|
"learning_rate": 2.8063432835820895e-05, |
|
"loss": 0.0004, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"eval_loss": 4.461974185687723e-06, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.881, |
|
"eval_steps_per_second": 24.784, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.524253731343284, |
|
"grad_norm": 0.002924109809100628, |
|
"learning_rate": 2.7939054726368163e-05, |
|
"loss": 0.0009, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.524253731343284, |
|
"eval_loss": 1.8840278244169895e-06, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.403, |
|
"eval_steps_per_second": 24.854, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.57089552238806, |
|
"grad_norm": 0.0012187871616333723, |
|
"learning_rate": 2.7814676616915425e-05, |
|
"loss": 0.0001, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.57089552238806, |
|
"eval_loss": 1.5747774568808381e-06, |
|
"eval_runtime": 0.1691, |
|
"eval_samples_per_second": 177.369, |
|
"eval_steps_per_second": 23.649, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.617537313432836, |
|
"grad_norm": 0.025326132774353027, |
|
"learning_rate": 2.769029850746269e-05, |
|
"loss": 0.0005, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.617537313432836, |
|
"eval_loss": 1.4370193639479112e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.746, |
|
"eval_steps_per_second": 24.899, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.664179104477612, |
|
"grad_norm": 0.0006336395163089037, |
|
"learning_rate": 2.756592039800995e-05, |
|
"loss": 0.0002, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.664179104477612, |
|
"eval_loss": 1.4652428035333287e-06, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 185.994, |
|
"eval_steps_per_second": 24.799, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.710820895522388, |
|
"grad_norm": 0.0023961891420185566, |
|
"learning_rate": 2.7441542288557216e-05, |
|
"loss": 0.0004, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.710820895522388, |
|
"eval_loss": 1.3807976984026027e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.745, |
|
"eval_steps_per_second": 24.899, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.757462686567164, |
|
"grad_norm": 0.0004522838571574539, |
|
"learning_rate": 2.7317164179104478e-05, |
|
"loss": 0.0001, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.757462686567164, |
|
"eval_loss": 1.4655278164354968e-06, |
|
"eval_runtime": 0.1607, |
|
"eval_samples_per_second": 186.679, |
|
"eval_steps_per_second": 24.891, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.80410447761194, |
|
"grad_norm": 0.0003487040812615305, |
|
"learning_rate": 2.7192786069651743e-05, |
|
"loss": 0.0001, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.80410447761194, |
|
"eval_loss": 1.6429443121523946e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.758, |
|
"eval_steps_per_second": 24.901, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.850746268656716, |
|
"grad_norm": 0.01166547555476427, |
|
"learning_rate": 2.7068407960199004e-05, |
|
"loss": 0.0007, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.850746268656716, |
|
"eval_loss": 2.425857928756159e-06, |
|
"eval_runtime": 0.1599, |
|
"eval_samples_per_second": 187.564, |
|
"eval_steps_per_second": 25.009, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.897388059701493, |
|
"grad_norm": 0.0007575357449240983, |
|
"learning_rate": 2.6944029850746273e-05, |
|
"loss": 0.0001, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.897388059701493, |
|
"eval_loss": 2.3037405298964586e-06, |
|
"eval_runtime": 0.1599, |
|
"eval_samples_per_second": 187.656, |
|
"eval_steps_per_second": 25.021, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.9440298507462686, |
|
"grad_norm": 0.0067961206659674644, |
|
"learning_rate": 2.6819651741293534e-05, |
|
"loss": 0.0003, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.9440298507462686, |
|
"eval_loss": 2.1566411305684596e-06, |
|
"eval_runtime": 0.1671, |
|
"eval_samples_per_second": 179.556, |
|
"eval_steps_per_second": 23.941, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.990671641791045, |
|
"grad_norm": 0.0007165633141994476, |
|
"learning_rate": 2.6696517412935325e-05, |
|
"loss": 0.0002, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 4.990671641791045, |
|
"eval_loss": 2.0811207832593936e-06, |
|
"eval_runtime": 0.1599, |
|
"eval_samples_per_second": 187.637, |
|
"eval_steps_per_second": 25.018, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 5.037313432835821, |
|
"grad_norm": 0.000998039380647242, |
|
"learning_rate": 2.657338308457712e-05, |
|
"loss": 0.0008, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 5.037313432835821, |
|
"eval_loss": 1.8726080952546909e-06, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.7, |
|
"eval_steps_per_second": 24.627, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 5.083955223880597, |
|
"grad_norm": 0.0009990198304876685, |
|
"learning_rate": 2.6449004975124378e-05, |
|
"loss": 0.0001, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 5.083955223880597, |
|
"eval_loss": 3.987378022429766e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.835, |
|
"eval_steps_per_second": 24.911, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 5.130597014925373, |
|
"grad_norm": 0.0006653439486399293, |
|
"learning_rate": 2.6324626865671646e-05, |
|
"loss": 0.0001, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.130597014925373, |
|
"eval_loss": 2.8886024665553123e-06, |
|
"eval_runtime": 0.1597, |
|
"eval_samples_per_second": 187.847, |
|
"eval_steps_per_second": 25.046, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.177238805970149, |
|
"grad_norm": 0.0025863787159323692, |
|
"learning_rate": 2.6200248756218908e-05, |
|
"loss": 0.0003, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 5.177238805970149, |
|
"eval_loss": 3.7123268157301936e-06, |
|
"eval_runtime": 0.1611, |
|
"eval_samples_per_second": 186.257, |
|
"eval_steps_per_second": 24.834, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 5.223880597014926, |
|
"grad_norm": 0.002973082009702921, |
|
"learning_rate": 2.6075870646766173e-05, |
|
"loss": 0.0003, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.223880597014926, |
|
"eval_loss": 1.7487926697867806e-06, |
|
"eval_runtime": 0.166, |
|
"eval_samples_per_second": 180.691, |
|
"eval_steps_per_second": 24.092, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.270522388059701, |
|
"grad_norm": 0.0015913803363218904, |
|
"learning_rate": 2.5951492537313434e-05, |
|
"loss": 0.0001, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 5.270522388059701, |
|
"eval_loss": 1.5435723526024958e-06, |
|
"eval_runtime": 0.1604, |
|
"eval_samples_per_second": 187.017, |
|
"eval_steps_per_second": 24.936, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 5.317164179104478, |
|
"grad_norm": 0.0005316142342053354, |
|
"learning_rate": 2.58271144278607e-05, |
|
"loss": 0.0001, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 5.317164179104478, |
|
"eval_loss": 1.4954798643884715e-06, |
|
"eval_runtime": 0.1594, |
|
"eval_samples_per_second": 188.187, |
|
"eval_steps_per_second": 25.092, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 5.3638059701492535, |
|
"grad_norm": 0.0008541183196939528, |
|
"learning_rate": 2.570273631840796e-05, |
|
"loss": 0.0001, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.3638059701492535, |
|
"eval_loss": 1.961838506758795e-06, |
|
"eval_runtime": 0.1597, |
|
"eval_samples_per_second": 187.843, |
|
"eval_steps_per_second": 25.046, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.41044776119403, |
|
"grad_norm": 0.0004898870829492807, |
|
"learning_rate": 2.5578358208955226e-05, |
|
"loss": 0.0001, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 5.41044776119403, |
|
"eval_loss": 1.911733761517098e-06, |
|
"eval_runtime": 0.1589, |
|
"eval_samples_per_second": 188.76, |
|
"eval_steps_per_second": 25.168, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 5.457089552238806, |
|
"grad_norm": 0.0006626475951634347, |
|
"learning_rate": 2.5453980099502488e-05, |
|
"loss": 0.0001, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 5.457089552238806, |
|
"eval_loss": 3.094894736932474e-06, |
|
"eval_runtime": 0.1695, |
|
"eval_samples_per_second": 177.033, |
|
"eval_steps_per_second": 23.604, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 5.503731343283582, |
|
"grad_norm": 0.005953106097877026, |
|
"learning_rate": 2.5329601990049756e-05, |
|
"loss": 0.0001, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 5.503731343283582, |
|
"eval_loss": 3.518416178849293e-06, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.755, |
|
"eval_steps_per_second": 24.634, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 5.550373134328359, |
|
"grad_norm": 0.017002714797854424, |
|
"learning_rate": 2.5205223880597018e-05, |
|
"loss": 0.0004, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 5.550373134328359, |
|
"eval_loss": 6.997787295404123e-06, |
|
"eval_runtime": 0.1643, |
|
"eval_samples_per_second": 182.545, |
|
"eval_steps_per_second": 24.339, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 5.597014925373134, |
|
"grad_norm": 0.00019286558381281793, |
|
"learning_rate": 2.5080845771144283e-05, |
|
"loss": 0.0001, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.597014925373134, |
|
"eval_loss": 4.117903699807357e-06, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.078, |
|
"eval_steps_per_second": 24.677, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.643656716417911, |
|
"grad_norm": 0.0007640725816600025, |
|
"learning_rate": 2.4956467661691544e-05, |
|
"loss": 0.0003, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 5.643656716417911, |
|
"eval_loss": 1.6209987734328024e-06, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.172, |
|
"eval_steps_per_second": 24.556, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 5.690298507462686, |
|
"grad_norm": 0.0009279770310968161, |
|
"learning_rate": 2.483208955223881e-05, |
|
"loss": 0.0001, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 5.690298507462686, |
|
"eval_loss": 1.4867666777718114e-06, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.065, |
|
"eval_steps_per_second": 24.809, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 5.736940298507463, |
|
"grad_norm": 0.015755705535411835, |
|
"learning_rate": 2.470771144278607e-05, |
|
"loss": 0.0001, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 5.736940298507463, |
|
"eval_loss": 1.4831955468253e-06, |
|
"eval_runtime": 0.1631, |
|
"eval_samples_per_second": 183.983, |
|
"eval_steps_per_second": 24.531, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 5.7835820895522385, |
|
"grad_norm": 0.0005079147522337735, |
|
"learning_rate": 2.4583333333333336e-05, |
|
"loss": 0.0001, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 5.7835820895522385, |
|
"eval_loss": 1.4258381497711525e-06, |
|
"eval_runtime": 0.166, |
|
"eval_samples_per_second": 180.722, |
|
"eval_steps_per_second": 24.096, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 5.830223880597015, |
|
"grad_norm": 0.0003629447892308235, |
|
"learning_rate": 2.4458955223880597e-05, |
|
"loss": 0.0002, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.830223880597015, |
|
"eval_loss": 1.1145809821755392e-06, |
|
"eval_runtime": 0.1616, |
|
"eval_samples_per_second": 185.699, |
|
"eval_steps_per_second": 24.76, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.8768656716417915, |
|
"grad_norm": 0.0025047562085092068, |
|
"learning_rate": 2.4334577114427866e-05, |
|
"loss": 0.0003, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 5.8768656716417915, |
|
"eval_loss": 1.6086485175037524e-06, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.018, |
|
"eval_steps_per_second": 24.536, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 5.923507462686567, |
|
"grad_norm": 0.0010979525977745652, |
|
"learning_rate": 2.4210199004975127e-05, |
|
"loss": 0.0001, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 5.923507462686567, |
|
"eval_loss": 1.6331794085999718e-06, |
|
"eval_runtime": 0.1638, |
|
"eval_samples_per_second": 183.117, |
|
"eval_steps_per_second": 24.416, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 5.970149253731344, |
|
"grad_norm": 0.00042478801333345473, |
|
"learning_rate": 2.4085820895522392e-05, |
|
"loss": 0.0005, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 5.970149253731344, |
|
"eval_loss": 2.18420427700039e-06, |
|
"eval_runtime": 0.1694, |
|
"eval_samples_per_second": 177.069, |
|
"eval_steps_per_second": 23.609, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 6.016791044776119, |
|
"grad_norm": 0.0026371392887085676, |
|
"learning_rate": 2.3961442786069654e-05, |
|
"loss": 0.0005, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 6.016791044776119, |
|
"eval_loss": 2.724199703152408e-06, |
|
"eval_runtime": 0.1645, |
|
"eval_samples_per_second": 182.396, |
|
"eval_steps_per_second": 24.319, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 6.063432835820896, |
|
"grad_norm": 0.0005338056362234056, |
|
"learning_rate": 2.383706467661692e-05, |
|
"loss": 0.0007, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.063432835820896, |
|
"eval_loss": 2.369762114540208e-06, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.255, |
|
"eval_steps_per_second": 24.567, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.110074626865671, |
|
"grad_norm": 0.0006243676762096584, |
|
"learning_rate": 2.371268656716418e-05, |
|
"loss": 0.0004, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 6.110074626865671, |
|
"eval_loss": 4.675569016399095e-06, |
|
"eval_runtime": 0.1639, |
|
"eval_samples_per_second": 183.01, |
|
"eval_steps_per_second": 24.401, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 6.156716417910448, |
|
"grad_norm": 0.013767559081315994, |
|
"learning_rate": 2.3588308457711445e-05, |
|
"loss": 0.0001, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 6.156716417910448, |
|
"eval_loss": 2.445647169224685e-06, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.845, |
|
"eval_steps_per_second": 24.513, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 6.2033582089552235, |
|
"grad_norm": 0.001806699438020587, |
|
"learning_rate": 2.3463930348258707e-05, |
|
"loss": 0.0001, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 6.2033582089552235, |
|
"eval_loss": 2.094437604682753e-06, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 185.963, |
|
"eval_steps_per_second": 24.795, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.0007174393394961953, |
|
"learning_rate": 2.3339552238805972e-05, |
|
"loss": 0.0002, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 1.5305414535760065e-06, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.796, |
|
"eval_steps_per_second": 24.373, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 6.2966417910447765, |
|
"grad_norm": 0.0014867472928017378, |
|
"learning_rate": 2.3215174129353234e-05, |
|
"loss": 0.0001, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 6.2966417910447765, |
|
"eval_loss": 1.4038557765161386e-06, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.841, |
|
"eval_steps_per_second": 24.779, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 6.343283582089552, |
|
"grad_norm": 0.0010864852229133248, |
|
"learning_rate": 2.3090796019900502e-05, |
|
"loss": 0.0001, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 6.343283582089552, |
|
"eval_loss": 1.2951429653185187e-06, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.725, |
|
"eval_steps_per_second": 24.63, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 6.389925373134329, |
|
"grad_norm": 0.0002408225554972887, |
|
"learning_rate": 2.2966417910447764e-05, |
|
"loss": 0.0001, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 6.389925373134329, |
|
"eval_loss": 1.1805148005805677e-06, |
|
"eval_runtime": 0.1646, |
|
"eval_samples_per_second": 182.255, |
|
"eval_steps_per_second": 24.301, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 6.436567164179104, |
|
"grad_norm": 0.000278256309684366, |
|
"learning_rate": 2.284203980099503e-05, |
|
"loss": 0.0001, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 6.436567164179104, |
|
"eval_loss": 1.1244396773690823e-06, |
|
"eval_runtime": 0.1635, |
|
"eval_samples_per_second": 183.524, |
|
"eval_steps_per_second": 24.47, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 6.483208955223881, |
|
"grad_norm": 0.0005975699750706553, |
|
"learning_rate": 2.271766169154229e-05, |
|
"loss": 0.0001, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 6.483208955223881, |
|
"eval_loss": 9.522905202175025e-07, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.467, |
|
"eval_steps_per_second": 24.862, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 6.529850746268656, |
|
"grad_norm": 0.0024864040315151215, |
|
"learning_rate": 2.2593283582089555e-05, |
|
"loss": 0.0002, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.529850746268656, |
|
"eval_loss": 8.695107567291416e-07, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.805, |
|
"eval_steps_per_second": 24.907, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.576492537313433, |
|
"grad_norm": 0.0012296285713091493, |
|
"learning_rate": 2.2468905472636817e-05, |
|
"loss": 0.0001, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 6.576492537313433, |
|
"eval_loss": 7.833210702301585e-07, |
|
"eval_runtime": 0.1625, |
|
"eval_samples_per_second": 184.667, |
|
"eval_steps_per_second": 24.622, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 6.6231343283582085, |
|
"grad_norm": 0.00046841197763569653, |
|
"learning_rate": 2.2344527363184082e-05, |
|
"loss": 0.0001, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 6.6231343283582085, |
|
"eval_loss": 7.663988981221337e-07, |
|
"eval_runtime": 0.1615, |
|
"eval_samples_per_second": 185.74, |
|
"eval_steps_per_second": 24.765, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 6.669776119402985, |
|
"grad_norm": 0.001978447660803795, |
|
"learning_rate": 2.2220149253731343e-05, |
|
"loss": 0.0028, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 6.669776119402985, |
|
"eval_loss": 9.115080956689781e-07, |
|
"eval_runtime": 0.1638, |
|
"eval_samples_per_second": 183.206, |
|
"eval_steps_per_second": 24.427, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 6.7164179104477615, |
|
"grad_norm": 0.0005547442706301808, |
|
"learning_rate": 2.2095771144278612e-05, |
|
"loss": 0.0001, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 6.7164179104477615, |
|
"eval_loss": 9.500566306996916e-07, |
|
"eval_runtime": 0.1611, |
|
"eval_samples_per_second": 186.194, |
|
"eval_steps_per_second": 24.826, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 6.763059701492537, |
|
"grad_norm": 0.0010947503615170717, |
|
"learning_rate": 2.1971393034825873e-05, |
|
"loss": 0.0004, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.763059701492537, |
|
"eval_loss": 1.4260593843573588e-06, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.089, |
|
"eval_steps_per_second": 24.812, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.809701492537314, |
|
"grad_norm": 0.00212489883415401, |
|
"learning_rate": 2.184701492537314e-05, |
|
"loss": 0.0008, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 6.809701492537314, |
|
"eval_loss": 1.273897169085103e-06, |
|
"eval_runtime": 0.1692, |
|
"eval_samples_per_second": 177.355, |
|
"eval_steps_per_second": 23.647, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 6.856343283582089, |
|
"grad_norm": 0.0002277992753079161, |
|
"learning_rate": 2.17226368159204e-05, |
|
"loss": 0.0001, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 6.856343283582089, |
|
"eval_loss": 1.0682890660973499e-06, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.877, |
|
"eval_steps_per_second": 24.65, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 6.902985074626866, |
|
"grad_norm": 0.0003076612192671746, |
|
"learning_rate": 2.1598258706467665e-05, |
|
"loss": 0.0, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 6.902985074626866, |
|
"eval_loss": 1.0034937076852657e-06, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.834, |
|
"eval_steps_per_second": 24.378, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 6.949626865671641, |
|
"grad_norm": 0.0002643874322529882, |
|
"learning_rate": 2.1473880597014927e-05, |
|
"loss": 0.0, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 6.949626865671641, |
|
"eval_loss": 9.63684556154476e-07, |
|
"eval_runtime": 0.1646, |
|
"eval_samples_per_second": 182.236, |
|
"eval_steps_per_second": 24.298, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 6.996268656716418, |
|
"grad_norm": 0.00024292635498568416, |
|
"learning_rate": 2.134950248756219e-05, |
|
"loss": 0.0001, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.996268656716418, |
|
"eval_loss": 9.160715990219614e-07, |
|
"eval_runtime": 0.1631, |
|
"eval_samples_per_second": 183.892, |
|
"eval_steps_per_second": 24.519, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 7.042910447761194, |
|
"grad_norm": 0.0030344121623784304, |
|
"learning_rate": 2.1225124378109453e-05, |
|
"loss": 0.0001, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 7.042910447761194, |
|
"eval_loss": 8.196147973649204e-07, |
|
"eval_runtime": 0.1634, |
|
"eval_samples_per_second": 183.619, |
|
"eval_steps_per_second": 24.482, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 7.08955223880597, |
|
"grad_norm": 0.0007852727430872619, |
|
"learning_rate": 2.110074626865672e-05, |
|
"loss": 0.0001, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 7.08955223880597, |
|
"eval_loss": 8.8703302481008e-07, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.182, |
|
"eval_steps_per_second": 24.558, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 7.1361940298507465, |
|
"grad_norm": 0.0011632639216259122, |
|
"learning_rate": 2.097636815920398e-05, |
|
"loss": 0.0001, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 7.1361940298507465, |
|
"eval_loss": 7.986407126736594e-07, |
|
"eval_runtime": 0.1615, |
|
"eval_samples_per_second": 185.782, |
|
"eval_steps_per_second": 24.771, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 7.182835820895522, |
|
"grad_norm": 0.000128728206618689, |
|
"learning_rate": 2.0851990049751248e-05, |
|
"loss": 0.0, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 7.182835820895522, |
|
"eval_loss": 7.614453352289274e-07, |
|
"eval_runtime": 0.1635, |
|
"eval_samples_per_second": 183.534, |
|
"eval_steps_per_second": 24.471, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 7.229477611940299, |
|
"grad_norm": 0.00016260573465842754, |
|
"learning_rate": 2.072761194029851e-05, |
|
"loss": 0.0001, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 7.229477611940299, |
|
"eval_loss": 7.017587222435395e-07, |
|
"eval_runtime": 0.1617, |
|
"eval_samples_per_second": 185.473, |
|
"eval_steps_per_second": 24.73, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 7.276119402985074, |
|
"grad_norm": 0.0003854296519421041, |
|
"learning_rate": 2.0603233830845775e-05, |
|
"loss": 0.0002, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 7.276119402985074, |
|
"eval_loss": 6.69997461955063e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.053, |
|
"eval_steps_per_second": 24.54, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 7.322761194029851, |
|
"grad_norm": 0.0005631668609566987, |
|
"learning_rate": 2.0478855721393036e-05, |
|
"loss": 0.0, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 7.322761194029851, |
|
"eval_loss": 6.535386773975915e-07, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.837, |
|
"eval_steps_per_second": 24.645, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 7.369402985074627, |
|
"grad_norm": 0.00030086564947851, |
|
"learning_rate": 2.03544776119403e-05, |
|
"loss": 0.0001, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 7.369402985074627, |
|
"eval_loss": 6.11449081588944e-07, |
|
"eval_runtime": 0.17, |
|
"eval_samples_per_second": 176.431, |
|
"eval_steps_per_second": 23.524, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 7.416044776119403, |
|
"grad_norm": 0.00038130092434585094, |
|
"learning_rate": 2.0230099502487563e-05, |
|
"loss": 0.0001, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 7.416044776119403, |
|
"eval_loss": 5.757526650995715e-07, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.141, |
|
"eval_steps_per_second": 24.819, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 7.462686567164179, |
|
"grad_norm": 0.000344213709468022, |
|
"learning_rate": 2.0106965174129357e-05, |
|
"loss": 0.0004, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.462686567164179, |
|
"eval_loss": 5.750833338424854e-07, |
|
"eval_runtime": 0.1619, |
|
"eval_samples_per_second": 185.307, |
|
"eval_steps_per_second": 24.708, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.509328358208955, |
|
"grad_norm": 0.0006042002351023257, |
|
"learning_rate": 1.998258706467662e-05, |
|
"loss": 0.0002, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 7.509328358208955, |
|
"eval_loss": 8.297544695778925e-07, |
|
"eval_runtime": 0.1687, |
|
"eval_samples_per_second": 177.799, |
|
"eval_steps_per_second": 23.707, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 7.5559701492537314, |
|
"grad_norm": 0.0012880139984190464, |
|
"learning_rate": 1.9858208955223884e-05, |
|
"loss": 0.0001, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 7.5559701492537314, |
|
"eval_loss": 7.933731467346661e-07, |
|
"eval_runtime": 0.1631, |
|
"eval_samples_per_second": 183.887, |
|
"eval_steps_per_second": 24.518, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 7.602611940298507, |
|
"grad_norm": 0.0001834977010730654, |
|
"learning_rate": 1.9733830845771145e-05, |
|
"loss": 0.0, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 7.602611940298507, |
|
"eval_loss": 7.420648557854292e-07, |
|
"eval_runtime": 0.1644, |
|
"eval_samples_per_second": 182.491, |
|
"eval_steps_per_second": 24.332, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 7.649253731343284, |
|
"grad_norm": 0.0002803165989462286, |
|
"learning_rate": 1.960945273631841e-05, |
|
"loss": 0.0004, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 7.649253731343284, |
|
"eval_loss": 8.606352821516339e-07, |
|
"eval_runtime": 0.1658, |
|
"eval_samples_per_second": 180.905, |
|
"eval_steps_per_second": 24.121, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 7.69589552238806, |
|
"grad_norm": 0.00030525182955898345, |
|
"learning_rate": 1.9485074626865675e-05, |
|
"loss": 0.0, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.69589552238806, |
|
"eval_loss": 7.444876928275335e-07, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.746, |
|
"eval_steps_per_second": 24.633, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.742537313432836, |
|
"grad_norm": 0.0005061542615294456, |
|
"learning_rate": 1.9360696517412937e-05, |
|
"loss": 0.0, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 7.742537313432836, |
|
"eval_loss": 7.410563966914196e-07, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.792, |
|
"eval_steps_per_second": 24.506, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 7.789179104477612, |
|
"grad_norm": 0.002078367630019784, |
|
"learning_rate": 1.9236318407960202e-05, |
|
"loss": 0.0, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 7.789179104477612, |
|
"eval_loss": 7.394659746751131e-07, |
|
"eval_runtime": 0.1672, |
|
"eval_samples_per_second": 179.409, |
|
"eval_steps_per_second": 23.921, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 7.835820895522388, |
|
"grad_norm": 0.0005327766994014382, |
|
"learning_rate": 1.9111940298507467e-05, |
|
"loss": 0.0, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 7.835820895522388, |
|
"eval_loss": 6.818034421485208e-07, |
|
"eval_runtime": 0.1907, |
|
"eval_samples_per_second": 157.346, |
|
"eval_steps_per_second": 20.979, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 7.882462686567164, |
|
"grad_norm": 0.00013845170906279236, |
|
"learning_rate": 1.8987562189054725e-05, |
|
"loss": 0.0001, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 7.882462686567164, |
|
"eval_loss": 6.097284313000273e-07, |
|
"eval_runtime": 0.1662, |
|
"eval_samples_per_second": 180.555, |
|
"eval_steps_per_second": 24.074, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 7.92910447761194, |
|
"grad_norm": 0.0008268446545116603, |
|
"learning_rate": 1.886318407960199e-05, |
|
"loss": 0.0001, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.92910447761194, |
|
"eval_loss": 6.165503236843506e-07, |
|
"eval_runtime": 0.1612, |
|
"eval_samples_per_second": 186.081, |
|
"eval_steps_per_second": 24.811, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.975746268656716, |
|
"grad_norm": 0.011116830632090569, |
|
"learning_rate": 1.8738805970149255e-05, |
|
"loss": 0.0002, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 7.975746268656716, |
|
"eval_loss": 6.690197551506571e-07, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.18, |
|
"eval_steps_per_second": 24.557, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 8.022388059701493, |
|
"grad_norm": 0.00011780932982219383, |
|
"learning_rate": 1.8614427860696517e-05, |
|
"loss": 0.0, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 8.022388059701493, |
|
"eval_loss": 6.267726462283463e-07, |
|
"eval_runtime": 0.1622, |
|
"eval_samples_per_second": 184.913, |
|
"eval_steps_per_second": 24.655, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 8.069029850746269, |
|
"grad_norm": 0.00017962571291718632, |
|
"learning_rate": 1.8490049751243782e-05, |
|
"loss": 0.0001, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 8.069029850746269, |
|
"eval_loss": 5.683208996742906e-07, |
|
"eval_runtime": 0.1617, |
|
"eval_samples_per_second": 185.574, |
|
"eval_steps_per_second": 24.743, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 8.115671641791044, |
|
"grad_norm": 0.005691648926585913, |
|
"learning_rate": 1.8365671641791047e-05, |
|
"loss": 0.0, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 8.115671641791044, |
|
"eval_loss": 5.686233066626301e-07, |
|
"eval_runtime": 0.164, |
|
"eval_samples_per_second": 182.936, |
|
"eval_steps_per_second": 24.392, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 8.162313432835822, |
|
"grad_norm": 0.0010604019043967128, |
|
"learning_rate": 1.824129353233831e-05, |
|
"loss": 0.0003, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 8.162313432835822, |
|
"eval_loss": 4.841186296289379e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.061, |
|
"eval_steps_per_second": 24.541, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 8.208955223880597, |
|
"grad_norm": 0.0007983978721313179, |
|
"learning_rate": 1.8116915422885573e-05, |
|
"loss": 0.0, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 8.208955223880597, |
|
"eval_loss": 4.732060858714249e-07, |
|
"eval_runtime": 0.1617, |
|
"eval_samples_per_second": 185.472, |
|
"eval_steps_per_second": 24.73, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 8.255597014925373, |
|
"grad_norm": 0.007192044984549284, |
|
"learning_rate": 1.7992537313432835e-05, |
|
"loss": 0.0, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 8.255597014925373, |
|
"eval_loss": 4.6311160417644714e-07, |
|
"eval_runtime": 0.165, |
|
"eval_samples_per_second": 181.852, |
|
"eval_steps_per_second": 24.247, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 8.302238805970148, |
|
"grad_norm": 0.0002632784890010953, |
|
"learning_rate": 1.78681592039801e-05, |
|
"loss": 0.0001, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 8.302238805970148, |
|
"eval_loss": 4.890302420790249e-07, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.562, |
|
"eval_steps_per_second": 24.208, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 8.348880597014926, |
|
"grad_norm": 0.00016267193132080138, |
|
"learning_rate": 1.7743781094527365e-05, |
|
"loss": 0.0, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 8.348880597014926, |
|
"eval_loss": 4.708208507508971e-07, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.793, |
|
"eval_steps_per_second": 24.372, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 8.395522388059701, |
|
"grad_norm": 0.002555535174906254, |
|
"learning_rate": 1.7620646766169156e-05, |
|
"loss": 0.0005, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.395522388059701, |
|
"eval_loss": 4.697274107456906e-07, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.314, |
|
"eval_steps_per_second": 24.575, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.442164179104477, |
|
"grad_norm": 0.00030655847513116896, |
|
"learning_rate": 1.749626865671642e-05, |
|
"loss": 0.0, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 8.442164179104477, |
|
"eval_loss": 4.6684658627782483e-07, |
|
"eval_runtime": 0.1639, |
|
"eval_samples_per_second": 183.019, |
|
"eval_steps_per_second": 24.403, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 8.488805970149254, |
|
"grad_norm": 0.001360408728942275, |
|
"learning_rate": 1.7371890547263682e-05, |
|
"loss": 0.0, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 8.488805970149254, |
|
"eval_loss": 4.7605757913515845e-07, |
|
"eval_runtime": 0.1648, |
|
"eval_samples_per_second": 182.057, |
|
"eval_steps_per_second": 24.274, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 8.53544776119403, |
|
"grad_norm": 0.00014535202353727072, |
|
"learning_rate": 1.7247512437810947e-05, |
|
"loss": 0.0005, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 8.53544776119403, |
|
"eval_loss": 4.991737228010606e-07, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.427, |
|
"eval_steps_per_second": 24.457, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 8.582089552238806, |
|
"grad_norm": 0.0006497761351056397, |
|
"learning_rate": 1.7123134328358212e-05, |
|
"loss": 0.0001, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 8.582089552238806, |
|
"eval_loss": 4.467078440484329e-07, |
|
"eval_runtime": 0.1633, |
|
"eval_samples_per_second": 183.659, |
|
"eval_steps_per_second": 24.488, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 8.628731343283581, |
|
"grad_norm": 0.00027351349126547575, |
|
"learning_rate": 1.6998756218905474e-05, |
|
"loss": 0.0, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 8.628731343283581, |
|
"eval_loss": 4.435265168467595e-07, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.033, |
|
"eval_steps_per_second": 24.671, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 8.675373134328359, |
|
"grad_norm": 0.0005942362477071583, |
|
"learning_rate": 1.687437810945274e-05, |
|
"loss": 0.0003, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 8.675373134328359, |
|
"eval_loss": 4.2622443174877844e-07, |
|
"eval_runtime": 0.1645, |
|
"eval_samples_per_second": 182.387, |
|
"eval_steps_per_second": 24.318, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 8.722014925373134, |
|
"grad_norm": 0.00017634141840972006, |
|
"learning_rate": 1.675e-05, |
|
"loss": 0.0006, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 8.722014925373134, |
|
"eval_loss": 4.732918341687764e-07, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.867, |
|
"eval_steps_per_second": 24.782, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 8.76865671641791, |
|
"grad_norm": 0.0002473385538905859, |
|
"learning_rate": 1.6625621890547266e-05, |
|
"loss": 0.0, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 8.76865671641791, |
|
"eval_loss": 4.792402705788845e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.093, |
|
"eval_steps_per_second": 24.546, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 8.815298507462687, |
|
"grad_norm": 0.0002839862136170268, |
|
"learning_rate": 1.650124378109453e-05, |
|
"loss": 0.0, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 8.815298507462687, |
|
"eval_loss": 4.7373944767059584e-07, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.148, |
|
"eval_steps_per_second": 24.553, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 8.861940298507463, |
|
"grad_norm": 0.00030838322709314525, |
|
"learning_rate": 1.6376865671641792e-05, |
|
"loss": 0.0003, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.861940298507463, |
|
"eval_loss": 5.06621631757298e-07, |
|
"eval_runtime": 0.1629, |
|
"eval_samples_per_second": 184.159, |
|
"eval_steps_per_second": 24.555, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.908582089552239, |
|
"grad_norm": 0.0008744837250560522, |
|
"learning_rate": 1.6252487562189057e-05, |
|
"loss": 0.0001, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 8.908582089552239, |
|
"eval_loss": 4.863770755036967e-07, |
|
"eval_runtime": 0.1638, |
|
"eval_samples_per_second": 183.173, |
|
"eval_steps_per_second": 24.423, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 8.955223880597014, |
|
"grad_norm": 0.0004683129664044827, |
|
"learning_rate": 1.612810945273632e-05, |
|
"loss": 0.0001, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 8.955223880597014, |
|
"eval_loss": 4.3130833660143253e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.02, |
|
"eval_steps_per_second": 24.536, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 9.001865671641792, |
|
"grad_norm": 0.0001069534191628918, |
|
"learning_rate": 1.6003731343283584e-05, |
|
"loss": 0.0, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 9.001865671641792, |
|
"eval_loss": 4.232615538057871e-07, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.284, |
|
"eval_steps_per_second": 24.438, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 9.048507462686567, |
|
"grad_norm": 0.0001369858073303476, |
|
"learning_rate": 1.587935323383085e-05, |
|
"loss": 0.0001, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 9.048507462686567, |
|
"eval_loss": 4.031139724247623e-07, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.23, |
|
"eval_steps_per_second": 24.564, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 9.095149253731343, |
|
"grad_norm": 0.0004517412162385881, |
|
"learning_rate": 1.575497512437811e-05, |
|
"loss": 0.0, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 9.095149253731343, |
|
"eval_loss": 3.9417889752257906e-07, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.913, |
|
"eval_steps_per_second": 24.788, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 9.14179104477612, |
|
"grad_norm": 0.0002976813993882388, |
|
"learning_rate": 1.5630597014925375e-05, |
|
"loss": 0.0001, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 9.14179104477612, |
|
"eval_loss": 3.6548814819070685e-07, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 162.664, |
|
"eval_steps_per_second": 21.689, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 9.188432835820896, |
|
"grad_norm": 0.0005973413935862482, |
|
"learning_rate": 1.550621890547264e-05, |
|
"loss": 0.0, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 9.188432835820896, |
|
"eval_loss": 3.6364929201226914e-07, |
|
"eval_runtime": 0.1642, |
|
"eval_samples_per_second": 182.755, |
|
"eval_steps_per_second": 24.367, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 9.235074626865671, |
|
"grad_norm": 0.0002460335963405669, |
|
"learning_rate": 1.5381840796019902e-05, |
|
"loss": 0.0001, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 9.235074626865671, |
|
"eval_loss": 3.593271742374782e-07, |
|
"eval_runtime": 0.1686, |
|
"eval_samples_per_second": 177.934, |
|
"eval_steps_per_second": 23.724, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 9.281716417910447, |
|
"grad_norm": 0.0013155044289305806, |
|
"learning_rate": 1.5257462686567165e-05, |
|
"loss": 0.0, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 9.281716417910447, |
|
"eval_loss": 3.7705902400375635e-07, |
|
"eval_runtime": 0.1626, |
|
"eval_samples_per_second": 184.503, |
|
"eval_steps_per_second": 24.6, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 9.328358208955224, |
|
"grad_norm": 0.0009738287189975381, |
|
"learning_rate": 1.513308457711443e-05, |
|
"loss": 0.0, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 9.328358208955224, |
|
"eval_loss": 3.643332604497118e-07, |
|
"eval_runtime": 0.189, |
|
"eval_samples_per_second": 158.751, |
|
"eval_steps_per_second": 21.167, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 0.0006048243958503008, |
|
"learning_rate": 1.5008706467661693e-05, |
|
"loss": 0.0, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"eval_loss": 3.528231502514245e-07, |
|
"eval_runtime": 0.1642, |
|
"eval_samples_per_second": 182.704, |
|
"eval_steps_per_second": 24.36, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 9.421641791044776, |
|
"grad_norm": 8.00610869191587e-05, |
|
"learning_rate": 1.4884328358208957e-05, |
|
"loss": 0.0, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 9.421641791044776, |
|
"eval_loss": 3.4202770393676474e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.277, |
|
"eval_steps_per_second": 24.037, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 9.468283582089553, |
|
"grad_norm": 0.00042591695091687143, |
|
"learning_rate": 1.475995024875622e-05, |
|
"loss": 0.0, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 9.468283582089553, |
|
"eval_loss": 3.143865114907385e-07, |
|
"eval_runtime": 0.165, |
|
"eval_samples_per_second": 181.766, |
|
"eval_steps_per_second": 24.235, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 9.514925373134329, |
|
"grad_norm": 0.00014322852075565606, |
|
"learning_rate": 1.4635572139303485e-05, |
|
"loss": 0.0, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 9.514925373134329, |
|
"eval_loss": 3.284554850324639e-07, |
|
"eval_runtime": 0.1675, |
|
"eval_samples_per_second": 179.113, |
|
"eval_steps_per_second": 23.882, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 9.561567164179104, |
|
"grad_norm": 0.005179948173463345, |
|
"learning_rate": 1.4511194029850748e-05, |
|
"loss": 0.0, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 9.561567164179104, |
|
"eval_loss": 3.1635789810025017e-07, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.128, |
|
"eval_steps_per_second": 24.284, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 9.60820895522388, |
|
"grad_norm": 0.12359941005706787, |
|
"learning_rate": 1.4386815920398012e-05, |
|
"loss": 0.0, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 9.60820895522388, |
|
"eval_loss": 3.2864005561350496e-07, |
|
"eval_runtime": 0.1623, |
|
"eval_samples_per_second": 184.87, |
|
"eval_steps_per_second": 24.649, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 9.654850746268657, |
|
"grad_norm": 0.00016295950626954436, |
|
"learning_rate": 1.4262437810945275e-05, |
|
"loss": 0.0, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 9.654850746268657, |
|
"eval_loss": 3.228102798402688e-07, |
|
"eval_runtime": 0.1635, |
|
"eval_samples_per_second": 183.504, |
|
"eval_steps_per_second": 24.467, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 9.701492537313433, |
|
"grad_norm": 0.00019894339493475854, |
|
"learning_rate": 1.413805970149254e-05, |
|
"loss": 0.0, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 9.701492537313433, |
|
"eval_loss": 3.1584852422383847e-07, |
|
"eval_runtime": 0.1648, |
|
"eval_samples_per_second": 182.082, |
|
"eval_steps_per_second": 24.278, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 9.748134328358208, |
|
"grad_norm": 7.664102304261178e-05, |
|
"learning_rate": 1.4013681592039803e-05, |
|
"loss": 0.0, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 9.748134328358208, |
|
"eval_loss": 3.116034292816039e-07, |
|
"eval_runtime": 0.1853, |
|
"eval_samples_per_second": 161.942, |
|
"eval_steps_per_second": 21.592, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 9.794776119402986, |
|
"grad_norm": 0.0002929531619884074, |
|
"learning_rate": 1.3889303482587067e-05, |
|
"loss": 0.0, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.794776119402986, |
|
"eval_loss": 3.048232031233056e-07, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.592, |
|
"eval_steps_per_second": 24.212, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.841417910447761, |
|
"grad_norm": 0.0021353147458285093, |
|
"learning_rate": 1.376492537313433e-05, |
|
"loss": 0.0, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 9.841417910447761, |
|
"eval_loss": 2.998399111220351e-07, |
|
"eval_runtime": 0.1725, |
|
"eval_samples_per_second": 173.901, |
|
"eval_steps_per_second": 23.187, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 9.888059701492537, |
|
"grad_norm": 0.00020385629613883793, |
|
"learning_rate": 1.3640547263681593e-05, |
|
"loss": 0.0, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 9.888059701492537, |
|
"eval_loss": 2.8804151952499524e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.019, |
|
"eval_steps_per_second": 24.536, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 9.934701492537313, |
|
"grad_norm": 0.00024354759079869837, |
|
"learning_rate": 1.3516169154228858e-05, |
|
"loss": 0.0, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 9.934701492537313, |
|
"eval_loss": 2.821187479185028e-07, |
|
"eval_runtime": 0.165, |
|
"eval_samples_per_second": 181.842, |
|
"eval_steps_per_second": 24.246, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 9.98134328358209, |
|
"grad_norm": 0.00834750197827816, |
|
"learning_rate": 1.3391791044776121e-05, |
|
"loss": 0.0, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 9.98134328358209, |
|
"eval_loss": 2.7767814003709645e-07, |
|
"eval_runtime": 0.1622, |
|
"eval_samples_per_second": 184.954, |
|
"eval_steps_per_second": 24.661, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 10.027985074626866, |
|
"grad_norm": 0.0002831424935720861, |
|
"learning_rate": 1.3267412935323385e-05, |
|
"loss": 0.0, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 10.027985074626866, |
|
"eval_loss": 2.571897823600011e-07, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.854, |
|
"eval_steps_per_second": 24.514, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 10.074626865671641, |
|
"grad_norm": 0.0013846103101968765, |
|
"learning_rate": 1.3143034825870648e-05, |
|
"loss": 0.0, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 10.074626865671641, |
|
"eval_loss": 2.5103787493208074e-07, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.048, |
|
"eval_steps_per_second": 24.673, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 10.121268656716419, |
|
"grad_norm": 0.0005128834745846689, |
|
"learning_rate": 1.3018656716417913e-05, |
|
"loss": 0.0, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 10.121268656716419, |
|
"eval_loss": 2.498533717698592e-07, |
|
"eval_runtime": 0.165, |
|
"eval_samples_per_second": 181.785, |
|
"eval_steps_per_second": 24.238, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 10.167910447761194, |
|
"grad_norm": 0.00024777796352282166, |
|
"learning_rate": 1.2894278606965176e-05, |
|
"loss": 0.0, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 10.167910447761194, |
|
"eval_loss": 2.5362717792631884e-07, |
|
"eval_runtime": 0.163, |
|
"eval_samples_per_second": 184.079, |
|
"eval_steps_per_second": 24.544, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 10.21455223880597, |
|
"grad_norm": 0.00038368601235561073, |
|
"learning_rate": 1.276990049751244e-05, |
|
"loss": 0.0, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 10.21455223880597, |
|
"eval_loss": 2.490402550847648e-07, |
|
"eval_runtime": 0.165, |
|
"eval_samples_per_second": 181.841, |
|
"eval_steps_per_second": 24.245, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 10.261194029850746, |
|
"grad_norm": 0.0011056758230552077, |
|
"learning_rate": 1.2645522388059703e-05, |
|
"loss": 0.0, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 10.261194029850746, |
|
"eval_loss": 2.4568615231146396e-07, |
|
"eval_runtime": 0.1671, |
|
"eval_samples_per_second": 179.53, |
|
"eval_steps_per_second": 23.937, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 10.307835820895523, |
|
"grad_norm": 0.0002836316707544029, |
|
"learning_rate": 1.2521144278606966e-05, |
|
"loss": 0.0, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 10.307835820895523, |
|
"eval_loss": 2.3665577941756055e-07, |
|
"eval_runtime": 0.1618, |
|
"eval_samples_per_second": 185.41, |
|
"eval_steps_per_second": 24.721, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 10.354477611940299, |
|
"grad_norm": 0.00016041977505665272, |
|
"learning_rate": 1.2396766169154231e-05, |
|
"loss": 0.0, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 10.354477611940299, |
|
"eval_loss": 2.4731053827053984e-07, |
|
"eval_runtime": 0.1668, |
|
"eval_samples_per_second": 179.878, |
|
"eval_steps_per_second": 23.984, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 10.401119402985074, |
|
"grad_norm": 0.0011778981424868107, |
|
"learning_rate": 1.2272388059701494e-05, |
|
"loss": 0.0, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 10.401119402985074, |
|
"eval_loss": 2.5048382212844444e-07, |
|
"eval_runtime": 0.164, |
|
"eval_samples_per_second": 182.885, |
|
"eval_steps_per_second": 24.385, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 10.447761194029852, |
|
"grad_norm": 0.004293927922844887, |
|
"learning_rate": 1.2149253731343285e-05, |
|
"loss": 0.0007, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 10.447761194029852, |
|
"eval_loss": 2.6977289735441445e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.28, |
|
"eval_steps_per_second": 24.037, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 10.494402985074627, |
|
"grad_norm": 0.0003302557743154466, |
|
"learning_rate": 1.2024875621890549e-05, |
|
"loss": 0.0, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 10.494402985074627, |
|
"eval_loss": 2.6683247256187315e-07, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.868, |
|
"eval_steps_per_second": 24.382, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 10.541044776119403, |
|
"grad_norm": 0.000583185872528702, |
|
"learning_rate": 1.1900497512437812e-05, |
|
"loss": 0.0001, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 10.541044776119403, |
|
"eval_loss": 2.8590250167326303e-07, |
|
"eval_runtime": 0.1741, |
|
"eval_samples_per_second": 172.328, |
|
"eval_steps_per_second": 22.977, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 10.587686567164178, |
|
"grad_norm": 0.00015397944662254304, |
|
"learning_rate": 1.1776119402985075e-05, |
|
"loss": 0.0, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 10.587686567164178, |
|
"eval_loss": 2.9392734290922817e-07, |
|
"eval_runtime": 0.1658, |
|
"eval_samples_per_second": 180.957, |
|
"eval_steps_per_second": 24.128, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 10.634328358208956, |
|
"grad_norm": 0.00023940723622217774, |
|
"learning_rate": 1.1651741293532339e-05, |
|
"loss": 0.0, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 10.634328358208956, |
|
"eval_loss": 2.8955645348105463e-07, |
|
"eval_runtime": 0.1646, |
|
"eval_samples_per_second": 182.232, |
|
"eval_steps_per_second": 24.298, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 10.680970149253731, |
|
"grad_norm": 0.0006399727426469326, |
|
"learning_rate": 1.1527363184079604e-05, |
|
"loss": 0.0, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 10.680970149253731, |
|
"eval_loss": 2.7836446747642185e-07, |
|
"eval_runtime": 0.1653, |
|
"eval_samples_per_second": 181.504, |
|
"eval_steps_per_second": 24.201, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 10.727611940298507, |
|
"grad_norm": 0.00027732501621358097, |
|
"learning_rate": 1.1402985074626867e-05, |
|
"loss": 0.0, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.727611940298507, |
|
"eval_loss": 2.747021596860577e-07, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.849, |
|
"eval_steps_per_second": 24.78, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.774253731343283, |
|
"grad_norm": 0.000188069578143768, |
|
"learning_rate": 1.127860696517413e-05, |
|
"loss": 0.0, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 10.774253731343283, |
|
"eval_loss": 2.720941552070144e-07, |
|
"eval_runtime": 0.1657, |
|
"eval_samples_per_second": 181.099, |
|
"eval_steps_per_second": 24.147, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 10.82089552238806, |
|
"grad_norm": 0.000323320651659742, |
|
"learning_rate": 1.1154228855721393e-05, |
|
"loss": 0.0, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 10.82089552238806, |
|
"eval_loss": 2.7039632755077037e-07, |
|
"eval_runtime": 0.1651, |
|
"eval_samples_per_second": 181.694, |
|
"eval_steps_per_second": 24.226, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 10.867537313432836, |
|
"grad_norm": 0.00014747037494089454, |
|
"learning_rate": 1.1029850746268658e-05, |
|
"loss": 0.0, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 10.867537313432836, |
|
"eval_loss": 2.6682712928050023e-07, |
|
"eval_runtime": 0.1662, |
|
"eval_samples_per_second": 180.507, |
|
"eval_steps_per_second": 24.068, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 10.914179104477611, |
|
"grad_norm": 7.306891347980127e-05, |
|
"learning_rate": 1.0905472636815922e-05, |
|
"loss": 0.0, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 10.914179104477611, |
|
"eval_loss": 2.6359310822954285e-07, |
|
"eval_runtime": 0.1726, |
|
"eval_samples_per_second": 173.796, |
|
"eval_steps_per_second": 23.173, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 10.960820895522389, |
|
"grad_norm": 5.098180554341525e-05, |
|
"learning_rate": 1.0781094527363185e-05, |
|
"loss": 0.0, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 10.960820895522389, |
|
"eval_loss": 2.554411082655861e-07, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.313, |
|
"eval_steps_per_second": 24.442, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 11.007462686567164, |
|
"grad_norm": 9.442290320293978e-05, |
|
"learning_rate": 1.0656716417910448e-05, |
|
"loss": 0.0, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 11.007462686567164, |
|
"eval_loss": 2.5533600478411245e-07, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.742, |
|
"eval_steps_per_second": 24.899, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 11.05410447761194, |
|
"grad_norm": 0.00022185999841894954, |
|
"learning_rate": 1.0532338308457712e-05, |
|
"loss": 0.0, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 11.05410447761194, |
|
"eval_loss": 2.524078581700451e-07, |
|
"eval_runtime": 0.1638, |
|
"eval_samples_per_second": 183.127, |
|
"eval_steps_per_second": 24.417, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 11.100746268656716, |
|
"grad_norm": 0.00503884069621563, |
|
"learning_rate": 1.0407960199004977e-05, |
|
"loss": 0.0004, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 11.100746268656716, |
|
"eval_loss": 2.567036858636129e-07, |
|
"eval_runtime": 0.1643, |
|
"eval_samples_per_second": 182.544, |
|
"eval_steps_per_second": 24.339, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 11.147388059701493, |
|
"grad_norm": 9.032327943714336e-05, |
|
"learning_rate": 1.028358208955224e-05, |
|
"loss": 0.0002, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 11.147388059701493, |
|
"eval_loss": 2.859654841813608e-07, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.792, |
|
"eval_steps_per_second": 24.506, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 11.194029850746269, |
|
"grad_norm": 0.00012014710227958858, |
|
"learning_rate": 1.016044776119403e-05, |
|
"loss": 0.0, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 11.194029850746269, |
|
"eval_loss": 2.840646402546554e-07, |
|
"eval_runtime": 0.1633, |
|
"eval_samples_per_second": 183.679, |
|
"eval_steps_per_second": 24.49, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 11.240671641791044, |
|
"grad_norm": 0.0031628520227968693, |
|
"learning_rate": 1.0036069651741294e-05, |
|
"loss": 0.0, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 11.240671641791044, |
|
"eval_loss": 2.8124046025368443e-07, |
|
"eval_runtime": 0.1622, |
|
"eval_samples_per_second": 184.928, |
|
"eval_steps_per_second": 24.657, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 11.287313432835822, |
|
"grad_norm": 4.342416286817752e-05, |
|
"learning_rate": 9.911691542288559e-06, |
|
"loss": 0.0001, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 11.287313432835822, |
|
"eval_loss": 2.839091735040711e-07, |
|
"eval_runtime": 0.1665, |
|
"eval_samples_per_second": 180.187, |
|
"eval_steps_per_second": 24.025, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 11.333955223880597, |
|
"grad_norm": 9.751073230290785e-05, |
|
"learning_rate": 9.787313432835822e-06, |
|
"loss": 0.0, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 11.333955223880597, |
|
"eval_loss": 2.823463773893309e-07, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.594, |
|
"eval_steps_per_second": 24.213, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 11.380597014925373, |
|
"grad_norm": 0.00015635335876140743, |
|
"learning_rate": 9.662935323383086e-06, |
|
"loss": 0.0, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 11.380597014925373, |
|
"eval_loss": 2.809511840951018e-07, |
|
"eval_runtime": 0.1669, |
|
"eval_samples_per_second": 179.702, |
|
"eval_steps_per_second": 23.96, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 11.427238805970148, |
|
"grad_norm": 0.00012053705722792074, |
|
"learning_rate": 9.538557213930349e-06, |
|
"loss": 0.0, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 11.427238805970148, |
|
"eval_loss": 2.786693471534818e-07, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.826, |
|
"eval_steps_per_second": 24.377, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 11.473880597014926, |
|
"grad_norm": 6.743449193891138e-05, |
|
"learning_rate": 9.414179104477614e-06, |
|
"loss": 0.0, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 11.473880597014926, |
|
"eval_loss": 2.7615345743470243e-07, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.635, |
|
"eval_steps_per_second": 24.218, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 11.520522388059701, |
|
"grad_norm": 0.0008915641228668392, |
|
"learning_rate": 9.289800995024877e-06, |
|
"loss": 0.0, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 11.520522388059701, |
|
"eval_loss": 2.7316258410792216e-07, |
|
"eval_runtime": 0.182, |
|
"eval_samples_per_second": 164.79, |
|
"eval_steps_per_second": 21.972, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 11.567164179104477, |
|
"grad_norm": 0.0019023872446268797, |
|
"learning_rate": 9.16542288557214e-06, |
|
"loss": 0.0, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 11.567164179104477, |
|
"eval_loss": 2.688062750166864e-07, |
|
"eval_runtime": 0.1654, |
|
"eval_samples_per_second": 181.342, |
|
"eval_steps_per_second": 24.179, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 11.613805970149254, |
|
"grad_norm": 0.0024738411884754896, |
|
"learning_rate": 9.041044776119404e-06, |
|
"loss": 0.0, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 11.613805970149254, |
|
"eval_loss": 2.621562771309982e-07, |
|
"eval_runtime": 0.1634, |
|
"eval_samples_per_second": 183.65, |
|
"eval_steps_per_second": 24.487, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 11.66044776119403, |
|
"grad_norm": 0.0007770381635054946, |
|
"learning_rate": 8.916666666666667e-06, |
|
"loss": 0.0, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.66044776119403, |
|
"eval_loss": 2.6587503043629113e-07, |
|
"eval_runtime": 0.1642, |
|
"eval_samples_per_second": 182.719, |
|
"eval_steps_per_second": 24.363, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.707089552238806, |
|
"grad_norm": 9.729754674481228e-05, |
|
"learning_rate": 8.792288557213932e-06, |
|
"loss": 0.0, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 11.707089552238806, |
|
"eval_loss": 2.6003732500612387e-07, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.389, |
|
"eval_steps_per_second": 24.452, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 11.753731343283581, |
|
"grad_norm": 0.00033962438465096056, |
|
"learning_rate": 8.667910447761195e-06, |
|
"loss": 0.0, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 11.753731343283581, |
|
"eval_loss": 2.570840251792106e-07, |
|
"eval_runtime": 0.1754, |
|
"eval_samples_per_second": 171.067, |
|
"eval_steps_per_second": 22.809, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 11.800373134328359, |
|
"grad_norm": 0.0037661008536815643, |
|
"learning_rate": 8.543532338308459e-06, |
|
"loss": 0.0, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 11.800373134328359, |
|
"eval_loss": 2.5637064027250744e-07, |
|
"eval_runtime": 0.1648, |
|
"eval_samples_per_second": 182.061, |
|
"eval_steps_per_second": 24.275, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 11.847014925373134, |
|
"grad_norm": 7.473176083294675e-05, |
|
"learning_rate": 8.419154228855722e-06, |
|
"loss": 0.0, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 11.847014925373134, |
|
"eval_loss": 2.5390116320522793e-07, |
|
"eval_runtime": 0.1659, |
|
"eval_samples_per_second": 180.827, |
|
"eval_steps_per_second": 24.11, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 11.89365671641791, |
|
"grad_norm": 0.00015807716408744454, |
|
"learning_rate": 8.294776119402985e-06, |
|
"loss": 0.0, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 11.89365671641791, |
|
"eval_loss": 2.5069658704524045e-07, |
|
"eval_runtime": 0.1638, |
|
"eval_samples_per_second": 183.2, |
|
"eval_steps_per_second": 24.427, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 11.940298507462687, |
|
"grad_norm": 0.0018306206911802292, |
|
"learning_rate": 8.170398009950249e-06, |
|
"loss": 0.0, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 11.940298507462687, |
|
"eval_loss": 2.496325635092944e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.237, |
|
"eval_steps_per_second": 24.032, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 11.986940298507463, |
|
"grad_norm": 4.205930235912092e-05, |
|
"learning_rate": 8.046019900497512e-06, |
|
"loss": 0.0, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 11.986940298507463, |
|
"eval_loss": 2.4730249492677103e-07, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.389, |
|
"eval_steps_per_second": 24.452, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 12.033582089552239, |
|
"grad_norm": 0.006276166532188654, |
|
"learning_rate": 7.921641791044777e-06, |
|
"loss": 0.0, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 12.033582089552239, |
|
"eval_loss": 2.452173930578283e-07, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.207, |
|
"eval_steps_per_second": 24.428, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 12.080223880597014, |
|
"grad_norm": 0.0031838086433708668, |
|
"learning_rate": 7.79726368159204e-06, |
|
"loss": 0.0, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 12.080223880597014, |
|
"eval_loss": 2.5401092784704815e-07, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.344, |
|
"eval_steps_per_second": 24.446, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 12.126865671641792, |
|
"grad_norm": 6.814413063693792e-05, |
|
"learning_rate": 7.672885572139303e-06, |
|
"loss": 0.0, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 12.126865671641792, |
|
"eval_loss": 2.5293377348134527e-07, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 185.984, |
|
"eval_steps_per_second": 24.798, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 12.173507462686567, |
|
"grad_norm": 0.0002226163778686896, |
|
"learning_rate": 7.549751243781095e-06, |
|
"loss": 0.0, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 12.173507462686567, |
|
"eval_loss": 2.4839519596753235e-07, |
|
"eval_runtime": 0.164, |
|
"eval_samples_per_second": 182.882, |
|
"eval_steps_per_second": 24.384, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 12.220149253731343, |
|
"grad_norm": 0.00015515003178734332, |
|
"learning_rate": 7.4253731343283585e-06, |
|
"loss": 0.0, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 12.220149253731343, |
|
"eval_loss": 2.3797457515684073e-07, |
|
"eval_runtime": 0.1632, |
|
"eval_samples_per_second": 183.77, |
|
"eval_steps_per_second": 24.503, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 12.26679104477612, |
|
"grad_norm": 0.0004189789469819516, |
|
"learning_rate": 7.300995024875623e-06, |
|
"loss": 0.0, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 12.26679104477612, |
|
"eval_loss": 2.4495119532730314e-07, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.814, |
|
"eval_steps_per_second": 24.375, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 12.313432835820896, |
|
"grad_norm": 0.0008414119947701693, |
|
"learning_rate": 7.176616915422886e-06, |
|
"loss": 0.0, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 12.313432835820896, |
|
"eval_loss": 2.375207088789466e-07, |
|
"eval_runtime": 0.1644, |
|
"eval_samples_per_second": 182.441, |
|
"eval_steps_per_second": 24.325, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 12.360074626865671, |
|
"grad_norm": 0.004033736884593964, |
|
"learning_rate": 7.052238805970149e-06, |
|
"loss": 0.0, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 12.360074626865671, |
|
"eval_loss": 2.3545504745925427e-07, |
|
"eval_runtime": 0.1634, |
|
"eval_samples_per_second": 183.625, |
|
"eval_steps_per_second": 24.483, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 12.406716417910447, |
|
"grad_norm": 0.0006975606665946543, |
|
"learning_rate": 6.927860696517413e-06, |
|
"loss": 0.0002, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 12.406716417910447, |
|
"eval_loss": 2.2062947380163678e-07, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.126, |
|
"eval_steps_per_second": 24.283, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 12.453358208955224, |
|
"grad_norm": 0.00041404165676794946, |
|
"learning_rate": 6.803482587064677e-06, |
|
"loss": 0.0001, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 12.453358208955224, |
|
"eval_loss": 2.187805563380607e-07, |
|
"eval_runtime": 0.1659, |
|
"eval_samples_per_second": 180.878, |
|
"eval_steps_per_second": 24.117, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.00021756745991297066, |
|
"learning_rate": 6.679104477611941e-06, |
|
"loss": 0.0, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"eval_loss": 2.150947295831429e-07, |
|
"eval_runtime": 0.1631, |
|
"eval_samples_per_second": 183.961, |
|
"eval_steps_per_second": 24.528, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 12.546641791044776, |
|
"grad_norm": 0.0006945555796846747, |
|
"learning_rate": 6.554726368159204e-06, |
|
"loss": 0.0, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 12.546641791044776, |
|
"eval_loss": 2.1574922470790625e-07, |
|
"eval_runtime": 0.1662, |
|
"eval_samples_per_second": 180.505, |
|
"eval_steps_per_second": 24.067, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 12.593283582089553, |
|
"grad_norm": 0.0018626791425049305, |
|
"learning_rate": 6.430348258706468e-06, |
|
"loss": 0.0, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.593283582089553, |
|
"eval_loss": 2.1518117421237548e-07, |
|
"eval_runtime": 0.1668, |
|
"eval_samples_per_second": 179.805, |
|
"eval_steps_per_second": 23.974, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.639925373134329, |
|
"grad_norm": 8.379531936952844e-05, |
|
"learning_rate": 6.3059701492537316e-06, |
|
"loss": 0.0, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 12.639925373134329, |
|
"eval_loss": 2.1498192381841363e-07, |
|
"eval_runtime": 0.1636, |
|
"eval_samples_per_second": 183.376, |
|
"eval_steps_per_second": 24.45, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 12.686567164179104, |
|
"grad_norm": 7.26276048226282e-05, |
|
"learning_rate": 6.181592039800996e-06, |
|
"loss": 0.0, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 12.686567164179104, |
|
"eval_loss": 2.13713647667646e-07, |
|
"eval_runtime": 0.1663, |
|
"eval_samples_per_second": 180.45, |
|
"eval_steps_per_second": 24.06, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 12.73320895522388, |
|
"grad_norm": 0.0001676314859651029, |
|
"learning_rate": 6.057213930348259e-06, |
|
"loss": 0.0, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 12.73320895522388, |
|
"eval_loss": 2.1666031102540728e-07, |
|
"eval_runtime": 0.1669, |
|
"eval_samples_per_second": 179.72, |
|
"eval_steps_per_second": 23.963, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 12.779850746268657, |
|
"grad_norm": 4.116259515285492e-05, |
|
"learning_rate": 5.932835820895523e-06, |
|
"loss": 0.0, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 12.779850746268657, |
|
"eval_loss": 2.157899245958106e-07, |
|
"eval_runtime": 0.1667, |
|
"eval_samples_per_second": 179.968, |
|
"eval_steps_per_second": 23.996, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 12.826492537313433, |
|
"grad_norm": 0.00012961241009179503, |
|
"learning_rate": 5.8084577114427864e-06, |
|
"loss": 0.0, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 12.826492537313433, |
|
"eval_loss": 2.1415317519313248e-07, |
|
"eval_runtime": 0.1688, |
|
"eval_samples_per_second": 177.772, |
|
"eval_steps_per_second": 23.703, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 12.873134328358208, |
|
"grad_norm": 0.00015239656204357743, |
|
"learning_rate": 5.68407960199005e-06, |
|
"loss": 0.0, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 12.873134328358208, |
|
"eval_loss": 2.1278803785662603e-07, |
|
"eval_runtime": 0.1654, |
|
"eval_samples_per_second": 181.421, |
|
"eval_steps_per_second": 24.189, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 12.919776119402986, |
|
"grad_norm": 0.0003725362184923142, |
|
"learning_rate": 5.559701492537314e-06, |
|
"loss": 0.0, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 12.919776119402986, |
|
"eval_loss": 2.1224755641924276e-07, |
|
"eval_runtime": 0.1695, |
|
"eval_samples_per_second": 176.969, |
|
"eval_steps_per_second": 23.596, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 12.966417910447761, |
|
"grad_norm": 0.00010239533003186807, |
|
"learning_rate": 5.435323383084577e-06, |
|
"loss": 0.0, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 12.966417910447761, |
|
"eval_loss": 2.121297200119443e-07, |
|
"eval_runtime": 0.1713, |
|
"eval_samples_per_second": 175.102, |
|
"eval_steps_per_second": 23.347, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 13.013059701492537, |
|
"grad_norm": 0.0025495258159935474, |
|
"learning_rate": 5.310945273631841e-06, |
|
"loss": 0.0, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 13.013059701492537, |
|
"eval_loss": 2.0902733410821384e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.279, |
|
"eval_steps_per_second": 24.037, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 13.059701492537313, |
|
"grad_norm": 0.00011632608948275447, |
|
"learning_rate": 5.187810945273632e-06, |
|
"loss": 0.0, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 13.059701492537313, |
|
"eval_loss": 2.0934349720391765e-07, |
|
"eval_runtime": 0.1699, |
|
"eval_samples_per_second": 176.597, |
|
"eval_steps_per_second": 23.546, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 13.10634328358209, |
|
"grad_norm": 0.0007514033932238817, |
|
"learning_rate": 5.063432835820896e-06, |
|
"loss": 0.0, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 13.10634328358209, |
|
"eval_loss": 2.073689557846592e-07, |
|
"eval_runtime": 0.1657, |
|
"eval_samples_per_second": 181.029, |
|
"eval_steps_per_second": 24.137, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 13.152985074626866, |
|
"grad_norm": 0.0023394667077809572, |
|
"learning_rate": 4.93905472636816e-06, |
|
"loss": 0.0, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 13.152985074626866, |
|
"eval_loss": 2.0222093155553011e-07, |
|
"eval_runtime": 0.1659, |
|
"eval_samples_per_second": 180.828, |
|
"eval_steps_per_second": 24.11, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 13.199626865671641, |
|
"grad_norm": 7.573967013740912e-05, |
|
"learning_rate": 4.814676616915424e-06, |
|
"loss": 0.0, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 13.199626865671641, |
|
"eval_loss": 2.0147807333614765e-07, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.213, |
|
"eval_steps_per_second": 24.428, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 13.246268656716419, |
|
"grad_norm": 0.00038927345303818583, |
|
"learning_rate": 4.690298507462687e-06, |
|
"loss": 0.0, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 13.246268656716419, |
|
"eval_loss": 2.0081444063180243e-07, |
|
"eval_runtime": 0.1655, |
|
"eval_samples_per_second": 181.315, |
|
"eval_steps_per_second": 24.175, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 13.292910447761194, |
|
"grad_norm": 4.6674933400936425e-05, |
|
"learning_rate": 4.56592039800995e-06, |
|
"loss": 0.0, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 13.292910447761194, |
|
"eval_loss": 1.9368634696093068e-07, |
|
"eval_runtime": 0.164, |
|
"eval_samples_per_second": 182.957, |
|
"eval_steps_per_second": 24.394, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 13.33955223880597, |
|
"grad_norm": 0.0003617222246248275, |
|
"learning_rate": 4.441542288557214e-06, |
|
"loss": 0.0, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 13.33955223880597, |
|
"eval_loss": 1.9277055685051891e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.32, |
|
"eval_steps_per_second": 24.043, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 13.386194029850746, |
|
"grad_norm": 0.0003852724621538073, |
|
"learning_rate": 4.317164179104478e-06, |
|
"loss": 0.0, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 13.386194029850746, |
|
"eval_loss": 1.9214044755244686e-07, |
|
"eval_runtime": 0.1668, |
|
"eval_samples_per_second": 179.9, |
|
"eval_steps_per_second": 23.987, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 13.432835820895523, |
|
"grad_norm": 8.544667798560113e-05, |
|
"learning_rate": 4.192786069651741e-06, |
|
"loss": 0.0, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 13.432835820895523, |
|
"eval_loss": 1.926564152654464e-07, |
|
"eval_runtime": 0.1658, |
|
"eval_samples_per_second": 180.893, |
|
"eval_steps_per_second": 24.119, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 13.479477611940299, |
|
"grad_norm": 0.0006275973282754421, |
|
"learning_rate": 4.068407960199005e-06, |
|
"loss": 0.0, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 13.479477611940299, |
|
"eval_loss": 1.9328433609189233e-07, |
|
"eval_runtime": 0.1709, |
|
"eval_samples_per_second": 175.591, |
|
"eval_steps_per_second": 23.412, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 13.526119402985074, |
|
"grad_norm": 0.00011571097275009379, |
|
"learning_rate": 3.9440298507462686e-06, |
|
"loss": 0.0, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 13.526119402985074, |
|
"eval_loss": 1.924393444596717e-07, |
|
"eval_runtime": 0.1653, |
|
"eval_samples_per_second": 181.482, |
|
"eval_steps_per_second": 24.198, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 13.572761194029852, |
|
"grad_norm": 0.0030688499100506306, |
|
"learning_rate": 3.819651741293533e-06, |
|
"loss": 0.0, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 13.572761194029852, |
|
"eval_loss": 1.9209718971069378e-07, |
|
"eval_runtime": 0.1677, |
|
"eval_samples_per_second": 178.88, |
|
"eval_steps_per_second": 23.851, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 13.619402985074627, |
|
"grad_norm": 0.00018914089014288038, |
|
"learning_rate": 3.695273631840796e-06, |
|
"loss": 0.0, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 13.619402985074627, |
|
"eval_loss": 1.9149970853504783e-07, |
|
"eval_runtime": 0.1652, |
|
"eval_samples_per_second": 181.641, |
|
"eval_steps_per_second": 24.219, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 13.666044776119403, |
|
"grad_norm": 0.00021034800738561898, |
|
"learning_rate": 3.5708955223880597e-06, |
|
"loss": 0.0, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 13.666044776119403, |
|
"eval_loss": 1.9132801298837876e-07, |
|
"eval_runtime": 0.1656, |
|
"eval_samples_per_second": 181.151, |
|
"eval_steps_per_second": 24.154, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 13.712686567164178, |
|
"grad_norm": 0.0001684948947513476, |
|
"learning_rate": 3.4465174129353234e-06, |
|
"loss": 0.0, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 13.712686567164178, |
|
"eval_loss": 1.9041186760659912e-07, |
|
"eval_runtime": 0.1657, |
|
"eval_samples_per_second": 181.048, |
|
"eval_steps_per_second": 24.14, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 13.759328358208956, |
|
"grad_norm": 0.00033117341808974743, |
|
"learning_rate": 3.322139303482587e-06, |
|
"loss": 0.0, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 13.759328358208956, |
|
"eval_loss": 1.8998919415480486e-07, |
|
"eval_runtime": 0.1684, |
|
"eval_samples_per_second": 178.197, |
|
"eval_steps_per_second": 23.76, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 13.805970149253731, |
|
"grad_norm": 7.936067413538694e-05, |
|
"learning_rate": 3.197761194029851e-06, |
|
"loss": 0.0001, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 13.805970149253731, |
|
"eval_loss": 1.7708023847262666e-07, |
|
"eval_runtime": 0.1876, |
|
"eval_samples_per_second": 159.875, |
|
"eval_steps_per_second": 21.317, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 13.852611940298507, |
|
"grad_norm": 0.00019114046881441027, |
|
"learning_rate": 3.0733830845771146e-06, |
|
"loss": 0.0, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 13.852611940298507, |
|
"eval_loss": 1.7666478413502773e-07, |
|
"eval_runtime": 0.1663, |
|
"eval_samples_per_second": 180.348, |
|
"eval_steps_per_second": 24.046, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 13.899253731343283, |
|
"grad_norm": 0.00010124894470209256, |
|
"learning_rate": 2.9490049751243783e-06, |
|
"loss": 0.0, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 13.899253731343283, |
|
"eval_loss": 1.7688346076738526e-07, |
|
"eval_runtime": 0.1653, |
|
"eval_samples_per_second": 181.441, |
|
"eval_steps_per_second": 24.192, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 13.94589552238806, |
|
"grad_norm": 0.000508667784743011, |
|
"learning_rate": 2.824626865671642e-06, |
|
"loss": 0.0001, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 13.94589552238806, |
|
"eval_loss": 1.8212197971934074e-07, |
|
"eval_runtime": 0.1667, |
|
"eval_samples_per_second": 179.986, |
|
"eval_steps_per_second": 23.998, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 13.992537313432836, |
|
"grad_norm": 0.0005743975634686649, |
|
"learning_rate": 2.7002487562189058e-06, |
|
"loss": 0.0, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 13.992537313432836, |
|
"eval_loss": 1.8214987562714668e-07, |
|
"eval_runtime": 0.1642, |
|
"eval_samples_per_second": 182.734, |
|
"eval_steps_per_second": 24.364, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 14.039179104477611, |
|
"grad_norm": 0.0024433776270598173, |
|
"learning_rate": 2.5758706467661695e-06, |
|
"loss": 0.0, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 14.039179104477611, |
|
"eval_loss": 1.819559116711389e-07, |
|
"eval_runtime": 0.1681, |
|
"eval_samples_per_second": 178.453, |
|
"eval_steps_per_second": 23.794, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 14.085820895522389, |
|
"grad_norm": 6.446899351431057e-05, |
|
"learning_rate": 2.4514925373134328e-06, |
|
"loss": 0.0, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 14.085820895522389, |
|
"eval_loss": 1.817829797801096e-07, |
|
"eval_runtime": 0.1665, |
|
"eval_samples_per_second": 180.178, |
|
"eval_steps_per_second": 24.024, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 14.132462686567164, |
|
"grad_norm": 9.17048382689245e-05, |
|
"learning_rate": 2.3271144278606965e-06, |
|
"loss": 0.0, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 14.132462686567164, |
|
"eval_loss": 1.822889856839538e-07, |
|
"eval_runtime": 0.1655, |
|
"eval_samples_per_second": 181.316, |
|
"eval_steps_per_second": 24.175, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 14.17910447761194, |
|
"grad_norm": 0.00043933966662734747, |
|
"learning_rate": 2.20273631840796e-06, |
|
"loss": 0.0, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 14.17910447761194, |
|
"eval_loss": 1.820878310354601e-07, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.117, |
|
"eval_steps_per_second": 24.682, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 14.225746268656717, |
|
"grad_norm": 0.0007023545331321657, |
|
"learning_rate": 2.078358208955224e-06, |
|
"loss": 0.0, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 14.225746268656717, |
|
"eval_loss": 1.8134495860522293e-07, |
|
"eval_runtime": 0.166, |
|
"eval_samples_per_second": 180.697, |
|
"eval_steps_per_second": 24.093, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 14.272388059701493, |
|
"grad_norm": 0.0002605569316074252, |
|
"learning_rate": 1.9539800995024877e-06, |
|
"loss": 0.0002, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 14.272388059701493, |
|
"eval_loss": 1.7974176103052741e-07, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.282, |
|
"eval_steps_per_second": 24.038, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 14.319029850746269, |
|
"grad_norm": 0.00020215619588270783, |
|
"learning_rate": 1.8296019900497514e-06, |
|
"loss": 0.0, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 14.319029850746269, |
|
"eval_loss": 1.7835911592101183e-07, |
|
"eval_runtime": 0.1678, |
|
"eval_samples_per_second": 178.732, |
|
"eval_steps_per_second": 23.831, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 14.365671641791044, |
|
"grad_norm": 0.0002130908687831834, |
|
"learning_rate": 1.705223880597015e-06, |
|
"loss": 0.0, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 14.365671641791044, |
|
"eval_loss": 1.7787193939966528e-07, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.803, |
|
"eval_steps_per_second": 24.374, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 14.412313432835822, |
|
"grad_norm": 0.000937594857532531, |
|
"learning_rate": 1.5808457711442788e-06, |
|
"loss": 0.0, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 14.412313432835822, |
|
"eval_loss": 1.7799224849568418e-07, |
|
"eval_runtime": 0.1985, |
|
"eval_samples_per_second": 151.106, |
|
"eval_steps_per_second": 20.148, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 14.458955223880597, |
|
"grad_norm": 8.0404024629388e-05, |
|
"learning_rate": 1.4564676616915423e-06, |
|
"loss": 0.0, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 14.458955223880597, |
|
"eval_loss": 1.7809625774134474e-07, |
|
"eval_runtime": 0.1637, |
|
"eval_samples_per_second": 183.297, |
|
"eval_steps_per_second": 24.44, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 14.505597014925373, |
|
"grad_norm": 0.00047697682748548687, |
|
"learning_rate": 1.332089552238806e-06, |
|
"loss": 0.0, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 14.505597014925373, |
|
"eval_loss": 1.7796874374198524e-07, |
|
"eval_runtime": 0.1644, |
|
"eval_samples_per_second": 182.438, |
|
"eval_steps_per_second": 24.325, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 14.552238805970148, |
|
"grad_norm": 0.0002544411108829081, |
|
"learning_rate": 1.2077114427860698e-06, |
|
"loss": 0.0, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 14.552238805970148, |
|
"eval_loss": 1.771030895270087e-07, |
|
"eval_runtime": 0.1737, |
|
"eval_samples_per_second": 172.756, |
|
"eval_steps_per_second": 23.034, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 14.598880597014926, |
|
"grad_norm": 0.00014778469630982727, |
|
"learning_rate": 1.0833333333333335e-06, |
|
"loss": 0.0, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 14.598880597014926, |
|
"eval_loss": 1.7678004837762273e-07, |
|
"eval_runtime": 0.1628, |
|
"eval_samples_per_second": 184.321, |
|
"eval_steps_per_second": 24.576, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 14.645522388059701, |
|
"grad_norm": 9.278374636778608e-05, |
|
"learning_rate": 9.589552238805972e-07, |
|
"loss": 0.0005, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 14.645522388059701, |
|
"eval_loss": 1.776275126985638e-07, |
|
"eval_runtime": 0.1672, |
|
"eval_samples_per_second": 179.379, |
|
"eval_steps_per_second": 23.917, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 14.692164179104477, |
|
"grad_norm": 0.00015798135427758098, |
|
"learning_rate": 8.345771144278608e-07, |
|
"loss": 0.0001, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 14.692164179104477, |
|
"eval_loss": 1.7723084511089837e-07, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.127, |
|
"eval_steps_per_second": 24.284, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 14.738805970149254, |
|
"grad_norm": 4.799765156349167e-05, |
|
"learning_rate": 7.101990049751243e-07, |
|
"loss": 0.0, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 14.738805970149254, |
|
"eval_loss": 1.7615087699596188e-07, |
|
"eval_runtime": 0.1648, |
|
"eval_samples_per_second": 182.059, |
|
"eval_steps_per_second": 24.275, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 14.78544776119403, |
|
"grad_norm": 0.0001866283710114658, |
|
"learning_rate": 5.858208955223881e-07, |
|
"loss": 0.0001, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 14.78544776119403, |
|
"eval_loss": 1.769698627640537e-07, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.204, |
|
"eval_steps_per_second": 24.294, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 14.832089552238806, |
|
"grad_norm": 0.00022171168529894203, |
|
"learning_rate": 4.614427860696518e-07, |
|
"loss": 0.0, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 14.832089552238806, |
|
"eval_loss": 1.7630063098295068e-07, |
|
"eval_runtime": 0.1655, |
|
"eval_samples_per_second": 181.321, |
|
"eval_steps_per_second": 24.176, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 14.878731343283581, |
|
"grad_norm": 0.0003983838832937181, |
|
"learning_rate": 3.370646766169155e-07, |
|
"loss": 0.0, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 14.878731343283581, |
|
"eval_loss": 1.7645477612404648e-07, |
|
"eval_runtime": 0.1651, |
|
"eval_samples_per_second": 181.747, |
|
"eval_steps_per_second": 24.233, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 14.925373134328359, |
|
"grad_norm": 0.00016803256585262716, |
|
"learning_rate": 2.1268656716417912e-07, |
|
"loss": 0.0, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 14.925373134328359, |
|
"eval_loss": 1.7600704893538932e-07, |
|
"eval_runtime": 0.1658, |
|
"eval_samples_per_second": 180.892, |
|
"eval_steps_per_second": 24.119, |
|
"step": 32000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 32160, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.257078909708288e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|