|
{ |
|
"best_metric": 1.1947814226150513, |
|
"best_model_checkpoint": "./output/checkpoint-4650", |
|
"epoch": 0.21500559910414332, |
|
"eval_steps": 150, |
|
"global_step": 4800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004479283314669653, |
|
"grad_norm": 6.820243835449219, |
|
"learning_rate": 7.500000000000001e-07, |
|
"loss": 1.2628, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0008958566629339306, |
|
"grad_norm": 5.822151184082031, |
|
"learning_rate": 1.5000000000000002e-06, |
|
"loss": 1.3652, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0013437849944008958, |
|
"grad_norm": 4.442959785461426, |
|
"learning_rate": 2.25e-06, |
|
"loss": 1.412, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0017917133258678612, |
|
"grad_norm": 9.916281700134277, |
|
"learning_rate": 3.0000000000000005e-06, |
|
"loss": 1.5213, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0022396416573348264, |
|
"grad_norm": 22.53717613220215, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.3189, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0026875699888017916, |
|
"grad_norm": 5.07314920425415, |
|
"learning_rate": 4.5e-06, |
|
"loss": 1.3022, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.003135498320268757, |
|
"grad_norm": 9.401494026184082, |
|
"learning_rate": 5.2500000000000006e-06, |
|
"loss": 1.5065, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0035834266517357225, |
|
"grad_norm": 8.749906539916992, |
|
"learning_rate": 6.000000000000001e-06, |
|
"loss": 1.1579, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.004031354983202688, |
|
"grad_norm": 6.749314785003662, |
|
"learning_rate": 6.7500000000000014e-06, |
|
"loss": 1.2524, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004479283314669653, |
|
"grad_norm": 8.411529541015625, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.3242, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004927211646136618, |
|
"grad_norm": 5.293492794036865, |
|
"learning_rate": 7.499922926093874e-06, |
|
"loss": 0.9967, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005375139977603583, |
|
"grad_norm": 8.860544204711914, |
|
"learning_rate": 7.499691707543699e-06, |
|
"loss": 1.1881, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0058230683090705485, |
|
"grad_norm": 9.859148979187012, |
|
"learning_rate": 7.499306353853963e-06, |
|
"loss": 1.0598, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.006270996640537514, |
|
"grad_norm": 4.37281608581543, |
|
"learning_rate": 7.49876688086505e-06, |
|
"loss": 1.1233, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006718924972004479, |
|
"grad_norm": 4.489595890045166, |
|
"learning_rate": 7.4980733107525805e-06, |
|
"loss": 1.2183, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.006718924972004479, |
|
"eval_loss": 1.282976508140564, |
|
"eval_runtime": 51.7095, |
|
"eval_samples_per_second": 9.669, |
|
"eval_steps_per_second": 9.669, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.007166853303471445, |
|
"grad_norm": 6.339463233947754, |
|
"learning_rate": 7.4972256720265044e-06, |
|
"loss": 1.1818, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.00761478163493841, |
|
"grad_norm": 6.762680530548096, |
|
"learning_rate": 7.496223999529932e-06, |
|
"loss": 1.0349, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.008062709966405375, |
|
"grad_norm": 7.486023426055908, |
|
"learning_rate": 7.4950683344376926e-06, |
|
"loss": 1.1735, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00851063829787234, |
|
"grad_norm": 4.099631309509277, |
|
"learning_rate": 7.4937587242546544e-06, |
|
"loss": 1.2452, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.008958566629339306, |
|
"grad_norm": 5.422396183013916, |
|
"learning_rate": 7.492295222813762e-06, |
|
"loss": 1.1032, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.009406494960806271, |
|
"grad_norm": 6.336536407470703, |
|
"learning_rate": 7.490677890273828e-06, |
|
"loss": 1.0852, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.009854423292273236, |
|
"grad_norm": 4.766495704650879, |
|
"learning_rate": 7.488906793117058e-06, |
|
"loss": 1.2168, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.010302351623740201, |
|
"grad_norm": 5.892153263092041, |
|
"learning_rate": 7.486982004146319e-06, |
|
"loss": 1.1595, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.010750279955207167, |
|
"grad_norm": 4.957208633422852, |
|
"learning_rate": 7.484903602482148e-06, |
|
"loss": 1.1423, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.011198208286674132, |
|
"grad_norm": 4.198282718658447, |
|
"learning_rate": 7.4826716735594945e-06, |
|
"loss": 1.0562, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.011646136618141097, |
|
"grad_norm": 3.4756815433502197, |
|
"learning_rate": 7.480286309124216e-06, |
|
"loss": 0.9894, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.012094064949608062, |
|
"grad_norm": 4.725418567657471, |
|
"learning_rate": 7.477747607229302e-06, |
|
"loss": 1.1761, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.012541993281075027, |
|
"grad_norm": 4.241955280303955, |
|
"learning_rate": 7.475055672230844e-06, |
|
"loss": 1.1118, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.012989921612541993, |
|
"grad_norm": 5.7904863357543945, |
|
"learning_rate": 7.472210614783745e-06, |
|
"loss": 1.0932, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.013437849944008958, |
|
"grad_norm": 4.546011924743652, |
|
"learning_rate": 7.469212551837173e-06, |
|
"loss": 1.1187, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013437849944008958, |
|
"eval_loss": 1.26471745967865, |
|
"eval_runtime": 51.7822, |
|
"eval_samples_per_second": 9.656, |
|
"eval_steps_per_second": 9.656, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013885778275475923, |
|
"grad_norm": 6.256772994995117, |
|
"learning_rate": 7.4660616066297565e-06, |
|
"loss": 1.2176, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.01433370660694289, |
|
"grad_norm": 7.437366485595703, |
|
"learning_rate": 7.462757908684509e-06, |
|
"loss": 1.046, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.014781634938409855, |
|
"grad_norm": 8.049488067626953, |
|
"learning_rate": 7.459301593803512e-06, |
|
"loss": 1.2396, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.01522956326987682, |
|
"grad_norm": 5.115020751953125, |
|
"learning_rate": 7.455692804062335e-06, |
|
"loss": 1.1018, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.015677491601343786, |
|
"grad_norm": 5.805201530456543, |
|
"learning_rate": 7.451931687804189e-06, |
|
"loss": 1.0083, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01612541993281075, |
|
"grad_norm": 5.960669040679932, |
|
"learning_rate": 7.448018399633831e-06, |
|
"loss": 1.1773, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.016573348264277716, |
|
"grad_norm": 4.82655143737793, |
|
"learning_rate": 7.443953100411214e-06, |
|
"loss": 1.2279, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.01702127659574468, |
|
"grad_norm": 5.768619060516357, |
|
"learning_rate": 7.439735957244862e-06, |
|
"loss": 1.0924, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.017469204927211646, |
|
"grad_norm": 4.603348731994629, |
|
"learning_rate": 7.435367143485015e-06, |
|
"loss": 0.9547, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.01791713325867861, |
|
"grad_norm": 3.802041530609131, |
|
"learning_rate": 7.430846838716496e-06, |
|
"loss": 1.0569, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.018365061590145577, |
|
"grad_norm": 4.473762035369873, |
|
"learning_rate": 7.426175228751328e-06, |
|
"loss": 1.1299, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.018812989921612542, |
|
"grad_norm": 4.674028396606445, |
|
"learning_rate": 7.421352505621099e-06, |
|
"loss": 1.0512, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.019260918253079507, |
|
"grad_norm": 5.1446852684021, |
|
"learning_rate": 7.416378867569069e-06, |
|
"loss": 1.2024, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.019708846584546472, |
|
"grad_norm": 3.742156744003296, |
|
"learning_rate": 7.411254519042017e-06, |
|
"loss": 1.1778, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.020156774916013438, |
|
"grad_norm": 4.0376200675964355, |
|
"learning_rate": 7.4059796706818396e-06, |
|
"loss": 1.1754, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.020156774916013438, |
|
"eval_loss": 1.2499778270721436, |
|
"eval_runtime": 51.5995, |
|
"eval_samples_per_second": 9.69, |
|
"eval_steps_per_second": 9.69, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.020604703247480403, |
|
"grad_norm": 3.672325372695923, |
|
"learning_rate": 7.400554539316894e-06, |
|
"loss": 1.1627, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.021052631578947368, |
|
"grad_norm": 4.949635982513428, |
|
"learning_rate": 7.394979347953081e-06, |
|
"loss": 1.3115, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.021500559910414333, |
|
"grad_norm": 4.03855037689209, |
|
"learning_rate": 7.389254325764681e-06, |
|
"loss": 1.1176, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0219484882418813, |
|
"grad_norm": 4.981250762939453, |
|
"learning_rate": 7.383379708084934e-06, |
|
"loss": 1.0668, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.022396416573348264, |
|
"grad_norm": 4.68571138381958, |
|
"learning_rate": 7.377355736396362e-06, |
|
"loss": 1.1235, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02284434490481523, |
|
"grad_norm": 5.7003326416015625, |
|
"learning_rate": 7.371182658320847e-06, |
|
"loss": 1.0535, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.023292273236282194, |
|
"grad_norm": 2.357079029083252, |
|
"learning_rate": 7.36486072760945e-06, |
|
"loss": 0.9768, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.02374020156774916, |
|
"grad_norm": 4.828664779663086, |
|
"learning_rate": 7.358390204131984e-06, |
|
"loss": 1.0385, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.024188129899216124, |
|
"grad_norm": 3.4303321838378906, |
|
"learning_rate": 7.3517713538663235e-06, |
|
"loss": 0.9826, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.02463605823068309, |
|
"grad_norm": 8.705097198486328, |
|
"learning_rate": 7.345004448887478e-06, |
|
"loss": 1.0988, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.025083986562150055, |
|
"grad_norm": 4.806099891662598, |
|
"learning_rate": 7.3380897673564085e-06, |
|
"loss": 1.2765, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.02553191489361702, |
|
"grad_norm": 3.948829174041748, |
|
"learning_rate": 7.33102759350859e-06, |
|
"loss": 1.2548, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.025979843225083985, |
|
"grad_norm": 8.706982612609863, |
|
"learning_rate": 7.323818217642328e-06, |
|
"loss": 1.1907, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.02642777155655095, |
|
"grad_norm": 4.196287155151367, |
|
"learning_rate": 7.316461936106827e-06, |
|
"loss": 1.1541, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.026875699888017916, |
|
"grad_norm": 4.2185187339782715, |
|
"learning_rate": 7.3089590512900084e-06, |
|
"loss": 1.0761, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.026875699888017916, |
|
"eval_loss": 1.2407419681549072, |
|
"eval_runtime": 51.6589, |
|
"eval_samples_per_second": 9.679, |
|
"eval_steps_per_second": 9.679, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02732362821948488, |
|
"grad_norm": 4.50939416885376, |
|
"learning_rate": 7.301309871606081e-06, |
|
"loss": 1.1746, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.027771556550951846, |
|
"grad_norm": 5.48988676071167, |
|
"learning_rate": 7.293514711482861e-06, |
|
"loss": 1.0518, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.028219484882418815, |
|
"grad_norm": 4.441885471343994, |
|
"learning_rate": 7.285573891348849e-06, |
|
"loss": 1.0679, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.02866741321388578, |
|
"grad_norm": 6.711030006408691, |
|
"learning_rate": 7.27748773762006e-06, |
|
"loss": 1.2901, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.029115341545352745, |
|
"grad_norm": 5.328275680541992, |
|
"learning_rate": 7.269256582686603e-06, |
|
"loss": 1.1749, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.02956326987681971, |
|
"grad_norm": 3.016313314437866, |
|
"learning_rate": 7.260880764899016e-06, |
|
"loss": 1.1398, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.030011198208286675, |
|
"grad_norm": 4.6470866203308105, |
|
"learning_rate": 7.252360628554363e-06, |
|
"loss": 1.0427, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.03045912653975364, |
|
"grad_norm": 9.044170379638672, |
|
"learning_rate": 7.243696523882079e-06, |
|
"loss": 1.0913, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.030907054871220606, |
|
"grad_norm": 4.983870029449463, |
|
"learning_rate": 7.2348888070295705e-06, |
|
"loss": 1.1174, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.03135498320268757, |
|
"grad_norm": 10.38315486907959, |
|
"learning_rate": 7.225937840047583e-06, |
|
"loss": 1.2386, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.031802911534154536, |
|
"grad_norm": 5.104282855987549, |
|
"learning_rate": 7.216843990875307e-06, |
|
"loss": 1.1014, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.0322508398656215, |
|
"grad_norm": 5.493166446685791, |
|
"learning_rate": 7.207607633325266e-06, |
|
"loss": 1.2569, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.03269876819708847, |
|
"grad_norm": 5.069271564483643, |
|
"learning_rate": 7.198229147067941e-06, |
|
"loss": 1.1938, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.03314669652855543, |
|
"grad_norm": 5.183401107788086, |
|
"learning_rate": 7.18870891761617e-06, |
|
"loss": 0.9859, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.0335946248600224, |
|
"grad_norm": 4.3622965812683105, |
|
"learning_rate": 7.1790473363092974e-06, |
|
"loss": 1.1359, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0335946248600224, |
|
"eval_loss": 1.2344202995300293, |
|
"eval_runtime": 51.6321, |
|
"eval_samples_per_second": 9.684, |
|
"eval_steps_per_second": 9.684, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.03404255319148936, |
|
"grad_norm": 4.141931056976318, |
|
"learning_rate": 7.169244800297089e-06, |
|
"loss": 1.2613, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.03449048152295633, |
|
"grad_norm": 4.191932201385498, |
|
"learning_rate": 7.159301712523407e-06, |
|
"loss": 1.1802, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.03493840985442329, |
|
"grad_norm": 4.759700775146484, |
|
"learning_rate": 7.149218481709644e-06, |
|
"loss": 1.0651, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.03538633818589026, |
|
"grad_norm": 3.969430923461914, |
|
"learning_rate": 7.1389955223379266e-06, |
|
"loss": 0.9129, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.03583426651735722, |
|
"grad_norm": 5.1956467628479, |
|
"learning_rate": 7.128633254634072e-06, |
|
"loss": 1.2688, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.03628219484882419, |
|
"grad_norm": 3.615705966949463, |
|
"learning_rate": 7.118132104550322e-06, |
|
"loss": 1.1092, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.036730123180291153, |
|
"grad_norm": 3.635277271270752, |
|
"learning_rate": 7.107492503747826e-06, |
|
"loss": 1.0265, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.03717805151175812, |
|
"grad_norm": 4.518077373504639, |
|
"learning_rate": 7.096714889578898e-06, |
|
"loss": 1.0817, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.037625979843225084, |
|
"grad_norm": 6.652565002441406, |
|
"learning_rate": 7.085799705069046e-06, |
|
"loss": 0.9709, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.03807390817469205, |
|
"grad_norm": 5.337361812591553, |
|
"learning_rate": 7.0747473988987515e-06, |
|
"loss": 1.0883, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.038521836506159014, |
|
"grad_norm": 5.067249774932861, |
|
"learning_rate": 7.063558425385033e-06, |
|
"loss": 1.08, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.03896976483762598, |
|
"grad_norm": 3.9859232902526855, |
|
"learning_rate": 7.052233244462769e-06, |
|
"loss": 1.0063, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.039417693169092945, |
|
"grad_norm": 5.297623634338379, |
|
"learning_rate": 7.040772321665788e-06, |
|
"loss": 0.9638, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.03986562150055991, |
|
"grad_norm": 6.088709354400635, |
|
"learning_rate": 7.029176128107734e-06, |
|
"loss": 1.2673, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.040313549832026875, |
|
"grad_norm": 7.997159957885742, |
|
"learning_rate": 7.017445140462711e-06, |
|
"loss": 0.9986, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.040313549832026875, |
|
"eval_loss": 1.2309150695800781, |
|
"eval_runtime": 51.612, |
|
"eval_samples_per_second": 9.688, |
|
"eval_steps_per_second": 9.688, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.04076147816349384, |
|
"grad_norm": 6.393094062805176, |
|
"learning_rate": 7.00557984094567e-06, |
|
"loss": 1.066, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.041209406494960805, |
|
"grad_norm": 4.47462797164917, |
|
"learning_rate": 6.993580717292601e-06, |
|
"loss": 1.3117, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.04165733482642777, |
|
"grad_norm": 4.160079479217529, |
|
"learning_rate": 6.981448262740483e-06, |
|
"loss": 1.3003, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.042105263157894736, |
|
"grad_norm": 5.260162353515625, |
|
"learning_rate": 6.969182976006999e-06, |
|
"loss": 1.312, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.0425531914893617, |
|
"grad_norm": 4.503716468811035, |
|
"learning_rate": 6.95678536127005e-06, |
|
"loss": 1.185, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.043001119820828666, |
|
"grad_norm": 3.7414872646331787, |
|
"learning_rate": 6.944255928147017e-06, |
|
"loss": 1.1585, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.04344904815229563, |
|
"grad_norm": 5.410964012145996, |
|
"learning_rate": 6.931595191673823e-06, |
|
"loss": 1.1403, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.0438969764837626, |
|
"grad_norm": 4.388716220855713, |
|
"learning_rate": 6.9188036722837555e-06, |
|
"loss": 1.0452, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.04434490481522956, |
|
"grad_norm": 2.7749533653259277, |
|
"learning_rate": 6.905881895786076e-06, |
|
"loss": 1.0638, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.04479283314669653, |
|
"grad_norm": 5.431761741638184, |
|
"learning_rate": 6.892830393344403e-06, |
|
"loss": 1.2718, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04524076147816349, |
|
"grad_norm": 4.384571552276611, |
|
"learning_rate": 6.879649701454886e-06, |
|
"loss": 1.0594, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.04568868980963046, |
|
"grad_norm": 5.040534019470215, |
|
"learning_rate": 6.866340361924141e-06, |
|
"loss": 1.2255, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.04613661814109742, |
|
"grad_norm": 4.800682544708252, |
|
"learning_rate": 6.852902921846988e-06, |
|
"loss": 1.1093, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.04658454647256439, |
|
"grad_norm": 5.662080764770508, |
|
"learning_rate": 6.8393379335839565e-06, |
|
"loss": 1.2003, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.04703247480403135, |
|
"grad_norm": 3.93361234664917, |
|
"learning_rate": 6.825645954738586e-06, |
|
"loss": 1.0652, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.04703247480403135, |
|
"eval_loss": 1.2271474599838257, |
|
"eval_runtime": 51.5746, |
|
"eval_samples_per_second": 9.695, |
|
"eval_steps_per_second": 9.695, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.04748040313549832, |
|
"grad_norm": 4.918002605438232, |
|
"learning_rate": 6.811827548134495e-06, |
|
"loss": 1.156, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.047928331466965284, |
|
"grad_norm": 3.533487319946289, |
|
"learning_rate": 6.797883281792261e-06, |
|
"loss": 1.0533, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.04837625979843225, |
|
"grad_norm": 4.698348045349121, |
|
"learning_rate": 6.783813728906054e-06, |
|
"loss": 1.2621, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.048824188129899214, |
|
"grad_norm": 3.90852427482605, |
|
"learning_rate": 6.769619467820086e-06, |
|
"loss": 1.0754, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.04927211646136618, |
|
"grad_norm": 6.924786567687988, |
|
"learning_rate": 6.755301082004838e-06, |
|
"loss": 1.0617, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.049720044792833144, |
|
"grad_norm": 5.685960292816162, |
|
"learning_rate": 6.740859160033068e-06, |
|
"loss": 1.2185, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.05016797312430011, |
|
"grad_norm": 5.533092975616455, |
|
"learning_rate": 6.726294295555623e-06, |
|
"loss": 1.0583, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.050615901455767075, |
|
"grad_norm": 4.5029988288879395, |
|
"learning_rate": 6.711607087277034e-06, |
|
"loss": 1.1781, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.05106382978723404, |
|
"grad_norm": 3.2203736305236816, |
|
"learning_rate": 6.69679813893091e-06, |
|
"loss": 1.151, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.051511758118701005, |
|
"grad_norm": 6.602795600891113, |
|
"learning_rate": 6.681868059255113e-06, |
|
"loss": 1.1373, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.05195968645016797, |
|
"grad_norm": 3.071552038192749, |
|
"learning_rate": 6.666817461966741e-06, |
|
"loss": 1.1554, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.052407614781634936, |
|
"grad_norm": 5.886751174926758, |
|
"learning_rate": 6.651646965736902e-06, |
|
"loss": 1.1328, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.0528555431131019, |
|
"grad_norm": 4.323307991027832, |
|
"learning_rate": 6.636357194165274e-06, |
|
"loss": 1.1535, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.053303471444568866, |
|
"grad_norm": 4.585876941680908, |
|
"learning_rate": 6.620948775754481e-06, |
|
"loss": 1.1636, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.05375139977603583, |
|
"grad_norm": 3.9351437091827393, |
|
"learning_rate": 6.605422343884255e-06, |
|
"loss": 1.2689, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.05375139977603583, |
|
"eval_loss": 1.2224195003509521, |
|
"eval_runtime": 51.5936, |
|
"eval_samples_per_second": 9.691, |
|
"eval_steps_per_second": 9.691, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.054199328107502796, |
|
"grad_norm": 3.1242146492004395, |
|
"learning_rate": 6.589778536785396e-06, |
|
"loss": 1.2646, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.05464725643896976, |
|
"grad_norm": 3.1645703315734863, |
|
"learning_rate": 6.5740179975135426e-06, |
|
"loss": 0.9831, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.05509518477043673, |
|
"grad_norm": 6.550941467285156, |
|
"learning_rate": 6.5581413739227314e-06, |
|
"loss": 1.1777, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.05554311310190369, |
|
"grad_norm": 17.51181983947754, |
|
"learning_rate": 6.542149318638777e-06, |
|
"loss": 1.0765, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.055991041433370664, |
|
"grad_norm": 6.8737664222717285, |
|
"learning_rate": 6.526042489032434e-06, |
|
"loss": 1.0107, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.05643896976483763, |
|
"grad_norm": 3.5256145000457764, |
|
"learning_rate": 6.509821547192383e-06, |
|
"loss": 1.1973, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.056886898096304594, |
|
"grad_norm": 5.974047660827637, |
|
"learning_rate": 6.493487159898006e-06, |
|
"loss": 1.2409, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.05733482642777156, |
|
"grad_norm": 3.98787522315979, |
|
"learning_rate": 6.477039998591991e-06, |
|
"loss": 1.3272, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.057782754759238525, |
|
"grad_norm": 5.225778102874756, |
|
"learning_rate": 6.460480739352719e-06, |
|
"loss": 1.2937, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.05823068309070549, |
|
"grad_norm": 3.719729423522949, |
|
"learning_rate": 6.4438100628664795e-06, |
|
"loss": 1.0965, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.058678611422172455, |
|
"grad_norm": 2.8820245265960693, |
|
"learning_rate": 6.4270286543994874e-06, |
|
"loss": 1.2178, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.05912653975363942, |
|
"grad_norm": 3.031202793121338, |
|
"learning_rate": 6.410137203769718e-06, |
|
"loss": 1.354, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.059574468085106386, |
|
"grad_norm": 3.010680675506592, |
|
"learning_rate": 6.393136405318545e-06, |
|
"loss": 1.185, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.06002239641657335, |
|
"grad_norm": 3.756014823913574, |
|
"learning_rate": 6.376026957882207e-06, |
|
"loss": 1.1636, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.060470324748040316, |
|
"grad_norm": 4.391636848449707, |
|
"learning_rate": 6.3588095647630754e-06, |
|
"loss": 1.2252, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.060470324748040316, |
|
"eval_loss": 1.222408652305603, |
|
"eval_runtime": 51.5211, |
|
"eval_samples_per_second": 9.705, |
|
"eval_steps_per_second": 9.705, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.06091825307950728, |
|
"grad_norm": 3.5359737873077393, |
|
"learning_rate": 6.341484933700744e-06, |
|
"loss": 1.0688, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.061366181410974247, |
|
"grad_norm": 4.412395477294922, |
|
"learning_rate": 6.32405377684294e-06, |
|
"loss": 1.1889, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.06181410974244121, |
|
"grad_norm": 7.099231719970703, |
|
"learning_rate": 6.306516810716249e-06, |
|
"loss": 1.0922, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.06226203807390818, |
|
"grad_norm": 3.257270097732544, |
|
"learning_rate": 6.288874756196662e-06, |
|
"loss": 1.2291, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.06270996640537514, |
|
"grad_norm": 3.6133875846862793, |
|
"learning_rate": 6.271128338479939e-06, |
|
"loss": 1.0567, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.06315789473684211, |
|
"grad_norm": 4.996825695037842, |
|
"learning_rate": 6.253278287051806e-06, |
|
"loss": 1.1242, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.06360582306830907, |
|
"grad_norm": 5.642391204833984, |
|
"learning_rate": 6.235325335657962e-06, |
|
"loss": 1.1998, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.06405375139977604, |
|
"grad_norm": 4.652320384979248, |
|
"learning_rate": 6.217270222273923e-06, |
|
"loss": 1.0647, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.064501679731243, |
|
"grad_norm": 8.814513206481934, |
|
"learning_rate": 6.1991136890746825e-06, |
|
"loss": 0.97, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.06494960806270997, |
|
"grad_norm": 4.535324573516846, |
|
"learning_rate": 6.180856482404208e-06, |
|
"loss": 1.0829, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.06539753639417693, |
|
"grad_norm": 5.13389778137207, |
|
"learning_rate": 6.162499352744754e-06, |
|
"loss": 1.3333, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.0658454647256439, |
|
"grad_norm": 4.871939182281494, |
|
"learning_rate": 6.144043054686022e-06, |
|
"loss": 1.1397, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.06629339305711086, |
|
"grad_norm": 3.31581449508667, |
|
"learning_rate": 6.125488346894139e-06, |
|
"loss": 1.0983, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.06674132138857783, |
|
"grad_norm": 6.067586898803711, |
|
"learning_rate": 6.106835992080464e-06, |
|
"loss": 1.0931, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.0671892497200448, |
|
"grad_norm": 4.4560465812683105, |
|
"learning_rate": 6.088086756970252e-06, |
|
"loss": 1.0743, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0671892497200448, |
|
"eval_loss": 1.21743643283844, |
|
"eval_runtime": 51.7437, |
|
"eval_samples_per_second": 9.663, |
|
"eval_steps_per_second": 9.663, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.06763717805151176, |
|
"grad_norm": 6.724518775939941, |
|
"learning_rate": 6.0692414122711184e-06, |
|
"loss": 1.2655, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.06808510638297872, |
|
"grad_norm": 4.3255085945129395, |
|
"learning_rate": 6.050300732641376e-06, |
|
"loss": 1.0058, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.06853303471444569, |
|
"grad_norm": 2.7948145866394043, |
|
"learning_rate": 6.0312654966581755e-06, |
|
"loss": 1.1331, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.06898096304591265, |
|
"grad_norm": 4.223801612854004, |
|
"learning_rate": 6.012136486785512e-06, |
|
"loss": 0.9267, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.06942889137737962, |
|
"grad_norm": 8.328617095947266, |
|
"learning_rate": 5.992914489342061e-06, |
|
"loss": 1.0601, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.06987681970884659, |
|
"grad_norm": 3.9401023387908936, |
|
"learning_rate": 5.9736002944688474e-06, |
|
"loss": 1.1296, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.07032474804031355, |
|
"grad_norm": 4.462929725646973, |
|
"learning_rate": 5.954194696096775e-06, |
|
"loss": 1.1266, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.07077267637178052, |
|
"grad_norm": 9.879998207092285, |
|
"learning_rate": 5.9346984919139865e-06, |
|
"loss": 1.0835, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.07122060470324748, |
|
"grad_norm": 4.088196277618408, |
|
"learning_rate": 5.9151124833330745e-06, |
|
"loss": 1.1256, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.07166853303471445, |
|
"grad_norm": 6.066174030303955, |
|
"learning_rate": 5.895437475458137e-06, |
|
"loss": 1.2295, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.07211646136618141, |
|
"grad_norm": 4.754509449005127, |
|
"learning_rate": 5.875674277051688e-06, |
|
"loss": 1.1676, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.07256438969764838, |
|
"grad_norm": 3.898282289505005, |
|
"learning_rate": 5.855823700501406e-06, |
|
"loss": 1.2583, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.07301231802911534, |
|
"grad_norm": 5.35301399230957, |
|
"learning_rate": 5.835886561786744e-06, |
|
"loss": 1.3667, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.07346024636058231, |
|
"grad_norm": 6.24777889251709, |
|
"learning_rate": 5.815863680445385e-06, |
|
"loss": 1.1099, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.07390817469204927, |
|
"grad_norm": 3.7771286964416504, |
|
"learning_rate": 5.795755879539558e-06, |
|
"loss": 0.9985, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.07390817469204927, |
|
"eval_loss": 1.2118867635726929, |
|
"eval_runtime": 51.6701, |
|
"eval_samples_per_second": 9.677, |
|
"eval_steps_per_second": 9.677, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.07435610302351624, |
|
"grad_norm": 4.368626117706299, |
|
"learning_rate": 5.775563985622202e-06, |
|
"loss": 1.1, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.0748040313549832, |
|
"grad_norm": 6.341384410858154, |
|
"learning_rate": 5.755288828702987e-06, |
|
"loss": 1.0292, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.07525195968645017, |
|
"grad_norm": 5.869757652282715, |
|
"learning_rate": 5.734931242214204e-06, |
|
"loss": 1.0937, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.07569988801791713, |
|
"grad_norm": 4.857089042663574, |
|
"learning_rate": 5.7144920629764955e-06, |
|
"loss": 1.0987, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.0761478163493841, |
|
"grad_norm": 5.114626884460449, |
|
"learning_rate": 5.693972131164471e-06, |
|
"loss": 0.9623, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.07659574468085106, |
|
"grad_norm": 5.152310371398926, |
|
"learning_rate": 5.673372290272149e-06, |
|
"loss": 1.1423, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.07704367301231803, |
|
"grad_norm": 3.8204965591430664, |
|
"learning_rate": 5.652693387078309e-06, |
|
"loss": 1.0523, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.077491601343785, |
|
"grad_norm": 3.0346767902374268, |
|
"learning_rate": 5.631936271611667e-06, |
|
"loss": 1.0483, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.07793952967525196, |
|
"grad_norm": 4.436351299285889, |
|
"learning_rate": 5.611101797115939e-06, |
|
"loss": 1.0144, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.07838745800671892, |
|
"grad_norm": 5.614783763885498, |
|
"learning_rate": 5.5901908200147685e-06, |
|
"loss": 1.078, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.07883538633818589, |
|
"grad_norm": 4.0426926612854, |
|
"learning_rate": 5.56920419987652e-06, |
|
"loss": 1.2628, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.07928331466965285, |
|
"grad_norm": 5.30089807510376, |
|
"learning_rate": 5.5481427993789534e-06, |
|
"loss": 1.1257, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.07973124300111982, |
|
"grad_norm": 3.5508739948272705, |
|
"learning_rate": 5.527007484273746e-06, |
|
"loss": 1.0355, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.08017917133258678, |
|
"grad_norm": 4.027277946472168, |
|
"learning_rate": 5.5057991233509225e-06, |
|
"loss": 0.9196, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.08062709966405375, |
|
"grad_norm": 7.427858352661133, |
|
"learning_rate": 5.484518588403134e-06, |
|
"loss": 1.1913, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.08062709966405375, |
|
"eval_loss": 1.2111696004867554, |
|
"eval_runtime": 51.6854, |
|
"eval_samples_per_second": 9.674, |
|
"eval_steps_per_second": 9.674, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.08107502799552072, |
|
"grad_norm": 6.3730597496032715, |
|
"learning_rate": 5.463166754189819e-06, |
|
"loss": 1.171, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.08152295632698768, |
|
"grad_norm": 5.194447994232178, |
|
"learning_rate": 5.441744498401255e-06, |
|
"loss": 1.2202, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.08197088465845465, |
|
"grad_norm": 4.3045454025268555, |
|
"learning_rate": 5.4202527016224725e-06, |
|
"loss": 1.1318, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.08241881298992161, |
|
"grad_norm": 5.316900253295898, |
|
"learning_rate": 5.398692247297059e-06, |
|
"loss": 1.2107, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.08286674132138858, |
|
"grad_norm": 8.284939765930176, |
|
"learning_rate": 5.377064021690844e-06, |
|
"loss": 1.1683, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.08331466965285554, |
|
"grad_norm": 4.051226615905762, |
|
"learning_rate": 5.355368913855472e-06, |
|
"loss": 1.2974, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.0837625979843225, |
|
"grad_norm": 5.353118896484375, |
|
"learning_rate": 5.333607815591851e-06, |
|
"loss": 1.235, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.08421052631578947, |
|
"grad_norm": 5.097784996032715, |
|
"learning_rate": 5.311781621413497e-06, |
|
"loss": 1.0172, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.08465845464725644, |
|
"grad_norm": 3.437659978866577, |
|
"learning_rate": 5.289891228509769e-06, |
|
"loss": 1.0104, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 4.631069660186768, |
|
"learning_rate": 5.267937536708977e-06, |
|
"loss": 1.0368, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.08555431131019037, |
|
"grad_norm": 5.044907569885254, |
|
"learning_rate": 5.245921448441407e-06, |
|
"loss": 1.0732, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.08600223964165733, |
|
"grad_norm": 3.2756667137145996, |
|
"learning_rate": 5.223843868702214e-06, |
|
"loss": 1.2815, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.0864501679731243, |
|
"grad_norm": 5.061473369598389, |
|
"learning_rate": 5.201705705014231e-06, |
|
"loss": 1.1059, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.08689809630459126, |
|
"grad_norm": 4.924319744110107, |
|
"learning_rate": 5.1795078673906575e-06, |
|
"loss": 1.0561, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.08734602463605823, |
|
"grad_norm": 4.019739627838135, |
|
"learning_rate": 5.1572512682976546e-06, |
|
"loss": 0.9889, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.08734602463605823, |
|
"eval_loss": 1.2077045440673828, |
|
"eval_runtime": 51.7283, |
|
"eval_samples_per_second": 9.666, |
|
"eval_steps_per_second": 9.666, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.0877939529675252, |
|
"grad_norm": 6.297740459442139, |
|
"learning_rate": 5.134936822616837e-06, |
|
"loss": 1.1664, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.08824188129899216, |
|
"grad_norm": 5.478749752044678, |
|
"learning_rate": 5.112565447607669e-06, |
|
"loss": 1.2503, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.08868980963045912, |
|
"grad_norm": 4.692316055297852, |
|
"learning_rate": 5.090138062869755e-06, |
|
"loss": 1.1421, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.08913773796192609, |
|
"grad_norm": 3.5623536109924316, |
|
"learning_rate": 5.067655590305036e-06, |
|
"loss": 1.1203, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.08958566629339305, |
|
"grad_norm": 6.875621318817139, |
|
"learning_rate": 5.045118954079904e-06, |
|
"loss": 1.1348, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.09003359462486002, |
|
"grad_norm": 5.2604756355285645, |
|
"learning_rate": 5.022529080587205e-06, |
|
"loss": 1.0326, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.09048152295632698, |
|
"grad_norm": 5.012307643890381, |
|
"learning_rate": 4.999886898408157e-06, |
|
"loss": 1.12, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.09092945128779395, |
|
"grad_norm": 5.246688365936279, |
|
"learning_rate": 4.977193338274189e-06, |
|
"loss": 1.1164, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.09137737961926092, |
|
"grad_norm": 3.9779398441314697, |
|
"learning_rate": 4.954449333028672e-06, |
|
"loss": 1.0607, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.09182530795072788, |
|
"grad_norm": 5.392056465148926, |
|
"learning_rate": 4.931655817588579e-06, |
|
"loss": 1.1102, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.09227323628219485, |
|
"grad_norm": 5.144470691680908, |
|
"learning_rate": 4.9088137289060535e-06, |
|
"loss": 1.0649, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.09272116461366181, |
|
"grad_norm": 3.7060792446136475, |
|
"learning_rate": 4.885924005929896e-06, |
|
"loss": 1.0718, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.09316909294512878, |
|
"grad_norm": 3.357794761657715, |
|
"learning_rate": 4.862987589566965e-06, |
|
"loss": 1.1003, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.09361702127659574, |
|
"grad_norm": 5.704718589782715, |
|
"learning_rate": 4.840005422643503e-06, |
|
"loss": 1.2042, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.0940649496080627, |
|
"grad_norm": 5.481514930725098, |
|
"learning_rate": 4.816978449866372e-06, |
|
"loss": 1.0777, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0940649496080627, |
|
"eval_loss": 1.2093305587768555, |
|
"eval_runtime": 51.7975, |
|
"eval_samples_per_second": 9.653, |
|
"eval_steps_per_second": 9.653, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.09451287793952967, |
|
"grad_norm": 5.508385181427002, |
|
"learning_rate": 4.793907617784238e-06, |
|
"loss": 1.5375, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.09496080627099664, |
|
"grad_norm": 4.192409515380859, |
|
"learning_rate": 4.770793874748642e-06, |
|
"loss": 0.9964, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.0954087346024636, |
|
"grad_norm": 4.068387508392334, |
|
"learning_rate": 4.747638170875032e-06, |
|
"loss": 0.9244, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.09585666293393057, |
|
"grad_norm": 2.513946771621704, |
|
"learning_rate": 4.724441458003699e-06, |
|
"loss": 1.1329, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.09630459126539753, |
|
"grad_norm": 4.470638275146484, |
|
"learning_rate": 4.701204689660653e-06, |
|
"loss": 1.0299, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.0967525195968645, |
|
"grad_norm": 5.644805908203125, |
|
"learning_rate": 4.67792882101843e-06, |
|
"loss": 1.2654, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.09720044792833146, |
|
"grad_norm": 5.1912736892700195, |
|
"learning_rate": 4.654614808856823e-06, |
|
"loss": 1.2265, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.09764837625979843, |
|
"grad_norm": 11.092533111572266, |
|
"learning_rate": 4.631263611523557e-06, |
|
"loss": 1.2182, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.09809630459126539, |
|
"grad_norm": 4.138496398925781, |
|
"learning_rate": 4.607876188894896e-06, |
|
"loss": 1.2283, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.09854423292273236, |
|
"grad_norm": 5.229914665222168, |
|
"learning_rate": 4.58445350233618e-06, |
|
"loss": 1.1319, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.09899216125419932, |
|
"grad_norm": 4.059961318969727, |
|
"learning_rate": 4.560996514662314e-06, |
|
"loss": 1.0411, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.09944008958566629, |
|
"grad_norm": 4.80086088180542, |
|
"learning_rate": 4.5375061900981855e-06, |
|
"loss": 1.23, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.09988801791713325, |
|
"grad_norm": 5.166756629943848, |
|
"learning_rate": 4.513983494239034e-06, |
|
"loss": 1.219, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.10033594624860022, |
|
"grad_norm": 5.53660249710083, |
|
"learning_rate": 4.490429394010752e-06, |
|
"loss": 1.1245, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.10078387458006718, |
|
"grad_norm": 2.9756040573120117, |
|
"learning_rate": 4.466844857630147e-06, |
|
"loss": 1.1395, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.10078387458006718, |
|
"eval_loss": 1.2089135646820068, |
|
"eval_runtime": 51.6342, |
|
"eval_samples_per_second": 9.684, |
|
"eval_steps_per_second": 9.684, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.10123180291153415, |
|
"grad_norm": 3.644266128540039, |
|
"learning_rate": 4.443230854565133e-06, |
|
"loss": 1.0985, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.10167973124300111, |
|
"grad_norm": 4.662050724029541, |
|
"learning_rate": 4.4195883554948885e-06, |
|
"loss": 1.3397, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.10212765957446808, |
|
"grad_norm": 5.3237385749816895, |
|
"learning_rate": 4.3959183322699466e-06, |
|
"loss": 1.1351, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.10257558790593505, |
|
"grad_norm": 4.3604207038879395, |
|
"learning_rate": 4.372221757872255e-06, |
|
"loss": 1.1208, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.10302351623740201, |
|
"grad_norm": 3.731410264968872, |
|
"learning_rate": 4.3484996063751725e-06, |
|
"loss": 1.1584, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.10347144456886898, |
|
"grad_norm": 4.031397342681885, |
|
"learning_rate": 4.324752852903435e-06, |
|
"loss": 0.9656, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.10391937290033594, |
|
"grad_norm": 3.564148187637329, |
|
"learning_rate": 4.300982473593068e-06, |
|
"loss": 1.0031, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.1043673012318029, |
|
"grad_norm": 5.459331035614014, |
|
"learning_rate": 4.277189445551261e-06, |
|
"loss": 1.0037, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.10481522956326987, |
|
"grad_norm": 4.870905876159668, |
|
"learning_rate": 4.253374746816209e-06, |
|
"loss": 0.9615, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 5.284097671508789, |
|
"learning_rate": 4.229539356316898e-06, |
|
"loss": 1.3278, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.1057110862262038, |
|
"grad_norm": 5.323864459991455, |
|
"learning_rate": 4.205684253832877e-06, |
|
"loss": 1.1903, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.10615901455767077, |
|
"grad_norm": 7.844208717346191, |
|
"learning_rate": 4.1818104199539735e-06, |
|
"loss": 1.056, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.10660694288913773, |
|
"grad_norm": 4.325316905975342, |
|
"learning_rate": 4.1579188360399916e-06, |
|
"loss": 1.2431, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.1070548712206047, |
|
"grad_norm": 3.5362424850463867, |
|
"learning_rate": 4.134010484180368e-06, |
|
"loss": 1.1804, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.10750279955207166, |
|
"grad_norm": 3.2404041290283203, |
|
"learning_rate": 4.110086347153807e-06, |
|
"loss": 1.1556, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.10750279955207166, |
|
"eval_loss": 1.2038679122924805, |
|
"eval_runtime": 51.7303, |
|
"eval_samples_per_second": 9.666, |
|
"eval_steps_per_second": 9.666, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.10795072788353863, |
|
"grad_norm": 3.8270246982574463, |
|
"learning_rate": 4.0861474083878765e-06, |
|
"loss": 1.0918, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.10839865621500559, |
|
"grad_norm": 5.627485752105713, |
|
"learning_rate": 4.062194651918585e-06, |
|
"loss": 1.257, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.10884658454647256, |
|
"grad_norm": 4.910660743713379, |
|
"learning_rate": 4.0382290623499384e-06, |
|
"loss": 1.2748, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.10929451287793952, |
|
"grad_norm": 2.3609941005706787, |
|
"learning_rate": 4.014251624813453e-06, |
|
"loss": 0.9422, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.10974244120940649, |
|
"grad_norm": 3.063828706741333, |
|
"learning_rate": 3.990263324927675e-06, |
|
"loss": 1.1829, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.11019036954087345, |
|
"grad_norm": 2.658452033996582, |
|
"learning_rate": 3.966265148757655e-06, |
|
"loss": 1.0062, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.11063829787234042, |
|
"grad_norm": 6.130062103271484, |
|
"learning_rate": 3.9422580827744224e-06, |
|
"loss": 1.1504, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.11108622620380738, |
|
"grad_norm": 3.3496034145355225, |
|
"learning_rate": 3.9182431138144315e-06, |
|
"loss": 0.8731, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.11153415453527436, |
|
"grad_norm": 3.8455569744110107, |
|
"learning_rate": 3.894221229038995e-06, |
|
"loss": 1.0125, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.11198208286674133, |
|
"grad_norm": 4.499962329864502, |
|
"learning_rate": 3.870193415893709e-06, |
|
"loss": 1.0228, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.1124300111982083, |
|
"grad_norm": 6.230105876922607, |
|
"learning_rate": 3.846160662067859e-06, |
|
"loss": 1.1794, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.11287793952967526, |
|
"grad_norm": 7.316727638244629, |
|
"learning_rate": 3.8221239554538275e-06, |
|
"loss": 1.2728, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.11332586786114222, |
|
"grad_norm": 3.291714906692505, |
|
"learning_rate": 3.798084284106478e-06, |
|
"loss": 1.167, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.11377379619260919, |
|
"grad_norm": 5.075141429901123, |
|
"learning_rate": 3.7740426362025424e-06, |
|
"loss": 1.0547, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.11422172452407615, |
|
"grad_norm": 3.961540937423706, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.0713, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.11422172452407615, |
|
"eval_loss": 1.2046430110931396, |
|
"eval_runtime": 51.7175, |
|
"eval_samples_per_second": 9.668, |
|
"eval_steps_per_second": 9.668, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.11466965285554312, |
|
"grad_norm": 6.124125003814697, |
|
"learning_rate": 3.7259573637974587e-06, |
|
"loss": 1.0568, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.11511758118701008, |
|
"grad_norm": 4.3748602867126465, |
|
"learning_rate": 3.701915715893523e-06, |
|
"loss": 1.4124, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.11556550951847705, |
|
"grad_norm": 7.382061004638672, |
|
"learning_rate": 3.677876044546174e-06, |
|
"loss": 1.1357, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.11601343784994401, |
|
"grad_norm": 4.097735404968262, |
|
"learning_rate": 3.6538393379321427e-06, |
|
"loss": 1.0885, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.11646136618141098, |
|
"grad_norm": 5.039736270904541, |
|
"learning_rate": 3.6298065841062934e-06, |
|
"loss": 1.107, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.11690929451287795, |
|
"grad_norm": 4.383152008056641, |
|
"learning_rate": 3.6057787709610064e-06, |
|
"loss": 1.1695, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.11735722284434491, |
|
"grad_norm": 4.900496482849121, |
|
"learning_rate": 3.5817568861855708e-06, |
|
"loss": 1.1107, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.11780515117581188, |
|
"grad_norm": 6.267992973327637, |
|
"learning_rate": 3.557741917225579e-06, |
|
"loss": 1.1896, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.11825307950727884, |
|
"grad_norm": 3.8060693740844727, |
|
"learning_rate": 3.5337348512423468e-06, |
|
"loss": 1.2245, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.1187010078387458, |
|
"grad_norm": 3.5068161487579346, |
|
"learning_rate": 3.5097366750723275e-06, |
|
"loss": 1.0629, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.11914893617021277, |
|
"grad_norm": 4.6765360832214355, |
|
"learning_rate": 3.4857483751865478e-06, |
|
"loss": 1.1783, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.11959686450167974, |
|
"grad_norm": 7.864380836486816, |
|
"learning_rate": 3.461770937650064e-06, |
|
"loss": 1.0683, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.1200447928331467, |
|
"grad_norm": 3.138843297958374, |
|
"learning_rate": 3.437805348081416e-06, |
|
"loss": 0.9814, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.12049272116461367, |
|
"grad_norm": 5.134324550628662, |
|
"learning_rate": 3.413852591612125e-06, |
|
"loss": 1.1631, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.12094064949608063, |
|
"grad_norm": 4.688596725463867, |
|
"learning_rate": 3.389913652846194e-06, |
|
"loss": 1.0644, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.12094064949608063, |
|
"eval_loss": 1.2033374309539795, |
|
"eval_runtime": 51.6099, |
|
"eval_samples_per_second": 9.688, |
|
"eval_steps_per_second": 9.688, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.1213885778275476, |
|
"grad_norm": 4.218849182128906, |
|
"learning_rate": 3.365989515819633e-06, |
|
"loss": 1.1395, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.12183650615901456, |
|
"grad_norm": 5.043267726898193, |
|
"learning_rate": 3.34208116396001e-06, |
|
"loss": 1.2327, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.12228443449048153, |
|
"grad_norm": 7.991638660430908, |
|
"learning_rate": 3.318189580046028e-06, |
|
"loss": 1.0106, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.12273236282194849, |
|
"grad_norm": 4.103755474090576, |
|
"learning_rate": 3.294315746167124e-06, |
|
"loss": 0.9751, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.12318029115341546, |
|
"grad_norm": 4.224274635314941, |
|
"learning_rate": 3.2704606436831023e-06, |
|
"loss": 1.1427, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.12362821948488242, |
|
"grad_norm": 5.190283298492432, |
|
"learning_rate": 3.2466252531837934e-06, |
|
"loss": 1.1758, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.12407614781634939, |
|
"grad_norm": 6.470210075378418, |
|
"learning_rate": 3.2228105544487405e-06, |
|
"loss": 1.2584, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.12452407614781635, |
|
"grad_norm": 4.470674514770508, |
|
"learning_rate": 3.1990175264069333e-06, |
|
"loss": 1.0279, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.12497200447928332, |
|
"grad_norm": 4.63865327835083, |
|
"learning_rate": 3.1752471470965653e-06, |
|
"loss": 1.2431, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.12541993281075028, |
|
"grad_norm": 5.2822089195251465, |
|
"learning_rate": 3.151500393624829e-06, |
|
"loss": 1.0206, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.12586786114221724, |
|
"grad_norm": 3.3929495811462402, |
|
"learning_rate": 3.127778242127747e-06, |
|
"loss": 0.9654, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.12631578947368421, |
|
"grad_norm": 3.526858329772949, |
|
"learning_rate": 3.104081667730055e-06, |
|
"loss": 1.0832, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.12676371780515117, |
|
"grad_norm": 5.531039714813232, |
|
"learning_rate": 3.0804116445051133e-06, |
|
"loss": 1.1649, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.12721164613661815, |
|
"grad_norm": 5.811004161834717, |
|
"learning_rate": 3.0567691454348674e-06, |
|
"loss": 1.095, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"grad_norm": 4.319146633148193, |
|
"learning_rate": 3.033155142369855e-06, |
|
"loss": 0.9761, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"eval_loss": 1.2028086185455322, |
|
"eval_runtime": 51.6383, |
|
"eval_samples_per_second": 9.683, |
|
"eval_steps_per_second": 9.683, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.12810750279955208, |
|
"grad_norm": 5.54340124130249, |
|
"learning_rate": 3.009570605989249e-06, |
|
"loss": 0.999, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.12855543113101903, |
|
"grad_norm": 3.859863758087158, |
|
"learning_rate": 2.986016505760967e-06, |
|
"loss": 1.025, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.129003359462486, |
|
"grad_norm": 5.119099140167236, |
|
"learning_rate": 2.962493809901815e-06, |
|
"loss": 1.3963, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.12945128779395296, |
|
"grad_norm": 5.8379130363464355, |
|
"learning_rate": 2.9390034853376875e-06, |
|
"loss": 1.0822, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.12989921612541994, |
|
"grad_norm": 3.261016845703125, |
|
"learning_rate": 2.9155464976638217e-06, |
|
"loss": 1.0526, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.1303471444568869, |
|
"grad_norm": 3.678527355194092, |
|
"learning_rate": 2.8921238111051057e-06, |
|
"loss": 1.1167, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.13079507278835387, |
|
"grad_norm": 4.787365436553955, |
|
"learning_rate": 2.8687363884764434e-06, |
|
"loss": 1.0829, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.13124300111982082, |
|
"grad_norm": 3.475607395172119, |
|
"learning_rate": 2.8453851911431783e-06, |
|
"loss": 1.0801, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.1316909294512878, |
|
"grad_norm": 6.456125736236572, |
|
"learning_rate": 2.822071178981572e-06, |
|
"loss": 1.1287, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.13213885778275475, |
|
"grad_norm": 3.778585910797119, |
|
"learning_rate": 2.7987953103393484e-06, |
|
"loss": 1.1359, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.13258678611422173, |
|
"grad_norm": 3.37793231010437, |
|
"learning_rate": 2.7755585419963026e-06, |
|
"loss": 1.0584, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.13303471444568868, |
|
"grad_norm": 5.2485575675964355, |
|
"learning_rate": 2.7523618291249687e-06, |
|
"loss": 1.2037, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.13348264277715566, |
|
"grad_norm": 4.524936676025391, |
|
"learning_rate": 2.729206125251359e-06, |
|
"loss": 0.9778, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.1339305711086226, |
|
"grad_norm": 5.820756912231445, |
|
"learning_rate": 2.7060923822157638e-06, |
|
"loss": 1.0351, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.1343784994400896, |
|
"grad_norm": 5.031400680541992, |
|
"learning_rate": 2.6830215501336288e-06, |
|
"loss": 1.1926, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1343784994400896, |
|
"eval_loss": 1.199351191520691, |
|
"eval_runtime": 51.5688, |
|
"eval_samples_per_second": 9.696, |
|
"eval_steps_per_second": 9.696, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.13482642777155654, |
|
"grad_norm": 4.307104587554932, |
|
"learning_rate": 2.6599945773564997e-06, |
|
"loss": 1.1743, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.13527435610302352, |
|
"grad_norm": 4.9457221031188965, |
|
"learning_rate": 2.6370124104330357e-06, |
|
"loss": 1.1287, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.13572228443449047, |
|
"grad_norm": 3.17401385307312, |
|
"learning_rate": 2.614075994070105e-06, |
|
"loss": 1.1686, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.13617021276595745, |
|
"grad_norm": 6.098177433013916, |
|
"learning_rate": 2.591186271093948e-06, |
|
"loss": 1.1546, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.1366181410974244, |
|
"grad_norm": 4.12905216217041, |
|
"learning_rate": 2.568344182411423e-06, |
|
"loss": 1.0909, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.13706606942889138, |
|
"grad_norm": 4.946627616882324, |
|
"learning_rate": 2.5455506669713293e-06, |
|
"loss": 1.2223, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.13751399776035833, |
|
"grad_norm": 4.25789737701416, |
|
"learning_rate": 2.522806661725812e-06, |
|
"loss": 1.0383, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.1379619260918253, |
|
"grad_norm": 6.536715030670166, |
|
"learning_rate": 2.5001131015918444e-06, |
|
"loss": 0.9992, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.13840985442329226, |
|
"grad_norm": 5.861030578613281, |
|
"learning_rate": 2.4774709194127973e-06, |
|
"loss": 1.1678, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.13885778275475924, |
|
"grad_norm": 4.58046293258667, |
|
"learning_rate": 2.4548810459200973e-06, |
|
"loss": 1.2545, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.1393057110862262, |
|
"grad_norm": 6.048022270202637, |
|
"learning_rate": 2.4323444096949647e-06, |
|
"loss": 1.0531, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.13975363941769317, |
|
"grad_norm": 5.86400842666626, |
|
"learning_rate": 2.409861937130248e-06, |
|
"loss": 1.1093, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.14020156774916012, |
|
"grad_norm": 3.7916102409362793, |
|
"learning_rate": 2.3874345523923327e-06, |
|
"loss": 1.1048, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.1406494960806271, |
|
"grad_norm": 4.009166717529297, |
|
"learning_rate": 2.3650631773831644e-06, |
|
"loss": 1.0198, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.14109742441209405, |
|
"grad_norm": 4.695572853088379, |
|
"learning_rate": 2.3427487317023477e-06, |
|
"loss": 1.1909, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.14109742441209405, |
|
"eval_loss": 1.1985480785369873, |
|
"eval_runtime": 51.6619, |
|
"eval_samples_per_second": 9.678, |
|
"eval_steps_per_second": 9.678, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.14154535274356103, |
|
"grad_norm": 5.317529201507568, |
|
"learning_rate": 2.320492132609344e-06, |
|
"loss": 1.084, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.14199328107502798, |
|
"grad_norm": 3.3507909774780273, |
|
"learning_rate": 2.2982942949857705e-06, |
|
"loss": 1.0169, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.14244120940649496, |
|
"grad_norm": 5.125346660614014, |
|
"learning_rate": 2.276156131297787e-06, |
|
"loss": 1.0202, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.1428891377379619, |
|
"grad_norm": 6.09945821762085, |
|
"learning_rate": 2.254078551558594e-06, |
|
"loss": 1.1235, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.1433370660694289, |
|
"grad_norm": 6.263647079467773, |
|
"learning_rate": 2.2320624632910232e-06, |
|
"loss": 1.1284, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.14378499440089584, |
|
"grad_norm": 6.879512310028076, |
|
"learning_rate": 2.210108771490233e-06, |
|
"loss": 1.0602, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.14423292273236282, |
|
"grad_norm": 3.726658582687378, |
|
"learning_rate": 2.1882183785865047e-06, |
|
"loss": 1.1038, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.14468085106382977, |
|
"grad_norm": 5.486456394195557, |
|
"learning_rate": 2.166392184408152e-06, |
|
"loss": 1.1794, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.14512877939529675, |
|
"grad_norm": 4.750957012176514, |
|
"learning_rate": 2.1446310861445306e-06, |
|
"loss": 0.9833, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.1455767077267637, |
|
"grad_norm": 3.6656692028045654, |
|
"learning_rate": 2.1229359783091576e-06, |
|
"loss": 1.0272, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.14602463605823068, |
|
"grad_norm": 3.691014528274536, |
|
"learning_rate": 2.1013077527029428e-06, |
|
"loss": 1.0861, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.14647256438969763, |
|
"grad_norm": 5.651008605957031, |
|
"learning_rate": 2.079747298377528e-06, |
|
"loss": 1.096, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.14692049272116461, |
|
"grad_norm": 4.2657318115234375, |
|
"learning_rate": 2.058255501598745e-06, |
|
"loss": 1.0871, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.14736842105263157, |
|
"grad_norm": 3.884568452835083, |
|
"learning_rate": 2.0368332458101814e-06, |
|
"loss": 1.0087, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.14781634938409854, |
|
"grad_norm": 3.191197395324707, |
|
"learning_rate": 2.015481411596869e-06, |
|
"loss": 1.1387, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.14781634938409854, |
|
"eval_loss": 1.1979233026504517, |
|
"eval_runtime": 51.7549, |
|
"eval_samples_per_second": 9.661, |
|
"eval_steps_per_second": 9.661, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.14826427771556552, |
|
"grad_norm": 6.709813594818115, |
|
"learning_rate": 1.9942008766490793e-06, |
|
"loss": 1.0685, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.14871220604703247, |
|
"grad_norm": 3.687634229660034, |
|
"learning_rate": 1.9729925157262554e-06, |
|
"loss": 1.1542, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.14916013437849945, |
|
"grad_norm": 3.637235403060913, |
|
"learning_rate": 1.9518572006210484e-06, |
|
"loss": 1.1365, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.1496080627099664, |
|
"grad_norm": 3.113184690475464, |
|
"learning_rate": 1.9307958001234794e-06, |
|
"loss": 1.0218, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.15005599104143338, |
|
"grad_norm": 4.447634220123291, |
|
"learning_rate": 1.9098091799852347e-06, |
|
"loss": 1.222, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.15050391937290034, |
|
"grad_norm": 3.8236501216888428, |
|
"learning_rate": 1.8888982028840636e-06, |
|
"loss": 1.2012, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.15095184770436731, |
|
"grad_norm": 5.108892440795898, |
|
"learning_rate": 1.8680637283883355e-06, |
|
"loss": 1.0181, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.15139977603583427, |
|
"grad_norm": 3.81886887550354, |
|
"learning_rate": 1.8473066129216927e-06, |
|
"loss": 1.125, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.15184770436730124, |
|
"grad_norm": 4.7799835205078125, |
|
"learning_rate": 1.8266277097278527e-06, |
|
"loss": 1.1038, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.1522956326987682, |
|
"grad_norm": 6.478558540344238, |
|
"learning_rate": 1.8060278688355313e-06, |
|
"loss": 0.9218, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.15274356103023518, |
|
"grad_norm": 4.482583522796631, |
|
"learning_rate": 1.7855079370235043e-06, |
|
"loss": 1.0629, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.15319148936170213, |
|
"grad_norm": 2.6053950786590576, |
|
"learning_rate": 1.7650687577857972e-06, |
|
"loss": 1.1975, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.1536394176931691, |
|
"grad_norm": 4.930041313171387, |
|
"learning_rate": 1.7447111712970138e-06, |
|
"loss": 1.0566, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.15408734602463606, |
|
"grad_norm": 4.492660045623779, |
|
"learning_rate": 1.7244360143778004e-06, |
|
"loss": 1.1441, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.15453527435610304, |
|
"grad_norm": 4.847555637359619, |
|
"learning_rate": 1.704244120460443e-06, |
|
"loss": 1.231, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.15453527435610304, |
|
"eval_loss": 1.198148488998413, |
|
"eval_runtime": 51.6757, |
|
"eval_samples_per_second": 9.676, |
|
"eval_steps_per_second": 9.676, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.15498320268757, |
|
"grad_norm": 5.320653438568115, |
|
"learning_rate": 1.6841363195546162e-06, |
|
"loss": 0.996, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.15543113101903697, |
|
"grad_norm": 4.333999156951904, |
|
"learning_rate": 1.6641134382132576e-06, |
|
"loss": 1.2536, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.15587905935050392, |
|
"grad_norm": 6.867399215698242, |
|
"learning_rate": 1.6441762994985947e-06, |
|
"loss": 1.1461, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.1563269876819709, |
|
"grad_norm": 3.2110917568206787, |
|
"learning_rate": 1.6243257229483141e-06, |
|
"loss": 1.1086, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.15677491601343785, |
|
"grad_norm": 3.345970630645752, |
|
"learning_rate": 1.6045625245418648e-06, |
|
"loss": 0.9485, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.15722284434490483, |
|
"grad_norm": 4.890392780303955, |
|
"learning_rate": 1.584887516666928e-06, |
|
"loss": 1.0968, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.15767077267637178, |
|
"grad_norm": 5.448171615600586, |
|
"learning_rate": 1.565301508086015e-06, |
|
"loss": 1.1305, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.15811870100783876, |
|
"grad_norm": 7.16267728805542, |
|
"learning_rate": 1.5458053039032263e-06, |
|
"loss": 1.2279, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.1585666293393057, |
|
"grad_norm": 5.2700018882751465, |
|
"learning_rate": 1.5263997055311536e-06, |
|
"loss": 1.0474, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.1590145576707727, |
|
"grad_norm": 5.955024719238281, |
|
"learning_rate": 1.5070855106579404e-06, |
|
"loss": 1.1283, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.15946248600223964, |
|
"grad_norm": 2.882784366607666, |
|
"learning_rate": 1.4878635132144885e-06, |
|
"loss": 0.9112, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.15991041433370662, |
|
"grad_norm": 4.2263875007629395, |
|
"learning_rate": 1.4687345033418258e-06, |
|
"loss": 1.1554, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.16035834266517357, |
|
"grad_norm": 4.622799396514893, |
|
"learning_rate": 1.4496992673586262e-06, |
|
"loss": 1.3423, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.16080627099664055, |
|
"grad_norm": 5.2950897216796875, |
|
"learning_rate": 1.4307585877288822e-06, |
|
"loss": 1.0494, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.1612541993281075, |
|
"grad_norm": 5.289889335632324, |
|
"learning_rate": 1.4119132430297496e-06, |
|
"loss": 1.1448, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.1612541993281075, |
|
"eval_loss": 1.1965739727020264, |
|
"eval_runtime": 51.7182, |
|
"eval_samples_per_second": 9.668, |
|
"eval_steps_per_second": 9.668, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.16170212765957448, |
|
"grad_norm": 6.415092468261719, |
|
"learning_rate": 1.3931640079195365e-06, |
|
"loss": 1.0204, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.16215005599104143, |
|
"grad_norm": 3.348160743713379, |
|
"learning_rate": 1.3745116531058645e-06, |
|
"loss": 1.1308, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.1625979843225084, |
|
"grad_norm": 6.698293209075928, |
|
"learning_rate": 1.3559569453139797e-06, |
|
"loss": 0.9401, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.16304591265397536, |
|
"grad_norm": 3.5045154094696045, |
|
"learning_rate": 1.3375006472552483e-06, |
|
"loss": 1.152, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.16349384098544234, |
|
"grad_norm": 4.656421661376953, |
|
"learning_rate": 1.3191435175957945e-06, |
|
"loss": 1.1775, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.1639417693169093, |
|
"grad_norm": 8.8998384475708, |
|
"learning_rate": 1.3008863109253174e-06, |
|
"loss": 1.0061, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.16438969764837627, |
|
"grad_norm": 3.5046370029449463, |
|
"learning_rate": 1.282729777726078e-06, |
|
"loss": 1.1871, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.16483762597984322, |
|
"grad_norm": 4.024252891540527, |
|
"learning_rate": 1.2646746643420392e-06, |
|
"loss": 1.2593, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.1652855543113102, |
|
"grad_norm": 4.861652851104736, |
|
"learning_rate": 1.2467217129481952e-06, |
|
"loss": 1.1068, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.16573348264277715, |
|
"grad_norm": 6.007284641265869, |
|
"learning_rate": 1.2288716615200617e-06, |
|
"loss": 1.0237, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.16618141097424413, |
|
"grad_norm": 4.506286144256592, |
|
"learning_rate": 1.2111252438033404e-06, |
|
"loss": 1.0827, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.16662933930571108, |
|
"grad_norm": 7.5774102210998535, |
|
"learning_rate": 1.1934831892837524e-06, |
|
"loss": 1.2481, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.16707726763717806, |
|
"grad_norm": 4.199349880218506, |
|
"learning_rate": 1.1759462231570618e-06, |
|
"loss": 1.1948, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.167525195968645, |
|
"grad_norm": 3.675760269165039, |
|
"learning_rate": 1.1585150662992578e-06, |
|
"loss": 0.8945, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.167973124300112, |
|
"grad_norm": 4.647981643676758, |
|
"learning_rate": 1.1411904352369262e-06, |
|
"loss": 1.0746, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.167973124300112, |
|
"eval_loss": 1.1958056688308716, |
|
"eval_runtime": 51.7591, |
|
"eval_samples_per_second": 9.66, |
|
"eval_steps_per_second": 9.66, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.16842105263157894, |
|
"grad_norm": 2.354313611984253, |
|
"learning_rate": 1.1239730421177952e-06, |
|
"loss": 1.0362, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.16886898096304592, |
|
"grad_norm": 4.00113582611084, |
|
"learning_rate": 1.1068635946814569e-06, |
|
"loss": 1.0924, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.16931690929451287, |
|
"grad_norm": 3.765235185623169, |
|
"learning_rate": 1.0898627962302831e-06, |
|
"loss": 1.3452, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.16976483762597985, |
|
"grad_norm": 3.814605236053467, |
|
"learning_rate": 1.072971345600513e-06, |
|
"loss": 1.0048, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"grad_norm": 3.447803020477295, |
|
"learning_rate": 1.056189937133522e-06, |
|
"loss": 1.149, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.17066069428891378, |
|
"grad_norm": 7.1337714195251465, |
|
"learning_rate": 1.0395192606472822e-06, |
|
"loss": 1.1497, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.17110862262038073, |
|
"grad_norm": 5.239931583404541, |
|
"learning_rate": 1.0229600014080101e-06, |
|
"loss": 0.9874, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.1715565509518477, |
|
"grad_norm": 3.4100687503814697, |
|
"learning_rate": 1.006512840101995e-06, |
|
"loss": 1.0393, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.17200447928331467, |
|
"grad_norm": 4.527777671813965, |
|
"learning_rate": 9.90178452807619e-07, |
|
"loss": 0.968, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.17245240761478164, |
|
"grad_norm": 3.7964625358581543, |
|
"learning_rate": 9.739575109675674e-07, |
|
"loss": 1.1207, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.1729003359462486, |
|
"grad_norm": 4.329505920410156, |
|
"learning_rate": 9.578506813612243e-07, |
|
"loss": 1.0924, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.17334826427771557, |
|
"grad_norm": 3.9827823638916016, |
|
"learning_rate": 9.418586260772695e-07, |
|
"loss": 1.0937, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.17379619260918253, |
|
"grad_norm": 4.150352954864502, |
|
"learning_rate": 9.259820024864594e-07, |
|
"loss": 1.2071, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.1742441209406495, |
|
"grad_norm": 2.648918867111206, |
|
"learning_rate": 9.102214632146059e-07, |
|
"loss": 1.1754, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.17469204927211646, |
|
"grad_norm": 5.348718166351318, |
|
"learning_rate": 8.94577656115746e-07, |
|
"loss": 1.1031, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.17469204927211646, |
|
"eval_loss": 1.1968835592269897, |
|
"eval_runtime": 51.6518, |
|
"eval_samples_per_second": 9.68, |
|
"eval_steps_per_second": 9.68, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.17513997760358344, |
|
"grad_norm": 6.799318313598633, |
|
"learning_rate": 8.790512242455198e-07, |
|
"loss": 1.1188, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.1755879059350504, |
|
"grad_norm": 4.05487060546875, |
|
"learning_rate": 8.636428058347274e-07, |
|
"loss": 1.3045, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.17603583426651737, |
|
"grad_norm": 4.513579845428467, |
|
"learning_rate": 8.483530342630993e-07, |
|
"loss": 1.2577, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.17648376259798432, |
|
"grad_norm": 7.971194267272949, |
|
"learning_rate": 8.331825380332599e-07, |
|
"loss": 1.1376, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.1769316909294513, |
|
"grad_norm": 3.740802764892578, |
|
"learning_rate": 8.181319407448884e-07, |
|
"loss": 1.1413, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.17737961926091825, |
|
"grad_norm": 3.431658983230591, |
|
"learning_rate": 8.032018610690914e-07, |
|
"loss": 1.0802, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.17782754759238523, |
|
"grad_norm": 3.8207449913024902, |
|
"learning_rate": 7.883929127229665e-07, |
|
"loss": 1.173, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.17827547592385218, |
|
"grad_norm": 3.088942289352417, |
|
"learning_rate": 7.737057044443793e-07, |
|
"loss": 1.1144, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.17872340425531916, |
|
"grad_norm": 3.705589532852173, |
|
"learning_rate": 7.591408399669337e-07, |
|
"loss": 1.2676, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.1791713325867861, |
|
"grad_norm": 4.925235271453857, |
|
"learning_rate": 7.446989179951632e-07, |
|
"loss": 1.0197, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1796192609182531, |
|
"grad_norm": 4.373708248138428, |
|
"learning_rate": 7.303805321799146e-07, |
|
"loss": 1.0041, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.18006718924972004, |
|
"grad_norm": 4.23321008682251, |
|
"learning_rate": 7.161862710939476e-07, |
|
"loss": 1.0504, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.18051511758118702, |
|
"grad_norm": 6.634941101074219, |
|
"learning_rate": 7.021167182077403e-07, |
|
"loss": 1.062, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.18096304591265397, |
|
"grad_norm": 12.015007972717285, |
|
"learning_rate": 6.881724518655049e-07, |
|
"loss": 1.3095, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.18141097424412095, |
|
"grad_norm": 5.376244068145752, |
|
"learning_rate": 6.743540452614152e-07, |
|
"loss": 1.0552, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.18141097424412095, |
|
"eval_loss": 1.1952238082885742, |
|
"eval_runtime": 51.6946, |
|
"eval_samples_per_second": 9.672, |
|
"eval_steps_per_second": 9.672, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.1818589025755879, |
|
"grad_norm": 5.1148858070373535, |
|
"learning_rate": 6.606620664160438e-07, |
|
"loss": 1.0796, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.18230683090705488, |
|
"grad_norm": 3.497487783432007, |
|
"learning_rate": 6.470970781530139e-07, |
|
"loss": 1.0996, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.18275475923852183, |
|
"grad_norm": 4.02069616317749, |
|
"learning_rate": 6.336596380758604e-07, |
|
"loss": 1.18, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.1832026875699888, |
|
"grad_norm": 4.936882495880127, |
|
"learning_rate": 6.203502985451152e-07, |
|
"loss": 1.1434, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.18365061590145576, |
|
"grad_norm": 3.6114046573638916, |
|
"learning_rate": 6.071696066555978e-07, |
|
"loss": 1.1957, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.18409854423292274, |
|
"grad_norm": 3.0989315509796143, |
|
"learning_rate": 5.941181042139258e-07, |
|
"loss": 1.1672, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.1845464725643897, |
|
"grad_norm": 3.9395434856414795, |
|
"learning_rate": 5.811963277162466e-07, |
|
"loss": 1.3213, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.18499440089585667, |
|
"grad_norm": 3.7421300411224365, |
|
"learning_rate": 5.684048083261789e-07, |
|
"loss": 0.9563, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.18544232922732362, |
|
"grad_norm": 3.190976858139038, |
|
"learning_rate": 5.557440718529848e-07, |
|
"loss": 1.1234, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.1858902575587906, |
|
"grad_norm": 3.461064100265503, |
|
"learning_rate": 5.432146387299522e-07, |
|
"loss": 1.0016, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.18633818589025755, |
|
"grad_norm": 6.645826816558838, |
|
"learning_rate": 5.308170239930022e-07, |
|
"loss": 1.1967, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.18678611422172453, |
|
"grad_norm": 4.823378562927246, |
|
"learning_rate": 5.185517372595187e-07, |
|
"loss": 1.032, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.18723404255319148, |
|
"grad_norm": 3.5760250091552734, |
|
"learning_rate": 5.064192827073995e-07, |
|
"loss": 1.1513, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.18768197088465846, |
|
"grad_norm": 3.162781000137329, |
|
"learning_rate": 4.944201590543308e-07, |
|
"loss": 0.9593, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.1881298992161254, |
|
"grad_norm": 8.633989334106445, |
|
"learning_rate": 4.825548595372898e-07, |
|
"loss": 1.2696, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.1881298992161254, |
|
"eval_loss": 1.1959577798843384, |
|
"eval_runtime": 51.6407, |
|
"eval_samples_per_second": 9.682, |
|
"eval_steps_per_second": 9.682, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.1885778275475924, |
|
"grad_norm": 4.277423858642578, |
|
"learning_rate": 4.7082387189226646e-07, |
|
"loss": 1.0834, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.18902575587905934, |
|
"grad_norm": 3.7345645427703857, |
|
"learning_rate": 4.5922767833421454e-07, |
|
"loss": 1.255, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.18947368421052632, |
|
"grad_norm": 5.163575172424316, |
|
"learning_rate": 4.477667555372326e-07, |
|
"loss": 1.1317, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.18992161254199327, |
|
"grad_norm": 5.2220892906188965, |
|
"learning_rate": 4.364415746149678e-07, |
|
"loss": 1.0966, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.19036954087346025, |
|
"grad_norm": 5.796306610107422, |
|
"learning_rate": 4.2525260110124964e-07, |
|
"loss": 1.0268, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.1908174692049272, |
|
"grad_norm": 4.295403003692627, |
|
"learning_rate": 4.1420029493095623e-07, |
|
"loss": 1.0465, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.19126539753639418, |
|
"grad_norm": 5.671868324279785, |
|
"learning_rate": 4.032851104211036e-07, |
|
"loss": 1.2124, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.19171332586786113, |
|
"grad_norm": 4.053644180297852, |
|
"learning_rate": 3.925074962521762e-07, |
|
"loss": 1.0574, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.1921612541993281, |
|
"grad_norm": 3.7694053649902344, |
|
"learning_rate": 3.818678954496787e-07, |
|
"loss": 1.0604, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.19260918253079506, |
|
"grad_norm": 4.982527256011963, |
|
"learning_rate": 3.713667453659287e-07, |
|
"loss": 1.1518, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.19305711086226204, |
|
"grad_norm": 5.036848545074463, |
|
"learning_rate": 3.6100447766207473e-07, |
|
"loss": 1.0251, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.193505039193729, |
|
"grad_norm": 5.744006633758545, |
|
"learning_rate": 3.5078151829035693e-07, |
|
"loss": 1.0103, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.19395296752519597, |
|
"grad_norm": 3.843419075012207, |
|
"learning_rate": 3.4069828747659405e-07, |
|
"loss": 1.0053, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.19440089585666293, |
|
"grad_norm": 4.357511043548584, |
|
"learning_rate": 3.3075519970291144e-07, |
|
"loss": 1.202, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.1948488241881299, |
|
"grad_norm": 6.164062976837158, |
|
"learning_rate": 3.209526636907036e-07, |
|
"loss": 1.1136, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.1948488241881299, |
|
"eval_loss": 1.1951868534088135, |
|
"eval_runtime": 51.6432, |
|
"eval_samples_per_second": 9.682, |
|
"eval_steps_per_second": 9.682, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.19529675251959686, |
|
"grad_norm": 3.893348217010498, |
|
"learning_rate": 3.1129108238383095e-07, |
|
"loss": 1.2238, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.19574468085106383, |
|
"grad_norm": 3.704392433166504, |
|
"learning_rate": 3.017708529320604e-07, |
|
"loss": 1.0766, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.19619260918253079, |
|
"grad_norm": 4.406269073486328, |
|
"learning_rate": 2.923923666747357e-07, |
|
"loss": 0.9588, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.19664053751399777, |
|
"grad_norm": 6.578729152679443, |
|
"learning_rate": 2.8315600912469477e-07, |
|
"loss": 1.1622, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.19708846584546472, |
|
"grad_norm": 4.1804094314575195, |
|
"learning_rate": 2.740621599524189e-07, |
|
"loss": 1.1999, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.1975363941769317, |
|
"grad_norm": 6.192513465881348, |
|
"learning_rate": 2.651111929704303e-07, |
|
"loss": 1.1274, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.19798432250839865, |
|
"grad_norm": 4.356874942779541, |
|
"learning_rate": 2.563034761179223e-07, |
|
"loss": 1.0262, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.19843225083986563, |
|
"grad_norm": 4.435469627380371, |
|
"learning_rate": 2.476393714456384e-07, |
|
"loss": 1.1814, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.19888017917133258, |
|
"grad_norm": 3.9173505306243896, |
|
"learning_rate": 2.391192351009855e-07, |
|
"loss": 0.7984, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.19932810750279956, |
|
"grad_norm": 6.546506881713867, |
|
"learning_rate": 2.3074341731339837e-07, |
|
"loss": 1.168, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.1997760358342665, |
|
"grad_norm": 6.1646223068237305, |
|
"learning_rate": 2.225122623799407e-07, |
|
"loss": 1.2589, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.2002239641657335, |
|
"grad_norm": 3.210203170776367, |
|
"learning_rate": 2.1442610865115135e-07, |
|
"loss": 1.0636, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.20067189249720044, |
|
"grad_norm": 5.133816242218018, |
|
"learning_rate": 2.0648528851714077e-07, |
|
"loss": 1.0195, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.20111982082866742, |
|
"grad_norm": 4.449398517608643, |
|
"learning_rate": 1.9869012839392064e-07, |
|
"loss": 1.1007, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.20156774916013437, |
|
"grad_norm": 4.8083977699279785, |
|
"learning_rate": 1.9104094870999264e-07, |
|
"loss": 1.1975, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.20156774916013437, |
|
"eval_loss": 1.1950809955596924, |
|
"eval_runtime": 51.7311, |
|
"eval_samples_per_second": 9.665, |
|
"eval_steps_per_second": 9.665, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.20201567749160135, |
|
"grad_norm": 4.709386348724365, |
|
"learning_rate": 1.8353806389317428e-07, |
|
"loss": 0.9829, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.2024636058230683, |
|
"grad_norm": 5.23099946975708, |
|
"learning_rate": 1.761817823576731e-07, |
|
"loss": 1.1149, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.20291153415453528, |
|
"grad_norm": 3.4107179641723633, |
|
"learning_rate": 1.6897240649141125e-07, |
|
"loss": 0.9822, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.20335946248600223, |
|
"grad_norm": 3.951052188873291, |
|
"learning_rate": 1.619102326435923e-07, |
|
"loss": 1.2333, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.2038073908174692, |
|
"grad_norm": 4.30809211730957, |
|
"learning_rate": 1.5499555111252285e-07, |
|
"loss": 1.0641, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.20425531914893616, |
|
"grad_norm": 4.1274189949035645, |
|
"learning_rate": 1.4822864613367766e-07, |
|
"loss": 1.0962, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.20470324748040314, |
|
"grad_norm": 6.046044826507568, |
|
"learning_rate": 1.4160979586801724e-07, |
|
"loss": 1.0241, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.2051511758118701, |
|
"grad_norm": 4.066288471221924, |
|
"learning_rate": 1.3513927239055036e-07, |
|
"loss": 0.9061, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.20559910414333707, |
|
"grad_norm": 3.9250218868255615, |
|
"learning_rate": 1.2881734167915425e-07, |
|
"loss": 1.1666, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.20604703247480402, |
|
"grad_norm": 4.965548515319824, |
|
"learning_rate": 1.2264426360363956e-07, |
|
"loss": 0.8048, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.206494960806271, |
|
"grad_norm": 5.192389965057373, |
|
"learning_rate": 1.1662029191506775e-07, |
|
"loss": 0.9869, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.20694288913773795, |
|
"grad_norm": 4.953862190246582, |
|
"learning_rate": 1.107456742353201e-07, |
|
"loss": 1.0042, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.20739081746920493, |
|
"grad_norm": 4.955436706542969, |
|
"learning_rate": 1.0502065204692062e-07, |
|
"loss": 1.101, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.20783874580067188, |
|
"grad_norm": 2.5195674896240234, |
|
"learning_rate": 9.94454606831076e-08, |
|
"loss": 0.9542, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.20828667413213886, |
|
"grad_norm": 4.142997741699219, |
|
"learning_rate": 9.402032931816144e-08, |
|
"loss": 1.1318, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.20828667413213886, |
|
"eval_loss": 1.1947814226150513, |
|
"eval_runtime": 51.8063, |
|
"eval_samples_per_second": 9.651, |
|
"eval_steps_per_second": 9.651, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.2087346024636058, |
|
"grad_norm": 4.046876907348633, |
|
"learning_rate": 8.874548095798464e-08, |
|
"loss": 1.1393, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.2091825307950728, |
|
"grad_norm": 4.740685939788818, |
|
"learning_rate": 8.362113243093245e-08, |
|
"loss": 1.0529, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.20963045912653974, |
|
"grad_norm": 6.356805324554443, |
|
"learning_rate": 7.864749437890173e-08, |
|
"loss": 1.2791, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.21007838745800672, |
|
"grad_norm": 4.329228401184082, |
|
"learning_rate": 7.382477124867282e-08, |
|
"loss": 1.2672, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 5.217611312866211, |
|
"learning_rate": 6.915316128350461e-08, |
|
"loss": 0.9357, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.21097424412094065, |
|
"grad_norm": 5.418657302856445, |
|
"learning_rate": 6.463285651498563e-08, |
|
"loss": 1.011, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.2114221724524076, |
|
"grad_norm": 6.056429386138916, |
|
"learning_rate": 6.026404275513875e-08, |
|
"loss": 1.4377, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.21187010078387458, |
|
"grad_norm": 3.5456736087799072, |
|
"learning_rate": 5.604689958878723e-08, |
|
"loss": 1.1192, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.21231802911534153, |
|
"grad_norm": 5.697049140930176, |
|
"learning_rate": 5.198160036616898e-08, |
|
"loss": 1.0392, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 4.248316764831543, |
|
"learning_rate": 4.8068312195811847e-08, |
|
"loss": 1.0041, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.21321388577827546, |
|
"grad_norm": 3.3937604427337646, |
|
"learning_rate": 4.4307195937666194e-08, |
|
"loss": 0.9791, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.21366181410974244, |
|
"grad_norm": 3.097196340560913, |
|
"learning_rate": 4.069840619648935e-08, |
|
"loss": 1.1306, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.2141097424412094, |
|
"grad_norm": 5.534854888916016, |
|
"learning_rate": 3.72420913154932e-08, |
|
"loss": 1.104, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.21455767077267637, |
|
"grad_norm": 5.693947792053223, |
|
"learning_rate": 3.3938393370244876e-08, |
|
"loss": 1.1541, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.21500559910414332, |
|
"grad_norm": 4.025967597961426, |
|
"learning_rate": 3.078744816282731e-08, |
|
"loss": 1.1515, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.21500559910414332, |
|
"eval_loss": 1.1954809427261353, |
|
"eval_runtime": 51.6284, |
|
"eval_samples_per_second": 9.685, |
|
"eval_steps_per_second": 9.685, |
|
"step": 4800 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.204448348803072e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|