|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 15.988870339454646, |
|
"eval_steps": 1000, |
|
"global_step": 3584, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.044518642181413465, |
|
"grad_norm": 33.75, |
|
"learning_rate": 2.785515320334262e-07, |
|
"loss": 2.0545, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08903728436282693, |
|
"grad_norm": 28.125, |
|
"learning_rate": 5.571030640668524e-07, |
|
"loss": 2.0294, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1335559265442404, |
|
"grad_norm": 20.875, |
|
"learning_rate": 8.356545961002786e-07, |
|
"loss": 1.9841, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.17807456872565386, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.1142061281337048e-06, |
|
"loss": 1.8176, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22259321090706732, |
|
"grad_norm": 23.5, |
|
"learning_rate": 1.392757660167131e-06, |
|
"loss": 1.6325, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2671118530884808, |
|
"grad_norm": 19.25, |
|
"learning_rate": 1.6713091922005572e-06, |
|
"loss": 1.6341, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3116304952698943, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.9498607242339835e-06, |
|
"loss": 1.4943, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3561491374513077, |
|
"grad_norm": 12.0, |
|
"learning_rate": 2.2284122562674097e-06, |
|
"loss": 1.4708, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4006677796327212, |
|
"grad_norm": 11.125, |
|
"learning_rate": 2.506963788300836e-06, |
|
"loss": 1.415, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.44518642181413465, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 2.785515320334262e-06, |
|
"loss": 1.4596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.48970506399554814, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 3.064066852367688e-06, |
|
"loss": 1.4339, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5342237061769616, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.3426183844011143e-06, |
|
"loss": 1.4009, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5787423483583751, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 3.6211699164345405e-06, |
|
"loss": 1.3688, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6232609905397886, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 3.899721448467967e-06, |
|
"loss": 1.3595, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.667779632721202, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.178272980501394e-06, |
|
"loss": 1.3609, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7122982749026154, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.456824512534819e-06, |
|
"loss": 1.3777, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.756816917084029, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.735376044568246e-06, |
|
"loss": 1.3374, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8013355592654424, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5.013927576601672e-06, |
|
"loss": 1.3524, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8458542014468559, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.292479108635098e-06, |
|
"loss": 1.3153, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8903728436282693, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.571030640668524e-06, |
|
"loss": 1.3519, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9348914858096828, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 5.849582172701951e-06, |
|
"loss": 1.348, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9794101279910963, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.128133704735376e-06, |
|
"loss": 1.3062, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0261547022815805, |
|
"grad_norm": 2.5, |
|
"learning_rate": 6.406685236768803e-06, |
|
"loss": 1.4358, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.070673344462994, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.685236768802229e-06, |
|
"loss": 1.2481, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.1151919866444073, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.963788300835655e-06, |
|
"loss": 1.2833, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1597106288258208, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.242339832869081e-06, |
|
"loss": 1.1941, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2042292710072342, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.5208913649025075e-06, |
|
"loss": 1.2831, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2487479131886476, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 7.799442896935934e-06, |
|
"loss": 1.2854, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.293266555370061, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.07799442896936e-06, |
|
"loss": 1.257, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.3377851975514747, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 8.356545961002787e-06, |
|
"loss": 1.2468, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3823038397328882, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 8.635097493036211e-06, |
|
"loss": 1.2743, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.4268224819143016, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.913649025069639e-06, |
|
"loss": 1.2265, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.471341124095715, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.192200557103064e-06, |
|
"loss": 1.2898, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.5158597662771287, |
|
"grad_norm": 2.0, |
|
"learning_rate": 9.470752089136492e-06, |
|
"loss": 1.2406, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5603784084585421, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.749303621169918e-06, |
|
"loss": 1.2098, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.6048970506399556, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 9.9999976276417e-06, |
|
"loss": 1.2067, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.649415692821369, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 9.999712947369595e-06, |
|
"loss": 1.2338, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.6939343350027825, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.998953826391322e-06, |
|
"loss": 1.2546, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.738452977184196, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 9.997720336742596e-06, |
|
"loss": 1.201, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7829716193656093, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.996012595473676e-06, |
|
"loss": 1.1761, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.8274902615470228, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 9.993830764638262e-06, |
|
"loss": 1.1884, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8720089037284362, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.991175051278111e-06, |
|
"loss": 1.1951, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.9165275459098496, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.988045707403394e-06, |
|
"loss": 1.175, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.961046188091263, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.984443029968786e-06, |
|
"loss": 1.2045, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.0077907623817475, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.980367360845278e-06, |
|
"loss": 1.3052, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.052309404563161, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 9.975819086787743e-06, |
|
"loss": 1.1092, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.0968280467445743, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.970798639398228e-06, |
|
"loss": 1.1435, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.141346688925988, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 9.965306495085005e-06, |
|
"loss": 1.0927, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.185865331107401, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.959343175017362e-06, |
|
"loss": 1.0692, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.2303839732888147, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 9.952909245076141e-06, |
|
"loss": 1.0603, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.274902615470228, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.946005315800047e-06, |
|
"loss": 1.0717, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.3194212576516415, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 9.93863204232771e-06, |
|
"loss": 1.0808, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.363939899833055, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 9.930790124335511e-06, |
|
"loss": 1.0297, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.4084585420144684, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 9.922480305971193e-06, |
|
"loss": 1.0481, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.452977184195882, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 9.91370337578325e-06, |
|
"loss": 1.0919, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.4974958263772953, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.904460166646084e-06, |
|
"loss": 1.0835, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.542014468558709, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.894751555680988e-06, |
|
"loss": 1.0336, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.586533110740122, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.884578464172901e-06, |
|
"loss": 1.0728, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.631051752921536, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.873941857482988e-06, |
|
"loss": 1.0493, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.6755703951029495, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.862842744957037e-06, |
|
"loss": 1.0346, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.720089037284363, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 9.85128217982967e-06, |
|
"loss": 1.0483, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.7646076794657763, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.8392612591244e-06, |
|
"loss": 1.0384, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.80912632164719, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 9.826781123549542e-06, |
|
"loss": 1.0266, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.853644963828603, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.813842957389953e-06, |
|
"loss": 1.0352, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.8981636060100167, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.800447988394657e-06, |
|
"loss": 1.009, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.94268224819143, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.786597487660336e-06, |
|
"loss": 1.0834, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.9872008903728435, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.772292769510718e-06, |
|
"loss": 1.0735, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.033945464663328, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.75753519137185e-06, |
|
"loss": 1.0532, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.0784641068447414, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 9.742326153643285e-06, |
|
"loss": 0.9169, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.122982749026155, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.726667099565202e-06, |
|
"loss": 0.9443, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.1675013912075682, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.710559515081446e-06, |
|
"loss": 0.9023, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.2120200333889817, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.69400492869852e-06, |
|
"loss": 0.9227, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.256538675570395, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.677004911340539e-06, |
|
"loss": 0.9329, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.3010573177518086, |
|
"grad_norm": 1.875, |
|
"learning_rate": 9.659561076200173e-06, |
|
"loss": 0.903, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.345575959933222, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.64167507858554e-06, |
|
"loss": 0.9046, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.3900946021146354, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 9.62334861576315e-06, |
|
"loss": 0.927, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.434613244296049, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.604583426796837e-06, |
|
"loss": 0.9274, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.4791318864774623, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.585381292382734e-06, |
|
"loss": 0.9127, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.5236505286588757, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.565744034680291e-06, |
|
"loss": 0.9269, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.5681691708402896, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.545673517139376e-06, |
|
"loss": 0.8863, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.6126878130217026, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.52517164432343e-06, |
|
"loss": 0.8776, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.6572064552031165, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.50424036172875e-06, |
|
"loss": 0.9424, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.70172509738453, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.482881655599867e-06, |
|
"loss": 0.8712, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.7462437395659434, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 9.461097552741065e-06, |
|
"loss": 0.9157, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.790762381747357, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 9.438890120324049e-06, |
|
"loss": 0.8571, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.8352810239287702, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 9.416261465691786e-06, |
|
"loss": 0.861, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.8797996661101837, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 9.393213736158532e-06, |
|
"loss": 0.8952, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.924318308291597, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 9.369749118806063e-06, |
|
"loss": 0.8598, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.9688369504730105, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.345869840276138e-06, |
|
"loss": 0.8614, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.015581524763495, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 9.321578166559202e-06, |
|
"loss": 0.8842, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.060100166944908, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.296876402779357e-06, |
|
"loss": 0.7889, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.104618809126322, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.271766892975632e-06, |
|
"loss": 0.8188, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.149137451307735, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 9.246252019879526e-06, |
|
"loss": 0.7822, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.193656093489149, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.22033420468893e-06, |
|
"loss": 0.8268, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.238174735670562, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.194015906838345e-06, |
|
"loss": 0.7838, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.282693377851976, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.167299623765515e-06, |
|
"loss": 0.7691, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.3272120200333895, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.14018789067443e-06, |
|
"loss": 0.7575, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.371730662214802, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.11268328029475e-06, |
|
"loss": 0.8305, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.416249304396216, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.08478840263767e-06, |
|
"loss": 0.7607, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.460767946577629, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.05650590474825e-06, |
|
"loss": 0.7759, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.460767946577629, |
|
"eval_loss": 1.0106589794158936, |
|
"eval_runtime": 46.0703, |
|
"eval_samples_per_second": 8.682, |
|
"eval_steps_per_second": 8.682, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.505286588759043, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.027838470454222e-06, |
|
"loss": 0.7025, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.549805230940456, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 8.998788820111323e-06, |
|
"loss": 0.776, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.59432387312187, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 8.969359710345132e-06, |
|
"loss": 0.8328, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.638842515303283, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 8.939553933789499e-06, |
|
"loss": 0.7564, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.683361157484697, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 8.90937431882154e-06, |
|
"loss": 0.7684, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.72787979966611, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 8.878823729293238e-06, |
|
"loss": 0.8135, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.772398441847524, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 8.847905064259683e-06, |
|
"loss": 0.8271, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.816917084028937, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 8.816621257703969e-06, |
|
"loss": 0.8179, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.861435726210351, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 8.784975278258783e-06, |
|
"loss": 0.7721, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.905954368391764, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 8.752970128924696e-06, |
|
"loss": 0.7752, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.950473010573178, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.7206088467852e-06, |
|
"loss": 0.788, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.994991652754591, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.687894502718503e-06, |
|
"loss": 0.8012, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.041736227045075, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.654830201106133e-06, |
|
"loss": 0.8055, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 5.086254869226488, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 8.621419079538337e-06, |
|
"loss": 0.7483, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.130773511407902, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 8.587664308516361e-06, |
|
"loss": 0.7349, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.175292153589315, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 8.553569091151576e-06, |
|
"loss": 0.7454, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.219810795770729, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.519136662861531e-06, |
|
"loss": 0.6866, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.264329437952142, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.484370291062927e-06, |
|
"loss": 0.7269, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.308848080133556, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.449273274861566e-06, |
|
"loss": 0.6977, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 5.353366722314969, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.413848944739282e-06, |
|
"loss": 0.6814, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.397885364496383, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.378100662237904e-06, |
|
"loss": 0.7206, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 5.442404006677796, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.342031819640263e-06, |
|
"loss": 0.7317, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 5.48692264885921, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.305645839648287e-06, |
|
"loss": 0.7149, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 5.531441291040624, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.268946175058214e-06, |
|
"loss": 0.6568, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 5.575959933222037, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.231936308432935e-06, |
|
"loss": 0.7292, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.6204785754034505, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 8.194619751771527e-06, |
|
"loss": 0.6966, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 5.6649972175848635, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.157000046175984e-06, |
|
"loss": 0.7128, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 5.709515859766277, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.119080761515197e-06, |
|
"loss": 0.7343, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 5.75403450194769, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.080865496086177e-06, |
|
"loss": 0.7454, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.798553144129104, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 8.042357876272626e-06, |
|
"loss": 0.7337, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.843071786310517, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 8.003561556200796e-06, |
|
"loss": 0.7011, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.887590428491931, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.964480217392739e-06, |
|
"loss": 0.6969, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.932109070673344, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.925117568416966e-06, |
|
"loss": 0.7272, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.976627712854758, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.885477344536516e-06, |
|
"loss": 0.6795, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 6.023372287145242, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 7.845563307354506e-06, |
|
"loss": 0.7507, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.067890929326656, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 7.80537924445718e-06, |
|
"loss": 0.6812, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 6.112409571508069, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 7.764928969054493e-06, |
|
"loss": 0.694, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.156928213689483, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.724216319618257e-06, |
|
"loss": 0.6636, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.201446855870896, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 7.683245159517903e-06, |
|
"loss": 0.6817, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 6.24596549805231, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.642019376653858e-06, |
|
"loss": 0.6709, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.290484140233723, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.600542883088629e-06, |
|
"loss": 0.6755, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 6.3350027824151365, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 7.5588196146755526e-06, |
|
"loss": 0.7135, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 6.3795214245965495, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.5168535306853155e-06, |
|
"loss": 0.6461, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 6.424040066777963, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 7.474648613430252e-06, |
|
"loss": 0.6194, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 6.468558708959376, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.432208867886439e-06, |
|
"loss": 0.6871, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 6.51307735114079, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 7.389538321313652e-06, |
|
"loss": 0.6691, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 6.557595993322204, |
|
"grad_norm": 1.625, |
|
"learning_rate": 7.346641022873205e-06, |
|
"loss": 0.6686, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 6.602114635503617, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.303521043243711e-06, |
|
"loss": 0.648, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 6.646633277685031, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.2601824742347985e-06, |
|
"loss": 0.7131, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 6.691151919866444, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.2166294283988315e-06, |
|
"loss": 0.7121, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.735670562047858, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 7.172866038640644e-06, |
|
"loss": 0.6216, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 6.780189204229271, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.128896457825364e-06, |
|
"loss": 0.6726, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 6.824707846410685, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 7.084724858384326e-06, |
|
"loss": 0.6597, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 6.869226488592098, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.04035543191914e-06, |
|
"loss": 0.6608, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 6.913745130773512, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.995792388803929e-06, |
|
"loss": 0.6419, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.958263772954925, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.9510399577857976e-06, |
|
"loss": 0.6505, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 7.005008347245409, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.906102385583548e-06, |
|
"loss": 0.734, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 7.049526989426822, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.860983936484689e-06, |
|
"loss": 0.6262, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 7.094045631608236, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 6.815688891940796e-06, |
|
"loss": 0.6499, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 7.138564273789649, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 6.770221550161214e-06, |
|
"loss": 0.6259, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 7.183082915971063, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 6.724586225705191e-06, |
|
"loss": 0.6564, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 7.227601558152476, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 6.678787249072456e-06, |
|
"loss": 0.6358, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 7.27212020033389, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 6.632828966292279e-06, |
|
"loss": 0.6883, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 7.316638842515303, |
|
"grad_norm": 4.625, |
|
"learning_rate": 6.586715738511067e-06, |
|
"loss": 0.6618, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 7.361157484696717, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 6.540451941578505e-06, |
|
"loss": 0.6233, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 7.40567612687813, |
|
"grad_norm": 12.625, |
|
"learning_rate": 6.494041965632335e-06, |
|
"loss": 0.6973, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 7.450194769059544, |
|
"grad_norm": 11.5, |
|
"learning_rate": 6.447490214681742e-06, |
|
"loss": 0.6683, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 7.494713411240957, |
|
"grad_norm": 10.625, |
|
"learning_rate": 6.400801106189457e-06, |
|
"loss": 0.5964, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 7.539232053422371, |
|
"grad_norm": 9.875, |
|
"learning_rate": 6.353979070652555e-06, |
|
"loss": 0.6784, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 7.583750695603785, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 6.307028551182041e-06, |
|
"loss": 0.6335, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.628269337785198, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 6.259954003081215e-06, |
|
"loss": 0.6539, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 7.6727879799666105, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 6.212759893422908e-06, |
|
"loss": 0.6371, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 7.717306622148024, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 6.165450700625565e-06, |
|
"loss": 0.6426, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 7.761825264329438, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.118030914028292e-06, |
|
"loss": 0.6587, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 7.806343906510851, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 6.070505033464835e-06, |
|
"loss": 0.5994, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.850862548692265, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.022877568836579e-06, |
|
"loss": 0.6387, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 7.895381190873678, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5.975153039684579e-06, |
|
"loss": 0.6704, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 7.939899833055092, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.927335974760699e-06, |
|
"loss": 0.6274, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 7.984418475236505, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.87943091159785e-06, |
|
"loss": 0.6611, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 8.03116304952699, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.831442396079413e-06, |
|
"loss": 0.6732, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 8.075681691708404, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 5.78337498200786e-06, |
|
"loss": 0.5774, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 8.120200333889816, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 5.735233230672636e-06, |
|
"loss": 0.6312, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 8.16471897607123, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 5.687021710417308e-06, |
|
"loss": 0.6262, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 8.209237618252644, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 5.638744996206074e-06, |
|
"loss": 0.5604, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 8.253756260434058, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 5.590407669189612e-06, |
|
"loss": 0.6017, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 8.29827490261547, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.542014316270377e-06, |
|
"loss": 0.5133, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 8.342793544796884, |
|
"grad_norm": 2.5, |
|
"learning_rate": 5.493569529667312e-06, |
|
"loss": 0.5995, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 8.387312186978297, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 5.445077906480095e-06, |
|
"loss": 0.6081, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 8.431830829159711, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.396544048252893e-06, |
|
"loss": 0.6193, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 8.476349471341123, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.3479725605377065e-06, |
|
"loss": 0.568, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.520868113522537, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.299368052457332e-06, |
|
"loss": 0.5966, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 8.565386755703951, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5.250735136267993e-06, |
|
"loss": 0.6217, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 8.609905397885365, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.2020784269216515e-06, |
|
"loss": 0.554, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 8.654424040066779, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.153402541628097e-06, |
|
"loss": 0.562, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 8.698942682248191, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.1047120994167855e-06, |
|
"loss": 0.598, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 8.743461324429605, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 5.056011720698536e-06, |
|
"loss": 0.6065, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 8.787979966611019, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.007306026827076e-06, |
|
"loss": 0.5696, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 8.832498608792433, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.958599639660508e-06, |
|
"loss": 0.5824, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 8.877017250973845, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.909897181122725e-06, |
|
"loss": 0.6082, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 8.921535893155259, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 4.861203272764813e-06, |
|
"loss": 0.554, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.921535893155259, |
|
"eval_loss": 0.9391384720802307, |
|
"eval_runtime": 17.1717, |
|
"eval_samples_per_second": 23.294, |
|
"eval_steps_per_second": 23.294, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.966054535336673, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.8125225353265085e-06, |
|
"loss": 0.5373, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 9.012799109627156, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.7638595882977064e-06, |
|
"loss": 0.6353, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 9.05731775180857, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.71521904948011e-06, |
|
"loss": 0.5151, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 9.101836393989982, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.666605534549021e-06, |
|
"loss": 0.5314, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 9.146355036171396, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.618023656615352e-06, |
|
"loss": 0.5424, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 9.19087367835281, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.569478025787869e-06, |
|
"loss": 0.4959, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 9.235392320534224, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.520973248735715e-06, |
|
"loss": 0.5301, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 9.279910962715638, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.472513928251275e-06, |
|
"loss": 0.5219, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 9.32442960489705, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 4.424104662813396e-06, |
|
"loss": 0.5537, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 9.368948247078464, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.375750046151023e-06, |
|
"loss": 0.5269, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 9.413466889259878, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.3274546668072835e-06, |
|
"loss": 0.5535, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 9.457985531441292, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.279223107704058e-06, |
|
"loss": 0.5382, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 9.502504173622704, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.2310599457071e-06, |
|
"loss": 0.5643, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 9.547022815804118, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.1829697511917146e-06, |
|
"loss": 0.5493, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 9.591541457985532, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.134957087609065e-06, |
|
"loss": 0.5457, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 9.636060100166945, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.087026511053116e-06, |
|
"loss": 0.4859, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 9.680578742348358, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.0391825698283084e-06, |
|
"loss": 0.4969, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 9.725097384529771, |
|
"grad_norm": 2.125, |
|
"learning_rate": 3.991429804017944e-06, |
|
"loss": 0.5311, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 9.769616026711185, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 3.9437727450533605e-06, |
|
"loss": 0.5437, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 9.8141346688926, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.89621591528393e-06, |
|
"loss": 0.5197, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.858653311074011, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.848763827547915e-06, |
|
"loss": 0.5104, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 9.903171953255425, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 3.8014209847442345e-06, |
|
"loss": 0.55, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 9.947690595436839, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.7541918794051637e-06, |
|
"loss": 0.53, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 9.992209237618253, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.7070809932700134e-06, |
|
"loss": 0.4882, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 10.038953811908737, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.6600927968598588e-06, |
|
"loss": 0.4714, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 10.08347245409015, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.613231749053304e-06, |
|
"loss": 0.4774, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 10.127991096271563, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.5665022966633678e-06, |
|
"loss": 0.4764, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 10.172509738452977, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.519908874015501e-06, |
|
"loss": 0.4632, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 10.21702838063439, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 3.473455902526809e-06, |
|
"loss": 0.4604, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 10.261547022815805, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.4271477902864836e-06, |
|
"loss": 0.4753, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 10.306065664997218, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.3809889316375012e-06, |
|
"loss": 0.4323, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 10.35058430717863, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.334983706759627e-06, |
|
"loss": 0.4659, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 10.395102949360044, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 3.2891364812537686e-06, |
|
"loss": 0.4896, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 10.439621591541458, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.2434516057277055e-06, |
|
"loss": 0.478, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 10.484140233722872, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 3.1979334153832486e-06, |
|
"loss": 0.4453, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 10.528658875904284, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.1525862296048446e-06, |
|
"loss": 0.5075, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 10.573177518085698, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 3.1074143515497114e-06, |
|
"loss": 0.4865, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 10.617696160267112, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 3.0624220677394854e-06, |
|
"loss": 0.5178, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 10.662214802448526, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.017613647653461e-06, |
|
"loss": 0.5069, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 10.706733444629938, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.9729933433234402e-06, |
|
"loss": 0.4423, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 10.751252086811352, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 2.9285653889302514e-06, |
|
"loss": 0.4359, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 10.795770728992766, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.8843340004019427e-06, |
|
"loss": 0.4517, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 10.84028937117418, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.8403033750137255e-06, |
|
"loss": 0.4775, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 10.884808013355592, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.7964776909896733e-06, |
|
"loss": 0.5064, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 10.929326655537006, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.7528611071062366e-06, |
|
"loss": 0.4651, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 10.97384529771842, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 2.7094577622976096e-06, |
|
"loss": 0.4909, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 11.020589872008903, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.6662717752629597e-06, |
|
"loss": 0.4996, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 11.065108514190317, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.6233072440755934e-06, |
|
"loss": 0.4445, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 11.109627156371731, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.580568245794085e-06, |
|
"loss": 0.4471, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 11.154145798553143, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.538058836075373e-06, |
|
"loss": 0.49, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 11.198664440734557, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.4957830487899224e-06, |
|
"loss": 0.4148, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 11.243183082915971, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.4537448956389146e-06, |
|
"loss": 0.4247, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 11.287701725097385, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.411948365773588e-06, |
|
"loss": 0.4368, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 11.332220367278797, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.3703974254166704e-06, |
|
"loss": 0.4273, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 11.376739009460211, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.3290960174860293e-06, |
|
"loss": 0.4421, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 11.421257651641625, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 2.2880480612204925e-06, |
|
"loss": 0.4072, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 11.465776293823039, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.247257451807961e-06, |
|
"loss": 0.4472, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 11.510294936004453, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.206728060015761e-06, |
|
"loss": 0.4613, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 11.554813578185865, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 2.1664637318233484e-06, |
|
"loss": 0.4111, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 11.599332220367279, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.1264682880573374e-06, |
|
"loss": 0.4385, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 11.643850862548693, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 2.086745524028933e-06, |
|
"loss": 0.4448, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 11.688369504730106, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.0472992091737886e-06, |
|
"loss": 0.4292, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 11.732888146911518, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.0081330866942962e-06, |
|
"loss": 0.425, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 11.777406789092932, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.96925087320439e-06, |
|
"loss": 0.4311, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 11.821925431274346, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.930656258376859e-06, |
|
"loss": 0.4725, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 11.86644407345576, |
|
"grad_norm": 1.0, |
|
"learning_rate": 1.8923529045932292e-06, |
|
"loss": 0.4149, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 11.910962715637172, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.8543444465962147e-06, |
|
"loss": 0.4436, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 11.955481357818586, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.8166344911448115e-06, |
|
"loss": 0.4254, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 12.00222593210907, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.7792266166720368e-06, |
|
"loss": 0.5129, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 12.046744574290484, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.742124372945364e-06, |
|
"loss": 0.4114, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 12.091263216471898, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 1.7053312807298633e-06, |
|
"loss": 0.4351, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 12.135781858653312, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.6688508314541086e-06, |
|
"loss": 0.404, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 12.180300500834724, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.6326864868788678e-06, |
|
"loss": 0.4349, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 12.224819143016138, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 1.5968416787685919e-06, |
|
"loss": 0.4581, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 12.269337785197552, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 1.5613198085657804e-06, |
|
"loss": 0.4589, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 12.313856427378965, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.5261242470681813e-06, |
|
"loss": 0.4357, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 12.358375069560378, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.4912583341089516e-06, |
|
"loss": 0.3949, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 12.402893711741791, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 1.4567253782397073e-06, |
|
"loss": 0.4184, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 12.447412353923205, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.4225286564165785e-06, |
|
"loss": 0.4309, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 12.49193099610462, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 1.3886714136892287e-06, |
|
"loss": 0.4539, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 12.536449638286033, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.3551568628929434e-06, |
|
"loss": 0.4243, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 12.580968280467445, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 1.321988184343732e-06, |
|
"loss": 0.4039, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 12.625486922648859, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.2891685255365517e-06, |
|
"loss": 0.4182, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 12.670005564830273, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.256701000846619e-06, |
|
"loss": 0.4146, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 12.714524207011687, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.22458869123388e-06, |
|
"loss": 0.434, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 12.759042849193099, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.1928346439506526e-06, |
|
"loss": 0.4356, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 12.803561491374513, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.1614418722524506e-06, |
|
"loss": 0.4073, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 12.848080133555927, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.1304133551120532e-06, |
|
"loss": 0.4376, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 12.89259877573734, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.0997520369368158e-06, |
|
"loss": 0.4078, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 12.937117417918753, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 1.0694608272892698e-06, |
|
"loss": 0.4329, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 12.981636060100167, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.0395426006110164e-06, |
|
"loss": 0.3766, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 13.02838063439065, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.0100001959499644e-06, |
|
"loss": 0.3808, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 13.072899276572064, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.808364166909256e-07, |
|
"loss": 0.4232, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 13.117417918753478, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.520540302895847e-07, |
|
"loss": 0.4332, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 13.161936560934892, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.236557680098918e-07, |
|
"loss": 0.4059, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 13.206455203116304, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 8.956443246648771e-07, |
|
"loss": 0.3704, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 13.250973845297718, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 8.680223583609399e-07, |
|
"loss": 0.4327, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 13.295492487479132, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.407924902455983e-07, |
|
"loss": 0.4229, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 13.340011129660546, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 8.139573042587729e-07, |
|
"loss": 0.4121, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 13.384529771841958, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 7.875193468875719e-07, |
|
"loss": 0.423, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 13.384529771841958, |
|
"eval_loss": 0.9250730872154236, |
|
"eval_runtime": 17.1049, |
|
"eval_samples_per_second": 23.385, |
|
"eval_steps_per_second": 23.385, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 13.429048414023372, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 7.614811269246631e-07, |
|
"loss": 0.4316, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 13.473567056204786, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.35845115230191e-07, |
|
"loss": 0.4104, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 13.5180856983862, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.106137444973177e-07, |
|
"loss": 0.4367, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 13.562604340567614, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 6.857894090213702e-07, |
|
"loss": 0.417, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 13.607122982749026, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 6.613744644726383e-07, |
|
"loss": 0.394, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 13.65164162493044, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 6.3737122767284e-07, |
|
"loss": 0.4172, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 13.696160267111853, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 6.137819763752656e-07, |
|
"loss": 0.4517, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 13.740678909293267, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 5.90608949048635e-07, |
|
"loss": 0.4256, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 13.78519755147468, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 5.678543446646811e-07, |
|
"loss": 0.4019, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 13.829716193656093, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 5.455203224894857e-07, |
|
"loss": 0.453, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 13.874234835837507, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 5.236090018785705e-07, |
|
"loss": 0.4107, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 13.918753478018921, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 5.021224620757914e-07, |
|
"loss": 0.4475, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 13.963272120200333, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.810627420160269e-07, |
|
"loss": 0.4322, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 14.010016694490819, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.604318401317009e-07, |
|
"loss": 0.4318, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 14.054535336672231, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.402317141631407e-07, |
|
"loss": 0.4489, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 14.099053978853645, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.2046428097279766e-07, |
|
"loss": 0.4381, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 14.143572621035059, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.011314163633573e-07, |
|
"loss": 0.4107, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 14.188091263216473, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.822349548997295e-07, |
|
"loss": 0.4399, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 14.232609905397885, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.637766897349654e-07, |
|
"loss": 0.417, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 14.277128547579299, |
|
"grad_norm": 3.5, |
|
"learning_rate": 3.4575837244009367e-07, |
|
"loss": 0.4449, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 14.321647189760712, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 3.281817128379139e-07, |
|
"loss": 0.3875, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 14.366165831942126, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.1104837884073866e-07, |
|
"loss": 0.4187, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 14.410684474123538, |
|
"grad_norm": 3.875, |
|
"learning_rate": 2.943599962921279e-07, |
|
"loss": 0.41, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 14.455203116304952, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 2.7811814881259503e-07, |
|
"loss": 0.4016, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 14.499721758486366, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 2.623243776493434e-07, |
|
"loss": 0.3906, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 14.54424040066778, |
|
"grad_norm": 8.125, |
|
"learning_rate": 2.469801815300027e-07, |
|
"loss": 0.4241, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 14.588759042849194, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 2.3208701652041697e-07, |
|
"loss": 0.4104, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 14.633277685030606, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 2.1764629588646667e-07, |
|
"loss": 0.4031, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 14.67779632721202, |
|
"grad_norm": 7.125, |
|
"learning_rate": 2.036593899599615e-07, |
|
"loss": 0.3911, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 14.722314969393434, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.9012762600860656e-07, |
|
"loss": 0.4137, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 14.766833611574848, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.7705228811005004e-07, |
|
"loss": 0.4559, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 14.81135225375626, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.6443461703003427e-07, |
|
"loss": 0.3986, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 14.855870895937674, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.5227581010465341e-07, |
|
"loss": 0.4073, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 14.900389538119088, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.4057702112673765e-07, |
|
"loss": 0.4137, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 14.944908180300501, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.2933936023636073e-07, |
|
"loss": 0.4283, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 14.989426822481914, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.185638938154976e-07, |
|
"loss": 0.4097, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 15.0361713967724, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.08251644386832e-07, |
|
"loss": 0.4326, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 15.080690038953811, |
|
"grad_norm": 3.0, |
|
"learning_rate": 9.84035905167241e-08, |
|
"loss": 0.4338, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 15.125208681135225, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.902066672235144e-08, |
|
"loss": 0.4197, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 15.16972732331664, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.010376338302872e-08, |
|
"loss": 0.4277, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 15.214245965498053, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.165372665571879e-08, |
|
"loss": 0.4325, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 15.258764607679465, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.367135839473349e-08, |
|
"loss": 0.399, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 15.303283249860879, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.6157416075648954e-08, |
|
"loss": 0.4368, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 15.347801892042293, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.911261272341872e-08, |
|
"loss": 0.4029, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 15.392320534223707, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.25376168447178e-08, |
|
"loss": 0.4269, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 15.436839176405119, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.643305236450345e-08, |
|
"loss": 0.4442, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 15.481357818586533, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.079949856680975e-08, |
|
"loss": 0.4207, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 15.525876460767947, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.5637490039775447e-08, |
|
"loss": 0.4257, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 15.57039510294936, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.0947516624917898e-08, |
|
"loss": 0.4161, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 15.614913745130773, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.6730023370645775e-08, |
|
"loss": 0.3976, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 15.659432387312187, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.298541049003288e-08, |
|
"loss": 0.4074, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 15.7039510294936, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.714033322833494e-09, |
|
"loss": 0.4155, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 15.748469671675014, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.9162023017699255e-09, |
|
"loss": 0.3747, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 15.792988313856428, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.592182923068289e-09, |
|
"loss": 0.3766, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 15.83750695603784, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.7421957212697692e-09, |
|
"loss": 0.4017, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 15.882025598219254, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.3664162482990296e-09, |
|
"loss": 0.42, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 15.926544240400668, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.649750568080924e-10, |
|
"loss": 0.44, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 15.971062882582082, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.795768778680487e-11, |
|
"loss": 0.4156, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 15.988870339454646, |
|
"step": 3584, |
|
"total_flos": 8.551781210951516e+17, |
|
"train_loss": 0.7042842949075359, |
|
"train_runtime": 4850.1152, |
|
"train_samples_per_second": 11.853, |
|
"train_steps_per_second": 0.739 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3584, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 16, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.551781210951516e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|