{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.988870339454646, "eval_steps": 1000, "global_step": 3584, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.044518642181413465, "grad_norm": 33.75, "learning_rate": 2.785515320334262e-07, "loss": 2.0545, "step": 10 }, { "epoch": 0.08903728436282693, "grad_norm": 28.125, "learning_rate": 5.571030640668524e-07, "loss": 2.0294, "step": 20 }, { "epoch": 0.1335559265442404, "grad_norm": 20.875, "learning_rate": 8.356545961002786e-07, "loss": 1.9841, "step": 30 }, { "epoch": 0.17807456872565386, "grad_norm": 14.25, "learning_rate": 1.1142061281337048e-06, "loss": 1.8176, "step": 40 }, { "epoch": 0.22259321090706732, "grad_norm": 23.5, "learning_rate": 1.392757660167131e-06, "loss": 1.6325, "step": 50 }, { "epoch": 0.2671118530884808, "grad_norm": 19.25, "learning_rate": 1.6713091922005572e-06, "loss": 1.6341, "step": 60 }, { "epoch": 0.3116304952698943, "grad_norm": 13.8125, "learning_rate": 1.9498607242339835e-06, "loss": 1.4943, "step": 70 }, { "epoch": 0.3561491374513077, "grad_norm": 12.0, "learning_rate": 2.2284122562674097e-06, "loss": 1.4708, "step": 80 }, { "epoch": 0.4006677796327212, "grad_norm": 11.125, "learning_rate": 2.506963788300836e-06, "loss": 1.415, "step": 90 }, { "epoch": 0.44518642181413465, "grad_norm": 3.203125, "learning_rate": 2.785515320334262e-06, "loss": 1.4596, "step": 100 }, { "epoch": 0.48970506399554814, "grad_norm": 2.71875, "learning_rate": 3.064066852367688e-06, "loss": 1.4339, "step": 110 }, { "epoch": 0.5342237061769616, "grad_norm": 2.6875, "learning_rate": 3.3426183844011143e-06, "loss": 1.4009, "step": 120 }, { "epoch": 0.5787423483583751, "grad_norm": 3.109375, "learning_rate": 3.6211699164345405e-06, "loss": 1.3688, "step": 130 }, { "epoch": 0.6232609905397886, "grad_norm": 2.84375, "learning_rate": 3.899721448467967e-06, "loss": 1.3595, "step": 140 }, { "epoch": 0.667779632721202, "grad_norm": 2.65625, "learning_rate": 4.178272980501394e-06, "loss": 1.3609, "step": 150 }, { "epoch": 0.7122982749026154, "grad_norm": 2.953125, "learning_rate": 4.456824512534819e-06, "loss": 1.3777, "step": 160 }, { "epoch": 0.756816917084029, "grad_norm": 2.71875, "learning_rate": 4.735376044568246e-06, "loss": 1.3374, "step": 170 }, { "epoch": 0.8013355592654424, "grad_norm": 2.90625, "learning_rate": 5.013927576601672e-06, "loss": 1.3524, "step": 180 }, { "epoch": 0.8458542014468559, "grad_norm": 2.5625, "learning_rate": 5.292479108635098e-06, "loss": 1.3153, "step": 190 }, { "epoch": 0.8903728436282693, "grad_norm": 2.375, "learning_rate": 5.571030640668524e-06, "loss": 1.3519, "step": 200 }, { "epoch": 0.9348914858096828, "grad_norm": 3.140625, "learning_rate": 5.849582172701951e-06, "loss": 1.348, "step": 210 }, { "epoch": 0.9794101279910963, "grad_norm": 2.8125, "learning_rate": 6.128133704735376e-06, "loss": 1.3062, "step": 220 }, { "epoch": 1.0261547022815805, "grad_norm": 2.5, "learning_rate": 6.406685236768803e-06, "loss": 1.4358, "step": 230 }, { "epoch": 1.070673344462994, "grad_norm": 2.40625, "learning_rate": 6.685236768802229e-06, "loss": 1.2481, "step": 240 }, { "epoch": 1.1151919866444073, "grad_norm": 2.8125, "learning_rate": 6.963788300835655e-06, "loss": 1.2833, "step": 250 }, { "epoch": 1.1597106288258208, "grad_norm": 2.140625, "learning_rate": 7.242339832869081e-06, "loss": 1.1941, "step": 260 }, { "epoch": 1.2042292710072342, "grad_norm": 2.46875, "learning_rate": 7.5208913649025075e-06, "loss": 1.2831, "step": 270 }, { "epoch": 1.2487479131886476, "grad_norm": 2.671875, "learning_rate": 7.799442896935934e-06, "loss": 1.2854, "step": 280 }, { "epoch": 1.293266555370061, "grad_norm": 2.46875, "learning_rate": 8.07799442896936e-06, "loss": 1.257, "step": 290 }, { "epoch": 1.3377851975514747, "grad_norm": 2.03125, "learning_rate": 8.356545961002787e-06, "loss": 1.2468, "step": 300 }, { "epoch": 1.3823038397328882, "grad_norm": 2.0625, "learning_rate": 8.635097493036211e-06, "loss": 1.2743, "step": 310 }, { "epoch": 1.4268224819143016, "grad_norm": 2.109375, "learning_rate": 8.913649025069639e-06, "loss": 1.2265, "step": 320 }, { "epoch": 1.471341124095715, "grad_norm": 2.078125, "learning_rate": 9.192200557103064e-06, "loss": 1.2898, "step": 330 }, { "epoch": 1.5158597662771287, "grad_norm": 2.0, "learning_rate": 9.470752089136492e-06, "loss": 1.2406, "step": 340 }, { "epoch": 1.5603784084585421, "grad_norm": 2.21875, "learning_rate": 9.749303621169918e-06, "loss": 1.2098, "step": 350 }, { "epoch": 1.6048970506399556, "grad_norm": 1.953125, "learning_rate": 9.9999976276417e-06, "loss": 1.2067, "step": 360 }, { "epoch": 1.649415692821369, "grad_norm": 1.9765625, "learning_rate": 9.999712947369595e-06, "loss": 1.2338, "step": 370 }, { "epoch": 1.6939343350027825, "grad_norm": 1.859375, "learning_rate": 9.998953826391322e-06, "loss": 1.2546, "step": 380 }, { "epoch": 1.738452977184196, "grad_norm": 2.015625, "learning_rate": 9.997720336742596e-06, "loss": 1.201, "step": 390 }, { "epoch": 1.7829716193656093, "grad_norm": 2.09375, "learning_rate": 9.996012595473676e-06, "loss": 1.1761, "step": 400 }, { "epoch": 1.8274902615470228, "grad_norm": 2.015625, "learning_rate": 9.993830764638262e-06, "loss": 1.1884, "step": 410 }, { "epoch": 1.8720089037284362, "grad_norm": 1.90625, "learning_rate": 9.991175051278111e-06, "loss": 1.1951, "step": 420 }, { "epoch": 1.9165275459098496, "grad_norm": 2.09375, "learning_rate": 9.988045707403394e-06, "loss": 1.175, "step": 430 }, { "epoch": 1.961046188091263, "grad_norm": 2.0625, "learning_rate": 9.984443029968786e-06, "loss": 1.2045, "step": 440 }, { "epoch": 2.0077907623817475, "grad_norm": 2.03125, "learning_rate": 9.980367360845278e-06, "loss": 1.3052, "step": 450 }, { "epoch": 2.052309404563161, "grad_norm": 1.984375, "learning_rate": 9.975819086787743e-06, "loss": 1.1092, "step": 460 }, { "epoch": 2.0968280467445743, "grad_norm": 2.1875, "learning_rate": 9.970798639398228e-06, "loss": 1.1435, "step": 470 }, { "epoch": 2.141346688925988, "grad_norm": 1.8828125, "learning_rate": 9.965306495085005e-06, "loss": 1.0927, "step": 480 }, { "epoch": 2.185865331107401, "grad_norm": 2.03125, "learning_rate": 9.959343175017362e-06, "loss": 1.0692, "step": 490 }, { "epoch": 2.2303839732888147, "grad_norm": 1.8671875, "learning_rate": 9.952909245076141e-06, "loss": 1.0603, "step": 500 }, { "epoch": 2.274902615470228, "grad_norm": 2.03125, "learning_rate": 9.946005315800047e-06, "loss": 1.0717, "step": 510 }, { "epoch": 2.3194212576516415, "grad_norm": 1.9140625, "learning_rate": 9.93863204232771e-06, "loss": 1.0808, "step": 520 }, { "epoch": 2.363939899833055, "grad_norm": 1.609375, "learning_rate": 9.930790124335511e-06, "loss": 1.0297, "step": 530 }, { "epoch": 2.4084585420144684, "grad_norm": 1.84375, "learning_rate": 9.922480305971193e-06, "loss": 1.0481, "step": 540 }, { "epoch": 2.452977184195882, "grad_norm": 1.90625, "learning_rate": 9.91370337578325e-06, "loss": 1.0919, "step": 550 }, { "epoch": 2.4974958263772953, "grad_norm": 2.09375, "learning_rate": 9.904460166646084e-06, "loss": 1.0835, "step": 560 }, { "epoch": 2.542014468558709, "grad_norm": 1.8046875, "learning_rate": 9.894751555680988e-06, "loss": 1.0336, "step": 570 }, { "epoch": 2.586533110740122, "grad_norm": 2.0625, "learning_rate": 9.884578464172901e-06, "loss": 1.0728, "step": 580 }, { "epoch": 2.631051752921536, "grad_norm": 1.6484375, "learning_rate": 9.873941857482988e-06, "loss": 1.0493, "step": 590 }, { "epoch": 2.6755703951029495, "grad_norm": 1.796875, "learning_rate": 9.862842744957037e-06, "loss": 1.0346, "step": 600 }, { "epoch": 2.720089037284363, "grad_norm": 1.7421875, "learning_rate": 9.85128217982967e-06, "loss": 1.0483, "step": 610 }, { "epoch": 2.7646076794657763, "grad_norm": 1.6796875, "learning_rate": 9.8392612591244e-06, "loss": 1.0384, "step": 620 }, { "epoch": 2.80912632164719, "grad_norm": 1.765625, "learning_rate": 9.826781123549542e-06, "loss": 1.0266, "step": 630 }, { "epoch": 2.853644963828603, "grad_norm": 1.578125, "learning_rate": 9.813842957389953e-06, "loss": 1.0352, "step": 640 }, { "epoch": 2.8981636060100167, "grad_norm": 1.796875, "learning_rate": 9.800447988394657e-06, "loss": 1.009, "step": 650 }, { "epoch": 2.94268224819143, "grad_norm": 1.796875, "learning_rate": 9.786597487660336e-06, "loss": 1.0834, "step": 660 }, { "epoch": 2.9872008903728435, "grad_norm": 1.8046875, "learning_rate": 9.772292769510718e-06, "loss": 1.0735, "step": 670 }, { "epoch": 3.033945464663328, "grad_norm": 1.6484375, "learning_rate": 9.75753519137185e-06, "loss": 1.0532, "step": 680 }, { "epoch": 3.0784641068447414, "grad_norm": 1.8125, "learning_rate": 9.742326153643285e-06, "loss": 0.9169, "step": 690 }, { "epoch": 3.122982749026155, "grad_norm": 1.671875, "learning_rate": 9.726667099565202e-06, "loss": 0.9443, "step": 700 }, { "epoch": 3.1675013912075682, "grad_norm": 1.6015625, "learning_rate": 9.710559515081446e-06, "loss": 0.9023, "step": 710 }, { "epoch": 3.2120200333889817, "grad_norm": 1.7265625, "learning_rate": 9.69400492869852e-06, "loss": 0.9227, "step": 720 }, { "epoch": 3.256538675570395, "grad_norm": 1.71875, "learning_rate": 9.677004911340539e-06, "loss": 0.9329, "step": 730 }, { "epoch": 3.3010573177518086, "grad_norm": 1.875, "learning_rate": 9.659561076200173e-06, "loss": 0.903, "step": 740 }, { "epoch": 3.345575959933222, "grad_norm": 1.484375, "learning_rate": 9.64167507858554e-06, "loss": 0.9046, "step": 750 }, { "epoch": 3.3900946021146354, "grad_norm": 1.65625, "learning_rate": 9.62334861576315e-06, "loss": 0.927, "step": 760 }, { "epoch": 3.434613244296049, "grad_norm": 1.8046875, "learning_rate": 9.604583426796837e-06, "loss": 0.9274, "step": 770 }, { "epoch": 3.4791318864774623, "grad_norm": 1.53125, "learning_rate": 9.585381292382734e-06, "loss": 0.9127, "step": 780 }, { "epoch": 3.5236505286588757, "grad_norm": 1.59375, "learning_rate": 9.565744034680291e-06, "loss": 0.9269, "step": 790 }, { "epoch": 3.5681691708402896, "grad_norm": 1.6796875, "learning_rate": 9.545673517139376e-06, "loss": 0.8863, "step": 800 }, { "epoch": 3.6126878130217026, "grad_norm": 1.359375, "learning_rate": 9.52517164432343e-06, "loss": 0.8776, "step": 810 }, { "epoch": 3.6572064552031165, "grad_norm": 1.3359375, "learning_rate": 9.50424036172875e-06, "loss": 0.9424, "step": 820 }, { "epoch": 3.70172509738453, "grad_norm": 1.4296875, "learning_rate": 9.482881655599867e-06, "loss": 0.8712, "step": 830 }, { "epoch": 3.7462437395659434, "grad_norm": 1.5390625, "learning_rate": 9.461097552741065e-06, "loss": 0.9157, "step": 840 }, { "epoch": 3.790762381747357, "grad_norm": 1.296875, "learning_rate": 9.438890120324049e-06, "loss": 0.8571, "step": 850 }, { "epoch": 3.8352810239287702, "grad_norm": 1.328125, "learning_rate": 9.416261465691786e-06, "loss": 0.861, "step": 860 }, { "epoch": 3.8797996661101837, "grad_norm": 1.265625, "learning_rate": 9.393213736158532e-06, "loss": 0.8952, "step": 870 }, { "epoch": 3.924318308291597, "grad_norm": 1.328125, "learning_rate": 9.369749118806063e-06, "loss": 0.8598, "step": 880 }, { "epoch": 3.9688369504730105, "grad_norm": 1.3359375, "learning_rate": 9.345869840276138e-06, "loss": 0.8614, "step": 890 }, { "epoch": 4.015581524763495, "grad_norm": 1.0625, "learning_rate": 9.321578166559202e-06, "loss": 0.8842, "step": 900 }, { "epoch": 4.060100166944908, "grad_norm": 1.25, "learning_rate": 9.296876402779357e-06, "loss": 0.7889, "step": 910 }, { "epoch": 4.104618809126322, "grad_norm": 1.2109375, "learning_rate": 9.271766892975632e-06, "loss": 0.8188, "step": 920 }, { "epoch": 4.149137451307735, "grad_norm": 1.1796875, "learning_rate": 9.246252019879526e-06, "loss": 0.7822, "step": 930 }, { "epoch": 4.193656093489149, "grad_norm": 1.0546875, "learning_rate": 9.22033420468893e-06, "loss": 0.8268, "step": 940 }, { "epoch": 4.238174735670562, "grad_norm": 1.015625, "learning_rate": 9.194015906838345e-06, "loss": 0.7838, "step": 950 }, { "epoch": 4.282693377851976, "grad_norm": 1.2109375, "learning_rate": 9.167299623765515e-06, "loss": 0.7691, "step": 960 }, { "epoch": 4.3272120200333895, "grad_norm": 1.171875, "learning_rate": 9.14018789067443e-06, "loss": 0.7575, "step": 970 }, { "epoch": 4.371730662214802, "grad_norm": 1.1640625, "learning_rate": 9.11268328029475e-06, "loss": 0.8305, "step": 980 }, { "epoch": 4.416249304396216, "grad_norm": 1.0390625, "learning_rate": 9.08478840263767e-06, "loss": 0.7607, "step": 990 }, { "epoch": 4.460767946577629, "grad_norm": 0.99609375, "learning_rate": 9.05650590474825e-06, "loss": 0.7759, "step": 1000 }, { "epoch": 4.460767946577629, "eval_loss": 1.0106589794158936, "eval_runtime": 46.0703, "eval_samples_per_second": 8.682, "eval_steps_per_second": 8.682, "step": 1000 }, { "epoch": 4.505286588759043, "grad_norm": 0.9140625, "learning_rate": 9.027838470454222e-06, "loss": 0.7025, "step": 1010 }, { "epoch": 4.549805230940456, "grad_norm": 1.046875, "learning_rate": 8.998788820111323e-06, "loss": 0.776, "step": 1020 }, { "epoch": 4.59432387312187, "grad_norm": 1.0078125, "learning_rate": 8.969359710345132e-06, "loss": 0.8328, "step": 1030 }, { "epoch": 4.638842515303283, "grad_norm": 1.0546875, "learning_rate": 8.939553933789499e-06, "loss": 0.7564, "step": 1040 }, { "epoch": 4.683361157484697, "grad_norm": 1.1796875, "learning_rate": 8.90937431882154e-06, "loss": 0.7684, "step": 1050 }, { "epoch": 4.72787979966611, "grad_norm": 1.109375, "learning_rate": 8.878823729293238e-06, "loss": 0.8135, "step": 1060 }, { "epoch": 4.772398441847524, "grad_norm": 1.1953125, "learning_rate": 8.847905064259683e-06, "loss": 0.8271, "step": 1070 }, { "epoch": 4.816917084028937, "grad_norm": 1.0625, "learning_rate": 8.816621257703969e-06, "loss": 0.8179, "step": 1080 }, { "epoch": 4.861435726210351, "grad_norm": 1.0546875, "learning_rate": 8.784975278258783e-06, "loss": 0.7721, "step": 1090 }, { "epoch": 4.905954368391764, "grad_norm": 1.0546875, "learning_rate": 8.752970128924696e-06, "loss": 0.7752, "step": 1100 }, { "epoch": 4.950473010573178, "grad_norm": 0.91015625, "learning_rate": 8.7206088467852e-06, "loss": 0.788, "step": 1110 }, { "epoch": 4.994991652754591, "grad_norm": 0.9609375, "learning_rate": 8.687894502718503e-06, "loss": 0.8012, "step": 1120 }, { "epoch": 5.041736227045075, "grad_norm": 0.8984375, "learning_rate": 8.654830201106133e-06, "loss": 0.8055, "step": 1130 }, { "epoch": 5.086254869226488, "grad_norm": 0.98046875, "learning_rate": 8.621419079538337e-06, "loss": 0.7483, "step": 1140 }, { "epoch": 5.130773511407902, "grad_norm": 0.9375, "learning_rate": 8.587664308516361e-06, "loss": 0.7349, "step": 1150 }, { "epoch": 5.175292153589315, "grad_norm": 0.80859375, "learning_rate": 8.553569091151576e-06, "loss": 0.7454, "step": 1160 }, { "epoch": 5.219810795770729, "grad_norm": 0.953125, "learning_rate": 8.519136662861531e-06, "loss": 0.6866, "step": 1170 }, { "epoch": 5.264329437952142, "grad_norm": 0.859375, "learning_rate": 8.484370291062927e-06, "loss": 0.7269, "step": 1180 }, { "epoch": 5.308848080133556, "grad_norm": 0.94140625, "learning_rate": 8.449273274861566e-06, "loss": 0.6977, "step": 1190 }, { "epoch": 5.353366722314969, "grad_norm": 0.8984375, "learning_rate": 8.413848944739282e-06, "loss": 0.6814, "step": 1200 }, { "epoch": 5.397885364496383, "grad_norm": 0.97265625, "learning_rate": 8.378100662237904e-06, "loss": 0.7206, "step": 1210 }, { "epoch": 5.442404006677796, "grad_norm": 0.85546875, "learning_rate": 8.342031819640263e-06, "loss": 0.7317, "step": 1220 }, { "epoch": 5.48692264885921, "grad_norm": 0.859375, "learning_rate": 8.305645839648287e-06, "loss": 0.7149, "step": 1230 }, { "epoch": 5.531441291040624, "grad_norm": 0.8984375, "learning_rate": 8.268946175058214e-06, "loss": 0.6568, "step": 1240 }, { "epoch": 5.575959933222037, "grad_norm": 0.859375, "learning_rate": 8.231936308432935e-06, "loss": 0.7292, "step": 1250 }, { "epoch": 5.6204785754034505, "grad_norm": 0.82421875, "learning_rate": 8.194619751771527e-06, "loss": 0.6966, "step": 1260 }, { "epoch": 5.6649972175848635, "grad_norm": 0.85546875, "learning_rate": 8.157000046175984e-06, "loss": 0.7128, "step": 1270 }, { "epoch": 5.709515859766277, "grad_norm": 0.77734375, "learning_rate": 8.119080761515197e-06, "loss": 0.7343, "step": 1280 }, { "epoch": 5.75403450194769, "grad_norm": 0.953125, "learning_rate": 8.080865496086177e-06, "loss": 0.7454, "step": 1290 }, { "epoch": 5.798553144129104, "grad_norm": 0.89453125, "learning_rate": 8.042357876272626e-06, "loss": 0.7337, "step": 1300 }, { "epoch": 5.843071786310517, "grad_norm": 0.796875, "learning_rate": 8.003561556200796e-06, "loss": 0.7011, "step": 1310 }, { "epoch": 5.887590428491931, "grad_norm": 0.83984375, "learning_rate": 7.964480217392739e-06, "loss": 0.6969, "step": 1320 }, { "epoch": 5.932109070673344, "grad_norm": 0.96484375, "learning_rate": 7.925117568416966e-06, "loss": 0.7272, "step": 1330 }, { "epoch": 5.976627712854758, "grad_norm": 0.89453125, "learning_rate": 7.885477344536516e-06, "loss": 0.6795, "step": 1340 }, { "epoch": 6.023372287145242, "grad_norm": 0.8828125, "learning_rate": 7.845563307354506e-06, "loss": 0.7507, "step": 1350 }, { "epoch": 6.067890929326656, "grad_norm": 0.83984375, "learning_rate": 7.80537924445718e-06, "loss": 0.6812, "step": 1360 }, { "epoch": 6.112409571508069, "grad_norm": 0.94921875, "learning_rate": 7.764928969054493e-06, "loss": 0.694, "step": 1370 }, { "epoch": 6.156928213689483, "grad_norm": 0.87890625, "learning_rate": 7.724216319618257e-06, "loss": 0.6636, "step": 1380 }, { "epoch": 6.201446855870896, "grad_norm": 0.828125, "learning_rate": 7.683245159517903e-06, "loss": 0.6817, "step": 1390 }, { "epoch": 6.24596549805231, "grad_norm": 1.0546875, "learning_rate": 7.642019376653858e-06, "loss": 0.6709, "step": 1400 }, { "epoch": 6.290484140233723, "grad_norm": 0.8515625, "learning_rate": 7.600542883088629e-06, "loss": 0.6755, "step": 1410 }, { "epoch": 6.3350027824151365, "grad_norm": 1.03125, "learning_rate": 7.5588196146755526e-06, "loss": 0.7135, "step": 1420 }, { "epoch": 6.3795214245965495, "grad_norm": 0.78515625, "learning_rate": 7.5168535306853155e-06, "loss": 0.6461, "step": 1430 }, { "epoch": 6.424040066777963, "grad_norm": 0.77734375, "learning_rate": 7.474648613430252e-06, "loss": 0.6194, "step": 1440 }, { "epoch": 6.468558708959376, "grad_norm": 1.2421875, "learning_rate": 7.432208867886439e-06, "loss": 0.6871, "step": 1450 }, { "epoch": 6.51307735114079, "grad_norm": 1.3828125, "learning_rate": 7.389538321313652e-06, "loss": 0.6691, "step": 1460 }, { "epoch": 6.557595993322204, "grad_norm": 1.625, "learning_rate": 7.346641022873205e-06, "loss": 0.6686, "step": 1470 }, { "epoch": 6.602114635503617, "grad_norm": 1.8046875, "learning_rate": 7.303521043243711e-06, "loss": 0.648, "step": 1480 }, { "epoch": 6.646633277685031, "grad_norm": 2.40625, "learning_rate": 7.2601824742347985e-06, "loss": 0.7131, "step": 1490 }, { "epoch": 6.691151919866444, "grad_norm": 1.09375, "learning_rate": 7.2166294283988315e-06, "loss": 0.7121, "step": 1500 }, { "epoch": 6.735670562047858, "grad_norm": 0.9765625, "learning_rate": 7.172866038640644e-06, "loss": 0.6216, "step": 1510 }, { "epoch": 6.780189204229271, "grad_norm": 1.0390625, "learning_rate": 7.128896457825364e-06, "loss": 0.6726, "step": 1520 }, { "epoch": 6.824707846410685, "grad_norm": 1.1328125, "learning_rate": 7.084724858384326e-06, "loss": 0.6597, "step": 1530 }, { "epoch": 6.869226488592098, "grad_norm": 1.0546875, "learning_rate": 7.04035543191914e-06, "loss": 0.6608, "step": 1540 }, { "epoch": 6.913745130773512, "grad_norm": 1.78125, "learning_rate": 6.995792388803929e-06, "loss": 0.6419, "step": 1550 }, { "epoch": 6.958263772954925, "grad_norm": 1.7734375, "learning_rate": 6.9510399577857976e-06, "loss": 0.6505, "step": 1560 }, { "epoch": 7.005008347245409, "grad_norm": 2.015625, "learning_rate": 6.906102385583548e-06, "loss": 0.734, "step": 1570 }, { "epoch": 7.049526989426822, "grad_norm": 1.9140625, "learning_rate": 6.860983936484689e-06, "loss": 0.6262, "step": 1580 }, { "epoch": 7.094045631608236, "grad_norm": 1.953125, "learning_rate": 6.815688891940796e-06, "loss": 0.6499, "step": 1590 }, { "epoch": 7.138564273789649, "grad_norm": 4.53125, "learning_rate": 6.770221550161214e-06, "loss": 0.6259, "step": 1600 }, { "epoch": 7.183082915971063, "grad_norm": 5.1875, "learning_rate": 6.724586225705191e-06, "loss": 0.6564, "step": 1610 }, { "epoch": 7.227601558152476, "grad_norm": 6.1875, "learning_rate": 6.678787249072456e-06, "loss": 0.6358, "step": 1620 }, { "epoch": 7.27212020033389, "grad_norm": 4.9375, "learning_rate": 6.632828966292279e-06, "loss": 0.6883, "step": 1630 }, { "epoch": 7.316638842515303, "grad_norm": 4.625, "learning_rate": 6.586715738511067e-06, "loss": 0.6618, "step": 1640 }, { "epoch": 7.361157484696717, "grad_norm": 10.9375, "learning_rate": 6.540451941578505e-06, "loss": 0.6233, "step": 1650 }, { "epoch": 7.40567612687813, "grad_norm": 12.625, "learning_rate": 6.494041965632335e-06, "loss": 0.6973, "step": 1660 }, { "epoch": 7.450194769059544, "grad_norm": 11.5, "learning_rate": 6.447490214681742e-06, "loss": 0.6683, "step": 1670 }, { "epoch": 7.494713411240957, "grad_norm": 10.625, "learning_rate": 6.400801106189457e-06, "loss": 0.5964, "step": 1680 }, { "epoch": 7.539232053422371, "grad_norm": 9.875, "learning_rate": 6.353979070652555e-06, "loss": 0.6784, "step": 1690 }, { "epoch": 7.583750695603785, "grad_norm": 3.671875, "learning_rate": 6.307028551182041e-06, "loss": 0.6335, "step": 1700 }, { "epoch": 7.628269337785198, "grad_norm": 3.421875, "learning_rate": 6.259954003081215e-06, "loss": 0.6539, "step": 1710 }, { "epoch": 7.6727879799666105, "grad_norm": 2.828125, "learning_rate": 6.212759893422908e-06, "loss": 0.6371, "step": 1720 }, { "epoch": 7.717306622148024, "grad_norm": 3.734375, "learning_rate": 6.165450700625565e-06, "loss": 0.6426, "step": 1730 }, { "epoch": 7.761825264329438, "grad_norm": 3.0, "learning_rate": 6.118030914028292e-06, "loss": 0.6587, "step": 1740 }, { "epoch": 7.806343906510851, "grad_norm": 3.234375, "learning_rate": 6.070505033464835e-06, "loss": 0.5994, "step": 1750 }, { "epoch": 7.850862548692265, "grad_norm": 3.0, "learning_rate": 6.022877568836579e-06, "loss": 0.6387, "step": 1760 }, { "epoch": 7.895381190873678, "grad_norm": 2.90625, "learning_rate": 5.975153039684579e-06, "loss": 0.6704, "step": 1770 }, { "epoch": 7.939899833055092, "grad_norm": 2.609375, "learning_rate": 5.927335974760699e-06, "loss": 0.6274, "step": 1780 }, { "epoch": 7.984418475236505, "grad_norm": 3.03125, "learning_rate": 5.87943091159785e-06, "loss": 0.6611, "step": 1790 }, { "epoch": 8.03116304952699, "grad_norm": 2.609375, "learning_rate": 5.831442396079413e-06, "loss": 0.6732, "step": 1800 }, { "epoch": 8.075681691708404, "grad_norm": 3.078125, "learning_rate": 5.78337498200786e-06, "loss": 0.5774, "step": 1810 }, { "epoch": 8.120200333889816, "grad_norm": 2.71875, "learning_rate": 5.735233230672636e-06, "loss": 0.6312, "step": 1820 }, { "epoch": 8.16471897607123, "grad_norm": 2.96875, "learning_rate": 5.687021710417308e-06, "loss": 0.6262, "step": 1830 }, { "epoch": 8.209237618252644, "grad_norm": 2.765625, "learning_rate": 5.638744996206074e-06, "loss": 0.5604, "step": 1840 }, { "epoch": 8.253756260434058, "grad_norm": 3.078125, "learning_rate": 5.590407669189612e-06, "loss": 0.6017, "step": 1850 }, { "epoch": 8.29827490261547, "grad_norm": 2.140625, "learning_rate": 5.542014316270377e-06, "loss": 0.5133, "step": 1860 }, { "epoch": 8.342793544796884, "grad_norm": 2.5, "learning_rate": 5.493569529667312e-06, "loss": 0.5995, "step": 1870 }, { "epoch": 8.387312186978297, "grad_norm": 2.96875, "learning_rate": 5.445077906480095e-06, "loss": 0.6081, "step": 1880 }, { "epoch": 8.431830829159711, "grad_norm": 2.359375, "learning_rate": 5.396544048252893e-06, "loss": 0.6193, "step": 1890 }, { "epoch": 8.476349471341123, "grad_norm": 2.5625, "learning_rate": 5.3479725605377065e-06, "loss": 0.568, "step": 1900 }, { "epoch": 8.520868113522537, "grad_norm": 2.59375, "learning_rate": 5.299368052457332e-06, "loss": 0.5966, "step": 1910 }, { "epoch": 8.565386755703951, "grad_norm": 2.90625, "learning_rate": 5.250735136267993e-06, "loss": 0.6217, "step": 1920 }, { "epoch": 8.609905397885365, "grad_norm": 2.375, "learning_rate": 5.2020784269216515e-06, "loss": 0.554, "step": 1930 }, { "epoch": 8.654424040066779, "grad_norm": 2.40625, "learning_rate": 5.153402541628097e-06, "loss": 0.562, "step": 1940 }, { "epoch": 8.698942682248191, "grad_norm": 2.203125, "learning_rate": 5.1047120994167855e-06, "loss": 0.598, "step": 1950 }, { "epoch": 8.743461324429605, "grad_norm": 2.96875, "learning_rate": 5.056011720698536e-06, "loss": 0.6065, "step": 1960 }, { "epoch": 8.787979966611019, "grad_norm": 2.53125, "learning_rate": 5.007306026827076e-06, "loss": 0.5696, "step": 1970 }, { "epoch": 8.832498608792433, "grad_norm": 2.171875, "learning_rate": 4.958599639660508e-06, "loss": 0.5824, "step": 1980 }, { "epoch": 8.877017250973845, "grad_norm": 2.5625, "learning_rate": 4.909897181122725e-06, "loss": 0.6082, "step": 1990 }, { "epoch": 8.921535893155259, "grad_norm": 2.84375, "learning_rate": 4.861203272764813e-06, "loss": 0.554, "step": 2000 }, { "epoch": 8.921535893155259, "eval_loss": 0.9391384720802307, "eval_runtime": 17.1717, "eval_samples_per_second": 23.294, "eval_steps_per_second": 23.294, "step": 2000 }, { "epoch": 8.966054535336673, "grad_norm": 2.125, "learning_rate": 4.8125225353265085e-06, "loss": 0.5373, "step": 2010 }, { "epoch": 9.012799109627156, "grad_norm": 2.203125, "learning_rate": 4.7638595882977064e-06, "loss": 0.6353, "step": 2020 }, { "epoch": 9.05731775180857, "grad_norm": 2.25, "learning_rate": 4.71521904948011e-06, "loss": 0.5151, "step": 2030 }, { "epoch": 9.101836393989982, "grad_norm": 1.9296875, "learning_rate": 4.666605534549021e-06, "loss": 0.5314, "step": 2040 }, { "epoch": 9.146355036171396, "grad_norm": 2.296875, "learning_rate": 4.618023656615352e-06, "loss": 0.5424, "step": 2050 }, { "epoch": 9.19087367835281, "grad_norm": 2.21875, "learning_rate": 4.569478025787869e-06, "loss": 0.4959, "step": 2060 }, { "epoch": 9.235392320534224, "grad_norm": 2.078125, "learning_rate": 4.520973248735715e-06, "loss": 0.5301, "step": 2070 }, { "epoch": 9.279910962715638, "grad_norm": 2.265625, "learning_rate": 4.472513928251275e-06, "loss": 0.5219, "step": 2080 }, { "epoch": 9.32442960489705, "grad_norm": 3.078125, "learning_rate": 4.424104662813396e-06, "loss": 0.5537, "step": 2090 }, { "epoch": 9.368948247078464, "grad_norm": 2.21875, "learning_rate": 4.375750046151023e-06, "loss": 0.5269, "step": 2100 }, { "epoch": 9.413466889259878, "grad_norm": 1.84375, "learning_rate": 4.3274546668072835e-06, "loss": 0.5535, "step": 2110 }, { "epoch": 9.457985531441292, "grad_norm": 2.234375, "learning_rate": 4.279223107704058e-06, "loss": 0.5382, "step": 2120 }, { "epoch": 9.502504173622704, "grad_norm": 2.1875, "learning_rate": 4.2310599457071e-06, "loss": 0.5643, "step": 2130 }, { "epoch": 9.547022815804118, "grad_norm": 2.171875, "learning_rate": 4.1829697511917146e-06, "loss": 0.5493, "step": 2140 }, { "epoch": 9.591541457985532, "grad_norm": 2.328125, "learning_rate": 4.134957087609065e-06, "loss": 0.5457, "step": 2150 }, { "epoch": 9.636060100166945, "grad_norm": 2.484375, "learning_rate": 4.087026511053116e-06, "loss": 0.4859, "step": 2160 }, { "epoch": 9.680578742348358, "grad_norm": 2.1875, "learning_rate": 4.0391825698283084e-06, "loss": 0.4969, "step": 2170 }, { "epoch": 9.725097384529771, "grad_norm": 2.125, "learning_rate": 3.991429804017944e-06, "loss": 0.5311, "step": 2180 }, { "epoch": 9.769616026711185, "grad_norm": 1.9921875, "learning_rate": 3.9437727450533605e-06, "loss": 0.5437, "step": 2190 }, { "epoch": 9.8141346688926, "grad_norm": 1.9609375, "learning_rate": 3.89621591528393e-06, "loss": 0.5197, "step": 2200 }, { "epoch": 9.858653311074011, "grad_norm": 1.90625, "learning_rate": 3.848763827547915e-06, "loss": 0.5104, "step": 2210 }, { "epoch": 9.903171953255425, "grad_norm": 1.859375, "learning_rate": 3.8014209847442345e-06, "loss": 0.55, "step": 2220 }, { "epoch": 9.947690595436839, "grad_norm": 1.8671875, "learning_rate": 3.7541918794051637e-06, "loss": 0.53, "step": 2230 }, { "epoch": 9.992209237618253, "grad_norm": 2.203125, "learning_rate": 3.7070809932700134e-06, "loss": 0.4882, "step": 2240 }, { "epoch": 10.038953811908737, "grad_norm": 1.75, "learning_rate": 3.6600927968598588e-06, "loss": 0.4714, "step": 2250 }, { "epoch": 10.08347245409015, "grad_norm": 1.90625, "learning_rate": 3.613231749053304e-06, "loss": 0.4774, "step": 2260 }, { "epoch": 10.127991096271563, "grad_norm": 2.046875, "learning_rate": 3.5665022966633678e-06, "loss": 0.4764, "step": 2270 }, { "epoch": 10.172509738452977, "grad_norm": 1.953125, "learning_rate": 3.519908874015501e-06, "loss": 0.4632, "step": 2280 }, { "epoch": 10.21702838063439, "grad_norm": 1.9453125, "learning_rate": 3.473455902526809e-06, "loss": 0.4604, "step": 2290 }, { "epoch": 10.261547022815805, "grad_norm": 1.6484375, "learning_rate": 3.4271477902864836e-06, "loss": 0.4753, "step": 2300 }, { "epoch": 10.306065664997218, "grad_norm": 1.6875, "learning_rate": 3.3809889316375012e-06, "loss": 0.4323, "step": 2310 }, { "epoch": 10.35058430717863, "grad_norm": 1.734375, "learning_rate": 3.334983706759627e-06, "loss": 0.4659, "step": 2320 }, { "epoch": 10.395102949360044, "grad_norm": 1.9453125, "learning_rate": 3.2891364812537686e-06, "loss": 0.4896, "step": 2330 }, { "epoch": 10.439621591541458, "grad_norm": 1.8515625, "learning_rate": 3.2434516057277055e-06, "loss": 0.478, "step": 2340 }, { "epoch": 10.484140233722872, "grad_norm": 1.765625, "learning_rate": 3.1979334153832486e-06, "loss": 0.4453, "step": 2350 }, { "epoch": 10.528658875904284, "grad_norm": 1.7421875, "learning_rate": 3.1525862296048446e-06, "loss": 0.5075, "step": 2360 }, { "epoch": 10.573177518085698, "grad_norm": 1.5546875, "learning_rate": 3.1074143515497114e-06, "loss": 0.4865, "step": 2370 }, { "epoch": 10.617696160267112, "grad_norm": 1.5234375, "learning_rate": 3.0624220677394854e-06, "loss": 0.5178, "step": 2380 }, { "epoch": 10.662214802448526, "grad_norm": 2.015625, "learning_rate": 3.017613647653461e-06, "loss": 0.5069, "step": 2390 }, { "epoch": 10.706733444629938, "grad_norm": 1.4375, "learning_rate": 2.9729933433234402e-06, "loss": 0.4423, "step": 2400 }, { "epoch": 10.751252086811352, "grad_norm": 1.609375, "learning_rate": 2.9285653889302514e-06, "loss": 0.4359, "step": 2410 }, { "epoch": 10.795770728992766, "grad_norm": 1.296875, "learning_rate": 2.8843340004019427e-06, "loss": 0.4517, "step": 2420 }, { "epoch": 10.84028937117418, "grad_norm": 1.578125, "learning_rate": 2.8403033750137255e-06, "loss": 0.4775, "step": 2430 }, { "epoch": 10.884808013355592, "grad_norm": 1.5, "learning_rate": 2.7964776909896733e-06, "loss": 0.5064, "step": 2440 }, { "epoch": 10.929326655537006, "grad_norm": 1.4609375, "learning_rate": 2.7528611071062366e-06, "loss": 0.4651, "step": 2450 }, { "epoch": 10.97384529771842, "grad_norm": 1.4921875, "learning_rate": 2.7094577622976096e-06, "loss": 0.4909, "step": 2460 }, { "epoch": 11.020589872008903, "grad_norm": 1.3984375, "learning_rate": 2.6662717752629597e-06, "loss": 0.4996, "step": 2470 }, { "epoch": 11.065108514190317, "grad_norm": 1.3984375, "learning_rate": 2.6233072440755934e-06, "loss": 0.4445, "step": 2480 }, { "epoch": 11.109627156371731, "grad_norm": 1.375, "learning_rate": 2.580568245794085e-06, "loss": 0.4471, "step": 2490 }, { "epoch": 11.154145798553143, "grad_norm": 1.0703125, "learning_rate": 2.538058836075373e-06, "loss": 0.49, "step": 2500 }, { "epoch": 11.198664440734557, "grad_norm": 0.9453125, "learning_rate": 2.4957830487899224e-06, "loss": 0.4148, "step": 2510 }, { "epoch": 11.243183082915971, "grad_norm": 1.078125, "learning_rate": 2.4537448956389146e-06, "loss": 0.4247, "step": 2520 }, { "epoch": 11.287701725097385, "grad_norm": 1.1953125, "learning_rate": 2.411948365773588e-06, "loss": 0.4368, "step": 2530 }, { "epoch": 11.332220367278797, "grad_norm": 1.203125, "learning_rate": 2.3703974254166704e-06, "loss": 0.4273, "step": 2540 }, { "epoch": 11.376739009460211, "grad_norm": 1.0, "learning_rate": 2.3290960174860293e-06, "loss": 0.4421, "step": 2550 }, { "epoch": 11.421257651641625, "grad_norm": 0.97265625, "learning_rate": 2.2880480612204925e-06, "loss": 0.4072, "step": 2560 }, { "epoch": 11.465776293823039, "grad_norm": 1.2421875, "learning_rate": 2.247257451807961e-06, "loss": 0.4472, "step": 2570 }, { "epoch": 11.510294936004453, "grad_norm": 1.1328125, "learning_rate": 2.206728060015761e-06, "loss": 0.4613, "step": 2580 }, { "epoch": 11.554813578185865, "grad_norm": 1.34375, "learning_rate": 2.1664637318233484e-06, "loss": 0.4111, "step": 2590 }, { "epoch": 11.599332220367279, "grad_norm": 1.1796875, "learning_rate": 2.1264682880573374e-06, "loss": 0.4385, "step": 2600 }, { "epoch": 11.643850862548693, "grad_norm": 0.98046875, "learning_rate": 2.086745524028933e-06, "loss": 0.4448, "step": 2610 }, { "epoch": 11.688369504730106, "grad_norm": 1.1328125, "learning_rate": 2.0472992091737886e-06, "loss": 0.4292, "step": 2620 }, { "epoch": 11.732888146911518, "grad_norm": 1.2421875, "learning_rate": 2.0081330866942962e-06, "loss": 0.425, "step": 2630 }, { "epoch": 11.777406789092932, "grad_norm": 1.0859375, "learning_rate": 1.96925087320439e-06, "loss": 0.4311, "step": 2640 }, { "epoch": 11.821925431274346, "grad_norm": 0.9375, "learning_rate": 1.930656258376859e-06, "loss": 0.4725, "step": 2650 }, { "epoch": 11.86644407345576, "grad_norm": 1.0, "learning_rate": 1.8923529045932292e-06, "loss": 0.4149, "step": 2660 }, { "epoch": 11.910962715637172, "grad_norm": 1.0625, "learning_rate": 1.8543444465962147e-06, "loss": 0.4436, "step": 2670 }, { "epoch": 11.955481357818586, "grad_norm": 0.9375, "learning_rate": 1.8166344911448115e-06, "loss": 0.4254, "step": 2680 }, { "epoch": 12.00222593210907, "grad_norm": 3.515625, "learning_rate": 1.7792266166720368e-06, "loss": 0.5129, "step": 2690 }, { "epoch": 12.046744574290484, "grad_norm": 1.0625, "learning_rate": 1.742124372945364e-06, "loss": 0.4114, "step": 2700 }, { "epoch": 12.091263216471898, "grad_norm": 0.98046875, "learning_rate": 1.7053312807298633e-06, "loss": 0.4351, "step": 2710 }, { "epoch": 12.135781858653312, "grad_norm": 0.890625, "learning_rate": 1.6688508314541086e-06, "loss": 0.404, "step": 2720 }, { "epoch": 12.180300500834724, "grad_norm": 0.99609375, "learning_rate": 1.6326864868788678e-06, "loss": 0.4349, "step": 2730 }, { "epoch": 12.224819143016138, "grad_norm": 1.0234375, "learning_rate": 1.5968416787685919e-06, "loss": 0.4581, "step": 2740 }, { "epoch": 12.269337785197552, "grad_norm": 0.9921875, "learning_rate": 1.5613198085657804e-06, "loss": 0.4589, "step": 2750 }, { "epoch": 12.313856427378965, "grad_norm": 0.95703125, "learning_rate": 1.5261242470681813e-06, "loss": 0.4357, "step": 2760 }, { "epoch": 12.358375069560378, "grad_norm": 0.78515625, "learning_rate": 1.4912583341089516e-06, "loss": 0.3949, "step": 2770 }, { "epoch": 12.402893711741791, "grad_norm": 0.9140625, "learning_rate": 1.4567253782397073e-06, "loss": 0.4184, "step": 2780 }, { "epoch": 12.447412353923205, "grad_norm": 0.91796875, "learning_rate": 1.4225286564165785e-06, "loss": 0.4309, "step": 2790 }, { "epoch": 12.49193099610462, "grad_norm": 0.9453125, "learning_rate": 1.3886714136892287e-06, "loss": 0.4539, "step": 2800 }, { "epoch": 12.536449638286033, "grad_norm": 0.77734375, "learning_rate": 1.3551568628929434e-06, "loss": 0.4243, "step": 2810 }, { "epoch": 12.580968280467445, "grad_norm": 0.83984375, "learning_rate": 1.321988184343732e-06, "loss": 0.4039, "step": 2820 }, { "epoch": 12.625486922648859, "grad_norm": 1.0078125, "learning_rate": 1.2891685255365517e-06, "loss": 0.4182, "step": 2830 }, { "epoch": 12.670005564830273, "grad_norm": 0.96875, "learning_rate": 1.256701000846619e-06, "loss": 0.4146, "step": 2840 }, { "epoch": 12.714524207011687, "grad_norm": 0.8984375, "learning_rate": 1.22458869123388e-06, "loss": 0.434, "step": 2850 }, { "epoch": 12.759042849193099, "grad_norm": 0.890625, "learning_rate": 1.1928346439506526e-06, "loss": 0.4356, "step": 2860 }, { "epoch": 12.803561491374513, "grad_norm": 0.75390625, "learning_rate": 1.1614418722524506e-06, "loss": 0.4073, "step": 2870 }, { "epoch": 12.848080133555927, "grad_norm": 0.84375, "learning_rate": 1.1304133551120532e-06, "loss": 0.4376, "step": 2880 }, { "epoch": 12.89259877573734, "grad_norm": 0.72265625, "learning_rate": 1.0997520369368158e-06, "loss": 0.4078, "step": 2890 }, { "epoch": 12.937117417918753, "grad_norm": 0.87109375, "learning_rate": 1.0694608272892698e-06, "loss": 0.4329, "step": 2900 }, { "epoch": 12.981636060100167, "grad_norm": 0.9765625, "learning_rate": 1.0395426006110164e-06, "loss": 0.3766, "step": 2910 }, { "epoch": 13.02838063439065, "grad_norm": 0.8046875, "learning_rate": 1.0100001959499644e-06, "loss": 0.3808, "step": 2920 }, { "epoch": 13.072899276572064, "grad_norm": 0.64453125, "learning_rate": 9.808364166909256e-07, "loss": 0.4232, "step": 2930 }, { "epoch": 13.117417918753478, "grad_norm": 0.8046875, "learning_rate": 9.520540302895847e-07, "loss": 0.4332, "step": 2940 }, { "epoch": 13.161936560934892, "grad_norm": 0.6640625, "learning_rate": 9.236557680098918e-07, "loss": 0.4059, "step": 2950 }, { "epoch": 13.206455203116304, "grad_norm": 0.83984375, "learning_rate": 8.956443246648771e-07, "loss": 0.3704, "step": 2960 }, { "epoch": 13.250973845297718, "grad_norm": 0.8046875, "learning_rate": 8.680223583609399e-07, "loss": 0.4327, "step": 2970 }, { "epoch": 13.295492487479132, "grad_norm": 0.875, "learning_rate": 8.407924902455983e-07, "loss": 0.4229, "step": 2980 }, { "epoch": 13.340011129660546, "grad_norm": 0.80859375, "learning_rate": 8.139573042587729e-07, "loss": 0.4121, "step": 2990 }, { "epoch": 13.384529771841958, "grad_norm": 0.70703125, "learning_rate": 7.875193468875719e-07, "loss": 0.423, "step": 3000 }, { "epoch": 13.384529771841958, "eval_loss": 0.9250730872154236, "eval_runtime": 17.1049, "eval_samples_per_second": 23.385, "eval_steps_per_second": 23.385, "step": 3000 }, { "epoch": 13.429048414023372, "grad_norm": 0.90234375, "learning_rate": 7.614811269246631e-07, "loss": 0.4316, "step": 3010 }, { "epoch": 13.473567056204786, "grad_norm": 0.87890625, "learning_rate": 7.35845115230191e-07, "loss": 0.4104, "step": 3020 }, { "epoch": 13.5180856983862, "grad_norm": 0.76953125, "learning_rate": 7.106137444973177e-07, "loss": 0.4367, "step": 3030 }, { "epoch": 13.562604340567614, "grad_norm": 0.83984375, "learning_rate": 6.857894090213702e-07, "loss": 0.417, "step": 3040 }, { "epoch": 13.607122982749026, "grad_norm": 1.4765625, "learning_rate": 6.613744644726383e-07, "loss": 0.394, "step": 3050 }, { "epoch": 13.65164162493044, "grad_norm": 0.91796875, "learning_rate": 6.3737122767284e-07, "loss": 0.4172, "step": 3060 }, { "epoch": 13.696160267111853, "grad_norm": 0.86328125, "learning_rate": 6.137819763752656e-07, "loss": 0.4517, "step": 3070 }, { "epoch": 13.740678909293267, "grad_norm": 1.046875, "learning_rate": 5.90608949048635e-07, "loss": 0.4256, "step": 3080 }, { "epoch": 13.78519755147468, "grad_norm": 1.21875, "learning_rate": 5.678543446646811e-07, "loss": 0.4019, "step": 3090 }, { "epoch": 13.829716193656093, "grad_norm": 0.96875, "learning_rate": 5.455203224894857e-07, "loss": 0.453, "step": 3100 }, { "epoch": 13.874234835837507, "grad_norm": 0.92578125, "learning_rate": 5.236090018785705e-07, "loss": 0.4107, "step": 3110 }, { "epoch": 13.918753478018921, "grad_norm": 0.8984375, "learning_rate": 5.021224620757914e-07, "loss": 0.4475, "step": 3120 }, { "epoch": 13.963272120200333, "grad_norm": 0.9296875, "learning_rate": 4.810627420160269e-07, "loss": 0.4322, "step": 3130 }, { "epoch": 14.010016694490819, "grad_norm": 0.91796875, "learning_rate": 4.604318401317009e-07, "loss": 0.4318, "step": 3140 }, { "epoch": 14.054535336672231, "grad_norm": 1.3046875, "learning_rate": 4.402317141631407e-07, "loss": 0.4489, "step": 3150 }, { "epoch": 14.099053978853645, "grad_norm": 1.40625, "learning_rate": 4.2046428097279766e-07, "loss": 0.4381, "step": 3160 }, { "epoch": 14.143572621035059, "grad_norm": 1.1796875, "learning_rate": 4.011314163633573e-07, "loss": 0.4107, "step": 3170 }, { "epoch": 14.188091263216473, "grad_norm": 1.3203125, "learning_rate": 3.822349548997295e-07, "loss": 0.4399, "step": 3180 }, { "epoch": 14.232609905397885, "grad_norm": 1.3984375, "learning_rate": 3.637766897349654e-07, "loss": 0.417, "step": 3190 }, { "epoch": 14.277128547579299, "grad_norm": 3.5, "learning_rate": 3.4575837244009367e-07, "loss": 0.4449, "step": 3200 }, { "epoch": 14.321647189760712, "grad_norm": 3.671875, "learning_rate": 3.281817128379139e-07, "loss": 0.3875, "step": 3210 }, { "epoch": 14.366165831942126, "grad_norm": 4.625, "learning_rate": 3.1104837884073866e-07, "loss": 0.4187, "step": 3220 }, { "epoch": 14.410684474123538, "grad_norm": 3.875, "learning_rate": 2.943599962921279e-07, "loss": 0.41, "step": 3230 }, { "epoch": 14.455203116304952, "grad_norm": 3.828125, "learning_rate": 2.7811814881259503e-07, "loss": 0.4016, "step": 3240 }, { "epoch": 14.499721758486366, "grad_norm": 8.1875, "learning_rate": 2.623243776493434e-07, "loss": 0.3906, "step": 3250 }, { "epoch": 14.54424040066778, "grad_norm": 8.125, "learning_rate": 2.469801815300027e-07, "loss": 0.4241, "step": 3260 }, { "epoch": 14.588759042849194, "grad_norm": 9.5625, "learning_rate": 2.3208701652041697e-07, "loss": 0.4104, "step": 3270 }, { "epoch": 14.633277685030606, "grad_norm": 7.59375, "learning_rate": 2.1764629588646667e-07, "loss": 0.4031, "step": 3280 }, { "epoch": 14.67779632721202, "grad_norm": 7.125, "learning_rate": 2.036593899599615e-07, "loss": 0.3911, "step": 3290 }, { "epoch": 14.722314969393434, "grad_norm": 2.546875, "learning_rate": 1.9012762600860656e-07, "loss": 0.4137, "step": 3300 }, { "epoch": 14.766833611574848, "grad_norm": 2.609375, "learning_rate": 1.7705228811005004e-07, "loss": 0.4559, "step": 3310 }, { "epoch": 14.81135225375626, "grad_norm": 2.640625, "learning_rate": 1.6443461703003427e-07, "loss": 0.3986, "step": 3320 }, { "epoch": 14.855870895937674, "grad_norm": 2.484375, "learning_rate": 1.5227581010465341e-07, "loss": 0.4073, "step": 3330 }, { "epoch": 14.900389538119088, "grad_norm": 2.734375, "learning_rate": 1.4057702112673765e-07, "loss": 0.4137, "step": 3340 }, { "epoch": 14.944908180300501, "grad_norm": 2.25, "learning_rate": 1.2933936023636073e-07, "loss": 0.4283, "step": 3350 }, { "epoch": 14.989426822481914, "grad_norm": 2.234375, "learning_rate": 1.185638938154976e-07, "loss": 0.4097, "step": 3360 }, { "epoch": 15.0361713967724, "grad_norm": 2.203125, "learning_rate": 1.08251644386832e-07, "loss": 0.4326, "step": 3370 }, { "epoch": 15.080690038953811, "grad_norm": 3.0, "learning_rate": 9.84035905167241e-08, "loss": 0.4338, "step": 3380 }, { "epoch": 15.125208681135225, "grad_norm": 2.359375, "learning_rate": 8.902066672235144e-08, "loss": 0.4197, "step": 3390 }, { "epoch": 15.16972732331664, "grad_norm": 2.234375, "learning_rate": 8.010376338302872e-08, "loss": 0.4277, "step": 3400 }, { "epoch": 15.214245965498053, "grad_norm": 2.25, "learning_rate": 7.165372665571879e-08, "loss": 0.4325, "step": 3410 }, { "epoch": 15.258764607679465, "grad_norm": 2.171875, "learning_rate": 6.367135839473349e-08, "loss": 0.399, "step": 3420 }, { "epoch": 15.303283249860879, "grad_norm": 2.578125, "learning_rate": 5.6157416075648954e-08, "loss": 0.4368, "step": 3430 }, { "epoch": 15.347801892042293, "grad_norm": 2.359375, "learning_rate": 4.911261272341872e-08, "loss": 0.4029, "step": 3440 }, { "epoch": 15.392320534223707, "grad_norm": 2.296875, "learning_rate": 4.25376168447178e-08, "loss": 0.4269, "step": 3450 }, { "epoch": 15.436839176405119, "grad_norm": 2.359375, "learning_rate": 3.643305236450345e-08, "loss": 0.4442, "step": 3460 }, { "epoch": 15.481357818586533, "grad_norm": 2.28125, "learning_rate": 3.079949856680975e-08, "loss": 0.4207, "step": 3470 }, { "epoch": 15.525876460767947, "grad_norm": 2.671875, "learning_rate": 2.5637490039775447e-08, "loss": 0.4257, "step": 3480 }, { "epoch": 15.57039510294936, "grad_norm": 2.296875, "learning_rate": 2.0947516624917898e-08, "loss": 0.4161, "step": 3490 }, { "epoch": 15.614913745130773, "grad_norm": 2.015625, "learning_rate": 1.6730023370645775e-08, "loss": 0.3976, "step": 3500 }, { "epoch": 15.659432387312187, "grad_norm": 2.140625, "learning_rate": 1.298541049003288e-08, "loss": 0.4074, "step": 3510 }, { "epoch": 15.7039510294936, "grad_norm": 2.421875, "learning_rate": 9.714033322833494e-09, "loss": 0.4155, "step": 3520 }, { "epoch": 15.748469671675014, "grad_norm": 2.203125, "learning_rate": 6.9162023017699255e-09, "loss": 0.3747, "step": 3530 }, { "epoch": 15.792988313856428, "grad_norm": 2.09375, "learning_rate": 4.592182923068289e-09, "loss": 0.3766, "step": 3540 }, { "epoch": 15.83750695603784, "grad_norm": 2.125, "learning_rate": 2.7421957212697692e-09, "loss": 0.4017, "step": 3550 }, { "epoch": 15.882025598219254, "grad_norm": 1.90625, "learning_rate": 1.3664162482990296e-09, "loss": 0.42, "step": 3560 }, { "epoch": 15.926544240400668, "grad_norm": 1.7421875, "learning_rate": 4.649750568080924e-10, "loss": 0.44, "step": 3570 }, { "epoch": 15.971062882582082, "grad_norm": 2.421875, "learning_rate": 3.795768778680487e-11, "loss": 0.4156, "step": 3580 }, { "epoch": 15.988870339454646, "step": 3584, "total_flos": 8.551781210951516e+17, "train_loss": 0.7042842949075359, "train_runtime": 4850.1152, "train_samples_per_second": 11.853, "train_steps_per_second": 0.739 } ], "logging_steps": 10, "max_steps": 3584, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.551781210951516e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }