{ "best_metric": 1.1947814226150513, "best_model_checkpoint": "./output/checkpoint-4650", "epoch": 0.21500559910414332, "eval_steps": 150, "global_step": 4800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004479283314669653, "grad_norm": 6.820243835449219, "learning_rate": 7.500000000000001e-07, "loss": 1.2628, "step": 10 }, { "epoch": 0.0008958566629339306, "grad_norm": 5.822151184082031, "learning_rate": 1.5000000000000002e-06, "loss": 1.3652, "step": 20 }, { "epoch": 0.0013437849944008958, "grad_norm": 4.442959785461426, "learning_rate": 2.25e-06, "loss": 1.412, "step": 30 }, { "epoch": 0.0017917133258678612, "grad_norm": 9.916281700134277, "learning_rate": 3.0000000000000005e-06, "loss": 1.5213, "step": 40 }, { "epoch": 0.0022396416573348264, "grad_norm": 22.53717613220215, "learning_rate": 3.7500000000000005e-06, "loss": 1.3189, "step": 50 }, { "epoch": 0.0026875699888017916, "grad_norm": 5.07314920425415, "learning_rate": 4.5e-06, "loss": 1.3022, "step": 60 }, { "epoch": 0.003135498320268757, "grad_norm": 9.401494026184082, "learning_rate": 5.2500000000000006e-06, "loss": 1.5065, "step": 70 }, { "epoch": 0.0035834266517357225, "grad_norm": 8.749906539916992, "learning_rate": 6.000000000000001e-06, "loss": 1.1579, "step": 80 }, { "epoch": 0.004031354983202688, "grad_norm": 6.749314785003662, "learning_rate": 6.7500000000000014e-06, "loss": 1.2524, "step": 90 }, { "epoch": 0.004479283314669653, "grad_norm": 8.411529541015625, "learning_rate": 7.500000000000001e-06, "loss": 1.3242, "step": 100 }, { "epoch": 0.004927211646136618, "grad_norm": 5.293492794036865, "learning_rate": 7.499922926093874e-06, "loss": 0.9967, "step": 110 }, { "epoch": 0.005375139977603583, "grad_norm": 8.860544204711914, "learning_rate": 7.499691707543699e-06, "loss": 1.1881, "step": 120 }, { "epoch": 0.0058230683090705485, "grad_norm": 9.859148979187012, "learning_rate": 7.499306353853963e-06, "loss": 1.0598, "step": 130 }, { "epoch": 0.006270996640537514, "grad_norm": 4.37281608581543, "learning_rate": 7.49876688086505e-06, "loss": 1.1233, "step": 140 }, { "epoch": 0.006718924972004479, "grad_norm": 4.489595890045166, "learning_rate": 7.4980733107525805e-06, "loss": 1.2183, "step": 150 }, { "epoch": 0.006718924972004479, "eval_loss": 1.282976508140564, "eval_runtime": 51.7095, "eval_samples_per_second": 9.669, "eval_steps_per_second": 9.669, "step": 150 }, { "epoch": 0.007166853303471445, "grad_norm": 6.339463233947754, "learning_rate": 7.4972256720265044e-06, "loss": 1.1818, "step": 160 }, { "epoch": 0.00761478163493841, "grad_norm": 6.762680530548096, "learning_rate": 7.496223999529932e-06, "loss": 1.0349, "step": 170 }, { "epoch": 0.008062709966405375, "grad_norm": 7.486023426055908, "learning_rate": 7.4950683344376926e-06, "loss": 1.1735, "step": 180 }, { "epoch": 0.00851063829787234, "grad_norm": 4.099631309509277, "learning_rate": 7.4937587242546544e-06, "loss": 1.2452, "step": 190 }, { "epoch": 0.008958566629339306, "grad_norm": 5.422396183013916, "learning_rate": 7.492295222813762e-06, "loss": 1.1032, "step": 200 }, { "epoch": 0.009406494960806271, "grad_norm": 6.336536407470703, "learning_rate": 7.490677890273828e-06, "loss": 1.0852, "step": 210 }, { "epoch": 0.009854423292273236, "grad_norm": 4.766495704650879, "learning_rate": 7.488906793117058e-06, "loss": 1.2168, "step": 220 }, { "epoch": 0.010302351623740201, "grad_norm": 5.892153263092041, "learning_rate": 7.486982004146319e-06, "loss": 1.1595, "step": 230 }, { "epoch": 0.010750279955207167, "grad_norm": 4.957208633422852, "learning_rate": 7.484903602482148e-06, "loss": 1.1423, "step": 240 }, { "epoch": 0.011198208286674132, "grad_norm": 4.198282718658447, "learning_rate": 7.4826716735594945e-06, "loss": 1.0562, "step": 250 }, { "epoch": 0.011646136618141097, "grad_norm": 3.4756815433502197, "learning_rate": 7.480286309124216e-06, "loss": 0.9894, "step": 260 }, { "epoch": 0.012094064949608062, "grad_norm": 4.725418567657471, "learning_rate": 7.477747607229302e-06, "loss": 1.1761, "step": 270 }, { "epoch": 0.012541993281075027, "grad_norm": 4.241955280303955, "learning_rate": 7.475055672230844e-06, "loss": 1.1118, "step": 280 }, { "epoch": 0.012989921612541993, "grad_norm": 5.7904863357543945, "learning_rate": 7.472210614783745e-06, "loss": 1.0932, "step": 290 }, { "epoch": 0.013437849944008958, "grad_norm": 4.546011924743652, "learning_rate": 7.469212551837173e-06, "loss": 1.1187, "step": 300 }, { "epoch": 0.013437849944008958, "eval_loss": 1.26471745967865, "eval_runtime": 51.7822, "eval_samples_per_second": 9.656, "eval_steps_per_second": 9.656, "step": 300 }, { "epoch": 0.013885778275475923, "grad_norm": 6.256772994995117, "learning_rate": 7.4660616066297565e-06, "loss": 1.2176, "step": 310 }, { "epoch": 0.01433370660694289, "grad_norm": 7.437366485595703, "learning_rate": 7.462757908684509e-06, "loss": 1.046, "step": 320 }, { "epoch": 0.014781634938409855, "grad_norm": 8.049488067626953, "learning_rate": 7.459301593803512e-06, "loss": 1.2396, "step": 330 }, { "epoch": 0.01522956326987682, "grad_norm": 5.115020751953125, "learning_rate": 7.455692804062335e-06, "loss": 1.1018, "step": 340 }, { "epoch": 0.015677491601343786, "grad_norm": 5.805201530456543, "learning_rate": 7.451931687804189e-06, "loss": 1.0083, "step": 350 }, { "epoch": 0.01612541993281075, "grad_norm": 5.960669040679932, "learning_rate": 7.448018399633831e-06, "loss": 1.1773, "step": 360 }, { "epoch": 0.016573348264277716, "grad_norm": 4.82655143737793, "learning_rate": 7.443953100411214e-06, "loss": 1.2279, "step": 370 }, { "epoch": 0.01702127659574468, "grad_norm": 5.768619060516357, "learning_rate": 7.439735957244862e-06, "loss": 1.0924, "step": 380 }, { "epoch": 0.017469204927211646, "grad_norm": 4.603348731994629, "learning_rate": 7.435367143485015e-06, "loss": 0.9547, "step": 390 }, { "epoch": 0.01791713325867861, "grad_norm": 3.802041530609131, "learning_rate": 7.430846838716496e-06, "loss": 1.0569, "step": 400 }, { "epoch": 0.018365061590145577, "grad_norm": 4.473762035369873, "learning_rate": 7.426175228751328e-06, "loss": 1.1299, "step": 410 }, { "epoch": 0.018812989921612542, "grad_norm": 4.674028396606445, "learning_rate": 7.421352505621099e-06, "loss": 1.0512, "step": 420 }, { "epoch": 0.019260918253079507, "grad_norm": 5.1446852684021, "learning_rate": 7.416378867569069e-06, "loss": 1.2024, "step": 430 }, { "epoch": 0.019708846584546472, "grad_norm": 3.742156744003296, "learning_rate": 7.411254519042017e-06, "loss": 1.1778, "step": 440 }, { "epoch": 0.020156774916013438, "grad_norm": 4.0376200675964355, "learning_rate": 7.4059796706818396e-06, "loss": 1.1754, "step": 450 }, { "epoch": 0.020156774916013438, "eval_loss": 1.2499778270721436, "eval_runtime": 51.5995, "eval_samples_per_second": 9.69, "eval_steps_per_second": 9.69, "step": 450 }, { "epoch": 0.020604703247480403, "grad_norm": 3.672325372695923, "learning_rate": 7.400554539316894e-06, "loss": 1.1627, "step": 460 }, { "epoch": 0.021052631578947368, "grad_norm": 4.949635982513428, "learning_rate": 7.394979347953081e-06, "loss": 1.3115, "step": 470 }, { "epoch": 0.021500559910414333, "grad_norm": 4.03855037689209, "learning_rate": 7.389254325764681e-06, "loss": 1.1176, "step": 480 }, { "epoch": 0.0219484882418813, "grad_norm": 4.981250762939453, "learning_rate": 7.383379708084934e-06, "loss": 1.0668, "step": 490 }, { "epoch": 0.022396416573348264, "grad_norm": 4.68571138381958, "learning_rate": 7.377355736396362e-06, "loss": 1.1235, "step": 500 }, { "epoch": 0.02284434490481523, "grad_norm": 5.7003326416015625, "learning_rate": 7.371182658320847e-06, "loss": 1.0535, "step": 510 }, { "epoch": 0.023292273236282194, "grad_norm": 2.357079029083252, "learning_rate": 7.36486072760945e-06, "loss": 0.9768, "step": 520 }, { "epoch": 0.02374020156774916, "grad_norm": 4.828664779663086, "learning_rate": 7.358390204131984e-06, "loss": 1.0385, "step": 530 }, { "epoch": 0.024188129899216124, "grad_norm": 3.4303321838378906, "learning_rate": 7.3517713538663235e-06, "loss": 0.9826, "step": 540 }, { "epoch": 0.02463605823068309, "grad_norm": 8.705097198486328, "learning_rate": 7.345004448887478e-06, "loss": 1.0988, "step": 550 }, { "epoch": 0.025083986562150055, "grad_norm": 4.806099891662598, "learning_rate": 7.3380897673564085e-06, "loss": 1.2765, "step": 560 }, { "epoch": 0.02553191489361702, "grad_norm": 3.948829174041748, "learning_rate": 7.33102759350859e-06, "loss": 1.2548, "step": 570 }, { "epoch": 0.025979843225083985, "grad_norm": 8.706982612609863, "learning_rate": 7.323818217642328e-06, "loss": 1.1907, "step": 580 }, { "epoch": 0.02642777155655095, "grad_norm": 4.196287155151367, "learning_rate": 7.316461936106827e-06, "loss": 1.1541, "step": 590 }, { "epoch": 0.026875699888017916, "grad_norm": 4.2185187339782715, "learning_rate": 7.3089590512900084e-06, "loss": 1.0761, "step": 600 }, { "epoch": 0.026875699888017916, "eval_loss": 1.2407419681549072, "eval_runtime": 51.6589, "eval_samples_per_second": 9.679, "eval_steps_per_second": 9.679, "step": 600 }, { "epoch": 0.02732362821948488, "grad_norm": 4.50939416885376, "learning_rate": 7.301309871606081e-06, "loss": 1.1746, "step": 610 }, { "epoch": 0.027771556550951846, "grad_norm": 5.48988676071167, "learning_rate": 7.293514711482861e-06, "loss": 1.0518, "step": 620 }, { "epoch": 0.028219484882418815, "grad_norm": 4.441885471343994, "learning_rate": 7.285573891348849e-06, "loss": 1.0679, "step": 630 }, { "epoch": 0.02866741321388578, "grad_norm": 6.711030006408691, "learning_rate": 7.27748773762006e-06, "loss": 1.2901, "step": 640 }, { "epoch": 0.029115341545352745, "grad_norm": 5.328275680541992, "learning_rate": 7.269256582686603e-06, "loss": 1.1749, "step": 650 }, { "epoch": 0.02956326987681971, "grad_norm": 3.016313314437866, "learning_rate": 7.260880764899016e-06, "loss": 1.1398, "step": 660 }, { "epoch": 0.030011198208286675, "grad_norm": 4.6470866203308105, "learning_rate": 7.252360628554363e-06, "loss": 1.0427, "step": 670 }, { "epoch": 0.03045912653975364, "grad_norm": 9.044170379638672, "learning_rate": 7.243696523882079e-06, "loss": 1.0913, "step": 680 }, { "epoch": 0.030907054871220606, "grad_norm": 4.983870029449463, "learning_rate": 7.2348888070295705e-06, "loss": 1.1174, "step": 690 }, { "epoch": 0.03135498320268757, "grad_norm": 10.38315486907959, "learning_rate": 7.225937840047583e-06, "loss": 1.2386, "step": 700 }, { "epoch": 0.031802911534154536, "grad_norm": 5.104282855987549, "learning_rate": 7.216843990875307e-06, "loss": 1.1014, "step": 710 }, { "epoch": 0.0322508398656215, "grad_norm": 5.493166446685791, "learning_rate": 7.207607633325266e-06, "loss": 1.2569, "step": 720 }, { "epoch": 0.03269876819708847, "grad_norm": 5.069271564483643, "learning_rate": 7.198229147067941e-06, "loss": 1.1938, "step": 730 }, { "epoch": 0.03314669652855543, "grad_norm": 5.183401107788086, "learning_rate": 7.18870891761617e-06, "loss": 0.9859, "step": 740 }, { "epoch": 0.0335946248600224, "grad_norm": 4.3622965812683105, "learning_rate": 7.1790473363092974e-06, "loss": 1.1359, "step": 750 }, { "epoch": 0.0335946248600224, "eval_loss": 1.2344202995300293, "eval_runtime": 51.6321, "eval_samples_per_second": 9.684, "eval_steps_per_second": 9.684, "step": 750 }, { "epoch": 0.03404255319148936, "grad_norm": 4.141931056976318, "learning_rate": 7.169244800297089e-06, "loss": 1.2613, "step": 760 }, { "epoch": 0.03449048152295633, "grad_norm": 4.191932201385498, "learning_rate": 7.159301712523407e-06, "loss": 1.1802, "step": 770 }, { "epoch": 0.03493840985442329, "grad_norm": 4.759700775146484, "learning_rate": 7.149218481709644e-06, "loss": 1.0651, "step": 780 }, { "epoch": 0.03538633818589026, "grad_norm": 3.969430923461914, "learning_rate": 7.1389955223379266e-06, "loss": 0.9129, "step": 790 }, { "epoch": 0.03583426651735722, "grad_norm": 5.1956467628479, "learning_rate": 7.128633254634072e-06, "loss": 1.2688, "step": 800 }, { "epoch": 0.03628219484882419, "grad_norm": 3.615705966949463, "learning_rate": 7.118132104550322e-06, "loss": 1.1092, "step": 810 }, { "epoch": 0.036730123180291153, "grad_norm": 3.635277271270752, "learning_rate": 7.107492503747826e-06, "loss": 1.0265, "step": 820 }, { "epoch": 0.03717805151175812, "grad_norm": 4.518077373504639, "learning_rate": 7.096714889578898e-06, "loss": 1.0817, "step": 830 }, { "epoch": 0.037625979843225084, "grad_norm": 6.652565002441406, "learning_rate": 7.085799705069046e-06, "loss": 0.9709, "step": 840 }, { "epoch": 0.03807390817469205, "grad_norm": 5.337361812591553, "learning_rate": 7.0747473988987515e-06, "loss": 1.0883, "step": 850 }, { "epoch": 0.038521836506159014, "grad_norm": 5.067249774932861, "learning_rate": 7.063558425385033e-06, "loss": 1.08, "step": 860 }, { "epoch": 0.03896976483762598, "grad_norm": 3.9859232902526855, "learning_rate": 7.052233244462769e-06, "loss": 1.0063, "step": 870 }, { "epoch": 0.039417693169092945, "grad_norm": 5.297623634338379, "learning_rate": 7.040772321665788e-06, "loss": 0.9638, "step": 880 }, { "epoch": 0.03986562150055991, "grad_norm": 6.088709354400635, "learning_rate": 7.029176128107734e-06, "loss": 1.2673, "step": 890 }, { "epoch": 0.040313549832026875, "grad_norm": 7.997159957885742, "learning_rate": 7.017445140462711e-06, "loss": 0.9986, "step": 900 }, { "epoch": 0.040313549832026875, "eval_loss": 1.2309150695800781, "eval_runtime": 51.612, "eval_samples_per_second": 9.688, "eval_steps_per_second": 9.688, "step": 900 }, { "epoch": 0.04076147816349384, "grad_norm": 6.393094062805176, "learning_rate": 7.00557984094567e-06, "loss": 1.066, "step": 910 }, { "epoch": 0.041209406494960805, "grad_norm": 4.47462797164917, "learning_rate": 6.993580717292601e-06, "loss": 1.3117, "step": 920 }, { "epoch": 0.04165733482642777, "grad_norm": 4.160079479217529, "learning_rate": 6.981448262740483e-06, "loss": 1.3003, "step": 930 }, { "epoch": 0.042105263157894736, "grad_norm": 5.260162353515625, "learning_rate": 6.969182976006999e-06, "loss": 1.312, "step": 940 }, { "epoch": 0.0425531914893617, "grad_norm": 4.503716468811035, "learning_rate": 6.95678536127005e-06, "loss": 1.185, "step": 950 }, { "epoch": 0.043001119820828666, "grad_norm": 3.7414872646331787, "learning_rate": 6.944255928147017e-06, "loss": 1.1585, "step": 960 }, { "epoch": 0.04344904815229563, "grad_norm": 5.410964012145996, "learning_rate": 6.931595191673823e-06, "loss": 1.1403, "step": 970 }, { "epoch": 0.0438969764837626, "grad_norm": 4.388716220855713, "learning_rate": 6.9188036722837555e-06, "loss": 1.0452, "step": 980 }, { "epoch": 0.04434490481522956, "grad_norm": 2.7749533653259277, "learning_rate": 6.905881895786076e-06, "loss": 1.0638, "step": 990 }, { "epoch": 0.04479283314669653, "grad_norm": 5.431761741638184, "learning_rate": 6.892830393344403e-06, "loss": 1.2718, "step": 1000 }, { "epoch": 0.04524076147816349, "grad_norm": 4.384571552276611, "learning_rate": 6.879649701454886e-06, "loss": 1.0594, "step": 1010 }, { "epoch": 0.04568868980963046, "grad_norm": 5.040534019470215, "learning_rate": 6.866340361924141e-06, "loss": 1.2255, "step": 1020 }, { "epoch": 0.04613661814109742, "grad_norm": 4.800682544708252, "learning_rate": 6.852902921846988e-06, "loss": 1.1093, "step": 1030 }, { "epoch": 0.04658454647256439, "grad_norm": 5.662080764770508, "learning_rate": 6.8393379335839565e-06, "loss": 1.2003, "step": 1040 }, { "epoch": 0.04703247480403135, "grad_norm": 3.93361234664917, "learning_rate": 6.825645954738586e-06, "loss": 1.0652, "step": 1050 }, { "epoch": 0.04703247480403135, "eval_loss": 1.2271474599838257, "eval_runtime": 51.5746, "eval_samples_per_second": 9.695, "eval_steps_per_second": 9.695, "step": 1050 }, { "epoch": 0.04748040313549832, "grad_norm": 4.918002605438232, "learning_rate": 6.811827548134495e-06, "loss": 1.156, "step": 1060 }, { "epoch": 0.047928331466965284, "grad_norm": 3.533487319946289, "learning_rate": 6.797883281792261e-06, "loss": 1.0533, "step": 1070 }, { "epoch": 0.04837625979843225, "grad_norm": 4.698348045349121, "learning_rate": 6.783813728906054e-06, "loss": 1.2621, "step": 1080 }, { "epoch": 0.048824188129899214, "grad_norm": 3.90852427482605, "learning_rate": 6.769619467820086e-06, "loss": 1.0754, "step": 1090 }, { "epoch": 0.04927211646136618, "grad_norm": 6.924786567687988, "learning_rate": 6.755301082004838e-06, "loss": 1.0617, "step": 1100 }, { "epoch": 0.049720044792833144, "grad_norm": 5.685960292816162, "learning_rate": 6.740859160033068e-06, "loss": 1.2185, "step": 1110 }, { "epoch": 0.05016797312430011, "grad_norm": 5.533092975616455, "learning_rate": 6.726294295555623e-06, "loss": 1.0583, "step": 1120 }, { "epoch": 0.050615901455767075, "grad_norm": 4.5029988288879395, "learning_rate": 6.711607087277034e-06, "loss": 1.1781, "step": 1130 }, { "epoch": 0.05106382978723404, "grad_norm": 3.2203736305236816, "learning_rate": 6.69679813893091e-06, "loss": 1.151, "step": 1140 }, { "epoch": 0.051511758118701005, "grad_norm": 6.602795600891113, "learning_rate": 6.681868059255113e-06, "loss": 1.1373, "step": 1150 }, { "epoch": 0.05195968645016797, "grad_norm": 3.071552038192749, "learning_rate": 6.666817461966741e-06, "loss": 1.1554, "step": 1160 }, { "epoch": 0.052407614781634936, "grad_norm": 5.886751174926758, "learning_rate": 6.651646965736902e-06, "loss": 1.1328, "step": 1170 }, { "epoch": 0.0528555431131019, "grad_norm": 4.323307991027832, "learning_rate": 6.636357194165274e-06, "loss": 1.1535, "step": 1180 }, { "epoch": 0.053303471444568866, "grad_norm": 4.585876941680908, "learning_rate": 6.620948775754481e-06, "loss": 1.1636, "step": 1190 }, { "epoch": 0.05375139977603583, "grad_norm": 3.9351437091827393, "learning_rate": 6.605422343884255e-06, "loss": 1.2689, "step": 1200 }, { "epoch": 0.05375139977603583, "eval_loss": 1.2224195003509521, "eval_runtime": 51.5936, "eval_samples_per_second": 9.691, "eval_steps_per_second": 9.691, "step": 1200 }, { "epoch": 0.054199328107502796, "grad_norm": 3.1242146492004395, "learning_rate": 6.589778536785396e-06, "loss": 1.2646, "step": 1210 }, { "epoch": 0.05464725643896976, "grad_norm": 3.1645703315734863, "learning_rate": 6.5740179975135426e-06, "loss": 0.9831, "step": 1220 }, { "epoch": 0.05509518477043673, "grad_norm": 6.550941467285156, "learning_rate": 6.5581413739227314e-06, "loss": 1.1777, "step": 1230 }, { "epoch": 0.05554311310190369, "grad_norm": 17.51181983947754, "learning_rate": 6.542149318638777e-06, "loss": 1.0765, "step": 1240 }, { "epoch": 0.055991041433370664, "grad_norm": 6.8737664222717285, "learning_rate": 6.526042489032434e-06, "loss": 1.0107, "step": 1250 }, { "epoch": 0.05643896976483763, "grad_norm": 3.5256145000457764, "learning_rate": 6.509821547192383e-06, "loss": 1.1973, "step": 1260 }, { "epoch": 0.056886898096304594, "grad_norm": 5.974047660827637, "learning_rate": 6.493487159898006e-06, "loss": 1.2409, "step": 1270 }, { "epoch": 0.05733482642777156, "grad_norm": 3.98787522315979, "learning_rate": 6.477039998591991e-06, "loss": 1.3272, "step": 1280 }, { "epoch": 0.057782754759238525, "grad_norm": 5.225778102874756, "learning_rate": 6.460480739352719e-06, "loss": 1.2937, "step": 1290 }, { "epoch": 0.05823068309070549, "grad_norm": 3.719729423522949, "learning_rate": 6.4438100628664795e-06, "loss": 1.0965, "step": 1300 }, { "epoch": 0.058678611422172455, "grad_norm": 2.8820245265960693, "learning_rate": 6.4270286543994874e-06, "loss": 1.2178, "step": 1310 }, { "epoch": 0.05912653975363942, "grad_norm": 3.031202793121338, "learning_rate": 6.410137203769718e-06, "loss": 1.354, "step": 1320 }, { "epoch": 0.059574468085106386, "grad_norm": 3.010680675506592, "learning_rate": 6.393136405318545e-06, "loss": 1.185, "step": 1330 }, { "epoch": 0.06002239641657335, "grad_norm": 3.756014823913574, "learning_rate": 6.376026957882207e-06, "loss": 1.1636, "step": 1340 }, { "epoch": 0.060470324748040316, "grad_norm": 4.391636848449707, "learning_rate": 6.3588095647630754e-06, "loss": 1.2252, "step": 1350 }, { "epoch": 0.060470324748040316, "eval_loss": 1.222408652305603, "eval_runtime": 51.5211, "eval_samples_per_second": 9.705, "eval_steps_per_second": 9.705, "step": 1350 }, { "epoch": 0.06091825307950728, "grad_norm": 3.5359737873077393, "learning_rate": 6.341484933700744e-06, "loss": 1.0688, "step": 1360 }, { "epoch": 0.061366181410974247, "grad_norm": 4.412395477294922, "learning_rate": 6.32405377684294e-06, "loss": 1.1889, "step": 1370 }, { "epoch": 0.06181410974244121, "grad_norm": 7.099231719970703, "learning_rate": 6.306516810716249e-06, "loss": 1.0922, "step": 1380 }, { "epoch": 0.06226203807390818, "grad_norm": 3.257270097732544, "learning_rate": 6.288874756196662e-06, "loss": 1.2291, "step": 1390 }, { "epoch": 0.06270996640537514, "grad_norm": 3.6133875846862793, "learning_rate": 6.271128338479939e-06, "loss": 1.0567, "step": 1400 }, { "epoch": 0.06315789473684211, "grad_norm": 4.996825695037842, "learning_rate": 6.253278287051806e-06, "loss": 1.1242, "step": 1410 }, { "epoch": 0.06360582306830907, "grad_norm": 5.642391204833984, "learning_rate": 6.235325335657962e-06, "loss": 1.1998, "step": 1420 }, { "epoch": 0.06405375139977604, "grad_norm": 4.652320384979248, "learning_rate": 6.217270222273923e-06, "loss": 1.0647, "step": 1430 }, { "epoch": 0.064501679731243, "grad_norm": 8.814513206481934, "learning_rate": 6.1991136890746825e-06, "loss": 0.97, "step": 1440 }, { "epoch": 0.06494960806270997, "grad_norm": 4.535324573516846, "learning_rate": 6.180856482404208e-06, "loss": 1.0829, "step": 1450 }, { "epoch": 0.06539753639417693, "grad_norm": 5.13389778137207, "learning_rate": 6.162499352744754e-06, "loss": 1.3333, "step": 1460 }, { "epoch": 0.0658454647256439, "grad_norm": 4.871939182281494, "learning_rate": 6.144043054686022e-06, "loss": 1.1397, "step": 1470 }, { "epoch": 0.06629339305711086, "grad_norm": 3.31581449508667, "learning_rate": 6.125488346894139e-06, "loss": 1.0983, "step": 1480 }, { "epoch": 0.06674132138857783, "grad_norm": 6.067586898803711, "learning_rate": 6.106835992080464e-06, "loss": 1.0931, "step": 1490 }, { "epoch": 0.0671892497200448, "grad_norm": 4.4560465812683105, "learning_rate": 6.088086756970252e-06, "loss": 1.0743, "step": 1500 }, { "epoch": 0.0671892497200448, "eval_loss": 1.21743643283844, "eval_runtime": 51.7437, "eval_samples_per_second": 9.663, "eval_steps_per_second": 9.663, "step": 1500 }, { "epoch": 0.06763717805151176, "grad_norm": 6.724518775939941, "learning_rate": 6.0692414122711184e-06, "loss": 1.2655, "step": 1510 }, { "epoch": 0.06808510638297872, "grad_norm": 4.3255085945129395, "learning_rate": 6.050300732641376e-06, "loss": 1.0058, "step": 1520 }, { "epoch": 0.06853303471444569, "grad_norm": 2.7948145866394043, "learning_rate": 6.0312654966581755e-06, "loss": 1.1331, "step": 1530 }, { "epoch": 0.06898096304591265, "grad_norm": 4.223801612854004, "learning_rate": 6.012136486785512e-06, "loss": 0.9267, "step": 1540 }, { "epoch": 0.06942889137737962, "grad_norm": 8.328617095947266, "learning_rate": 5.992914489342061e-06, "loss": 1.0601, "step": 1550 }, { "epoch": 0.06987681970884659, "grad_norm": 3.9401023387908936, "learning_rate": 5.9736002944688474e-06, "loss": 1.1296, "step": 1560 }, { "epoch": 0.07032474804031355, "grad_norm": 4.462929725646973, "learning_rate": 5.954194696096775e-06, "loss": 1.1266, "step": 1570 }, { "epoch": 0.07077267637178052, "grad_norm": 9.879998207092285, "learning_rate": 5.9346984919139865e-06, "loss": 1.0835, "step": 1580 }, { "epoch": 0.07122060470324748, "grad_norm": 4.088196277618408, "learning_rate": 5.9151124833330745e-06, "loss": 1.1256, "step": 1590 }, { "epoch": 0.07166853303471445, "grad_norm": 6.066174030303955, "learning_rate": 5.895437475458137e-06, "loss": 1.2295, "step": 1600 }, { "epoch": 0.07211646136618141, "grad_norm": 4.754509449005127, "learning_rate": 5.875674277051688e-06, "loss": 1.1676, "step": 1610 }, { "epoch": 0.07256438969764838, "grad_norm": 3.898282289505005, "learning_rate": 5.855823700501406e-06, "loss": 1.2583, "step": 1620 }, { "epoch": 0.07301231802911534, "grad_norm": 5.35301399230957, "learning_rate": 5.835886561786744e-06, "loss": 1.3667, "step": 1630 }, { "epoch": 0.07346024636058231, "grad_norm": 6.24777889251709, "learning_rate": 5.815863680445385e-06, "loss": 1.1099, "step": 1640 }, { "epoch": 0.07390817469204927, "grad_norm": 3.7771286964416504, "learning_rate": 5.795755879539558e-06, "loss": 0.9985, "step": 1650 }, { "epoch": 0.07390817469204927, "eval_loss": 1.2118867635726929, "eval_runtime": 51.6701, "eval_samples_per_second": 9.677, "eval_steps_per_second": 9.677, "step": 1650 }, { "epoch": 0.07435610302351624, "grad_norm": 4.368626117706299, "learning_rate": 5.775563985622202e-06, "loss": 1.1, "step": 1660 }, { "epoch": 0.0748040313549832, "grad_norm": 6.341384410858154, "learning_rate": 5.755288828702987e-06, "loss": 1.0292, "step": 1670 }, { "epoch": 0.07525195968645017, "grad_norm": 5.869757652282715, "learning_rate": 5.734931242214204e-06, "loss": 1.0937, "step": 1680 }, { "epoch": 0.07569988801791713, "grad_norm": 4.857089042663574, "learning_rate": 5.7144920629764955e-06, "loss": 1.0987, "step": 1690 }, { "epoch": 0.0761478163493841, "grad_norm": 5.114626884460449, "learning_rate": 5.693972131164471e-06, "loss": 0.9623, "step": 1700 }, { "epoch": 0.07659574468085106, "grad_norm": 5.152310371398926, "learning_rate": 5.673372290272149e-06, "loss": 1.1423, "step": 1710 }, { "epoch": 0.07704367301231803, "grad_norm": 3.8204965591430664, "learning_rate": 5.652693387078309e-06, "loss": 1.0523, "step": 1720 }, { "epoch": 0.077491601343785, "grad_norm": 3.0346767902374268, "learning_rate": 5.631936271611667e-06, "loss": 1.0483, "step": 1730 }, { "epoch": 0.07793952967525196, "grad_norm": 4.436351299285889, "learning_rate": 5.611101797115939e-06, "loss": 1.0144, "step": 1740 }, { "epoch": 0.07838745800671892, "grad_norm": 5.614783763885498, "learning_rate": 5.5901908200147685e-06, "loss": 1.078, "step": 1750 }, { "epoch": 0.07883538633818589, "grad_norm": 4.0426926612854, "learning_rate": 5.56920419987652e-06, "loss": 1.2628, "step": 1760 }, { "epoch": 0.07928331466965285, "grad_norm": 5.30089807510376, "learning_rate": 5.5481427993789534e-06, "loss": 1.1257, "step": 1770 }, { "epoch": 0.07973124300111982, "grad_norm": 3.5508739948272705, "learning_rate": 5.527007484273746e-06, "loss": 1.0355, "step": 1780 }, { "epoch": 0.08017917133258678, "grad_norm": 4.027277946472168, "learning_rate": 5.5057991233509225e-06, "loss": 0.9196, "step": 1790 }, { "epoch": 0.08062709966405375, "grad_norm": 7.427858352661133, "learning_rate": 5.484518588403134e-06, "loss": 1.1913, "step": 1800 }, { "epoch": 0.08062709966405375, "eval_loss": 1.2111696004867554, "eval_runtime": 51.6854, "eval_samples_per_second": 9.674, "eval_steps_per_second": 9.674, "step": 1800 }, { "epoch": 0.08107502799552072, "grad_norm": 6.3730597496032715, "learning_rate": 5.463166754189819e-06, "loss": 1.171, "step": 1810 }, { "epoch": 0.08152295632698768, "grad_norm": 5.194447994232178, "learning_rate": 5.441744498401255e-06, "loss": 1.2202, "step": 1820 }, { "epoch": 0.08197088465845465, "grad_norm": 4.3045454025268555, "learning_rate": 5.4202527016224725e-06, "loss": 1.1318, "step": 1830 }, { "epoch": 0.08241881298992161, "grad_norm": 5.316900253295898, "learning_rate": 5.398692247297059e-06, "loss": 1.2107, "step": 1840 }, { "epoch": 0.08286674132138858, "grad_norm": 8.284939765930176, "learning_rate": 5.377064021690844e-06, "loss": 1.1683, "step": 1850 }, { "epoch": 0.08331466965285554, "grad_norm": 4.051226615905762, "learning_rate": 5.355368913855472e-06, "loss": 1.2974, "step": 1860 }, { "epoch": 0.0837625979843225, "grad_norm": 5.353118896484375, "learning_rate": 5.333607815591851e-06, "loss": 1.235, "step": 1870 }, { "epoch": 0.08421052631578947, "grad_norm": 5.097784996032715, "learning_rate": 5.311781621413497e-06, "loss": 1.0172, "step": 1880 }, { "epoch": 0.08465845464725644, "grad_norm": 3.437659978866577, "learning_rate": 5.289891228509769e-06, "loss": 1.0104, "step": 1890 }, { "epoch": 0.0851063829787234, "grad_norm": 4.631069660186768, "learning_rate": 5.267937536708977e-06, "loss": 1.0368, "step": 1900 }, { "epoch": 0.08555431131019037, "grad_norm": 5.044907569885254, "learning_rate": 5.245921448441407e-06, "loss": 1.0732, "step": 1910 }, { "epoch": 0.08600223964165733, "grad_norm": 3.2756667137145996, "learning_rate": 5.223843868702214e-06, "loss": 1.2815, "step": 1920 }, { "epoch": 0.0864501679731243, "grad_norm": 5.061473369598389, "learning_rate": 5.201705705014231e-06, "loss": 1.1059, "step": 1930 }, { "epoch": 0.08689809630459126, "grad_norm": 4.924319744110107, "learning_rate": 5.1795078673906575e-06, "loss": 1.0561, "step": 1940 }, { "epoch": 0.08734602463605823, "grad_norm": 4.019739627838135, "learning_rate": 5.1572512682976546e-06, "loss": 0.9889, "step": 1950 }, { "epoch": 0.08734602463605823, "eval_loss": 1.2077045440673828, "eval_runtime": 51.7283, "eval_samples_per_second": 9.666, "eval_steps_per_second": 9.666, "step": 1950 }, { "epoch": 0.0877939529675252, "grad_norm": 6.297740459442139, "learning_rate": 5.134936822616837e-06, "loss": 1.1664, "step": 1960 }, { "epoch": 0.08824188129899216, "grad_norm": 5.478749752044678, "learning_rate": 5.112565447607669e-06, "loss": 1.2503, "step": 1970 }, { "epoch": 0.08868980963045912, "grad_norm": 4.692316055297852, "learning_rate": 5.090138062869755e-06, "loss": 1.1421, "step": 1980 }, { "epoch": 0.08913773796192609, "grad_norm": 3.5623536109924316, "learning_rate": 5.067655590305036e-06, "loss": 1.1203, "step": 1990 }, { "epoch": 0.08958566629339305, "grad_norm": 6.875621318817139, "learning_rate": 5.045118954079904e-06, "loss": 1.1348, "step": 2000 }, { "epoch": 0.09003359462486002, "grad_norm": 5.2604756355285645, "learning_rate": 5.022529080587205e-06, "loss": 1.0326, "step": 2010 }, { "epoch": 0.09048152295632698, "grad_norm": 5.012307643890381, "learning_rate": 4.999886898408157e-06, "loss": 1.12, "step": 2020 }, { "epoch": 0.09092945128779395, "grad_norm": 5.246688365936279, "learning_rate": 4.977193338274189e-06, "loss": 1.1164, "step": 2030 }, { "epoch": 0.09137737961926092, "grad_norm": 3.9779398441314697, "learning_rate": 4.954449333028672e-06, "loss": 1.0607, "step": 2040 }, { "epoch": 0.09182530795072788, "grad_norm": 5.392056465148926, "learning_rate": 4.931655817588579e-06, "loss": 1.1102, "step": 2050 }, { "epoch": 0.09227323628219485, "grad_norm": 5.144470691680908, "learning_rate": 4.9088137289060535e-06, "loss": 1.0649, "step": 2060 }, { "epoch": 0.09272116461366181, "grad_norm": 3.7060792446136475, "learning_rate": 4.885924005929896e-06, "loss": 1.0718, "step": 2070 }, { "epoch": 0.09316909294512878, "grad_norm": 3.357794761657715, "learning_rate": 4.862987589566965e-06, "loss": 1.1003, "step": 2080 }, { "epoch": 0.09361702127659574, "grad_norm": 5.704718589782715, "learning_rate": 4.840005422643503e-06, "loss": 1.2042, "step": 2090 }, { "epoch": 0.0940649496080627, "grad_norm": 5.481514930725098, "learning_rate": 4.816978449866372e-06, "loss": 1.0777, "step": 2100 }, { "epoch": 0.0940649496080627, "eval_loss": 1.2093305587768555, "eval_runtime": 51.7975, "eval_samples_per_second": 9.653, "eval_steps_per_second": 9.653, "step": 2100 }, { "epoch": 0.09451287793952967, "grad_norm": 5.508385181427002, "learning_rate": 4.793907617784238e-06, "loss": 1.5375, "step": 2110 }, { "epoch": 0.09496080627099664, "grad_norm": 4.192409515380859, "learning_rate": 4.770793874748642e-06, "loss": 0.9964, "step": 2120 }, { "epoch": 0.0954087346024636, "grad_norm": 4.068387508392334, "learning_rate": 4.747638170875032e-06, "loss": 0.9244, "step": 2130 }, { "epoch": 0.09585666293393057, "grad_norm": 2.513946771621704, "learning_rate": 4.724441458003699e-06, "loss": 1.1329, "step": 2140 }, { "epoch": 0.09630459126539753, "grad_norm": 4.470638275146484, "learning_rate": 4.701204689660653e-06, "loss": 1.0299, "step": 2150 }, { "epoch": 0.0967525195968645, "grad_norm": 5.644805908203125, "learning_rate": 4.67792882101843e-06, "loss": 1.2654, "step": 2160 }, { "epoch": 0.09720044792833146, "grad_norm": 5.1912736892700195, "learning_rate": 4.654614808856823e-06, "loss": 1.2265, "step": 2170 }, { "epoch": 0.09764837625979843, "grad_norm": 11.092533111572266, "learning_rate": 4.631263611523557e-06, "loss": 1.2182, "step": 2180 }, { "epoch": 0.09809630459126539, "grad_norm": 4.138496398925781, "learning_rate": 4.607876188894896e-06, "loss": 1.2283, "step": 2190 }, { "epoch": 0.09854423292273236, "grad_norm": 5.229914665222168, "learning_rate": 4.58445350233618e-06, "loss": 1.1319, "step": 2200 }, { "epoch": 0.09899216125419932, "grad_norm": 4.059961318969727, "learning_rate": 4.560996514662314e-06, "loss": 1.0411, "step": 2210 }, { "epoch": 0.09944008958566629, "grad_norm": 4.80086088180542, "learning_rate": 4.5375061900981855e-06, "loss": 1.23, "step": 2220 }, { "epoch": 0.09988801791713325, "grad_norm": 5.166756629943848, "learning_rate": 4.513983494239034e-06, "loss": 1.219, "step": 2230 }, { "epoch": 0.10033594624860022, "grad_norm": 5.53660249710083, "learning_rate": 4.490429394010752e-06, "loss": 1.1245, "step": 2240 }, { "epoch": 0.10078387458006718, "grad_norm": 2.9756040573120117, "learning_rate": 4.466844857630147e-06, "loss": 1.1395, "step": 2250 }, { "epoch": 0.10078387458006718, "eval_loss": 1.2089135646820068, "eval_runtime": 51.6342, "eval_samples_per_second": 9.684, "eval_steps_per_second": 9.684, "step": 2250 }, { "epoch": 0.10123180291153415, "grad_norm": 3.644266128540039, "learning_rate": 4.443230854565133e-06, "loss": 1.0985, "step": 2260 }, { "epoch": 0.10167973124300111, "grad_norm": 4.662050724029541, "learning_rate": 4.4195883554948885e-06, "loss": 1.3397, "step": 2270 }, { "epoch": 0.10212765957446808, "grad_norm": 5.3237385749816895, "learning_rate": 4.3959183322699466e-06, "loss": 1.1351, "step": 2280 }, { "epoch": 0.10257558790593505, "grad_norm": 4.3604207038879395, "learning_rate": 4.372221757872255e-06, "loss": 1.1208, "step": 2290 }, { "epoch": 0.10302351623740201, "grad_norm": 3.731410264968872, "learning_rate": 4.3484996063751725e-06, "loss": 1.1584, "step": 2300 }, { "epoch": 0.10347144456886898, "grad_norm": 4.031397342681885, "learning_rate": 4.324752852903435e-06, "loss": 0.9656, "step": 2310 }, { "epoch": 0.10391937290033594, "grad_norm": 3.564148187637329, "learning_rate": 4.300982473593068e-06, "loss": 1.0031, "step": 2320 }, { "epoch": 0.1043673012318029, "grad_norm": 5.459331035614014, "learning_rate": 4.277189445551261e-06, "loss": 1.0037, "step": 2330 }, { "epoch": 0.10481522956326987, "grad_norm": 4.870905876159668, "learning_rate": 4.253374746816209e-06, "loss": 0.9615, "step": 2340 }, { "epoch": 0.10526315789473684, "grad_norm": 5.284097671508789, "learning_rate": 4.229539356316898e-06, "loss": 1.3278, "step": 2350 }, { "epoch": 0.1057110862262038, "grad_norm": 5.323864459991455, "learning_rate": 4.205684253832877e-06, "loss": 1.1903, "step": 2360 }, { "epoch": 0.10615901455767077, "grad_norm": 7.844208717346191, "learning_rate": 4.1818104199539735e-06, "loss": 1.056, "step": 2370 }, { "epoch": 0.10660694288913773, "grad_norm": 4.325316905975342, "learning_rate": 4.1579188360399916e-06, "loss": 1.2431, "step": 2380 }, { "epoch": 0.1070548712206047, "grad_norm": 3.5362424850463867, "learning_rate": 4.134010484180368e-06, "loss": 1.1804, "step": 2390 }, { "epoch": 0.10750279955207166, "grad_norm": 3.2404041290283203, "learning_rate": 4.110086347153807e-06, "loss": 1.1556, "step": 2400 }, { "epoch": 0.10750279955207166, "eval_loss": 1.2038679122924805, "eval_runtime": 51.7303, "eval_samples_per_second": 9.666, "eval_steps_per_second": 9.666, "step": 2400 }, { "epoch": 0.10795072788353863, "grad_norm": 3.8270246982574463, "learning_rate": 4.0861474083878765e-06, "loss": 1.0918, "step": 2410 }, { "epoch": 0.10839865621500559, "grad_norm": 5.627485752105713, "learning_rate": 4.062194651918585e-06, "loss": 1.257, "step": 2420 }, { "epoch": 0.10884658454647256, "grad_norm": 4.910660743713379, "learning_rate": 4.0382290623499384e-06, "loss": 1.2748, "step": 2430 }, { "epoch": 0.10929451287793952, "grad_norm": 2.3609941005706787, "learning_rate": 4.014251624813453e-06, "loss": 0.9422, "step": 2440 }, { "epoch": 0.10974244120940649, "grad_norm": 3.063828706741333, "learning_rate": 3.990263324927675e-06, "loss": 1.1829, "step": 2450 }, { "epoch": 0.11019036954087345, "grad_norm": 2.658452033996582, "learning_rate": 3.966265148757655e-06, "loss": 1.0062, "step": 2460 }, { "epoch": 0.11063829787234042, "grad_norm": 6.130062103271484, "learning_rate": 3.9422580827744224e-06, "loss": 1.1504, "step": 2470 }, { "epoch": 0.11108622620380738, "grad_norm": 3.3496034145355225, "learning_rate": 3.9182431138144315e-06, "loss": 0.8731, "step": 2480 }, { "epoch": 0.11153415453527436, "grad_norm": 3.8455569744110107, "learning_rate": 3.894221229038995e-06, "loss": 1.0125, "step": 2490 }, { "epoch": 0.11198208286674133, "grad_norm": 4.499962329864502, "learning_rate": 3.870193415893709e-06, "loss": 1.0228, "step": 2500 }, { "epoch": 0.1124300111982083, "grad_norm": 6.230105876922607, "learning_rate": 3.846160662067859e-06, "loss": 1.1794, "step": 2510 }, { "epoch": 0.11287793952967526, "grad_norm": 7.316727638244629, "learning_rate": 3.8221239554538275e-06, "loss": 1.2728, "step": 2520 }, { "epoch": 0.11332586786114222, "grad_norm": 3.291714906692505, "learning_rate": 3.798084284106478e-06, "loss": 1.167, "step": 2530 }, { "epoch": 0.11377379619260919, "grad_norm": 5.075141429901123, "learning_rate": 3.7740426362025424e-06, "loss": 1.0547, "step": 2540 }, { "epoch": 0.11422172452407615, "grad_norm": 3.961540937423706, "learning_rate": 3.7500000000000005e-06, "loss": 1.0713, "step": 2550 }, { "epoch": 0.11422172452407615, "eval_loss": 1.2046430110931396, "eval_runtime": 51.7175, "eval_samples_per_second": 9.668, "eval_steps_per_second": 9.668, "step": 2550 }, { "epoch": 0.11466965285554312, "grad_norm": 6.124125003814697, "learning_rate": 3.7259573637974587e-06, "loss": 1.0568, "step": 2560 }, { "epoch": 0.11511758118701008, "grad_norm": 4.3748602867126465, "learning_rate": 3.701915715893523e-06, "loss": 1.4124, "step": 2570 }, { "epoch": 0.11556550951847705, "grad_norm": 7.382061004638672, "learning_rate": 3.677876044546174e-06, "loss": 1.1357, "step": 2580 }, { "epoch": 0.11601343784994401, "grad_norm": 4.097735404968262, "learning_rate": 3.6538393379321427e-06, "loss": 1.0885, "step": 2590 }, { "epoch": 0.11646136618141098, "grad_norm": 5.039736270904541, "learning_rate": 3.6298065841062934e-06, "loss": 1.107, "step": 2600 }, { "epoch": 0.11690929451287795, "grad_norm": 4.383152008056641, "learning_rate": 3.6057787709610064e-06, "loss": 1.1695, "step": 2610 }, { "epoch": 0.11735722284434491, "grad_norm": 4.900496482849121, "learning_rate": 3.5817568861855708e-06, "loss": 1.1107, "step": 2620 }, { "epoch": 0.11780515117581188, "grad_norm": 6.267992973327637, "learning_rate": 3.557741917225579e-06, "loss": 1.1896, "step": 2630 }, { "epoch": 0.11825307950727884, "grad_norm": 3.8060693740844727, "learning_rate": 3.5337348512423468e-06, "loss": 1.2245, "step": 2640 }, { "epoch": 0.1187010078387458, "grad_norm": 3.5068161487579346, "learning_rate": 3.5097366750723275e-06, "loss": 1.0629, "step": 2650 }, { "epoch": 0.11914893617021277, "grad_norm": 4.6765360832214355, "learning_rate": 3.4857483751865478e-06, "loss": 1.1783, "step": 2660 }, { "epoch": 0.11959686450167974, "grad_norm": 7.864380836486816, "learning_rate": 3.461770937650064e-06, "loss": 1.0683, "step": 2670 }, { "epoch": 0.1200447928331467, "grad_norm": 3.138843297958374, "learning_rate": 3.437805348081416e-06, "loss": 0.9814, "step": 2680 }, { "epoch": 0.12049272116461367, "grad_norm": 5.134324550628662, "learning_rate": 3.413852591612125e-06, "loss": 1.1631, "step": 2690 }, { "epoch": 0.12094064949608063, "grad_norm": 4.688596725463867, "learning_rate": 3.389913652846194e-06, "loss": 1.0644, "step": 2700 }, { "epoch": 0.12094064949608063, "eval_loss": 1.2033374309539795, "eval_runtime": 51.6099, "eval_samples_per_second": 9.688, "eval_steps_per_second": 9.688, "step": 2700 }, { "epoch": 0.1213885778275476, "grad_norm": 4.218849182128906, "learning_rate": 3.365989515819633e-06, "loss": 1.1395, "step": 2710 }, { "epoch": 0.12183650615901456, "grad_norm": 5.043267726898193, "learning_rate": 3.34208116396001e-06, "loss": 1.2327, "step": 2720 }, { "epoch": 0.12228443449048153, "grad_norm": 7.991638660430908, "learning_rate": 3.318189580046028e-06, "loss": 1.0106, "step": 2730 }, { "epoch": 0.12273236282194849, "grad_norm": 4.103755474090576, "learning_rate": 3.294315746167124e-06, "loss": 0.9751, "step": 2740 }, { "epoch": 0.12318029115341546, "grad_norm": 4.224274635314941, "learning_rate": 3.2704606436831023e-06, "loss": 1.1427, "step": 2750 }, { "epoch": 0.12362821948488242, "grad_norm": 5.190283298492432, "learning_rate": 3.2466252531837934e-06, "loss": 1.1758, "step": 2760 }, { "epoch": 0.12407614781634939, "grad_norm": 6.470210075378418, "learning_rate": 3.2228105544487405e-06, "loss": 1.2584, "step": 2770 }, { "epoch": 0.12452407614781635, "grad_norm": 4.470674514770508, "learning_rate": 3.1990175264069333e-06, "loss": 1.0279, "step": 2780 }, { "epoch": 0.12497200447928332, "grad_norm": 4.63865327835083, "learning_rate": 3.1752471470965653e-06, "loss": 1.2431, "step": 2790 }, { "epoch": 0.12541993281075028, "grad_norm": 5.2822089195251465, "learning_rate": 3.151500393624829e-06, "loss": 1.0206, "step": 2800 }, { "epoch": 0.12586786114221724, "grad_norm": 3.3929495811462402, "learning_rate": 3.127778242127747e-06, "loss": 0.9654, "step": 2810 }, { "epoch": 0.12631578947368421, "grad_norm": 3.526858329772949, "learning_rate": 3.104081667730055e-06, "loss": 1.0832, "step": 2820 }, { "epoch": 0.12676371780515117, "grad_norm": 5.531039714813232, "learning_rate": 3.0804116445051133e-06, "loss": 1.1649, "step": 2830 }, { "epoch": 0.12721164613661815, "grad_norm": 5.811004161834717, "learning_rate": 3.0567691454348674e-06, "loss": 1.095, "step": 2840 }, { "epoch": 0.1276595744680851, "grad_norm": 4.319146633148193, "learning_rate": 3.033155142369855e-06, "loss": 0.9761, "step": 2850 }, { "epoch": 0.1276595744680851, "eval_loss": 1.2028086185455322, "eval_runtime": 51.6383, "eval_samples_per_second": 9.683, "eval_steps_per_second": 9.683, "step": 2850 }, { "epoch": 0.12810750279955208, "grad_norm": 5.54340124130249, "learning_rate": 3.009570605989249e-06, "loss": 0.999, "step": 2860 }, { "epoch": 0.12855543113101903, "grad_norm": 3.859863758087158, "learning_rate": 2.986016505760967e-06, "loss": 1.025, "step": 2870 }, { "epoch": 0.129003359462486, "grad_norm": 5.119099140167236, "learning_rate": 2.962493809901815e-06, "loss": 1.3963, "step": 2880 }, { "epoch": 0.12945128779395296, "grad_norm": 5.8379130363464355, "learning_rate": 2.9390034853376875e-06, "loss": 1.0822, "step": 2890 }, { "epoch": 0.12989921612541994, "grad_norm": 3.261016845703125, "learning_rate": 2.9155464976638217e-06, "loss": 1.0526, "step": 2900 }, { "epoch": 0.1303471444568869, "grad_norm": 3.678527355194092, "learning_rate": 2.8921238111051057e-06, "loss": 1.1167, "step": 2910 }, { "epoch": 0.13079507278835387, "grad_norm": 4.787365436553955, "learning_rate": 2.8687363884764434e-06, "loss": 1.0829, "step": 2920 }, { "epoch": 0.13124300111982082, "grad_norm": 3.475607395172119, "learning_rate": 2.8453851911431783e-06, "loss": 1.0801, "step": 2930 }, { "epoch": 0.1316909294512878, "grad_norm": 6.456125736236572, "learning_rate": 2.822071178981572e-06, "loss": 1.1287, "step": 2940 }, { "epoch": 0.13213885778275475, "grad_norm": 3.778585910797119, "learning_rate": 2.7987953103393484e-06, "loss": 1.1359, "step": 2950 }, { "epoch": 0.13258678611422173, "grad_norm": 3.37793231010437, "learning_rate": 2.7755585419963026e-06, "loss": 1.0584, "step": 2960 }, { "epoch": 0.13303471444568868, "grad_norm": 5.2485575675964355, "learning_rate": 2.7523618291249687e-06, "loss": 1.2037, "step": 2970 }, { "epoch": 0.13348264277715566, "grad_norm": 4.524936676025391, "learning_rate": 2.729206125251359e-06, "loss": 0.9778, "step": 2980 }, { "epoch": 0.1339305711086226, "grad_norm": 5.820756912231445, "learning_rate": 2.7060923822157638e-06, "loss": 1.0351, "step": 2990 }, { "epoch": 0.1343784994400896, "grad_norm": 5.031400680541992, "learning_rate": 2.6830215501336288e-06, "loss": 1.1926, "step": 3000 }, { "epoch": 0.1343784994400896, "eval_loss": 1.199351191520691, "eval_runtime": 51.5688, "eval_samples_per_second": 9.696, "eval_steps_per_second": 9.696, "step": 3000 }, { "epoch": 0.13482642777155654, "grad_norm": 4.307104587554932, "learning_rate": 2.6599945773564997e-06, "loss": 1.1743, "step": 3010 }, { "epoch": 0.13527435610302352, "grad_norm": 4.9457221031188965, "learning_rate": 2.6370124104330357e-06, "loss": 1.1287, "step": 3020 }, { "epoch": 0.13572228443449047, "grad_norm": 3.17401385307312, "learning_rate": 2.614075994070105e-06, "loss": 1.1686, "step": 3030 }, { "epoch": 0.13617021276595745, "grad_norm": 6.098177433013916, "learning_rate": 2.591186271093948e-06, "loss": 1.1546, "step": 3040 }, { "epoch": 0.1366181410974244, "grad_norm": 4.12905216217041, "learning_rate": 2.568344182411423e-06, "loss": 1.0909, "step": 3050 }, { "epoch": 0.13706606942889138, "grad_norm": 4.946627616882324, "learning_rate": 2.5455506669713293e-06, "loss": 1.2223, "step": 3060 }, { "epoch": 0.13751399776035833, "grad_norm": 4.25789737701416, "learning_rate": 2.522806661725812e-06, "loss": 1.0383, "step": 3070 }, { "epoch": 0.1379619260918253, "grad_norm": 6.536715030670166, "learning_rate": 2.5001131015918444e-06, "loss": 0.9992, "step": 3080 }, { "epoch": 0.13840985442329226, "grad_norm": 5.861030578613281, "learning_rate": 2.4774709194127973e-06, "loss": 1.1678, "step": 3090 }, { "epoch": 0.13885778275475924, "grad_norm": 4.58046293258667, "learning_rate": 2.4548810459200973e-06, "loss": 1.2545, "step": 3100 }, { "epoch": 0.1393057110862262, "grad_norm": 6.048022270202637, "learning_rate": 2.4323444096949647e-06, "loss": 1.0531, "step": 3110 }, { "epoch": 0.13975363941769317, "grad_norm": 5.86400842666626, "learning_rate": 2.409861937130248e-06, "loss": 1.1093, "step": 3120 }, { "epoch": 0.14020156774916012, "grad_norm": 3.7916102409362793, "learning_rate": 2.3874345523923327e-06, "loss": 1.1048, "step": 3130 }, { "epoch": 0.1406494960806271, "grad_norm": 4.009166717529297, "learning_rate": 2.3650631773831644e-06, "loss": 1.0198, "step": 3140 }, { "epoch": 0.14109742441209405, "grad_norm": 4.695572853088379, "learning_rate": 2.3427487317023477e-06, "loss": 1.1909, "step": 3150 }, { "epoch": 0.14109742441209405, "eval_loss": 1.1985480785369873, "eval_runtime": 51.6619, "eval_samples_per_second": 9.678, "eval_steps_per_second": 9.678, "step": 3150 }, { "epoch": 0.14154535274356103, "grad_norm": 5.317529201507568, "learning_rate": 2.320492132609344e-06, "loss": 1.084, "step": 3160 }, { "epoch": 0.14199328107502798, "grad_norm": 3.3507909774780273, "learning_rate": 2.2982942949857705e-06, "loss": 1.0169, "step": 3170 }, { "epoch": 0.14244120940649496, "grad_norm": 5.125346660614014, "learning_rate": 2.276156131297787e-06, "loss": 1.0202, "step": 3180 }, { "epoch": 0.1428891377379619, "grad_norm": 6.09945821762085, "learning_rate": 2.254078551558594e-06, "loss": 1.1235, "step": 3190 }, { "epoch": 0.1433370660694289, "grad_norm": 6.263647079467773, "learning_rate": 2.2320624632910232e-06, "loss": 1.1284, "step": 3200 }, { "epoch": 0.14378499440089584, "grad_norm": 6.879512310028076, "learning_rate": 2.210108771490233e-06, "loss": 1.0602, "step": 3210 }, { "epoch": 0.14423292273236282, "grad_norm": 3.726658582687378, "learning_rate": 2.1882183785865047e-06, "loss": 1.1038, "step": 3220 }, { "epoch": 0.14468085106382977, "grad_norm": 5.486456394195557, "learning_rate": 2.166392184408152e-06, "loss": 1.1794, "step": 3230 }, { "epoch": 0.14512877939529675, "grad_norm": 4.750957012176514, "learning_rate": 2.1446310861445306e-06, "loss": 0.9833, "step": 3240 }, { "epoch": 0.1455767077267637, "grad_norm": 3.6656692028045654, "learning_rate": 2.1229359783091576e-06, "loss": 1.0272, "step": 3250 }, { "epoch": 0.14602463605823068, "grad_norm": 3.691014528274536, "learning_rate": 2.1013077527029428e-06, "loss": 1.0861, "step": 3260 }, { "epoch": 0.14647256438969763, "grad_norm": 5.651008605957031, "learning_rate": 2.079747298377528e-06, "loss": 1.096, "step": 3270 }, { "epoch": 0.14692049272116461, "grad_norm": 4.2657318115234375, "learning_rate": 2.058255501598745e-06, "loss": 1.0871, "step": 3280 }, { "epoch": 0.14736842105263157, "grad_norm": 3.884568452835083, "learning_rate": 2.0368332458101814e-06, "loss": 1.0087, "step": 3290 }, { "epoch": 0.14781634938409854, "grad_norm": 3.191197395324707, "learning_rate": 2.015481411596869e-06, "loss": 1.1387, "step": 3300 }, { "epoch": 0.14781634938409854, "eval_loss": 1.1979233026504517, "eval_runtime": 51.7549, "eval_samples_per_second": 9.661, "eval_steps_per_second": 9.661, "step": 3300 }, { "epoch": 0.14826427771556552, "grad_norm": 6.709813594818115, "learning_rate": 1.9942008766490793e-06, "loss": 1.0685, "step": 3310 }, { "epoch": 0.14871220604703247, "grad_norm": 3.687634229660034, "learning_rate": 1.9729925157262554e-06, "loss": 1.1542, "step": 3320 }, { "epoch": 0.14916013437849945, "grad_norm": 3.637235403060913, "learning_rate": 1.9518572006210484e-06, "loss": 1.1365, "step": 3330 }, { "epoch": 0.1496080627099664, "grad_norm": 3.113184690475464, "learning_rate": 1.9307958001234794e-06, "loss": 1.0218, "step": 3340 }, { "epoch": 0.15005599104143338, "grad_norm": 4.447634220123291, "learning_rate": 1.9098091799852347e-06, "loss": 1.222, "step": 3350 }, { "epoch": 0.15050391937290034, "grad_norm": 3.8236501216888428, "learning_rate": 1.8888982028840636e-06, "loss": 1.2012, "step": 3360 }, { "epoch": 0.15095184770436731, "grad_norm": 5.108892440795898, "learning_rate": 1.8680637283883355e-06, "loss": 1.0181, "step": 3370 }, { "epoch": 0.15139977603583427, "grad_norm": 3.81886887550354, "learning_rate": 1.8473066129216927e-06, "loss": 1.125, "step": 3380 }, { "epoch": 0.15184770436730124, "grad_norm": 4.7799835205078125, "learning_rate": 1.8266277097278527e-06, "loss": 1.1038, "step": 3390 }, { "epoch": 0.1522956326987682, "grad_norm": 6.478558540344238, "learning_rate": 1.8060278688355313e-06, "loss": 0.9218, "step": 3400 }, { "epoch": 0.15274356103023518, "grad_norm": 4.482583522796631, "learning_rate": 1.7855079370235043e-06, "loss": 1.0629, "step": 3410 }, { "epoch": 0.15319148936170213, "grad_norm": 2.6053950786590576, "learning_rate": 1.7650687577857972e-06, "loss": 1.1975, "step": 3420 }, { "epoch": 0.1536394176931691, "grad_norm": 4.930041313171387, "learning_rate": 1.7447111712970138e-06, "loss": 1.0566, "step": 3430 }, { "epoch": 0.15408734602463606, "grad_norm": 4.492660045623779, "learning_rate": 1.7244360143778004e-06, "loss": 1.1441, "step": 3440 }, { "epoch": 0.15453527435610304, "grad_norm": 4.847555637359619, "learning_rate": 1.704244120460443e-06, "loss": 1.231, "step": 3450 }, { "epoch": 0.15453527435610304, "eval_loss": 1.198148488998413, "eval_runtime": 51.6757, "eval_samples_per_second": 9.676, "eval_steps_per_second": 9.676, "step": 3450 }, { "epoch": 0.15498320268757, "grad_norm": 5.320653438568115, "learning_rate": 1.6841363195546162e-06, "loss": 0.996, "step": 3460 }, { "epoch": 0.15543113101903697, "grad_norm": 4.333999156951904, "learning_rate": 1.6641134382132576e-06, "loss": 1.2536, "step": 3470 }, { "epoch": 0.15587905935050392, "grad_norm": 6.867399215698242, "learning_rate": 1.6441762994985947e-06, "loss": 1.1461, "step": 3480 }, { "epoch": 0.1563269876819709, "grad_norm": 3.2110917568206787, "learning_rate": 1.6243257229483141e-06, "loss": 1.1086, "step": 3490 }, { "epoch": 0.15677491601343785, "grad_norm": 3.345970630645752, "learning_rate": 1.6045625245418648e-06, "loss": 0.9485, "step": 3500 }, { "epoch": 0.15722284434490483, "grad_norm": 4.890392780303955, "learning_rate": 1.584887516666928e-06, "loss": 1.0968, "step": 3510 }, { "epoch": 0.15767077267637178, "grad_norm": 5.448171615600586, "learning_rate": 1.565301508086015e-06, "loss": 1.1305, "step": 3520 }, { "epoch": 0.15811870100783876, "grad_norm": 7.16267728805542, "learning_rate": 1.5458053039032263e-06, "loss": 1.2279, "step": 3530 }, { "epoch": 0.1585666293393057, "grad_norm": 5.2700018882751465, "learning_rate": 1.5263997055311536e-06, "loss": 1.0474, "step": 3540 }, { "epoch": 0.1590145576707727, "grad_norm": 5.955024719238281, "learning_rate": 1.5070855106579404e-06, "loss": 1.1283, "step": 3550 }, { "epoch": 0.15946248600223964, "grad_norm": 2.882784366607666, "learning_rate": 1.4878635132144885e-06, "loss": 0.9112, "step": 3560 }, { "epoch": 0.15991041433370662, "grad_norm": 4.2263875007629395, "learning_rate": 1.4687345033418258e-06, "loss": 1.1554, "step": 3570 }, { "epoch": 0.16035834266517357, "grad_norm": 4.622799396514893, "learning_rate": 1.4496992673586262e-06, "loss": 1.3423, "step": 3580 }, { "epoch": 0.16080627099664055, "grad_norm": 5.2950897216796875, "learning_rate": 1.4307585877288822e-06, "loss": 1.0494, "step": 3590 }, { "epoch": 0.1612541993281075, "grad_norm": 5.289889335632324, "learning_rate": 1.4119132430297496e-06, "loss": 1.1448, "step": 3600 }, { "epoch": 0.1612541993281075, "eval_loss": 1.1965739727020264, "eval_runtime": 51.7182, "eval_samples_per_second": 9.668, "eval_steps_per_second": 9.668, "step": 3600 }, { "epoch": 0.16170212765957448, "grad_norm": 6.415092468261719, "learning_rate": 1.3931640079195365e-06, "loss": 1.0204, "step": 3610 }, { "epoch": 0.16215005599104143, "grad_norm": 3.348160743713379, "learning_rate": 1.3745116531058645e-06, "loss": 1.1308, "step": 3620 }, { "epoch": 0.1625979843225084, "grad_norm": 6.698293209075928, "learning_rate": 1.3559569453139797e-06, "loss": 0.9401, "step": 3630 }, { "epoch": 0.16304591265397536, "grad_norm": 3.5045154094696045, "learning_rate": 1.3375006472552483e-06, "loss": 1.152, "step": 3640 }, { "epoch": 0.16349384098544234, "grad_norm": 4.656421661376953, "learning_rate": 1.3191435175957945e-06, "loss": 1.1775, "step": 3650 }, { "epoch": 0.1639417693169093, "grad_norm": 8.8998384475708, "learning_rate": 1.3008863109253174e-06, "loss": 1.0061, "step": 3660 }, { "epoch": 0.16438969764837627, "grad_norm": 3.5046370029449463, "learning_rate": 1.282729777726078e-06, "loss": 1.1871, "step": 3670 }, { "epoch": 0.16483762597984322, "grad_norm": 4.024252891540527, "learning_rate": 1.2646746643420392e-06, "loss": 1.2593, "step": 3680 }, { "epoch": 0.1652855543113102, "grad_norm": 4.861652851104736, "learning_rate": 1.2467217129481952e-06, "loss": 1.1068, "step": 3690 }, { "epoch": 0.16573348264277715, "grad_norm": 6.007284641265869, "learning_rate": 1.2288716615200617e-06, "loss": 1.0237, "step": 3700 }, { "epoch": 0.16618141097424413, "grad_norm": 4.506286144256592, "learning_rate": 1.2111252438033404e-06, "loss": 1.0827, "step": 3710 }, { "epoch": 0.16662933930571108, "grad_norm": 7.5774102210998535, "learning_rate": 1.1934831892837524e-06, "loss": 1.2481, "step": 3720 }, { "epoch": 0.16707726763717806, "grad_norm": 4.199349880218506, "learning_rate": 1.1759462231570618e-06, "loss": 1.1948, "step": 3730 }, { "epoch": 0.167525195968645, "grad_norm": 3.675760269165039, "learning_rate": 1.1585150662992578e-06, "loss": 0.8945, "step": 3740 }, { "epoch": 0.167973124300112, "grad_norm": 4.647981643676758, "learning_rate": 1.1411904352369262e-06, "loss": 1.0746, "step": 3750 }, { "epoch": 0.167973124300112, "eval_loss": 1.1958056688308716, "eval_runtime": 51.7591, "eval_samples_per_second": 9.66, "eval_steps_per_second": 9.66, "step": 3750 }, { "epoch": 0.16842105263157894, "grad_norm": 2.354313611984253, "learning_rate": 1.1239730421177952e-06, "loss": 1.0362, "step": 3760 }, { "epoch": 0.16886898096304592, "grad_norm": 4.00113582611084, "learning_rate": 1.1068635946814569e-06, "loss": 1.0924, "step": 3770 }, { "epoch": 0.16931690929451287, "grad_norm": 3.765235185623169, "learning_rate": 1.0898627962302831e-06, "loss": 1.3452, "step": 3780 }, { "epoch": 0.16976483762597985, "grad_norm": 3.814605236053467, "learning_rate": 1.072971345600513e-06, "loss": 1.0048, "step": 3790 }, { "epoch": 0.1702127659574468, "grad_norm": 3.447803020477295, "learning_rate": 1.056189937133522e-06, "loss": 1.149, "step": 3800 }, { "epoch": 0.17066069428891378, "grad_norm": 7.1337714195251465, "learning_rate": 1.0395192606472822e-06, "loss": 1.1497, "step": 3810 }, { "epoch": 0.17110862262038073, "grad_norm": 5.239931583404541, "learning_rate": 1.0229600014080101e-06, "loss": 0.9874, "step": 3820 }, { "epoch": 0.1715565509518477, "grad_norm": 3.4100687503814697, "learning_rate": 1.006512840101995e-06, "loss": 1.0393, "step": 3830 }, { "epoch": 0.17200447928331467, "grad_norm": 4.527777671813965, "learning_rate": 9.90178452807619e-07, "loss": 0.968, "step": 3840 }, { "epoch": 0.17245240761478164, "grad_norm": 3.7964625358581543, "learning_rate": 9.739575109675674e-07, "loss": 1.1207, "step": 3850 }, { "epoch": 0.1729003359462486, "grad_norm": 4.329505920410156, "learning_rate": 9.578506813612243e-07, "loss": 1.0924, "step": 3860 }, { "epoch": 0.17334826427771557, "grad_norm": 3.9827823638916016, "learning_rate": 9.418586260772695e-07, "loss": 1.0937, "step": 3870 }, { "epoch": 0.17379619260918253, "grad_norm": 4.150352954864502, "learning_rate": 9.259820024864594e-07, "loss": 1.2071, "step": 3880 }, { "epoch": 0.1742441209406495, "grad_norm": 2.648918867111206, "learning_rate": 9.102214632146059e-07, "loss": 1.1754, "step": 3890 }, { "epoch": 0.17469204927211646, "grad_norm": 5.348718166351318, "learning_rate": 8.94577656115746e-07, "loss": 1.1031, "step": 3900 }, { "epoch": 0.17469204927211646, "eval_loss": 1.1968835592269897, "eval_runtime": 51.6518, "eval_samples_per_second": 9.68, "eval_steps_per_second": 9.68, "step": 3900 }, { "epoch": 0.17513997760358344, "grad_norm": 6.799318313598633, "learning_rate": 8.790512242455198e-07, "loss": 1.1188, "step": 3910 }, { "epoch": 0.1755879059350504, "grad_norm": 4.05487060546875, "learning_rate": 8.636428058347274e-07, "loss": 1.3045, "step": 3920 }, { "epoch": 0.17603583426651737, "grad_norm": 4.513579845428467, "learning_rate": 8.483530342630993e-07, "loss": 1.2577, "step": 3930 }, { "epoch": 0.17648376259798432, "grad_norm": 7.971194267272949, "learning_rate": 8.331825380332599e-07, "loss": 1.1376, "step": 3940 }, { "epoch": 0.1769316909294513, "grad_norm": 3.740802764892578, "learning_rate": 8.181319407448884e-07, "loss": 1.1413, "step": 3950 }, { "epoch": 0.17737961926091825, "grad_norm": 3.431658983230591, "learning_rate": 8.032018610690914e-07, "loss": 1.0802, "step": 3960 }, { "epoch": 0.17782754759238523, "grad_norm": 3.8207449913024902, "learning_rate": 7.883929127229665e-07, "loss": 1.173, "step": 3970 }, { "epoch": 0.17827547592385218, "grad_norm": 3.088942289352417, "learning_rate": 7.737057044443793e-07, "loss": 1.1144, "step": 3980 }, { "epoch": 0.17872340425531916, "grad_norm": 3.705589532852173, "learning_rate": 7.591408399669337e-07, "loss": 1.2676, "step": 3990 }, { "epoch": 0.1791713325867861, "grad_norm": 4.925235271453857, "learning_rate": 7.446989179951632e-07, "loss": 1.0197, "step": 4000 }, { "epoch": 0.1796192609182531, "grad_norm": 4.373708248138428, "learning_rate": 7.303805321799146e-07, "loss": 1.0041, "step": 4010 }, { "epoch": 0.18006718924972004, "grad_norm": 4.23321008682251, "learning_rate": 7.161862710939476e-07, "loss": 1.0504, "step": 4020 }, { "epoch": 0.18051511758118702, "grad_norm": 6.634941101074219, "learning_rate": 7.021167182077403e-07, "loss": 1.062, "step": 4030 }, { "epoch": 0.18096304591265397, "grad_norm": 12.015007972717285, "learning_rate": 6.881724518655049e-07, "loss": 1.3095, "step": 4040 }, { "epoch": 0.18141097424412095, "grad_norm": 5.376244068145752, "learning_rate": 6.743540452614152e-07, "loss": 1.0552, "step": 4050 }, { "epoch": 0.18141097424412095, "eval_loss": 1.1952238082885742, "eval_runtime": 51.6946, "eval_samples_per_second": 9.672, "eval_steps_per_second": 9.672, "step": 4050 }, { "epoch": 0.1818589025755879, "grad_norm": 5.1148858070373535, "learning_rate": 6.606620664160438e-07, "loss": 1.0796, "step": 4060 }, { "epoch": 0.18230683090705488, "grad_norm": 3.497487783432007, "learning_rate": 6.470970781530139e-07, "loss": 1.0996, "step": 4070 }, { "epoch": 0.18275475923852183, "grad_norm": 4.02069616317749, "learning_rate": 6.336596380758604e-07, "loss": 1.18, "step": 4080 }, { "epoch": 0.1832026875699888, "grad_norm": 4.936882495880127, "learning_rate": 6.203502985451152e-07, "loss": 1.1434, "step": 4090 }, { "epoch": 0.18365061590145576, "grad_norm": 3.6114046573638916, "learning_rate": 6.071696066555978e-07, "loss": 1.1957, "step": 4100 }, { "epoch": 0.18409854423292274, "grad_norm": 3.0989315509796143, "learning_rate": 5.941181042139258e-07, "loss": 1.1672, "step": 4110 }, { "epoch": 0.1845464725643897, "grad_norm": 3.9395434856414795, "learning_rate": 5.811963277162466e-07, "loss": 1.3213, "step": 4120 }, { "epoch": 0.18499440089585667, "grad_norm": 3.7421300411224365, "learning_rate": 5.684048083261789e-07, "loss": 0.9563, "step": 4130 }, { "epoch": 0.18544232922732362, "grad_norm": 3.190976858139038, "learning_rate": 5.557440718529848e-07, "loss": 1.1234, "step": 4140 }, { "epoch": 0.1858902575587906, "grad_norm": 3.461064100265503, "learning_rate": 5.432146387299522e-07, "loss": 1.0016, "step": 4150 }, { "epoch": 0.18633818589025755, "grad_norm": 6.645826816558838, "learning_rate": 5.308170239930022e-07, "loss": 1.1967, "step": 4160 }, { "epoch": 0.18678611422172453, "grad_norm": 4.823378562927246, "learning_rate": 5.185517372595187e-07, "loss": 1.032, "step": 4170 }, { "epoch": 0.18723404255319148, "grad_norm": 3.5760250091552734, "learning_rate": 5.064192827073995e-07, "loss": 1.1513, "step": 4180 }, { "epoch": 0.18768197088465846, "grad_norm": 3.162781000137329, "learning_rate": 4.944201590543308e-07, "loss": 0.9593, "step": 4190 }, { "epoch": 0.1881298992161254, "grad_norm": 8.633989334106445, "learning_rate": 4.825548595372898e-07, "loss": 1.2696, "step": 4200 }, { "epoch": 0.1881298992161254, "eval_loss": 1.1959577798843384, "eval_runtime": 51.6407, "eval_samples_per_second": 9.682, "eval_steps_per_second": 9.682, "step": 4200 }, { "epoch": 0.1885778275475924, "grad_norm": 4.277423858642578, "learning_rate": 4.7082387189226646e-07, "loss": 1.0834, "step": 4210 }, { "epoch": 0.18902575587905934, "grad_norm": 3.7345645427703857, "learning_rate": 4.5922767833421454e-07, "loss": 1.255, "step": 4220 }, { "epoch": 0.18947368421052632, "grad_norm": 5.163575172424316, "learning_rate": 4.477667555372326e-07, "loss": 1.1317, "step": 4230 }, { "epoch": 0.18992161254199327, "grad_norm": 5.2220892906188965, "learning_rate": 4.364415746149678e-07, "loss": 1.0966, "step": 4240 }, { "epoch": 0.19036954087346025, "grad_norm": 5.796306610107422, "learning_rate": 4.2525260110124964e-07, "loss": 1.0268, "step": 4250 }, { "epoch": 0.1908174692049272, "grad_norm": 4.295403003692627, "learning_rate": 4.1420029493095623e-07, "loss": 1.0465, "step": 4260 }, { "epoch": 0.19126539753639418, "grad_norm": 5.671868324279785, "learning_rate": 4.032851104211036e-07, "loss": 1.2124, "step": 4270 }, { "epoch": 0.19171332586786113, "grad_norm": 4.053644180297852, "learning_rate": 3.925074962521762e-07, "loss": 1.0574, "step": 4280 }, { "epoch": 0.1921612541993281, "grad_norm": 3.7694053649902344, "learning_rate": 3.818678954496787e-07, "loss": 1.0604, "step": 4290 }, { "epoch": 0.19260918253079506, "grad_norm": 4.982527256011963, "learning_rate": 3.713667453659287e-07, "loss": 1.1518, "step": 4300 }, { "epoch": 0.19305711086226204, "grad_norm": 5.036848545074463, "learning_rate": 3.6100447766207473e-07, "loss": 1.0251, "step": 4310 }, { "epoch": 0.193505039193729, "grad_norm": 5.744006633758545, "learning_rate": 3.5078151829035693e-07, "loss": 1.0103, "step": 4320 }, { "epoch": 0.19395296752519597, "grad_norm": 3.843419075012207, "learning_rate": 3.4069828747659405e-07, "loss": 1.0053, "step": 4330 }, { "epoch": 0.19440089585666293, "grad_norm": 4.357511043548584, "learning_rate": 3.3075519970291144e-07, "loss": 1.202, "step": 4340 }, { "epoch": 0.1948488241881299, "grad_norm": 6.164062976837158, "learning_rate": 3.209526636907036e-07, "loss": 1.1136, "step": 4350 }, { "epoch": 0.1948488241881299, "eval_loss": 1.1951868534088135, "eval_runtime": 51.6432, "eval_samples_per_second": 9.682, "eval_steps_per_second": 9.682, "step": 4350 }, { "epoch": 0.19529675251959686, "grad_norm": 3.893348217010498, "learning_rate": 3.1129108238383095e-07, "loss": 1.2238, "step": 4360 }, { "epoch": 0.19574468085106383, "grad_norm": 3.704392433166504, "learning_rate": 3.017708529320604e-07, "loss": 1.0766, "step": 4370 }, { "epoch": 0.19619260918253079, "grad_norm": 4.406269073486328, "learning_rate": 2.923923666747357e-07, "loss": 0.9588, "step": 4380 }, { "epoch": 0.19664053751399777, "grad_norm": 6.578729152679443, "learning_rate": 2.8315600912469477e-07, "loss": 1.1622, "step": 4390 }, { "epoch": 0.19708846584546472, "grad_norm": 4.1804094314575195, "learning_rate": 2.740621599524189e-07, "loss": 1.1999, "step": 4400 }, { "epoch": 0.1975363941769317, "grad_norm": 6.192513465881348, "learning_rate": 2.651111929704303e-07, "loss": 1.1274, "step": 4410 }, { "epoch": 0.19798432250839865, "grad_norm": 4.356874942779541, "learning_rate": 2.563034761179223e-07, "loss": 1.0262, "step": 4420 }, { "epoch": 0.19843225083986563, "grad_norm": 4.435469627380371, "learning_rate": 2.476393714456384e-07, "loss": 1.1814, "step": 4430 }, { "epoch": 0.19888017917133258, "grad_norm": 3.9173505306243896, "learning_rate": 2.391192351009855e-07, "loss": 0.7984, "step": 4440 }, { "epoch": 0.19932810750279956, "grad_norm": 6.546506881713867, "learning_rate": 2.3074341731339837e-07, "loss": 1.168, "step": 4450 }, { "epoch": 0.1997760358342665, "grad_norm": 6.1646223068237305, "learning_rate": 2.225122623799407e-07, "loss": 1.2589, "step": 4460 }, { "epoch": 0.2002239641657335, "grad_norm": 3.210203170776367, "learning_rate": 2.1442610865115135e-07, "loss": 1.0636, "step": 4470 }, { "epoch": 0.20067189249720044, "grad_norm": 5.133816242218018, "learning_rate": 2.0648528851714077e-07, "loss": 1.0195, "step": 4480 }, { "epoch": 0.20111982082866742, "grad_norm": 4.449398517608643, "learning_rate": 1.9869012839392064e-07, "loss": 1.1007, "step": 4490 }, { "epoch": 0.20156774916013437, "grad_norm": 4.8083977699279785, "learning_rate": 1.9104094870999264e-07, "loss": 1.1975, "step": 4500 }, { "epoch": 0.20156774916013437, "eval_loss": 1.1950809955596924, "eval_runtime": 51.7311, "eval_samples_per_second": 9.665, "eval_steps_per_second": 9.665, "step": 4500 }, { "epoch": 0.20201567749160135, "grad_norm": 4.709386348724365, "learning_rate": 1.8353806389317428e-07, "loss": 0.9829, "step": 4510 }, { "epoch": 0.2024636058230683, "grad_norm": 5.23099946975708, "learning_rate": 1.761817823576731e-07, "loss": 1.1149, "step": 4520 }, { "epoch": 0.20291153415453528, "grad_norm": 3.4107179641723633, "learning_rate": 1.6897240649141125e-07, "loss": 0.9822, "step": 4530 }, { "epoch": 0.20335946248600223, "grad_norm": 3.951052188873291, "learning_rate": 1.619102326435923e-07, "loss": 1.2333, "step": 4540 }, { "epoch": 0.2038073908174692, "grad_norm": 4.30809211730957, "learning_rate": 1.5499555111252285e-07, "loss": 1.0641, "step": 4550 }, { "epoch": 0.20425531914893616, "grad_norm": 4.1274189949035645, "learning_rate": 1.4822864613367766e-07, "loss": 1.0962, "step": 4560 }, { "epoch": 0.20470324748040314, "grad_norm": 6.046044826507568, "learning_rate": 1.4160979586801724e-07, "loss": 1.0241, "step": 4570 }, { "epoch": 0.2051511758118701, "grad_norm": 4.066288471221924, "learning_rate": 1.3513927239055036e-07, "loss": 0.9061, "step": 4580 }, { "epoch": 0.20559910414333707, "grad_norm": 3.9250218868255615, "learning_rate": 1.2881734167915425e-07, "loss": 1.1666, "step": 4590 }, { "epoch": 0.20604703247480402, "grad_norm": 4.965548515319824, "learning_rate": 1.2264426360363956e-07, "loss": 0.8048, "step": 4600 }, { "epoch": 0.206494960806271, "grad_norm": 5.192389965057373, "learning_rate": 1.1662029191506775e-07, "loss": 0.9869, "step": 4610 }, { "epoch": 0.20694288913773795, "grad_norm": 4.953862190246582, "learning_rate": 1.107456742353201e-07, "loss": 1.0042, "step": 4620 }, { "epoch": 0.20739081746920493, "grad_norm": 4.955436706542969, "learning_rate": 1.0502065204692062e-07, "loss": 1.101, "step": 4630 }, { "epoch": 0.20783874580067188, "grad_norm": 2.5195674896240234, "learning_rate": 9.94454606831076e-08, "loss": 0.9542, "step": 4640 }, { "epoch": 0.20828667413213886, "grad_norm": 4.142997741699219, "learning_rate": 9.402032931816144e-08, "loss": 1.1318, "step": 4650 }, { "epoch": 0.20828667413213886, "eval_loss": 1.1947814226150513, "eval_runtime": 51.8063, "eval_samples_per_second": 9.651, "eval_steps_per_second": 9.651, "step": 4650 }, { "epoch": 0.2087346024636058, "grad_norm": 4.046876907348633, "learning_rate": 8.874548095798464e-08, "loss": 1.1393, "step": 4660 }, { "epoch": 0.2091825307950728, "grad_norm": 4.740685939788818, "learning_rate": 8.362113243093245e-08, "loss": 1.0529, "step": 4670 }, { "epoch": 0.20963045912653974, "grad_norm": 6.356805324554443, "learning_rate": 7.864749437890173e-08, "loss": 1.2791, "step": 4680 }, { "epoch": 0.21007838745800672, "grad_norm": 4.329228401184082, "learning_rate": 7.382477124867282e-08, "loss": 1.2672, "step": 4690 }, { "epoch": 0.21052631578947367, "grad_norm": 5.217611312866211, "learning_rate": 6.915316128350461e-08, "loss": 0.9357, "step": 4700 }, { "epoch": 0.21097424412094065, "grad_norm": 5.418657302856445, "learning_rate": 6.463285651498563e-08, "loss": 1.011, "step": 4710 }, { "epoch": 0.2114221724524076, "grad_norm": 6.056429386138916, "learning_rate": 6.026404275513875e-08, "loss": 1.4377, "step": 4720 }, { "epoch": 0.21187010078387458, "grad_norm": 3.5456736087799072, "learning_rate": 5.604689958878723e-08, "loss": 1.1192, "step": 4730 }, { "epoch": 0.21231802911534153, "grad_norm": 5.697049140930176, "learning_rate": 5.198160036616898e-08, "loss": 1.0392, "step": 4740 }, { "epoch": 0.2127659574468085, "grad_norm": 4.248316764831543, "learning_rate": 4.8068312195811847e-08, "loss": 1.0041, "step": 4750 }, { "epoch": 0.21321388577827546, "grad_norm": 3.3937604427337646, "learning_rate": 4.4307195937666194e-08, "loss": 0.9791, "step": 4760 }, { "epoch": 0.21366181410974244, "grad_norm": 3.097196340560913, "learning_rate": 4.069840619648935e-08, "loss": 1.1306, "step": 4770 }, { "epoch": 0.2141097424412094, "grad_norm": 5.534854888916016, "learning_rate": 3.72420913154932e-08, "loss": 1.104, "step": 4780 }, { "epoch": 0.21455767077267637, "grad_norm": 5.693947792053223, "learning_rate": 3.3938393370244876e-08, "loss": 1.1541, "step": 4790 }, { "epoch": 0.21500559910414332, "grad_norm": 4.025967597961426, "learning_rate": 3.078744816282731e-08, "loss": 1.1515, "step": 4800 }, { "epoch": 0.21500559910414332, "eval_loss": 1.1954809427261353, "eval_runtime": 51.6284, "eval_samples_per_second": 9.685, "eval_steps_per_second": 9.685, "step": 4800 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.204448348803072e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }