diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4765 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999537845806875, + "eval_steps": 500, + "global_step": 13523, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014788934179999814, + "grad_norm": 1.9570056200027466, + "learning_rate": 1.4781966001478198e-06, + "loss": 12.123, + "step": 20 + }, + { + "epoch": 0.002957786835999963, + "grad_norm": 3.435842990875244, + "learning_rate": 2.9563932002956396e-06, + "loss": 11.8952, + "step": 40 + }, + { + "epoch": 0.0044366802539999445, + "grad_norm": 1.3055179119110107, + "learning_rate": 4.434589800443459e-06, + "loss": 11.1244, + "step": 60 + }, + { + "epoch": 0.005915573671999926, + "grad_norm": 1.1435202360153198, + "learning_rate": 5.912786400591279e-06, + "loss": 10.6584, + "step": 80 + }, + { + "epoch": 0.007394467089999908, + "grad_norm": 1.1122593879699707, + "learning_rate": 7.390983000739099e-06, + "loss": 10.3924, + "step": 100 + }, + { + "epoch": 0.008873360507999889, + "grad_norm": 1.0903944969177246, + "learning_rate": 8.869179600886918e-06, + "loss": 10.1278, + "step": 120 + }, + { + "epoch": 0.010352253925999871, + "grad_norm": 1.0405408143997192, + "learning_rate": 1.0347376201034738e-05, + "loss": 9.829, + "step": 140 + }, + { + "epoch": 0.011831147343999851, + "grad_norm": 1.032538652420044, + "learning_rate": 1.1825572801182558e-05, + "loss": 9.4957, + "step": 160 + }, + { + "epoch": 0.013310040761999833, + "grad_norm": 1.4152177572250366, + "learning_rate": 1.3303769401330377e-05, + "loss": 9.1722, + "step": 180 + }, + { + "epoch": 0.014788934179999816, + "grad_norm": 0.8978266716003418, + "learning_rate": 1.4781966001478198e-05, + "loss": 8.8736, + "step": 200 + }, + { + "epoch": 0.016267827597999798, + "grad_norm": 1.0230133533477783, + "learning_rate": 1.6260162601626018e-05, + "loss": 8.6163, + "step": 220 + }, + { + "epoch": 0.017746721015999778, + "grad_norm": 1.3886386156082153, + "learning_rate": 1.7738359201773837e-05, + "loss": 8.3772, + "step": 240 + }, + { + "epoch": 0.019225614433999758, + "grad_norm": 0.8950226306915283, + "learning_rate": 1.9216555801921658e-05, + "loss": 8.1872, + "step": 260 + }, + { + "epoch": 0.020704507851999742, + "grad_norm": 1.3098183870315552, + "learning_rate": 2.0694752402069477e-05, + "loss": 8.0067, + "step": 280 + }, + { + "epoch": 0.022183401269999722, + "grad_norm": 1.3033353090286255, + "learning_rate": 2.2172949002217298e-05, + "loss": 7.8361, + "step": 300 + }, + { + "epoch": 0.023662294687999703, + "grad_norm": 1.6088228225708008, + "learning_rate": 2.3651145602365117e-05, + "loss": 7.69, + "step": 320 + }, + { + "epoch": 0.025141188105999687, + "grad_norm": 1.0888606309890747, + "learning_rate": 2.5129342202512935e-05, + "loss": 7.5744, + "step": 340 + }, + { + "epoch": 0.026620081523999667, + "grad_norm": 1.0944548845291138, + "learning_rate": 2.6607538802660753e-05, + "loss": 7.4501, + "step": 360 + }, + { + "epoch": 0.028098974941999647, + "grad_norm": 1.5041922330856323, + "learning_rate": 2.8085735402808578e-05, + "loss": 7.3575, + "step": 380 + }, + { + "epoch": 0.02957786835999963, + "grad_norm": 1.4672595262527466, + "learning_rate": 2.9563932002956397e-05, + "loss": 7.2633, + "step": 400 + }, + { + "epoch": 0.03105676177799961, + "grad_norm": 1.3001948595046997, + "learning_rate": 3.104212860310421e-05, + "loss": 7.1749, + "step": 420 + }, + { + "epoch": 0.032535655195999595, + "grad_norm": 1.4149699211120605, + "learning_rate": 3.2520325203252037e-05, + "loss": 7.098, + "step": 440 + }, + { + "epoch": 0.03401454861399957, + "grad_norm": 1.6322951316833496, + "learning_rate": 3.3998521803399855e-05, + "loss": 7.015, + "step": 460 + }, + { + "epoch": 0.035493442031999556, + "grad_norm": 1.659485101699829, + "learning_rate": 3.547671840354767e-05, + "loss": 6.9398, + "step": 480 + }, + { + "epoch": 0.03697233544999954, + "grad_norm": 1.7957265377044678, + "learning_rate": 3.69549150036955e-05, + "loss": 6.8648, + "step": 500 + }, + { + "epoch": 0.038451228867999517, + "grad_norm": 1.4912447929382324, + "learning_rate": 3.8433111603843317e-05, + "loss": 6.7973, + "step": 520 + }, + { + "epoch": 0.0399301222859995, + "grad_norm": 1.7237913608551025, + "learning_rate": 3.9911308203991135e-05, + "loss": 6.7331, + "step": 540 + }, + { + "epoch": 0.041409015703999484, + "grad_norm": 1.8182610273361206, + "learning_rate": 4.138950480413895e-05, + "loss": 6.668, + "step": 560 + }, + { + "epoch": 0.04288790912199946, + "grad_norm": 1.6812163591384888, + "learning_rate": 4.286770140428677e-05, + "loss": 6.5894, + "step": 580 + }, + { + "epoch": 0.044366802539999445, + "grad_norm": 1.818665623664856, + "learning_rate": 4.4345898004434597e-05, + "loss": 6.5361, + "step": 600 + }, + { + "epoch": 0.04584569595799943, + "grad_norm": 1.3113698959350586, + "learning_rate": 4.5824094604582415e-05, + "loss": 6.4732, + "step": 620 + }, + { + "epoch": 0.047324589375999405, + "grad_norm": 1.9587410688400269, + "learning_rate": 4.730229120473023e-05, + "loss": 6.4143, + "step": 640 + }, + { + "epoch": 0.04880348279399939, + "grad_norm": 1.4764151573181152, + "learning_rate": 4.878048780487805e-05, + "loss": 6.358, + "step": 660 + }, + { + "epoch": 0.05028237621199937, + "grad_norm": 1.5685200691223145, + "learning_rate": 5.025868440502587e-05, + "loss": 6.3084, + "step": 680 + }, + { + "epoch": 0.05176126962999935, + "grad_norm": 2.1411592960357666, + "learning_rate": 5.173688100517369e-05, + "loss": 6.2515, + "step": 700 + }, + { + "epoch": 0.053240163047999334, + "grad_norm": 2.6792619228363037, + "learning_rate": 5.3215077605321506e-05, + "loss": 6.2091, + "step": 720 + }, + { + "epoch": 0.05471905646599932, + "grad_norm": 1.5457326173782349, + "learning_rate": 5.4693274205469325e-05, + "loss": 6.1512, + "step": 740 + }, + { + "epoch": 0.056197949883999294, + "grad_norm": 1.931794285774231, + "learning_rate": 5.6171470805617157e-05, + "loss": 6.0981, + "step": 760 + }, + { + "epoch": 0.05767684330199928, + "grad_norm": 2.3924379348754883, + "learning_rate": 5.7649667405764975e-05, + "loss": 6.0439, + "step": 780 + }, + { + "epoch": 0.05915573671999926, + "grad_norm": 2.1078522205352783, + "learning_rate": 5.912786400591279e-05, + "loss": 6.0081, + "step": 800 + }, + { + "epoch": 0.06063463013799924, + "grad_norm": 1.8126791715621948, + "learning_rate": 6.060606060606061e-05, + "loss": 5.9435, + "step": 820 + }, + { + "epoch": 0.06211352355599922, + "grad_norm": 1.6939939260482788, + "learning_rate": 6.208425720620842e-05, + "loss": 5.9, + "step": 840 + }, + { + "epoch": 0.0635924169739992, + "grad_norm": 1.7903132438659668, + "learning_rate": 6.356245380635625e-05, + "loss": 5.8536, + "step": 860 + }, + { + "epoch": 0.06507131039199919, + "grad_norm": 2.1418817043304443, + "learning_rate": 6.504065040650407e-05, + "loss": 5.8192, + "step": 880 + }, + { + "epoch": 0.06655020380999917, + "grad_norm": 1.6386531591415405, + "learning_rate": 6.651884700665188e-05, + "loss": 5.768, + "step": 900 + }, + { + "epoch": 0.06802909722799914, + "grad_norm": 1.82034432888031, + "learning_rate": 6.799704360679971e-05, + "loss": 5.7162, + "step": 920 + }, + { + "epoch": 0.06950799064599913, + "grad_norm": 1.9206963777542114, + "learning_rate": 6.947524020694752e-05, + "loss": 5.6755, + "step": 940 + }, + { + "epoch": 0.07098688406399911, + "grad_norm": 1.4253259897232056, + "learning_rate": 7.095343680709535e-05, + "loss": 5.6321, + "step": 960 + }, + { + "epoch": 0.07246577748199909, + "grad_norm": 2.0578746795654297, + "learning_rate": 7.243163340724317e-05, + "loss": 5.5907, + "step": 980 + }, + { + "epoch": 0.07394467089999908, + "grad_norm": 1.4132108688354492, + "learning_rate": 7.3909830007391e-05, + "loss": 5.5483, + "step": 1000 + }, + { + "epoch": 0.07542356431799906, + "grad_norm": 1.6758071184158325, + "learning_rate": 7.538802660753881e-05, + "loss": 5.5136, + "step": 1020 + }, + { + "epoch": 0.07690245773599903, + "grad_norm": 1.5184019804000854, + "learning_rate": 7.686622320768663e-05, + "loss": 5.4715, + "step": 1040 + }, + { + "epoch": 0.07838135115399902, + "grad_norm": 1.731789231300354, + "learning_rate": 7.834441980783444e-05, + "loss": 5.4289, + "step": 1060 + }, + { + "epoch": 0.079860244571999, + "grad_norm": 1.4423941373825073, + "learning_rate": 7.982261640798227e-05, + "loss": 5.3799, + "step": 1080 + }, + { + "epoch": 0.08133913798999898, + "grad_norm": 1.200088620185852, + "learning_rate": 8.130081300813008e-05, + "loss": 5.3446, + "step": 1100 + }, + { + "epoch": 0.08281803140799897, + "grad_norm": 1.5034804344177246, + "learning_rate": 8.27790096082779e-05, + "loss": 5.3011, + "step": 1120 + }, + { + "epoch": 0.08429692482599895, + "grad_norm": 1.6272141933441162, + "learning_rate": 8.425720620842572e-05, + "loss": 5.2573, + "step": 1140 + }, + { + "epoch": 0.08577581824399892, + "grad_norm": 1.6940892934799194, + "learning_rate": 8.573540280857354e-05, + "loss": 5.2206, + "step": 1160 + }, + { + "epoch": 0.08725471166199891, + "grad_norm": 1.531122088432312, + "learning_rate": 8.721359940872137e-05, + "loss": 5.1842, + "step": 1180 + }, + { + "epoch": 0.08873360507999889, + "grad_norm": 1.3891607522964478, + "learning_rate": 8.869179600886919e-05, + "loss": 5.1574, + "step": 1200 + }, + { + "epoch": 0.09021249849799887, + "grad_norm": 1.5175141096115112, + "learning_rate": 9.0169992609017e-05, + "loss": 5.0965, + "step": 1220 + }, + { + "epoch": 0.09169139191599886, + "grad_norm": 1.2954392433166504, + "learning_rate": 9.164818920916483e-05, + "loss": 5.0615, + "step": 1240 + }, + { + "epoch": 0.09317028533399883, + "grad_norm": 1.1776789426803589, + "learning_rate": 9.312638580931264e-05, + "loss": 5.0263, + "step": 1260 + }, + { + "epoch": 0.09464917875199881, + "grad_norm": 1.342835545539856, + "learning_rate": 9.460458240946047e-05, + "loss": 4.9938, + "step": 1280 + }, + { + "epoch": 0.0961280721699988, + "grad_norm": 1.5098336935043335, + "learning_rate": 9.608277900960828e-05, + "loss": 4.9579, + "step": 1300 + }, + { + "epoch": 0.09760696558799878, + "grad_norm": 1.3883858919143677, + "learning_rate": 9.75609756097561e-05, + "loss": 4.9159, + "step": 1320 + }, + { + "epoch": 0.09908585900599876, + "grad_norm": 1.6131935119628906, + "learning_rate": 9.903917220990391e-05, + "loss": 4.8716, + "step": 1340 + }, + { + "epoch": 0.10056475242399875, + "grad_norm": 1.3793425559997559, + "learning_rate": 9.999991836910476e-05, + "loss": 4.8389, + "step": 1360 + }, + { + "epoch": 0.10204364584199872, + "grad_norm": 1.2413076162338257, + "learning_rate": 9.999878553677705e-05, + "loss": 4.8044, + "step": 1380 + }, + { + "epoch": 0.1035225392599987, + "grad_norm": 1.4875175952911377, + "learning_rate": 9.99963199901083e-05, + "loss": 4.759, + "step": 1400 + }, + { + "epoch": 0.10500143267799869, + "grad_norm": 1.281230092048645, + "learning_rate": 9.999252179481748e-05, + "loss": 4.733, + "step": 1420 + }, + { + "epoch": 0.10648032609599867, + "grad_norm": 1.179935336112976, + "learning_rate": 9.998739105214525e-05, + "loss": 4.6965, + "step": 1440 + }, + { + "epoch": 0.10795921951399864, + "grad_norm": 1.2033872604370117, + "learning_rate": 9.998092789885118e-05, + "loss": 4.649, + "step": 1460 + }, + { + "epoch": 0.10943811293199864, + "grad_norm": 1.310261607170105, + "learning_rate": 9.997313250721026e-05, + "loss": 4.6158, + "step": 1480 + }, + { + "epoch": 0.11091700634999861, + "grad_norm": 1.1370333433151245, + "learning_rate": 9.996400508500809e-05, + "loss": 4.5917, + "step": 1500 + }, + { + "epoch": 0.11239589976799859, + "grad_norm": 0.9518343210220337, + "learning_rate": 9.995354587553553e-05, + "loss": 4.5477, + "step": 1520 + }, + { + "epoch": 0.11387479318599858, + "grad_norm": 1.1209640502929688, + "learning_rate": 9.994175515758211e-05, + "loss": 4.5169, + "step": 1540 + }, + { + "epoch": 0.11535368660399856, + "grad_norm": 1.1134682893753052, + "learning_rate": 9.992863324542865e-05, + "loss": 4.4921, + "step": 1560 + }, + { + "epoch": 0.11683258002199853, + "grad_norm": 1.1962740421295166, + "learning_rate": 9.991418048883885e-05, + "loss": 4.4678, + "step": 1580 + }, + { + "epoch": 0.11831147343999852, + "grad_norm": 1.0190341472625732, + "learning_rate": 9.989839727305e-05, + "loss": 4.4265, + "step": 1600 + }, + { + "epoch": 0.1197903668579985, + "grad_norm": 1.1323659420013428, + "learning_rate": 9.988128401876267e-05, + "loss": 4.3951, + "step": 1620 + }, + { + "epoch": 0.12126926027599848, + "grad_norm": 1.2068976163864136, + "learning_rate": 9.986284118212951e-05, + "loss": 4.3762, + "step": 1640 + }, + { + "epoch": 0.12274815369399847, + "grad_norm": 1.1199101209640503, + "learning_rate": 9.984306925474313e-05, + "loss": 4.3519, + "step": 1660 + }, + { + "epoch": 0.12422704711199845, + "grad_norm": 0.8594743013381958, + "learning_rate": 9.982196876362298e-05, + "loss": 4.3268, + "step": 1680 + }, + { + "epoch": 0.12570594052999842, + "grad_norm": 1.0981128215789795, + "learning_rate": 9.979954027120124e-05, + "loss": 4.3018, + "step": 1700 + }, + { + "epoch": 0.1271848339479984, + "grad_norm": 0.9453332424163818, + "learning_rate": 9.97757843753079e-05, + "loss": 4.2747, + "step": 1720 + }, + { + "epoch": 0.1286637273659984, + "grad_norm": 0.9754221439361572, + "learning_rate": 9.975070170915481e-05, + "loss": 4.2539, + "step": 1740 + }, + { + "epoch": 0.13014262078399838, + "grad_norm": 0.7794106602668762, + "learning_rate": 9.972429294131878e-05, + "loss": 4.2331, + "step": 1760 + }, + { + "epoch": 0.13162151420199836, + "grad_norm": 0.8084755539894104, + "learning_rate": 9.969655877572379e-05, + "loss": 4.2076, + "step": 1780 + }, + { + "epoch": 0.13310040761999833, + "grad_norm": 0.9451693296432495, + "learning_rate": 9.96674999516222e-05, + "loss": 4.2023, + "step": 1800 + }, + { + "epoch": 0.1345793010379983, + "grad_norm": 0.9662824869155884, + "learning_rate": 9.963711724357503e-05, + "loss": 4.1661, + "step": 1820 + }, + { + "epoch": 0.1360581944559983, + "grad_norm": 0.8646146655082703, + "learning_rate": 9.960541146143138e-05, + "loss": 4.1529, + "step": 1840 + }, + { + "epoch": 0.1375370878739983, + "grad_norm": 0.819580078125, + "learning_rate": 9.957238345030681e-05, + "loss": 4.1353, + "step": 1860 + }, + { + "epoch": 0.13901598129199827, + "grad_norm": 0.793268620967865, + "learning_rate": 9.953803409056077e-05, + "loss": 4.1205, + "step": 1880 + }, + { + "epoch": 0.14049487470999825, + "grad_norm": 0.8794734477996826, + "learning_rate": 9.950236429777319e-05, + "loss": 4.1034, + "step": 1900 + }, + { + "epoch": 0.14197376812799822, + "grad_norm": 0.8757349252700806, + "learning_rate": 9.946537502272004e-05, + "loss": 4.0896, + "step": 1920 + }, + { + "epoch": 0.1434526615459982, + "grad_norm": 0.806181788444519, + "learning_rate": 9.942706725134801e-05, + "loss": 4.0792, + "step": 1940 + }, + { + "epoch": 0.14493155496399818, + "grad_norm": 0.568131148815155, + "learning_rate": 9.938744200474825e-05, + "loss": 4.0483, + "step": 1960 + }, + { + "epoch": 0.14641044838199818, + "grad_norm": 0.9386783242225647, + "learning_rate": 9.934650033912909e-05, + "loss": 4.0349, + "step": 1980 + }, + { + "epoch": 0.14788934179999816, + "grad_norm": 0.8668307065963745, + "learning_rate": 9.930424334578793e-05, + "loss": 4.0249, + "step": 2000 + }, + { + "epoch": 0.14936823521799814, + "grad_norm": 0.7728129625320435, + "learning_rate": 9.926067215108216e-05, + "loss": 4.001, + "step": 2020 + }, + { + "epoch": 0.1508471286359981, + "grad_norm": 0.8983877301216125, + "learning_rate": 9.92157879163991e-05, + "loss": 4.0099, + "step": 2040 + }, + { + "epoch": 0.1523260220539981, + "grad_norm": 0.7290263772010803, + "learning_rate": 9.916959183812508e-05, + "loss": 3.9816, + "step": 2060 + }, + { + "epoch": 0.15380491547199807, + "grad_norm": 1.0002912282943726, + "learning_rate": 9.912208514761353e-05, + "loss": 3.964, + "step": 2080 + }, + { + "epoch": 0.15528380888999807, + "grad_norm": 0.8696877956390381, + "learning_rate": 9.907326911115215e-05, + "loss": 3.9532, + "step": 2100 + }, + { + "epoch": 0.15676270230799805, + "grad_norm": 0.9264429211616516, + "learning_rate": 9.90231450299292e-05, + "loss": 3.9405, + "step": 2120 + }, + { + "epoch": 0.15824159572599802, + "grad_norm": 0.6036892533302307, + "learning_rate": 9.897171423999877e-05, + "loss": 3.9308, + "step": 2140 + }, + { + "epoch": 0.159720489143998, + "grad_norm": 0.6206973791122437, + "learning_rate": 9.891897811224516e-05, + "loss": 3.9089, + "step": 2160 + }, + { + "epoch": 0.16119938256199798, + "grad_norm": 0.9498934149742126, + "learning_rate": 9.886493805234642e-05, + "loss": 3.9101, + "step": 2180 + }, + { + "epoch": 0.16267827597999795, + "grad_norm": 0.8084043264389038, + "learning_rate": 9.880959550073676e-05, + "loss": 3.9108, + "step": 2200 + }, + { + "epoch": 0.16415716939799796, + "grad_norm": 0.7810977697372437, + "learning_rate": 9.875295193256829e-05, + "loss": 3.8923, + "step": 2220 + }, + { + "epoch": 0.16563606281599794, + "grad_norm": 0.5951938033103943, + "learning_rate": 9.869500885767156e-05, + "loss": 3.8676, + "step": 2240 + }, + { + "epoch": 0.1671149562339979, + "grad_norm": 0.7140426635742188, + "learning_rate": 9.863576782051544e-05, + "loss": 3.8717, + "step": 2260 + }, + { + "epoch": 0.1685938496519979, + "grad_norm": 0.7328889966011047, + "learning_rate": 9.857523040016588e-05, + "loss": 3.8585, + "step": 2280 + }, + { + "epoch": 0.17007274306999787, + "grad_norm": 0.9172821044921875, + "learning_rate": 9.851339821024383e-05, + "loss": 3.8515, + "step": 2300 + }, + { + "epoch": 0.17155163648799784, + "grad_norm": 0.70406574010849, + "learning_rate": 9.845027289888226e-05, + "loss": 3.8322, + "step": 2320 + }, + { + "epoch": 0.17303052990599785, + "grad_norm": 0.6545581221580505, + "learning_rate": 9.838585614868221e-05, + "loss": 3.8342, + "step": 2340 + }, + { + "epoch": 0.17450942332399783, + "grad_norm": 0.8262337446212769, + "learning_rate": 9.832014967666788e-05, + "loss": 3.8178, + "step": 2360 + }, + { + "epoch": 0.1759883167419978, + "grad_norm": 0.748437225818634, + "learning_rate": 9.825315523424097e-05, + "loss": 3.8054, + "step": 2380 + }, + { + "epoch": 0.17746721015999778, + "grad_norm": 0.7961335778236389, + "learning_rate": 9.818487460713397e-05, + "loss": 3.803, + "step": 2400 + }, + { + "epoch": 0.17894610357799776, + "grad_norm": 0.5949457287788391, + "learning_rate": 9.811530961536246e-05, + "loss": 3.7988, + "step": 2420 + }, + { + "epoch": 0.18042499699599773, + "grad_norm": 0.6500332355499268, + "learning_rate": 9.804446211317677e-05, + "loss": 3.7902, + "step": 2440 + }, + { + "epoch": 0.18190389041399774, + "grad_norm": 0.5734246969223022, + "learning_rate": 9.797233398901238e-05, + "loss": 3.7788, + "step": 2460 + }, + { + "epoch": 0.18338278383199771, + "grad_norm": 0.6358067393302917, + "learning_rate": 9.78989271654397e-05, + "loss": 3.7581, + "step": 2480 + }, + { + "epoch": 0.1848616772499977, + "grad_norm": 0.7676229476928711, + "learning_rate": 9.78242435991128e-05, + "loss": 3.7566, + "step": 2500 + }, + { + "epoch": 0.18634057066799767, + "grad_norm": 0.5594522356987, + "learning_rate": 9.774828528071722e-05, + "loss": 3.7552, + "step": 2520 + }, + { + "epoch": 0.18781946408599764, + "grad_norm": 0.7414741516113281, + "learning_rate": 9.767105423491694e-05, + "loss": 3.7404, + "step": 2540 + }, + { + "epoch": 0.18929835750399762, + "grad_norm": 0.6007790565490723, + "learning_rate": 9.759255252030042e-05, + "loss": 3.7308, + "step": 2560 + }, + { + "epoch": 0.19077725092199763, + "grad_norm": 0.6344082355499268, + "learning_rate": 9.751278222932569e-05, + "loss": 3.7179, + "step": 2580 + }, + { + "epoch": 0.1922561443399976, + "grad_norm": 0.6184104681015015, + "learning_rate": 9.743174548826461e-05, + "loss": 3.7177, + "step": 2600 + }, + { + "epoch": 0.19373503775799758, + "grad_norm": 0.785652756690979, + "learning_rate": 9.734944445714618e-05, + "loss": 3.7022, + "step": 2620 + }, + { + "epoch": 0.19521393117599756, + "grad_norm": 0.664434015750885, + "learning_rate": 9.726588132969901e-05, + "loss": 3.6885, + "step": 2640 + }, + { + "epoch": 0.19669282459399753, + "grad_norm": 0.6987696290016174, + "learning_rate": 9.718105833329272e-05, + "loss": 3.682, + "step": 2660 + }, + { + "epoch": 0.1981717180119975, + "grad_norm": 0.5085122585296631, + "learning_rate": 9.709497772887874e-05, + "loss": 3.6707, + "step": 2680 + }, + { + "epoch": 0.19965061142999752, + "grad_norm": 0.8911309838294983, + "learning_rate": 9.700764181092988e-05, + "loss": 3.6517, + "step": 2700 + }, + { + "epoch": 0.2011295048479975, + "grad_norm": 0.7100036144256592, + "learning_rate": 9.691905290737932e-05, + "loss": 3.6738, + "step": 2720 + }, + { + "epoch": 0.20260839826599747, + "grad_norm": 0.5330691933631897, + "learning_rate": 9.682921337955847e-05, + "loss": 3.664, + "step": 2740 + }, + { + "epoch": 0.20408729168399745, + "grad_norm": 0.5505249500274658, + "learning_rate": 9.673812562213401e-05, + "loss": 3.6491, + "step": 2760 + }, + { + "epoch": 0.20556618510199742, + "grad_norm": 0.7107018232345581, + "learning_rate": 9.664579206304413e-05, + "loss": 3.6406, + "step": 2780 + }, + { + "epoch": 0.2070450785199974, + "grad_norm": 0.5617266893386841, + "learning_rate": 9.65522151634338e-05, + "loss": 3.653, + "step": 2800 + }, + { + "epoch": 0.2085239719379974, + "grad_norm": 0.5702326893806458, + "learning_rate": 9.64573974175891e-05, + "loss": 3.6311, + "step": 2820 + }, + { + "epoch": 0.21000286535599738, + "grad_norm": 0.5759734511375427, + "learning_rate": 9.636134135287081e-05, + "loss": 3.6256, + "step": 2840 + }, + { + "epoch": 0.21148175877399736, + "grad_norm": 0.6595752835273743, + "learning_rate": 9.626404952964704e-05, + "loss": 3.6184, + "step": 2860 + }, + { + "epoch": 0.21296065219199733, + "grad_norm": 0.7071236371994019, + "learning_rate": 9.616552454122492e-05, + "loss": 3.6138, + "step": 2880 + }, + { + "epoch": 0.2144395456099973, + "grad_norm": 0.7660998702049255, + "learning_rate": 9.606576901378156e-05, + "loss": 3.6059, + "step": 2900 + }, + { + "epoch": 0.2159184390279973, + "grad_norm": 0.9190542101860046, + "learning_rate": 9.596478560629397e-05, + "loss": 3.5887, + "step": 2920 + }, + { + "epoch": 0.2173973324459973, + "grad_norm": 0.5795056223869324, + "learning_rate": 9.586257701046824e-05, + "loss": 3.5981, + "step": 2940 + }, + { + "epoch": 0.21887622586399727, + "grad_norm": 0.607071578502655, + "learning_rate": 9.575914595066777e-05, + "loss": 3.592, + "step": 2960 + }, + { + "epoch": 0.22035511928199725, + "grad_norm": 0.7824068069458008, + "learning_rate": 9.565449518384066e-05, + "loss": 3.5919, + "step": 2980 + }, + { + "epoch": 0.22183401269999722, + "grad_norm": 0.5169054269790649, + "learning_rate": 9.554862749944622e-05, + "loss": 3.5899, + "step": 3000 + }, + { + "epoch": 0.2233129061179972, + "grad_norm": 0.8486248850822449, + "learning_rate": 9.544154571938062e-05, + "loss": 3.5707, + "step": 3020 + }, + { + "epoch": 0.22479179953599718, + "grad_norm": 0.47671154141426086, + "learning_rate": 9.533325269790167e-05, + "loss": 3.559, + "step": 3040 + }, + { + "epoch": 0.22627069295399718, + "grad_norm": 0.5938573479652405, + "learning_rate": 9.522375132155272e-05, + "loss": 3.5422, + "step": 3060 + }, + { + "epoch": 0.22774958637199716, + "grad_norm": 0.6117560267448425, + "learning_rate": 9.511304450908576e-05, + "loss": 3.5671, + "step": 3080 + }, + { + "epoch": 0.22922847978999714, + "grad_norm": 0.6173937916755676, + "learning_rate": 9.500113521138361e-05, + "loss": 3.5669, + "step": 3100 + }, + { + "epoch": 0.2307073732079971, + "grad_norm": 0.726667046546936, + "learning_rate": 9.488802641138125e-05, + "loss": 3.5366, + "step": 3120 + }, + { + "epoch": 0.2321862666259971, + "grad_norm": 0.5627657771110535, + "learning_rate": 9.477372112398629e-05, + "loss": 3.53, + "step": 3140 + }, + { + "epoch": 0.23366516004399707, + "grad_norm": 0.49706488847732544, + "learning_rate": 9.465822239599864e-05, + "loss": 3.5406, + "step": 3160 + }, + { + "epoch": 0.23514405346199707, + "grad_norm": 0.9899396896362305, + "learning_rate": 9.454153330602932e-05, + "loss": 3.5231, + "step": 3180 + }, + { + "epoch": 0.23662294687999705, + "grad_norm": 0.4798751771450043, + "learning_rate": 9.442365696441835e-05, + "loss": 3.5116, + "step": 3200 + }, + { + "epoch": 0.23810184029799702, + "grad_norm": 0.6276853084564209, + "learning_rate": 9.430459651315185e-05, + "loss": 3.5184, + "step": 3220 + }, + { + "epoch": 0.239580733715997, + "grad_norm": 0.4986541271209717, + "learning_rate": 9.418435512577833e-05, + "loss": 3.5119, + "step": 3240 + }, + { + "epoch": 0.24105962713399698, + "grad_norm": 0.535453736782074, + "learning_rate": 9.406293600732408e-05, + "loss": 3.5147, + "step": 3260 + }, + { + "epoch": 0.24253852055199696, + "grad_norm": 0.5945438146591187, + "learning_rate": 9.39403423942077e-05, + "loss": 3.5023, + "step": 3280 + }, + { + "epoch": 0.24401741396999696, + "grad_norm": 0.6451681852340698, + "learning_rate": 9.381657755415387e-05, + "loss": 3.4846, + "step": 3300 + }, + { + "epoch": 0.24549630738799694, + "grad_norm": 0.6193166375160217, + "learning_rate": 9.369164478610631e-05, + "loss": 3.488, + "step": 3320 + }, + { + "epoch": 0.24697520080599691, + "grad_norm": 0.7059178352355957, + "learning_rate": 9.35655474201397e-05, + "loss": 3.4883, + "step": 3340 + }, + { + "epoch": 0.2484540942239969, + "grad_norm": 0.6481304168701172, + "learning_rate": 9.343828881737107e-05, + "loss": 3.4762, + "step": 3360 + }, + { + "epoch": 0.24993298764199687, + "grad_norm": 0.5440752506256104, + "learning_rate": 9.330987236987008e-05, + "loss": 3.481, + "step": 3380 + }, + { + "epoch": 0.25141188105999684, + "grad_norm": 0.5582643747329712, + "learning_rate": 9.318030150056869e-05, + "loss": 3.4755, + "step": 3400 + }, + { + "epoch": 0.25289077447799685, + "grad_norm": 0.6249572038650513, + "learning_rate": 9.304957966316995e-05, + "loss": 3.4775, + "step": 3420 + }, + { + "epoch": 0.2543696678959968, + "grad_norm": 0.6695943474769592, + "learning_rate": 9.291771034205578e-05, + "loss": 3.463, + "step": 3440 + }, + { + "epoch": 0.2558485613139968, + "grad_norm": 0.4462078809738159, + "learning_rate": 9.27846970521943e-05, + "loss": 3.4561, + "step": 3460 + }, + { + "epoch": 0.2573274547319968, + "grad_norm": 0.49235352873802185, + "learning_rate": 9.265054333904601e-05, + "loss": 3.4515, + "step": 3480 + }, + { + "epoch": 0.25880634814999676, + "grad_norm": 0.6507192254066467, + "learning_rate": 9.251525277846929e-05, + "loss": 3.4514, + "step": 3500 + }, + { + "epoch": 0.26028524156799676, + "grad_norm": 0.4588228166103363, + "learning_rate": 9.237882897662515e-05, + "loss": 3.4286, + "step": 3520 + }, + { + "epoch": 0.2617641349859967, + "grad_norm": 0.575430691242218, + "learning_rate": 9.224127556988107e-05, + "loss": 3.4458, + "step": 3540 + }, + { + "epoch": 0.2632430284039967, + "grad_norm": 0.7287342548370361, + "learning_rate": 9.210259622471403e-05, + "loss": 3.4318, + "step": 3560 + }, + { + "epoch": 0.26472192182199666, + "grad_norm": 0.6866022348403931, + "learning_rate": 9.19627946376129e-05, + "loss": 3.4361, + "step": 3580 + }, + { + "epoch": 0.26620081523999667, + "grad_norm": 0.5268846750259399, + "learning_rate": 9.182187453497974e-05, + "loss": 3.4364, + "step": 3600 + }, + { + "epoch": 0.2676797086579967, + "grad_norm": 0.6380168795585632, + "learning_rate": 9.167983967303066e-05, + "loss": 3.4389, + "step": 3620 + }, + { + "epoch": 0.2691586020759966, + "grad_norm": 0.6250066757202148, + "learning_rate": 9.153669383769556e-05, + "loss": 3.4322, + "step": 3640 + }, + { + "epoch": 0.2706374954939966, + "grad_norm": 0.6497014164924622, + "learning_rate": 9.139244084451729e-05, + "loss": 3.4068, + "step": 3660 + }, + { + "epoch": 0.2721163889119966, + "grad_norm": 0.8837792277336121, + "learning_rate": 9.124708453854983e-05, + "loss": 3.4132, + "step": 3680 + }, + { + "epoch": 0.2735952823299966, + "grad_norm": 0.5183786153793335, + "learning_rate": 9.110062879425602e-05, + "loss": 3.4081, + "step": 3700 + }, + { + "epoch": 0.2750741757479966, + "grad_norm": 0.7497463226318359, + "learning_rate": 9.095307751540407e-05, + "loss": 3.3986, + "step": 3720 + }, + { + "epoch": 0.27655306916599653, + "grad_norm": 0.5026047825813293, + "learning_rate": 9.080443463496363e-05, + "loss": 3.4111, + "step": 3740 + }, + { + "epoch": 0.27803196258399654, + "grad_norm": 0.4640219211578369, + "learning_rate": 9.06547041150009e-05, + "loss": 3.3865, + "step": 3760 + }, + { + "epoch": 0.2795108560019965, + "grad_norm": 0.5095507502555847, + "learning_rate": 9.050388994657303e-05, + "loss": 3.3915, + "step": 3780 + }, + { + "epoch": 0.2809897494199965, + "grad_norm": 0.5542161464691162, + "learning_rate": 9.035199614962178e-05, + "loss": 3.3924, + "step": 3800 + }, + { + "epoch": 0.28246864283799644, + "grad_norm": 0.44914740324020386, + "learning_rate": 9.019902677286631e-05, + "loss": 3.3968, + "step": 3820 + }, + { + "epoch": 0.28394753625599645, + "grad_norm": 0.4764072000980377, + "learning_rate": 9.004498589369532e-05, + "loss": 3.3937, + "step": 3840 + }, + { + "epoch": 0.28542642967399645, + "grad_norm": 1.0480468273162842, + "learning_rate": 8.98898776180583e-05, + "loss": 3.3926, + "step": 3860 + }, + { + "epoch": 0.2869053230919964, + "grad_norm": 0.5355066061019897, + "learning_rate": 8.973370608035612e-05, + "loss": 3.3895, + "step": 3880 + }, + { + "epoch": 0.2883842165099964, + "grad_norm": 0.4495852589607239, + "learning_rate": 8.957647544333088e-05, + "loss": 3.3717, + "step": 3900 + }, + { + "epoch": 0.28986310992799635, + "grad_norm": 0.5025330781936646, + "learning_rate": 8.941818989795487e-05, + "loss": 3.3653, + "step": 3920 + }, + { + "epoch": 0.29134200334599636, + "grad_norm": 0.7565049529075623, + "learning_rate": 8.925885366331887e-05, + "loss": 3.3668, + "step": 3940 + }, + { + "epoch": 0.29282089676399636, + "grad_norm": 0.8078230619430542, + "learning_rate": 8.909847098651978e-05, + "loss": 3.3678, + "step": 3960 + }, + { + "epoch": 0.2942997901819963, + "grad_norm": 0.532131552696228, + "learning_rate": 8.893704614254725e-05, + "loss": 3.3616, + "step": 3980 + }, + { + "epoch": 0.2957786835999963, + "grad_norm": 0.6017030477523804, + "learning_rate": 8.877458343416993e-05, + "loss": 3.349, + "step": 4000 + }, + { + "epoch": 0.29725757701799627, + "grad_norm": 0.5634870529174805, + "learning_rate": 8.861108719182061e-05, + "loss": 3.3385, + "step": 4020 + }, + { + "epoch": 0.29873647043599627, + "grad_norm": 0.5135075449943542, + "learning_rate": 8.844656177348087e-05, + "loss": 3.353, + "step": 4040 + }, + { + "epoch": 0.3002153638539962, + "grad_norm": 0.49317190051078796, + "learning_rate": 8.828101156456493e-05, + "loss": 3.3455, + "step": 4060 + }, + { + "epoch": 0.3016942572719962, + "grad_norm": 0.5618060827255249, + "learning_rate": 8.811444097780273e-05, + "loss": 3.3444, + "step": 4080 + }, + { + "epoch": 0.30317315068999623, + "grad_norm": 0.5211082100868225, + "learning_rate": 8.79468544531223e-05, + "loss": 3.3491, + "step": 4100 + }, + { + "epoch": 0.3046520441079962, + "grad_norm": 0.5708051919937134, + "learning_rate": 8.777825645753144e-05, + "loss": 3.3345, + "step": 4120 + }, + { + "epoch": 0.3061309375259962, + "grad_norm": 0.5056930184364319, + "learning_rate": 8.760865148499862e-05, + "loss": 3.3333, + "step": 4140 + }, + { + "epoch": 0.30760983094399613, + "grad_norm": 0.5034912824630737, + "learning_rate": 8.743804405633327e-05, + "loss": 3.3313, + "step": 4160 + }, + { + "epoch": 0.30908872436199614, + "grad_norm": 0.6101865768432617, + "learning_rate": 8.726643871906512e-05, + "loss": 3.3211, + "step": 4180 + }, + { + "epoch": 0.31056761777999614, + "grad_norm": 0.49354320764541626, + "learning_rate": 8.709384004732322e-05, + "loss": 3.328, + "step": 4200 + }, + { + "epoch": 0.3120465111979961, + "grad_norm": 1.0049197673797607, + "learning_rate": 8.69202526417138e-05, + "loss": 3.3256, + "step": 4220 + }, + { + "epoch": 0.3135254046159961, + "grad_norm": 0.4796050786972046, + "learning_rate": 8.67456811291977e-05, + "loss": 3.3264, + "step": 4240 + }, + { + "epoch": 0.31500429803399604, + "grad_norm": 0.6114419102668762, + "learning_rate": 8.657013016296716e-05, + "loss": 3.3041, + "step": 4260 + }, + { + "epoch": 0.31648319145199605, + "grad_norm": 0.6853553652763367, + "learning_rate": 8.639360442232163e-05, + "loss": 3.3123, + "step": 4280 + }, + { + "epoch": 0.317962084869996, + "grad_norm": 0.4117718040943146, + "learning_rate": 8.621610861254307e-05, + "loss": 3.3036, + "step": 4300 + }, + { + "epoch": 0.319440978287996, + "grad_norm": 0.4868248701095581, + "learning_rate": 8.60376474647707e-05, + "loss": 3.3112, + "step": 4320 + }, + { + "epoch": 0.320919871705996, + "grad_norm": 0.4655211865901947, + "learning_rate": 8.585822573587463e-05, + "loss": 3.2959, + "step": 4340 + }, + { + "epoch": 0.32239876512399596, + "grad_norm": 0.4244300127029419, + "learning_rate": 8.567784820832926e-05, + "loss": 3.3006, + "step": 4360 + }, + { + "epoch": 0.32387765854199596, + "grad_norm": 0.5585177540779114, + "learning_rate": 8.549651969008572e-05, + "loss": 3.304, + "step": 4380 + }, + { + "epoch": 0.3253565519599959, + "grad_norm": 0.4044816493988037, + "learning_rate": 8.531424501444376e-05, + "loss": 3.2943, + "step": 4400 + }, + { + "epoch": 0.3268354453779959, + "grad_norm": 0.5332701802253723, + "learning_rate": 8.513102903992285e-05, + "loss": 3.2691, + "step": 4420 + }, + { + "epoch": 0.3283143387959959, + "grad_norm": 0.6828725934028625, + "learning_rate": 8.494687665013274e-05, + "loss": 3.2757, + "step": 4440 + }, + { + "epoch": 0.32979323221399587, + "grad_norm": 0.4340764284133911, + "learning_rate": 8.476179275364331e-05, + "loss": 3.2798, + "step": 4460 + }, + { + "epoch": 0.3312721256319959, + "grad_norm": 0.5927674770355225, + "learning_rate": 8.457578228385362e-05, + "loss": 3.277, + "step": 4480 + }, + { + "epoch": 0.3327510190499958, + "grad_norm": 0.5142761468887329, + "learning_rate": 8.438885019886051e-05, + "loss": 3.2745, + "step": 4500 + }, + { + "epoch": 0.3342299124679958, + "grad_norm": 0.5035094618797302, + "learning_rate": 8.420100148132643e-05, + "loss": 3.282, + "step": 4520 + }, + { + "epoch": 0.33570880588599583, + "grad_norm": 0.4529162049293518, + "learning_rate": 8.40122411383466e-05, + "loss": 3.2741, + "step": 4540 + }, + { + "epoch": 0.3371876993039958, + "grad_norm": 0.47236135601997375, + "learning_rate": 8.382257420131554e-05, + "loss": 3.2566, + "step": 4560 + }, + { + "epoch": 0.3386665927219958, + "grad_norm": 0.5067903995513916, + "learning_rate": 8.363200572579297e-05, + "loss": 3.2729, + "step": 4580 + }, + { + "epoch": 0.34014548613999573, + "grad_norm": 0.5891897678375244, + "learning_rate": 8.344054079136911e-05, + "loss": 3.254, + "step": 4600 + }, + { + "epoch": 0.34162437955799574, + "grad_norm": 0.4857490062713623, + "learning_rate": 8.324818450152917e-05, + "loss": 3.2704, + "step": 4620 + }, + { + "epoch": 0.3431032729759957, + "grad_norm": 0.5922226309776306, + "learning_rate": 8.305494198351741e-05, + "loss": 3.2511, + "step": 4640 + }, + { + "epoch": 0.3445821663939957, + "grad_norm": 0.5176606178283691, + "learning_rate": 8.286081838820047e-05, + "loss": 3.2577, + "step": 4660 + }, + { + "epoch": 0.3460610598119957, + "grad_norm": 0.4542312026023865, + "learning_rate": 8.266581888993e-05, + "loss": 3.269, + "step": 4680 + }, + { + "epoch": 0.34753995322999565, + "grad_norm": 0.4864133596420288, + "learning_rate": 8.246994868640478e-05, + "loss": 3.2468, + "step": 4700 + }, + { + "epoch": 0.34901884664799565, + "grad_norm": 0.5213157534599304, + "learning_rate": 8.227321299853225e-05, + "loss": 3.2431, + "step": 4720 + }, + { + "epoch": 0.3504977400659956, + "grad_norm": 0.495194673538208, + "learning_rate": 8.207561707028921e-05, + "loss": 3.26, + "step": 4740 + }, + { + "epoch": 0.3519766334839956, + "grad_norm": 0.47876933217048645, + "learning_rate": 8.187716616858217e-05, + "loss": 3.2397, + "step": 4760 + }, + { + "epoch": 0.3534555269019956, + "grad_norm": 0.558392345905304, + "learning_rate": 8.167786558310679e-05, + "loss": 3.2357, + "step": 4780 + }, + { + "epoch": 0.35493442031999556, + "grad_norm": 0.5333178043365479, + "learning_rate": 8.147772062620715e-05, + "loss": 3.2374, + "step": 4800 + }, + { + "epoch": 0.35641331373799556, + "grad_norm": 0.41947266459465027, + "learning_rate": 8.127673663273388e-05, + "loss": 3.238, + "step": 4820 + }, + { + "epoch": 0.3578922071559955, + "grad_norm": 0.6376889944076538, + "learning_rate": 8.107491895990213e-05, + "loss": 3.2295, + "step": 4840 + }, + { + "epoch": 0.3593711005739955, + "grad_norm": 0.46790727972984314, + "learning_rate": 8.087227298714865e-05, + "loss": 3.2203, + "step": 4860 + }, + { + "epoch": 0.36084999399199547, + "grad_norm": 0.4850638508796692, + "learning_rate": 8.06688041159886e-05, + "loss": 3.2282, + "step": 4880 + }, + { + "epoch": 0.36232888740999547, + "grad_norm": 0.48408469557762146, + "learning_rate": 8.04645177698713e-05, + "loss": 3.2156, + "step": 4900 + }, + { + "epoch": 0.3638077808279955, + "grad_norm": 0.4044775068759918, + "learning_rate": 8.025941939403589e-05, + "loss": 3.2054, + "step": 4920 + }, + { + "epoch": 0.3652866742459954, + "grad_norm": 0.5881346464157104, + "learning_rate": 8.005351445536611e-05, + "loss": 3.2179, + "step": 4940 + }, + { + "epoch": 0.36676556766399543, + "grad_norm": 0.49967604875564575, + "learning_rate": 7.984680844224455e-05, + "loss": 3.2243, + "step": 4960 + }, + { + "epoch": 0.3682444610819954, + "grad_norm": 0.3812451958656311, + "learning_rate": 7.963930686440638e-05, + "loss": 3.2071, + "step": 4980 + }, + { + "epoch": 0.3697233544999954, + "grad_norm": 0.5718510150909424, + "learning_rate": 7.943101525279254e-05, + "loss": 3.2097, + "step": 5000 + }, + { + "epoch": 0.3712022479179954, + "grad_norm": 0.4486338198184967, + "learning_rate": 7.922193915940223e-05, + "loss": 3.2108, + "step": 5020 + }, + { + "epoch": 0.37268114133599534, + "grad_norm": 0.3966203033924103, + "learning_rate": 7.901208415714498e-05, + "loss": 3.2079, + "step": 5040 + }, + { + "epoch": 0.37416003475399534, + "grad_norm": 0.5968387722969055, + "learning_rate": 7.880145583969208e-05, + "loss": 3.2194, + "step": 5060 + }, + { + "epoch": 0.3756389281719953, + "grad_norm": 0.4266614019870758, + "learning_rate": 7.859005982132746e-05, + "loss": 3.2041, + "step": 5080 + }, + { + "epoch": 0.3771178215899953, + "grad_norm": 0.39778637886047363, + "learning_rate": 7.83779017367981e-05, + "loss": 3.1994, + "step": 5100 + }, + { + "epoch": 0.37859671500799524, + "grad_norm": 0.5236369967460632, + "learning_rate": 7.816498724116384e-05, + "loss": 3.1862, + "step": 5120 + }, + { + "epoch": 0.38007560842599525, + "grad_norm": 0.7279762625694275, + "learning_rate": 7.79513220096465e-05, + "loss": 3.1994, + "step": 5140 + }, + { + "epoch": 0.38155450184399525, + "grad_norm": 0.4763568639755249, + "learning_rate": 7.773691173747878e-05, + "loss": 3.1906, + "step": 5160 + }, + { + "epoch": 0.3830333952619952, + "grad_norm": 0.44299814105033875, + "learning_rate": 7.752176213975242e-05, + "loss": 3.1834, + "step": 5180 + }, + { + "epoch": 0.3845122886799952, + "grad_norm": 0.5032374262809753, + "learning_rate": 7.73058789512658e-05, + "loss": 3.195, + "step": 5200 + }, + { + "epoch": 0.38599118209799516, + "grad_norm": 0.4971736669540405, + "learning_rate": 7.708926792637109e-05, + "loss": 3.1912, + "step": 5220 + }, + { + "epoch": 0.38747007551599516, + "grad_norm": 0.3745681941509247, + "learning_rate": 7.687193483882094e-05, + "loss": 3.1822, + "step": 5240 + }, + { + "epoch": 0.38894896893399517, + "grad_norm": 0.45209985971450806, + "learning_rate": 7.665388548161449e-05, + "loss": 3.1747, + "step": 5260 + }, + { + "epoch": 0.3904278623519951, + "grad_norm": 0.45653989911079407, + "learning_rate": 7.643512566684302e-05, + "loss": 3.1586, + "step": 5280 + }, + { + "epoch": 0.3919067557699951, + "grad_norm": 0.5007410049438477, + "learning_rate": 7.621566122553503e-05, + "loss": 3.1777, + "step": 5300 + }, + { + "epoch": 0.39338564918799507, + "grad_norm": 0.39367878437042236, + "learning_rate": 7.599549800750075e-05, + "loss": 3.1713, + "step": 5320 + }, + { + "epoch": 0.3948645426059951, + "grad_norm": 0.41411903500556946, + "learning_rate": 7.577464188117629e-05, + "loss": 3.1743, + "step": 5340 + }, + { + "epoch": 0.396343436023995, + "grad_norm": 0.45292773842811584, + "learning_rate": 7.555309873346719e-05, + "loss": 3.1615, + "step": 5360 + }, + { + "epoch": 0.397822329441995, + "grad_norm": 0.8281717300415039, + "learning_rate": 7.533087446959146e-05, + "loss": 3.167, + "step": 5380 + }, + { + "epoch": 0.39930122285999503, + "grad_norm": 0.4002739489078522, + "learning_rate": 7.510797501292224e-05, + "loss": 3.1778, + "step": 5400 + }, + { + "epoch": 0.400780116277995, + "grad_norm": 0.4849472641944885, + "learning_rate": 7.488440630482993e-05, + "loss": 3.156, + "step": 5420 + }, + { + "epoch": 0.402259009695995, + "grad_norm": 0.5112612247467041, + "learning_rate": 7.466017430452372e-05, + "loss": 3.1722, + "step": 5440 + }, + { + "epoch": 0.40373790311399493, + "grad_norm": 0.7139009833335876, + "learning_rate": 7.443528498889282e-05, + "loss": 3.1638, + "step": 5460 + }, + { + "epoch": 0.40521679653199494, + "grad_norm": 0.508050262928009, + "learning_rate": 7.420974435234718e-05, + "loss": 3.178, + "step": 5480 + }, + { + "epoch": 0.40669568994999494, + "grad_norm": 0.42061784863471985, + "learning_rate": 7.398355840665762e-05, + "loss": 3.1644, + "step": 5500 + }, + { + "epoch": 0.4081745833679949, + "grad_norm": 0.4205974340438843, + "learning_rate": 7.375673318079566e-05, + "loss": 3.1405, + "step": 5520 + }, + { + "epoch": 0.4096534767859949, + "grad_norm": 0.37122201919555664, + "learning_rate": 7.352927472077278e-05, + "loss": 3.1446, + "step": 5540 + }, + { + "epoch": 0.41113237020399485, + "grad_norm": 0.42649346590042114, + "learning_rate": 7.330118908947927e-05, + "loss": 3.1553, + "step": 5560 + }, + { + "epoch": 0.41261126362199485, + "grad_norm": 0.4024769365787506, + "learning_rate": 7.307248236652264e-05, + "loss": 3.1468, + "step": 5580 + }, + { + "epoch": 0.4140901570399948, + "grad_norm": 0.44164013862609863, + "learning_rate": 7.284316064806555e-05, + "loss": 3.1431, + "step": 5600 + }, + { + "epoch": 0.4155690504579948, + "grad_norm": 0.43745094537734985, + "learning_rate": 7.261323004666332e-05, + "loss": 3.1566, + "step": 5620 + }, + { + "epoch": 0.4170479438759948, + "grad_norm": 0.5233656764030457, + "learning_rate": 7.238269669110104e-05, + "loss": 3.1387, + "step": 5640 + }, + { + "epoch": 0.41852683729399476, + "grad_norm": 0.5196412801742554, + "learning_rate": 7.215156672623011e-05, + "loss": 3.1359, + "step": 5660 + }, + { + "epoch": 0.42000573071199476, + "grad_norm": 0.46823379397392273, + "learning_rate": 7.191984631280457e-05, + "loss": 3.1274, + "step": 5680 + }, + { + "epoch": 0.4214846241299947, + "grad_norm": 0.4213380217552185, + "learning_rate": 7.168754162731682e-05, + "loss": 3.1261, + "step": 5700 + }, + { + "epoch": 0.4229635175479947, + "grad_norm": 0.48972517251968384, + "learning_rate": 7.145465886183291e-05, + "loss": 3.1367, + "step": 5720 + }, + { + "epoch": 0.4244424109659947, + "grad_norm": 0.4298087954521179, + "learning_rate": 7.122120422382771e-05, + "loss": 3.1342, + "step": 5740 + }, + { + "epoch": 0.42592130438399467, + "grad_norm": 0.6111768484115601, + "learning_rate": 7.098718393601922e-05, + "loss": 3.1323, + "step": 5760 + }, + { + "epoch": 0.4274001978019947, + "grad_norm": 0.4182634949684143, + "learning_rate": 7.075260423620284e-05, + "loss": 3.1206, + "step": 5780 + }, + { + "epoch": 0.4288790912199946, + "grad_norm": 0.4418911337852478, + "learning_rate": 7.051747137708503e-05, + "loss": 3.1252, + "step": 5800 + }, + { + "epoch": 0.43035798463799463, + "grad_norm": 0.4269157350063324, + "learning_rate": 7.028179162611668e-05, + "loss": 3.1291, + "step": 5820 + }, + { + "epoch": 0.4318368780559946, + "grad_norm": 0.38284796476364136, + "learning_rate": 7.004557126532608e-05, + "loss": 3.1272, + "step": 5840 + }, + { + "epoch": 0.4333157714739946, + "grad_norm": 0.42110738158226013, + "learning_rate": 6.98088165911514e-05, + "loss": 3.1277, + "step": 5860 + }, + { + "epoch": 0.4347946648919946, + "grad_norm": 0.45251357555389404, + "learning_rate": 6.957153391427293e-05, + "loss": 3.1258, + "step": 5880 + }, + { + "epoch": 0.43627355830999454, + "grad_norm": 0.5021226406097412, + "learning_rate": 6.933372955944478e-05, + "loss": 3.1132, + "step": 5900 + }, + { + "epoch": 0.43775245172799454, + "grad_norm": 0.5621367692947388, + "learning_rate": 6.909540986532644e-05, + "loss": 3.1223, + "step": 5920 + }, + { + "epoch": 0.4392313451459945, + "grad_norm": 0.48778969049453735, + "learning_rate": 6.885658118431367e-05, + "loss": 3.1239, + "step": 5940 + }, + { + "epoch": 0.4407102385639945, + "grad_norm": 0.4777956008911133, + "learning_rate": 6.861724988236926e-05, + "loss": 3.1096, + "step": 5960 + }, + { + "epoch": 0.4421891319819945, + "grad_norm": 0.5108891725540161, + "learning_rate": 6.83774223388533e-05, + "loss": 3.1172, + "step": 5980 + }, + { + "epoch": 0.44366802539999445, + "grad_norm": 0.42329996824264526, + "learning_rate": 6.813710494635325e-05, + "loss": 3.0999, + "step": 6000 + }, + { + "epoch": 0.44514691881799445, + "grad_norm": 0.538500964641571, + "learning_rate": 6.789630411051336e-05, + "loss": 3.1098, + "step": 6020 + }, + { + "epoch": 0.4466258122359944, + "grad_norm": 0.51045823097229, + "learning_rate": 6.765502624986409e-05, + "loss": 3.1021, + "step": 6040 + }, + { + "epoch": 0.4481047056539944, + "grad_norm": 0.46791911125183105, + "learning_rate": 6.741327779565096e-05, + "loss": 3.1031, + "step": 6060 + }, + { + "epoch": 0.44958359907199436, + "grad_norm": 0.4351001977920532, + "learning_rate": 6.71710651916631e-05, + "loss": 3.0976, + "step": 6080 + }, + { + "epoch": 0.45106249248999436, + "grad_norm": 0.3884891867637634, + "learning_rate": 6.692839489406155e-05, + "loss": 3.0977, + "step": 6100 + }, + { + "epoch": 0.45254138590799436, + "grad_norm": 0.44683268666267395, + "learning_rate": 6.668527337120717e-05, + "loss": 3.0915, + "step": 6120 + }, + { + "epoch": 0.4540202793259943, + "grad_norm": 0.36208999156951904, + "learning_rate": 6.644170710348813e-05, + "loss": 3.1036, + "step": 6140 + }, + { + "epoch": 0.4554991727439943, + "grad_norm": 0.6256937384605408, + "learning_rate": 6.619770258314729e-05, + "loss": 3.0841, + "step": 6160 + }, + { + "epoch": 0.45697806616199427, + "grad_norm": 0.44526803493499756, + "learning_rate": 6.595326631410911e-05, + "loss": 3.0801, + "step": 6180 + }, + { + "epoch": 0.45845695957999427, + "grad_norm": 0.37642255425453186, + "learning_rate": 6.570840481180624e-05, + "loss": 3.0923, + "step": 6200 + }, + { + "epoch": 0.4599358529979943, + "grad_norm": 0.4022856056690216, + "learning_rate": 6.546312460300595e-05, + "loss": 3.0865, + "step": 6220 + }, + { + "epoch": 0.4614147464159942, + "grad_norm": 0.41262638568878174, + "learning_rate": 6.521743222563608e-05, + "loss": 3.0895, + "step": 6240 + }, + { + "epoch": 0.46289363983399423, + "grad_norm": 0.6894219517707825, + "learning_rate": 6.49713342286108e-05, + "loss": 3.0882, + "step": 6260 + }, + { + "epoch": 0.4643725332519942, + "grad_norm": 0.4044055938720703, + "learning_rate": 6.4724837171656e-05, + "loss": 3.0811, + "step": 6280 + }, + { + "epoch": 0.4658514266699942, + "grad_norm": 0.5523516535758972, + "learning_rate": 6.447794762513456e-05, + "loss": 3.0687, + "step": 6300 + }, + { + "epoch": 0.46733032008799413, + "grad_norm": 0.6067591309547424, + "learning_rate": 6.42306721698711e-05, + "loss": 3.0651, + "step": 6320 + }, + { + "epoch": 0.46880921350599414, + "grad_norm": 0.48093098402023315, + "learning_rate": 6.398301739697661e-05, + "loss": 3.0862, + "step": 6340 + }, + { + "epoch": 0.47028810692399414, + "grad_norm": 0.516197144985199, + "learning_rate": 6.373498990767281e-05, + "loss": 3.0879, + "step": 6360 + }, + { + "epoch": 0.4717670003419941, + "grad_norm": 0.4190840721130371, + "learning_rate": 6.348659631311608e-05, + "loss": 3.0786, + "step": 6380 + }, + { + "epoch": 0.4732458937599941, + "grad_norm": 0.42481333017349243, + "learning_rate": 6.32378432342214e-05, + "loss": 3.0701, + "step": 6400 + }, + { + "epoch": 0.47472478717799405, + "grad_norm": 0.5522997379302979, + "learning_rate": 6.29887373014857e-05, + "loss": 3.0722, + "step": 6420 + }, + { + "epoch": 0.47620368059599405, + "grad_norm": 0.3823126554489136, + "learning_rate": 6.27392851548112e-05, + "loss": 3.0722, + "step": 6440 + }, + { + "epoch": 0.47768257401399405, + "grad_norm": 0.38790881633758545, + "learning_rate": 6.248949344332853e-05, + "loss": 3.0726, + "step": 6460 + }, + { + "epoch": 0.479161467431994, + "grad_norm": 0.503336489200592, + "learning_rate": 6.223936882521935e-05, + "loss": 3.0652, + "step": 6480 + }, + { + "epoch": 0.480640360849994, + "grad_norm": 0.5279501080513, + "learning_rate": 6.198891796753885e-05, + "loss": 3.0771, + "step": 6500 + }, + { + "epoch": 0.48211925426799396, + "grad_norm": 0.4080502986907959, + "learning_rate": 6.17381475460382e-05, + "loss": 3.064, + "step": 6520 + }, + { + "epoch": 0.48359814768599396, + "grad_norm": 0.45085135102272034, + "learning_rate": 6.148706424498649e-05, + "loss": 3.0594, + "step": 6540 + }, + { + "epoch": 0.4850770411039939, + "grad_norm": 0.42239508032798767, + "learning_rate": 6.123567475699261e-05, + "loss": 3.064, + "step": 6560 + }, + { + "epoch": 0.4865559345219939, + "grad_norm": 0.43709495663642883, + "learning_rate": 6.098398578282682e-05, + "loss": 3.0563, + "step": 6580 + }, + { + "epoch": 0.4880348279399939, + "grad_norm": 0.6891195178031921, + "learning_rate": 6.073200403124222e-05, + "loss": 3.0594, + "step": 6600 + }, + { + "epoch": 0.48951372135799387, + "grad_norm": 0.37419646978378296, + "learning_rate": 6.047973621879577e-05, + "loss": 3.0448, + "step": 6620 + }, + { + "epoch": 0.4909926147759939, + "grad_norm": 0.3710575997829437, + "learning_rate": 6.0227189069669464e-05, + "loss": 3.0518, + "step": 6640 + }, + { + "epoch": 0.4924715081939938, + "grad_norm": 0.7165172696113586, + "learning_rate": 5.997436931549096e-05, + "loss": 3.0589, + "step": 6660 + }, + { + "epoch": 0.49395040161199383, + "grad_norm": 0.48645517230033875, + "learning_rate": 5.972128369515415e-05, + "loss": 3.0507, + "step": 6680 + }, + { + "epoch": 0.49542929502999383, + "grad_norm": 0.3613664507865906, + "learning_rate": 5.9467938954639624e-05, + "loss": 3.05, + "step": 6700 + }, + { + "epoch": 0.4969081884479938, + "grad_norm": 0.44066616892814636, + "learning_rate": 5.921434184683479e-05, + "loss": 3.0452, + "step": 6720 + }, + { + "epoch": 0.4983870818659938, + "grad_norm": 0.4224984049797058, + "learning_rate": 5.896049913135386e-05, + "loss": 3.0474, + "step": 6740 + }, + { + "epoch": 0.49986597528399374, + "grad_norm": 0.4076259434223175, + "learning_rate": 5.870641757435775e-05, + "loss": 3.0424, + "step": 6760 + }, + { + "epoch": 0.5013448687019937, + "grad_norm": 0.6098340153694153, + "learning_rate": 5.845210394837366e-05, + "loss": 3.0581, + "step": 6780 + }, + { + "epoch": 0.5028237621199937, + "grad_norm": 1.0002901554107666, + "learning_rate": 5.8197565032114533e-05, + "loss": 3.0335, + "step": 6800 + }, + { + "epoch": 0.5043026555379937, + "grad_norm": 0.4866860508918762, + "learning_rate": 5.7942807610298456e-05, + "loss": 3.0329, + "step": 6820 + }, + { + "epoch": 0.5057815489559937, + "grad_norm": 0.4324921667575836, + "learning_rate": 5.768783847346779e-05, + "loss": 3.0366, + "step": 6840 + }, + { + "epoch": 0.5072604423739937, + "grad_norm": 0.40503060817718506, + "learning_rate": 5.743266441780808e-05, + "loss": 3.0461, + "step": 6860 + }, + { + "epoch": 0.5087393357919936, + "grad_norm": 0.38576483726501465, + "learning_rate": 5.717729224496703e-05, + "loss": 3.0238, + "step": 6880 + }, + { + "epoch": 0.5102182292099936, + "grad_norm": 0.4007696211338043, + "learning_rate": 5.6921728761873086e-05, + "loss": 3.0221, + "step": 6900 + }, + { + "epoch": 0.5116971226279936, + "grad_norm": 0.4254515469074249, + "learning_rate": 5.6665980780554096e-05, + "loss": 3.0421, + "step": 6920 + }, + { + "epoch": 0.5131760160459936, + "grad_norm": 0.42919921875, + "learning_rate": 5.6410055117955695e-05, + "loss": 3.0435, + "step": 6940 + }, + { + "epoch": 0.5146549094639936, + "grad_norm": 0.45048367977142334, + "learning_rate": 5.615395859575958e-05, + "loss": 3.0331, + "step": 6960 + }, + { + "epoch": 0.5161338028819935, + "grad_norm": 0.3860481381416321, + "learning_rate": 5.589769804020173e-05, + "loss": 3.0255, + "step": 6980 + }, + { + "epoch": 0.5176126962999935, + "grad_norm": 0.3789386749267578, + "learning_rate": 5.5641280281890394e-05, + "loss": 3.0364, + "step": 7000 + }, + { + "epoch": 0.5190915897179935, + "grad_norm": 0.3918616473674774, + "learning_rate": 5.538471215562406e-05, + "loss": 3.0288, + "step": 7020 + }, + { + "epoch": 0.5205704831359935, + "grad_norm": 0.5674075484275818, + "learning_rate": 5.5128000500209254e-05, + "loss": 3.034, + "step": 7040 + }, + { + "epoch": 0.5220493765539935, + "grad_norm": 0.38289138674736023, + "learning_rate": 5.48711521582783e-05, + "loss": 3.0228, + "step": 7060 + }, + { + "epoch": 0.5235282699719934, + "grad_norm": 0.5652275681495667, + "learning_rate": 5.461417397610682e-05, + "loss": 3.0148, + "step": 7080 + }, + { + "epoch": 0.5250071633899934, + "grad_norm": 0.39682313799858093, + "learning_rate": 5.4357072803431396e-05, + "loss": 3.0168, + "step": 7100 + }, + { + "epoch": 0.5264860568079934, + "grad_norm": 0.5409131646156311, + "learning_rate": 5.4099855493266896e-05, + "loss": 3.0071, + "step": 7120 + }, + { + "epoch": 0.5279649502259934, + "grad_norm": 0.465202659368515, + "learning_rate": 5.3842528901723786e-05, + "loss": 3.0236, + "step": 7140 + }, + { + "epoch": 0.5294438436439933, + "grad_norm": 0.4230177104473114, + "learning_rate": 5.358509988782543e-05, + "loss": 3.0209, + "step": 7160 + }, + { + "epoch": 0.5309227370619933, + "grad_norm": 0.3867465555667877, + "learning_rate": 5.332757531332529e-05, + "loss": 3.0212, + "step": 7180 + }, + { + "epoch": 0.5324016304799933, + "grad_norm": 0.57347172498703, + "learning_rate": 5.306996204252397e-05, + "loss": 3.0197, + "step": 7200 + }, + { + "epoch": 0.5338805238979933, + "grad_norm": 0.45516273379325867, + "learning_rate": 5.2812266942086256e-05, + "loss": 3.0118, + "step": 7220 + }, + { + "epoch": 0.5353594173159933, + "grad_norm": 0.45842480659484863, + "learning_rate": 5.2554496880858106e-05, + "loss": 3.0229, + "step": 7240 + }, + { + "epoch": 0.5368383107339932, + "grad_norm": 0.4081624448299408, + "learning_rate": 5.2296658729683555e-05, + "loss": 3.0109, + "step": 7260 + }, + { + "epoch": 0.5383172041519932, + "grad_norm": 0.36024734377861023, + "learning_rate": 5.203875936122158e-05, + "loss": 3.007, + "step": 7280 + }, + { + "epoch": 0.5397960975699932, + "grad_norm": 0.5755016803741455, + "learning_rate": 5.178080564976287e-05, + "loss": 3.0073, + "step": 7300 + }, + { + "epoch": 0.5412749909879933, + "grad_norm": 0.4267408847808838, + "learning_rate": 5.152280447104665e-05, + "loss": 3.0077, + "step": 7320 + }, + { + "epoch": 0.5427538844059933, + "grad_norm": 0.4339446723461151, + "learning_rate": 5.126476270207739e-05, + "loss": 2.9991, + "step": 7340 + }, + { + "epoch": 0.5442327778239932, + "grad_norm": 0.3711448907852173, + "learning_rate": 5.1006687220941455e-05, + "loss": 3.0091, + "step": 7360 + }, + { + "epoch": 0.5457116712419932, + "grad_norm": 0.4235258996486664, + "learning_rate": 5.074858490662384e-05, + "loss": 3.0015, + "step": 7380 + }, + { + "epoch": 0.5471905646599932, + "grad_norm": 0.3901888430118561, + "learning_rate": 5.0490462638824764e-05, + "loss": 2.9862, + "step": 7400 + }, + { + "epoch": 0.5486694580779932, + "grad_norm": 0.40519407391548157, + "learning_rate": 5.023232729777628e-05, + "loss": 3.0052, + "step": 7420 + }, + { + "epoch": 0.5501483514959932, + "grad_norm": 0.5243799686431885, + "learning_rate": 4.997418576405896e-05, + "loss": 3.0002, + "step": 7440 + }, + { + "epoch": 0.5516272449139931, + "grad_norm": 0.444050133228302, + "learning_rate": 4.9716044918418414e-05, + "loss": 3.0037, + "step": 7460 + }, + { + "epoch": 0.5531061383319931, + "grad_norm": 0.3496316075325012, + "learning_rate": 4.945791164158188e-05, + "loss": 3.0084, + "step": 7480 + }, + { + "epoch": 0.5545850317499931, + "grad_norm": 0.5127915740013123, + "learning_rate": 4.9199792814074896e-05, + "loss": 2.9986, + "step": 7500 + }, + { + "epoch": 0.5560639251679931, + "grad_norm": 0.4601123332977295, + "learning_rate": 4.8941695316037865e-05, + "loss": 3.0057, + "step": 7520 + }, + { + "epoch": 0.5575428185859931, + "grad_norm": 0.48755237460136414, + "learning_rate": 4.868362602704258e-05, + "loss": 2.9809, + "step": 7540 + }, + { + "epoch": 0.559021712003993, + "grad_norm": 0.3724111318588257, + "learning_rate": 4.842559182590899e-05, + "loss": 2.9975, + "step": 7560 + }, + { + "epoch": 0.560500605421993, + "grad_norm": 0.46181684732437134, + "learning_rate": 4.816759959052177e-05, + "loss": 2.9781, + "step": 7580 + }, + { + "epoch": 0.561979498839993, + "grad_norm": 0.39748480916023254, + "learning_rate": 4.790965619764698e-05, + "loss": 2.9965, + "step": 7600 + }, + { + "epoch": 0.563458392257993, + "grad_norm": 0.5718439221382141, + "learning_rate": 4.76517685227488e-05, + "loss": 2.9806, + "step": 7620 + }, + { + "epoch": 0.5649372856759929, + "grad_norm": 0.5939317941665649, + "learning_rate": 4.7393943439806264e-05, + "loss": 2.9801, + "step": 7640 + }, + { + "epoch": 0.5664161790939929, + "grad_norm": 0.4281553626060486, + "learning_rate": 4.713618782112997e-05, + "loss": 2.9829, + "step": 7660 + }, + { + "epoch": 0.5678950725119929, + "grad_norm": 0.37646615505218506, + "learning_rate": 4.6878508537179015e-05, + "loss": 2.9829, + "step": 7680 + }, + { + "epoch": 0.5693739659299929, + "grad_norm": 0.4106582701206207, + "learning_rate": 4.662091245637777e-05, + "loss": 2.9694, + "step": 7700 + }, + { + "epoch": 0.5708528593479929, + "grad_norm": 0.3310515582561493, + "learning_rate": 4.6363406444932814e-05, + "loss": 2.9799, + "step": 7720 + }, + { + "epoch": 0.5723317527659928, + "grad_norm": 0.36721667647361755, + "learning_rate": 4.610599736664996e-05, + "loss": 2.9794, + "step": 7740 + }, + { + "epoch": 0.5738106461839928, + "grad_norm": 0.45474308729171753, + "learning_rate": 4.5848692082751296e-05, + "loss": 2.9848, + "step": 7760 + }, + { + "epoch": 0.5752895396019928, + "grad_norm": 0.6072131991386414, + "learning_rate": 4.559149745169218e-05, + "loss": 2.972, + "step": 7780 + }, + { + "epoch": 0.5767684330199928, + "grad_norm": 0.486600786447525, + "learning_rate": 4.533442032897864e-05, + "loss": 2.9602, + "step": 7800 + }, + { + "epoch": 0.5782473264379928, + "grad_norm": 0.4024549126625061, + "learning_rate": 4.5077467566984474e-05, + "loss": 2.9852, + "step": 7820 + }, + { + "epoch": 0.5797262198559927, + "grad_norm": 0.3547488749027252, + "learning_rate": 4.4820646014768644e-05, + "loss": 2.9794, + "step": 7840 + }, + { + "epoch": 0.5812051132739927, + "grad_norm": 0.38729000091552734, + "learning_rate": 4.456396251789274e-05, + "loss": 2.9822, + "step": 7860 + }, + { + "epoch": 0.5826840066919927, + "grad_norm": 0.35460221767425537, + "learning_rate": 4.430742391823853e-05, + "loss": 2.9768, + "step": 7880 + }, + { + "epoch": 0.5841629001099927, + "grad_norm": 0.3545529544353485, + "learning_rate": 4.405103705382547e-05, + "loss": 2.9681, + "step": 7900 + }, + { + "epoch": 0.5856417935279927, + "grad_norm": 0.3542696237564087, + "learning_rate": 4.379480875862859e-05, + "loss": 2.9748, + "step": 7920 + }, + { + "epoch": 0.5871206869459926, + "grad_norm": 0.34213724732398987, + "learning_rate": 4.3538745862396275e-05, + "loss": 2.969, + "step": 7940 + }, + { + "epoch": 0.5885995803639926, + "grad_norm": 0.35730448365211487, + "learning_rate": 4.328285519046815e-05, + "loss": 2.9627, + "step": 7960 + }, + { + "epoch": 0.5900784737819926, + "grad_norm": 0.4420771598815918, + "learning_rate": 4.302714356359327e-05, + "loss": 2.9781, + "step": 7980 + }, + { + "epoch": 0.5915573671999926, + "grad_norm": 0.47289857268333435, + "learning_rate": 4.2771617797748256e-05, + "loss": 2.9637, + "step": 8000 + }, + { + "epoch": 0.5930362606179926, + "grad_norm": 0.4006676971912384, + "learning_rate": 4.251628470395556e-05, + "loss": 2.9721, + "step": 8020 + }, + { + "epoch": 0.5945151540359925, + "grad_norm": 0.39483192563056946, + "learning_rate": 4.226115108810201e-05, + "loss": 2.9607, + "step": 8040 + }, + { + "epoch": 0.5959940474539925, + "grad_norm": 0.49096304178237915, + "learning_rate": 4.20062237507574e-05, + "loss": 2.9567, + "step": 8060 + }, + { + "epoch": 0.5974729408719925, + "grad_norm": 0.373417466878891, + "learning_rate": 4.175150948699311e-05, + "loss": 2.965, + "step": 8080 + }, + { + "epoch": 0.5989518342899925, + "grad_norm": 0.33696213364601135, + "learning_rate": 4.149701508620109e-05, + "loss": 2.9636, + "step": 8100 + }, + { + "epoch": 0.6004307277079924, + "grad_norm": 0.5063782930374146, + "learning_rate": 4.124274733191291e-05, + "loss": 2.9737, + "step": 8120 + }, + { + "epoch": 0.6019096211259924, + "grad_norm": 0.39363813400268555, + "learning_rate": 4.098871300161878e-05, + "loss": 2.9516, + "step": 8140 + }, + { + "epoch": 0.6033885145439924, + "grad_norm": 0.3740212023258209, + "learning_rate": 4.07349188665871e-05, + "loss": 2.9472, + "step": 8160 + }, + { + "epoch": 0.6048674079619925, + "grad_norm": 0.42378878593444824, + "learning_rate": 4.048137169168385e-05, + "loss": 2.9684, + "step": 8180 + }, + { + "epoch": 0.6063463013799925, + "grad_norm": 0.4358353614807129, + "learning_rate": 4.02280782351923e-05, + "loss": 2.9643, + "step": 8200 + }, + { + "epoch": 0.6078251947979924, + "grad_norm": 0.35567548871040344, + "learning_rate": 3.997504524863291e-05, + "loss": 2.9435, + "step": 8220 + }, + { + "epoch": 0.6093040882159924, + "grad_norm": 0.3486579358577728, + "learning_rate": 3.972227947658325e-05, + "loss": 2.9605, + "step": 8240 + }, + { + "epoch": 0.6107829816339924, + "grad_norm": 0.42745381593704224, + "learning_rate": 3.946978765649838e-05, + "loss": 2.9481, + "step": 8260 + }, + { + "epoch": 0.6122618750519924, + "grad_norm": 0.4889651834964752, + "learning_rate": 3.921757651853117e-05, + "loss": 2.9492, + "step": 8280 + }, + { + "epoch": 0.6137407684699924, + "grad_norm": 0.44278714060783386, + "learning_rate": 3.896565278535291e-05, + "loss": 2.9578, + "step": 8300 + }, + { + "epoch": 0.6152196618879923, + "grad_norm": 0.42498791217803955, + "learning_rate": 3.8714023171974135e-05, + "loss": 2.9439, + "step": 8320 + }, + { + "epoch": 0.6166985553059923, + "grad_norm": 0.36626169085502625, + "learning_rate": 3.846269438556568e-05, + "loss": 2.9549, + "step": 8340 + }, + { + "epoch": 0.6181774487239923, + "grad_norm": 0.369567334651947, + "learning_rate": 3.8211673125279776e-05, + "loss": 2.947, + "step": 8360 + }, + { + "epoch": 0.6196563421419923, + "grad_norm": 0.43409767746925354, + "learning_rate": 3.7960966082071636e-05, + "loss": 2.9363, + "step": 8380 + }, + { + "epoch": 0.6211352355599923, + "grad_norm": 0.4202839434146881, + "learning_rate": 3.771057993852101e-05, + "loss": 2.9501, + "step": 8400 + }, + { + "epoch": 0.6226141289779922, + "grad_norm": 0.3709544241428375, + "learning_rate": 3.746052136865409e-05, + "loss": 2.9452, + "step": 8420 + }, + { + "epoch": 0.6240930223959922, + "grad_norm": 0.3776955008506775, + "learning_rate": 3.721079703776561e-05, + "loss": 2.9249, + "step": 8440 + }, + { + "epoch": 0.6255719158139922, + "grad_norm": 0.41565999388694763, + "learning_rate": 3.6961413602241215e-05, + "loss": 2.9304, + "step": 8460 + }, + { + "epoch": 0.6270508092319922, + "grad_norm": 0.3948330581188202, + "learning_rate": 3.6712377709379944e-05, + "loss": 2.9371, + "step": 8480 + }, + { + "epoch": 0.6285297026499922, + "grad_norm": 0.3861006498336792, + "learning_rate": 3.646369599721716e-05, + "loss": 2.9399, + "step": 8500 + }, + { + "epoch": 0.6300085960679921, + "grad_norm": 0.3641924560070038, + "learning_rate": 3.621537509434757e-05, + "loss": 2.9283, + "step": 8520 + }, + { + "epoch": 0.6314874894859921, + "grad_norm": 0.4140797555446625, + "learning_rate": 3.596742161974848e-05, + "loss": 2.9321, + "step": 8540 + }, + { + "epoch": 0.6329663829039921, + "grad_norm": 0.40179234743118286, + "learning_rate": 3.571984218260348e-05, + "loss": 2.9439, + "step": 8560 + }, + { + "epoch": 0.6344452763219921, + "grad_norm": 0.4169887602329254, + "learning_rate": 3.547264338212619e-05, + "loss": 2.9299, + "step": 8580 + }, + { + "epoch": 0.635924169739992, + "grad_norm": 0.4229363203048706, + "learning_rate": 3.522583180738436e-05, + "loss": 2.927, + "step": 8600 + }, + { + "epoch": 0.637403063157992, + "grad_norm": 0.33680644631385803, + "learning_rate": 3.497941403712429e-05, + "loss": 2.9373, + "step": 8620 + }, + { + "epoch": 0.638881956575992, + "grad_norm": 0.39601895213127136, + "learning_rate": 3.473339663959547e-05, + "loss": 2.9363, + "step": 8640 + }, + { + "epoch": 0.640360849993992, + "grad_norm": 0.356684148311615, + "learning_rate": 3.448778617237543e-05, + "loss": 2.9275, + "step": 8660 + }, + { + "epoch": 0.641839743411992, + "grad_norm": 0.37500935792922974, + "learning_rate": 3.424258918219503e-05, + "loss": 2.9224, + "step": 8680 + }, + { + "epoch": 0.6433186368299919, + "grad_norm": 0.3620283901691437, + "learning_rate": 3.399781220476394e-05, + "loss": 2.9294, + "step": 8700 + }, + { + "epoch": 0.6447975302479919, + "grad_norm": 0.3849022090435028, + "learning_rate": 3.3753461764596375e-05, + "loss": 2.9332, + "step": 8720 + }, + { + "epoch": 0.6462764236659919, + "grad_norm": 0.598598837852478, + "learning_rate": 3.350954437483725e-05, + "loss": 2.9268, + "step": 8740 + }, + { + "epoch": 0.6477553170839919, + "grad_norm": 0.42141565680503845, + "learning_rate": 3.326606653708857e-05, + "loss": 2.926, + "step": 8760 + }, + { + "epoch": 0.6492342105019919, + "grad_norm": 0.39355704188346863, + "learning_rate": 3.302303474123608e-05, + "loss": 2.9302, + "step": 8780 + }, + { + "epoch": 0.6507131039199918, + "grad_norm": 0.3644985258579254, + "learning_rate": 3.278045546527633e-05, + "loss": 2.9178, + "step": 8800 + }, + { + "epoch": 0.6521919973379918, + "grad_norm": 0.3427523672580719, + "learning_rate": 3.253833517514397e-05, + "loss": 2.9291, + "step": 8820 + }, + { + "epoch": 0.6536708907559918, + "grad_norm": 0.433736652135849, + "learning_rate": 3.22966803245394e-05, + "loss": 2.914, + "step": 8840 + }, + { + "epoch": 0.6551497841739918, + "grad_norm": 0.38325321674346924, + "learning_rate": 3.205549735475677e-05, + "loss": 2.9242, + "step": 8860 + }, + { + "epoch": 0.6566286775919918, + "grad_norm": 0.4170295000076294, + "learning_rate": 3.181479269451231e-05, + "loss": 2.9175, + "step": 8880 + }, + { + "epoch": 0.6581075710099917, + "grad_norm": 0.4253075420856476, + "learning_rate": 3.1574572759772885e-05, + "loss": 2.9211, + "step": 8900 + }, + { + "epoch": 0.6595864644279917, + "grad_norm": 0.38273829221725464, + "learning_rate": 3.133484395358507e-05, + "loss": 2.914, + "step": 8920 + }, + { + "epoch": 0.6610653578459917, + "grad_norm": 0.3915143609046936, + "learning_rate": 3.109561266590445e-05, + "loss": 2.9207, + "step": 8940 + }, + { + "epoch": 0.6625442512639917, + "grad_norm": 0.37426161766052246, + "learning_rate": 3.085688527342524e-05, + "loss": 2.927, + "step": 8960 + }, + { + "epoch": 0.6640231446819918, + "grad_norm": 0.34895965456962585, + "learning_rate": 3.06186681394104e-05, + "loss": 2.9157, + "step": 8980 + }, + { + "epoch": 0.6655020380999916, + "grad_norm": 0.3564130663871765, + "learning_rate": 3.038096761352199e-05, + "loss": 2.9178, + "step": 9000 + }, + { + "epoch": 0.6669809315179916, + "grad_norm": 0.3817369043827057, + "learning_rate": 3.0143790031651863e-05, + "loss": 2.9252, + "step": 9020 + }, + { + "epoch": 0.6684598249359917, + "grad_norm": 0.37359967827796936, + "learning_rate": 2.9907141715752906e-05, + "loss": 2.9134, + "step": 9040 + }, + { + "epoch": 0.6699387183539917, + "grad_norm": 0.3740251660346985, + "learning_rate": 2.9671028973670418e-05, + "loss": 2.9175, + "step": 9060 + }, + { + "epoch": 0.6714176117719917, + "grad_norm": 0.3896474242210388, + "learning_rate": 2.943545809897398e-05, + "loss": 2.9153, + "step": 9080 + }, + { + "epoch": 0.6728965051899916, + "grad_norm": 0.4986639618873596, + "learning_rate": 2.9200435370789792e-05, + "loss": 2.9215, + "step": 9100 + }, + { + "epoch": 0.6743753986079916, + "grad_norm": 0.3836432099342346, + "learning_rate": 2.8965967053633225e-05, + "loss": 2.9123, + "step": 9120 + }, + { + "epoch": 0.6758542920259916, + "grad_norm": 0.3539137840270996, + "learning_rate": 2.873205939724185e-05, + "loss": 2.9172, + "step": 9140 + }, + { + "epoch": 0.6773331854439916, + "grad_norm": 0.4474085569381714, + "learning_rate": 2.8498718636408862e-05, + "loss": 2.9126, + "step": 9160 + }, + { + "epoch": 0.6788120788619915, + "grad_norm": 0.3727508783340454, + "learning_rate": 2.8265950990816926e-05, + "loss": 2.9136, + "step": 9180 + }, + { + "epoch": 0.6802909722799915, + "grad_norm": 0.3365872800350189, + "learning_rate": 2.8033762664872293e-05, + "loss": 2.9074, + "step": 9200 + }, + { + "epoch": 0.6817698656979915, + "grad_norm": 0.3774373233318329, + "learning_rate": 2.7802159847539545e-05, + "loss": 2.9078, + "step": 9220 + }, + { + "epoch": 0.6832487591159915, + "grad_norm": 0.34899139404296875, + "learning_rate": 2.757114871217656e-05, + "loss": 2.9117, + "step": 9240 + }, + { + "epoch": 0.6847276525339915, + "grad_norm": 0.3489275276660919, + "learning_rate": 2.7340735416369934e-05, + "loss": 2.9, + "step": 9260 + }, + { + "epoch": 0.6862065459519914, + "grad_norm": 0.3772989511489868, + "learning_rate": 2.7110926101770927e-05, + "loss": 2.8968, + "step": 9280 + }, + { + "epoch": 0.6876854393699914, + "grad_norm": 0.3743598461151123, + "learning_rate": 2.688172689393172e-05, + "loss": 2.8978, + "step": 9300 + }, + { + "epoch": 0.6891643327879914, + "grad_norm": 0.3543947637081146, + "learning_rate": 2.665314390214212e-05, + "loss": 2.9029, + "step": 9320 + }, + { + "epoch": 0.6906432262059914, + "grad_norm": 0.3778015673160553, + "learning_rate": 2.6425183219266746e-05, + "loss": 2.8875, + "step": 9340 + }, + { + "epoch": 0.6921221196239914, + "grad_norm": 0.3994954824447632, + "learning_rate": 2.6197850921582633e-05, + "loss": 2.8988, + "step": 9360 + }, + { + "epoch": 0.6936010130419913, + "grad_norm": 0.4375861883163452, + "learning_rate": 2.5971153068617195e-05, + "loss": 2.8888, + "step": 9380 + }, + { + "epoch": 0.6950799064599913, + "grad_norm": 0.3965347111225128, + "learning_rate": 2.57450957029868e-05, + "loss": 2.896, + "step": 9400 + }, + { + "epoch": 0.6965587998779913, + "grad_norm": 0.3397294580936432, + "learning_rate": 2.5519684850235703e-05, + "loss": 2.8979, + "step": 9420 + }, + { + "epoch": 0.6980376932959913, + "grad_norm": 0.38435131311416626, + "learning_rate": 2.529492651867531e-05, + "loss": 2.8914, + "step": 9440 + }, + { + "epoch": 0.6995165867139913, + "grad_norm": 0.4583021402359009, + "learning_rate": 2.5070826699224202e-05, + "loss": 2.8994, + "step": 9460 + }, + { + "epoch": 0.7009954801319912, + "grad_norm": 0.35780495405197144, + "learning_rate": 2.4847391365248346e-05, + "loss": 2.904, + "step": 9480 + }, + { + "epoch": 0.7024743735499912, + "grad_norm": 0.48425179719924927, + "learning_rate": 2.4624626472401834e-05, + "loss": 2.8902, + "step": 9500 + }, + { + "epoch": 0.7039532669679912, + "grad_norm": 0.34029942750930786, + "learning_rate": 2.440253795846827e-05, + "loss": 2.8964, + "step": 9520 + }, + { + "epoch": 0.7054321603859912, + "grad_norm": 0.33855918049812317, + "learning_rate": 2.4181131743202377e-05, + "loss": 2.8917, + "step": 9540 + }, + { + "epoch": 0.7069110538039912, + "grad_norm": 0.3716065287590027, + "learning_rate": 2.3960413728172277e-05, + "loss": 2.9, + "step": 9560 + }, + { + "epoch": 0.7083899472219911, + "grad_norm": 0.3275023102760315, + "learning_rate": 2.374038979660214e-05, + "loss": 2.9032, + "step": 9580 + }, + { + "epoch": 0.7098688406399911, + "grad_norm": 0.3434765040874481, + "learning_rate": 2.352106581321542e-05, + "loss": 2.8992, + "step": 9600 + }, + { + "epoch": 0.7113477340579911, + "grad_norm": 0.3282793462276459, + "learning_rate": 2.3302447624078427e-05, + "loss": 2.8918, + "step": 9620 + }, + { + "epoch": 0.7128266274759911, + "grad_norm": 0.4167431890964508, + "learning_rate": 2.3084541056444654e-05, + "loss": 2.8844, + "step": 9640 + }, + { + "epoch": 0.714305520893991, + "grad_norm": 0.3788709342479706, + "learning_rate": 2.2867351918599333e-05, + "loss": 2.8737, + "step": 9660 + }, + { + "epoch": 0.715784414311991, + "grad_norm": 0.32435911893844604, + "learning_rate": 2.2650885999704628e-05, + "loss": 2.8946, + "step": 9680 + }, + { + "epoch": 0.717263307729991, + "grad_norm": 0.37471237778663635, + "learning_rate": 2.243514906964539e-05, + "loss": 2.8935, + "step": 9700 + }, + { + "epoch": 0.718742201147991, + "grad_norm": 0.3652307093143463, + "learning_rate": 2.222014687887532e-05, + "loss": 2.8767, + "step": 9720 + }, + { + "epoch": 0.720221094565991, + "grad_norm": 0.37537747621536255, + "learning_rate": 2.2005885158263645e-05, + "loss": 2.8802, + "step": 9740 + }, + { + "epoch": 0.7216999879839909, + "grad_norm": 0.40164393186569214, + "learning_rate": 2.1792369618942455e-05, + "loss": 2.881, + "step": 9760 + }, + { + "epoch": 0.7231788814019909, + "grad_norm": 0.35087114572525024, + "learning_rate": 2.1579605952154435e-05, + "loss": 2.8904, + "step": 9780 + }, + { + "epoch": 0.7246577748199909, + "grad_norm": 0.4332689046859741, + "learning_rate": 2.136759982910107e-05, + "loss": 2.8778, + "step": 9800 + }, + { + "epoch": 0.726136668237991, + "grad_norm": 0.34787076711654663, + "learning_rate": 2.1156356900791695e-05, + "loss": 2.8845, + "step": 9820 + }, + { + "epoch": 0.727615561655991, + "grad_norm": 0.37883126735687256, + "learning_rate": 2.0945882797892673e-05, + "loss": 2.8876, + "step": 9840 + }, + { + "epoch": 0.7290944550739908, + "grad_norm": 0.3691736161708832, + "learning_rate": 2.0736183130577335e-05, + "loss": 2.8887, + "step": 9860 + }, + { + "epoch": 0.7305733484919908, + "grad_norm": 0.31982922554016113, + "learning_rate": 2.0527263488376552e-05, + "loss": 2.8815, + "step": 9880 + }, + { + "epoch": 0.7320522419099909, + "grad_norm": 0.3566115200519562, + "learning_rate": 2.031912944002966e-05, + "loss": 2.8884, + "step": 9900 + }, + { + "epoch": 0.7335311353279909, + "grad_norm": 0.33468520641326904, + "learning_rate": 2.0111786533336e-05, + "loss": 2.8818, + "step": 9920 + }, + { + "epoch": 0.7350100287459909, + "grad_norm": 0.3208761513233185, + "learning_rate": 1.9905240295007145e-05, + "loss": 2.8803, + "step": 9940 + }, + { + "epoch": 0.7364889221639908, + "grad_norm": 0.34477704763412476, + "learning_rate": 1.9699496230519497e-05, + "loss": 2.8917, + "step": 9960 + }, + { + "epoch": 0.7379678155819908, + "grad_norm": 0.37035301327705383, + "learning_rate": 1.949455982396755e-05, + "loss": 2.8786, + "step": 9980 + }, + { + "epoch": 0.7394467089999908, + "grad_norm": 0.3365253210067749, + "learning_rate": 1.929043653791775e-05, + "loss": 2.8675, + "step": 10000 + }, + { + "epoch": 0.7409256024179908, + "grad_norm": 0.3333218991756439, + "learning_rate": 1.9087131813262886e-05, + "loss": 2.8687, + "step": 10020 + }, + { + "epoch": 0.7424044958359908, + "grad_norm": 0.3710993230342865, + "learning_rate": 1.8884651069076992e-05, + "loss": 2.8718, + "step": 10040 + }, + { + "epoch": 0.7438833892539907, + "grad_norm": 0.36842554807662964, + "learning_rate": 1.8682999702471014e-05, + "loss": 2.8631, + "step": 10060 + }, + { + "epoch": 0.7453622826719907, + "grad_norm": 0.35305920243263245, + "learning_rate": 1.8482183088448862e-05, + "loss": 2.8708, + "step": 10080 + }, + { + "epoch": 0.7468411760899907, + "grad_norm": 0.3375717103481293, + "learning_rate": 1.828220657976419e-05, + "loss": 2.8817, + "step": 10100 + }, + { + "epoch": 0.7483200695079907, + "grad_norm": 0.37821289896965027, + "learning_rate": 1.8083075506777676e-05, + "loss": 2.8787, + "step": 10120 + }, + { + "epoch": 0.7497989629259906, + "grad_norm": 0.3393423557281494, + "learning_rate": 1.7884795177314995e-05, + "loss": 2.8681, + "step": 10140 + }, + { + "epoch": 0.7512778563439906, + "grad_norm": 0.35140156745910645, + "learning_rate": 1.7687370876525273e-05, + "loss": 2.8742, + "step": 10160 + }, + { + "epoch": 0.7527567497619906, + "grad_norm": 0.3378312587738037, + "learning_rate": 1.7490807866740268e-05, + "loss": 2.8736, + "step": 10180 + }, + { + "epoch": 0.7542356431799906, + "grad_norm": 0.37517204880714417, + "learning_rate": 1.7295111387334103e-05, + "loss": 2.8623, + "step": 10200 + }, + { + "epoch": 0.7557145365979906, + "grad_norm": 0.3355712890625, + "learning_rate": 1.7100286654583543e-05, + "loss": 2.8721, + "step": 10220 + }, + { + "epoch": 0.7571934300159905, + "grad_norm": 0.3331904411315918, + "learning_rate": 1.690633886152903e-05, + "loss": 2.8701, + "step": 10240 + }, + { + "epoch": 0.7586723234339905, + "grad_norm": 0.34373047947883606, + "learning_rate": 1.6713273177836276e-05, + "loss": 2.8718, + "step": 10260 + }, + { + "epoch": 0.7601512168519905, + "grad_norm": 0.3202342987060547, + "learning_rate": 1.6521094749658328e-05, + "loss": 2.8658, + "step": 10280 + }, + { + "epoch": 0.7616301102699905, + "grad_norm": 0.33778509497642517, + "learning_rate": 1.6329808699498588e-05, + "loss": 2.8786, + "step": 10300 + }, + { + "epoch": 0.7631090036879905, + "grad_norm": 0.33873429894447327, + "learning_rate": 1.613942012607414e-05, + "loss": 2.8731, + "step": 10320 + }, + { + "epoch": 0.7645878971059904, + "grad_norm": 0.3424777090549469, + "learning_rate": 1.5949934104179887e-05, + "loss": 2.8715, + "step": 10340 + }, + { + "epoch": 0.7660667905239904, + "grad_norm": 0.33158713579177856, + "learning_rate": 1.5761355684553286e-05, + "loss": 2.8545, + "step": 10360 + }, + { + "epoch": 0.7675456839419904, + "grad_norm": 0.3395291566848755, + "learning_rate": 1.557368989373973e-05, + "loss": 2.8533, + "step": 10380 + }, + { + "epoch": 0.7690245773599904, + "grad_norm": 0.31933024525642395, + "learning_rate": 1.5386941733958503e-05, + "loss": 2.8651, + "step": 10400 + }, + { + "epoch": 0.7705034707779904, + "grad_norm": 0.3164694309234619, + "learning_rate": 1.5201116182969538e-05, + "loss": 2.8773, + "step": 10420 + }, + { + "epoch": 0.7719823641959903, + "grad_norm": 0.35544392466545105, + "learning_rate": 1.50162181939407e-05, + "loss": 2.859, + "step": 10440 + }, + { + "epoch": 0.7734612576139903, + "grad_norm": 0.3556651175022125, + "learning_rate": 1.4832252695315691e-05, + "loss": 2.8463, + "step": 10460 + }, + { + "epoch": 0.7749401510319903, + "grad_norm": 0.335028737783432, + "learning_rate": 1.4649224590682802e-05, + "loss": 2.8635, + "step": 10480 + }, + { + "epoch": 0.7764190444499903, + "grad_norm": 0.4239474833011627, + "learning_rate": 1.4467138758644139e-05, + "loss": 2.8493, + "step": 10500 + }, + { + "epoch": 0.7778979378679903, + "grad_norm": 0.3199774920940399, + "learning_rate": 1.4286000052685556e-05, + "loss": 2.8687, + "step": 10520 + }, + { + "epoch": 0.7793768312859902, + "grad_norm": 0.3779512643814087, + "learning_rate": 1.4105813301047366e-05, + "loss": 2.8518, + "step": 10540 + }, + { + "epoch": 0.7808557247039902, + "grad_norm": 0.3382132649421692, + "learning_rate": 1.3926583306595581e-05, + "loss": 2.8572, + "step": 10560 + }, + { + "epoch": 0.7823346181219902, + "grad_norm": 0.3185078203678131, + "learning_rate": 1.374831484669392e-05, + "loss": 2.8607, + "step": 10580 + }, + { + "epoch": 0.7838135115399902, + "grad_norm": 0.35780152678489685, + "learning_rate": 1.3571012673076472e-05, + "loss": 2.8564, + "step": 10600 + }, + { + "epoch": 0.7852924049579901, + "grad_norm": 0.3039771616458893, + "learning_rate": 1.3394681511721013e-05, + "loss": 2.8587, + "step": 10620 + }, + { + "epoch": 0.7867712983759901, + "grad_norm": 0.3119048774242401, + "learning_rate": 1.3219326062723042e-05, + "loss": 2.864, + "step": 10640 + }, + { + "epoch": 0.7882501917939901, + "grad_norm": 0.3685562312602997, + "learning_rate": 1.304495100017053e-05, + "loss": 2.8551, + "step": 10660 + }, + { + "epoch": 0.7897290852119901, + "grad_norm": 0.32328301668167114, + "learning_rate": 1.2871560972019314e-05, + "loss": 2.8537, + "step": 10680 + }, + { + "epoch": 0.7912079786299901, + "grad_norm": 0.32044264674186707, + "learning_rate": 1.2699160599969174e-05, + "loss": 2.8647, + "step": 10700 + }, + { + "epoch": 0.79268687204799, + "grad_norm": 0.39615657925605774, + "learning_rate": 1.2527754479340703e-05, + "loss": 2.8558, + "step": 10720 + }, + { + "epoch": 0.79416576546599, + "grad_norm": 0.31399622559547424, + "learning_rate": 1.2357347178952788e-05, + "loss": 2.8582, + "step": 10740 + }, + { + "epoch": 0.79564465888399, + "grad_norm": 0.33324578404426575, + "learning_rate": 1.2187943241000794e-05, + "loss": 2.8447, + "step": 10760 + }, + { + "epoch": 0.7971235523019901, + "grad_norm": 0.32412442564964294, + "learning_rate": 1.2019547180935552e-05, + "loss": 2.842, + "step": 10780 + }, + { + "epoch": 0.7986024457199901, + "grad_norm": 0.3198014795780182, + "learning_rate": 1.1852163487342981e-05, + "loss": 2.8594, + "step": 10800 + }, + { + "epoch": 0.80008133913799, + "grad_norm": 0.3332209289073944, + "learning_rate": 1.1685796621824423e-05, + "loss": 2.8542, + "step": 10820 + }, + { + "epoch": 0.80156023255599, + "grad_norm": 0.3251478374004364, + "learning_rate": 1.1520451018877742e-05, + "loss": 2.8623, + "step": 10840 + }, + { + "epoch": 0.80303912597399, + "grad_norm": 0.3332981765270233, + "learning_rate": 1.1356131085779131e-05, + "loss": 2.8566, + "step": 10860 + }, + { + "epoch": 0.80451801939199, + "grad_norm": 0.30493640899658203, + "learning_rate": 1.1192841202465565e-05, + "loss": 2.8596, + "step": 10880 + }, + { + "epoch": 0.80599691280999, + "grad_norm": 0.3335663974285126, + "learning_rate": 1.1030585721418174e-05, + "loss": 2.854, + "step": 10900 + }, + { + "epoch": 0.8074758062279899, + "grad_norm": 0.3442290127277374, + "learning_rate": 1.0869368967546134e-05, + "loss": 2.8471, + "step": 10920 + }, + { + "epoch": 0.8089546996459899, + "grad_norm": 0.3200606107711792, + "learning_rate": 1.0709195238071407e-05, + "loss": 2.8553, + "step": 10940 + }, + { + "epoch": 0.8104335930639899, + "grad_norm": 0.30462324619293213, + "learning_rate": 1.0550068802414231e-05, + "loss": 2.8487, + "step": 10960 + }, + { + "epoch": 0.8119124864819899, + "grad_norm": 0.3395856022834778, + "learning_rate": 1.0391993902079295e-05, + "loss": 2.8472, + "step": 10980 + }, + { + "epoch": 0.8133913798999899, + "grad_norm": 0.3614775836467743, + "learning_rate": 1.0234974750542647e-05, + "loss": 2.8427, + "step": 11000 + }, + { + "epoch": 0.8148702733179898, + "grad_norm": 0.3020230829715729, + "learning_rate": 1.0079015533139463e-05, + "loss": 2.8606, + "step": 11020 + }, + { + "epoch": 0.8163491667359898, + "grad_norm": 0.32456544041633606, + "learning_rate": 9.924120406952431e-06, + "loss": 2.8508, + "step": 11040 + }, + { + "epoch": 0.8178280601539898, + "grad_norm": 0.3214119076728821, + "learning_rate": 9.77029350070095e-06, + "loss": 2.8391, + "step": 11060 + }, + { + "epoch": 0.8193069535719898, + "grad_norm": 0.3201681077480316, + "learning_rate": 9.61753891463109e-06, + "loss": 2.8532, + "step": 11080 + }, + { + "epoch": 0.8207858469899897, + "grad_norm": 0.323337584733963, + "learning_rate": 9.465860720406327e-06, + "loss": 2.8499, + "step": 11100 + }, + { + "epoch": 0.8222647404079897, + "grad_norm": 0.31912675499916077, + "learning_rate": 9.315262960998911e-06, + "loss": 2.852, + "step": 11120 + }, + { + "epoch": 0.8237436338259897, + "grad_norm": 0.31801870465278625, + "learning_rate": 9.165749650582239e-06, + "loss": 2.8373, + "step": 11140 + }, + { + "epoch": 0.8252225272439897, + "grad_norm": 0.3083365559577942, + "learning_rate": 9.017324774423785e-06, + "loss": 2.8565, + "step": 11160 + }, + { + "epoch": 0.8267014206619897, + "grad_norm": 0.34097760915756226, + "learning_rate": 8.869992288778834e-06, + "loss": 2.8389, + "step": 11180 + }, + { + "epoch": 0.8281803140799896, + "grad_norm": 0.32595744729042053, + "learning_rate": 8.72375612078511e-06, + "loss": 2.8588, + "step": 11200 + }, + { + "epoch": 0.8296592074979896, + "grad_norm": 0.3241618275642395, + "learning_rate": 8.578620168358082e-06, + "loss": 2.8527, + "step": 11220 + }, + { + "epoch": 0.8311381009159896, + "grad_norm": 0.31303274631500244, + "learning_rate": 8.434588300086988e-06, + "loss": 2.8326, + "step": 11240 + }, + { + "epoch": 0.8326169943339896, + "grad_norm": 0.3417539596557617, + "learning_rate": 8.291664355131818e-06, + "loss": 2.8477, + "step": 11260 + }, + { + "epoch": 0.8340958877519896, + "grad_norm": 0.3075898289680481, + "learning_rate": 8.149852143120923e-06, + "loss": 2.8353, + "step": 11280 + }, + { + "epoch": 0.8355747811699895, + "grad_norm": 0.32699164748191833, + "learning_rate": 8.009155444049499e-06, + "loss": 2.8432, + "step": 11300 + }, + { + "epoch": 0.8370536745879895, + "grad_norm": 0.29232412576675415, + "learning_rate": 7.869578008178808e-06, + "loss": 2.8538, + "step": 11320 + }, + { + "epoch": 0.8385325680059895, + "grad_norm": 0.2949979901313782, + "learning_rate": 7.731123555936232e-06, + "loss": 2.8494, + "step": 11340 + }, + { + "epoch": 0.8400114614239895, + "grad_norm": 0.2993783950805664, + "learning_rate": 7.593795777816071e-06, + "loss": 2.8439, + "step": 11360 + }, + { + "epoch": 0.8414903548419895, + "grad_norm": 0.31987783312797546, + "learning_rate": 7.457598334281235e-06, + "loss": 2.8364, + "step": 11380 + }, + { + "epoch": 0.8429692482599894, + "grad_norm": 0.3066832721233368, + "learning_rate": 7.322534855665636e-06, + "loss": 2.8414, + "step": 11400 + }, + { + "epoch": 0.8444481416779894, + "grad_norm": 0.3674749433994293, + "learning_rate": 7.1886089420773965e-06, + "loss": 2.8346, + "step": 11420 + }, + { + "epoch": 0.8459270350959894, + "grad_norm": 0.3142234981060028, + "learning_rate": 7.055824163302943e-06, + "loss": 2.8478, + "step": 11440 + }, + { + "epoch": 0.8474059285139894, + "grad_norm": 0.30251550674438477, + "learning_rate": 6.924184058711836e-06, + "loss": 2.8447, + "step": 11460 + }, + { + "epoch": 0.8488848219319894, + "grad_norm": 0.35557475686073303, + "learning_rate": 6.7936921371623885e-06, + "loss": 2.8387, + "step": 11480 + }, + { + "epoch": 0.8503637153499893, + "grad_norm": 0.2999821901321411, + "learning_rate": 6.6643518769082036e-06, + "loss": 2.8484, + "step": 11500 + }, + { + "epoch": 0.8518426087679893, + "grad_norm": 0.29102715849876404, + "learning_rate": 6.536166725505405e-06, + "loss": 2.8418, + "step": 11520 + }, + { + "epoch": 0.8533215021859893, + "grad_norm": 0.3709971606731415, + "learning_rate": 6.4091400997207785e-06, + "loss": 2.8393, + "step": 11540 + }, + { + "epoch": 0.8548003956039893, + "grad_norm": 0.3058640658855438, + "learning_rate": 6.2832753854406846e-06, + "loss": 2.8428, + "step": 11560 + }, + { + "epoch": 0.8562792890219892, + "grad_norm": 0.2915048599243164, + "learning_rate": 6.158575937580818e-06, + "loss": 2.8446, + "step": 11580 + }, + { + "epoch": 0.8577581824399892, + "grad_norm": 0.31149548292160034, + "learning_rate": 6.035045079996743e-06, + "loss": 2.8438, + "step": 11600 + }, + { + "epoch": 0.8592370758579893, + "grad_norm": 0.2985529601573944, + "learning_rate": 5.9126861053953595e-06, + "loss": 2.8246, + "step": 11620 + }, + { + "epoch": 0.8607159692759893, + "grad_norm": 0.33099082112312317, + "learning_rate": 5.791502275247079e-06, + "loss": 2.8412, + "step": 11640 + }, + { + "epoch": 0.8621948626939893, + "grad_norm": 0.28865981101989746, + "learning_rate": 5.6714968196989295e-06, + "loss": 2.8299, + "step": 11660 + }, + { + "epoch": 0.8636737561119892, + "grad_norm": 0.34115445613861084, + "learning_rate": 5.5526729374884456e-06, + "loss": 2.8368, + "step": 11680 + }, + { + "epoch": 0.8651526495299892, + "grad_norm": 0.3019537925720215, + "learning_rate": 5.435033795858385e-06, + "loss": 2.8424, + "step": 11700 + }, + { + "epoch": 0.8666315429479892, + "grad_norm": 0.2919292449951172, + "learning_rate": 5.318582530472338e-06, + "loss": 2.8449, + "step": 11720 + }, + { + "epoch": 0.8681104363659892, + "grad_norm": 0.2975643575191498, + "learning_rate": 5.203322245331127e-06, + "loss": 2.8484, + "step": 11740 + }, + { + "epoch": 0.8695893297839892, + "grad_norm": 0.30803442001342773, + "learning_rate": 5.089256012690069e-06, + "loss": 2.839, + "step": 11760 + }, + { + "epoch": 0.8710682232019891, + "grad_norm": 0.3415025770664215, + "learning_rate": 4.976386872977107e-06, + "loss": 2.8406, + "step": 11780 + }, + { + "epoch": 0.8725471166199891, + "grad_norm": 0.3077727258205414, + "learning_rate": 4.864717834711735e-06, + "loss": 2.8262, + "step": 11800 + }, + { + "epoch": 0.8740260100379891, + "grad_norm": 0.3027855455875397, + "learning_rate": 4.75425187442482e-06, + "loss": 2.8394, + "step": 11820 + }, + { + "epoch": 0.8755049034559891, + "grad_norm": 0.3020201027393341, + "learning_rate": 4.644991936579268e-06, + "loss": 2.8397, + "step": 11840 + }, + { + "epoch": 0.8769837968739891, + "grad_norm": 0.2942678928375244, + "learning_rate": 4.536940933491552e-06, + "loss": 2.8506, + "step": 11860 + }, + { + "epoch": 0.878462690291989, + "grad_norm": 0.30446386337280273, + "learning_rate": 4.43010174525404e-06, + "loss": 2.8323, + "step": 11880 + }, + { + "epoch": 0.879941583709989, + "grad_norm": 0.2892758250236511, + "learning_rate": 4.324477219658274e-06, + "loss": 2.8268, + "step": 11900 + }, + { + "epoch": 0.881420477127989, + "grad_norm": 0.29356256127357483, + "learning_rate": 4.220070172119045e-06, + "loss": 2.8561, + "step": 11920 + }, + { + "epoch": 0.882899370545989, + "grad_norm": 0.2972046136856079, + "learning_rate": 4.116883385599335e-06, + "loss": 2.8459, + "step": 11940 + }, + { + "epoch": 0.884378263963989, + "grad_norm": 0.30883651971817017, + "learning_rate": 4.01491961053615e-06, + "loss": 2.8526, + "step": 11960 + }, + { + "epoch": 0.8858571573819889, + "grad_norm": 0.30948570370674133, + "learning_rate": 3.914181564767216e-06, + "loss": 2.8335, + "step": 11980 + }, + { + "epoch": 0.8873360507999889, + "grad_norm": 0.2896897494792938, + "learning_rate": 3.8146719334585246e-06, + "loss": 2.8353, + "step": 12000 + }, + { + "epoch": 0.8888149442179889, + "grad_norm": 0.29304638504981995, + "learning_rate": 3.7163933690327447e-06, + "loss": 2.8352, + "step": 12020 + }, + { + "epoch": 0.8902938376359889, + "grad_norm": 0.29079097509384155, + "learning_rate": 3.619348491098562e-06, + "loss": 2.8256, + "step": 12040 + }, + { + "epoch": 0.8917727310539888, + "grad_norm": 0.3122529089450836, + "learning_rate": 3.5235398863808055e-06, + "loss": 2.8211, + "step": 12060 + }, + { + "epoch": 0.8932516244719888, + "grad_norm": 0.2927321493625641, + "learning_rate": 3.4289701086515357e-06, + "loss": 2.8338, + "step": 12080 + }, + { + "epoch": 0.8947305178899888, + "grad_norm": 0.2869907319545746, + "learning_rate": 3.3356416786619716e-06, + "loss": 2.8313, + "step": 12100 + }, + { + "epoch": 0.8962094113079888, + "grad_norm": 0.27835631370544434, + "learning_rate": 3.2435570840752605e-06, + "loss": 2.8346, + "step": 12120 + }, + { + "epoch": 0.8976883047259888, + "grad_norm": 0.2780158817768097, + "learning_rate": 3.152718779400221e-06, + "loss": 2.8315, + "step": 12140 + }, + { + "epoch": 0.8991671981439887, + "grad_norm": 0.2955233156681061, + "learning_rate": 3.0631291859259114e-06, + "loss": 2.8241, + "step": 12160 + }, + { + "epoch": 0.9006460915619887, + "grad_norm": 0.29205450415611267, + "learning_rate": 2.9747906916570258e-06, + "loss": 2.8308, + "step": 12180 + }, + { + "epoch": 0.9021249849799887, + "grad_norm": 0.289033979177475, + "learning_rate": 2.8877056512503386e-06, + "loss": 2.8469, + "step": 12200 + }, + { + "epoch": 0.9036038783979887, + "grad_norm": 0.29402533173561096, + "learning_rate": 2.8018763859518736e-06, + "loss": 2.82, + "step": 12220 + }, + { + "epoch": 0.9050827718159887, + "grad_norm": 0.30112123489379883, + "learning_rate": 2.7173051835350517e-06, + "loss": 2.8269, + "step": 12240 + }, + { + "epoch": 0.9065616652339886, + "grad_norm": 0.2986692488193512, + "learning_rate": 2.6339942982397116e-06, + "loss": 2.8269, + "step": 12260 + }, + { + "epoch": 0.9080405586519886, + "grad_norm": 0.3106101453304291, + "learning_rate": 2.5519459507120313e-06, + "loss": 2.8415, + "step": 12280 + }, + { + "epoch": 0.9095194520699886, + "grad_norm": 0.2930283844470978, + "learning_rate": 2.471162327945303e-06, + "loss": 2.8353, + "step": 12300 + }, + { + "epoch": 0.9109983454879886, + "grad_norm": 0.28059104084968567, + "learning_rate": 2.3916455832216964e-06, + "loss": 2.8318, + "step": 12320 + }, + { + "epoch": 0.9124772389059886, + "grad_norm": 0.2927623987197876, + "learning_rate": 2.313397836054815e-06, + "loss": 2.841, + "step": 12340 + }, + { + "epoch": 0.9139561323239885, + "grad_norm": 0.28432729840278625, + "learning_rate": 2.2364211721331964e-06, + "loss": 2.8294, + "step": 12360 + }, + { + "epoch": 0.9154350257419885, + "grad_norm": 0.2854309678077698, + "learning_rate": 2.1607176432647703e-06, + "loss": 2.8389, + "step": 12380 + }, + { + "epoch": 0.9169139191599885, + "grad_norm": 0.2870195209980011, + "learning_rate": 2.0862892673221224e-06, + "loss": 2.8355, + "step": 12400 + }, + { + "epoch": 0.9183928125779885, + "grad_norm": 0.27523091435432434, + "learning_rate": 2.01313802818871e-06, + "loss": 2.8379, + "step": 12420 + }, + { + "epoch": 0.9198717059959886, + "grad_norm": 0.2815629839897156, + "learning_rate": 1.9412658757060053e-06, + "loss": 2.8279, + "step": 12440 + }, + { + "epoch": 0.9213505994139884, + "grad_norm": 0.28886112570762634, + "learning_rate": 1.870674725621513e-06, + "loss": 2.8242, + "step": 12460 + }, + { + "epoch": 0.9228294928319885, + "grad_norm": 0.2753719985485077, + "learning_rate": 1.80136645953769e-06, + "loss": 2.8234, + "step": 12480 + }, + { + "epoch": 0.9243083862499885, + "grad_norm": 0.2705097496509552, + "learning_rate": 1.7333429248618194e-06, + "loss": 2.8209, + "step": 12500 + }, + { + "epoch": 0.9257872796679885, + "grad_norm": 0.284212589263916, + "learning_rate": 1.6666059347567485e-06, + "loss": 2.838, + "step": 12520 + }, + { + "epoch": 0.9272661730859884, + "grad_norm": 0.28033483028411865, + "learning_rate": 1.6011572680925458e-06, + "loss": 2.827, + "step": 12540 + }, + { + "epoch": 0.9287450665039884, + "grad_norm": 0.27618134021759033, + "learning_rate": 1.5369986693991255e-06, + "loss": 2.8415, + "step": 12560 + }, + { + "epoch": 0.9302239599219884, + "grad_norm": 0.28289562463760376, + "learning_rate": 1.474131848819721e-06, + "loss": 2.834, + "step": 12580 + }, + { + "epoch": 0.9317028533399884, + "grad_norm": 0.2737962305545807, + "learning_rate": 1.4125584820652959e-06, + "loss": 2.8228, + "step": 12600 + }, + { + "epoch": 0.9331817467579884, + "grad_norm": 0.27976194024086, + "learning_rate": 1.352280210369894e-06, + "loss": 2.8387, + "step": 12620 + }, + { + "epoch": 0.9346606401759883, + "grad_norm": 0.27253544330596924, + "learning_rate": 1.2932986404468883e-06, + "loss": 2.8417, + "step": 12640 + }, + { + "epoch": 0.9361395335939883, + "grad_norm": 0.2787373661994934, + "learning_rate": 1.2356153444461393e-06, + "loss": 2.8295, + "step": 12660 + }, + { + "epoch": 0.9376184270119883, + "grad_norm": 0.27786681056022644, + "learning_rate": 1.1792318599121165e-06, + "loss": 2.8238, + "step": 12680 + }, + { + "epoch": 0.9390973204299883, + "grad_norm": 0.2707980275154114, + "learning_rate": 1.1241496897428872e-06, + "loss": 2.8216, + "step": 12700 + }, + { + "epoch": 0.9405762138479883, + "grad_norm": 0.2854357063770294, + "learning_rate": 1.0703703021500811e-06, + "loss": 2.8108, + "step": 12720 + }, + { + "epoch": 0.9420551072659882, + "grad_norm": 0.2822173833847046, + "learning_rate": 1.0178951306197337e-06, + "loss": 2.8093, + "step": 12740 + }, + { + "epoch": 0.9435340006839882, + "grad_norm": 0.29024040699005127, + "learning_rate": 9.667255738740943e-07, + "loss": 2.8258, + "step": 12760 + }, + { + "epoch": 0.9450128941019882, + "grad_norm": 0.2967122793197632, + "learning_rate": 9.168629958343334e-07, + "loss": 2.842, + "step": 12780 + }, + { + "epoch": 0.9464917875199882, + "grad_norm": 0.2722231149673462, + "learning_rate": 8.683087255841881e-07, + "loss": 2.8341, + "step": 12800 + }, + { + "epoch": 0.9479706809379882, + "grad_norm": 0.2952738106250763, + "learning_rate": 8.210640573345474e-07, + "loss": 2.8212, + "step": 12820 + }, + { + "epoch": 0.9494495743559881, + "grad_norm": 0.27017560601234436, + "learning_rate": 7.751302503889224e-07, + "loss": 2.8123, + "step": 12840 + }, + { + "epoch": 0.9509284677739881, + "grad_norm": 0.2811236083507538, + "learning_rate": 7.305085291099301e-07, + "loss": 2.8426, + "step": 12860 + }, + { + "epoch": 0.9524073611919881, + "grad_norm": 0.282913476228714, + "learning_rate": 6.872000828866131e-07, + "loss": 2.8348, + "step": 12880 + }, + { + "epoch": 0.9538862546099881, + "grad_norm": 0.2759126126766205, + "learning_rate": 6.452060661027548e-07, + "loss": 2.8301, + "step": 12900 + }, + { + "epoch": 0.9553651480279881, + "grad_norm": 0.2853533923625946, + "learning_rate": 6.045275981061138e-07, + "loss": 2.8415, + "step": 12920 + }, + { + "epoch": 0.956844041445988, + "grad_norm": 0.2731573283672333, + "learning_rate": 5.651657631785878e-07, + "loss": 2.826, + "step": 12940 + }, + { + "epoch": 0.958322934863988, + "grad_norm": 0.2759709060192108, + "learning_rate": 5.271216105072863e-07, + "loss": 2.8261, + "step": 12960 + }, + { + "epoch": 0.959801828281988, + "grad_norm": 0.2832717001438141, + "learning_rate": 4.903961541565971e-07, + "loss": 2.8332, + "step": 12980 + }, + { + "epoch": 0.961280721699988, + "grad_norm": 0.269037127494812, + "learning_rate": 4.5499037304115866e-07, + "loss": 2.8229, + "step": 13000 + }, + { + "epoch": 0.9627596151179879, + "grad_norm": 0.271410197019577, + "learning_rate": 4.2090521089972466e-07, + "loss": 2.8401, + "step": 13020 + }, + { + "epoch": 0.9642385085359879, + "grad_norm": 0.26483696699142456, + "learning_rate": 3.8814157627005685e-07, + "loss": 2.8376, + "step": 13040 + }, + { + "epoch": 0.9657174019539879, + "grad_norm": 0.2761934697628021, + "learning_rate": 3.567003424646831e-07, + "loss": 2.8374, + "step": 13060 + }, + { + "epoch": 0.9671962953719879, + "grad_norm": 0.27471932768821716, + "learning_rate": 3.265823475476215e-07, + "loss": 2.8358, + "step": 13080 + }, + { + "epoch": 0.9686751887899879, + "grad_norm": 0.27371978759765625, + "learning_rate": 2.97788394312043e-07, + "loss": 2.8289, + "step": 13100 + }, + { + "epoch": 0.9701540822079878, + "grad_norm": 0.2889103889465332, + "learning_rate": 2.7031925025888247e-07, + "loss": 2.8145, + "step": 13120 + }, + { + "epoch": 0.9716329756259878, + "grad_norm": 0.2687681317329407, + "learning_rate": 2.441756475763668e-07, + "loss": 2.818, + "step": 13140 + }, + { + "epoch": 0.9731118690439878, + "grad_norm": 0.2686457931995392, + "learning_rate": 2.1935828312050766e-07, + "loss": 2.8344, + "step": 13160 + }, + { + "epoch": 0.9745907624619878, + "grad_norm": 0.26769590377807617, + "learning_rate": 1.9586781839652235e-07, + "loss": 2.8236, + "step": 13180 + }, + { + "epoch": 0.9760696558799878, + "grad_norm": 0.27022501826286316, + "learning_rate": 1.737048795412033e-07, + "loss": 2.8307, + "step": 13200 + }, + { + "epoch": 0.9775485492979877, + "grad_norm": 0.2741018533706665, + "learning_rate": 1.5287005730623138e-07, + "loss": 2.8312, + "step": 13220 + }, + { + "epoch": 0.9790274427159877, + "grad_norm": 0.27768802642822266, + "learning_rate": 1.333639070424164e-07, + "loss": 2.8281, + "step": 13240 + }, + { + "epoch": 0.9805063361339877, + "grad_norm": 0.26736685633659363, + "learning_rate": 1.1518694868491442e-07, + "loss": 2.8342, + "step": 13260 + }, + { + "epoch": 0.9819852295519877, + "grad_norm": 0.26495057344436646, + "learning_rate": 9.833966673935546e-08, + "loss": 2.8236, + "step": 13280 + }, + { + "epoch": 0.9834641229699878, + "grad_norm": 0.27052661776542664, + "learning_rate": 8.282251026893728e-08, + "loss": 2.8214, + "step": 13300 + }, + { + "epoch": 0.9849430163879876, + "grad_norm": 0.2683194875717163, + "learning_rate": 6.863589288244043e-08, + "loss": 2.8468, + "step": 13320 + }, + { + "epoch": 0.9864219098059877, + "grad_norm": 0.27812352776527405, + "learning_rate": 5.5780192723214884e-08, + "loss": 2.8254, + "step": 13340 + }, + { + "epoch": 0.9879008032239877, + "grad_norm": 0.2842520773410797, + "learning_rate": 4.425575245911029e-08, + "loss": 2.8273, + "step": 13360 + }, + { + "epoch": 0.9893796966419877, + "grad_norm": 0.2864263355731964, + "learning_rate": 3.406287927332219e-08, + "loss": 2.8311, + "step": 13380 + }, + { + "epoch": 0.9908585900599877, + "grad_norm": 0.26490774750709534, + "learning_rate": 2.520184485620969e-08, + "loss": 2.8298, + "step": 13400 + }, + { + "epoch": 0.9923374834779876, + "grad_norm": 0.2666003406047821, + "learning_rate": 1.7672885398067883e-08, + "loss": 2.8303, + "step": 13420 + }, + { + "epoch": 0.9938163768959876, + "grad_norm": 0.27174392342567444, + "learning_rate": 1.147620158281626e-08, + "loss": 2.8177, + "step": 13440 + }, + { + "epoch": 0.9952952703139876, + "grad_norm": 0.2677934467792511, + "learning_rate": 6.6119585826529554e-09, + "loss": 2.8123, + "step": 13460 + }, + { + "epoch": 0.9967741637319876, + "grad_norm": 0.2655700445175171, + "learning_rate": 3.0802860536582876e-09, + "loss": 2.8268, + "step": 13480 + }, + { + "epoch": 0.9982530571499876, + "grad_norm": 0.2759760022163391, + "learning_rate": 8.812781323253027e-10, + "loss": 2.8247, + "step": 13500 + }, + { + "epoch": 0.9997319505679875, + "grad_norm": 0.2634597718715668, + "learning_rate": 1.4993433072874042e-11, + "loss": 2.831, + "step": 13520 + } + ], + "logging_steps": 20, + "max_steps": 13523, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.070897645108016e+19, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}