{ "best_metric": 0.2128431349992752, "best_model_checkpoint": "./output/checkpoint-4950", "epoch": 0.4058375010248422, "eval_steps": 150, "global_step": 4950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008198737394441257, "grad_norm": 11.523909568786621, "learning_rate": 7.500000000000001e-07, "loss": 0.39, "step": 10 }, { "epoch": 0.0016397474788882513, "grad_norm": 9.020567893981934, "learning_rate": 1.5000000000000002e-06, "loss": 0.3576, "step": 20 }, { "epoch": 0.002459621218332377, "grad_norm": 9.512846946716309, "learning_rate": 2.25e-06, "loss": 0.3874, "step": 30 }, { "epoch": 0.0032794949577765026, "grad_norm": 39.97313690185547, "learning_rate": 3.0000000000000005e-06, "loss": 0.3568, "step": 40 }, { "epoch": 0.004099368697220628, "grad_norm": 12.515055656433105, "learning_rate": 3.7500000000000005e-06, "loss": 0.3314, "step": 50 }, { "epoch": 0.004919242436664754, "grad_norm": 11.462284088134766, "learning_rate": 4.5e-06, "loss": 0.3641, "step": 60 }, { "epoch": 0.005739116176108879, "grad_norm": 18.380435943603516, "learning_rate": 5.2500000000000006e-06, "loss": 0.348, "step": 70 }, { "epoch": 0.006558989915553005, "grad_norm": 13.468473434448242, "learning_rate": 6.000000000000001e-06, "loss": 0.348, "step": 80 }, { "epoch": 0.007378863654997131, "grad_norm": 10.285468101501465, "learning_rate": 6.7500000000000014e-06, "loss": 0.3352, "step": 90 }, { "epoch": 0.008198737394441257, "grad_norm": 17.571596145629883, "learning_rate": 7.500000000000001e-06, "loss": 0.3438, "step": 100 }, { "epoch": 0.009018611133885381, "grad_norm": 19.84699249267578, "learning_rate": 7.499922926093874e-06, "loss": 0.3253, "step": 110 }, { "epoch": 0.009838484873329507, "grad_norm": 16.91347885131836, "learning_rate": 7.499691707543699e-06, "loss": 0.3328, "step": 120 }, { "epoch": 0.010658358612773634, "grad_norm": 11.190834999084473, "learning_rate": 7.499306353853963e-06, "loss": 0.3308, "step": 130 }, { "epoch": 0.011478232352217758, "grad_norm": 11.117925643920898, "learning_rate": 7.49876688086505e-06, "loss": 0.3401, "step": 140 }, { "epoch": 0.012298106091661884, "grad_norm": 12.28294563293457, "learning_rate": 7.4980733107525805e-06, "loss": 0.303, "step": 150 }, { "epoch": 0.012298106091661884, "eval_loss": 0.32195183634757996, "eval_runtime": 58.0333, "eval_samples_per_second": 8.616, "eval_steps_per_second": 8.616, "step": 150 }, { "epoch": 0.01311797983110601, "grad_norm": 12.885525703430176, "learning_rate": 7.4972256720265044e-06, "loss": 0.3595, "step": 160 }, { "epoch": 0.013937853570550135, "grad_norm": 12.438248634338379, "learning_rate": 7.496223999529932e-06, "loss": 0.3361, "step": 170 }, { "epoch": 0.014757727309994261, "grad_norm": 14.641826629638672, "learning_rate": 7.4950683344376926e-06, "loss": 0.3296, "step": 180 }, { "epoch": 0.015577601049438386, "grad_norm": 9.628592491149902, "learning_rate": 7.4937587242546544e-06, "loss": 0.3225, "step": 190 }, { "epoch": 0.016397474788882514, "grad_norm": 15.733799934387207, "learning_rate": 7.492295222813762e-06, "loss": 0.3284, "step": 200 }, { "epoch": 0.017217348528326636, "grad_norm": 12.937703132629395, "learning_rate": 7.490677890273828e-06, "loss": 0.3434, "step": 210 }, { "epoch": 0.018037222267770762, "grad_norm": 16.046674728393555, "learning_rate": 7.488906793117058e-06, "loss": 0.3519, "step": 220 }, { "epoch": 0.01885709600721489, "grad_norm": 11.472362518310547, "learning_rate": 7.486982004146319e-06, "loss": 0.3587, "step": 230 }, { "epoch": 0.019676969746659015, "grad_norm": 15.215801239013672, "learning_rate": 7.484903602482148e-06, "loss": 0.3197, "step": 240 }, { "epoch": 0.02049684348610314, "grad_norm": 11.658143997192383, "learning_rate": 7.4826716735594945e-06, "loss": 0.3114, "step": 250 }, { "epoch": 0.021316717225547267, "grad_norm": 7.448172092437744, "learning_rate": 7.480286309124216e-06, "loss": 0.2912, "step": 260 }, { "epoch": 0.02213659096499139, "grad_norm": 12.367362022399902, "learning_rate": 7.477747607229302e-06, "loss": 0.3167, "step": 270 }, { "epoch": 0.022956464704435516, "grad_norm": 13.513625144958496, "learning_rate": 7.475055672230844e-06, "loss": 0.3093, "step": 280 }, { "epoch": 0.023776338443879642, "grad_norm": 19.878536224365234, "learning_rate": 7.472210614783745e-06, "loss": 0.3256, "step": 290 }, { "epoch": 0.02459621218332377, "grad_norm": 22.84262466430664, "learning_rate": 7.469212551837173e-06, "loss": 0.3104, "step": 300 }, { "epoch": 0.02459621218332377, "eval_loss": 0.3093046247959137, "eval_runtime": 58.7245, "eval_samples_per_second": 8.514, "eval_steps_per_second": 8.514, "step": 300 }, { "epoch": 0.025416085922767895, "grad_norm": 9.043919563293457, "learning_rate": 7.4660616066297565e-06, "loss": 0.3089, "step": 310 }, { "epoch": 0.02623595966221202, "grad_norm": 6.400809288024902, "learning_rate": 7.462757908684509e-06, "loss": 0.2959, "step": 320 }, { "epoch": 0.027055833401656144, "grad_norm": 19.60870361328125, "learning_rate": 7.459301593803512e-06, "loss": 0.3251, "step": 330 }, { "epoch": 0.02787570714110027, "grad_norm": 8.441984176635742, "learning_rate": 7.455692804062335e-06, "loss": 0.3108, "step": 340 }, { "epoch": 0.028695580880544396, "grad_norm": 20.126216888427734, "learning_rate": 7.451931687804189e-06, "loss": 0.3152, "step": 350 }, { "epoch": 0.029515454619988522, "grad_norm": 11.44316291809082, "learning_rate": 7.448018399633831e-06, "loss": 0.3302, "step": 360 }, { "epoch": 0.03033532835943265, "grad_norm": 10.247148513793945, "learning_rate": 7.443953100411214e-06, "loss": 0.289, "step": 370 }, { "epoch": 0.03115520209887677, "grad_norm": 10.746755599975586, "learning_rate": 7.439735957244862e-06, "loss": 0.2886, "step": 380 }, { "epoch": 0.0319750758383209, "grad_norm": 19.19182014465332, "learning_rate": 7.435367143485015e-06, "loss": 0.325, "step": 390 }, { "epoch": 0.03279494957776503, "grad_norm": 12.273555755615234, "learning_rate": 7.430846838716496e-06, "loss": 0.3107, "step": 400 }, { "epoch": 0.03361482331720915, "grad_norm": 13.099973678588867, "learning_rate": 7.426175228751328e-06, "loss": 0.3103, "step": 410 }, { "epoch": 0.03443469705665327, "grad_norm": 20.098796844482422, "learning_rate": 7.421352505621099e-06, "loss": 0.284, "step": 420 }, { "epoch": 0.0352545707960974, "grad_norm": 10.289865493774414, "learning_rate": 7.416378867569069e-06, "loss": 0.3337, "step": 430 }, { "epoch": 0.036074444535541525, "grad_norm": 13.34965705871582, "learning_rate": 7.411254519042017e-06, "loss": 0.3085, "step": 440 }, { "epoch": 0.036894318274985655, "grad_norm": 11.321673393249512, "learning_rate": 7.4059796706818396e-06, "loss": 0.3043, "step": 450 }, { "epoch": 0.036894318274985655, "eval_loss": 0.2889861762523651, "eval_runtime": 56.9295, "eval_samples_per_second": 8.783, "eval_steps_per_second": 8.783, "step": 450 }, { "epoch": 0.03771419201442978, "grad_norm": 15.978049278259277, "learning_rate": 7.400554539316894e-06, "loss": 0.2942, "step": 460 }, { "epoch": 0.0385340657538739, "grad_norm": 16.420135498046875, "learning_rate": 7.394979347953081e-06, "loss": 0.3139, "step": 470 }, { "epoch": 0.03935393949331803, "grad_norm": 15.941482543945312, "learning_rate": 7.389254325764681e-06, "loss": 0.3018, "step": 480 }, { "epoch": 0.04017381323276215, "grad_norm": 9.359827041625977, "learning_rate": 7.383379708084934e-06, "loss": 0.3048, "step": 490 }, { "epoch": 0.04099368697220628, "grad_norm": 11.175127983093262, "learning_rate": 7.377355736396362e-06, "loss": 0.3001, "step": 500 }, { "epoch": 0.041813560711650405, "grad_norm": 18.719478607177734, "learning_rate": 7.371182658320847e-06, "loss": 0.3105, "step": 510 }, { "epoch": 0.042633434451094535, "grad_norm": 9.761693954467773, "learning_rate": 7.36486072760945e-06, "loss": 0.3024, "step": 520 }, { "epoch": 0.04345330819053866, "grad_norm": 15.880053520202637, "learning_rate": 7.358390204131984e-06, "loss": 0.3099, "step": 530 }, { "epoch": 0.04427318192998278, "grad_norm": 10.00100326538086, "learning_rate": 7.3517713538663235e-06, "loss": 0.3215, "step": 540 }, { "epoch": 0.04509305566942691, "grad_norm": 7.478984355926514, "learning_rate": 7.345004448887478e-06, "loss": 0.2974, "step": 550 }, { "epoch": 0.04591292940887103, "grad_norm": 9.254852294921875, "learning_rate": 7.3380897673564085e-06, "loss": 0.3126, "step": 560 }, { "epoch": 0.04673280314831516, "grad_norm": 13.706809997558594, "learning_rate": 7.33102759350859e-06, "loss": 0.3018, "step": 570 }, { "epoch": 0.047552676887759285, "grad_norm": 16.57872200012207, "learning_rate": 7.323818217642328e-06, "loss": 0.2904, "step": 580 }, { "epoch": 0.04837255062720341, "grad_norm": 14.819424629211426, "learning_rate": 7.316461936106827e-06, "loss": 0.2855, "step": 590 }, { "epoch": 0.04919242436664754, "grad_norm": 17.543973922729492, "learning_rate": 7.3089590512900084e-06, "loss": 0.3169, "step": 600 }, { "epoch": 0.04919242436664754, "eval_loss": 0.2996714413166046, "eval_runtime": 58.2745, "eval_samples_per_second": 8.58, "eval_steps_per_second": 8.58, "step": 600 }, { "epoch": 0.05001229810609166, "grad_norm": 10.767305374145508, "learning_rate": 7.301309871606081e-06, "loss": 0.3011, "step": 610 }, { "epoch": 0.05083217184553579, "grad_norm": 6.571865081787109, "learning_rate": 7.293514711482861e-06, "loss": 0.2783, "step": 620 }, { "epoch": 0.05165204558497991, "grad_norm": 12.295404434204102, "learning_rate": 7.285573891348849e-06, "loss": 0.2829, "step": 630 }, { "epoch": 0.05247191932442404, "grad_norm": 12.576509475708008, "learning_rate": 7.27748773762006e-06, "loss": 0.3021, "step": 640 }, { "epoch": 0.053291793063868165, "grad_norm": 7.258118629455566, "learning_rate": 7.269256582686603e-06, "loss": 0.3041, "step": 650 }, { "epoch": 0.05411166680331229, "grad_norm": 14.7495756149292, "learning_rate": 7.260880764899016e-06, "loss": 0.285, "step": 660 }, { "epoch": 0.05493154054275642, "grad_norm": 18.141632080078125, "learning_rate": 7.252360628554363e-06, "loss": 0.2916, "step": 670 }, { "epoch": 0.05575141428220054, "grad_norm": 18.141878128051758, "learning_rate": 7.243696523882079e-06, "loss": 0.3007, "step": 680 }, { "epoch": 0.05657128802164467, "grad_norm": 13.596381187438965, "learning_rate": 7.2348888070295705e-06, "loss": 0.2627, "step": 690 }, { "epoch": 0.05739116176108879, "grad_norm": 14.028800964355469, "learning_rate": 7.225937840047583e-06, "loss": 0.2959, "step": 700 }, { "epoch": 0.058211035500532915, "grad_norm": 19.28914451599121, "learning_rate": 7.216843990875307e-06, "loss": 0.3088, "step": 710 }, { "epoch": 0.059030909239977045, "grad_norm": 10.676041603088379, "learning_rate": 7.207607633325266e-06, "loss": 0.2762, "step": 720 }, { "epoch": 0.05985078297942117, "grad_norm": 9.311237335205078, "learning_rate": 7.198229147067941e-06, "loss": 0.313, "step": 730 }, { "epoch": 0.0606706567188653, "grad_norm": 12.335597038269043, "learning_rate": 7.18870891761617e-06, "loss": 0.2797, "step": 740 }, { "epoch": 0.06149053045830942, "grad_norm": 11.885544776916504, "learning_rate": 7.1790473363092974e-06, "loss": 0.2681, "step": 750 }, { "epoch": 0.06149053045830942, "eval_loss": 0.3024304211139679, "eval_runtime": 57.0493, "eval_samples_per_second": 8.764, "eval_steps_per_second": 8.764, "step": 750 }, { "epoch": 0.06231040419775354, "grad_norm": 12.44359016418457, "learning_rate": 7.169244800297089e-06, "loss": 0.311, "step": 760 }, { "epoch": 0.06313027793719767, "grad_norm": 18.710712432861328, "learning_rate": 7.159301712523407e-06, "loss": 0.2949, "step": 770 }, { "epoch": 0.0639501516766418, "grad_norm": 9.658717155456543, "learning_rate": 7.149218481709644e-06, "loss": 0.2852, "step": 780 }, { "epoch": 0.06477002541608592, "grad_norm": 10.276803970336914, "learning_rate": 7.1389955223379266e-06, "loss": 0.2818, "step": 790 }, { "epoch": 0.06558989915553005, "grad_norm": 13.862250328063965, "learning_rate": 7.128633254634072e-06, "loss": 0.2834, "step": 800 }, { "epoch": 0.06640977289497417, "grad_norm": 17.020177841186523, "learning_rate": 7.118132104550322e-06, "loss": 0.2677, "step": 810 }, { "epoch": 0.0672296466344183, "grad_norm": 18.547590255737305, "learning_rate": 7.107492503747826e-06, "loss": 0.2898, "step": 820 }, { "epoch": 0.06804952037386243, "grad_norm": 15.957967758178711, "learning_rate": 7.096714889578898e-06, "loss": 0.326, "step": 830 }, { "epoch": 0.06886939411330655, "grad_norm": 24.1992130279541, "learning_rate": 7.085799705069046e-06, "loss": 0.2677, "step": 840 }, { "epoch": 0.06968926785275067, "grad_norm": 12.799731254577637, "learning_rate": 7.0747473988987515e-06, "loss": 0.2806, "step": 850 }, { "epoch": 0.0705091415921948, "grad_norm": 18.750246047973633, "learning_rate": 7.063558425385033e-06, "loss": 0.2937, "step": 860 }, { "epoch": 0.07132901533163893, "grad_norm": 13.083860397338867, "learning_rate": 7.052233244462769e-06, "loss": 0.2957, "step": 870 }, { "epoch": 0.07214888907108305, "grad_norm": 11.227791786193848, "learning_rate": 7.040772321665788e-06, "loss": 0.2855, "step": 880 }, { "epoch": 0.07296876281052718, "grad_norm": 8.911324501037598, "learning_rate": 7.029176128107734e-06, "loss": 0.3105, "step": 890 }, { "epoch": 0.07378863654997131, "grad_norm": 17.020790100097656, "learning_rate": 7.017445140462711e-06, "loss": 0.2728, "step": 900 }, { "epoch": 0.07378863654997131, "eval_loss": 0.2869480550289154, "eval_runtime": 58.9095, "eval_samples_per_second": 8.488, "eval_steps_per_second": 8.488, "step": 900 }, { "epoch": 0.07460851028941543, "grad_norm": 14.960102081298828, "learning_rate": 7.00557984094567e-06, "loss": 0.2955, "step": 910 }, { "epoch": 0.07542838402885955, "grad_norm": 8.271307945251465, "learning_rate": 6.993580717292601e-06, "loss": 0.2666, "step": 920 }, { "epoch": 0.07624825776830368, "grad_norm": 8.779189109802246, "learning_rate": 6.981448262740483e-06, "loss": 0.2938, "step": 930 }, { "epoch": 0.0770681315077478, "grad_norm": 9.497313499450684, "learning_rate": 6.969182976006999e-06, "loss": 0.2875, "step": 940 }, { "epoch": 0.07788800524719193, "grad_norm": 13.439544677734375, "learning_rate": 6.95678536127005e-06, "loss": 0.2893, "step": 950 }, { "epoch": 0.07870787898663606, "grad_norm": 10.986952781677246, "learning_rate": 6.944255928147017e-06, "loss": 0.29, "step": 960 }, { "epoch": 0.07952775272608019, "grad_norm": 14.666671752929688, "learning_rate": 6.931595191673823e-06, "loss": 0.2798, "step": 970 }, { "epoch": 0.0803476264655243, "grad_norm": 9.045489311218262, "learning_rate": 6.9188036722837555e-06, "loss": 0.2526, "step": 980 }, { "epoch": 0.08116750020496843, "grad_norm": 12.083099365234375, "learning_rate": 6.905881895786076e-06, "loss": 0.2825, "step": 990 }, { "epoch": 0.08198737394441256, "grad_norm": 20.973670959472656, "learning_rate": 6.892830393344403e-06, "loss": 0.2703, "step": 1000 }, { "epoch": 0.08280724768385668, "grad_norm": 12.959758758544922, "learning_rate": 6.879649701454886e-06, "loss": 0.2766, "step": 1010 }, { "epoch": 0.08362712142330081, "grad_norm": 11.118098258972168, "learning_rate": 6.866340361924141e-06, "loss": 0.2927, "step": 1020 }, { "epoch": 0.08444699516274494, "grad_norm": 12.703455924987793, "learning_rate": 6.852902921846988e-06, "loss": 0.2468, "step": 1030 }, { "epoch": 0.08526686890218907, "grad_norm": 33.15513229370117, "learning_rate": 6.8393379335839565e-06, "loss": 0.2845, "step": 1040 }, { "epoch": 0.08608674264163318, "grad_norm": 12.013687133789062, "learning_rate": 6.825645954738586e-06, "loss": 0.2879, "step": 1050 }, { "epoch": 0.08608674264163318, "eval_loss": 0.2693183720111847, "eval_runtime": 56.9849, "eval_samples_per_second": 8.774, "eval_steps_per_second": 8.774, "step": 1050 }, { "epoch": 0.08690661638107731, "grad_norm": 10.128811836242676, "learning_rate": 6.811827548134495e-06, "loss": 0.2873, "step": 1060 }, { "epoch": 0.08772649012052144, "grad_norm": 10.001947402954102, "learning_rate": 6.797883281792261e-06, "loss": 0.2931, "step": 1070 }, { "epoch": 0.08854636385996556, "grad_norm": 13.15841293334961, "learning_rate": 6.783813728906054e-06, "loss": 0.3, "step": 1080 }, { "epoch": 0.08936623759940969, "grad_norm": 8.157013893127441, "learning_rate": 6.769619467820086e-06, "loss": 0.2692, "step": 1090 }, { "epoch": 0.09018611133885382, "grad_norm": 8.676292419433594, "learning_rate": 6.755301082004838e-06, "loss": 0.3111, "step": 1100 }, { "epoch": 0.09100598507829795, "grad_norm": 14.835556030273438, "learning_rate": 6.740859160033068e-06, "loss": 0.2932, "step": 1110 }, { "epoch": 0.09182585881774206, "grad_norm": 14.752832412719727, "learning_rate": 6.726294295555623e-06, "loss": 0.2942, "step": 1120 }, { "epoch": 0.0926457325571862, "grad_norm": 9.42294979095459, "learning_rate": 6.711607087277034e-06, "loss": 0.2807, "step": 1130 }, { "epoch": 0.09346560629663032, "grad_norm": 6.576030731201172, "learning_rate": 6.69679813893091e-06, "loss": 0.2656, "step": 1140 }, { "epoch": 0.09428548003607444, "grad_norm": 14.54617977142334, "learning_rate": 6.681868059255113e-06, "loss": 0.2708, "step": 1150 }, { "epoch": 0.09510535377551857, "grad_norm": 19.004695892333984, "learning_rate": 6.666817461966741e-06, "loss": 0.2974, "step": 1160 }, { "epoch": 0.0959252275149627, "grad_norm": 13.359691619873047, "learning_rate": 6.651646965736902e-06, "loss": 0.2641, "step": 1170 }, { "epoch": 0.09674510125440682, "grad_norm": 9.031187057495117, "learning_rate": 6.636357194165274e-06, "loss": 0.2794, "step": 1180 }, { "epoch": 0.09756497499385094, "grad_norm": 11.242755889892578, "learning_rate": 6.620948775754481e-06, "loss": 0.2708, "step": 1190 }, { "epoch": 0.09838484873329507, "grad_norm": 9.727982521057129, "learning_rate": 6.605422343884255e-06, "loss": 0.2936, "step": 1200 }, { "epoch": 0.09838484873329507, "eval_loss": 0.2741548418998718, "eval_runtime": 56.2393, "eval_samples_per_second": 8.891, "eval_steps_per_second": 8.891, "step": 1200 }, { "epoch": 0.0992047224727392, "grad_norm": 11.938862800598145, "learning_rate": 6.589778536785396e-06, "loss": 0.2776, "step": 1210 }, { "epoch": 0.10002459621218332, "grad_norm": 9.253863334655762, "learning_rate": 6.5740179975135426e-06, "loss": 0.2695, "step": 1220 }, { "epoch": 0.10084446995162745, "grad_norm": 13.18783950805664, "learning_rate": 6.5581413739227314e-06, "loss": 0.2863, "step": 1230 }, { "epoch": 0.10166434369107158, "grad_norm": 10.108220100402832, "learning_rate": 6.542149318638777e-06, "loss": 0.2831, "step": 1240 }, { "epoch": 0.1024842174305157, "grad_norm": 13.539487838745117, "learning_rate": 6.526042489032434e-06, "loss": 0.2626, "step": 1250 }, { "epoch": 0.10330409116995982, "grad_norm": 9.928237915039062, "learning_rate": 6.509821547192383e-06, "loss": 0.2706, "step": 1260 }, { "epoch": 0.10412396490940395, "grad_norm": 10.978721618652344, "learning_rate": 6.493487159898006e-06, "loss": 0.2695, "step": 1270 }, { "epoch": 0.10494383864884808, "grad_norm": 9.98459243774414, "learning_rate": 6.477039998591991e-06, "loss": 0.2801, "step": 1280 }, { "epoch": 0.1057637123882922, "grad_norm": 12.930992126464844, "learning_rate": 6.460480739352719e-06, "loss": 0.2842, "step": 1290 }, { "epoch": 0.10658358612773633, "grad_norm": 12.851746559143066, "learning_rate": 6.4438100628664795e-06, "loss": 0.2635, "step": 1300 }, { "epoch": 0.10740345986718046, "grad_norm": 10.791857719421387, "learning_rate": 6.4270286543994874e-06, "loss": 0.2947, "step": 1310 }, { "epoch": 0.10822333360662457, "grad_norm": 9.770176887512207, "learning_rate": 6.410137203769718e-06, "loss": 0.2606, "step": 1320 }, { "epoch": 0.1090432073460687, "grad_norm": 17.897979736328125, "learning_rate": 6.393136405318545e-06, "loss": 0.2868, "step": 1330 }, { "epoch": 0.10986308108551283, "grad_norm": 19.892559051513672, "learning_rate": 6.376026957882207e-06, "loss": 0.2605, "step": 1340 }, { "epoch": 0.11068295482495695, "grad_norm": 9.193521499633789, "learning_rate": 6.3588095647630754e-06, "loss": 0.2454, "step": 1350 }, { "epoch": 0.11068295482495695, "eval_loss": 0.2674501836299896, "eval_runtime": 56.3954, "eval_samples_per_second": 8.866, "eval_steps_per_second": 8.866, "step": 1350 }, { "epoch": 0.11150282856440108, "grad_norm": 15.698138236999512, "learning_rate": 6.341484933700744e-06, "loss": 0.2639, "step": 1360 }, { "epoch": 0.11232270230384521, "grad_norm": 11.653697967529297, "learning_rate": 6.32405377684294e-06, "loss": 0.2711, "step": 1370 }, { "epoch": 0.11314257604328934, "grad_norm": 10.41117000579834, "learning_rate": 6.306516810716249e-06, "loss": 0.274, "step": 1380 }, { "epoch": 0.11396244978273345, "grad_norm": 17.14838981628418, "learning_rate": 6.288874756196662e-06, "loss": 0.2919, "step": 1390 }, { "epoch": 0.11478232352217758, "grad_norm": 12.094561576843262, "learning_rate": 6.271128338479939e-06, "loss": 0.272, "step": 1400 }, { "epoch": 0.11560219726162171, "grad_norm": 7.186673641204834, "learning_rate": 6.253278287051806e-06, "loss": 0.2614, "step": 1410 }, { "epoch": 0.11642207100106583, "grad_norm": 27.63665008544922, "learning_rate": 6.235325335657962e-06, "loss": 0.2581, "step": 1420 }, { "epoch": 0.11724194474050996, "grad_norm": 9.12143611907959, "learning_rate": 6.217270222273923e-06, "loss": 0.2497, "step": 1430 }, { "epoch": 0.11806181847995409, "grad_norm": 10.814976692199707, "learning_rate": 6.1991136890746825e-06, "loss": 0.2659, "step": 1440 }, { "epoch": 0.11888169221939822, "grad_norm": 13.897311210632324, "learning_rate": 6.180856482404208e-06, "loss": 0.2575, "step": 1450 }, { "epoch": 0.11970156595884233, "grad_norm": 14.34624195098877, "learning_rate": 6.162499352744754e-06, "loss": 0.276, "step": 1460 }, { "epoch": 0.12052143969828646, "grad_norm": 15.839101791381836, "learning_rate": 6.144043054686022e-06, "loss": 0.267, "step": 1470 }, { "epoch": 0.1213413134377306, "grad_norm": 13.110719680786133, "learning_rate": 6.125488346894139e-06, "loss": 0.2777, "step": 1480 }, { "epoch": 0.12216118717717471, "grad_norm": 11.638336181640625, "learning_rate": 6.106835992080464e-06, "loss": 0.2454, "step": 1490 }, { "epoch": 0.12298106091661884, "grad_norm": 12.756601333618164, "learning_rate": 6.088086756970252e-06, "loss": 0.2605, "step": 1500 }, { "epoch": 0.12298106091661884, "eval_loss": 0.2679287791252136, "eval_runtime": 56.0794, "eval_samples_per_second": 8.916, "eval_steps_per_second": 8.916, "step": 1500 }, { "epoch": 0.12380093465606297, "grad_norm": 20.72138214111328, "learning_rate": 6.0692414122711184e-06, "loss": 0.2593, "step": 1510 }, { "epoch": 0.12462080839550708, "grad_norm": 9.595439910888672, "learning_rate": 6.050300732641376e-06, "loss": 0.2719, "step": 1520 }, { "epoch": 0.12544068213495121, "grad_norm": 16.999011993408203, "learning_rate": 6.0312654966581755e-06, "loss": 0.2885, "step": 1530 }, { "epoch": 0.12626055587439533, "grad_norm": 14.768747329711914, "learning_rate": 6.012136486785512e-06, "loss": 0.2702, "step": 1540 }, { "epoch": 0.12708042961383947, "grad_norm": 8.815911293029785, "learning_rate": 5.992914489342061e-06, "loss": 0.2507, "step": 1550 }, { "epoch": 0.1279003033532836, "grad_norm": 20.083023071289062, "learning_rate": 5.9736002944688474e-06, "loss": 0.2632, "step": 1560 }, { "epoch": 0.12872017709272773, "grad_norm": 17.51641082763672, "learning_rate": 5.954194696096775e-06, "loss": 0.2937, "step": 1570 }, { "epoch": 0.12954005083217185, "grad_norm": 9.186761856079102, "learning_rate": 5.9346984919139865e-06, "loss": 0.2611, "step": 1580 }, { "epoch": 0.13035992457161596, "grad_norm": 13.085734367370605, "learning_rate": 5.9151124833330745e-06, "loss": 0.2507, "step": 1590 }, { "epoch": 0.1311797983110601, "grad_norm": 13.729114532470703, "learning_rate": 5.895437475458137e-06, "loss": 0.2774, "step": 1600 }, { "epoch": 0.13199967205050422, "grad_norm": 19.03725242614746, "learning_rate": 5.875674277051688e-06, "loss": 0.2687, "step": 1610 }, { "epoch": 0.13281954578994834, "grad_norm": 15.545515060424805, "learning_rate": 5.855823700501406e-06, "loss": 0.2765, "step": 1620 }, { "epoch": 0.13363941952939248, "grad_norm": 11.668421745300293, "learning_rate": 5.835886561786744e-06, "loss": 0.2682, "step": 1630 }, { "epoch": 0.1344592932688366, "grad_norm": 8.778451919555664, "learning_rate": 5.815863680445385e-06, "loss": 0.2347, "step": 1640 }, { "epoch": 0.13527916700828072, "grad_norm": 5.889225959777832, "learning_rate": 5.795755879539558e-06, "loss": 0.2709, "step": 1650 }, { "epoch": 0.13527916700828072, "eval_loss": 0.25923365354537964, "eval_runtime": 56.2341, "eval_samples_per_second": 8.891, "eval_steps_per_second": 8.891, "step": 1650 }, { "epoch": 0.13609904074772486, "grad_norm": 12.518867492675781, "learning_rate": 5.775563985622202e-06, "loss": 0.2833, "step": 1660 }, { "epoch": 0.13691891448716897, "grad_norm": 14.924880027770996, "learning_rate": 5.755288828702987e-06, "loss": 0.2863, "step": 1670 }, { "epoch": 0.1377387882266131, "grad_norm": 16.47811508178711, "learning_rate": 5.734931242214204e-06, "loss": 0.2596, "step": 1680 }, { "epoch": 0.13855866196605723, "grad_norm": 13.941671371459961, "learning_rate": 5.7144920629764955e-06, "loss": 0.2819, "step": 1690 }, { "epoch": 0.13937853570550135, "grad_norm": 16.261932373046875, "learning_rate": 5.693972131164471e-06, "loss": 0.303, "step": 1700 }, { "epoch": 0.14019840944494547, "grad_norm": 12.289247512817383, "learning_rate": 5.673372290272149e-06, "loss": 0.2855, "step": 1710 }, { "epoch": 0.1410182831843896, "grad_norm": 8.7142915725708, "learning_rate": 5.652693387078309e-06, "loss": 0.2615, "step": 1720 }, { "epoch": 0.14183815692383372, "grad_norm": 16.864688873291016, "learning_rate": 5.631936271611667e-06, "loss": 0.2813, "step": 1730 }, { "epoch": 0.14265803066327787, "grad_norm": 16.40870475769043, "learning_rate": 5.611101797115939e-06, "loss": 0.275, "step": 1740 }, { "epoch": 0.14347790440272198, "grad_norm": 14.436688423156738, "learning_rate": 5.5901908200147685e-06, "loss": 0.2788, "step": 1750 }, { "epoch": 0.1442977781421661, "grad_norm": 11.943658828735352, "learning_rate": 5.56920419987652e-06, "loss": 0.2805, "step": 1760 }, { "epoch": 0.14511765188161024, "grad_norm": 14.252999305725098, "learning_rate": 5.5481427993789534e-06, "loss": 0.2806, "step": 1770 }, { "epoch": 0.14593752562105436, "grad_norm": 11.182486534118652, "learning_rate": 5.527007484273746e-06, "loss": 0.2675, "step": 1780 }, { "epoch": 0.14675739936049848, "grad_norm": 12.846651077270508, "learning_rate": 5.5057991233509225e-06, "loss": 0.2744, "step": 1790 }, { "epoch": 0.14757727309994262, "grad_norm": 9.701010704040527, "learning_rate": 5.484518588403134e-06, "loss": 0.2808, "step": 1800 }, { "epoch": 0.14757727309994262, "eval_loss": 0.2612378001213074, "eval_runtime": 57.022, "eval_samples_per_second": 8.769, "eval_steps_per_second": 8.769, "step": 1800 }, { "epoch": 0.14839714683938673, "grad_norm": 7.793675422668457, "learning_rate": 5.463166754189819e-06, "loss": 0.27, "step": 1810 }, { "epoch": 0.14921702057883085, "grad_norm": 13.162193298339844, "learning_rate": 5.441744498401255e-06, "loss": 0.2574, "step": 1820 }, { "epoch": 0.150036894318275, "grad_norm": 15.428301811218262, "learning_rate": 5.4202527016224725e-06, "loss": 0.2675, "step": 1830 }, { "epoch": 0.1508567680577191, "grad_norm": 24.684080123901367, "learning_rate": 5.398692247297059e-06, "loss": 0.2916, "step": 1840 }, { "epoch": 0.15167664179716323, "grad_norm": 7.947139263153076, "learning_rate": 5.377064021690844e-06, "loss": 0.2841, "step": 1850 }, { "epoch": 0.15249651553660737, "grad_norm": 11.595500946044922, "learning_rate": 5.355368913855472e-06, "loss": 0.2562, "step": 1860 }, { "epoch": 0.15331638927605148, "grad_norm": 11.803101539611816, "learning_rate": 5.333607815591851e-06, "loss": 0.2292, "step": 1870 }, { "epoch": 0.1541362630154956, "grad_norm": 17.95461654663086, "learning_rate": 5.311781621413497e-06, "loss": 0.2787, "step": 1880 }, { "epoch": 0.15495613675493974, "grad_norm": 25.276002883911133, "learning_rate": 5.289891228509769e-06, "loss": 0.2889, "step": 1890 }, { "epoch": 0.15577601049438386, "grad_norm": 8.79496955871582, "learning_rate": 5.267937536708977e-06, "loss": 0.2667, "step": 1900 }, { "epoch": 0.156595884233828, "grad_norm": 10.413036346435547, "learning_rate": 5.245921448441407e-06, "loss": 0.2823, "step": 1910 }, { "epoch": 0.15741575797327212, "grad_norm": 11.163688659667969, "learning_rate": 5.223843868702214e-06, "loss": 0.2655, "step": 1920 }, { "epoch": 0.15823563171271623, "grad_norm": 16.093170166015625, "learning_rate": 5.201705705014231e-06, "loss": 0.2709, "step": 1930 }, { "epoch": 0.15905550545216038, "grad_norm": 18.966991424560547, "learning_rate": 5.1795078673906575e-06, "loss": 0.2593, "step": 1940 }, { "epoch": 0.1598753791916045, "grad_norm": 12.139580726623535, "learning_rate": 5.1572512682976546e-06, "loss": 0.2602, "step": 1950 }, { "epoch": 0.1598753791916045, "eval_loss": 0.2535741329193115, "eval_runtime": 56.9513, "eval_samples_per_second": 8.779, "eval_steps_per_second": 8.779, "step": 1950 }, { "epoch": 0.1606952529310486, "grad_norm": 17.421117782592773, "learning_rate": 5.134936822616837e-06, "loss": 0.2507, "step": 1960 }, { "epoch": 0.16151512667049275, "grad_norm": 8.096160888671875, "learning_rate": 5.112565447607669e-06, "loss": 0.2405, "step": 1970 }, { "epoch": 0.16233500040993687, "grad_norm": 10.138191223144531, "learning_rate": 5.090138062869755e-06, "loss": 0.2435, "step": 1980 }, { "epoch": 0.16315487414938099, "grad_norm": 32.244873046875, "learning_rate": 5.067655590305036e-06, "loss": 0.2546, "step": 1990 }, { "epoch": 0.16397474788882513, "grad_norm": 11.093918800354004, "learning_rate": 5.045118954079904e-06, "loss": 0.2595, "step": 2000 }, { "epoch": 0.16479462162826924, "grad_norm": 11.482741355895996, "learning_rate": 5.022529080587205e-06, "loss": 0.2294, "step": 2010 }, { "epoch": 0.16561449536771336, "grad_norm": 13.456998825073242, "learning_rate": 4.999886898408157e-06, "loss": 0.2556, "step": 2020 }, { "epoch": 0.1664343691071575, "grad_norm": 11.575148582458496, "learning_rate": 4.977193338274189e-06, "loss": 0.2538, "step": 2030 }, { "epoch": 0.16725424284660162, "grad_norm": 12.712217330932617, "learning_rate": 4.954449333028672e-06, "loss": 0.2985, "step": 2040 }, { "epoch": 0.16807411658604574, "grad_norm": 25.477855682373047, "learning_rate": 4.931655817588579e-06, "loss": 0.2516, "step": 2050 }, { "epoch": 0.16889399032548988, "grad_norm": 17.030961990356445, "learning_rate": 4.9088137289060535e-06, "loss": 0.2544, "step": 2060 }, { "epoch": 0.169713864064934, "grad_norm": 10.903443336486816, "learning_rate": 4.885924005929896e-06, "loss": 0.2581, "step": 2070 }, { "epoch": 0.17053373780437814, "grad_norm": 9.746002197265625, "learning_rate": 4.862987589566965e-06, "loss": 0.2332, "step": 2080 }, { "epoch": 0.17135361154382225, "grad_norm": 14.084914207458496, "learning_rate": 4.840005422643503e-06, "loss": 0.2643, "step": 2090 }, { "epoch": 0.17217348528326637, "grad_norm": 9.59061336517334, "learning_rate": 4.816978449866372e-06, "loss": 0.2461, "step": 2100 }, { "epoch": 0.17217348528326637, "eval_loss": 0.2557007670402527, "eval_runtime": 56.7258, "eval_samples_per_second": 8.814, "eval_steps_per_second": 8.814, "step": 2100 }, { "epoch": 0.1729933590227105, "grad_norm": 12.96509075164795, "learning_rate": 4.793907617784238e-06, "loss": 0.2623, "step": 2110 }, { "epoch": 0.17381323276215463, "grad_norm": 21.171913146972656, "learning_rate": 4.770793874748642e-06, "loss": 0.2481, "step": 2120 }, { "epoch": 0.17463310650159874, "grad_norm": 15.18250560760498, "learning_rate": 4.747638170875032e-06, "loss": 0.2644, "step": 2130 }, { "epoch": 0.1754529802410429, "grad_norm": 13.478678703308105, "learning_rate": 4.724441458003699e-06, "loss": 0.2548, "step": 2140 }, { "epoch": 0.176272853980487, "grad_norm": 7.877747535705566, "learning_rate": 4.701204689660653e-06, "loss": 0.2468, "step": 2150 }, { "epoch": 0.17709272771993112, "grad_norm": 14.340051651000977, "learning_rate": 4.67792882101843e-06, "loss": 0.2652, "step": 2160 }, { "epoch": 0.17791260145937526, "grad_norm": 11.43173885345459, "learning_rate": 4.654614808856823e-06, "loss": 0.245, "step": 2170 }, { "epoch": 0.17873247519881938, "grad_norm": 16.191015243530273, "learning_rate": 4.631263611523557e-06, "loss": 0.2561, "step": 2180 }, { "epoch": 0.1795523489382635, "grad_norm": 14.481834411621094, "learning_rate": 4.607876188894896e-06, "loss": 0.2783, "step": 2190 }, { "epoch": 0.18037222267770764, "grad_norm": 12.716588973999023, "learning_rate": 4.58445350233618e-06, "loss": 0.2526, "step": 2200 }, { "epoch": 0.18119209641715175, "grad_norm": 16.625707626342773, "learning_rate": 4.560996514662314e-06, "loss": 0.2386, "step": 2210 }, { "epoch": 0.1820119701565959, "grad_norm": 15.23642635345459, "learning_rate": 4.5375061900981855e-06, "loss": 0.2522, "step": 2220 }, { "epoch": 0.18283184389604, "grad_norm": 22.573617935180664, "learning_rate": 4.513983494239034e-06, "loss": 0.2605, "step": 2230 }, { "epoch": 0.18365171763548413, "grad_norm": 16.085651397705078, "learning_rate": 4.490429394010752e-06, "loss": 0.2811, "step": 2240 }, { "epoch": 0.18447159137492827, "grad_norm": 23.764911651611328, "learning_rate": 4.466844857630147e-06, "loss": 0.2495, "step": 2250 }, { "epoch": 0.18447159137492827, "eval_loss": 0.2652283310890198, "eval_runtime": 56.3594, "eval_samples_per_second": 8.872, "eval_steps_per_second": 8.872, "step": 2250 }, { "epoch": 0.1852914651143724, "grad_norm": 17.39873504638672, "learning_rate": 4.443230854565133e-06, "loss": 0.2562, "step": 2260 }, { "epoch": 0.1861113388538165, "grad_norm": 11.883243560791016, "learning_rate": 4.4195883554948885e-06, "loss": 0.2777, "step": 2270 }, { "epoch": 0.18693121259326065, "grad_norm": 8.622486114501953, "learning_rate": 4.3959183322699466e-06, "loss": 0.2272, "step": 2280 }, { "epoch": 0.18775108633270476, "grad_norm": 16.060256958007812, "learning_rate": 4.372221757872255e-06, "loss": 0.2388, "step": 2290 }, { "epoch": 0.18857096007214888, "grad_norm": 9.97546100616455, "learning_rate": 4.3484996063751725e-06, "loss": 0.2736, "step": 2300 }, { "epoch": 0.18939083381159302, "grad_norm": 11.587379455566406, "learning_rate": 4.324752852903435e-06, "loss": 0.2321, "step": 2310 }, { "epoch": 0.19021070755103714, "grad_norm": 134.054931640625, "learning_rate": 4.300982473593068e-06, "loss": 0.2583, "step": 2320 }, { "epoch": 0.19103058129048126, "grad_norm": 15.653196334838867, "learning_rate": 4.277189445551261e-06, "loss": 0.2702, "step": 2330 }, { "epoch": 0.1918504550299254, "grad_norm": 14.868865966796875, "learning_rate": 4.253374746816209e-06, "loss": 0.2749, "step": 2340 }, { "epoch": 0.19267032876936951, "grad_norm": 18.965742111206055, "learning_rate": 4.229539356316898e-06, "loss": 0.2635, "step": 2350 }, { "epoch": 0.19349020250881363, "grad_norm": 21.16566276550293, "learning_rate": 4.205684253832877e-06, "loss": 0.2366, "step": 2360 }, { "epoch": 0.19431007624825777, "grad_norm": 9.739816665649414, "learning_rate": 4.1818104199539735e-06, "loss": 0.2507, "step": 2370 }, { "epoch": 0.1951299499877019, "grad_norm": 9.094308853149414, "learning_rate": 4.1579188360399916e-06, "loss": 0.2508, "step": 2380 }, { "epoch": 0.19594982372714603, "grad_norm": 13.532063484191895, "learning_rate": 4.134010484180368e-06, "loss": 0.2432, "step": 2390 }, { "epoch": 0.19676969746659015, "grad_norm": 10.089424133300781, "learning_rate": 4.110086347153807e-06, "loss": 0.2496, "step": 2400 }, { "epoch": 0.19676969746659015, "eval_loss": 0.24164016544818878, "eval_runtime": 58.2028, "eval_samples_per_second": 8.591, "eval_steps_per_second": 8.591, "step": 2400 }, { "epoch": 0.19758957120603426, "grad_norm": 14.62680721282959, "learning_rate": 4.0861474083878765e-06, "loss": 0.2585, "step": 2410 }, { "epoch": 0.1984094449454784, "grad_norm": 22.528297424316406, "learning_rate": 4.062194651918585e-06, "loss": 0.2341, "step": 2420 }, { "epoch": 0.19922931868492252, "grad_norm": 11.753854751586914, "learning_rate": 4.0382290623499384e-06, "loss": 0.2953, "step": 2430 }, { "epoch": 0.20004919242436664, "grad_norm": 16.247995376586914, "learning_rate": 4.014251624813453e-06, "loss": 0.2657, "step": 2440 }, { "epoch": 0.20086906616381078, "grad_norm": 15.834903717041016, "learning_rate": 3.990263324927675e-06, "loss": 0.2341, "step": 2450 }, { "epoch": 0.2016889399032549, "grad_norm": 6.7929887771606445, "learning_rate": 3.966265148757655e-06, "loss": 0.2355, "step": 2460 }, { "epoch": 0.20250881364269901, "grad_norm": 35.777835845947266, "learning_rate": 3.9422580827744224e-06, "loss": 0.2329, "step": 2470 }, { "epoch": 0.20332868738214316, "grad_norm": 15.361977577209473, "learning_rate": 3.9182431138144315e-06, "loss": 0.2515, "step": 2480 }, { "epoch": 0.20414856112158727, "grad_norm": 10.340039253234863, "learning_rate": 3.894221229038995e-06, "loss": 0.2397, "step": 2490 }, { "epoch": 0.2049684348610314, "grad_norm": 15.93770980834961, "learning_rate": 3.870193415893709e-06, "loss": 0.2432, "step": 2500 }, { "epoch": 0.20578830860047553, "grad_norm": 19.398086547851562, "learning_rate": 3.846160662067859e-06, "loss": 0.2471, "step": 2510 }, { "epoch": 0.20660818233991965, "grad_norm": 7.482428550720215, "learning_rate": 3.8221239554538275e-06, "loss": 0.2498, "step": 2520 }, { "epoch": 0.20742805607936377, "grad_norm": 7.209218502044678, "learning_rate": 3.798084284106478e-06, "loss": 0.263, "step": 2530 }, { "epoch": 0.2082479298188079, "grad_norm": 7.973605155944824, "learning_rate": 3.7740426362025424e-06, "loss": 0.2182, "step": 2540 }, { "epoch": 0.20906780355825202, "grad_norm": 17.178762435913086, "learning_rate": 3.7500000000000005e-06, "loss": 0.2368, "step": 2550 }, { "epoch": 0.20906780355825202, "eval_loss": 0.24929100275039673, "eval_runtime": 56.544, "eval_samples_per_second": 8.843, "eval_steps_per_second": 8.843, "step": 2550 }, { "epoch": 0.20988767729769617, "grad_norm": 19.6829776763916, "learning_rate": 3.7259573637974587e-06, "loss": 0.2556, "step": 2560 }, { "epoch": 0.21070755103714028, "grad_norm": 18.270166397094727, "learning_rate": 3.701915715893523e-06, "loss": 0.2306, "step": 2570 }, { "epoch": 0.2115274247765844, "grad_norm": 14.25434398651123, "learning_rate": 3.677876044546174e-06, "loss": 0.2597, "step": 2580 }, { "epoch": 0.21234729851602854, "grad_norm": 9.318758964538574, "learning_rate": 3.6538393379321427e-06, "loss": 0.2659, "step": 2590 }, { "epoch": 0.21316717225547266, "grad_norm": 18.77834701538086, "learning_rate": 3.6298065841062934e-06, "loss": 0.2299, "step": 2600 }, { "epoch": 0.21398704599491677, "grad_norm": 17.720027923583984, "learning_rate": 3.6057787709610064e-06, "loss": 0.266, "step": 2610 }, { "epoch": 0.21480691973436092, "grad_norm": 7.643661022186279, "learning_rate": 3.5817568861855708e-06, "loss": 0.2362, "step": 2620 }, { "epoch": 0.21562679347380503, "grad_norm": 10.200757026672363, "learning_rate": 3.557741917225579e-06, "loss": 0.2405, "step": 2630 }, { "epoch": 0.21644666721324915, "grad_norm": 46.2437744140625, "learning_rate": 3.5337348512423468e-06, "loss": 0.252, "step": 2640 }, { "epoch": 0.2172665409526933, "grad_norm": 13.160014152526855, "learning_rate": 3.5097366750723275e-06, "loss": 0.247, "step": 2650 }, { "epoch": 0.2180864146921374, "grad_norm": 12.211856842041016, "learning_rate": 3.4857483751865478e-06, "loss": 0.2515, "step": 2660 }, { "epoch": 0.21890628843158152, "grad_norm": 14.44340705871582, "learning_rate": 3.461770937650064e-06, "loss": 0.2228, "step": 2670 }, { "epoch": 0.21972616217102567, "grad_norm": 43.0201530456543, "learning_rate": 3.437805348081416e-06, "loss": 0.2721, "step": 2680 }, { "epoch": 0.22054603591046978, "grad_norm": 9.385405540466309, "learning_rate": 3.413852591612125e-06, "loss": 0.2883, "step": 2690 }, { "epoch": 0.2213659096499139, "grad_norm": 14.081421852111816, "learning_rate": 3.389913652846194e-06, "loss": 0.2411, "step": 2700 }, { "epoch": 0.2213659096499139, "eval_loss": 0.23700179159641266, "eval_runtime": 56.0414, "eval_samples_per_second": 8.922, "eval_steps_per_second": 8.922, "step": 2700 }, { "epoch": 0.22218578338935804, "grad_norm": 7.245662689208984, "learning_rate": 3.365989515819633e-06, "loss": 0.2538, "step": 2710 }, { "epoch": 0.22300565712880216, "grad_norm": 15.124368667602539, "learning_rate": 3.34208116396001e-06, "loss": 0.2469, "step": 2720 }, { "epoch": 0.2238255308682463, "grad_norm": 15.782695770263672, "learning_rate": 3.318189580046028e-06, "loss": 0.2412, "step": 2730 }, { "epoch": 0.22464540460769042, "grad_norm": 21.473407745361328, "learning_rate": 3.294315746167124e-06, "loss": 0.2745, "step": 2740 }, { "epoch": 0.22546527834713453, "grad_norm": 14.113616943359375, "learning_rate": 3.2704606436831023e-06, "loss": 0.2329, "step": 2750 }, { "epoch": 0.22628515208657868, "grad_norm": 16.563539505004883, "learning_rate": 3.2466252531837934e-06, "loss": 0.2275, "step": 2760 }, { "epoch": 0.2271050258260228, "grad_norm": 15.176487922668457, "learning_rate": 3.2228105544487405e-06, "loss": 0.236, "step": 2770 }, { "epoch": 0.2279248995654669, "grad_norm": 21.701990127563477, "learning_rate": 3.1990175264069333e-06, "loss": 0.2619, "step": 2780 }, { "epoch": 0.22874477330491105, "grad_norm": 24.164974212646484, "learning_rate": 3.1752471470965653e-06, "loss": 0.2545, "step": 2790 }, { "epoch": 0.22956464704435517, "grad_norm": 18.652359008789062, "learning_rate": 3.151500393624829e-06, "loss": 0.2538, "step": 2800 }, { "epoch": 0.23038452078379928, "grad_norm": 17.519634246826172, "learning_rate": 3.127778242127747e-06, "loss": 0.2457, "step": 2810 }, { "epoch": 0.23120439452324343, "grad_norm": 32.73554992675781, "learning_rate": 3.104081667730055e-06, "loss": 0.2597, "step": 2820 }, { "epoch": 0.23202426826268754, "grad_norm": 14.897638320922852, "learning_rate": 3.0804116445051133e-06, "loss": 0.2565, "step": 2830 }, { "epoch": 0.23284414200213166, "grad_norm": 12.081779479980469, "learning_rate": 3.0567691454348674e-06, "loss": 0.2222, "step": 2840 }, { "epoch": 0.2336640157415758, "grad_norm": 12.295435905456543, "learning_rate": 3.033155142369855e-06, "loss": 0.2344, "step": 2850 }, { "epoch": 0.2336640157415758, "eval_loss": 0.23474246263504028, "eval_runtime": 55.6184, "eval_samples_per_second": 8.99, "eval_steps_per_second": 8.99, "step": 2850 }, { "epoch": 0.23448388948101992, "grad_norm": 14.579584121704102, "learning_rate": 3.009570605989249e-06, "loss": 0.2352, "step": 2860 }, { "epoch": 0.23530376322046404, "grad_norm": 22.36095428466797, "learning_rate": 2.986016505760967e-06, "loss": 0.2394, "step": 2870 }, { "epoch": 0.23612363695990818, "grad_norm": 10.306982040405273, "learning_rate": 2.962493809901815e-06, "loss": 0.2333, "step": 2880 }, { "epoch": 0.2369435106993523, "grad_norm": 36.44614791870117, "learning_rate": 2.9390034853376875e-06, "loss": 0.2539, "step": 2890 }, { "epoch": 0.23776338443879644, "grad_norm": 10.238338470458984, "learning_rate": 2.9155464976638217e-06, "loss": 0.2639, "step": 2900 }, { "epoch": 0.23858325817824055, "grad_norm": 22.99175262451172, "learning_rate": 2.8921238111051057e-06, "loss": 0.2769, "step": 2910 }, { "epoch": 0.23940313191768467, "grad_norm": 15.648612976074219, "learning_rate": 2.8687363884764434e-06, "loss": 0.2348, "step": 2920 }, { "epoch": 0.2402230056571288, "grad_norm": 9.030691146850586, "learning_rate": 2.8453851911431783e-06, "loss": 0.2223, "step": 2930 }, { "epoch": 0.24104287939657293, "grad_norm": 13.751124382019043, "learning_rate": 2.822071178981572e-06, "loss": 0.2474, "step": 2940 }, { "epoch": 0.24186275313601704, "grad_norm": 16.013547897338867, "learning_rate": 2.7987953103393484e-06, "loss": 0.2541, "step": 2950 }, { "epoch": 0.2426826268754612, "grad_norm": 11.65927791595459, "learning_rate": 2.7755585419963026e-06, "loss": 0.2535, "step": 2960 }, { "epoch": 0.2435025006149053, "grad_norm": 20.403488159179688, "learning_rate": 2.7523618291249687e-06, "loss": 0.2439, "step": 2970 }, { "epoch": 0.24432237435434942, "grad_norm": 15.705227851867676, "learning_rate": 2.729206125251359e-06, "loss": 0.2073, "step": 2980 }, { "epoch": 0.24514224809379356, "grad_norm": 16.818626403808594, "learning_rate": 2.7060923822157638e-06, "loss": 0.2592, "step": 2990 }, { "epoch": 0.24596212183323768, "grad_norm": 29.800796508789062, "learning_rate": 2.6830215501336288e-06, "loss": 0.2328, "step": 3000 }, { "epoch": 0.24596212183323768, "eval_loss": 0.24091680347919464, "eval_runtime": 55.7565, "eval_samples_per_second": 8.968, "eval_steps_per_second": 8.968, "step": 3000 }, { "epoch": 0.2467819955726818, "grad_norm": 18.235761642456055, "learning_rate": 2.6599945773564997e-06, "loss": 0.2505, "step": 3010 }, { "epoch": 0.24760186931212594, "grad_norm": 13.632527351379395, "learning_rate": 2.6370124104330357e-06, "loss": 0.2626, "step": 3020 }, { "epoch": 0.24842174305157005, "grad_norm": 29.359901428222656, "learning_rate": 2.614075994070105e-06, "loss": 0.2372, "step": 3030 }, { "epoch": 0.24924161679101417, "grad_norm": 23.87677574157715, "learning_rate": 2.591186271093948e-06, "loss": 0.2103, "step": 3040 }, { "epoch": 0.2500614905304583, "grad_norm": 13.893345832824707, "learning_rate": 2.568344182411423e-06, "loss": 0.2299, "step": 3050 }, { "epoch": 0.25088136426990243, "grad_norm": 30.01930809020996, "learning_rate": 2.5455506669713293e-06, "loss": 0.237, "step": 3060 }, { "epoch": 0.2517012380093466, "grad_norm": 21.540925979614258, "learning_rate": 2.522806661725812e-06, "loss": 0.245, "step": 3070 }, { "epoch": 0.25252111174879066, "grad_norm": 11.055063247680664, "learning_rate": 2.5001131015918444e-06, "loss": 0.2386, "step": 3080 }, { "epoch": 0.2533409854882348, "grad_norm": 25.467863082885742, "learning_rate": 2.4774709194127973e-06, "loss": 0.2028, "step": 3090 }, { "epoch": 0.25416085922767895, "grad_norm": 16.482820510864258, "learning_rate": 2.4548810459200973e-06, "loss": 0.2559, "step": 3100 }, { "epoch": 0.25498073296712304, "grad_norm": 15.558172225952148, "learning_rate": 2.4323444096949647e-06, "loss": 0.2443, "step": 3110 }, { "epoch": 0.2558006067065672, "grad_norm": 12.034625053405762, "learning_rate": 2.409861937130248e-06, "loss": 0.2607, "step": 3120 }, { "epoch": 0.2566204804460113, "grad_norm": 11.549402236938477, "learning_rate": 2.3874345523923327e-06, "loss": 0.2182, "step": 3130 }, { "epoch": 0.25744035418545547, "grad_norm": 37.64973068237305, "learning_rate": 2.3650631773831644e-06, "loss": 0.2756, "step": 3140 }, { "epoch": 0.25826022792489955, "grad_norm": 10.317972183227539, "learning_rate": 2.3427487317023477e-06, "loss": 0.2325, "step": 3150 }, { "epoch": 0.25826022792489955, "eval_loss": 0.2304079383611679, "eval_runtime": 55.9839, "eval_samples_per_second": 8.931, "eval_steps_per_second": 8.931, "step": 3150 }, { "epoch": 0.2590801016643437, "grad_norm": 13.487903594970703, "learning_rate": 2.320492132609344e-06, "loss": 0.2491, "step": 3160 }, { "epoch": 0.25989997540378784, "grad_norm": 18.3017520904541, "learning_rate": 2.2982942949857705e-06, "loss": 0.2203, "step": 3170 }, { "epoch": 0.26071984914323193, "grad_norm": 35.3414421081543, "learning_rate": 2.276156131297787e-06, "loss": 0.2076, "step": 3180 }, { "epoch": 0.2615397228826761, "grad_norm": 7.3131327629089355, "learning_rate": 2.254078551558594e-06, "loss": 0.2476, "step": 3190 }, { "epoch": 0.2623595966221202, "grad_norm": 21.195293426513672, "learning_rate": 2.2320624632910232e-06, "loss": 0.2347, "step": 3200 }, { "epoch": 0.2631794703615643, "grad_norm": 19.634109497070312, "learning_rate": 2.210108771490233e-06, "loss": 0.2395, "step": 3210 }, { "epoch": 0.26399934410100845, "grad_norm": 16.585100173950195, "learning_rate": 2.1882183785865047e-06, "loss": 0.2258, "step": 3220 }, { "epoch": 0.2648192178404526, "grad_norm": 16.569671630859375, "learning_rate": 2.166392184408152e-06, "loss": 0.2379, "step": 3230 }, { "epoch": 0.2656390915798967, "grad_norm": 14.845422744750977, "learning_rate": 2.1446310861445306e-06, "loss": 0.2183, "step": 3240 }, { "epoch": 0.2664589653193408, "grad_norm": 16.37993621826172, "learning_rate": 2.1229359783091576e-06, "loss": 0.2249, "step": 3250 }, { "epoch": 0.26727883905878497, "grad_norm": 24.308523178100586, "learning_rate": 2.1013077527029428e-06, "loss": 0.2314, "step": 3260 }, { "epoch": 0.26809871279822906, "grad_norm": 20.230369567871094, "learning_rate": 2.079747298377528e-06, "loss": 0.2072, "step": 3270 }, { "epoch": 0.2689185865376732, "grad_norm": 18.310514450073242, "learning_rate": 2.058255501598745e-06, "loss": 0.2528, "step": 3280 }, { "epoch": 0.26973846027711734, "grad_norm": 15.269632339477539, "learning_rate": 2.0368332458101814e-06, "loss": 0.2206, "step": 3290 }, { "epoch": 0.27055833401656143, "grad_norm": 24.385452270507812, "learning_rate": 2.015481411596869e-06, "loss": 0.2341, "step": 3300 }, { "epoch": 0.27055833401656143, "eval_loss": 0.23421980440616608, "eval_runtime": 60.4493, "eval_samples_per_second": 8.271, "eval_steps_per_second": 8.271, "step": 3300 }, { "epoch": 0.2713782077560056, "grad_norm": 21.876766204833984, "learning_rate": 1.9942008766490793e-06, "loss": 0.235, "step": 3310 }, { "epoch": 0.2721980814954497, "grad_norm": 11.376224517822266, "learning_rate": 1.9729925157262554e-06, "loss": 0.2509, "step": 3320 }, { "epoch": 0.2730179552348938, "grad_norm": 27.929759979248047, "learning_rate": 1.9518572006210484e-06, "loss": 0.242, "step": 3330 }, { "epoch": 0.27383782897433795, "grad_norm": 23.26350975036621, "learning_rate": 1.9307958001234794e-06, "loss": 0.2507, "step": 3340 }, { "epoch": 0.2746577027137821, "grad_norm": 24.858692169189453, "learning_rate": 1.9098091799852347e-06, "loss": 0.2375, "step": 3350 }, { "epoch": 0.2754775764532262, "grad_norm": 16.973976135253906, "learning_rate": 1.8888982028840636e-06, "loss": 0.2341, "step": 3360 }, { "epoch": 0.2762974501926703, "grad_norm": 26.544775009155273, "learning_rate": 1.8680637283883355e-06, "loss": 0.2457, "step": 3370 }, { "epoch": 0.27711732393211447, "grad_norm": 16.246021270751953, "learning_rate": 1.8473066129216927e-06, "loss": 0.2484, "step": 3380 }, { "epoch": 0.27793719767155856, "grad_norm": 12.570246696472168, "learning_rate": 1.8266277097278527e-06, "loss": 0.2579, "step": 3390 }, { "epoch": 0.2787570714110027, "grad_norm": 17.455217361450195, "learning_rate": 1.8060278688355313e-06, "loss": 0.2213, "step": 3400 }, { "epoch": 0.27957694515044684, "grad_norm": 13.560107231140137, "learning_rate": 1.7855079370235043e-06, "loss": 0.2168, "step": 3410 }, { "epoch": 0.28039681888989093, "grad_norm": 19.205720901489258, "learning_rate": 1.7650687577857972e-06, "loss": 0.2166, "step": 3420 }, { "epoch": 0.2812166926293351, "grad_norm": 31.231449127197266, "learning_rate": 1.7447111712970138e-06, "loss": 0.2472, "step": 3430 }, { "epoch": 0.2820365663687792, "grad_norm": 18.0344181060791, "learning_rate": 1.7244360143778004e-06, "loss": 0.2376, "step": 3440 }, { "epoch": 0.2828564401082233, "grad_norm": 16.178203582763672, "learning_rate": 1.704244120460443e-06, "loss": 0.2209, "step": 3450 }, { "epoch": 0.2828564401082233, "eval_loss": 0.22183214128017426, "eval_runtime": 56.128, "eval_samples_per_second": 8.908, "eval_steps_per_second": 8.908, "step": 3450 }, { "epoch": 0.28367631384766745, "grad_norm": 18.059825897216797, "learning_rate": 1.6841363195546162e-06, "loss": 0.2267, "step": 3460 }, { "epoch": 0.2844961875871116, "grad_norm": 22.400646209716797, "learning_rate": 1.6641134382132576e-06, "loss": 0.2297, "step": 3470 }, { "epoch": 0.28531606132655574, "grad_norm": 18.88297462463379, "learning_rate": 1.6441762994985947e-06, "loss": 0.2087, "step": 3480 }, { "epoch": 0.2861359350659998, "grad_norm": 9.259561538696289, "learning_rate": 1.6243257229483141e-06, "loss": 0.2341, "step": 3490 }, { "epoch": 0.28695580880544397, "grad_norm": 9.176309585571289, "learning_rate": 1.6045625245418648e-06, "loss": 0.2314, "step": 3500 }, { "epoch": 0.2877756825448881, "grad_norm": 16.64775276184082, "learning_rate": 1.584887516666928e-06, "loss": 0.221, "step": 3510 }, { "epoch": 0.2885955562843322, "grad_norm": 16.043312072753906, "learning_rate": 1.565301508086015e-06, "loss": 0.2307, "step": 3520 }, { "epoch": 0.28941543002377634, "grad_norm": 28.55023765563965, "learning_rate": 1.5458053039032263e-06, "loss": 0.2013, "step": 3530 }, { "epoch": 0.2902353037632205, "grad_norm": 22.9605712890625, "learning_rate": 1.5263997055311536e-06, "loss": 0.2258, "step": 3540 }, { "epoch": 0.2910551775026646, "grad_norm": 11.065112113952637, "learning_rate": 1.5070855106579404e-06, "loss": 0.2375, "step": 3550 }, { "epoch": 0.2918750512421087, "grad_norm": 13.265893936157227, "learning_rate": 1.4878635132144885e-06, "loss": 0.2409, "step": 3560 }, { "epoch": 0.29269492498155286, "grad_norm": 22.174110412597656, "learning_rate": 1.4687345033418258e-06, "loss": 0.2424, "step": 3570 }, { "epoch": 0.29351479872099695, "grad_norm": 12.81115436553955, "learning_rate": 1.4496992673586262e-06, "loss": 0.2236, "step": 3580 }, { "epoch": 0.2943346724604411, "grad_norm": 12.606128692626953, "learning_rate": 1.4307585877288822e-06, "loss": 0.2262, "step": 3590 }, { "epoch": 0.29515454619988524, "grad_norm": 29.290117263793945, "learning_rate": 1.4119132430297496e-06, "loss": 0.2305, "step": 3600 }, { "epoch": 0.29515454619988524, "eval_loss": 0.22281211614608765, "eval_runtime": 55.6771, "eval_samples_per_second": 8.98, "eval_steps_per_second": 8.98, "step": 3600 }, { "epoch": 0.2959744199393293, "grad_norm": 19.89222526550293, "learning_rate": 1.3931640079195365e-06, "loss": 0.2354, "step": 3610 }, { "epoch": 0.29679429367877347, "grad_norm": 10.584065437316895, "learning_rate": 1.3745116531058645e-06, "loss": 0.2272, "step": 3620 }, { "epoch": 0.2976141674182176, "grad_norm": 18.46734619140625, "learning_rate": 1.3559569453139797e-06, "loss": 0.2192, "step": 3630 }, { "epoch": 0.2984340411576617, "grad_norm": 17.607667922973633, "learning_rate": 1.3375006472552483e-06, "loss": 0.2466, "step": 3640 }, { "epoch": 0.29925391489710584, "grad_norm": 19.822507858276367, "learning_rate": 1.3191435175957945e-06, "loss": 0.2271, "step": 3650 }, { "epoch": 0.30007378863655, "grad_norm": 7.999312400817871, "learning_rate": 1.3008863109253174e-06, "loss": 0.2244, "step": 3660 }, { "epoch": 0.3008936623759941, "grad_norm": 15.04226016998291, "learning_rate": 1.282729777726078e-06, "loss": 0.2303, "step": 3670 }, { "epoch": 0.3017135361154382, "grad_norm": 12.127747535705566, "learning_rate": 1.2646746643420392e-06, "loss": 0.2289, "step": 3680 }, { "epoch": 0.30253340985488236, "grad_norm": 10.014680862426758, "learning_rate": 1.2467217129481952e-06, "loss": 0.2176, "step": 3690 }, { "epoch": 0.30335328359432645, "grad_norm": 15.543107986450195, "learning_rate": 1.2288716615200617e-06, "loss": 0.2338, "step": 3700 }, { "epoch": 0.3041731573337706, "grad_norm": 12.86021614074707, "learning_rate": 1.2111252438033404e-06, "loss": 0.2192, "step": 3710 }, { "epoch": 0.30499303107321474, "grad_norm": 32.52058792114258, "learning_rate": 1.1934831892837524e-06, "loss": 0.2205, "step": 3720 }, { "epoch": 0.3058129048126588, "grad_norm": 6.391150951385498, "learning_rate": 1.1759462231570618e-06, "loss": 0.2043, "step": 3730 }, { "epoch": 0.30663277855210297, "grad_norm": 18.806997299194336, "learning_rate": 1.1585150662992578e-06, "loss": 0.2203, "step": 3740 }, { "epoch": 0.3074526522915471, "grad_norm": 16.80451774597168, "learning_rate": 1.1411904352369262e-06, "loss": 0.228, "step": 3750 }, { "epoch": 0.3074526522915471, "eval_loss": 0.2207518219947815, "eval_runtime": 56.5561, "eval_samples_per_second": 8.841, "eval_steps_per_second": 8.841, "step": 3750 }, { "epoch": 0.3082725260309912, "grad_norm": 14.464019775390625, "learning_rate": 1.1239730421177952e-06, "loss": 0.2285, "step": 3760 }, { "epoch": 0.30909239977043534, "grad_norm": 18.73137664794922, "learning_rate": 1.1068635946814569e-06, "loss": 0.2234, "step": 3770 }, { "epoch": 0.3099122735098795, "grad_norm": 10.308956146240234, "learning_rate": 1.0898627962302831e-06, "loss": 0.2208, "step": 3780 }, { "epoch": 0.31073214724932363, "grad_norm": 39.88100051879883, "learning_rate": 1.072971345600513e-06, "loss": 0.2376, "step": 3790 }, { "epoch": 0.3115520209887677, "grad_norm": 12.245576858520508, "learning_rate": 1.056189937133522e-06, "loss": 0.2283, "step": 3800 }, { "epoch": 0.31237189472821186, "grad_norm": 14.314285278320312, "learning_rate": 1.0395192606472822e-06, "loss": 0.2073, "step": 3810 }, { "epoch": 0.313191768467656, "grad_norm": 15.187841415405273, "learning_rate": 1.0229600014080101e-06, "loss": 0.2495, "step": 3820 }, { "epoch": 0.3140116422071001, "grad_norm": 13.99637508392334, "learning_rate": 1.006512840101995e-06, "loss": 0.2154, "step": 3830 }, { "epoch": 0.31483151594654424, "grad_norm": 7.902044773101807, "learning_rate": 9.90178452807619e-07, "loss": 0.2435, "step": 3840 }, { "epoch": 0.3156513896859884, "grad_norm": 12.850071907043457, "learning_rate": 9.739575109675674e-07, "loss": 0.2247, "step": 3850 }, { "epoch": 0.31647126342543247, "grad_norm": 14.898462295532227, "learning_rate": 9.578506813612243e-07, "loss": 0.221, "step": 3860 }, { "epoch": 0.3172911371648766, "grad_norm": 24.208559036254883, "learning_rate": 9.418586260772695e-07, "loss": 0.2303, "step": 3870 }, { "epoch": 0.31811101090432076, "grad_norm": 17.132963180541992, "learning_rate": 9.259820024864594e-07, "loss": 0.2283, "step": 3880 }, { "epoch": 0.31893088464376484, "grad_norm": 19.788406372070312, "learning_rate": 9.102214632146059e-07, "loss": 0.2465, "step": 3890 }, { "epoch": 0.319750758383209, "grad_norm": 26.01558494567871, "learning_rate": 8.94577656115746e-07, "loss": 0.2321, "step": 3900 }, { "epoch": 0.319750758383209, "eval_loss": 0.22018083930015564, "eval_runtime": 56.099, "eval_samples_per_second": 8.913, "eval_steps_per_second": 8.913, "step": 3900 }, { "epoch": 0.32057063212265313, "grad_norm": 13.368496894836426, "learning_rate": 8.790512242455198e-07, "loss": 0.2401, "step": 3910 }, { "epoch": 0.3213905058620972, "grad_norm": 17.882627487182617, "learning_rate": 8.636428058347274e-07, "loss": 0.2045, "step": 3920 }, { "epoch": 0.32221037960154136, "grad_norm": 21.98712158203125, "learning_rate": 8.483530342630993e-07, "loss": 0.243, "step": 3930 }, { "epoch": 0.3230302533409855, "grad_norm": 33.167381286621094, "learning_rate": 8.331825380332599e-07, "loss": 0.2258, "step": 3940 }, { "epoch": 0.3238501270804296, "grad_norm": 16.276443481445312, "learning_rate": 8.181319407448884e-07, "loss": 0.2489, "step": 3950 }, { "epoch": 0.32467000081987374, "grad_norm": 12.20262336730957, "learning_rate": 8.032018610690914e-07, "loss": 0.2074, "step": 3960 }, { "epoch": 0.3254898745593179, "grad_norm": 23.053037643432617, "learning_rate": 7.883929127229665e-07, "loss": 0.2238, "step": 3970 }, { "epoch": 0.32630974829876197, "grad_norm": 9.354714393615723, "learning_rate": 7.737057044443793e-07, "loss": 0.2268, "step": 3980 }, { "epoch": 0.3271296220382061, "grad_norm": 13.12759780883789, "learning_rate": 7.591408399669337e-07, "loss": 0.2259, "step": 3990 }, { "epoch": 0.32794949577765026, "grad_norm": 12.080741882324219, "learning_rate": 7.446989179951632e-07, "loss": 0.214, "step": 4000 }, { "epoch": 0.32876936951709435, "grad_norm": 13.813101768493652, "learning_rate": 7.303805321799146e-07, "loss": 0.218, "step": 4010 }, { "epoch": 0.3295892432565385, "grad_norm": 12.327116012573242, "learning_rate": 7.161862710939476e-07, "loss": 0.2295, "step": 4020 }, { "epoch": 0.33040911699598263, "grad_norm": 15.953246116638184, "learning_rate": 7.021167182077403e-07, "loss": 0.2197, "step": 4030 }, { "epoch": 0.3312289907354267, "grad_norm": 19.298919677734375, "learning_rate": 6.881724518655049e-07, "loss": 0.2326, "step": 4040 }, { "epoch": 0.33204886447487086, "grad_norm": 38.68765640258789, "learning_rate": 6.743540452614152e-07, "loss": 0.2303, "step": 4050 }, { "epoch": 0.33204886447487086, "eval_loss": 0.21772576868534088, "eval_runtime": 56.5668, "eval_samples_per_second": 8.839, "eval_steps_per_second": 8.839, "step": 4050 }, { "epoch": 0.332868738214315, "grad_norm": 11.087291717529297, "learning_rate": 6.606620664160438e-07, "loss": 0.2071, "step": 4060 }, { "epoch": 0.3336886119537591, "grad_norm": 50.521053314208984, "learning_rate": 6.470970781530139e-07, "loss": 0.2204, "step": 4070 }, { "epoch": 0.33450848569320324, "grad_norm": 32.14698028564453, "learning_rate": 6.336596380758604e-07, "loss": 0.2466, "step": 4080 }, { "epoch": 0.3353283594326474, "grad_norm": 19.88819694519043, "learning_rate": 6.203502985451152e-07, "loss": 0.2291, "step": 4090 }, { "epoch": 0.33614823317209147, "grad_norm": 11.445552825927734, "learning_rate": 6.071696066555978e-07, "loss": 0.2549, "step": 4100 }, { "epoch": 0.3369681069115356, "grad_norm": 17.117246627807617, "learning_rate": 5.941181042139258e-07, "loss": 0.2077, "step": 4110 }, { "epoch": 0.33778798065097976, "grad_norm": 10.231658935546875, "learning_rate": 5.811963277162466e-07, "loss": 0.2182, "step": 4120 }, { "epoch": 0.3386078543904239, "grad_norm": 14.68455696105957, "learning_rate": 5.684048083261789e-07, "loss": 0.2445, "step": 4130 }, { "epoch": 0.339427728129868, "grad_norm": 22.658329010009766, "learning_rate": 5.557440718529848e-07, "loss": 0.1938, "step": 4140 }, { "epoch": 0.34024760186931213, "grad_norm": 12.441681861877441, "learning_rate": 5.432146387299522e-07, "loss": 0.224, "step": 4150 }, { "epoch": 0.3410674756087563, "grad_norm": 16.301542282104492, "learning_rate": 5.308170239930022e-07, "loss": 0.2092, "step": 4160 }, { "epoch": 0.34188734934820036, "grad_norm": 17.414865493774414, "learning_rate": 5.185517372595187e-07, "loss": 0.2429, "step": 4170 }, { "epoch": 0.3427072230876445, "grad_norm": 37.58354949951172, "learning_rate": 5.064192827073995e-07, "loss": 0.2236, "step": 4180 }, { "epoch": 0.34352709682708865, "grad_norm": 19.772306442260742, "learning_rate": 4.944201590543308e-07, "loss": 0.2209, "step": 4190 }, { "epoch": 0.34434697056653274, "grad_norm": 10.470952987670898, "learning_rate": 4.825548595372898e-07, "loss": 0.2441, "step": 4200 }, { "epoch": 0.34434697056653274, "eval_loss": 0.2149660438299179, "eval_runtime": 55.9997, "eval_samples_per_second": 8.929, "eval_steps_per_second": 8.929, "step": 4200 }, { "epoch": 0.3451668443059769, "grad_norm": 12.9829683303833, "learning_rate": 4.7082387189226646e-07, "loss": 0.2012, "step": 4210 }, { "epoch": 0.345986718045421, "grad_norm": 11.852750778198242, "learning_rate": 4.5922767833421454e-07, "loss": 0.2172, "step": 4220 }, { "epoch": 0.3468065917848651, "grad_norm": 33.68533706665039, "learning_rate": 4.477667555372326e-07, "loss": 0.2114, "step": 4230 }, { "epoch": 0.34762646552430926, "grad_norm": 24.621292114257812, "learning_rate": 4.364415746149678e-07, "loss": 0.2264, "step": 4240 }, { "epoch": 0.3484463392637534, "grad_norm": 23.111419677734375, "learning_rate": 4.2525260110124964e-07, "loss": 0.2146, "step": 4250 }, { "epoch": 0.3492662130031975, "grad_norm": 22.753629684448242, "learning_rate": 4.1420029493095623e-07, "loss": 0.2181, "step": 4260 }, { "epoch": 0.35008608674264163, "grad_norm": 12.422630310058594, "learning_rate": 4.032851104211036e-07, "loss": 0.2059, "step": 4270 }, { "epoch": 0.3509059604820858, "grad_norm": 21.33889389038086, "learning_rate": 3.925074962521762e-07, "loss": 0.2041, "step": 4280 }, { "epoch": 0.35172583422152986, "grad_norm": 21.088577270507812, "learning_rate": 3.818678954496787e-07, "loss": 0.2162, "step": 4290 }, { "epoch": 0.352545707960974, "grad_norm": 14.029748916625977, "learning_rate": 3.713667453659287e-07, "loss": 0.2291, "step": 4300 }, { "epoch": 0.35336558170041815, "grad_norm": 11.585044860839844, "learning_rate": 3.6100447766207473e-07, "loss": 0.2139, "step": 4310 }, { "epoch": 0.35418545543986224, "grad_norm": 13.666373252868652, "learning_rate": 3.5078151829035693e-07, "loss": 0.2311, "step": 4320 }, { "epoch": 0.3550053291793064, "grad_norm": 24.15358543395996, "learning_rate": 3.4069828747659405e-07, "loss": 0.2149, "step": 4330 }, { "epoch": 0.3558252029187505, "grad_norm": 25.829856872558594, "learning_rate": 3.3075519970291144e-07, "loss": 0.2055, "step": 4340 }, { "epoch": 0.3566450766581946, "grad_norm": 23.233440399169922, "learning_rate": 3.209526636907036e-07, "loss": 0.2444, "step": 4350 }, { "epoch": 0.3566450766581946, "eval_loss": 0.2148878425359726, "eval_runtime": 56.223, "eval_samples_per_second": 8.893, "eval_steps_per_second": 8.893, "step": 4350 }, { "epoch": 0.35746495039763876, "grad_norm": 19.731224060058594, "learning_rate": 3.1129108238383095e-07, "loss": 0.2199, "step": 4360 }, { "epoch": 0.3582848241370829, "grad_norm": 23.215808868408203, "learning_rate": 3.017708529320604e-07, "loss": 0.2228, "step": 4370 }, { "epoch": 0.359104697876527, "grad_norm": 17.997251510620117, "learning_rate": 2.923923666747357e-07, "loss": 0.2336, "step": 4380 }, { "epoch": 0.35992457161597113, "grad_norm": 14.64735221862793, "learning_rate": 2.8315600912469477e-07, "loss": 0.2831, "step": 4390 }, { "epoch": 0.3607444453554153, "grad_norm": 18.220691680908203, "learning_rate": 2.740621599524189e-07, "loss": 0.2277, "step": 4400 }, { "epoch": 0.36156431909485937, "grad_norm": 16.92856216430664, "learning_rate": 2.651111929704303e-07, "loss": 0.2139, "step": 4410 }, { "epoch": 0.3623841928343035, "grad_norm": 30.373014450073242, "learning_rate": 2.563034761179223e-07, "loss": 0.2354, "step": 4420 }, { "epoch": 0.36320406657374765, "grad_norm": 16.33125114440918, "learning_rate": 2.476393714456384e-07, "loss": 0.2209, "step": 4430 }, { "epoch": 0.3640239403131918, "grad_norm": 13.93752670288086, "learning_rate": 2.391192351009855e-07, "loss": 0.2285, "step": 4440 }, { "epoch": 0.3648438140526359, "grad_norm": 24.299808502197266, "learning_rate": 2.3074341731339837e-07, "loss": 0.2487, "step": 4450 }, { "epoch": 0.36566368779208, "grad_norm": 15.581805229187012, "learning_rate": 2.225122623799407e-07, "loss": 0.2112, "step": 4460 }, { "epoch": 0.36648356153152417, "grad_norm": 21.24774932861328, "learning_rate": 2.1442610865115135e-07, "loss": 0.2253, "step": 4470 }, { "epoch": 0.36730343527096826, "grad_norm": 20.960872650146484, "learning_rate": 2.0648528851714077e-07, "loss": 0.2208, "step": 4480 }, { "epoch": 0.3681233090104124, "grad_norm": 22.186767578125, "learning_rate": 1.9869012839392064e-07, "loss": 0.218, "step": 4490 }, { "epoch": 0.36894318274985655, "grad_norm": 15.852953910827637, "learning_rate": 1.9104094870999264e-07, "loss": 0.2123, "step": 4500 }, { "epoch": 0.36894318274985655, "eval_loss": 0.21366393566131592, "eval_runtime": 55.673, "eval_samples_per_second": 8.981, "eval_steps_per_second": 8.981, "step": 4500 }, { "epoch": 0.36976305648930063, "grad_norm": 11.23139476776123, "learning_rate": 1.8353806389317428e-07, "loss": 0.2201, "step": 4510 }, { "epoch": 0.3705829302287448, "grad_norm": 15.876472473144531, "learning_rate": 1.761817823576731e-07, "loss": 0.2382, "step": 4520 }, { "epoch": 0.3714028039681889, "grad_norm": 18.092660903930664, "learning_rate": 1.6897240649141125e-07, "loss": 0.2359, "step": 4530 }, { "epoch": 0.372222677707633, "grad_norm": 20.05590057373047, "learning_rate": 1.619102326435923e-07, "loss": 0.2304, "step": 4540 }, { "epoch": 0.37304255144707715, "grad_norm": 14.876965522766113, "learning_rate": 1.5499555111252285e-07, "loss": 0.2305, "step": 4550 }, { "epoch": 0.3738624251865213, "grad_norm": 24.27523422241211, "learning_rate": 1.4822864613367766e-07, "loss": 0.229, "step": 4560 }, { "epoch": 0.3746822989259654, "grad_norm": 36.034820556640625, "learning_rate": 1.4160979586801724e-07, "loss": 0.2099, "step": 4570 }, { "epoch": 0.37550217266540953, "grad_norm": 14.821313858032227, "learning_rate": 1.3513927239055036e-07, "loss": 0.2069, "step": 4580 }, { "epoch": 0.37632204640485367, "grad_norm": 24.151025772094727, "learning_rate": 1.2881734167915425e-07, "loss": 0.2477, "step": 4590 }, { "epoch": 0.37714192014429776, "grad_norm": 34.51681900024414, "learning_rate": 1.2264426360363956e-07, "loss": 0.2169, "step": 4600 }, { "epoch": 0.3779617938837419, "grad_norm": 18.54802894592285, "learning_rate": 1.1662029191506775e-07, "loss": 0.2053, "step": 4610 }, { "epoch": 0.37878166762318605, "grad_norm": 18.75210189819336, "learning_rate": 1.107456742353201e-07, "loss": 0.2313, "step": 4620 }, { "epoch": 0.37960154136263013, "grad_norm": 14.032902717590332, "learning_rate": 1.0502065204692062e-07, "loss": 0.2253, "step": 4630 }, { "epoch": 0.3804214151020743, "grad_norm": 16.711780548095703, "learning_rate": 9.94454606831076e-08, "loss": 0.208, "step": 4640 }, { "epoch": 0.3812412888415184, "grad_norm": 33.53385543823242, "learning_rate": 9.402032931816144e-08, "loss": 0.2256, "step": 4650 }, { "epoch": 0.3812412888415184, "eval_loss": 0.2128845751285553, "eval_runtime": 55.1573, "eval_samples_per_second": 9.065, "eval_steps_per_second": 9.065, "step": 4650 }, { "epoch": 0.3820611625809625, "grad_norm": 9.32500171661377, "learning_rate": 8.874548095798464e-08, "loss": 0.227, "step": 4660 }, { "epoch": 0.38288103632040665, "grad_norm": 12.115835189819336, "learning_rate": 8.362113243093245e-08, "loss": 0.2148, "step": 4670 }, { "epoch": 0.3837009100598508, "grad_norm": 26.36838722229004, "learning_rate": 7.864749437890173e-08, "loss": 0.2228, "step": 4680 }, { "epoch": 0.3845207837992949, "grad_norm": 12.476286888122559, "learning_rate": 7.382477124867282e-08, "loss": 0.2057, "step": 4690 }, { "epoch": 0.38534065753873903, "grad_norm": 15.308034896850586, "learning_rate": 6.915316128350461e-08, "loss": 0.2278, "step": 4700 }, { "epoch": 0.3861605312781832, "grad_norm": 9.208645820617676, "learning_rate": 6.463285651498563e-08, "loss": 0.2227, "step": 4710 }, { "epoch": 0.38698040501762726, "grad_norm": 9.877080917358398, "learning_rate": 6.026404275513875e-08, "loss": 0.2197, "step": 4720 }, { "epoch": 0.3878002787570714, "grad_norm": 16.259761810302734, "learning_rate": 5.604689958878723e-08, "loss": 0.2413, "step": 4730 }, { "epoch": 0.38862015249651555, "grad_norm": 17.41680908203125, "learning_rate": 5.198160036616898e-08, "loss": 0.2159, "step": 4740 }, { "epoch": 0.38944002623595964, "grad_norm": 17.588123321533203, "learning_rate": 4.8068312195811847e-08, "loss": 0.2191, "step": 4750 }, { "epoch": 0.3902598999754038, "grad_norm": 14.38376235961914, "learning_rate": 4.4307195937666194e-08, "loss": 0.2332, "step": 4760 }, { "epoch": 0.3910797737148479, "grad_norm": 12.54135799407959, "learning_rate": 4.069840619648935e-08, "loss": 0.2176, "step": 4770 }, { "epoch": 0.39189964745429207, "grad_norm": 20.703615188598633, "learning_rate": 3.72420913154932e-08, "loss": 0.2204, "step": 4780 }, { "epoch": 0.39271952119373615, "grad_norm": 28.904329299926758, "learning_rate": 3.3938393370244876e-08, "loss": 0.2389, "step": 4790 }, { "epoch": 0.3935393949331803, "grad_norm": 15.144803047180176, "learning_rate": 3.078744816282731e-08, "loss": 0.2306, "step": 4800 }, { "epoch": 0.3935393949331803, "eval_loss": 0.2134290486574173, "eval_runtime": 55.5119, "eval_samples_per_second": 9.007, "eval_steps_per_second": 9.007, "step": 4800 }, { "epoch": 0.39435926867262444, "grad_norm": 18.657732009887695, "learning_rate": 2.778938521625613e-08, "loss": 0.2454, "step": 4810 }, { "epoch": 0.39517914241206853, "grad_norm": 20.660715103149414, "learning_rate": 2.4944327769157314e-08, "loss": 0.2211, "step": 4820 }, { "epoch": 0.3959990161515127, "grad_norm": 13.545777320861816, "learning_rate": 2.225239277069871e-08, "loss": 0.1803, "step": 4830 }, { "epoch": 0.3968188898909568, "grad_norm": 20.064281463623047, "learning_rate": 1.971369087578473e-08, "loss": 0.2226, "step": 4840 }, { "epoch": 0.3976387636304009, "grad_norm": 11.630465507507324, "learning_rate": 1.7328326440506637e-08, "loss": 0.2117, "step": 4850 }, { "epoch": 0.39845863736984505, "grad_norm": 16.434839248657227, "learning_rate": 1.5096397517853497e-08, "loss": 0.2381, "step": 4860 }, { "epoch": 0.3992785111092892, "grad_norm": 14.184981346130371, "learning_rate": 1.3017995853681631e-08, "loss": 0.2262, "step": 4870 }, { "epoch": 0.4000983848487333, "grad_norm": 17.047590255737305, "learning_rate": 1.1093206882943076e-08, "loss": 0.2164, "step": 4880 }, { "epoch": 0.4009182585881774, "grad_norm": 15.3792142868042, "learning_rate": 9.322109726172952e-09, "loss": 0.2288, "step": 4890 }, { "epoch": 0.40173813232762157, "grad_norm": 14.833084106445312, "learning_rate": 7.704777186238744e-09, "loss": 0.209, "step": 4900 }, { "epoch": 0.40255800606706565, "grad_norm": 22.476787567138672, "learning_rate": 6.241275745346859e-09, "loss": 0.2118, "step": 4910 }, { "epoch": 0.4033778798065098, "grad_norm": 14.301311492919922, "learning_rate": 4.931665562308563e-09, "loss": 0.2222, "step": 4920 }, { "epoch": 0.40419775354595394, "grad_norm": 13.92874813079834, "learning_rate": 3.7760004700702905e-09, "loss": 0.2283, "step": 4930 }, { "epoch": 0.40501762728539803, "grad_norm": 20.181961059570312, "learning_rate": 2.7743279734962494e-09, "loss": 0.2132, "step": 4940 }, { "epoch": 0.4058375010248422, "grad_norm": 22.093725204467773, "learning_rate": 1.926689247420399e-09, "loss": 0.2127, "step": 4950 }, { "epoch": 0.4058375010248422, "eval_loss": 0.2128431349992752, "eval_runtime": 55.4771, "eval_samples_per_second": 9.013, "eval_steps_per_second": 9.013, "step": 4950 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.911768952965693e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }