{ |
|
"best_metric": 0.2128431349992752, |
|
"best_model_checkpoint": "./output/checkpoint-4950", |
|
"epoch": 0.4058375010248422, |
|
"eval_steps": 150, |
|
"global_step": 4950, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008198737394441257, |
|
"grad_norm": 11.523909568786621, |
|
"learning_rate": 7.500000000000001e-07, |
|
"loss": 0.39, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0016397474788882513, |
|
"grad_norm": 9.020567893981934, |
|
"learning_rate": 1.5000000000000002e-06, |
|
"loss": 0.3576, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002459621218332377, |
|
"grad_norm": 9.512846946716309, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.3874, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0032794949577765026, |
|
"grad_norm": 39.97313690185547, |
|
"learning_rate": 3.0000000000000005e-06, |
|
"loss": 0.3568, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004099368697220628, |
|
"grad_norm": 12.515055656433105, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.3314, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004919242436664754, |
|
"grad_norm": 11.462284088134766, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.3641, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.005739116176108879, |
|
"grad_norm": 18.380435943603516, |
|
"learning_rate": 5.2500000000000006e-06, |
|
"loss": 0.348, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.006558989915553005, |
|
"grad_norm": 13.468473434448242, |
|
"learning_rate": 6.000000000000001e-06, |
|
"loss": 0.348, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.007378863654997131, |
|
"grad_norm": 10.285468101501465, |
|
"learning_rate": 6.7500000000000014e-06, |
|
"loss": 0.3352, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.008198737394441257, |
|
"grad_norm": 17.571596145629883, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.3438, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009018611133885381, |
|
"grad_norm": 19.84699249267578, |
|
"learning_rate": 7.499922926093874e-06, |
|
"loss": 0.3253, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.009838484873329507, |
|
"grad_norm": 16.91347885131836, |
|
"learning_rate": 7.499691707543699e-06, |
|
"loss": 0.3328, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.010658358612773634, |
|
"grad_norm": 11.190834999084473, |
|
"learning_rate": 7.499306353853963e-06, |
|
"loss": 0.3308, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.011478232352217758, |
|
"grad_norm": 11.117925643920898, |
|
"learning_rate": 7.49876688086505e-06, |
|
"loss": 0.3401, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.012298106091661884, |
|
"grad_norm": 12.28294563293457, |
|
"learning_rate": 7.4980733107525805e-06, |
|
"loss": 0.303, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.012298106091661884, |
|
"eval_loss": 0.32195183634757996, |
|
"eval_runtime": 58.0333, |
|
"eval_samples_per_second": 8.616, |
|
"eval_steps_per_second": 8.616, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01311797983110601, |
|
"grad_norm": 12.885525703430176, |
|
"learning_rate": 7.4972256720265044e-06, |
|
"loss": 0.3595, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.013937853570550135, |
|
"grad_norm": 12.438248634338379, |
|
"learning_rate": 7.496223999529932e-06, |
|
"loss": 0.3361, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.014757727309994261, |
|
"grad_norm": 14.641826629638672, |
|
"learning_rate": 7.4950683344376926e-06, |
|
"loss": 0.3296, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.015577601049438386, |
|
"grad_norm": 9.628592491149902, |
|
"learning_rate": 7.4937587242546544e-06, |
|
"loss": 0.3225, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.016397474788882514, |
|
"grad_norm": 15.733799934387207, |
|
"learning_rate": 7.492295222813762e-06, |
|
"loss": 0.3284, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.017217348528326636, |
|
"grad_norm": 12.937703132629395, |
|
"learning_rate": 7.490677890273828e-06, |
|
"loss": 0.3434, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.018037222267770762, |
|
"grad_norm": 16.046674728393555, |
|
"learning_rate": 7.488906793117058e-06, |
|
"loss": 0.3519, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01885709600721489, |
|
"grad_norm": 11.472362518310547, |
|
"learning_rate": 7.486982004146319e-06, |
|
"loss": 0.3587, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.019676969746659015, |
|
"grad_norm": 15.215801239013672, |
|
"learning_rate": 7.484903602482148e-06, |
|
"loss": 0.3197, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02049684348610314, |
|
"grad_norm": 11.658143997192383, |
|
"learning_rate": 7.4826716735594945e-06, |
|
"loss": 0.3114, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.021316717225547267, |
|
"grad_norm": 7.448172092437744, |
|
"learning_rate": 7.480286309124216e-06, |
|
"loss": 0.2912, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02213659096499139, |
|
"grad_norm": 12.367362022399902, |
|
"learning_rate": 7.477747607229302e-06, |
|
"loss": 0.3167, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.022956464704435516, |
|
"grad_norm": 13.513625144958496, |
|
"learning_rate": 7.475055672230844e-06, |
|
"loss": 0.3093, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.023776338443879642, |
|
"grad_norm": 19.878536224365234, |
|
"learning_rate": 7.472210614783745e-06, |
|
"loss": 0.3256, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.02459621218332377, |
|
"grad_norm": 22.84262466430664, |
|
"learning_rate": 7.469212551837173e-06, |
|
"loss": 0.3104, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02459621218332377, |
|
"eval_loss": 0.3093046247959137, |
|
"eval_runtime": 58.7245, |
|
"eval_samples_per_second": 8.514, |
|
"eval_steps_per_second": 8.514, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.025416085922767895, |
|
"grad_norm": 9.043919563293457, |
|
"learning_rate": 7.4660616066297565e-06, |
|
"loss": 0.3089, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.02623595966221202, |
|
"grad_norm": 6.400809288024902, |
|
"learning_rate": 7.462757908684509e-06, |
|
"loss": 0.2959, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.027055833401656144, |
|
"grad_norm": 19.60870361328125, |
|
"learning_rate": 7.459301593803512e-06, |
|
"loss": 0.3251, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.02787570714110027, |
|
"grad_norm": 8.441984176635742, |
|
"learning_rate": 7.455692804062335e-06, |
|
"loss": 0.3108, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.028695580880544396, |
|
"grad_norm": 20.126216888427734, |
|
"learning_rate": 7.451931687804189e-06, |
|
"loss": 0.3152, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.029515454619988522, |
|
"grad_norm": 11.44316291809082, |
|
"learning_rate": 7.448018399633831e-06, |
|
"loss": 0.3302, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03033532835943265, |
|
"grad_norm": 10.247148513793945, |
|
"learning_rate": 7.443953100411214e-06, |
|
"loss": 0.289, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03115520209887677, |
|
"grad_norm": 10.746755599975586, |
|
"learning_rate": 7.439735957244862e-06, |
|
"loss": 0.2886, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0319750758383209, |
|
"grad_norm": 19.19182014465332, |
|
"learning_rate": 7.435367143485015e-06, |
|
"loss": 0.325, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03279494957776503, |
|
"grad_norm": 12.273555755615234, |
|
"learning_rate": 7.430846838716496e-06, |
|
"loss": 0.3107, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03361482331720915, |
|
"grad_norm": 13.099973678588867, |
|
"learning_rate": 7.426175228751328e-06, |
|
"loss": 0.3103, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03443469705665327, |
|
"grad_norm": 20.098796844482422, |
|
"learning_rate": 7.421352505621099e-06, |
|
"loss": 0.284, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0352545707960974, |
|
"grad_norm": 10.289865493774414, |
|
"learning_rate": 7.416378867569069e-06, |
|
"loss": 0.3337, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.036074444535541525, |
|
"grad_norm": 13.34965705871582, |
|
"learning_rate": 7.411254519042017e-06, |
|
"loss": 0.3085, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.036894318274985655, |
|
"grad_norm": 11.321673393249512, |
|
"learning_rate": 7.4059796706818396e-06, |
|
"loss": 0.3043, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.036894318274985655, |
|
"eval_loss": 0.2889861762523651, |
|
"eval_runtime": 56.9295, |
|
"eval_samples_per_second": 8.783, |
|
"eval_steps_per_second": 8.783, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.03771419201442978, |
|
"grad_norm": 15.978049278259277, |
|
"learning_rate": 7.400554539316894e-06, |
|
"loss": 0.2942, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0385340657538739, |
|
"grad_norm": 16.420135498046875, |
|
"learning_rate": 7.394979347953081e-06, |
|
"loss": 0.3139, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.03935393949331803, |
|
"grad_norm": 15.941482543945312, |
|
"learning_rate": 7.389254325764681e-06, |
|
"loss": 0.3018, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04017381323276215, |
|
"grad_norm": 9.359827041625977, |
|
"learning_rate": 7.383379708084934e-06, |
|
"loss": 0.3048, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04099368697220628, |
|
"grad_norm": 11.175127983093262, |
|
"learning_rate": 7.377355736396362e-06, |
|
"loss": 0.3001, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.041813560711650405, |
|
"grad_norm": 18.719478607177734, |
|
"learning_rate": 7.371182658320847e-06, |
|
"loss": 0.3105, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.042633434451094535, |
|
"grad_norm": 9.761693954467773, |
|
"learning_rate": 7.36486072760945e-06, |
|
"loss": 0.3024, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04345330819053866, |
|
"grad_norm": 15.880053520202637, |
|
"learning_rate": 7.358390204131984e-06, |
|
"loss": 0.3099, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.04427318192998278, |
|
"grad_norm": 10.00100326538086, |
|
"learning_rate": 7.3517713538663235e-06, |
|
"loss": 0.3215, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.04509305566942691, |
|
"grad_norm": 7.478984355926514, |
|
"learning_rate": 7.345004448887478e-06, |
|
"loss": 0.2974, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04591292940887103, |
|
"grad_norm": 9.254852294921875, |
|
"learning_rate": 7.3380897673564085e-06, |
|
"loss": 0.3126, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.04673280314831516, |
|
"grad_norm": 13.706809997558594, |
|
"learning_rate": 7.33102759350859e-06, |
|
"loss": 0.3018, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.047552676887759285, |
|
"grad_norm": 16.57872200012207, |
|
"learning_rate": 7.323818217642328e-06, |
|
"loss": 0.2904, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.04837255062720341, |
|
"grad_norm": 14.819424629211426, |
|
"learning_rate": 7.316461936106827e-06, |
|
"loss": 0.2855, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.04919242436664754, |
|
"grad_norm": 17.543973922729492, |
|
"learning_rate": 7.3089590512900084e-06, |
|
"loss": 0.3169, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04919242436664754, |
|
"eval_loss": 0.2996714413166046, |
|
"eval_runtime": 58.2745, |
|
"eval_samples_per_second": 8.58, |
|
"eval_steps_per_second": 8.58, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05001229810609166, |
|
"grad_norm": 10.767305374145508, |
|
"learning_rate": 7.301309871606081e-06, |
|
"loss": 0.3011, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.05083217184553579, |
|
"grad_norm": 6.571865081787109, |
|
"learning_rate": 7.293514711482861e-06, |
|
"loss": 0.2783, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05165204558497991, |
|
"grad_norm": 12.295404434204102, |
|
"learning_rate": 7.285573891348849e-06, |
|
"loss": 0.2829, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.05247191932442404, |
|
"grad_norm": 12.576509475708008, |
|
"learning_rate": 7.27748773762006e-06, |
|
"loss": 0.3021, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.053291793063868165, |
|
"grad_norm": 7.258118629455566, |
|
"learning_rate": 7.269256582686603e-06, |
|
"loss": 0.3041, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.05411166680331229, |
|
"grad_norm": 14.7495756149292, |
|
"learning_rate": 7.260880764899016e-06, |
|
"loss": 0.285, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.05493154054275642, |
|
"grad_norm": 18.141632080078125, |
|
"learning_rate": 7.252360628554363e-06, |
|
"loss": 0.2916, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.05575141428220054, |
|
"grad_norm": 18.141878128051758, |
|
"learning_rate": 7.243696523882079e-06, |
|
"loss": 0.3007, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.05657128802164467, |
|
"grad_norm": 13.596381187438965, |
|
"learning_rate": 7.2348888070295705e-06, |
|
"loss": 0.2627, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.05739116176108879, |
|
"grad_norm": 14.028800964355469, |
|
"learning_rate": 7.225937840047583e-06, |
|
"loss": 0.2959, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.058211035500532915, |
|
"grad_norm": 19.28914451599121, |
|
"learning_rate": 7.216843990875307e-06, |
|
"loss": 0.3088, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.059030909239977045, |
|
"grad_norm": 10.676041603088379, |
|
"learning_rate": 7.207607633325266e-06, |
|
"loss": 0.2762, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.05985078297942117, |
|
"grad_norm": 9.311237335205078, |
|
"learning_rate": 7.198229147067941e-06, |
|
"loss": 0.313, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.0606706567188653, |
|
"grad_norm": 12.335597038269043, |
|
"learning_rate": 7.18870891761617e-06, |
|
"loss": 0.2797, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06149053045830942, |
|
"grad_norm": 11.885544776916504, |
|
"learning_rate": 7.1790473363092974e-06, |
|
"loss": 0.2681, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06149053045830942, |
|
"eval_loss": 0.3024304211139679, |
|
"eval_runtime": 57.0493, |
|
"eval_samples_per_second": 8.764, |
|
"eval_steps_per_second": 8.764, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06231040419775354, |
|
"grad_norm": 12.44359016418457, |
|
"learning_rate": 7.169244800297089e-06, |
|
"loss": 0.311, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.06313027793719767, |
|
"grad_norm": 18.710712432861328, |
|
"learning_rate": 7.159301712523407e-06, |
|
"loss": 0.2949, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.0639501516766418, |
|
"grad_norm": 9.658717155456543, |
|
"learning_rate": 7.149218481709644e-06, |
|
"loss": 0.2852, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.06477002541608592, |
|
"grad_norm": 10.276803970336914, |
|
"learning_rate": 7.1389955223379266e-06, |
|
"loss": 0.2818, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.06558989915553005, |
|
"grad_norm": 13.862250328063965, |
|
"learning_rate": 7.128633254634072e-06, |
|
"loss": 0.2834, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.06640977289497417, |
|
"grad_norm": 17.020177841186523, |
|
"learning_rate": 7.118132104550322e-06, |
|
"loss": 0.2677, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0672296466344183, |
|
"grad_norm": 18.547590255737305, |
|
"learning_rate": 7.107492503747826e-06, |
|
"loss": 0.2898, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.06804952037386243, |
|
"grad_norm": 15.957967758178711, |
|
"learning_rate": 7.096714889578898e-06, |
|
"loss": 0.326, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.06886939411330655, |
|
"grad_norm": 24.1992130279541, |
|
"learning_rate": 7.085799705069046e-06, |
|
"loss": 0.2677, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.06968926785275067, |
|
"grad_norm": 12.799731254577637, |
|
"learning_rate": 7.0747473988987515e-06, |
|
"loss": 0.2806, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.0705091415921948, |
|
"grad_norm": 18.750246047973633, |
|
"learning_rate": 7.063558425385033e-06, |
|
"loss": 0.2937, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07132901533163893, |
|
"grad_norm": 13.083860397338867, |
|
"learning_rate": 7.052233244462769e-06, |
|
"loss": 0.2957, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.07214888907108305, |
|
"grad_norm": 11.227791786193848, |
|
"learning_rate": 7.040772321665788e-06, |
|
"loss": 0.2855, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.07296876281052718, |
|
"grad_norm": 8.911324501037598, |
|
"learning_rate": 7.029176128107734e-06, |
|
"loss": 0.3105, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.07378863654997131, |
|
"grad_norm": 17.020790100097656, |
|
"learning_rate": 7.017445140462711e-06, |
|
"loss": 0.2728, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07378863654997131, |
|
"eval_loss": 0.2869480550289154, |
|
"eval_runtime": 58.9095, |
|
"eval_samples_per_second": 8.488, |
|
"eval_steps_per_second": 8.488, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07460851028941543, |
|
"grad_norm": 14.960102081298828, |
|
"learning_rate": 7.00557984094567e-06, |
|
"loss": 0.2955, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.07542838402885955, |
|
"grad_norm": 8.271307945251465, |
|
"learning_rate": 6.993580717292601e-06, |
|
"loss": 0.2666, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.07624825776830368, |
|
"grad_norm": 8.779189109802246, |
|
"learning_rate": 6.981448262740483e-06, |
|
"loss": 0.2938, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.0770681315077478, |
|
"grad_norm": 9.497313499450684, |
|
"learning_rate": 6.969182976006999e-06, |
|
"loss": 0.2875, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.07788800524719193, |
|
"grad_norm": 13.439544677734375, |
|
"learning_rate": 6.95678536127005e-06, |
|
"loss": 0.2893, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.07870787898663606, |
|
"grad_norm": 10.986952781677246, |
|
"learning_rate": 6.944255928147017e-06, |
|
"loss": 0.29, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.07952775272608019, |
|
"grad_norm": 14.666671752929688, |
|
"learning_rate": 6.931595191673823e-06, |
|
"loss": 0.2798, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.0803476264655243, |
|
"grad_norm": 9.045489311218262, |
|
"learning_rate": 6.9188036722837555e-06, |
|
"loss": 0.2526, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08116750020496843, |
|
"grad_norm": 12.083099365234375, |
|
"learning_rate": 6.905881895786076e-06, |
|
"loss": 0.2825, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.08198737394441256, |
|
"grad_norm": 20.973670959472656, |
|
"learning_rate": 6.892830393344403e-06, |
|
"loss": 0.2703, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08280724768385668, |
|
"grad_norm": 12.959758758544922, |
|
"learning_rate": 6.879649701454886e-06, |
|
"loss": 0.2766, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.08362712142330081, |
|
"grad_norm": 11.118098258972168, |
|
"learning_rate": 6.866340361924141e-06, |
|
"loss": 0.2927, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.08444699516274494, |
|
"grad_norm": 12.703455924987793, |
|
"learning_rate": 6.852902921846988e-06, |
|
"loss": 0.2468, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.08526686890218907, |
|
"grad_norm": 33.15513229370117, |
|
"learning_rate": 6.8393379335839565e-06, |
|
"loss": 0.2845, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.08608674264163318, |
|
"grad_norm": 12.013687133789062, |
|
"learning_rate": 6.825645954738586e-06, |
|
"loss": 0.2879, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.08608674264163318, |
|
"eval_loss": 0.2693183720111847, |
|
"eval_runtime": 56.9849, |
|
"eval_samples_per_second": 8.774, |
|
"eval_steps_per_second": 8.774, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.08690661638107731, |
|
"grad_norm": 10.128811836242676, |
|
"learning_rate": 6.811827548134495e-06, |
|
"loss": 0.2873, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.08772649012052144, |
|
"grad_norm": 10.001947402954102, |
|
"learning_rate": 6.797883281792261e-06, |
|
"loss": 0.2931, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.08854636385996556, |
|
"grad_norm": 13.15841293334961, |
|
"learning_rate": 6.783813728906054e-06, |
|
"loss": 0.3, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.08936623759940969, |
|
"grad_norm": 8.157013893127441, |
|
"learning_rate": 6.769619467820086e-06, |
|
"loss": 0.2692, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.09018611133885382, |
|
"grad_norm": 8.676292419433594, |
|
"learning_rate": 6.755301082004838e-06, |
|
"loss": 0.3111, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09100598507829795, |
|
"grad_norm": 14.835556030273438, |
|
"learning_rate": 6.740859160033068e-06, |
|
"loss": 0.2932, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.09182585881774206, |
|
"grad_norm": 14.752832412719727, |
|
"learning_rate": 6.726294295555623e-06, |
|
"loss": 0.2942, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.0926457325571862, |
|
"grad_norm": 9.42294979095459, |
|
"learning_rate": 6.711607087277034e-06, |
|
"loss": 0.2807, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.09346560629663032, |
|
"grad_norm": 6.576030731201172, |
|
"learning_rate": 6.69679813893091e-06, |
|
"loss": 0.2656, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.09428548003607444, |
|
"grad_norm": 14.54617977142334, |
|
"learning_rate": 6.681868059255113e-06, |
|
"loss": 0.2708, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.09510535377551857, |
|
"grad_norm": 19.004695892333984, |
|
"learning_rate": 6.666817461966741e-06, |
|
"loss": 0.2974, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.0959252275149627, |
|
"grad_norm": 13.359691619873047, |
|
"learning_rate": 6.651646965736902e-06, |
|
"loss": 0.2641, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.09674510125440682, |
|
"grad_norm": 9.031187057495117, |
|
"learning_rate": 6.636357194165274e-06, |
|
"loss": 0.2794, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.09756497499385094, |
|
"grad_norm": 11.242755889892578, |
|
"learning_rate": 6.620948775754481e-06, |
|
"loss": 0.2708, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.09838484873329507, |
|
"grad_norm": 9.727982521057129, |
|
"learning_rate": 6.605422343884255e-06, |
|
"loss": 0.2936, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09838484873329507, |
|
"eval_loss": 0.2741548418998718, |
|
"eval_runtime": 56.2393, |
|
"eval_samples_per_second": 8.891, |
|
"eval_steps_per_second": 8.891, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.0992047224727392, |
|
"grad_norm": 11.938862800598145, |
|
"learning_rate": 6.589778536785396e-06, |
|
"loss": 0.2776, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.10002459621218332, |
|
"grad_norm": 9.253863334655762, |
|
"learning_rate": 6.5740179975135426e-06, |
|
"loss": 0.2695, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.10084446995162745, |
|
"grad_norm": 13.18783950805664, |
|
"learning_rate": 6.5581413739227314e-06, |
|
"loss": 0.2863, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.10166434369107158, |
|
"grad_norm": 10.108220100402832, |
|
"learning_rate": 6.542149318638777e-06, |
|
"loss": 0.2831, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.1024842174305157, |
|
"grad_norm": 13.539487838745117, |
|
"learning_rate": 6.526042489032434e-06, |
|
"loss": 0.2626, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.10330409116995982, |
|
"grad_norm": 9.928237915039062, |
|
"learning_rate": 6.509821547192383e-06, |
|
"loss": 0.2706, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.10412396490940395, |
|
"grad_norm": 10.978721618652344, |
|
"learning_rate": 6.493487159898006e-06, |
|
"loss": 0.2695, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.10494383864884808, |
|
"grad_norm": 9.98459243774414, |
|
"learning_rate": 6.477039998591991e-06, |
|
"loss": 0.2801, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1057637123882922, |
|
"grad_norm": 12.930992126464844, |
|
"learning_rate": 6.460480739352719e-06, |
|
"loss": 0.2842, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.10658358612773633, |
|
"grad_norm": 12.851746559143066, |
|
"learning_rate": 6.4438100628664795e-06, |
|
"loss": 0.2635, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.10740345986718046, |
|
"grad_norm": 10.791857719421387, |
|
"learning_rate": 6.4270286543994874e-06, |
|
"loss": 0.2947, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.10822333360662457, |
|
"grad_norm": 9.770176887512207, |
|
"learning_rate": 6.410137203769718e-06, |
|
"loss": 0.2606, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.1090432073460687, |
|
"grad_norm": 17.897979736328125, |
|
"learning_rate": 6.393136405318545e-06, |
|
"loss": 0.2868, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.10986308108551283, |
|
"grad_norm": 19.892559051513672, |
|
"learning_rate": 6.376026957882207e-06, |
|
"loss": 0.2605, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.11068295482495695, |
|
"grad_norm": 9.193521499633789, |
|
"learning_rate": 6.3588095647630754e-06, |
|
"loss": 0.2454, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.11068295482495695, |
|
"eval_loss": 0.2674501836299896, |
|
"eval_runtime": 56.3954, |
|
"eval_samples_per_second": 8.866, |
|
"eval_steps_per_second": 8.866, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.11150282856440108, |
|
"grad_norm": 15.698138236999512, |
|
"learning_rate": 6.341484933700744e-06, |
|
"loss": 0.2639, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.11232270230384521, |
|
"grad_norm": 11.653697967529297, |
|
"learning_rate": 6.32405377684294e-06, |
|
"loss": 0.2711, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.11314257604328934, |
|
"grad_norm": 10.41117000579834, |
|
"learning_rate": 6.306516810716249e-06, |
|
"loss": 0.274, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.11396244978273345, |
|
"grad_norm": 17.14838981628418, |
|
"learning_rate": 6.288874756196662e-06, |
|
"loss": 0.2919, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.11478232352217758, |
|
"grad_norm": 12.094561576843262, |
|
"learning_rate": 6.271128338479939e-06, |
|
"loss": 0.272, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.11560219726162171, |
|
"grad_norm": 7.186673641204834, |
|
"learning_rate": 6.253278287051806e-06, |
|
"loss": 0.2614, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.11642207100106583, |
|
"grad_norm": 27.63665008544922, |
|
"learning_rate": 6.235325335657962e-06, |
|
"loss": 0.2581, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.11724194474050996, |
|
"grad_norm": 9.12143611907959, |
|
"learning_rate": 6.217270222273923e-06, |
|
"loss": 0.2497, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.11806181847995409, |
|
"grad_norm": 10.814976692199707, |
|
"learning_rate": 6.1991136890746825e-06, |
|
"loss": 0.2659, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.11888169221939822, |
|
"grad_norm": 13.897311210632324, |
|
"learning_rate": 6.180856482404208e-06, |
|
"loss": 0.2575, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.11970156595884233, |
|
"grad_norm": 14.34624195098877, |
|
"learning_rate": 6.162499352744754e-06, |
|
"loss": 0.276, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.12052143969828646, |
|
"grad_norm": 15.839101791381836, |
|
"learning_rate": 6.144043054686022e-06, |
|
"loss": 0.267, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.1213413134377306, |
|
"grad_norm": 13.110719680786133, |
|
"learning_rate": 6.125488346894139e-06, |
|
"loss": 0.2777, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.12216118717717471, |
|
"grad_norm": 11.638336181640625, |
|
"learning_rate": 6.106835992080464e-06, |
|
"loss": 0.2454, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.12298106091661884, |
|
"grad_norm": 12.756601333618164, |
|
"learning_rate": 6.088086756970252e-06, |
|
"loss": 0.2605, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.12298106091661884, |
|
"eval_loss": 0.2679287791252136, |
|
"eval_runtime": 56.0794, |
|
"eval_samples_per_second": 8.916, |
|
"eval_steps_per_second": 8.916, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.12380093465606297, |
|
"grad_norm": 20.72138214111328, |
|
"learning_rate": 6.0692414122711184e-06, |
|
"loss": 0.2593, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.12462080839550708, |
|
"grad_norm": 9.595439910888672, |
|
"learning_rate": 6.050300732641376e-06, |
|
"loss": 0.2719, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.12544068213495121, |
|
"grad_norm": 16.999011993408203, |
|
"learning_rate": 6.0312654966581755e-06, |
|
"loss": 0.2885, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.12626055587439533, |
|
"grad_norm": 14.768747329711914, |
|
"learning_rate": 6.012136486785512e-06, |
|
"loss": 0.2702, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.12708042961383947, |
|
"grad_norm": 8.815911293029785, |
|
"learning_rate": 5.992914489342061e-06, |
|
"loss": 0.2507, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.1279003033532836, |
|
"grad_norm": 20.083023071289062, |
|
"learning_rate": 5.9736002944688474e-06, |
|
"loss": 0.2632, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.12872017709272773, |
|
"grad_norm": 17.51641082763672, |
|
"learning_rate": 5.954194696096775e-06, |
|
"loss": 0.2937, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.12954005083217185, |
|
"grad_norm": 9.186761856079102, |
|
"learning_rate": 5.9346984919139865e-06, |
|
"loss": 0.2611, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.13035992457161596, |
|
"grad_norm": 13.085734367370605, |
|
"learning_rate": 5.9151124833330745e-06, |
|
"loss": 0.2507, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.1311797983110601, |
|
"grad_norm": 13.729114532470703, |
|
"learning_rate": 5.895437475458137e-06, |
|
"loss": 0.2774, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.13199967205050422, |
|
"grad_norm": 19.03725242614746, |
|
"learning_rate": 5.875674277051688e-06, |
|
"loss": 0.2687, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.13281954578994834, |
|
"grad_norm": 15.545515060424805, |
|
"learning_rate": 5.855823700501406e-06, |
|
"loss": 0.2765, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.13363941952939248, |
|
"grad_norm": 11.668421745300293, |
|
"learning_rate": 5.835886561786744e-06, |
|
"loss": 0.2682, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.1344592932688366, |
|
"grad_norm": 8.778451919555664, |
|
"learning_rate": 5.815863680445385e-06, |
|
"loss": 0.2347, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.13527916700828072, |
|
"grad_norm": 5.889225959777832, |
|
"learning_rate": 5.795755879539558e-06, |
|
"loss": 0.2709, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.13527916700828072, |
|
"eval_loss": 0.25923365354537964, |
|
"eval_runtime": 56.2341, |
|
"eval_samples_per_second": 8.891, |
|
"eval_steps_per_second": 8.891, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.13609904074772486, |
|
"grad_norm": 12.518867492675781, |
|
"learning_rate": 5.775563985622202e-06, |
|
"loss": 0.2833, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.13691891448716897, |
|
"grad_norm": 14.924880027770996, |
|
"learning_rate": 5.755288828702987e-06, |
|
"loss": 0.2863, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.1377387882266131, |
|
"grad_norm": 16.47811508178711, |
|
"learning_rate": 5.734931242214204e-06, |
|
"loss": 0.2596, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.13855866196605723, |
|
"grad_norm": 13.941671371459961, |
|
"learning_rate": 5.7144920629764955e-06, |
|
"loss": 0.2819, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.13937853570550135, |
|
"grad_norm": 16.261932373046875, |
|
"learning_rate": 5.693972131164471e-06, |
|
"loss": 0.303, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.14019840944494547, |
|
"grad_norm": 12.289247512817383, |
|
"learning_rate": 5.673372290272149e-06, |
|
"loss": 0.2855, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.1410182831843896, |
|
"grad_norm": 8.7142915725708, |
|
"learning_rate": 5.652693387078309e-06, |
|
"loss": 0.2615, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.14183815692383372, |
|
"grad_norm": 16.864688873291016, |
|
"learning_rate": 5.631936271611667e-06, |
|
"loss": 0.2813, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.14265803066327787, |
|
"grad_norm": 16.40870475769043, |
|
"learning_rate": 5.611101797115939e-06, |
|
"loss": 0.275, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.14347790440272198, |
|
"grad_norm": 14.436688423156738, |
|
"learning_rate": 5.5901908200147685e-06, |
|
"loss": 0.2788, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.1442977781421661, |
|
"grad_norm": 11.943658828735352, |
|
"learning_rate": 5.56920419987652e-06, |
|
"loss": 0.2805, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.14511765188161024, |
|
"grad_norm": 14.252999305725098, |
|
"learning_rate": 5.5481427993789534e-06, |
|
"loss": 0.2806, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.14593752562105436, |
|
"grad_norm": 11.182486534118652, |
|
"learning_rate": 5.527007484273746e-06, |
|
"loss": 0.2675, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.14675739936049848, |
|
"grad_norm": 12.846651077270508, |
|
"learning_rate": 5.5057991233509225e-06, |
|
"loss": 0.2744, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.14757727309994262, |
|
"grad_norm": 9.701010704040527, |
|
"learning_rate": 5.484518588403134e-06, |
|
"loss": 0.2808, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.14757727309994262, |
|
"eval_loss": 0.2612378001213074, |
|
"eval_runtime": 57.022, |
|
"eval_samples_per_second": 8.769, |
|
"eval_steps_per_second": 8.769, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.14839714683938673, |
|
"grad_norm": 7.793675422668457, |
|
"learning_rate": 5.463166754189819e-06, |
|
"loss": 0.27, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.14921702057883085, |
|
"grad_norm": 13.162193298339844, |
|
"learning_rate": 5.441744498401255e-06, |
|
"loss": 0.2574, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.150036894318275, |
|
"grad_norm": 15.428301811218262, |
|
"learning_rate": 5.4202527016224725e-06, |
|
"loss": 0.2675, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.1508567680577191, |
|
"grad_norm": 24.684080123901367, |
|
"learning_rate": 5.398692247297059e-06, |
|
"loss": 0.2916, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.15167664179716323, |
|
"grad_norm": 7.947139263153076, |
|
"learning_rate": 5.377064021690844e-06, |
|
"loss": 0.2841, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.15249651553660737, |
|
"grad_norm": 11.595500946044922, |
|
"learning_rate": 5.355368913855472e-06, |
|
"loss": 0.2562, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.15331638927605148, |
|
"grad_norm": 11.803101539611816, |
|
"learning_rate": 5.333607815591851e-06, |
|
"loss": 0.2292, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.1541362630154956, |
|
"grad_norm": 17.95461654663086, |
|
"learning_rate": 5.311781621413497e-06, |
|
"loss": 0.2787, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.15495613675493974, |
|
"grad_norm": 25.276002883911133, |
|
"learning_rate": 5.289891228509769e-06, |
|
"loss": 0.2889, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.15577601049438386, |
|
"grad_norm": 8.79496955871582, |
|
"learning_rate": 5.267937536708977e-06, |
|
"loss": 0.2667, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.156595884233828, |
|
"grad_norm": 10.413036346435547, |
|
"learning_rate": 5.245921448441407e-06, |
|
"loss": 0.2823, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.15741575797327212, |
|
"grad_norm": 11.163688659667969, |
|
"learning_rate": 5.223843868702214e-06, |
|
"loss": 0.2655, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.15823563171271623, |
|
"grad_norm": 16.093170166015625, |
|
"learning_rate": 5.201705705014231e-06, |
|
"loss": 0.2709, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.15905550545216038, |
|
"grad_norm": 18.966991424560547, |
|
"learning_rate": 5.1795078673906575e-06, |
|
"loss": 0.2593, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.1598753791916045, |
|
"grad_norm": 12.139580726623535, |
|
"learning_rate": 5.1572512682976546e-06, |
|
"loss": 0.2602, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.1598753791916045, |
|
"eval_loss": 0.2535741329193115, |
|
"eval_runtime": 56.9513, |
|
"eval_samples_per_second": 8.779, |
|
"eval_steps_per_second": 8.779, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.1606952529310486, |
|
"grad_norm": 17.421117782592773, |
|
"learning_rate": 5.134936822616837e-06, |
|
"loss": 0.2507, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.16151512667049275, |
|
"grad_norm": 8.096160888671875, |
|
"learning_rate": 5.112565447607669e-06, |
|
"loss": 0.2405, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.16233500040993687, |
|
"grad_norm": 10.138191223144531, |
|
"learning_rate": 5.090138062869755e-06, |
|
"loss": 0.2435, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.16315487414938099, |
|
"grad_norm": 32.244873046875, |
|
"learning_rate": 5.067655590305036e-06, |
|
"loss": 0.2546, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.16397474788882513, |
|
"grad_norm": 11.093918800354004, |
|
"learning_rate": 5.045118954079904e-06, |
|
"loss": 0.2595, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16479462162826924, |
|
"grad_norm": 11.482741355895996, |
|
"learning_rate": 5.022529080587205e-06, |
|
"loss": 0.2294, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.16561449536771336, |
|
"grad_norm": 13.456998825073242, |
|
"learning_rate": 4.999886898408157e-06, |
|
"loss": 0.2556, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.1664343691071575, |
|
"grad_norm": 11.575148582458496, |
|
"learning_rate": 4.977193338274189e-06, |
|
"loss": 0.2538, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.16725424284660162, |
|
"grad_norm": 12.712217330932617, |
|
"learning_rate": 4.954449333028672e-06, |
|
"loss": 0.2985, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.16807411658604574, |
|
"grad_norm": 25.477855682373047, |
|
"learning_rate": 4.931655817588579e-06, |
|
"loss": 0.2516, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.16889399032548988, |
|
"grad_norm": 17.030961990356445, |
|
"learning_rate": 4.9088137289060535e-06, |
|
"loss": 0.2544, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.169713864064934, |
|
"grad_norm": 10.903443336486816, |
|
"learning_rate": 4.885924005929896e-06, |
|
"loss": 0.2581, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.17053373780437814, |
|
"grad_norm": 9.746002197265625, |
|
"learning_rate": 4.862987589566965e-06, |
|
"loss": 0.2332, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.17135361154382225, |
|
"grad_norm": 14.084914207458496, |
|
"learning_rate": 4.840005422643503e-06, |
|
"loss": 0.2643, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.17217348528326637, |
|
"grad_norm": 9.59061336517334, |
|
"learning_rate": 4.816978449866372e-06, |
|
"loss": 0.2461, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.17217348528326637, |
|
"eval_loss": 0.2557007670402527, |
|
"eval_runtime": 56.7258, |
|
"eval_samples_per_second": 8.814, |
|
"eval_steps_per_second": 8.814, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.1729933590227105, |
|
"grad_norm": 12.96509075164795, |
|
"learning_rate": 4.793907617784238e-06, |
|
"loss": 0.2623, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.17381323276215463, |
|
"grad_norm": 21.171913146972656, |
|
"learning_rate": 4.770793874748642e-06, |
|
"loss": 0.2481, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.17463310650159874, |
|
"grad_norm": 15.18250560760498, |
|
"learning_rate": 4.747638170875032e-06, |
|
"loss": 0.2644, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.1754529802410429, |
|
"grad_norm": 13.478678703308105, |
|
"learning_rate": 4.724441458003699e-06, |
|
"loss": 0.2548, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.176272853980487, |
|
"grad_norm": 7.877747535705566, |
|
"learning_rate": 4.701204689660653e-06, |
|
"loss": 0.2468, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.17709272771993112, |
|
"grad_norm": 14.340051651000977, |
|
"learning_rate": 4.67792882101843e-06, |
|
"loss": 0.2652, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.17791260145937526, |
|
"grad_norm": 11.43173885345459, |
|
"learning_rate": 4.654614808856823e-06, |
|
"loss": 0.245, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.17873247519881938, |
|
"grad_norm": 16.191015243530273, |
|
"learning_rate": 4.631263611523557e-06, |
|
"loss": 0.2561, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.1795523489382635, |
|
"grad_norm": 14.481834411621094, |
|
"learning_rate": 4.607876188894896e-06, |
|
"loss": 0.2783, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.18037222267770764, |
|
"grad_norm": 12.716588973999023, |
|
"learning_rate": 4.58445350233618e-06, |
|
"loss": 0.2526, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.18119209641715175, |
|
"grad_norm": 16.625707626342773, |
|
"learning_rate": 4.560996514662314e-06, |
|
"loss": 0.2386, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.1820119701565959, |
|
"grad_norm": 15.23642635345459, |
|
"learning_rate": 4.5375061900981855e-06, |
|
"loss": 0.2522, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.18283184389604, |
|
"grad_norm": 22.573617935180664, |
|
"learning_rate": 4.513983494239034e-06, |
|
"loss": 0.2605, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.18365171763548413, |
|
"grad_norm": 16.085651397705078, |
|
"learning_rate": 4.490429394010752e-06, |
|
"loss": 0.2811, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.18447159137492827, |
|
"grad_norm": 23.764911651611328, |
|
"learning_rate": 4.466844857630147e-06, |
|
"loss": 0.2495, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.18447159137492827, |
|
"eval_loss": 0.2652283310890198, |
|
"eval_runtime": 56.3594, |
|
"eval_samples_per_second": 8.872, |
|
"eval_steps_per_second": 8.872, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1852914651143724, |
|
"grad_norm": 17.39873504638672, |
|
"learning_rate": 4.443230854565133e-06, |
|
"loss": 0.2562, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.1861113388538165, |
|
"grad_norm": 11.883243560791016, |
|
"learning_rate": 4.4195883554948885e-06, |
|
"loss": 0.2777, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.18693121259326065, |
|
"grad_norm": 8.622486114501953, |
|
"learning_rate": 4.3959183322699466e-06, |
|
"loss": 0.2272, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.18775108633270476, |
|
"grad_norm": 16.060256958007812, |
|
"learning_rate": 4.372221757872255e-06, |
|
"loss": 0.2388, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.18857096007214888, |
|
"grad_norm": 9.97546100616455, |
|
"learning_rate": 4.3484996063751725e-06, |
|
"loss": 0.2736, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.18939083381159302, |
|
"grad_norm": 11.587379455566406, |
|
"learning_rate": 4.324752852903435e-06, |
|
"loss": 0.2321, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.19021070755103714, |
|
"grad_norm": 134.054931640625, |
|
"learning_rate": 4.300982473593068e-06, |
|
"loss": 0.2583, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.19103058129048126, |
|
"grad_norm": 15.653196334838867, |
|
"learning_rate": 4.277189445551261e-06, |
|
"loss": 0.2702, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.1918504550299254, |
|
"grad_norm": 14.868865966796875, |
|
"learning_rate": 4.253374746816209e-06, |
|
"loss": 0.2749, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.19267032876936951, |
|
"grad_norm": 18.965742111206055, |
|
"learning_rate": 4.229539356316898e-06, |
|
"loss": 0.2635, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.19349020250881363, |
|
"grad_norm": 21.16566276550293, |
|
"learning_rate": 4.205684253832877e-06, |
|
"loss": 0.2366, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.19431007624825777, |
|
"grad_norm": 9.739816665649414, |
|
"learning_rate": 4.1818104199539735e-06, |
|
"loss": 0.2507, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.1951299499877019, |
|
"grad_norm": 9.094308853149414, |
|
"learning_rate": 4.1579188360399916e-06, |
|
"loss": 0.2508, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.19594982372714603, |
|
"grad_norm": 13.532063484191895, |
|
"learning_rate": 4.134010484180368e-06, |
|
"loss": 0.2432, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.19676969746659015, |
|
"grad_norm": 10.089424133300781, |
|
"learning_rate": 4.110086347153807e-06, |
|
"loss": 0.2496, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.19676969746659015, |
|
"eval_loss": 0.24164016544818878, |
|
"eval_runtime": 58.2028, |
|
"eval_samples_per_second": 8.591, |
|
"eval_steps_per_second": 8.591, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.19758957120603426, |
|
"grad_norm": 14.62680721282959, |
|
"learning_rate": 4.0861474083878765e-06, |
|
"loss": 0.2585, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.1984094449454784, |
|
"grad_norm": 22.528297424316406, |
|
"learning_rate": 4.062194651918585e-06, |
|
"loss": 0.2341, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.19922931868492252, |
|
"grad_norm": 11.753854751586914, |
|
"learning_rate": 4.0382290623499384e-06, |
|
"loss": 0.2953, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.20004919242436664, |
|
"grad_norm": 16.247995376586914, |
|
"learning_rate": 4.014251624813453e-06, |
|
"loss": 0.2657, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.20086906616381078, |
|
"grad_norm": 15.834903717041016, |
|
"learning_rate": 3.990263324927675e-06, |
|
"loss": 0.2341, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.2016889399032549, |
|
"grad_norm": 6.7929887771606445, |
|
"learning_rate": 3.966265148757655e-06, |
|
"loss": 0.2355, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.20250881364269901, |
|
"grad_norm": 35.777835845947266, |
|
"learning_rate": 3.9422580827744224e-06, |
|
"loss": 0.2329, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.20332868738214316, |
|
"grad_norm": 15.361977577209473, |
|
"learning_rate": 3.9182431138144315e-06, |
|
"loss": 0.2515, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.20414856112158727, |
|
"grad_norm": 10.340039253234863, |
|
"learning_rate": 3.894221229038995e-06, |
|
"loss": 0.2397, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.2049684348610314, |
|
"grad_norm": 15.93770980834961, |
|
"learning_rate": 3.870193415893709e-06, |
|
"loss": 0.2432, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.20578830860047553, |
|
"grad_norm": 19.398086547851562, |
|
"learning_rate": 3.846160662067859e-06, |
|
"loss": 0.2471, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.20660818233991965, |
|
"grad_norm": 7.482428550720215, |
|
"learning_rate": 3.8221239554538275e-06, |
|
"loss": 0.2498, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.20742805607936377, |
|
"grad_norm": 7.209218502044678, |
|
"learning_rate": 3.798084284106478e-06, |
|
"loss": 0.263, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.2082479298188079, |
|
"grad_norm": 7.973605155944824, |
|
"learning_rate": 3.7740426362025424e-06, |
|
"loss": 0.2182, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.20906780355825202, |
|
"grad_norm": 17.178762435913086, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.2368, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.20906780355825202, |
|
"eval_loss": 0.24929100275039673, |
|
"eval_runtime": 56.544, |
|
"eval_samples_per_second": 8.843, |
|
"eval_steps_per_second": 8.843, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.20988767729769617, |
|
"grad_norm": 19.6829776763916, |
|
"learning_rate": 3.7259573637974587e-06, |
|
"loss": 0.2556, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.21070755103714028, |
|
"grad_norm": 18.270166397094727, |
|
"learning_rate": 3.701915715893523e-06, |
|
"loss": 0.2306, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.2115274247765844, |
|
"grad_norm": 14.25434398651123, |
|
"learning_rate": 3.677876044546174e-06, |
|
"loss": 0.2597, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.21234729851602854, |
|
"grad_norm": 9.318758964538574, |
|
"learning_rate": 3.6538393379321427e-06, |
|
"loss": 0.2659, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.21316717225547266, |
|
"grad_norm": 18.77834701538086, |
|
"learning_rate": 3.6298065841062934e-06, |
|
"loss": 0.2299, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.21398704599491677, |
|
"grad_norm": 17.720027923583984, |
|
"learning_rate": 3.6057787709610064e-06, |
|
"loss": 0.266, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.21480691973436092, |
|
"grad_norm": 7.643661022186279, |
|
"learning_rate": 3.5817568861855708e-06, |
|
"loss": 0.2362, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.21562679347380503, |
|
"grad_norm": 10.200757026672363, |
|
"learning_rate": 3.557741917225579e-06, |
|
"loss": 0.2405, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.21644666721324915, |
|
"grad_norm": 46.2437744140625, |
|
"learning_rate": 3.5337348512423468e-06, |
|
"loss": 0.252, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2172665409526933, |
|
"grad_norm": 13.160014152526855, |
|
"learning_rate": 3.5097366750723275e-06, |
|
"loss": 0.247, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.2180864146921374, |
|
"grad_norm": 12.211856842041016, |
|
"learning_rate": 3.4857483751865478e-06, |
|
"loss": 0.2515, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.21890628843158152, |
|
"grad_norm": 14.44340705871582, |
|
"learning_rate": 3.461770937650064e-06, |
|
"loss": 0.2228, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.21972616217102567, |
|
"grad_norm": 43.0201530456543, |
|
"learning_rate": 3.437805348081416e-06, |
|
"loss": 0.2721, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.22054603591046978, |
|
"grad_norm": 9.385405540466309, |
|
"learning_rate": 3.413852591612125e-06, |
|
"loss": 0.2883, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.2213659096499139, |
|
"grad_norm": 14.081421852111816, |
|
"learning_rate": 3.389913652846194e-06, |
|
"loss": 0.2411, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.2213659096499139, |
|
"eval_loss": 0.23700179159641266, |
|
"eval_runtime": 56.0414, |
|
"eval_samples_per_second": 8.922, |
|
"eval_steps_per_second": 8.922, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.22218578338935804, |
|
"grad_norm": 7.245662689208984, |
|
"learning_rate": 3.365989515819633e-06, |
|
"loss": 0.2538, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.22300565712880216, |
|
"grad_norm": 15.124368667602539, |
|
"learning_rate": 3.34208116396001e-06, |
|
"loss": 0.2469, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.2238255308682463, |
|
"grad_norm": 15.782695770263672, |
|
"learning_rate": 3.318189580046028e-06, |
|
"loss": 0.2412, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.22464540460769042, |
|
"grad_norm": 21.473407745361328, |
|
"learning_rate": 3.294315746167124e-06, |
|
"loss": 0.2745, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.22546527834713453, |
|
"grad_norm": 14.113616943359375, |
|
"learning_rate": 3.2704606436831023e-06, |
|
"loss": 0.2329, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.22628515208657868, |
|
"grad_norm": 16.563539505004883, |
|
"learning_rate": 3.2466252531837934e-06, |
|
"loss": 0.2275, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.2271050258260228, |
|
"grad_norm": 15.176487922668457, |
|
"learning_rate": 3.2228105544487405e-06, |
|
"loss": 0.236, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.2279248995654669, |
|
"grad_norm": 21.701990127563477, |
|
"learning_rate": 3.1990175264069333e-06, |
|
"loss": 0.2619, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.22874477330491105, |
|
"grad_norm": 24.164974212646484, |
|
"learning_rate": 3.1752471470965653e-06, |
|
"loss": 0.2545, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.22956464704435517, |
|
"grad_norm": 18.652359008789062, |
|
"learning_rate": 3.151500393624829e-06, |
|
"loss": 0.2538, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.23038452078379928, |
|
"grad_norm": 17.519634246826172, |
|
"learning_rate": 3.127778242127747e-06, |
|
"loss": 0.2457, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.23120439452324343, |
|
"grad_norm": 32.73554992675781, |
|
"learning_rate": 3.104081667730055e-06, |
|
"loss": 0.2597, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.23202426826268754, |
|
"grad_norm": 14.897638320922852, |
|
"learning_rate": 3.0804116445051133e-06, |
|
"loss": 0.2565, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.23284414200213166, |
|
"grad_norm": 12.081779479980469, |
|
"learning_rate": 3.0567691454348674e-06, |
|
"loss": 0.2222, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2336640157415758, |
|
"grad_norm": 12.295435905456543, |
|
"learning_rate": 3.033155142369855e-06, |
|
"loss": 0.2344, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.2336640157415758, |
|
"eval_loss": 0.23474246263504028, |
|
"eval_runtime": 55.6184, |
|
"eval_samples_per_second": 8.99, |
|
"eval_steps_per_second": 8.99, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.23448388948101992, |
|
"grad_norm": 14.579584121704102, |
|
"learning_rate": 3.009570605989249e-06, |
|
"loss": 0.2352, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.23530376322046404, |
|
"grad_norm": 22.36095428466797, |
|
"learning_rate": 2.986016505760967e-06, |
|
"loss": 0.2394, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.23612363695990818, |
|
"grad_norm": 10.306982040405273, |
|
"learning_rate": 2.962493809901815e-06, |
|
"loss": 0.2333, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.2369435106993523, |
|
"grad_norm": 36.44614791870117, |
|
"learning_rate": 2.9390034853376875e-06, |
|
"loss": 0.2539, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.23776338443879644, |
|
"grad_norm": 10.238338470458984, |
|
"learning_rate": 2.9155464976638217e-06, |
|
"loss": 0.2639, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.23858325817824055, |
|
"grad_norm": 22.99175262451172, |
|
"learning_rate": 2.8921238111051057e-06, |
|
"loss": 0.2769, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.23940313191768467, |
|
"grad_norm": 15.648612976074219, |
|
"learning_rate": 2.8687363884764434e-06, |
|
"loss": 0.2348, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.2402230056571288, |
|
"grad_norm": 9.030691146850586, |
|
"learning_rate": 2.8453851911431783e-06, |
|
"loss": 0.2223, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.24104287939657293, |
|
"grad_norm": 13.751124382019043, |
|
"learning_rate": 2.822071178981572e-06, |
|
"loss": 0.2474, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.24186275313601704, |
|
"grad_norm": 16.013547897338867, |
|
"learning_rate": 2.7987953103393484e-06, |
|
"loss": 0.2541, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.2426826268754612, |
|
"grad_norm": 11.65927791595459, |
|
"learning_rate": 2.7755585419963026e-06, |
|
"loss": 0.2535, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.2435025006149053, |
|
"grad_norm": 20.403488159179688, |
|
"learning_rate": 2.7523618291249687e-06, |
|
"loss": 0.2439, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.24432237435434942, |
|
"grad_norm": 15.705227851867676, |
|
"learning_rate": 2.729206125251359e-06, |
|
"loss": 0.2073, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.24514224809379356, |
|
"grad_norm": 16.818626403808594, |
|
"learning_rate": 2.7060923822157638e-06, |
|
"loss": 0.2592, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.24596212183323768, |
|
"grad_norm": 29.800796508789062, |
|
"learning_rate": 2.6830215501336288e-06, |
|
"loss": 0.2328, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.24596212183323768, |
|
"eval_loss": 0.24091680347919464, |
|
"eval_runtime": 55.7565, |
|
"eval_samples_per_second": 8.968, |
|
"eval_steps_per_second": 8.968, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2467819955726818, |
|
"grad_norm": 18.235761642456055, |
|
"learning_rate": 2.6599945773564997e-06, |
|
"loss": 0.2505, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.24760186931212594, |
|
"grad_norm": 13.632527351379395, |
|
"learning_rate": 2.6370124104330357e-06, |
|
"loss": 0.2626, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.24842174305157005, |
|
"grad_norm": 29.359901428222656, |
|
"learning_rate": 2.614075994070105e-06, |
|
"loss": 0.2372, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.24924161679101417, |
|
"grad_norm": 23.87677574157715, |
|
"learning_rate": 2.591186271093948e-06, |
|
"loss": 0.2103, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.2500614905304583, |
|
"grad_norm": 13.893345832824707, |
|
"learning_rate": 2.568344182411423e-06, |
|
"loss": 0.2299, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.25088136426990243, |
|
"grad_norm": 30.01930809020996, |
|
"learning_rate": 2.5455506669713293e-06, |
|
"loss": 0.237, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.2517012380093466, |
|
"grad_norm": 21.540925979614258, |
|
"learning_rate": 2.522806661725812e-06, |
|
"loss": 0.245, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.25252111174879066, |
|
"grad_norm": 11.055063247680664, |
|
"learning_rate": 2.5001131015918444e-06, |
|
"loss": 0.2386, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.2533409854882348, |
|
"grad_norm": 25.467863082885742, |
|
"learning_rate": 2.4774709194127973e-06, |
|
"loss": 0.2028, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.25416085922767895, |
|
"grad_norm": 16.482820510864258, |
|
"learning_rate": 2.4548810459200973e-06, |
|
"loss": 0.2559, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.25498073296712304, |
|
"grad_norm": 15.558172225952148, |
|
"learning_rate": 2.4323444096949647e-06, |
|
"loss": 0.2443, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.2558006067065672, |
|
"grad_norm": 12.034625053405762, |
|
"learning_rate": 2.409861937130248e-06, |
|
"loss": 0.2607, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.2566204804460113, |
|
"grad_norm": 11.549402236938477, |
|
"learning_rate": 2.3874345523923327e-06, |
|
"loss": 0.2182, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.25744035418545547, |
|
"grad_norm": 37.64973068237305, |
|
"learning_rate": 2.3650631773831644e-06, |
|
"loss": 0.2756, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.25826022792489955, |
|
"grad_norm": 10.317972183227539, |
|
"learning_rate": 2.3427487317023477e-06, |
|
"loss": 0.2325, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.25826022792489955, |
|
"eval_loss": 0.2304079383611679, |
|
"eval_runtime": 55.9839, |
|
"eval_samples_per_second": 8.931, |
|
"eval_steps_per_second": 8.931, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2590801016643437, |
|
"grad_norm": 13.487903594970703, |
|
"learning_rate": 2.320492132609344e-06, |
|
"loss": 0.2491, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.25989997540378784, |
|
"grad_norm": 18.3017520904541, |
|
"learning_rate": 2.2982942949857705e-06, |
|
"loss": 0.2203, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.26071984914323193, |
|
"grad_norm": 35.3414421081543, |
|
"learning_rate": 2.276156131297787e-06, |
|
"loss": 0.2076, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.2615397228826761, |
|
"grad_norm": 7.3131327629089355, |
|
"learning_rate": 2.254078551558594e-06, |
|
"loss": 0.2476, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.2623595966221202, |
|
"grad_norm": 21.195293426513672, |
|
"learning_rate": 2.2320624632910232e-06, |
|
"loss": 0.2347, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2631794703615643, |
|
"grad_norm": 19.634109497070312, |
|
"learning_rate": 2.210108771490233e-06, |
|
"loss": 0.2395, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.26399934410100845, |
|
"grad_norm": 16.585100173950195, |
|
"learning_rate": 2.1882183785865047e-06, |
|
"loss": 0.2258, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.2648192178404526, |
|
"grad_norm": 16.569671630859375, |
|
"learning_rate": 2.166392184408152e-06, |
|
"loss": 0.2379, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.2656390915798967, |
|
"grad_norm": 14.845422744750977, |
|
"learning_rate": 2.1446310861445306e-06, |
|
"loss": 0.2183, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.2664589653193408, |
|
"grad_norm": 16.37993621826172, |
|
"learning_rate": 2.1229359783091576e-06, |
|
"loss": 0.2249, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.26727883905878497, |
|
"grad_norm": 24.308523178100586, |
|
"learning_rate": 2.1013077527029428e-06, |
|
"loss": 0.2314, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.26809871279822906, |
|
"grad_norm": 20.230369567871094, |
|
"learning_rate": 2.079747298377528e-06, |
|
"loss": 0.2072, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.2689185865376732, |
|
"grad_norm": 18.310514450073242, |
|
"learning_rate": 2.058255501598745e-06, |
|
"loss": 0.2528, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.26973846027711734, |
|
"grad_norm": 15.269632339477539, |
|
"learning_rate": 2.0368332458101814e-06, |
|
"loss": 0.2206, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.27055833401656143, |
|
"grad_norm": 24.385452270507812, |
|
"learning_rate": 2.015481411596869e-06, |
|
"loss": 0.2341, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.27055833401656143, |
|
"eval_loss": 0.23421980440616608, |
|
"eval_runtime": 60.4493, |
|
"eval_samples_per_second": 8.271, |
|
"eval_steps_per_second": 8.271, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2713782077560056, |
|
"grad_norm": 21.876766204833984, |
|
"learning_rate": 1.9942008766490793e-06, |
|
"loss": 0.235, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.2721980814954497, |
|
"grad_norm": 11.376224517822266, |
|
"learning_rate": 1.9729925157262554e-06, |
|
"loss": 0.2509, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.2730179552348938, |
|
"grad_norm": 27.929759979248047, |
|
"learning_rate": 1.9518572006210484e-06, |
|
"loss": 0.242, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.27383782897433795, |
|
"grad_norm": 23.26350975036621, |
|
"learning_rate": 1.9307958001234794e-06, |
|
"loss": 0.2507, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.2746577027137821, |
|
"grad_norm": 24.858692169189453, |
|
"learning_rate": 1.9098091799852347e-06, |
|
"loss": 0.2375, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.2754775764532262, |
|
"grad_norm": 16.973976135253906, |
|
"learning_rate": 1.8888982028840636e-06, |
|
"loss": 0.2341, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.2762974501926703, |
|
"grad_norm": 26.544775009155273, |
|
"learning_rate": 1.8680637283883355e-06, |
|
"loss": 0.2457, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.27711732393211447, |
|
"grad_norm": 16.246021270751953, |
|
"learning_rate": 1.8473066129216927e-06, |
|
"loss": 0.2484, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.27793719767155856, |
|
"grad_norm": 12.570246696472168, |
|
"learning_rate": 1.8266277097278527e-06, |
|
"loss": 0.2579, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.2787570714110027, |
|
"grad_norm": 17.455217361450195, |
|
"learning_rate": 1.8060278688355313e-06, |
|
"loss": 0.2213, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.27957694515044684, |
|
"grad_norm": 13.560107231140137, |
|
"learning_rate": 1.7855079370235043e-06, |
|
"loss": 0.2168, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.28039681888989093, |
|
"grad_norm": 19.205720901489258, |
|
"learning_rate": 1.7650687577857972e-06, |
|
"loss": 0.2166, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.2812166926293351, |
|
"grad_norm": 31.231449127197266, |
|
"learning_rate": 1.7447111712970138e-06, |
|
"loss": 0.2472, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.2820365663687792, |
|
"grad_norm": 18.0344181060791, |
|
"learning_rate": 1.7244360143778004e-06, |
|
"loss": 0.2376, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.2828564401082233, |
|
"grad_norm": 16.178203582763672, |
|
"learning_rate": 1.704244120460443e-06, |
|
"loss": 0.2209, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.2828564401082233, |
|
"eval_loss": 0.22183214128017426, |
|
"eval_runtime": 56.128, |
|
"eval_samples_per_second": 8.908, |
|
"eval_steps_per_second": 8.908, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.28367631384766745, |
|
"grad_norm": 18.059825897216797, |
|
"learning_rate": 1.6841363195546162e-06, |
|
"loss": 0.2267, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.2844961875871116, |
|
"grad_norm": 22.400646209716797, |
|
"learning_rate": 1.6641134382132576e-06, |
|
"loss": 0.2297, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.28531606132655574, |
|
"grad_norm": 18.88297462463379, |
|
"learning_rate": 1.6441762994985947e-06, |
|
"loss": 0.2087, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.2861359350659998, |
|
"grad_norm": 9.259561538696289, |
|
"learning_rate": 1.6243257229483141e-06, |
|
"loss": 0.2341, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.28695580880544397, |
|
"grad_norm": 9.176309585571289, |
|
"learning_rate": 1.6045625245418648e-06, |
|
"loss": 0.2314, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2877756825448881, |
|
"grad_norm": 16.64775276184082, |
|
"learning_rate": 1.584887516666928e-06, |
|
"loss": 0.221, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.2885955562843322, |
|
"grad_norm": 16.043312072753906, |
|
"learning_rate": 1.565301508086015e-06, |
|
"loss": 0.2307, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.28941543002377634, |
|
"grad_norm": 28.55023765563965, |
|
"learning_rate": 1.5458053039032263e-06, |
|
"loss": 0.2013, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.2902353037632205, |
|
"grad_norm": 22.9605712890625, |
|
"learning_rate": 1.5263997055311536e-06, |
|
"loss": 0.2258, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.2910551775026646, |
|
"grad_norm": 11.065112113952637, |
|
"learning_rate": 1.5070855106579404e-06, |
|
"loss": 0.2375, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.2918750512421087, |
|
"grad_norm": 13.265893936157227, |
|
"learning_rate": 1.4878635132144885e-06, |
|
"loss": 0.2409, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.29269492498155286, |
|
"grad_norm": 22.174110412597656, |
|
"learning_rate": 1.4687345033418258e-06, |
|
"loss": 0.2424, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.29351479872099695, |
|
"grad_norm": 12.81115436553955, |
|
"learning_rate": 1.4496992673586262e-06, |
|
"loss": 0.2236, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.2943346724604411, |
|
"grad_norm": 12.606128692626953, |
|
"learning_rate": 1.4307585877288822e-06, |
|
"loss": 0.2262, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.29515454619988524, |
|
"grad_norm": 29.290117263793945, |
|
"learning_rate": 1.4119132430297496e-06, |
|
"loss": 0.2305, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.29515454619988524, |
|
"eval_loss": 0.22281211614608765, |
|
"eval_runtime": 55.6771, |
|
"eval_samples_per_second": 8.98, |
|
"eval_steps_per_second": 8.98, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2959744199393293, |
|
"grad_norm": 19.89222526550293, |
|
"learning_rate": 1.3931640079195365e-06, |
|
"loss": 0.2354, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.29679429367877347, |
|
"grad_norm": 10.584065437316895, |
|
"learning_rate": 1.3745116531058645e-06, |
|
"loss": 0.2272, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.2976141674182176, |
|
"grad_norm": 18.46734619140625, |
|
"learning_rate": 1.3559569453139797e-06, |
|
"loss": 0.2192, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.2984340411576617, |
|
"grad_norm": 17.607667922973633, |
|
"learning_rate": 1.3375006472552483e-06, |
|
"loss": 0.2466, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.29925391489710584, |
|
"grad_norm": 19.822507858276367, |
|
"learning_rate": 1.3191435175957945e-06, |
|
"loss": 0.2271, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.30007378863655, |
|
"grad_norm": 7.999312400817871, |
|
"learning_rate": 1.3008863109253174e-06, |
|
"loss": 0.2244, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3008936623759941, |
|
"grad_norm": 15.04226016998291, |
|
"learning_rate": 1.282729777726078e-06, |
|
"loss": 0.2303, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.3017135361154382, |
|
"grad_norm": 12.127747535705566, |
|
"learning_rate": 1.2646746643420392e-06, |
|
"loss": 0.2289, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.30253340985488236, |
|
"grad_norm": 10.014680862426758, |
|
"learning_rate": 1.2467217129481952e-06, |
|
"loss": 0.2176, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.30335328359432645, |
|
"grad_norm": 15.543107986450195, |
|
"learning_rate": 1.2288716615200617e-06, |
|
"loss": 0.2338, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3041731573337706, |
|
"grad_norm": 12.86021614074707, |
|
"learning_rate": 1.2111252438033404e-06, |
|
"loss": 0.2192, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.30499303107321474, |
|
"grad_norm": 32.52058792114258, |
|
"learning_rate": 1.1934831892837524e-06, |
|
"loss": 0.2205, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.3058129048126588, |
|
"grad_norm": 6.391150951385498, |
|
"learning_rate": 1.1759462231570618e-06, |
|
"loss": 0.2043, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.30663277855210297, |
|
"grad_norm": 18.806997299194336, |
|
"learning_rate": 1.1585150662992578e-06, |
|
"loss": 0.2203, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.3074526522915471, |
|
"grad_norm": 16.80451774597168, |
|
"learning_rate": 1.1411904352369262e-06, |
|
"loss": 0.228, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3074526522915471, |
|
"eval_loss": 0.2207518219947815, |
|
"eval_runtime": 56.5561, |
|
"eval_samples_per_second": 8.841, |
|
"eval_steps_per_second": 8.841, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3082725260309912, |
|
"grad_norm": 14.464019775390625, |
|
"learning_rate": 1.1239730421177952e-06, |
|
"loss": 0.2285, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.30909239977043534, |
|
"grad_norm": 18.73137664794922, |
|
"learning_rate": 1.1068635946814569e-06, |
|
"loss": 0.2234, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.3099122735098795, |
|
"grad_norm": 10.308956146240234, |
|
"learning_rate": 1.0898627962302831e-06, |
|
"loss": 0.2208, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.31073214724932363, |
|
"grad_norm": 39.88100051879883, |
|
"learning_rate": 1.072971345600513e-06, |
|
"loss": 0.2376, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.3115520209887677, |
|
"grad_norm": 12.245576858520508, |
|
"learning_rate": 1.056189937133522e-06, |
|
"loss": 0.2283, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.31237189472821186, |
|
"grad_norm": 14.314285278320312, |
|
"learning_rate": 1.0395192606472822e-06, |
|
"loss": 0.2073, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.313191768467656, |
|
"grad_norm": 15.187841415405273, |
|
"learning_rate": 1.0229600014080101e-06, |
|
"loss": 0.2495, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.3140116422071001, |
|
"grad_norm": 13.99637508392334, |
|
"learning_rate": 1.006512840101995e-06, |
|
"loss": 0.2154, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.31483151594654424, |
|
"grad_norm": 7.902044773101807, |
|
"learning_rate": 9.90178452807619e-07, |
|
"loss": 0.2435, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3156513896859884, |
|
"grad_norm": 12.850071907043457, |
|
"learning_rate": 9.739575109675674e-07, |
|
"loss": 0.2247, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.31647126342543247, |
|
"grad_norm": 14.898462295532227, |
|
"learning_rate": 9.578506813612243e-07, |
|
"loss": 0.221, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.3172911371648766, |
|
"grad_norm": 24.208559036254883, |
|
"learning_rate": 9.418586260772695e-07, |
|
"loss": 0.2303, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.31811101090432076, |
|
"grad_norm": 17.132963180541992, |
|
"learning_rate": 9.259820024864594e-07, |
|
"loss": 0.2283, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.31893088464376484, |
|
"grad_norm": 19.788406372070312, |
|
"learning_rate": 9.102214632146059e-07, |
|
"loss": 0.2465, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.319750758383209, |
|
"grad_norm": 26.01558494567871, |
|
"learning_rate": 8.94577656115746e-07, |
|
"loss": 0.2321, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.319750758383209, |
|
"eval_loss": 0.22018083930015564, |
|
"eval_runtime": 56.099, |
|
"eval_samples_per_second": 8.913, |
|
"eval_steps_per_second": 8.913, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.32057063212265313, |
|
"grad_norm": 13.368496894836426, |
|
"learning_rate": 8.790512242455198e-07, |
|
"loss": 0.2401, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.3213905058620972, |
|
"grad_norm": 17.882627487182617, |
|
"learning_rate": 8.636428058347274e-07, |
|
"loss": 0.2045, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.32221037960154136, |
|
"grad_norm": 21.98712158203125, |
|
"learning_rate": 8.483530342630993e-07, |
|
"loss": 0.243, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.3230302533409855, |
|
"grad_norm": 33.167381286621094, |
|
"learning_rate": 8.331825380332599e-07, |
|
"loss": 0.2258, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3238501270804296, |
|
"grad_norm": 16.276443481445312, |
|
"learning_rate": 8.181319407448884e-07, |
|
"loss": 0.2489, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.32467000081987374, |
|
"grad_norm": 12.20262336730957, |
|
"learning_rate": 8.032018610690914e-07, |
|
"loss": 0.2074, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.3254898745593179, |
|
"grad_norm": 23.053037643432617, |
|
"learning_rate": 7.883929127229665e-07, |
|
"loss": 0.2238, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.32630974829876197, |
|
"grad_norm": 9.354714393615723, |
|
"learning_rate": 7.737057044443793e-07, |
|
"loss": 0.2268, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3271296220382061, |
|
"grad_norm": 13.12759780883789, |
|
"learning_rate": 7.591408399669337e-07, |
|
"loss": 0.2259, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.32794949577765026, |
|
"grad_norm": 12.080741882324219, |
|
"learning_rate": 7.446989179951632e-07, |
|
"loss": 0.214, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.32876936951709435, |
|
"grad_norm": 13.813101768493652, |
|
"learning_rate": 7.303805321799146e-07, |
|
"loss": 0.218, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.3295892432565385, |
|
"grad_norm": 12.327116012573242, |
|
"learning_rate": 7.161862710939476e-07, |
|
"loss": 0.2295, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.33040911699598263, |
|
"grad_norm": 15.953246116638184, |
|
"learning_rate": 7.021167182077403e-07, |
|
"loss": 0.2197, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.3312289907354267, |
|
"grad_norm": 19.298919677734375, |
|
"learning_rate": 6.881724518655049e-07, |
|
"loss": 0.2326, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.33204886447487086, |
|
"grad_norm": 38.68765640258789, |
|
"learning_rate": 6.743540452614152e-07, |
|
"loss": 0.2303, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.33204886447487086, |
|
"eval_loss": 0.21772576868534088, |
|
"eval_runtime": 56.5668, |
|
"eval_samples_per_second": 8.839, |
|
"eval_steps_per_second": 8.839, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.332868738214315, |
|
"grad_norm": 11.087291717529297, |
|
"learning_rate": 6.606620664160438e-07, |
|
"loss": 0.2071, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.3336886119537591, |
|
"grad_norm": 50.521053314208984, |
|
"learning_rate": 6.470970781530139e-07, |
|
"loss": 0.2204, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.33450848569320324, |
|
"grad_norm": 32.14698028564453, |
|
"learning_rate": 6.336596380758604e-07, |
|
"loss": 0.2466, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3353283594326474, |
|
"grad_norm": 19.88819694519043, |
|
"learning_rate": 6.203502985451152e-07, |
|
"loss": 0.2291, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.33614823317209147, |
|
"grad_norm": 11.445552825927734, |
|
"learning_rate": 6.071696066555978e-07, |
|
"loss": 0.2549, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3369681069115356, |
|
"grad_norm": 17.117246627807617, |
|
"learning_rate": 5.941181042139258e-07, |
|
"loss": 0.2077, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.33778798065097976, |
|
"grad_norm": 10.231658935546875, |
|
"learning_rate": 5.811963277162466e-07, |
|
"loss": 0.2182, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.3386078543904239, |
|
"grad_norm": 14.68455696105957, |
|
"learning_rate": 5.684048083261789e-07, |
|
"loss": 0.2445, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.339427728129868, |
|
"grad_norm": 22.658329010009766, |
|
"learning_rate": 5.557440718529848e-07, |
|
"loss": 0.1938, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.34024760186931213, |
|
"grad_norm": 12.441681861877441, |
|
"learning_rate": 5.432146387299522e-07, |
|
"loss": 0.224, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3410674756087563, |
|
"grad_norm": 16.301542282104492, |
|
"learning_rate": 5.308170239930022e-07, |
|
"loss": 0.2092, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.34188734934820036, |
|
"grad_norm": 17.414865493774414, |
|
"learning_rate": 5.185517372595187e-07, |
|
"loss": 0.2429, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.3427072230876445, |
|
"grad_norm": 37.58354949951172, |
|
"learning_rate": 5.064192827073995e-07, |
|
"loss": 0.2236, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.34352709682708865, |
|
"grad_norm": 19.772306442260742, |
|
"learning_rate": 4.944201590543308e-07, |
|
"loss": 0.2209, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.34434697056653274, |
|
"grad_norm": 10.470952987670898, |
|
"learning_rate": 4.825548595372898e-07, |
|
"loss": 0.2441, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.34434697056653274, |
|
"eval_loss": 0.2149660438299179, |
|
"eval_runtime": 55.9997, |
|
"eval_samples_per_second": 8.929, |
|
"eval_steps_per_second": 8.929, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.3451668443059769, |
|
"grad_norm": 12.9829683303833, |
|
"learning_rate": 4.7082387189226646e-07, |
|
"loss": 0.2012, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.345986718045421, |
|
"grad_norm": 11.852750778198242, |
|
"learning_rate": 4.5922767833421454e-07, |
|
"loss": 0.2172, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.3468065917848651, |
|
"grad_norm": 33.68533706665039, |
|
"learning_rate": 4.477667555372326e-07, |
|
"loss": 0.2114, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.34762646552430926, |
|
"grad_norm": 24.621292114257812, |
|
"learning_rate": 4.364415746149678e-07, |
|
"loss": 0.2264, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.3484463392637534, |
|
"grad_norm": 23.111419677734375, |
|
"learning_rate": 4.2525260110124964e-07, |
|
"loss": 0.2146, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3492662130031975, |
|
"grad_norm": 22.753629684448242, |
|
"learning_rate": 4.1420029493095623e-07, |
|
"loss": 0.2181, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.35008608674264163, |
|
"grad_norm": 12.422630310058594, |
|
"learning_rate": 4.032851104211036e-07, |
|
"loss": 0.2059, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.3509059604820858, |
|
"grad_norm": 21.33889389038086, |
|
"learning_rate": 3.925074962521762e-07, |
|
"loss": 0.2041, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.35172583422152986, |
|
"grad_norm": 21.088577270507812, |
|
"learning_rate": 3.818678954496787e-07, |
|
"loss": 0.2162, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.352545707960974, |
|
"grad_norm": 14.029748916625977, |
|
"learning_rate": 3.713667453659287e-07, |
|
"loss": 0.2291, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.35336558170041815, |
|
"grad_norm": 11.585044860839844, |
|
"learning_rate": 3.6100447766207473e-07, |
|
"loss": 0.2139, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.35418545543986224, |
|
"grad_norm": 13.666373252868652, |
|
"learning_rate": 3.5078151829035693e-07, |
|
"loss": 0.2311, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.3550053291793064, |
|
"grad_norm": 24.15358543395996, |
|
"learning_rate": 3.4069828747659405e-07, |
|
"loss": 0.2149, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.3558252029187505, |
|
"grad_norm": 25.829856872558594, |
|
"learning_rate": 3.3075519970291144e-07, |
|
"loss": 0.2055, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.3566450766581946, |
|
"grad_norm": 23.233440399169922, |
|
"learning_rate": 3.209526636907036e-07, |
|
"loss": 0.2444, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3566450766581946, |
|
"eval_loss": 0.2148878425359726, |
|
"eval_runtime": 56.223, |
|
"eval_samples_per_second": 8.893, |
|
"eval_steps_per_second": 8.893, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.35746495039763876, |
|
"grad_norm": 19.731224060058594, |
|
"learning_rate": 3.1129108238383095e-07, |
|
"loss": 0.2199, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.3582848241370829, |
|
"grad_norm": 23.215808868408203, |
|
"learning_rate": 3.017708529320604e-07, |
|
"loss": 0.2228, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.359104697876527, |
|
"grad_norm": 17.997251510620117, |
|
"learning_rate": 2.923923666747357e-07, |
|
"loss": 0.2336, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.35992457161597113, |
|
"grad_norm": 14.64735221862793, |
|
"learning_rate": 2.8315600912469477e-07, |
|
"loss": 0.2831, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.3607444453554153, |
|
"grad_norm": 18.220691680908203, |
|
"learning_rate": 2.740621599524189e-07, |
|
"loss": 0.2277, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.36156431909485937, |
|
"grad_norm": 16.92856216430664, |
|
"learning_rate": 2.651111929704303e-07, |
|
"loss": 0.2139, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.3623841928343035, |
|
"grad_norm": 30.373014450073242, |
|
"learning_rate": 2.563034761179223e-07, |
|
"loss": 0.2354, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.36320406657374765, |
|
"grad_norm": 16.33125114440918, |
|
"learning_rate": 2.476393714456384e-07, |
|
"loss": 0.2209, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.3640239403131918, |
|
"grad_norm": 13.93752670288086, |
|
"learning_rate": 2.391192351009855e-07, |
|
"loss": 0.2285, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.3648438140526359, |
|
"grad_norm": 24.299808502197266, |
|
"learning_rate": 2.3074341731339837e-07, |
|
"loss": 0.2487, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.36566368779208, |
|
"grad_norm": 15.581805229187012, |
|
"learning_rate": 2.225122623799407e-07, |
|
"loss": 0.2112, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.36648356153152417, |
|
"grad_norm": 21.24774932861328, |
|
"learning_rate": 2.1442610865115135e-07, |
|
"loss": 0.2253, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.36730343527096826, |
|
"grad_norm": 20.960872650146484, |
|
"learning_rate": 2.0648528851714077e-07, |
|
"loss": 0.2208, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.3681233090104124, |
|
"grad_norm": 22.186767578125, |
|
"learning_rate": 1.9869012839392064e-07, |
|
"loss": 0.218, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.36894318274985655, |
|
"grad_norm": 15.852953910827637, |
|
"learning_rate": 1.9104094870999264e-07, |
|
"loss": 0.2123, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.36894318274985655, |
|
"eval_loss": 0.21366393566131592, |
|
"eval_runtime": 55.673, |
|
"eval_samples_per_second": 8.981, |
|
"eval_steps_per_second": 8.981, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.36976305648930063, |
|
"grad_norm": 11.23139476776123, |
|
"learning_rate": 1.8353806389317428e-07, |
|
"loss": 0.2201, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.3705829302287448, |
|
"grad_norm": 15.876472473144531, |
|
"learning_rate": 1.761817823576731e-07, |
|
"loss": 0.2382, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.3714028039681889, |
|
"grad_norm": 18.092660903930664, |
|
"learning_rate": 1.6897240649141125e-07, |
|
"loss": 0.2359, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.372222677707633, |
|
"grad_norm": 20.05590057373047, |
|
"learning_rate": 1.619102326435923e-07, |
|
"loss": 0.2304, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.37304255144707715, |
|
"grad_norm": 14.876965522766113, |
|
"learning_rate": 1.5499555111252285e-07, |
|
"loss": 0.2305, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.3738624251865213, |
|
"grad_norm": 24.27523422241211, |
|
"learning_rate": 1.4822864613367766e-07, |
|
"loss": 0.229, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.3746822989259654, |
|
"grad_norm": 36.034820556640625, |
|
"learning_rate": 1.4160979586801724e-07, |
|
"loss": 0.2099, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.37550217266540953, |
|
"grad_norm": 14.821313858032227, |
|
"learning_rate": 1.3513927239055036e-07, |
|
"loss": 0.2069, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.37632204640485367, |
|
"grad_norm": 24.151025772094727, |
|
"learning_rate": 1.2881734167915425e-07, |
|
"loss": 0.2477, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.37714192014429776, |
|
"grad_norm": 34.51681900024414, |
|
"learning_rate": 1.2264426360363956e-07, |
|
"loss": 0.2169, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.3779617938837419, |
|
"grad_norm": 18.54802894592285, |
|
"learning_rate": 1.1662029191506775e-07, |
|
"loss": 0.2053, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.37878166762318605, |
|
"grad_norm": 18.75210189819336, |
|
"learning_rate": 1.107456742353201e-07, |
|
"loss": 0.2313, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.37960154136263013, |
|
"grad_norm": 14.032902717590332, |
|
"learning_rate": 1.0502065204692062e-07, |
|
"loss": 0.2253, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.3804214151020743, |
|
"grad_norm": 16.711780548095703, |
|
"learning_rate": 9.94454606831076e-08, |
|
"loss": 0.208, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.3812412888415184, |
|
"grad_norm": 33.53385543823242, |
|
"learning_rate": 9.402032931816144e-08, |
|
"loss": 0.2256, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.3812412888415184, |
|
"eval_loss": 0.2128845751285553, |
|
"eval_runtime": 55.1573, |
|
"eval_samples_per_second": 9.065, |
|
"eval_steps_per_second": 9.065, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.3820611625809625, |
|
"grad_norm": 9.32500171661377, |
|
"learning_rate": 8.874548095798464e-08, |
|
"loss": 0.227, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.38288103632040665, |
|
"grad_norm": 12.115835189819336, |
|
"learning_rate": 8.362113243093245e-08, |
|
"loss": 0.2148, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.3837009100598508, |
|
"grad_norm": 26.36838722229004, |
|
"learning_rate": 7.864749437890173e-08, |
|
"loss": 0.2228, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.3845207837992949, |
|
"grad_norm": 12.476286888122559, |
|
"learning_rate": 7.382477124867282e-08, |
|
"loss": 0.2057, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.38534065753873903, |
|
"grad_norm": 15.308034896850586, |
|
"learning_rate": 6.915316128350461e-08, |
|
"loss": 0.2278, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.3861605312781832, |
|
"grad_norm": 9.208645820617676, |
|
"learning_rate": 6.463285651498563e-08, |
|
"loss": 0.2227, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.38698040501762726, |
|
"grad_norm": 9.877080917358398, |
|
"learning_rate": 6.026404275513875e-08, |
|
"loss": 0.2197, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.3878002787570714, |
|
"grad_norm": 16.259761810302734, |
|
"learning_rate": 5.604689958878723e-08, |
|
"loss": 0.2413, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.38862015249651555, |
|
"grad_norm": 17.41680908203125, |
|
"learning_rate": 5.198160036616898e-08, |
|
"loss": 0.2159, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.38944002623595964, |
|
"grad_norm": 17.588123321533203, |
|
"learning_rate": 4.8068312195811847e-08, |
|
"loss": 0.2191, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.3902598999754038, |
|
"grad_norm": 14.38376235961914, |
|
"learning_rate": 4.4307195937666194e-08, |
|
"loss": 0.2332, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.3910797737148479, |
|
"grad_norm": 12.54135799407959, |
|
"learning_rate": 4.069840619648935e-08, |
|
"loss": 0.2176, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.39189964745429207, |
|
"grad_norm": 20.703615188598633, |
|
"learning_rate": 3.72420913154932e-08, |
|
"loss": 0.2204, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.39271952119373615, |
|
"grad_norm": 28.904329299926758, |
|
"learning_rate": 3.3938393370244876e-08, |
|
"loss": 0.2389, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.3935393949331803, |
|
"grad_norm": 15.144803047180176, |
|
"learning_rate": 3.078744816282731e-08, |
|
"loss": 0.2306, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.3935393949331803, |
|
"eval_loss": 0.2134290486574173, |
|
"eval_runtime": 55.5119, |
|
"eval_samples_per_second": 9.007, |
|
"eval_steps_per_second": 9.007, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.39435926867262444, |
|
"grad_norm": 18.657732009887695, |
|
"learning_rate": 2.778938521625613e-08, |
|
"loss": 0.2454, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.39517914241206853, |
|
"grad_norm": 20.660715103149414, |
|
"learning_rate": 2.4944327769157314e-08, |
|
"loss": 0.2211, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.3959990161515127, |
|
"grad_norm": 13.545777320861816, |
|
"learning_rate": 2.225239277069871e-08, |
|
"loss": 0.1803, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.3968188898909568, |
|
"grad_norm": 20.064281463623047, |
|
"learning_rate": 1.971369087578473e-08, |
|
"loss": 0.2226, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.3976387636304009, |
|
"grad_norm": 11.630465507507324, |
|
"learning_rate": 1.7328326440506637e-08, |
|
"loss": 0.2117, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.39845863736984505, |
|
"grad_norm": 16.434839248657227, |
|
"learning_rate": 1.5096397517853497e-08, |
|
"loss": 0.2381, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.3992785111092892, |
|
"grad_norm": 14.184981346130371, |
|
"learning_rate": 1.3017995853681631e-08, |
|
"loss": 0.2262, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.4000983848487333, |
|
"grad_norm": 17.047590255737305, |
|
"learning_rate": 1.1093206882943076e-08, |
|
"loss": 0.2164, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4009182585881774, |
|
"grad_norm": 15.3792142868042, |
|
"learning_rate": 9.322109726172952e-09, |
|
"loss": 0.2288, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.40173813232762157, |
|
"grad_norm": 14.833084106445312, |
|
"learning_rate": 7.704777186238744e-09, |
|
"loss": 0.209, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.40255800606706565, |
|
"grad_norm": 22.476787567138672, |
|
"learning_rate": 6.241275745346859e-09, |
|
"loss": 0.2118, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.4033778798065098, |
|
"grad_norm": 14.301311492919922, |
|
"learning_rate": 4.931665562308563e-09, |
|
"loss": 0.2222, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.40419775354595394, |
|
"grad_norm": 13.92874813079834, |
|
"learning_rate": 3.7760004700702905e-09, |
|
"loss": 0.2283, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.40501762728539803, |
|
"grad_norm": 20.181961059570312, |
|
"learning_rate": 2.7743279734962494e-09, |
|
"loss": 0.2132, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.4058375010248422, |
|
"grad_norm": 22.093725204467773, |
|
"learning_rate": 1.926689247420399e-09, |
|
"loss": 0.2127, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.4058375010248422, |
|
"eval_loss": 0.2128431349992752, |
|
"eval_runtime": 55.4771, |
|
"eval_samples_per_second": 9.013, |
|
"eval_steps_per_second": 9.013, |
|
"step": 4950 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.911768952965693e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|