{
"best_metric": 0.2128431349992752,
"best_model_checkpoint": "./output/checkpoint-4950",
"epoch": 0.4058375010248422,
"eval_steps": 150,
"global_step": 4950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008198737394441257,
"grad_norm": 11.523909568786621,
"learning_rate": 7.500000000000001e-07,
"loss": 0.39,
"step": 10
},
{
"epoch": 0.0016397474788882513,
"grad_norm": 9.020567893981934,
"learning_rate": 1.5000000000000002e-06,
"loss": 0.3576,
"step": 20
},
{
"epoch": 0.002459621218332377,
"grad_norm": 9.512846946716309,
"learning_rate": 2.25e-06,
"loss": 0.3874,
"step": 30
},
{
"epoch": 0.0032794949577765026,
"grad_norm": 39.97313690185547,
"learning_rate": 3.0000000000000005e-06,
"loss": 0.3568,
"step": 40
},
{
"epoch": 0.004099368697220628,
"grad_norm": 12.515055656433105,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.3314,
"step": 50
},
{
"epoch": 0.004919242436664754,
"grad_norm": 11.462284088134766,
"learning_rate": 4.5e-06,
"loss": 0.3641,
"step": 60
},
{
"epoch": 0.005739116176108879,
"grad_norm": 18.380435943603516,
"learning_rate": 5.2500000000000006e-06,
"loss": 0.348,
"step": 70
},
{
"epoch": 0.006558989915553005,
"grad_norm": 13.468473434448242,
"learning_rate": 6.000000000000001e-06,
"loss": 0.348,
"step": 80
},
{
"epoch": 0.007378863654997131,
"grad_norm": 10.285468101501465,
"learning_rate": 6.7500000000000014e-06,
"loss": 0.3352,
"step": 90
},
{
"epoch": 0.008198737394441257,
"grad_norm": 17.571596145629883,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3438,
"step": 100
},
{
"epoch": 0.009018611133885381,
"grad_norm": 19.84699249267578,
"learning_rate": 7.499922926093874e-06,
"loss": 0.3253,
"step": 110
},
{
"epoch": 0.009838484873329507,
"grad_norm": 16.91347885131836,
"learning_rate": 7.499691707543699e-06,
"loss": 0.3328,
"step": 120
},
{
"epoch": 0.010658358612773634,
"grad_norm": 11.190834999084473,
"learning_rate": 7.499306353853963e-06,
"loss": 0.3308,
"step": 130
},
{
"epoch": 0.011478232352217758,
"grad_norm": 11.117925643920898,
"learning_rate": 7.49876688086505e-06,
"loss": 0.3401,
"step": 140
},
{
"epoch": 0.012298106091661884,
"grad_norm": 12.28294563293457,
"learning_rate": 7.4980733107525805e-06,
"loss": 0.303,
"step": 150
},
{
"epoch": 0.012298106091661884,
"eval_loss": 0.32195183634757996,
"eval_runtime": 58.0333,
"eval_samples_per_second": 8.616,
"eval_steps_per_second": 8.616,
"step": 150
},
{
"epoch": 0.01311797983110601,
"grad_norm": 12.885525703430176,
"learning_rate": 7.4972256720265044e-06,
"loss": 0.3595,
"step": 160
},
{
"epoch": 0.013937853570550135,
"grad_norm": 12.438248634338379,
"learning_rate": 7.496223999529932e-06,
"loss": 0.3361,
"step": 170
},
{
"epoch": 0.014757727309994261,
"grad_norm": 14.641826629638672,
"learning_rate": 7.4950683344376926e-06,
"loss": 0.3296,
"step": 180
},
{
"epoch": 0.015577601049438386,
"grad_norm": 9.628592491149902,
"learning_rate": 7.4937587242546544e-06,
"loss": 0.3225,
"step": 190
},
{
"epoch": 0.016397474788882514,
"grad_norm": 15.733799934387207,
"learning_rate": 7.492295222813762e-06,
"loss": 0.3284,
"step": 200
},
{
"epoch": 0.017217348528326636,
"grad_norm": 12.937703132629395,
"learning_rate": 7.490677890273828e-06,
"loss": 0.3434,
"step": 210
},
{
"epoch": 0.018037222267770762,
"grad_norm": 16.046674728393555,
"learning_rate": 7.488906793117058e-06,
"loss": 0.3519,
"step": 220
},
{
"epoch": 0.01885709600721489,
"grad_norm": 11.472362518310547,
"learning_rate": 7.486982004146319e-06,
"loss": 0.3587,
"step": 230
},
{
"epoch": 0.019676969746659015,
"grad_norm": 15.215801239013672,
"learning_rate": 7.484903602482148e-06,
"loss": 0.3197,
"step": 240
},
{
"epoch": 0.02049684348610314,
"grad_norm": 11.658143997192383,
"learning_rate": 7.4826716735594945e-06,
"loss": 0.3114,
"step": 250
},
{
"epoch": 0.021316717225547267,
"grad_norm": 7.448172092437744,
"learning_rate": 7.480286309124216e-06,
"loss": 0.2912,
"step": 260
},
{
"epoch": 0.02213659096499139,
"grad_norm": 12.367362022399902,
"learning_rate": 7.477747607229302e-06,
"loss": 0.3167,
"step": 270
},
{
"epoch": 0.022956464704435516,
"grad_norm": 13.513625144958496,
"learning_rate": 7.475055672230844e-06,
"loss": 0.3093,
"step": 280
},
{
"epoch": 0.023776338443879642,
"grad_norm": 19.878536224365234,
"learning_rate": 7.472210614783745e-06,
"loss": 0.3256,
"step": 290
},
{
"epoch": 0.02459621218332377,
"grad_norm": 22.84262466430664,
"learning_rate": 7.469212551837173e-06,
"loss": 0.3104,
"step": 300
},
{
"epoch": 0.02459621218332377,
"eval_loss": 0.3093046247959137,
"eval_runtime": 58.7245,
"eval_samples_per_second": 8.514,
"eval_steps_per_second": 8.514,
"step": 300
},
{
"epoch": 0.025416085922767895,
"grad_norm": 9.043919563293457,
"learning_rate": 7.4660616066297565e-06,
"loss": 0.3089,
"step": 310
},
{
"epoch": 0.02623595966221202,
"grad_norm": 6.400809288024902,
"learning_rate": 7.462757908684509e-06,
"loss": 0.2959,
"step": 320
},
{
"epoch": 0.027055833401656144,
"grad_norm": 19.60870361328125,
"learning_rate": 7.459301593803512e-06,
"loss": 0.3251,
"step": 330
},
{
"epoch": 0.02787570714110027,
"grad_norm": 8.441984176635742,
"learning_rate": 7.455692804062335e-06,
"loss": 0.3108,
"step": 340
},
{
"epoch": 0.028695580880544396,
"grad_norm": 20.126216888427734,
"learning_rate": 7.451931687804189e-06,
"loss": 0.3152,
"step": 350
},
{
"epoch": 0.029515454619988522,
"grad_norm": 11.44316291809082,
"learning_rate": 7.448018399633831e-06,
"loss": 0.3302,
"step": 360
},
{
"epoch": 0.03033532835943265,
"grad_norm": 10.247148513793945,
"learning_rate": 7.443953100411214e-06,
"loss": 0.289,
"step": 370
},
{
"epoch": 0.03115520209887677,
"grad_norm": 10.746755599975586,
"learning_rate": 7.439735957244862e-06,
"loss": 0.2886,
"step": 380
},
{
"epoch": 0.0319750758383209,
"grad_norm": 19.19182014465332,
"learning_rate": 7.435367143485015e-06,
"loss": 0.325,
"step": 390
},
{
"epoch": 0.03279494957776503,
"grad_norm": 12.273555755615234,
"learning_rate": 7.430846838716496e-06,
"loss": 0.3107,
"step": 400
},
{
"epoch": 0.03361482331720915,
"grad_norm": 13.099973678588867,
"learning_rate": 7.426175228751328e-06,
"loss": 0.3103,
"step": 410
},
{
"epoch": 0.03443469705665327,
"grad_norm": 20.098796844482422,
"learning_rate": 7.421352505621099e-06,
"loss": 0.284,
"step": 420
},
{
"epoch": 0.0352545707960974,
"grad_norm": 10.289865493774414,
"learning_rate": 7.416378867569069e-06,
"loss": 0.3337,
"step": 430
},
{
"epoch": 0.036074444535541525,
"grad_norm": 13.34965705871582,
"learning_rate": 7.411254519042017e-06,
"loss": 0.3085,
"step": 440
},
{
"epoch": 0.036894318274985655,
"grad_norm": 11.321673393249512,
"learning_rate": 7.4059796706818396e-06,
"loss": 0.3043,
"step": 450
},
{
"epoch": 0.036894318274985655,
"eval_loss": 0.2889861762523651,
"eval_runtime": 56.9295,
"eval_samples_per_second": 8.783,
"eval_steps_per_second": 8.783,
"step": 450
},
{
"epoch": 0.03771419201442978,
"grad_norm": 15.978049278259277,
"learning_rate": 7.400554539316894e-06,
"loss": 0.2942,
"step": 460
},
{
"epoch": 0.0385340657538739,
"grad_norm": 16.420135498046875,
"learning_rate": 7.394979347953081e-06,
"loss": 0.3139,
"step": 470
},
{
"epoch": 0.03935393949331803,
"grad_norm": 15.941482543945312,
"learning_rate": 7.389254325764681e-06,
"loss": 0.3018,
"step": 480
},
{
"epoch": 0.04017381323276215,
"grad_norm": 9.359827041625977,
"learning_rate": 7.383379708084934e-06,
"loss": 0.3048,
"step": 490
},
{
"epoch": 0.04099368697220628,
"grad_norm": 11.175127983093262,
"learning_rate": 7.377355736396362e-06,
"loss": 0.3001,
"step": 500
},
{
"epoch": 0.041813560711650405,
"grad_norm": 18.719478607177734,
"learning_rate": 7.371182658320847e-06,
"loss": 0.3105,
"step": 510
},
{
"epoch": 0.042633434451094535,
"grad_norm": 9.761693954467773,
"learning_rate": 7.36486072760945e-06,
"loss": 0.3024,
"step": 520
},
{
"epoch": 0.04345330819053866,
"grad_norm": 15.880053520202637,
"learning_rate": 7.358390204131984e-06,
"loss": 0.3099,
"step": 530
},
{
"epoch": 0.04427318192998278,
"grad_norm": 10.00100326538086,
"learning_rate": 7.3517713538663235e-06,
"loss": 0.3215,
"step": 540
},
{
"epoch": 0.04509305566942691,
"grad_norm": 7.478984355926514,
"learning_rate": 7.345004448887478e-06,
"loss": 0.2974,
"step": 550
},
{
"epoch": 0.04591292940887103,
"grad_norm": 9.254852294921875,
"learning_rate": 7.3380897673564085e-06,
"loss": 0.3126,
"step": 560
},
{
"epoch": 0.04673280314831516,
"grad_norm": 13.706809997558594,
"learning_rate": 7.33102759350859e-06,
"loss": 0.3018,
"step": 570
},
{
"epoch": 0.047552676887759285,
"grad_norm": 16.57872200012207,
"learning_rate": 7.323818217642328e-06,
"loss": 0.2904,
"step": 580
},
{
"epoch": 0.04837255062720341,
"grad_norm": 14.819424629211426,
"learning_rate": 7.316461936106827e-06,
"loss": 0.2855,
"step": 590
},
{
"epoch": 0.04919242436664754,
"grad_norm": 17.543973922729492,
"learning_rate": 7.3089590512900084e-06,
"loss": 0.3169,
"step": 600
},
{
"epoch": 0.04919242436664754,
"eval_loss": 0.2996714413166046,
"eval_runtime": 58.2745,
"eval_samples_per_second": 8.58,
"eval_steps_per_second": 8.58,
"step": 600
},
{
"epoch": 0.05001229810609166,
"grad_norm": 10.767305374145508,
"learning_rate": 7.301309871606081e-06,
"loss": 0.3011,
"step": 610
},
{
"epoch": 0.05083217184553579,
"grad_norm": 6.571865081787109,
"learning_rate": 7.293514711482861e-06,
"loss": 0.2783,
"step": 620
},
{
"epoch": 0.05165204558497991,
"grad_norm": 12.295404434204102,
"learning_rate": 7.285573891348849e-06,
"loss": 0.2829,
"step": 630
},
{
"epoch": 0.05247191932442404,
"grad_norm": 12.576509475708008,
"learning_rate": 7.27748773762006e-06,
"loss": 0.3021,
"step": 640
},
{
"epoch": 0.053291793063868165,
"grad_norm": 7.258118629455566,
"learning_rate": 7.269256582686603e-06,
"loss": 0.3041,
"step": 650
},
{
"epoch": 0.05411166680331229,
"grad_norm": 14.7495756149292,
"learning_rate": 7.260880764899016e-06,
"loss": 0.285,
"step": 660
},
{
"epoch": 0.05493154054275642,
"grad_norm": 18.141632080078125,
"learning_rate": 7.252360628554363e-06,
"loss": 0.2916,
"step": 670
},
{
"epoch": 0.05575141428220054,
"grad_norm": 18.141878128051758,
"learning_rate": 7.243696523882079e-06,
"loss": 0.3007,
"step": 680
},
{
"epoch": 0.05657128802164467,
"grad_norm": 13.596381187438965,
"learning_rate": 7.2348888070295705e-06,
"loss": 0.2627,
"step": 690
},
{
"epoch": 0.05739116176108879,
"grad_norm": 14.028800964355469,
"learning_rate": 7.225937840047583e-06,
"loss": 0.2959,
"step": 700
},
{
"epoch": 0.058211035500532915,
"grad_norm": 19.28914451599121,
"learning_rate": 7.216843990875307e-06,
"loss": 0.3088,
"step": 710
},
{
"epoch": 0.059030909239977045,
"grad_norm": 10.676041603088379,
"learning_rate": 7.207607633325266e-06,
"loss": 0.2762,
"step": 720
},
{
"epoch": 0.05985078297942117,
"grad_norm": 9.311237335205078,
"learning_rate": 7.198229147067941e-06,
"loss": 0.313,
"step": 730
},
{
"epoch": 0.0606706567188653,
"grad_norm": 12.335597038269043,
"learning_rate": 7.18870891761617e-06,
"loss": 0.2797,
"step": 740
},
{
"epoch": 0.06149053045830942,
"grad_norm": 11.885544776916504,
"learning_rate": 7.1790473363092974e-06,
"loss": 0.2681,
"step": 750
},
{
"epoch": 0.06149053045830942,
"eval_loss": 0.3024304211139679,
"eval_runtime": 57.0493,
"eval_samples_per_second": 8.764,
"eval_steps_per_second": 8.764,
"step": 750
},
{
"epoch": 0.06231040419775354,
"grad_norm": 12.44359016418457,
"learning_rate": 7.169244800297089e-06,
"loss": 0.311,
"step": 760
},
{
"epoch": 0.06313027793719767,
"grad_norm": 18.710712432861328,
"learning_rate": 7.159301712523407e-06,
"loss": 0.2949,
"step": 770
},
{
"epoch": 0.0639501516766418,
"grad_norm": 9.658717155456543,
"learning_rate": 7.149218481709644e-06,
"loss": 0.2852,
"step": 780
},
{
"epoch": 0.06477002541608592,
"grad_norm": 10.276803970336914,
"learning_rate": 7.1389955223379266e-06,
"loss": 0.2818,
"step": 790
},
{
"epoch": 0.06558989915553005,
"grad_norm": 13.862250328063965,
"learning_rate": 7.128633254634072e-06,
"loss": 0.2834,
"step": 800
},
{
"epoch": 0.06640977289497417,
"grad_norm": 17.020177841186523,
"learning_rate": 7.118132104550322e-06,
"loss": 0.2677,
"step": 810
},
{
"epoch": 0.0672296466344183,
"grad_norm": 18.547590255737305,
"learning_rate": 7.107492503747826e-06,
"loss": 0.2898,
"step": 820
},
{
"epoch": 0.06804952037386243,
"grad_norm": 15.957967758178711,
"learning_rate": 7.096714889578898e-06,
"loss": 0.326,
"step": 830
},
{
"epoch": 0.06886939411330655,
"grad_norm": 24.1992130279541,
"learning_rate": 7.085799705069046e-06,
"loss": 0.2677,
"step": 840
},
{
"epoch": 0.06968926785275067,
"grad_norm": 12.799731254577637,
"learning_rate": 7.0747473988987515e-06,
"loss": 0.2806,
"step": 850
},
{
"epoch": 0.0705091415921948,
"grad_norm": 18.750246047973633,
"learning_rate": 7.063558425385033e-06,
"loss": 0.2937,
"step": 860
},
{
"epoch": 0.07132901533163893,
"grad_norm": 13.083860397338867,
"learning_rate": 7.052233244462769e-06,
"loss": 0.2957,
"step": 870
},
{
"epoch": 0.07214888907108305,
"grad_norm": 11.227791786193848,
"learning_rate": 7.040772321665788e-06,
"loss": 0.2855,
"step": 880
},
{
"epoch": 0.07296876281052718,
"grad_norm": 8.911324501037598,
"learning_rate": 7.029176128107734e-06,
"loss": 0.3105,
"step": 890
},
{
"epoch": 0.07378863654997131,
"grad_norm": 17.020790100097656,
"learning_rate": 7.017445140462711e-06,
"loss": 0.2728,
"step": 900
},
{
"epoch": 0.07378863654997131,
"eval_loss": 0.2869480550289154,
"eval_runtime": 58.9095,
"eval_samples_per_second": 8.488,
"eval_steps_per_second": 8.488,
"step": 900
},
{
"epoch": 0.07460851028941543,
"grad_norm": 14.960102081298828,
"learning_rate": 7.00557984094567e-06,
"loss": 0.2955,
"step": 910
},
{
"epoch": 0.07542838402885955,
"grad_norm": 8.271307945251465,
"learning_rate": 6.993580717292601e-06,
"loss": 0.2666,
"step": 920
},
{
"epoch": 0.07624825776830368,
"grad_norm": 8.779189109802246,
"learning_rate": 6.981448262740483e-06,
"loss": 0.2938,
"step": 930
},
{
"epoch": 0.0770681315077478,
"grad_norm": 9.497313499450684,
"learning_rate": 6.969182976006999e-06,
"loss": 0.2875,
"step": 940
},
{
"epoch": 0.07788800524719193,
"grad_norm": 13.439544677734375,
"learning_rate": 6.95678536127005e-06,
"loss": 0.2893,
"step": 950
},
{
"epoch": 0.07870787898663606,
"grad_norm": 10.986952781677246,
"learning_rate": 6.944255928147017e-06,
"loss": 0.29,
"step": 960
},
{
"epoch": 0.07952775272608019,
"grad_norm": 14.666671752929688,
"learning_rate": 6.931595191673823e-06,
"loss": 0.2798,
"step": 970
},
{
"epoch": 0.0803476264655243,
"grad_norm": 9.045489311218262,
"learning_rate": 6.9188036722837555e-06,
"loss": 0.2526,
"step": 980
},
{
"epoch": 0.08116750020496843,
"grad_norm": 12.083099365234375,
"learning_rate": 6.905881895786076e-06,
"loss": 0.2825,
"step": 990
},
{
"epoch": 0.08198737394441256,
"grad_norm": 20.973670959472656,
"learning_rate": 6.892830393344403e-06,
"loss": 0.2703,
"step": 1000
},
{
"epoch": 0.08280724768385668,
"grad_norm": 12.959758758544922,
"learning_rate": 6.879649701454886e-06,
"loss": 0.2766,
"step": 1010
},
{
"epoch": 0.08362712142330081,
"grad_norm": 11.118098258972168,
"learning_rate": 6.866340361924141e-06,
"loss": 0.2927,
"step": 1020
},
{
"epoch": 0.08444699516274494,
"grad_norm": 12.703455924987793,
"learning_rate": 6.852902921846988e-06,
"loss": 0.2468,
"step": 1030
},
{
"epoch": 0.08526686890218907,
"grad_norm": 33.15513229370117,
"learning_rate": 6.8393379335839565e-06,
"loss": 0.2845,
"step": 1040
},
{
"epoch": 0.08608674264163318,
"grad_norm": 12.013687133789062,
"learning_rate": 6.825645954738586e-06,
"loss": 0.2879,
"step": 1050
},
{
"epoch": 0.08608674264163318,
"eval_loss": 0.2693183720111847,
"eval_runtime": 56.9849,
"eval_samples_per_second": 8.774,
"eval_steps_per_second": 8.774,
"step": 1050
},
{
"epoch": 0.08690661638107731,
"grad_norm": 10.128811836242676,
"learning_rate": 6.811827548134495e-06,
"loss": 0.2873,
"step": 1060
},
{
"epoch": 0.08772649012052144,
"grad_norm": 10.001947402954102,
"learning_rate": 6.797883281792261e-06,
"loss": 0.2931,
"step": 1070
},
{
"epoch": 0.08854636385996556,
"grad_norm": 13.15841293334961,
"learning_rate": 6.783813728906054e-06,
"loss": 0.3,
"step": 1080
},
{
"epoch": 0.08936623759940969,
"grad_norm": 8.157013893127441,
"learning_rate": 6.769619467820086e-06,
"loss": 0.2692,
"step": 1090
},
{
"epoch": 0.09018611133885382,
"grad_norm": 8.676292419433594,
"learning_rate": 6.755301082004838e-06,
"loss": 0.3111,
"step": 1100
},
{
"epoch": 0.09100598507829795,
"grad_norm": 14.835556030273438,
"learning_rate": 6.740859160033068e-06,
"loss": 0.2932,
"step": 1110
},
{
"epoch": 0.09182585881774206,
"grad_norm": 14.752832412719727,
"learning_rate": 6.726294295555623e-06,
"loss": 0.2942,
"step": 1120
},
{
"epoch": 0.0926457325571862,
"grad_norm": 9.42294979095459,
"learning_rate": 6.711607087277034e-06,
"loss": 0.2807,
"step": 1130
},
{
"epoch": 0.09346560629663032,
"grad_norm": 6.576030731201172,
"learning_rate": 6.69679813893091e-06,
"loss": 0.2656,
"step": 1140
},
{
"epoch": 0.09428548003607444,
"grad_norm": 14.54617977142334,
"learning_rate": 6.681868059255113e-06,
"loss": 0.2708,
"step": 1150
},
{
"epoch": 0.09510535377551857,
"grad_norm": 19.004695892333984,
"learning_rate": 6.666817461966741e-06,
"loss": 0.2974,
"step": 1160
},
{
"epoch": 0.0959252275149627,
"grad_norm": 13.359691619873047,
"learning_rate": 6.651646965736902e-06,
"loss": 0.2641,
"step": 1170
},
{
"epoch": 0.09674510125440682,
"grad_norm": 9.031187057495117,
"learning_rate": 6.636357194165274e-06,
"loss": 0.2794,
"step": 1180
},
{
"epoch": 0.09756497499385094,
"grad_norm": 11.242755889892578,
"learning_rate": 6.620948775754481e-06,
"loss": 0.2708,
"step": 1190
},
{
"epoch": 0.09838484873329507,
"grad_norm": 9.727982521057129,
"learning_rate": 6.605422343884255e-06,
"loss": 0.2936,
"step": 1200
},
{
"epoch": 0.09838484873329507,
"eval_loss": 0.2741548418998718,
"eval_runtime": 56.2393,
"eval_samples_per_second": 8.891,
"eval_steps_per_second": 8.891,
"step": 1200
},
{
"epoch": 0.0992047224727392,
"grad_norm": 11.938862800598145,
"learning_rate": 6.589778536785396e-06,
"loss": 0.2776,
"step": 1210
},
{
"epoch": 0.10002459621218332,
"grad_norm": 9.253863334655762,
"learning_rate": 6.5740179975135426e-06,
"loss": 0.2695,
"step": 1220
},
{
"epoch": 0.10084446995162745,
"grad_norm": 13.18783950805664,
"learning_rate": 6.5581413739227314e-06,
"loss": 0.2863,
"step": 1230
},
{
"epoch": 0.10166434369107158,
"grad_norm": 10.108220100402832,
"learning_rate": 6.542149318638777e-06,
"loss": 0.2831,
"step": 1240
},
{
"epoch": 0.1024842174305157,
"grad_norm": 13.539487838745117,
"learning_rate": 6.526042489032434e-06,
"loss": 0.2626,
"step": 1250
},
{
"epoch": 0.10330409116995982,
"grad_norm": 9.928237915039062,
"learning_rate": 6.509821547192383e-06,
"loss": 0.2706,
"step": 1260
},
{
"epoch": 0.10412396490940395,
"grad_norm": 10.978721618652344,
"learning_rate": 6.493487159898006e-06,
"loss": 0.2695,
"step": 1270
},
{
"epoch": 0.10494383864884808,
"grad_norm": 9.98459243774414,
"learning_rate": 6.477039998591991e-06,
"loss": 0.2801,
"step": 1280
},
{
"epoch": 0.1057637123882922,
"grad_norm": 12.930992126464844,
"learning_rate": 6.460480739352719e-06,
"loss": 0.2842,
"step": 1290
},
{
"epoch": 0.10658358612773633,
"grad_norm": 12.851746559143066,
"learning_rate": 6.4438100628664795e-06,
"loss": 0.2635,
"step": 1300
},
{
"epoch": 0.10740345986718046,
"grad_norm": 10.791857719421387,
"learning_rate": 6.4270286543994874e-06,
"loss": 0.2947,
"step": 1310
},
{
"epoch": 0.10822333360662457,
"grad_norm": 9.770176887512207,
"learning_rate": 6.410137203769718e-06,
"loss": 0.2606,
"step": 1320
},
{
"epoch": 0.1090432073460687,
"grad_norm": 17.897979736328125,
"learning_rate": 6.393136405318545e-06,
"loss": 0.2868,
"step": 1330
},
{
"epoch": 0.10986308108551283,
"grad_norm": 19.892559051513672,
"learning_rate": 6.376026957882207e-06,
"loss": 0.2605,
"step": 1340
},
{
"epoch": 0.11068295482495695,
"grad_norm": 9.193521499633789,
"learning_rate": 6.3588095647630754e-06,
"loss": 0.2454,
"step": 1350
},
{
"epoch": 0.11068295482495695,
"eval_loss": 0.2674501836299896,
"eval_runtime": 56.3954,
"eval_samples_per_second": 8.866,
"eval_steps_per_second": 8.866,
"step": 1350
},
{
"epoch": 0.11150282856440108,
"grad_norm": 15.698138236999512,
"learning_rate": 6.341484933700744e-06,
"loss": 0.2639,
"step": 1360
},
{
"epoch": 0.11232270230384521,
"grad_norm": 11.653697967529297,
"learning_rate": 6.32405377684294e-06,
"loss": 0.2711,
"step": 1370
},
{
"epoch": 0.11314257604328934,
"grad_norm": 10.41117000579834,
"learning_rate": 6.306516810716249e-06,
"loss": 0.274,
"step": 1380
},
{
"epoch": 0.11396244978273345,
"grad_norm": 17.14838981628418,
"learning_rate": 6.288874756196662e-06,
"loss": 0.2919,
"step": 1390
},
{
"epoch": 0.11478232352217758,
"grad_norm": 12.094561576843262,
"learning_rate": 6.271128338479939e-06,
"loss": 0.272,
"step": 1400
},
{
"epoch": 0.11560219726162171,
"grad_norm": 7.186673641204834,
"learning_rate": 6.253278287051806e-06,
"loss": 0.2614,
"step": 1410
},
{
"epoch": 0.11642207100106583,
"grad_norm": 27.63665008544922,
"learning_rate": 6.235325335657962e-06,
"loss": 0.2581,
"step": 1420
},
{
"epoch": 0.11724194474050996,
"grad_norm": 9.12143611907959,
"learning_rate": 6.217270222273923e-06,
"loss": 0.2497,
"step": 1430
},
{
"epoch": 0.11806181847995409,
"grad_norm": 10.814976692199707,
"learning_rate": 6.1991136890746825e-06,
"loss": 0.2659,
"step": 1440
},
{
"epoch": 0.11888169221939822,
"grad_norm": 13.897311210632324,
"learning_rate": 6.180856482404208e-06,
"loss": 0.2575,
"step": 1450
},
{
"epoch": 0.11970156595884233,
"grad_norm": 14.34624195098877,
"learning_rate": 6.162499352744754e-06,
"loss": 0.276,
"step": 1460
},
{
"epoch": 0.12052143969828646,
"grad_norm": 15.839101791381836,
"learning_rate": 6.144043054686022e-06,
"loss": 0.267,
"step": 1470
},
{
"epoch": 0.1213413134377306,
"grad_norm": 13.110719680786133,
"learning_rate": 6.125488346894139e-06,
"loss": 0.2777,
"step": 1480
},
{
"epoch": 0.12216118717717471,
"grad_norm": 11.638336181640625,
"learning_rate": 6.106835992080464e-06,
"loss": 0.2454,
"step": 1490
},
{
"epoch": 0.12298106091661884,
"grad_norm": 12.756601333618164,
"learning_rate": 6.088086756970252e-06,
"loss": 0.2605,
"step": 1500
},
{
"epoch": 0.12298106091661884,
"eval_loss": 0.2679287791252136,
"eval_runtime": 56.0794,
"eval_samples_per_second": 8.916,
"eval_steps_per_second": 8.916,
"step": 1500
},
{
"epoch": 0.12380093465606297,
"grad_norm": 20.72138214111328,
"learning_rate": 6.0692414122711184e-06,
"loss": 0.2593,
"step": 1510
},
{
"epoch": 0.12462080839550708,
"grad_norm": 9.595439910888672,
"learning_rate": 6.050300732641376e-06,
"loss": 0.2719,
"step": 1520
},
{
"epoch": 0.12544068213495121,
"grad_norm": 16.999011993408203,
"learning_rate": 6.0312654966581755e-06,
"loss": 0.2885,
"step": 1530
},
{
"epoch": 0.12626055587439533,
"grad_norm": 14.768747329711914,
"learning_rate": 6.012136486785512e-06,
"loss": 0.2702,
"step": 1540
},
{
"epoch": 0.12708042961383947,
"grad_norm": 8.815911293029785,
"learning_rate": 5.992914489342061e-06,
"loss": 0.2507,
"step": 1550
},
{
"epoch": 0.1279003033532836,
"grad_norm": 20.083023071289062,
"learning_rate": 5.9736002944688474e-06,
"loss": 0.2632,
"step": 1560
},
{
"epoch": 0.12872017709272773,
"grad_norm": 17.51641082763672,
"learning_rate": 5.954194696096775e-06,
"loss": 0.2937,
"step": 1570
},
{
"epoch": 0.12954005083217185,
"grad_norm": 9.186761856079102,
"learning_rate": 5.9346984919139865e-06,
"loss": 0.2611,
"step": 1580
},
{
"epoch": 0.13035992457161596,
"grad_norm": 13.085734367370605,
"learning_rate": 5.9151124833330745e-06,
"loss": 0.2507,
"step": 1590
},
{
"epoch": 0.1311797983110601,
"grad_norm": 13.729114532470703,
"learning_rate": 5.895437475458137e-06,
"loss": 0.2774,
"step": 1600
},
{
"epoch": 0.13199967205050422,
"grad_norm": 19.03725242614746,
"learning_rate": 5.875674277051688e-06,
"loss": 0.2687,
"step": 1610
},
{
"epoch": 0.13281954578994834,
"grad_norm": 15.545515060424805,
"learning_rate": 5.855823700501406e-06,
"loss": 0.2765,
"step": 1620
},
{
"epoch": 0.13363941952939248,
"grad_norm": 11.668421745300293,
"learning_rate": 5.835886561786744e-06,
"loss": 0.2682,
"step": 1630
},
{
"epoch": 0.1344592932688366,
"grad_norm": 8.778451919555664,
"learning_rate": 5.815863680445385e-06,
"loss": 0.2347,
"step": 1640
},
{
"epoch": 0.13527916700828072,
"grad_norm": 5.889225959777832,
"learning_rate": 5.795755879539558e-06,
"loss": 0.2709,
"step": 1650
},
{
"epoch": 0.13527916700828072,
"eval_loss": 0.25923365354537964,
"eval_runtime": 56.2341,
"eval_samples_per_second": 8.891,
"eval_steps_per_second": 8.891,
"step": 1650
},
{
"epoch": 0.13609904074772486,
"grad_norm": 12.518867492675781,
"learning_rate": 5.775563985622202e-06,
"loss": 0.2833,
"step": 1660
},
{
"epoch": 0.13691891448716897,
"grad_norm": 14.924880027770996,
"learning_rate": 5.755288828702987e-06,
"loss": 0.2863,
"step": 1670
},
{
"epoch": 0.1377387882266131,
"grad_norm": 16.47811508178711,
"learning_rate": 5.734931242214204e-06,
"loss": 0.2596,
"step": 1680
},
{
"epoch": 0.13855866196605723,
"grad_norm": 13.941671371459961,
"learning_rate": 5.7144920629764955e-06,
"loss": 0.2819,
"step": 1690
},
{
"epoch": 0.13937853570550135,
"grad_norm": 16.261932373046875,
"learning_rate": 5.693972131164471e-06,
"loss": 0.303,
"step": 1700
},
{
"epoch": 0.14019840944494547,
"grad_norm": 12.289247512817383,
"learning_rate": 5.673372290272149e-06,
"loss": 0.2855,
"step": 1710
},
{
"epoch": 0.1410182831843896,
"grad_norm": 8.7142915725708,
"learning_rate": 5.652693387078309e-06,
"loss": 0.2615,
"step": 1720
},
{
"epoch": 0.14183815692383372,
"grad_norm": 16.864688873291016,
"learning_rate": 5.631936271611667e-06,
"loss": 0.2813,
"step": 1730
},
{
"epoch": 0.14265803066327787,
"grad_norm": 16.40870475769043,
"learning_rate": 5.611101797115939e-06,
"loss": 0.275,
"step": 1740
},
{
"epoch": 0.14347790440272198,
"grad_norm": 14.436688423156738,
"learning_rate": 5.5901908200147685e-06,
"loss": 0.2788,
"step": 1750
},
{
"epoch": 0.1442977781421661,
"grad_norm": 11.943658828735352,
"learning_rate": 5.56920419987652e-06,
"loss": 0.2805,
"step": 1760
},
{
"epoch": 0.14511765188161024,
"grad_norm": 14.252999305725098,
"learning_rate": 5.5481427993789534e-06,
"loss": 0.2806,
"step": 1770
},
{
"epoch": 0.14593752562105436,
"grad_norm": 11.182486534118652,
"learning_rate": 5.527007484273746e-06,
"loss": 0.2675,
"step": 1780
},
{
"epoch": 0.14675739936049848,
"grad_norm": 12.846651077270508,
"learning_rate": 5.5057991233509225e-06,
"loss": 0.2744,
"step": 1790
},
{
"epoch": 0.14757727309994262,
"grad_norm": 9.701010704040527,
"learning_rate": 5.484518588403134e-06,
"loss": 0.2808,
"step": 1800
},
{
"epoch": 0.14757727309994262,
"eval_loss": 0.2612378001213074,
"eval_runtime": 57.022,
"eval_samples_per_second": 8.769,
"eval_steps_per_second": 8.769,
"step": 1800
},
{
"epoch": 0.14839714683938673,
"grad_norm": 7.793675422668457,
"learning_rate": 5.463166754189819e-06,
"loss": 0.27,
"step": 1810
},
{
"epoch": 0.14921702057883085,
"grad_norm": 13.162193298339844,
"learning_rate": 5.441744498401255e-06,
"loss": 0.2574,
"step": 1820
},
{
"epoch": 0.150036894318275,
"grad_norm": 15.428301811218262,
"learning_rate": 5.4202527016224725e-06,
"loss": 0.2675,
"step": 1830
},
{
"epoch": 0.1508567680577191,
"grad_norm": 24.684080123901367,
"learning_rate": 5.398692247297059e-06,
"loss": 0.2916,
"step": 1840
},
{
"epoch": 0.15167664179716323,
"grad_norm": 7.947139263153076,
"learning_rate": 5.377064021690844e-06,
"loss": 0.2841,
"step": 1850
},
{
"epoch": 0.15249651553660737,
"grad_norm": 11.595500946044922,
"learning_rate": 5.355368913855472e-06,
"loss": 0.2562,
"step": 1860
},
{
"epoch": 0.15331638927605148,
"grad_norm": 11.803101539611816,
"learning_rate": 5.333607815591851e-06,
"loss": 0.2292,
"step": 1870
},
{
"epoch": 0.1541362630154956,
"grad_norm": 17.95461654663086,
"learning_rate": 5.311781621413497e-06,
"loss": 0.2787,
"step": 1880
},
{
"epoch": 0.15495613675493974,
"grad_norm": 25.276002883911133,
"learning_rate": 5.289891228509769e-06,
"loss": 0.2889,
"step": 1890
},
{
"epoch": 0.15577601049438386,
"grad_norm": 8.79496955871582,
"learning_rate": 5.267937536708977e-06,
"loss": 0.2667,
"step": 1900
},
{
"epoch": 0.156595884233828,
"grad_norm": 10.413036346435547,
"learning_rate": 5.245921448441407e-06,
"loss": 0.2823,
"step": 1910
},
{
"epoch": 0.15741575797327212,
"grad_norm": 11.163688659667969,
"learning_rate": 5.223843868702214e-06,
"loss": 0.2655,
"step": 1920
},
{
"epoch": 0.15823563171271623,
"grad_norm": 16.093170166015625,
"learning_rate": 5.201705705014231e-06,
"loss": 0.2709,
"step": 1930
},
{
"epoch": 0.15905550545216038,
"grad_norm": 18.966991424560547,
"learning_rate": 5.1795078673906575e-06,
"loss": 0.2593,
"step": 1940
},
{
"epoch": 0.1598753791916045,
"grad_norm": 12.139580726623535,
"learning_rate": 5.1572512682976546e-06,
"loss": 0.2602,
"step": 1950
},
{
"epoch": 0.1598753791916045,
"eval_loss": 0.2535741329193115,
"eval_runtime": 56.9513,
"eval_samples_per_second": 8.779,
"eval_steps_per_second": 8.779,
"step": 1950
},
{
"epoch": 0.1606952529310486,
"grad_norm": 17.421117782592773,
"learning_rate": 5.134936822616837e-06,
"loss": 0.2507,
"step": 1960
},
{
"epoch": 0.16151512667049275,
"grad_norm": 8.096160888671875,
"learning_rate": 5.112565447607669e-06,
"loss": 0.2405,
"step": 1970
},
{
"epoch": 0.16233500040993687,
"grad_norm": 10.138191223144531,
"learning_rate": 5.090138062869755e-06,
"loss": 0.2435,
"step": 1980
},
{
"epoch": 0.16315487414938099,
"grad_norm": 32.244873046875,
"learning_rate": 5.067655590305036e-06,
"loss": 0.2546,
"step": 1990
},
{
"epoch": 0.16397474788882513,
"grad_norm": 11.093918800354004,
"learning_rate": 5.045118954079904e-06,
"loss": 0.2595,
"step": 2000
},
{
"epoch": 0.16479462162826924,
"grad_norm": 11.482741355895996,
"learning_rate": 5.022529080587205e-06,
"loss": 0.2294,
"step": 2010
},
{
"epoch": 0.16561449536771336,
"grad_norm": 13.456998825073242,
"learning_rate": 4.999886898408157e-06,
"loss": 0.2556,
"step": 2020
},
{
"epoch": 0.1664343691071575,
"grad_norm": 11.575148582458496,
"learning_rate": 4.977193338274189e-06,
"loss": 0.2538,
"step": 2030
},
{
"epoch": 0.16725424284660162,
"grad_norm": 12.712217330932617,
"learning_rate": 4.954449333028672e-06,
"loss": 0.2985,
"step": 2040
},
{
"epoch": 0.16807411658604574,
"grad_norm": 25.477855682373047,
"learning_rate": 4.931655817588579e-06,
"loss": 0.2516,
"step": 2050
},
{
"epoch": 0.16889399032548988,
"grad_norm": 17.030961990356445,
"learning_rate": 4.9088137289060535e-06,
"loss": 0.2544,
"step": 2060
},
{
"epoch": 0.169713864064934,
"grad_norm": 10.903443336486816,
"learning_rate": 4.885924005929896e-06,
"loss": 0.2581,
"step": 2070
},
{
"epoch": 0.17053373780437814,
"grad_norm": 9.746002197265625,
"learning_rate": 4.862987589566965e-06,
"loss": 0.2332,
"step": 2080
},
{
"epoch": 0.17135361154382225,
"grad_norm": 14.084914207458496,
"learning_rate": 4.840005422643503e-06,
"loss": 0.2643,
"step": 2090
},
{
"epoch": 0.17217348528326637,
"grad_norm": 9.59061336517334,
"learning_rate": 4.816978449866372e-06,
"loss": 0.2461,
"step": 2100
},
{
"epoch": 0.17217348528326637,
"eval_loss": 0.2557007670402527,
"eval_runtime": 56.7258,
"eval_samples_per_second": 8.814,
"eval_steps_per_second": 8.814,
"step": 2100
},
{
"epoch": 0.1729933590227105,
"grad_norm": 12.96509075164795,
"learning_rate": 4.793907617784238e-06,
"loss": 0.2623,
"step": 2110
},
{
"epoch": 0.17381323276215463,
"grad_norm": 21.171913146972656,
"learning_rate": 4.770793874748642e-06,
"loss": 0.2481,
"step": 2120
},
{
"epoch": 0.17463310650159874,
"grad_norm": 15.18250560760498,
"learning_rate": 4.747638170875032e-06,
"loss": 0.2644,
"step": 2130
},
{
"epoch": 0.1754529802410429,
"grad_norm": 13.478678703308105,
"learning_rate": 4.724441458003699e-06,
"loss": 0.2548,
"step": 2140
},
{
"epoch": 0.176272853980487,
"grad_norm": 7.877747535705566,
"learning_rate": 4.701204689660653e-06,
"loss": 0.2468,
"step": 2150
},
{
"epoch": 0.17709272771993112,
"grad_norm": 14.340051651000977,
"learning_rate": 4.67792882101843e-06,
"loss": 0.2652,
"step": 2160
},
{
"epoch": 0.17791260145937526,
"grad_norm": 11.43173885345459,
"learning_rate": 4.654614808856823e-06,
"loss": 0.245,
"step": 2170
},
{
"epoch": 0.17873247519881938,
"grad_norm": 16.191015243530273,
"learning_rate": 4.631263611523557e-06,
"loss": 0.2561,
"step": 2180
},
{
"epoch": 0.1795523489382635,
"grad_norm": 14.481834411621094,
"learning_rate": 4.607876188894896e-06,
"loss": 0.2783,
"step": 2190
},
{
"epoch": 0.18037222267770764,
"grad_norm": 12.716588973999023,
"learning_rate": 4.58445350233618e-06,
"loss": 0.2526,
"step": 2200
},
{
"epoch": 0.18119209641715175,
"grad_norm": 16.625707626342773,
"learning_rate": 4.560996514662314e-06,
"loss": 0.2386,
"step": 2210
},
{
"epoch": 0.1820119701565959,
"grad_norm": 15.23642635345459,
"learning_rate": 4.5375061900981855e-06,
"loss": 0.2522,
"step": 2220
},
{
"epoch": 0.18283184389604,
"grad_norm": 22.573617935180664,
"learning_rate": 4.513983494239034e-06,
"loss": 0.2605,
"step": 2230
},
{
"epoch": 0.18365171763548413,
"grad_norm": 16.085651397705078,
"learning_rate": 4.490429394010752e-06,
"loss": 0.2811,
"step": 2240
},
{
"epoch": 0.18447159137492827,
"grad_norm": 23.764911651611328,
"learning_rate": 4.466844857630147e-06,
"loss": 0.2495,
"step": 2250
},
{
"epoch": 0.18447159137492827,
"eval_loss": 0.2652283310890198,
"eval_runtime": 56.3594,
"eval_samples_per_second": 8.872,
"eval_steps_per_second": 8.872,
"step": 2250
},
{
"epoch": 0.1852914651143724,
"grad_norm": 17.39873504638672,
"learning_rate": 4.443230854565133e-06,
"loss": 0.2562,
"step": 2260
},
{
"epoch": 0.1861113388538165,
"grad_norm": 11.883243560791016,
"learning_rate": 4.4195883554948885e-06,
"loss": 0.2777,
"step": 2270
},
{
"epoch": 0.18693121259326065,
"grad_norm": 8.622486114501953,
"learning_rate": 4.3959183322699466e-06,
"loss": 0.2272,
"step": 2280
},
{
"epoch": 0.18775108633270476,
"grad_norm": 16.060256958007812,
"learning_rate": 4.372221757872255e-06,
"loss": 0.2388,
"step": 2290
},
{
"epoch": 0.18857096007214888,
"grad_norm": 9.97546100616455,
"learning_rate": 4.3484996063751725e-06,
"loss": 0.2736,
"step": 2300
},
{
"epoch": 0.18939083381159302,
"grad_norm": 11.587379455566406,
"learning_rate": 4.324752852903435e-06,
"loss": 0.2321,
"step": 2310
},
{
"epoch": 0.19021070755103714,
"grad_norm": 134.054931640625,
"learning_rate": 4.300982473593068e-06,
"loss": 0.2583,
"step": 2320
},
{
"epoch": 0.19103058129048126,
"grad_norm": 15.653196334838867,
"learning_rate": 4.277189445551261e-06,
"loss": 0.2702,
"step": 2330
},
{
"epoch": 0.1918504550299254,
"grad_norm": 14.868865966796875,
"learning_rate": 4.253374746816209e-06,
"loss": 0.2749,
"step": 2340
},
{
"epoch": 0.19267032876936951,
"grad_norm": 18.965742111206055,
"learning_rate": 4.229539356316898e-06,
"loss": 0.2635,
"step": 2350
},
{
"epoch": 0.19349020250881363,
"grad_norm": 21.16566276550293,
"learning_rate": 4.205684253832877e-06,
"loss": 0.2366,
"step": 2360
},
{
"epoch": 0.19431007624825777,
"grad_norm": 9.739816665649414,
"learning_rate": 4.1818104199539735e-06,
"loss": 0.2507,
"step": 2370
},
{
"epoch": 0.1951299499877019,
"grad_norm": 9.094308853149414,
"learning_rate": 4.1579188360399916e-06,
"loss": 0.2508,
"step": 2380
},
{
"epoch": 0.19594982372714603,
"grad_norm": 13.532063484191895,
"learning_rate": 4.134010484180368e-06,
"loss": 0.2432,
"step": 2390
},
{
"epoch": 0.19676969746659015,
"grad_norm": 10.089424133300781,
"learning_rate": 4.110086347153807e-06,
"loss": 0.2496,
"step": 2400
},
{
"epoch": 0.19676969746659015,
"eval_loss": 0.24164016544818878,
"eval_runtime": 58.2028,
"eval_samples_per_second": 8.591,
"eval_steps_per_second": 8.591,
"step": 2400
},
{
"epoch": 0.19758957120603426,
"grad_norm": 14.62680721282959,
"learning_rate": 4.0861474083878765e-06,
"loss": 0.2585,
"step": 2410
},
{
"epoch": 0.1984094449454784,
"grad_norm": 22.528297424316406,
"learning_rate": 4.062194651918585e-06,
"loss": 0.2341,
"step": 2420
},
{
"epoch": 0.19922931868492252,
"grad_norm": 11.753854751586914,
"learning_rate": 4.0382290623499384e-06,
"loss": 0.2953,
"step": 2430
},
{
"epoch": 0.20004919242436664,
"grad_norm": 16.247995376586914,
"learning_rate": 4.014251624813453e-06,
"loss": 0.2657,
"step": 2440
},
{
"epoch": 0.20086906616381078,
"grad_norm": 15.834903717041016,
"learning_rate": 3.990263324927675e-06,
"loss": 0.2341,
"step": 2450
},
{
"epoch": 0.2016889399032549,
"grad_norm": 6.7929887771606445,
"learning_rate": 3.966265148757655e-06,
"loss": 0.2355,
"step": 2460
},
{
"epoch": 0.20250881364269901,
"grad_norm": 35.777835845947266,
"learning_rate": 3.9422580827744224e-06,
"loss": 0.2329,
"step": 2470
},
{
"epoch": 0.20332868738214316,
"grad_norm": 15.361977577209473,
"learning_rate": 3.9182431138144315e-06,
"loss": 0.2515,
"step": 2480
},
{
"epoch": 0.20414856112158727,
"grad_norm": 10.340039253234863,
"learning_rate": 3.894221229038995e-06,
"loss": 0.2397,
"step": 2490
},
{
"epoch": 0.2049684348610314,
"grad_norm": 15.93770980834961,
"learning_rate": 3.870193415893709e-06,
"loss": 0.2432,
"step": 2500
},
{
"epoch": 0.20578830860047553,
"grad_norm": 19.398086547851562,
"learning_rate": 3.846160662067859e-06,
"loss": 0.2471,
"step": 2510
},
{
"epoch": 0.20660818233991965,
"grad_norm": 7.482428550720215,
"learning_rate": 3.8221239554538275e-06,
"loss": 0.2498,
"step": 2520
},
{
"epoch": 0.20742805607936377,
"grad_norm": 7.209218502044678,
"learning_rate": 3.798084284106478e-06,
"loss": 0.263,
"step": 2530
},
{
"epoch": 0.2082479298188079,
"grad_norm": 7.973605155944824,
"learning_rate": 3.7740426362025424e-06,
"loss": 0.2182,
"step": 2540
},
{
"epoch": 0.20906780355825202,
"grad_norm": 17.178762435913086,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.2368,
"step": 2550
},
{
"epoch": 0.20906780355825202,
"eval_loss": 0.24929100275039673,
"eval_runtime": 56.544,
"eval_samples_per_second": 8.843,
"eval_steps_per_second": 8.843,
"step": 2550
},
{
"epoch": 0.20988767729769617,
"grad_norm": 19.6829776763916,
"learning_rate": 3.7259573637974587e-06,
"loss": 0.2556,
"step": 2560
},
{
"epoch": 0.21070755103714028,
"grad_norm": 18.270166397094727,
"learning_rate": 3.701915715893523e-06,
"loss": 0.2306,
"step": 2570
},
{
"epoch": 0.2115274247765844,
"grad_norm": 14.25434398651123,
"learning_rate": 3.677876044546174e-06,
"loss": 0.2597,
"step": 2580
},
{
"epoch": 0.21234729851602854,
"grad_norm": 9.318758964538574,
"learning_rate": 3.6538393379321427e-06,
"loss": 0.2659,
"step": 2590
},
{
"epoch": 0.21316717225547266,
"grad_norm": 18.77834701538086,
"learning_rate": 3.6298065841062934e-06,
"loss": 0.2299,
"step": 2600
},
{
"epoch": 0.21398704599491677,
"grad_norm": 17.720027923583984,
"learning_rate": 3.6057787709610064e-06,
"loss": 0.266,
"step": 2610
},
{
"epoch": 0.21480691973436092,
"grad_norm": 7.643661022186279,
"learning_rate": 3.5817568861855708e-06,
"loss": 0.2362,
"step": 2620
},
{
"epoch": 0.21562679347380503,
"grad_norm": 10.200757026672363,
"learning_rate": 3.557741917225579e-06,
"loss": 0.2405,
"step": 2630
},
{
"epoch": 0.21644666721324915,
"grad_norm": 46.2437744140625,
"learning_rate": 3.5337348512423468e-06,
"loss": 0.252,
"step": 2640
},
{
"epoch": 0.2172665409526933,
"grad_norm": 13.160014152526855,
"learning_rate": 3.5097366750723275e-06,
"loss": 0.247,
"step": 2650
},
{
"epoch": 0.2180864146921374,
"grad_norm": 12.211856842041016,
"learning_rate": 3.4857483751865478e-06,
"loss": 0.2515,
"step": 2660
},
{
"epoch": 0.21890628843158152,
"grad_norm": 14.44340705871582,
"learning_rate": 3.461770937650064e-06,
"loss": 0.2228,
"step": 2670
},
{
"epoch": 0.21972616217102567,
"grad_norm": 43.0201530456543,
"learning_rate": 3.437805348081416e-06,
"loss": 0.2721,
"step": 2680
},
{
"epoch": 0.22054603591046978,
"grad_norm": 9.385405540466309,
"learning_rate": 3.413852591612125e-06,
"loss": 0.2883,
"step": 2690
},
{
"epoch": 0.2213659096499139,
"grad_norm": 14.081421852111816,
"learning_rate": 3.389913652846194e-06,
"loss": 0.2411,
"step": 2700
},
{
"epoch": 0.2213659096499139,
"eval_loss": 0.23700179159641266,
"eval_runtime": 56.0414,
"eval_samples_per_second": 8.922,
"eval_steps_per_second": 8.922,
"step": 2700
},
{
"epoch": 0.22218578338935804,
"grad_norm": 7.245662689208984,
"learning_rate": 3.365989515819633e-06,
"loss": 0.2538,
"step": 2710
},
{
"epoch": 0.22300565712880216,
"grad_norm": 15.124368667602539,
"learning_rate": 3.34208116396001e-06,
"loss": 0.2469,
"step": 2720
},
{
"epoch": 0.2238255308682463,
"grad_norm": 15.782695770263672,
"learning_rate": 3.318189580046028e-06,
"loss": 0.2412,
"step": 2730
},
{
"epoch": 0.22464540460769042,
"grad_norm": 21.473407745361328,
"learning_rate": 3.294315746167124e-06,
"loss": 0.2745,
"step": 2740
},
{
"epoch": 0.22546527834713453,
"grad_norm": 14.113616943359375,
"learning_rate": 3.2704606436831023e-06,
"loss": 0.2329,
"step": 2750
},
{
"epoch": 0.22628515208657868,
"grad_norm": 16.563539505004883,
"learning_rate": 3.2466252531837934e-06,
"loss": 0.2275,
"step": 2760
},
{
"epoch": 0.2271050258260228,
"grad_norm": 15.176487922668457,
"learning_rate": 3.2228105544487405e-06,
"loss": 0.236,
"step": 2770
},
{
"epoch": 0.2279248995654669,
"grad_norm": 21.701990127563477,
"learning_rate": 3.1990175264069333e-06,
"loss": 0.2619,
"step": 2780
},
{
"epoch": 0.22874477330491105,
"grad_norm": 24.164974212646484,
"learning_rate": 3.1752471470965653e-06,
"loss": 0.2545,
"step": 2790
},
{
"epoch": 0.22956464704435517,
"grad_norm": 18.652359008789062,
"learning_rate": 3.151500393624829e-06,
"loss": 0.2538,
"step": 2800
},
{
"epoch": 0.23038452078379928,
"grad_norm": 17.519634246826172,
"learning_rate": 3.127778242127747e-06,
"loss": 0.2457,
"step": 2810
},
{
"epoch": 0.23120439452324343,
"grad_norm": 32.73554992675781,
"learning_rate": 3.104081667730055e-06,
"loss": 0.2597,
"step": 2820
},
{
"epoch": 0.23202426826268754,
"grad_norm": 14.897638320922852,
"learning_rate": 3.0804116445051133e-06,
"loss": 0.2565,
"step": 2830
},
{
"epoch": 0.23284414200213166,
"grad_norm": 12.081779479980469,
"learning_rate": 3.0567691454348674e-06,
"loss": 0.2222,
"step": 2840
},
{
"epoch": 0.2336640157415758,
"grad_norm": 12.295435905456543,
"learning_rate": 3.033155142369855e-06,
"loss": 0.2344,
"step": 2850
},
{
"epoch": 0.2336640157415758,
"eval_loss": 0.23474246263504028,
"eval_runtime": 55.6184,
"eval_samples_per_second": 8.99,
"eval_steps_per_second": 8.99,
"step": 2850
},
{
"epoch": 0.23448388948101992,
"grad_norm": 14.579584121704102,
"learning_rate": 3.009570605989249e-06,
"loss": 0.2352,
"step": 2860
},
{
"epoch": 0.23530376322046404,
"grad_norm": 22.36095428466797,
"learning_rate": 2.986016505760967e-06,
"loss": 0.2394,
"step": 2870
},
{
"epoch": 0.23612363695990818,
"grad_norm": 10.306982040405273,
"learning_rate": 2.962493809901815e-06,
"loss": 0.2333,
"step": 2880
},
{
"epoch": 0.2369435106993523,
"grad_norm": 36.44614791870117,
"learning_rate": 2.9390034853376875e-06,
"loss": 0.2539,
"step": 2890
},
{
"epoch": 0.23776338443879644,
"grad_norm": 10.238338470458984,
"learning_rate": 2.9155464976638217e-06,
"loss": 0.2639,
"step": 2900
},
{
"epoch": 0.23858325817824055,
"grad_norm": 22.99175262451172,
"learning_rate": 2.8921238111051057e-06,
"loss": 0.2769,
"step": 2910
},
{
"epoch": 0.23940313191768467,
"grad_norm": 15.648612976074219,
"learning_rate": 2.8687363884764434e-06,
"loss": 0.2348,
"step": 2920
},
{
"epoch": 0.2402230056571288,
"grad_norm": 9.030691146850586,
"learning_rate": 2.8453851911431783e-06,
"loss": 0.2223,
"step": 2930
},
{
"epoch": 0.24104287939657293,
"grad_norm": 13.751124382019043,
"learning_rate": 2.822071178981572e-06,
"loss": 0.2474,
"step": 2940
},
{
"epoch": 0.24186275313601704,
"grad_norm": 16.013547897338867,
"learning_rate": 2.7987953103393484e-06,
"loss": 0.2541,
"step": 2950
},
{
"epoch": 0.2426826268754612,
"grad_norm": 11.65927791595459,
"learning_rate": 2.7755585419963026e-06,
"loss": 0.2535,
"step": 2960
},
{
"epoch": 0.2435025006149053,
"grad_norm": 20.403488159179688,
"learning_rate": 2.7523618291249687e-06,
"loss": 0.2439,
"step": 2970
},
{
"epoch": 0.24432237435434942,
"grad_norm": 15.705227851867676,
"learning_rate": 2.729206125251359e-06,
"loss": 0.2073,
"step": 2980
},
{
"epoch": 0.24514224809379356,
"grad_norm": 16.818626403808594,
"learning_rate": 2.7060923822157638e-06,
"loss": 0.2592,
"step": 2990
},
{
"epoch": 0.24596212183323768,
"grad_norm": 29.800796508789062,
"learning_rate": 2.6830215501336288e-06,
"loss": 0.2328,
"step": 3000
},
{
"epoch": 0.24596212183323768,
"eval_loss": 0.24091680347919464,
"eval_runtime": 55.7565,
"eval_samples_per_second": 8.968,
"eval_steps_per_second": 8.968,
"step": 3000
},
{
"epoch": 0.2467819955726818,
"grad_norm": 18.235761642456055,
"learning_rate": 2.6599945773564997e-06,
"loss": 0.2505,
"step": 3010
},
{
"epoch": 0.24760186931212594,
"grad_norm": 13.632527351379395,
"learning_rate": 2.6370124104330357e-06,
"loss": 0.2626,
"step": 3020
},
{
"epoch": 0.24842174305157005,
"grad_norm": 29.359901428222656,
"learning_rate": 2.614075994070105e-06,
"loss": 0.2372,
"step": 3030
},
{
"epoch": 0.24924161679101417,
"grad_norm": 23.87677574157715,
"learning_rate": 2.591186271093948e-06,
"loss": 0.2103,
"step": 3040
},
{
"epoch": 0.2500614905304583,
"grad_norm": 13.893345832824707,
"learning_rate": 2.568344182411423e-06,
"loss": 0.2299,
"step": 3050
},
{
"epoch": 0.25088136426990243,
"grad_norm": 30.01930809020996,
"learning_rate": 2.5455506669713293e-06,
"loss": 0.237,
"step": 3060
},
{
"epoch": 0.2517012380093466,
"grad_norm": 21.540925979614258,
"learning_rate": 2.522806661725812e-06,
"loss": 0.245,
"step": 3070
},
{
"epoch": 0.25252111174879066,
"grad_norm": 11.055063247680664,
"learning_rate": 2.5001131015918444e-06,
"loss": 0.2386,
"step": 3080
},
{
"epoch": 0.2533409854882348,
"grad_norm": 25.467863082885742,
"learning_rate": 2.4774709194127973e-06,
"loss": 0.2028,
"step": 3090
},
{
"epoch": 0.25416085922767895,
"grad_norm": 16.482820510864258,
"learning_rate": 2.4548810459200973e-06,
"loss": 0.2559,
"step": 3100
},
{
"epoch": 0.25498073296712304,
"grad_norm": 15.558172225952148,
"learning_rate": 2.4323444096949647e-06,
"loss": 0.2443,
"step": 3110
},
{
"epoch": 0.2558006067065672,
"grad_norm": 12.034625053405762,
"learning_rate": 2.409861937130248e-06,
"loss": 0.2607,
"step": 3120
},
{
"epoch": 0.2566204804460113,
"grad_norm": 11.549402236938477,
"learning_rate": 2.3874345523923327e-06,
"loss": 0.2182,
"step": 3130
},
{
"epoch": 0.25744035418545547,
"grad_norm": 37.64973068237305,
"learning_rate": 2.3650631773831644e-06,
"loss": 0.2756,
"step": 3140
},
{
"epoch": 0.25826022792489955,
"grad_norm": 10.317972183227539,
"learning_rate": 2.3427487317023477e-06,
"loss": 0.2325,
"step": 3150
},
{
"epoch": 0.25826022792489955,
"eval_loss": 0.2304079383611679,
"eval_runtime": 55.9839,
"eval_samples_per_second": 8.931,
"eval_steps_per_second": 8.931,
"step": 3150
},
{
"epoch": 0.2590801016643437,
"grad_norm": 13.487903594970703,
"learning_rate": 2.320492132609344e-06,
"loss": 0.2491,
"step": 3160
},
{
"epoch": 0.25989997540378784,
"grad_norm": 18.3017520904541,
"learning_rate": 2.2982942949857705e-06,
"loss": 0.2203,
"step": 3170
},
{
"epoch": 0.26071984914323193,
"grad_norm": 35.3414421081543,
"learning_rate": 2.276156131297787e-06,
"loss": 0.2076,
"step": 3180
},
{
"epoch": 0.2615397228826761,
"grad_norm": 7.3131327629089355,
"learning_rate": 2.254078551558594e-06,
"loss": 0.2476,
"step": 3190
},
{
"epoch": 0.2623595966221202,
"grad_norm": 21.195293426513672,
"learning_rate": 2.2320624632910232e-06,
"loss": 0.2347,
"step": 3200
},
{
"epoch": 0.2631794703615643,
"grad_norm": 19.634109497070312,
"learning_rate": 2.210108771490233e-06,
"loss": 0.2395,
"step": 3210
},
{
"epoch": 0.26399934410100845,
"grad_norm": 16.585100173950195,
"learning_rate": 2.1882183785865047e-06,
"loss": 0.2258,
"step": 3220
},
{
"epoch": 0.2648192178404526,
"grad_norm": 16.569671630859375,
"learning_rate": 2.166392184408152e-06,
"loss": 0.2379,
"step": 3230
},
{
"epoch": 0.2656390915798967,
"grad_norm": 14.845422744750977,
"learning_rate": 2.1446310861445306e-06,
"loss": 0.2183,
"step": 3240
},
{
"epoch": 0.2664589653193408,
"grad_norm": 16.37993621826172,
"learning_rate": 2.1229359783091576e-06,
"loss": 0.2249,
"step": 3250
},
{
"epoch": 0.26727883905878497,
"grad_norm": 24.308523178100586,
"learning_rate": 2.1013077527029428e-06,
"loss": 0.2314,
"step": 3260
},
{
"epoch": 0.26809871279822906,
"grad_norm": 20.230369567871094,
"learning_rate": 2.079747298377528e-06,
"loss": 0.2072,
"step": 3270
},
{
"epoch": 0.2689185865376732,
"grad_norm": 18.310514450073242,
"learning_rate": 2.058255501598745e-06,
"loss": 0.2528,
"step": 3280
},
{
"epoch": 0.26973846027711734,
"grad_norm": 15.269632339477539,
"learning_rate": 2.0368332458101814e-06,
"loss": 0.2206,
"step": 3290
},
{
"epoch": 0.27055833401656143,
"grad_norm": 24.385452270507812,
"learning_rate": 2.015481411596869e-06,
"loss": 0.2341,
"step": 3300
},
{
"epoch": 0.27055833401656143,
"eval_loss": 0.23421980440616608,
"eval_runtime": 60.4493,
"eval_samples_per_second": 8.271,
"eval_steps_per_second": 8.271,
"step": 3300
},
{
"epoch": 0.2713782077560056,
"grad_norm": 21.876766204833984,
"learning_rate": 1.9942008766490793e-06,
"loss": 0.235,
"step": 3310
},
{
"epoch": 0.2721980814954497,
"grad_norm": 11.376224517822266,
"learning_rate": 1.9729925157262554e-06,
"loss": 0.2509,
"step": 3320
},
{
"epoch": 0.2730179552348938,
"grad_norm": 27.929759979248047,
"learning_rate": 1.9518572006210484e-06,
"loss": 0.242,
"step": 3330
},
{
"epoch": 0.27383782897433795,
"grad_norm": 23.26350975036621,
"learning_rate": 1.9307958001234794e-06,
"loss": 0.2507,
"step": 3340
},
{
"epoch": 0.2746577027137821,
"grad_norm": 24.858692169189453,
"learning_rate": 1.9098091799852347e-06,
"loss": 0.2375,
"step": 3350
},
{
"epoch": 0.2754775764532262,
"grad_norm": 16.973976135253906,
"learning_rate": 1.8888982028840636e-06,
"loss": 0.2341,
"step": 3360
},
{
"epoch": 0.2762974501926703,
"grad_norm": 26.544775009155273,
"learning_rate": 1.8680637283883355e-06,
"loss": 0.2457,
"step": 3370
},
{
"epoch": 0.27711732393211447,
"grad_norm": 16.246021270751953,
"learning_rate": 1.8473066129216927e-06,
"loss": 0.2484,
"step": 3380
},
{
"epoch": 0.27793719767155856,
"grad_norm": 12.570246696472168,
"learning_rate": 1.8266277097278527e-06,
"loss": 0.2579,
"step": 3390
},
{
"epoch": 0.2787570714110027,
"grad_norm": 17.455217361450195,
"learning_rate": 1.8060278688355313e-06,
"loss": 0.2213,
"step": 3400
},
{
"epoch": 0.27957694515044684,
"grad_norm": 13.560107231140137,
"learning_rate": 1.7855079370235043e-06,
"loss": 0.2168,
"step": 3410
},
{
"epoch": 0.28039681888989093,
"grad_norm": 19.205720901489258,
"learning_rate": 1.7650687577857972e-06,
"loss": 0.2166,
"step": 3420
},
{
"epoch": 0.2812166926293351,
"grad_norm": 31.231449127197266,
"learning_rate": 1.7447111712970138e-06,
"loss": 0.2472,
"step": 3430
},
{
"epoch": 0.2820365663687792,
"grad_norm": 18.0344181060791,
"learning_rate": 1.7244360143778004e-06,
"loss": 0.2376,
"step": 3440
},
{
"epoch": 0.2828564401082233,
"grad_norm": 16.178203582763672,
"learning_rate": 1.704244120460443e-06,
"loss": 0.2209,
"step": 3450
},
{
"epoch": 0.2828564401082233,
"eval_loss": 0.22183214128017426,
"eval_runtime": 56.128,
"eval_samples_per_second": 8.908,
"eval_steps_per_second": 8.908,
"step": 3450
},
{
"epoch": 0.28367631384766745,
"grad_norm": 18.059825897216797,
"learning_rate": 1.6841363195546162e-06,
"loss": 0.2267,
"step": 3460
},
{
"epoch": 0.2844961875871116,
"grad_norm": 22.400646209716797,
"learning_rate": 1.6641134382132576e-06,
"loss": 0.2297,
"step": 3470
},
{
"epoch": 0.28531606132655574,
"grad_norm": 18.88297462463379,
"learning_rate": 1.6441762994985947e-06,
"loss": 0.2087,
"step": 3480
},
{
"epoch": 0.2861359350659998,
"grad_norm": 9.259561538696289,
"learning_rate": 1.6243257229483141e-06,
"loss": 0.2341,
"step": 3490
},
{
"epoch": 0.28695580880544397,
"grad_norm": 9.176309585571289,
"learning_rate": 1.6045625245418648e-06,
"loss": 0.2314,
"step": 3500
},
{
"epoch": 0.2877756825448881,
"grad_norm": 16.64775276184082,
"learning_rate": 1.584887516666928e-06,
"loss": 0.221,
"step": 3510
},
{
"epoch": 0.2885955562843322,
"grad_norm": 16.043312072753906,
"learning_rate": 1.565301508086015e-06,
"loss": 0.2307,
"step": 3520
},
{
"epoch": 0.28941543002377634,
"grad_norm": 28.55023765563965,
"learning_rate": 1.5458053039032263e-06,
"loss": 0.2013,
"step": 3530
},
{
"epoch": 0.2902353037632205,
"grad_norm": 22.9605712890625,
"learning_rate": 1.5263997055311536e-06,
"loss": 0.2258,
"step": 3540
},
{
"epoch": 0.2910551775026646,
"grad_norm": 11.065112113952637,
"learning_rate": 1.5070855106579404e-06,
"loss": 0.2375,
"step": 3550
},
{
"epoch": 0.2918750512421087,
"grad_norm": 13.265893936157227,
"learning_rate": 1.4878635132144885e-06,
"loss": 0.2409,
"step": 3560
},
{
"epoch": 0.29269492498155286,
"grad_norm": 22.174110412597656,
"learning_rate": 1.4687345033418258e-06,
"loss": 0.2424,
"step": 3570
},
{
"epoch": 0.29351479872099695,
"grad_norm": 12.81115436553955,
"learning_rate": 1.4496992673586262e-06,
"loss": 0.2236,
"step": 3580
},
{
"epoch": 0.2943346724604411,
"grad_norm": 12.606128692626953,
"learning_rate": 1.4307585877288822e-06,
"loss": 0.2262,
"step": 3590
},
{
"epoch": 0.29515454619988524,
"grad_norm": 29.290117263793945,
"learning_rate": 1.4119132430297496e-06,
"loss": 0.2305,
"step": 3600
},
{
"epoch": 0.29515454619988524,
"eval_loss": 0.22281211614608765,
"eval_runtime": 55.6771,
"eval_samples_per_second": 8.98,
"eval_steps_per_second": 8.98,
"step": 3600
},
{
"epoch": 0.2959744199393293,
"grad_norm": 19.89222526550293,
"learning_rate": 1.3931640079195365e-06,
"loss": 0.2354,
"step": 3610
},
{
"epoch": 0.29679429367877347,
"grad_norm": 10.584065437316895,
"learning_rate": 1.3745116531058645e-06,
"loss": 0.2272,
"step": 3620
},
{
"epoch": 0.2976141674182176,
"grad_norm": 18.46734619140625,
"learning_rate": 1.3559569453139797e-06,
"loss": 0.2192,
"step": 3630
},
{
"epoch": 0.2984340411576617,
"grad_norm": 17.607667922973633,
"learning_rate": 1.3375006472552483e-06,
"loss": 0.2466,
"step": 3640
},
{
"epoch": 0.29925391489710584,
"grad_norm": 19.822507858276367,
"learning_rate": 1.3191435175957945e-06,
"loss": 0.2271,
"step": 3650
},
{
"epoch": 0.30007378863655,
"grad_norm": 7.999312400817871,
"learning_rate": 1.3008863109253174e-06,
"loss": 0.2244,
"step": 3660
},
{
"epoch": 0.3008936623759941,
"grad_norm": 15.04226016998291,
"learning_rate": 1.282729777726078e-06,
"loss": 0.2303,
"step": 3670
},
{
"epoch": 0.3017135361154382,
"grad_norm": 12.127747535705566,
"learning_rate": 1.2646746643420392e-06,
"loss": 0.2289,
"step": 3680
},
{
"epoch": 0.30253340985488236,
"grad_norm": 10.014680862426758,
"learning_rate": 1.2467217129481952e-06,
"loss": 0.2176,
"step": 3690
},
{
"epoch": 0.30335328359432645,
"grad_norm": 15.543107986450195,
"learning_rate": 1.2288716615200617e-06,
"loss": 0.2338,
"step": 3700
},
{
"epoch": 0.3041731573337706,
"grad_norm": 12.86021614074707,
"learning_rate": 1.2111252438033404e-06,
"loss": 0.2192,
"step": 3710
},
{
"epoch": 0.30499303107321474,
"grad_norm": 32.52058792114258,
"learning_rate": 1.1934831892837524e-06,
"loss": 0.2205,
"step": 3720
},
{
"epoch": 0.3058129048126588,
"grad_norm": 6.391150951385498,
"learning_rate": 1.1759462231570618e-06,
"loss": 0.2043,
"step": 3730
},
{
"epoch": 0.30663277855210297,
"grad_norm": 18.806997299194336,
"learning_rate": 1.1585150662992578e-06,
"loss": 0.2203,
"step": 3740
},
{
"epoch": 0.3074526522915471,
"grad_norm": 16.80451774597168,
"learning_rate": 1.1411904352369262e-06,
"loss": 0.228,
"step": 3750
},
{
"epoch": 0.3074526522915471,
"eval_loss": 0.2207518219947815,
"eval_runtime": 56.5561,
"eval_samples_per_second": 8.841,
"eval_steps_per_second": 8.841,
"step": 3750
},
{
"epoch": 0.3082725260309912,
"grad_norm": 14.464019775390625,
"learning_rate": 1.1239730421177952e-06,
"loss": 0.2285,
"step": 3760
},
{
"epoch": 0.30909239977043534,
"grad_norm": 18.73137664794922,
"learning_rate": 1.1068635946814569e-06,
"loss": 0.2234,
"step": 3770
},
{
"epoch": 0.3099122735098795,
"grad_norm": 10.308956146240234,
"learning_rate": 1.0898627962302831e-06,
"loss": 0.2208,
"step": 3780
},
{
"epoch": 0.31073214724932363,
"grad_norm": 39.88100051879883,
"learning_rate": 1.072971345600513e-06,
"loss": 0.2376,
"step": 3790
},
{
"epoch": 0.3115520209887677,
"grad_norm": 12.245576858520508,
"learning_rate": 1.056189937133522e-06,
"loss": 0.2283,
"step": 3800
},
{
"epoch": 0.31237189472821186,
"grad_norm": 14.314285278320312,
"learning_rate": 1.0395192606472822e-06,
"loss": 0.2073,
"step": 3810
},
{
"epoch": 0.313191768467656,
"grad_norm": 15.187841415405273,
"learning_rate": 1.0229600014080101e-06,
"loss": 0.2495,
"step": 3820
},
{
"epoch": 0.3140116422071001,
"grad_norm": 13.99637508392334,
"learning_rate": 1.006512840101995e-06,
"loss": 0.2154,
"step": 3830
},
{
"epoch": 0.31483151594654424,
"grad_norm": 7.902044773101807,
"learning_rate": 9.90178452807619e-07,
"loss": 0.2435,
"step": 3840
},
{
"epoch": 0.3156513896859884,
"grad_norm": 12.850071907043457,
"learning_rate": 9.739575109675674e-07,
"loss": 0.2247,
"step": 3850
},
{
"epoch": 0.31647126342543247,
"grad_norm": 14.898462295532227,
"learning_rate": 9.578506813612243e-07,
"loss": 0.221,
"step": 3860
},
{
"epoch": 0.3172911371648766,
"grad_norm": 24.208559036254883,
"learning_rate": 9.418586260772695e-07,
"loss": 0.2303,
"step": 3870
},
{
"epoch": 0.31811101090432076,
"grad_norm": 17.132963180541992,
"learning_rate": 9.259820024864594e-07,
"loss": 0.2283,
"step": 3880
},
{
"epoch": 0.31893088464376484,
"grad_norm": 19.788406372070312,
"learning_rate": 9.102214632146059e-07,
"loss": 0.2465,
"step": 3890
},
{
"epoch": 0.319750758383209,
"grad_norm": 26.01558494567871,
"learning_rate": 8.94577656115746e-07,
"loss": 0.2321,
"step": 3900
},
{
"epoch": 0.319750758383209,
"eval_loss": 0.22018083930015564,
"eval_runtime": 56.099,
"eval_samples_per_second": 8.913,
"eval_steps_per_second": 8.913,
"step": 3900
},
{
"epoch": 0.32057063212265313,
"grad_norm": 13.368496894836426,
"learning_rate": 8.790512242455198e-07,
"loss": 0.2401,
"step": 3910
},
{
"epoch": 0.3213905058620972,
"grad_norm": 17.882627487182617,
"learning_rate": 8.636428058347274e-07,
"loss": 0.2045,
"step": 3920
},
{
"epoch": 0.32221037960154136,
"grad_norm": 21.98712158203125,
"learning_rate": 8.483530342630993e-07,
"loss": 0.243,
"step": 3930
},
{
"epoch": 0.3230302533409855,
"grad_norm": 33.167381286621094,
"learning_rate": 8.331825380332599e-07,
"loss": 0.2258,
"step": 3940
},
{
"epoch": 0.3238501270804296,
"grad_norm": 16.276443481445312,
"learning_rate": 8.181319407448884e-07,
"loss": 0.2489,
"step": 3950
},
{
"epoch": 0.32467000081987374,
"grad_norm": 12.20262336730957,
"learning_rate": 8.032018610690914e-07,
"loss": 0.2074,
"step": 3960
},
{
"epoch": 0.3254898745593179,
"grad_norm": 23.053037643432617,
"learning_rate": 7.883929127229665e-07,
"loss": 0.2238,
"step": 3970
},
{
"epoch": 0.32630974829876197,
"grad_norm": 9.354714393615723,
"learning_rate": 7.737057044443793e-07,
"loss": 0.2268,
"step": 3980
},
{
"epoch": 0.3271296220382061,
"grad_norm": 13.12759780883789,
"learning_rate": 7.591408399669337e-07,
"loss": 0.2259,
"step": 3990
},
{
"epoch": 0.32794949577765026,
"grad_norm": 12.080741882324219,
"learning_rate": 7.446989179951632e-07,
"loss": 0.214,
"step": 4000
},
{
"epoch": 0.32876936951709435,
"grad_norm": 13.813101768493652,
"learning_rate": 7.303805321799146e-07,
"loss": 0.218,
"step": 4010
},
{
"epoch": 0.3295892432565385,
"grad_norm": 12.327116012573242,
"learning_rate": 7.161862710939476e-07,
"loss": 0.2295,
"step": 4020
},
{
"epoch": 0.33040911699598263,
"grad_norm": 15.953246116638184,
"learning_rate": 7.021167182077403e-07,
"loss": 0.2197,
"step": 4030
},
{
"epoch": 0.3312289907354267,
"grad_norm": 19.298919677734375,
"learning_rate": 6.881724518655049e-07,
"loss": 0.2326,
"step": 4040
},
{
"epoch": 0.33204886447487086,
"grad_norm": 38.68765640258789,
"learning_rate": 6.743540452614152e-07,
"loss": 0.2303,
"step": 4050
},
{
"epoch": 0.33204886447487086,
"eval_loss": 0.21772576868534088,
"eval_runtime": 56.5668,
"eval_samples_per_second": 8.839,
"eval_steps_per_second": 8.839,
"step": 4050
},
{
"epoch": 0.332868738214315,
"grad_norm": 11.087291717529297,
"learning_rate": 6.606620664160438e-07,
"loss": 0.2071,
"step": 4060
},
{
"epoch": 0.3336886119537591,
"grad_norm": 50.521053314208984,
"learning_rate": 6.470970781530139e-07,
"loss": 0.2204,
"step": 4070
},
{
"epoch": 0.33450848569320324,
"grad_norm": 32.14698028564453,
"learning_rate": 6.336596380758604e-07,
"loss": 0.2466,
"step": 4080
},
{
"epoch": 0.3353283594326474,
"grad_norm": 19.88819694519043,
"learning_rate": 6.203502985451152e-07,
"loss": 0.2291,
"step": 4090
},
{
"epoch": 0.33614823317209147,
"grad_norm": 11.445552825927734,
"learning_rate": 6.071696066555978e-07,
"loss": 0.2549,
"step": 4100
},
{
"epoch": 0.3369681069115356,
"grad_norm": 17.117246627807617,
"learning_rate": 5.941181042139258e-07,
"loss": 0.2077,
"step": 4110
},
{
"epoch": 0.33778798065097976,
"grad_norm": 10.231658935546875,
"learning_rate": 5.811963277162466e-07,
"loss": 0.2182,
"step": 4120
},
{
"epoch": 0.3386078543904239,
"grad_norm": 14.68455696105957,
"learning_rate": 5.684048083261789e-07,
"loss": 0.2445,
"step": 4130
},
{
"epoch": 0.339427728129868,
"grad_norm": 22.658329010009766,
"learning_rate": 5.557440718529848e-07,
"loss": 0.1938,
"step": 4140
},
{
"epoch": 0.34024760186931213,
"grad_norm": 12.441681861877441,
"learning_rate": 5.432146387299522e-07,
"loss": 0.224,
"step": 4150
},
{
"epoch": 0.3410674756087563,
"grad_norm": 16.301542282104492,
"learning_rate": 5.308170239930022e-07,
"loss": 0.2092,
"step": 4160
},
{
"epoch": 0.34188734934820036,
"grad_norm": 17.414865493774414,
"learning_rate": 5.185517372595187e-07,
"loss": 0.2429,
"step": 4170
},
{
"epoch": 0.3427072230876445,
"grad_norm": 37.58354949951172,
"learning_rate": 5.064192827073995e-07,
"loss": 0.2236,
"step": 4180
},
{
"epoch": 0.34352709682708865,
"grad_norm": 19.772306442260742,
"learning_rate": 4.944201590543308e-07,
"loss": 0.2209,
"step": 4190
},
{
"epoch": 0.34434697056653274,
"grad_norm": 10.470952987670898,
"learning_rate": 4.825548595372898e-07,
"loss": 0.2441,
"step": 4200
},
{
"epoch": 0.34434697056653274,
"eval_loss": 0.2149660438299179,
"eval_runtime": 55.9997,
"eval_samples_per_second": 8.929,
"eval_steps_per_second": 8.929,
"step": 4200
},
{
"epoch": 0.3451668443059769,
"grad_norm": 12.9829683303833,
"learning_rate": 4.7082387189226646e-07,
"loss": 0.2012,
"step": 4210
},
{
"epoch": 0.345986718045421,
"grad_norm": 11.852750778198242,
"learning_rate": 4.5922767833421454e-07,
"loss": 0.2172,
"step": 4220
},
{
"epoch": 0.3468065917848651,
"grad_norm": 33.68533706665039,
"learning_rate": 4.477667555372326e-07,
"loss": 0.2114,
"step": 4230
},
{
"epoch": 0.34762646552430926,
"grad_norm": 24.621292114257812,
"learning_rate": 4.364415746149678e-07,
"loss": 0.2264,
"step": 4240
},
{
"epoch": 0.3484463392637534,
"grad_norm": 23.111419677734375,
"learning_rate": 4.2525260110124964e-07,
"loss": 0.2146,
"step": 4250
},
{
"epoch": 0.3492662130031975,
"grad_norm": 22.753629684448242,
"learning_rate": 4.1420029493095623e-07,
"loss": 0.2181,
"step": 4260
},
{
"epoch": 0.35008608674264163,
"grad_norm": 12.422630310058594,
"learning_rate": 4.032851104211036e-07,
"loss": 0.2059,
"step": 4270
},
{
"epoch": 0.3509059604820858,
"grad_norm": 21.33889389038086,
"learning_rate": 3.925074962521762e-07,
"loss": 0.2041,
"step": 4280
},
{
"epoch": 0.35172583422152986,
"grad_norm": 21.088577270507812,
"learning_rate": 3.818678954496787e-07,
"loss": 0.2162,
"step": 4290
},
{
"epoch": 0.352545707960974,
"grad_norm": 14.029748916625977,
"learning_rate": 3.713667453659287e-07,
"loss": 0.2291,
"step": 4300
},
{
"epoch": 0.35336558170041815,
"grad_norm": 11.585044860839844,
"learning_rate": 3.6100447766207473e-07,
"loss": 0.2139,
"step": 4310
},
{
"epoch": 0.35418545543986224,
"grad_norm": 13.666373252868652,
"learning_rate": 3.5078151829035693e-07,
"loss": 0.2311,
"step": 4320
},
{
"epoch": 0.3550053291793064,
"grad_norm": 24.15358543395996,
"learning_rate": 3.4069828747659405e-07,
"loss": 0.2149,
"step": 4330
},
{
"epoch": 0.3558252029187505,
"grad_norm": 25.829856872558594,
"learning_rate": 3.3075519970291144e-07,
"loss": 0.2055,
"step": 4340
},
{
"epoch": 0.3566450766581946,
"grad_norm": 23.233440399169922,
"learning_rate": 3.209526636907036e-07,
"loss": 0.2444,
"step": 4350
},
{
"epoch": 0.3566450766581946,
"eval_loss": 0.2148878425359726,
"eval_runtime": 56.223,
"eval_samples_per_second": 8.893,
"eval_steps_per_second": 8.893,
"step": 4350
},
{
"epoch": 0.35746495039763876,
"grad_norm": 19.731224060058594,
"learning_rate": 3.1129108238383095e-07,
"loss": 0.2199,
"step": 4360
},
{
"epoch": 0.3582848241370829,
"grad_norm": 23.215808868408203,
"learning_rate": 3.017708529320604e-07,
"loss": 0.2228,
"step": 4370
},
{
"epoch": 0.359104697876527,
"grad_norm": 17.997251510620117,
"learning_rate": 2.923923666747357e-07,
"loss": 0.2336,
"step": 4380
},
{
"epoch": 0.35992457161597113,
"grad_norm": 14.64735221862793,
"learning_rate": 2.8315600912469477e-07,
"loss": 0.2831,
"step": 4390
},
{
"epoch": 0.3607444453554153,
"grad_norm": 18.220691680908203,
"learning_rate": 2.740621599524189e-07,
"loss": 0.2277,
"step": 4400
},
{
"epoch": 0.36156431909485937,
"grad_norm": 16.92856216430664,
"learning_rate": 2.651111929704303e-07,
"loss": 0.2139,
"step": 4410
},
{
"epoch": 0.3623841928343035,
"grad_norm": 30.373014450073242,
"learning_rate": 2.563034761179223e-07,
"loss": 0.2354,
"step": 4420
},
{
"epoch": 0.36320406657374765,
"grad_norm": 16.33125114440918,
"learning_rate": 2.476393714456384e-07,
"loss": 0.2209,
"step": 4430
},
{
"epoch": 0.3640239403131918,
"grad_norm": 13.93752670288086,
"learning_rate": 2.391192351009855e-07,
"loss": 0.2285,
"step": 4440
},
{
"epoch": 0.3648438140526359,
"grad_norm": 24.299808502197266,
"learning_rate": 2.3074341731339837e-07,
"loss": 0.2487,
"step": 4450
},
{
"epoch": 0.36566368779208,
"grad_norm": 15.581805229187012,
"learning_rate": 2.225122623799407e-07,
"loss": 0.2112,
"step": 4460
},
{
"epoch": 0.36648356153152417,
"grad_norm": 21.24774932861328,
"learning_rate": 2.1442610865115135e-07,
"loss": 0.2253,
"step": 4470
},
{
"epoch": 0.36730343527096826,
"grad_norm": 20.960872650146484,
"learning_rate": 2.0648528851714077e-07,
"loss": 0.2208,
"step": 4480
},
{
"epoch": 0.3681233090104124,
"grad_norm": 22.186767578125,
"learning_rate": 1.9869012839392064e-07,
"loss": 0.218,
"step": 4490
},
{
"epoch": 0.36894318274985655,
"grad_norm": 15.852953910827637,
"learning_rate": 1.9104094870999264e-07,
"loss": 0.2123,
"step": 4500
},
{
"epoch": 0.36894318274985655,
"eval_loss": 0.21366393566131592,
"eval_runtime": 55.673,
"eval_samples_per_second": 8.981,
"eval_steps_per_second": 8.981,
"step": 4500
},
{
"epoch": 0.36976305648930063,
"grad_norm": 11.23139476776123,
"learning_rate": 1.8353806389317428e-07,
"loss": 0.2201,
"step": 4510
},
{
"epoch": 0.3705829302287448,
"grad_norm": 15.876472473144531,
"learning_rate": 1.761817823576731e-07,
"loss": 0.2382,
"step": 4520
},
{
"epoch": 0.3714028039681889,
"grad_norm": 18.092660903930664,
"learning_rate": 1.6897240649141125e-07,
"loss": 0.2359,
"step": 4530
},
{
"epoch": 0.372222677707633,
"grad_norm": 20.05590057373047,
"learning_rate": 1.619102326435923e-07,
"loss": 0.2304,
"step": 4540
},
{
"epoch": 0.37304255144707715,
"grad_norm": 14.876965522766113,
"learning_rate": 1.5499555111252285e-07,
"loss": 0.2305,
"step": 4550
},
{
"epoch": 0.3738624251865213,
"grad_norm": 24.27523422241211,
"learning_rate": 1.4822864613367766e-07,
"loss": 0.229,
"step": 4560
},
{
"epoch": 0.3746822989259654,
"grad_norm": 36.034820556640625,
"learning_rate": 1.4160979586801724e-07,
"loss": 0.2099,
"step": 4570
},
{
"epoch": 0.37550217266540953,
"grad_norm": 14.821313858032227,
"learning_rate": 1.3513927239055036e-07,
"loss": 0.2069,
"step": 4580
},
{
"epoch": 0.37632204640485367,
"grad_norm": 24.151025772094727,
"learning_rate": 1.2881734167915425e-07,
"loss": 0.2477,
"step": 4590
},
{
"epoch": 0.37714192014429776,
"grad_norm": 34.51681900024414,
"learning_rate": 1.2264426360363956e-07,
"loss": 0.2169,
"step": 4600
},
{
"epoch": 0.3779617938837419,
"grad_norm": 18.54802894592285,
"learning_rate": 1.1662029191506775e-07,
"loss": 0.2053,
"step": 4610
},
{
"epoch": 0.37878166762318605,
"grad_norm": 18.75210189819336,
"learning_rate": 1.107456742353201e-07,
"loss": 0.2313,
"step": 4620
},
{
"epoch": 0.37960154136263013,
"grad_norm": 14.032902717590332,
"learning_rate": 1.0502065204692062e-07,
"loss": 0.2253,
"step": 4630
},
{
"epoch": 0.3804214151020743,
"grad_norm": 16.711780548095703,
"learning_rate": 9.94454606831076e-08,
"loss": 0.208,
"step": 4640
},
{
"epoch": 0.3812412888415184,
"grad_norm": 33.53385543823242,
"learning_rate": 9.402032931816144e-08,
"loss": 0.2256,
"step": 4650
},
{
"epoch": 0.3812412888415184,
"eval_loss": 0.2128845751285553,
"eval_runtime": 55.1573,
"eval_samples_per_second": 9.065,
"eval_steps_per_second": 9.065,
"step": 4650
},
{
"epoch": 0.3820611625809625,
"grad_norm": 9.32500171661377,
"learning_rate": 8.874548095798464e-08,
"loss": 0.227,
"step": 4660
},
{
"epoch": 0.38288103632040665,
"grad_norm": 12.115835189819336,
"learning_rate": 8.362113243093245e-08,
"loss": 0.2148,
"step": 4670
},
{
"epoch": 0.3837009100598508,
"grad_norm": 26.36838722229004,
"learning_rate": 7.864749437890173e-08,
"loss": 0.2228,
"step": 4680
},
{
"epoch": 0.3845207837992949,
"grad_norm": 12.476286888122559,
"learning_rate": 7.382477124867282e-08,
"loss": 0.2057,
"step": 4690
},
{
"epoch": 0.38534065753873903,
"grad_norm": 15.308034896850586,
"learning_rate": 6.915316128350461e-08,
"loss": 0.2278,
"step": 4700
},
{
"epoch": 0.3861605312781832,
"grad_norm": 9.208645820617676,
"learning_rate": 6.463285651498563e-08,
"loss": 0.2227,
"step": 4710
},
{
"epoch": 0.38698040501762726,
"grad_norm": 9.877080917358398,
"learning_rate": 6.026404275513875e-08,
"loss": 0.2197,
"step": 4720
},
{
"epoch": 0.3878002787570714,
"grad_norm": 16.259761810302734,
"learning_rate": 5.604689958878723e-08,
"loss": 0.2413,
"step": 4730
},
{
"epoch": 0.38862015249651555,
"grad_norm": 17.41680908203125,
"learning_rate": 5.198160036616898e-08,
"loss": 0.2159,
"step": 4740
},
{
"epoch": 0.38944002623595964,
"grad_norm": 17.588123321533203,
"learning_rate": 4.8068312195811847e-08,
"loss": 0.2191,
"step": 4750
},
{
"epoch": 0.3902598999754038,
"grad_norm": 14.38376235961914,
"learning_rate": 4.4307195937666194e-08,
"loss": 0.2332,
"step": 4760
},
{
"epoch": 0.3910797737148479,
"grad_norm": 12.54135799407959,
"learning_rate": 4.069840619648935e-08,
"loss": 0.2176,
"step": 4770
},
{
"epoch": 0.39189964745429207,
"grad_norm": 20.703615188598633,
"learning_rate": 3.72420913154932e-08,
"loss": 0.2204,
"step": 4780
},
{
"epoch": 0.39271952119373615,
"grad_norm": 28.904329299926758,
"learning_rate": 3.3938393370244876e-08,
"loss": 0.2389,
"step": 4790
},
{
"epoch": 0.3935393949331803,
"grad_norm": 15.144803047180176,
"learning_rate": 3.078744816282731e-08,
"loss": 0.2306,
"step": 4800
},
{
"epoch": 0.3935393949331803,
"eval_loss": 0.2134290486574173,
"eval_runtime": 55.5119,
"eval_samples_per_second": 9.007,
"eval_steps_per_second": 9.007,
"step": 4800
},
{
"epoch": 0.39435926867262444,
"grad_norm": 18.657732009887695,
"learning_rate": 2.778938521625613e-08,
"loss": 0.2454,
"step": 4810
},
{
"epoch": 0.39517914241206853,
"grad_norm": 20.660715103149414,
"learning_rate": 2.4944327769157314e-08,
"loss": 0.2211,
"step": 4820
},
{
"epoch": 0.3959990161515127,
"grad_norm": 13.545777320861816,
"learning_rate": 2.225239277069871e-08,
"loss": 0.1803,
"step": 4830
},
{
"epoch": 0.3968188898909568,
"grad_norm": 20.064281463623047,
"learning_rate": 1.971369087578473e-08,
"loss": 0.2226,
"step": 4840
},
{
"epoch": 0.3976387636304009,
"grad_norm": 11.630465507507324,
"learning_rate": 1.7328326440506637e-08,
"loss": 0.2117,
"step": 4850
},
{
"epoch": 0.39845863736984505,
"grad_norm": 16.434839248657227,
"learning_rate": 1.5096397517853497e-08,
"loss": 0.2381,
"step": 4860
},
{
"epoch": 0.3992785111092892,
"grad_norm": 14.184981346130371,
"learning_rate": 1.3017995853681631e-08,
"loss": 0.2262,
"step": 4870
},
{
"epoch": 0.4000983848487333,
"grad_norm": 17.047590255737305,
"learning_rate": 1.1093206882943076e-08,
"loss": 0.2164,
"step": 4880
},
{
"epoch": 0.4009182585881774,
"grad_norm": 15.3792142868042,
"learning_rate": 9.322109726172952e-09,
"loss": 0.2288,
"step": 4890
},
{
"epoch": 0.40173813232762157,
"grad_norm": 14.833084106445312,
"learning_rate": 7.704777186238744e-09,
"loss": 0.209,
"step": 4900
},
{
"epoch": 0.40255800606706565,
"grad_norm": 22.476787567138672,
"learning_rate": 6.241275745346859e-09,
"loss": 0.2118,
"step": 4910
},
{
"epoch": 0.4033778798065098,
"grad_norm": 14.301311492919922,
"learning_rate": 4.931665562308563e-09,
"loss": 0.2222,
"step": 4920
},
{
"epoch": 0.40419775354595394,
"grad_norm": 13.92874813079834,
"learning_rate": 3.7760004700702905e-09,
"loss": 0.2283,
"step": 4930
},
{
"epoch": 0.40501762728539803,
"grad_norm": 20.181961059570312,
"learning_rate": 2.7743279734962494e-09,
"loss": 0.2132,
"step": 4940
},
{
"epoch": 0.4058375010248422,
"grad_norm": 22.093725204467773,
"learning_rate": 1.926689247420399e-09,
"loss": 0.2127,
"step": 4950
},
{
"epoch": 0.4058375010248422,
"eval_loss": 0.2128431349992752,
"eval_runtime": 55.4771,
"eval_samples_per_second": 9.013,
"eval_steps_per_second": 9.013,
"step": 4950
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.911768952965693e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}