{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5275206529312233,
"eval_steps": 100,
"global_step": 5300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009953219866626853,
"grad_norm": 1.912980556488037,
"learning_rate": 9.995023390066686e-06,
"loss": 1.8703,
"step": 10
},
{
"epoch": 0.0019906439733253707,
"grad_norm": 1.866821050643921,
"learning_rate": 9.990046780133374e-06,
"loss": 1.8723,
"step": 20
},
{
"epoch": 0.002985965959988056,
"grad_norm": 2.058809280395508,
"learning_rate": 9.985070170200061e-06,
"loss": 1.8097,
"step": 30
},
{
"epoch": 0.003981287946650741,
"grad_norm": 1.459013819694519,
"learning_rate": 9.980093560266747e-06,
"loss": 1.7456,
"step": 40
},
{
"epoch": 0.004976609933313427,
"grad_norm": 0.9095586538314819,
"learning_rate": 9.975116950333434e-06,
"loss": 1.7195,
"step": 50
},
{
"epoch": 0.005971931919976112,
"grad_norm": 1.1065226793289185,
"learning_rate": 9.970140340400121e-06,
"loss": 1.6502,
"step": 60
},
{
"epoch": 0.0069672539066387975,
"grad_norm": 0.8301252126693726,
"learning_rate": 9.965163730466807e-06,
"loss": 1.5699,
"step": 70
},
{
"epoch": 0.007962575893301483,
"grad_norm": 1.0762828588485718,
"learning_rate": 9.960187120533493e-06,
"loss": 1.5072,
"step": 80
},
{
"epoch": 0.008957897879964169,
"grad_norm": 1.0814900398254395,
"learning_rate": 9.95521051060018e-06,
"loss": 1.4369,
"step": 90
},
{
"epoch": 0.009953219866626855,
"grad_norm": 1.3561326265335083,
"learning_rate": 9.950233900666867e-06,
"loss": 1.3467,
"step": 100
},
{
"epoch": 0.009953219866626855,
"eval_loss": 1.2846794128417969,
"eval_runtime": 147.6242,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 100
},
{
"epoch": 0.010948541853289539,
"grad_norm": 1.438547968864441,
"learning_rate": 9.945257290733553e-06,
"loss": 1.2222,
"step": 110
},
{
"epoch": 0.011943863839952225,
"grad_norm": 1.402588963508606,
"learning_rate": 9.94028068080024e-06,
"loss": 1.1001,
"step": 120
},
{
"epoch": 0.012939185826614909,
"grad_norm": 1.4357985258102417,
"learning_rate": 9.935304070866926e-06,
"loss": 0.9657,
"step": 130
},
{
"epoch": 0.013934507813277595,
"grad_norm": 2.137953042984009,
"learning_rate": 9.930327460933613e-06,
"loss": 0.8211,
"step": 140
},
{
"epoch": 0.014929829799940281,
"grad_norm": 1.374299168586731,
"learning_rate": 9.925350851000299e-06,
"loss": 0.7142,
"step": 150
},
{
"epoch": 0.015925151786602965,
"grad_norm": 1.1510456800460815,
"learning_rate": 9.920374241066986e-06,
"loss": 0.656,
"step": 160
},
{
"epoch": 0.01692047377326565,
"grad_norm": 1.0226788520812988,
"learning_rate": 9.915397631133673e-06,
"loss": 0.6212,
"step": 170
},
{
"epoch": 0.017915795759928337,
"grad_norm": 0.9365411400794983,
"learning_rate": 9.910421021200359e-06,
"loss": 0.6069,
"step": 180
},
{
"epoch": 0.018911117746591023,
"grad_norm": 0.6880003213882446,
"learning_rate": 9.905444411267046e-06,
"loss": 0.6128,
"step": 190
},
{
"epoch": 0.01990643973325371,
"grad_norm": 1.1190361976623535,
"learning_rate": 9.900467801333732e-06,
"loss": 0.5426,
"step": 200
},
{
"epoch": 0.01990643973325371,
"eval_loss": 0.5788590908050537,
"eval_runtime": 147.511,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.691,
"step": 200
},
{
"epoch": 0.02090176171991639,
"grad_norm": 1.184279441833496,
"learning_rate": 9.895491191400419e-06,
"loss": 0.5887,
"step": 210
},
{
"epoch": 0.021897083706579078,
"grad_norm": 0.7627615928649902,
"learning_rate": 9.890514581467106e-06,
"loss": 0.5433,
"step": 220
},
{
"epoch": 0.022892405693241764,
"grad_norm": 0.7858164310455322,
"learning_rate": 9.885537971533792e-06,
"loss": 0.5843,
"step": 230
},
{
"epoch": 0.02388772767990445,
"grad_norm": 0.695697009563446,
"learning_rate": 9.880561361600478e-06,
"loss": 0.5365,
"step": 240
},
{
"epoch": 0.024883049666567136,
"grad_norm": 0.8994197845458984,
"learning_rate": 9.875584751667165e-06,
"loss": 0.5662,
"step": 250
},
{
"epoch": 0.025878371653229818,
"grad_norm": 0.8016309142112732,
"learning_rate": 9.870608141733852e-06,
"loss": 0.5592,
"step": 260
},
{
"epoch": 0.026873693639892504,
"grad_norm": 0.8534384369850159,
"learning_rate": 9.865631531800538e-06,
"loss": 0.5248,
"step": 270
},
{
"epoch": 0.02786901562655519,
"grad_norm": 0.9857029914855957,
"learning_rate": 9.860654921867225e-06,
"loss": 0.5294,
"step": 280
},
{
"epoch": 0.028864337613217876,
"grad_norm": 0.7766090631484985,
"learning_rate": 9.855678311933912e-06,
"loss": 0.5198,
"step": 290
},
{
"epoch": 0.029859659599880562,
"grad_norm": 0.6832401752471924,
"learning_rate": 9.850701702000598e-06,
"loss": 0.5844,
"step": 300
},
{
"epoch": 0.029859659599880562,
"eval_loss": 0.536589503288269,
"eval_runtime": 147.4968,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.692,
"step": 300
},
{
"epoch": 0.030854981586543248,
"grad_norm": 0.7720848917961121,
"learning_rate": 9.845725092067284e-06,
"loss": 0.5365,
"step": 310
},
{
"epoch": 0.03185030357320593,
"grad_norm": 0.7022100687026978,
"learning_rate": 9.840748482133971e-06,
"loss": 0.4841,
"step": 320
},
{
"epoch": 0.03284562555986862,
"grad_norm": 1.0030310153961182,
"learning_rate": 9.835771872200658e-06,
"loss": 0.4635,
"step": 330
},
{
"epoch": 0.0338409475465313,
"grad_norm": 0.8628882765769958,
"learning_rate": 9.830795262267344e-06,
"loss": 0.4932,
"step": 340
},
{
"epoch": 0.034836269533193985,
"grad_norm": 0.7178316712379456,
"learning_rate": 9.825818652334031e-06,
"loss": 0.6057,
"step": 350
},
{
"epoch": 0.035831591519856675,
"grad_norm": 0.9564626216888428,
"learning_rate": 9.820842042400718e-06,
"loss": 0.5371,
"step": 360
},
{
"epoch": 0.03682691350651936,
"grad_norm": 0.7041760683059692,
"learning_rate": 9.815865432467404e-06,
"loss": 0.513,
"step": 370
},
{
"epoch": 0.037822235493182046,
"grad_norm": 1.0203750133514404,
"learning_rate": 9.81088882253409e-06,
"loss": 0.5118,
"step": 380
},
{
"epoch": 0.03881755747984473,
"grad_norm": 0.8765382170677185,
"learning_rate": 9.805912212600777e-06,
"loss": 0.4529,
"step": 390
},
{
"epoch": 0.03981287946650742,
"grad_norm": 0.9951983690261841,
"learning_rate": 9.800935602667464e-06,
"loss": 0.5336,
"step": 400
},
{
"epoch": 0.03981287946650742,
"eval_loss": 0.5151349306106567,
"eval_runtime": 147.6615,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 400
},
{
"epoch": 0.0408082014531701,
"grad_norm": 0.7691435813903809,
"learning_rate": 9.79595899273415e-06,
"loss": 0.506,
"step": 410
},
{
"epoch": 0.04180352343983278,
"grad_norm": 1.1955533027648926,
"learning_rate": 9.790982382800837e-06,
"loss": 0.4692,
"step": 420
},
{
"epoch": 0.04279884542649547,
"grad_norm": 1.128085732460022,
"learning_rate": 9.786005772867525e-06,
"loss": 0.4608,
"step": 430
},
{
"epoch": 0.043794167413158155,
"grad_norm": 0.5518949627876282,
"learning_rate": 9.78102916293421e-06,
"loss": 0.5006,
"step": 440
},
{
"epoch": 0.044789489399820845,
"grad_norm": 0.7164484858512878,
"learning_rate": 9.776052553000896e-06,
"loss": 0.4996,
"step": 450
},
{
"epoch": 0.04578481138648353,
"grad_norm": 0.5959630012512207,
"learning_rate": 9.771075943067583e-06,
"loss": 0.4843,
"step": 460
},
{
"epoch": 0.04678013337314621,
"grad_norm": 0.743648111820221,
"learning_rate": 9.76609933313427e-06,
"loss": 0.4363,
"step": 470
},
{
"epoch": 0.0477754553598089,
"grad_norm": 0.8757079243659973,
"learning_rate": 9.761122723200956e-06,
"loss": 0.4665,
"step": 480
},
{
"epoch": 0.04877077734647158,
"grad_norm": 1.0122153759002686,
"learning_rate": 9.756146113267643e-06,
"loss": 0.492,
"step": 490
},
{
"epoch": 0.04976609933313427,
"grad_norm": 0.6179729700088501,
"learning_rate": 9.751169503334329e-06,
"loss": 0.5022,
"step": 500
},
{
"epoch": 0.04976609933313427,
"eval_loss": 0.4993921220302582,
"eval_runtime": 147.7401,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 500
},
{
"epoch": 0.050761421319796954,
"grad_norm": 0.952812671661377,
"learning_rate": 9.746192893401016e-06,
"loss": 0.4901,
"step": 510
},
{
"epoch": 0.051756743306459636,
"grad_norm": 0.6715916991233826,
"learning_rate": 9.741216283467702e-06,
"loss": 0.5055,
"step": 520
},
{
"epoch": 0.052752065293122326,
"grad_norm": 0.674640953540802,
"learning_rate": 9.736239673534389e-06,
"loss": 0.4874,
"step": 530
},
{
"epoch": 0.05374738727978501,
"grad_norm": 0.7867962718009949,
"learning_rate": 9.731263063601075e-06,
"loss": 0.4956,
"step": 540
},
{
"epoch": 0.0547427092664477,
"grad_norm": 0.9035332202911377,
"learning_rate": 9.726286453667762e-06,
"loss": 0.499,
"step": 550
},
{
"epoch": 0.05573803125311038,
"grad_norm": 0.7009295225143433,
"learning_rate": 9.72130984373445e-06,
"loss": 0.5034,
"step": 560
},
{
"epoch": 0.05673335323977307,
"grad_norm": 0.7018862366676331,
"learning_rate": 9.716333233801135e-06,
"loss": 0.5137,
"step": 570
},
{
"epoch": 0.05772867522643575,
"grad_norm": 0.7812825441360474,
"learning_rate": 9.711356623867822e-06,
"loss": 0.4724,
"step": 580
},
{
"epoch": 0.058723997213098435,
"grad_norm": 0.6245225071907043,
"learning_rate": 9.70638001393451e-06,
"loss": 0.4446,
"step": 590
},
{
"epoch": 0.059719319199761124,
"grad_norm": 0.9083976149559021,
"learning_rate": 9.701403404001195e-06,
"loss": 0.4884,
"step": 600
},
{
"epoch": 0.059719319199761124,
"eval_loss": 0.4891846477985382,
"eval_runtime": 147.5284,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.691,
"step": 600
},
{
"epoch": 0.06071464118642381,
"grad_norm": 0.6195352673530579,
"learning_rate": 9.69642679406788e-06,
"loss": 0.5121,
"step": 610
},
{
"epoch": 0.061709963173086496,
"grad_norm": 0.8068727254867554,
"learning_rate": 9.691450184134568e-06,
"loss": 0.4689,
"step": 620
},
{
"epoch": 0.06270528515974919,
"grad_norm": 1.0427749156951904,
"learning_rate": 9.686473574201255e-06,
"loss": 0.4968,
"step": 630
},
{
"epoch": 0.06370060714641186,
"grad_norm": 0.698349118232727,
"learning_rate": 9.681496964267941e-06,
"loss": 0.4691,
"step": 640
},
{
"epoch": 0.06469592913307455,
"grad_norm": 0.9104384183883667,
"learning_rate": 9.676520354334628e-06,
"loss": 0.4775,
"step": 650
},
{
"epoch": 0.06569125111973724,
"grad_norm": 0.8729726076126099,
"learning_rate": 9.671543744401316e-06,
"loss": 0.5201,
"step": 660
},
{
"epoch": 0.06668657310639992,
"grad_norm": 0.9858236908912659,
"learning_rate": 9.666567134468001e-06,
"loss": 0.4268,
"step": 670
},
{
"epoch": 0.0676818950930626,
"grad_norm": 2.322754383087158,
"learning_rate": 9.661590524534687e-06,
"loss": 0.4744,
"step": 680
},
{
"epoch": 0.0686772170797253,
"grad_norm": 0.9327623248100281,
"learning_rate": 9.656613914601374e-06,
"loss": 0.4355,
"step": 690
},
{
"epoch": 0.06967253906638797,
"grad_norm": 0.6949413418769836,
"learning_rate": 9.651637304668062e-06,
"loss": 0.465,
"step": 700
},
{
"epoch": 0.06967253906638797,
"eval_loss": 0.4817120432853699,
"eval_runtime": 147.5643,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.691,
"step": 700
},
{
"epoch": 0.07066786105305066,
"grad_norm": 0.5208165049552917,
"learning_rate": 9.646660694734747e-06,
"loss": 0.4973,
"step": 710
},
{
"epoch": 0.07166318303971335,
"grad_norm": 0.8434884548187256,
"learning_rate": 9.641684084801434e-06,
"loss": 0.4721,
"step": 720
},
{
"epoch": 0.07265850502637604,
"grad_norm": 0.7161769866943359,
"learning_rate": 9.636707474868122e-06,
"loss": 0.498,
"step": 730
},
{
"epoch": 0.07365382701303871,
"grad_norm": 0.7036088705062866,
"learning_rate": 9.631730864934807e-06,
"loss": 0.4672,
"step": 740
},
{
"epoch": 0.0746491489997014,
"grad_norm": 0.9175013899803162,
"learning_rate": 9.626754255001493e-06,
"loss": 0.4781,
"step": 750
},
{
"epoch": 0.07564447098636409,
"grad_norm": 0.678519606590271,
"learning_rate": 9.62177764506818e-06,
"loss": 0.4048,
"step": 760
},
{
"epoch": 0.07663979297302677,
"grad_norm": 0.6295528411865234,
"learning_rate": 9.616801035134868e-06,
"loss": 0.449,
"step": 770
},
{
"epoch": 0.07763511495968946,
"grad_norm": 0.5424385666847229,
"learning_rate": 9.611824425201553e-06,
"loss": 0.4394,
"step": 780
},
{
"epoch": 0.07863043694635215,
"grad_norm": 0.508836030960083,
"learning_rate": 9.60684781526824e-06,
"loss": 0.4317,
"step": 790
},
{
"epoch": 0.07962575893301484,
"grad_norm": 0.6004147529602051,
"learning_rate": 9.601871205334926e-06,
"loss": 0.4308,
"step": 800
},
{
"epoch": 0.07962575893301484,
"eval_loss": 0.47557342052459717,
"eval_runtime": 147.5812,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.691,
"step": 800
},
{
"epoch": 0.08062108091967751,
"grad_norm": 0.5553786754608154,
"learning_rate": 9.596894595401613e-06,
"loss": 0.4376,
"step": 810
},
{
"epoch": 0.0816164029063402,
"grad_norm": 0.7254445552825928,
"learning_rate": 9.591917985468299e-06,
"loss": 0.4884,
"step": 820
},
{
"epoch": 0.08261172489300289,
"grad_norm": 0.7175013422966003,
"learning_rate": 9.586941375534986e-06,
"loss": 0.4167,
"step": 830
},
{
"epoch": 0.08360704687966557,
"grad_norm": 0.6464620232582092,
"learning_rate": 9.581964765601674e-06,
"loss": 0.4622,
"step": 840
},
{
"epoch": 0.08460236886632826,
"grad_norm": 0.6999176144599915,
"learning_rate": 9.57698815566836e-06,
"loss": 0.4708,
"step": 850
},
{
"epoch": 0.08559769085299095,
"grad_norm": 0.7939727306365967,
"learning_rate": 9.572011545735047e-06,
"loss": 0.4633,
"step": 860
},
{
"epoch": 0.08659301283965362,
"grad_norm": 0.473017156124115,
"learning_rate": 9.567034935801732e-06,
"loss": 0.4585,
"step": 870
},
{
"epoch": 0.08758833482631631,
"grad_norm": 0.7265183329582214,
"learning_rate": 9.56205832586842e-06,
"loss": 0.4485,
"step": 880
},
{
"epoch": 0.088583656812979,
"grad_norm": 0.539735734462738,
"learning_rate": 9.557081715935105e-06,
"loss": 0.475,
"step": 890
},
{
"epoch": 0.08957897879964169,
"grad_norm": 0.7587076425552368,
"learning_rate": 9.552105106001792e-06,
"loss": 0.4347,
"step": 900
},
{
"epoch": 0.08957897879964169,
"eval_loss": 0.4690374732017517,
"eval_runtime": 147.5672,
"eval_samples_per_second": 1.376,
"eval_steps_per_second": 0.691,
"step": 900
},
{
"epoch": 0.09057430078630437,
"grad_norm": 0.7549741864204407,
"learning_rate": 9.547128496068478e-06,
"loss": 0.4434,
"step": 910
},
{
"epoch": 0.09156962277296705,
"grad_norm": 0.686689555644989,
"learning_rate": 9.542151886135165e-06,
"loss": 0.4052,
"step": 920
},
{
"epoch": 0.09256494475962974,
"grad_norm": 1.02870512008667,
"learning_rate": 9.537175276201853e-06,
"loss": 0.4806,
"step": 930
},
{
"epoch": 0.09356026674629242,
"grad_norm": 0.7680675983428955,
"learning_rate": 9.532198666268538e-06,
"loss": 0.4609,
"step": 940
},
{
"epoch": 0.09455558873295511,
"grad_norm": 0.5478435754776001,
"learning_rate": 9.527222056335224e-06,
"loss": 0.4171,
"step": 950
},
{
"epoch": 0.0955509107196178,
"grad_norm": 0.5974985361099243,
"learning_rate": 9.522245446401913e-06,
"loss": 0.4686,
"step": 960
},
{
"epoch": 0.09654623270628049,
"grad_norm": 0.997151792049408,
"learning_rate": 9.517268836468598e-06,
"loss": 0.4676,
"step": 970
},
{
"epoch": 0.09754155469294316,
"grad_norm": 0.6366075277328491,
"learning_rate": 9.512292226535284e-06,
"loss": 0.4467,
"step": 980
},
{
"epoch": 0.09853687667960585,
"grad_norm": 0.5682553052902222,
"learning_rate": 9.507315616601971e-06,
"loss": 0.4772,
"step": 990
},
{
"epoch": 0.09953219866626854,
"grad_norm": 0.5869882106781006,
"learning_rate": 9.502339006668659e-06,
"loss": 0.3976,
"step": 1000
},
{
"epoch": 0.09953219866626854,
"eval_loss": 0.46156319975852966,
"eval_runtime": 147.6656,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1000
},
{
"epoch": 0.10052752065293122,
"grad_norm": 0.5758237838745117,
"learning_rate": 9.497362396735344e-06,
"loss": 0.4528,
"step": 1010
},
{
"epoch": 0.10152284263959391,
"grad_norm": 0.700281023979187,
"learning_rate": 9.492385786802032e-06,
"loss": 0.4545,
"step": 1020
},
{
"epoch": 0.1025181646262566,
"grad_norm": 1.1320914030075073,
"learning_rate": 9.487409176868719e-06,
"loss": 0.4331,
"step": 1030
},
{
"epoch": 0.10351348661291927,
"grad_norm": 0.6469867825508118,
"learning_rate": 9.482432566935405e-06,
"loss": 0.3759,
"step": 1040
},
{
"epoch": 0.10450880859958196,
"grad_norm": 0.9471383094787598,
"learning_rate": 9.47745595700209e-06,
"loss": 0.4041,
"step": 1050
},
{
"epoch": 0.10550413058624465,
"grad_norm": 0.5729160904884338,
"learning_rate": 9.472479347068777e-06,
"loss": 0.4871,
"step": 1060
},
{
"epoch": 0.10649945257290734,
"grad_norm": 0.642436683177948,
"learning_rate": 9.467502737135465e-06,
"loss": 0.3893,
"step": 1070
},
{
"epoch": 0.10749477455957002,
"grad_norm": 0.95659339427948,
"learning_rate": 9.46252612720215e-06,
"loss": 0.4486,
"step": 1080
},
{
"epoch": 0.1084900965462327,
"grad_norm": 0.6642667055130005,
"learning_rate": 9.457549517268838e-06,
"loss": 0.5168,
"step": 1090
},
{
"epoch": 0.1094854185328954,
"grad_norm": 0.5805796980857849,
"learning_rate": 9.452572907335525e-06,
"loss": 0.4019,
"step": 1100
},
{
"epoch": 0.1094854185328954,
"eval_loss": 0.4559178054332733,
"eval_runtime": 147.5891,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1100
},
{
"epoch": 0.11048074051955807,
"grad_norm": 0.7006909251213074,
"learning_rate": 9.44759629740221e-06,
"loss": 0.457,
"step": 1110
},
{
"epoch": 0.11147606250622076,
"grad_norm": 1.1821540594100952,
"learning_rate": 9.442619687468896e-06,
"loss": 0.3484,
"step": 1120
},
{
"epoch": 0.11247138449288345,
"grad_norm": 0.7232743501663208,
"learning_rate": 9.437643077535584e-06,
"loss": 0.417,
"step": 1130
},
{
"epoch": 0.11346670647954614,
"grad_norm": 0.6104183197021484,
"learning_rate": 9.43266646760227e-06,
"loss": 0.4821,
"step": 1140
},
{
"epoch": 0.11446202846620881,
"grad_norm": 0.5961386561393738,
"learning_rate": 9.427689857668956e-06,
"loss": 0.4834,
"step": 1150
},
{
"epoch": 0.1154573504528715,
"grad_norm": 0.5530894994735718,
"learning_rate": 9.422713247735644e-06,
"loss": 0.443,
"step": 1160
},
{
"epoch": 0.1164526724395342,
"grad_norm": 0.5148622393608093,
"learning_rate": 9.41773663780233e-06,
"loss": 0.4029,
"step": 1170
},
{
"epoch": 0.11744799442619687,
"grad_norm": 0.6148583292961121,
"learning_rate": 9.412760027869017e-06,
"loss": 0.4308,
"step": 1180
},
{
"epoch": 0.11844331641285956,
"grad_norm": 0.7840449213981628,
"learning_rate": 9.407783417935702e-06,
"loss": 0.499,
"step": 1190
},
{
"epoch": 0.11943863839952225,
"grad_norm": 0.6757422089576721,
"learning_rate": 9.40280680800239e-06,
"loss": 0.4263,
"step": 1200
},
{
"epoch": 0.11943863839952225,
"eval_loss": 0.4505193829536438,
"eval_runtime": 147.6664,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1200
},
{
"epoch": 0.12043396038618492,
"grad_norm": 0.630874752998352,
"learning_rate": 9.397830198069075e-06,
"loss": 0.492,
"step": 1210
},
{
"epoch": 0.12142928237284761,
"grad_norm": 0.7458256483078003,
"learning_rate": 9.392853588135763e-06,
"loss": 0.4612,
"step": 1220
},
{
"epoch": 0.1224246043595103,
"grad_norm": 0.6903111934661865,
"learning_rate": 9.38787697820245e-06,
"loss": 0.4882,
"step": 1230
},
{
"epoch": 0.12341992634617299,
"grad_norm": 1.0817712545394897,
"learning_rate": 9.382900368269135e-06,
"loss": 0.4658,
"step": 1240
},
{
"epoch": 0.12441524833283567,
"grad_norm": 0.8182739615440369,
"learning_rate": 9.377923758335823e-06,
"loss": 0.4281,
"step": 1250
},
{
"epoch": 0.12541057031949837,
"grad_norm": 0.5155394077301025,
"learning_rate": 9.372947148402508e-06,
"loss": 0.4312,
"step": 1260
},
{
"epoch": 0.12640589230616103,
"grad_norm": 0.6190319657325745,
"learning_rate": 9.367970538469196e-06,
"loss": 0.4537,
"step": 1270
},
{
"epoch": 0.12740121429282372,
"grad_norm": 0.7704219222068787,
"learning_rate": 9.362993928535881e-06,
"loss": 0.4873,
"step": 1280
},
{
"epoch": 0.1283965362794864,
"grad_norm": 0.6395025849342346,
"learning_rate": 9.358017318602569e-06,
"loss": 0.4374,
"step": 1290
},
{
"epoch": 0.1293918582661491,
"grad_norm": 0.9248729944229126,
"learning_rate": 9.353040708669256e-06,
"loss": 0.4183,
"step": 1300
},
{
"epoch": 0.1293918582661491,
"eval_loss": 0.44450852274894714,
"eval_runtime": 147.6747,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1300
},
{
"epoch": 0.1303871802528118,
"grad_norm": 0.6703208088874817,
"learning_rate": 9.348064098735942e-06,
"loss": 0.4017,
"step": 1310
},
{
"epoch": 0.13138250223947448,
"grad_norm": 0.7091432213783264,
"learning_rate": 9.343087488802627e-06,
"loss": 0.4858,
"step": 1320
},
{
"epoch": 0.13237782422613714,
"grad_norm": 0.6519076824188232,
"learning_rate": 9.338110878869316e-06,
"loss": 0.402,
"step": 1330
},
{
"epoch": 0.13337314621279983,
"grad_norm": 0.7192474603652954,
"learning_rate": 9.333134268936002e-06,
"loss": 0.4275,
"step": 1340
},
{
"epoch": 0.13436846819946252,
"grad_norm": 0.626981794834137,
"learning_rate": 9.328157659002687e-06,
"loss": 0.4276,
"step": 1350
},
{
"epoch": 0.1353637901861252,
"grad_norm": 0.8239569664001465,
"learning_rate": 9.323181049069375e-06,
"loss": 0.4384,
"step": 1360
},
{
"epoch": 0.1363591121727879,
"grad_norm": 0.727737307548523,
"learning_rate": 9.318204439136062e-06,
"loss": 0.3892,
"step": 1370
},
{
"epoch": 0.1373544341594506,
"grad_norm": 0.6430094242095947,
"learning_rate": 9.313227829202748e-06,
"loss": 0.3579,
"step": 1380
},
{
"epoch": 0.13834975614611328,
"grad_norm": 0.7504476308822632,
"learning_rate": 9.308251219269435e-06,
"loss": 0.4585,
"step": 1390
},
{
"epoch": 0.13934507813277594,
"grad_norm": 1.0239664316177368,
"learning_rate": 9.303274609336122e-06,
"loss": 0.4696,
"step": 1400
},
{
"epoch": 0.13934507813277594,
"eval_loss": 0.43923673033714294,
"eval_runtime": 147.7239,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 1400
},
{
"epoch": 0.14034040011943863,
"grad_norm": 0.6847706437110901,
"learning_rate": 9.298297999402808e-06,
"loss": 0.4823,
"step": 1410
},
{
"epoch": 0.14133572210610132,
"grad_norm": 0.5733935832977295,
"learning_rate": 9.293321389469493e-06,
"loss": 0.4088,
"step": 1420
},
{
"epoch": 0.142331044092764,
"grad_norm": 0.8858775496482849,
"learning_rate": 9.28834477953618e-06,
"loss": 0.3863,
"step": 1430
},
{
"epoch": 0.1433263660794267,
"grad_norm": 0.6404774785041809,
"learning_rate": 9.283368169602868e-06,
"loss": 0.3951,
"step": 1440
},
{
"epoch": 0.1443216880660894,
"grad_norm": 0.6125516891479492,
"learning_rate": 9.278391559669554e-06,
"loss": 0.4408,
"step": 1450
},
{
"epoch": 0.14531701005275208,
"grad_norm": 0.5629742741584778,
"learning_rate": 9.273414949736241e-06,
"loss": 0.4319,
"step": 1460
},
{
"epoch": 0.14631233203941474,
"grad_norm": 0.6768545508384705,
"learning_rate": 9.268438339802927e-06,
"loss": 0.4002,
"step": 1470
},
{
"epoch": 0.14730765402607743,
"grad_norm": 0.6743785738945007,
"learning_rate": 9.263461729869614e-06,
"loss": 0.4779,
"step": 1480
},
{
"epoch": 0.14830297601274012,
"grad_norm": 0.5943326354026794,
"learning_rate": 9.2584851199363e-06,
"loss": 0.4406,
"step": 1490
},
{
"epoch": 0.1492982979994028,
"grad_norm": 0.8586482405662537,
"learning_rate": 9.253508510002987e-06,
"loss": 0.4326,
"step": 1500
},
{
"epoch": 0.1492982979994028,
"eval_loss": 0.43489304184913635,
"eval_runtime": 147.6747,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1500
},
{
"epoch": 0.1502936199860655,
"grad_norm": 0.862763524055481,
"learning_rate": 9.248531900069674e-06,
"loss": 0.4917,
"step": 1510
},
{
"epoch": 0.15128894197272819,
"grad_norm": 0.6556192636489868,
"learning_rate": 9.24355529013636e-06,
"loss": 0.4333,
"step": 1520
},
{
"epoch": 0.15228426395939088,
"grad_norm": 0.5479542016983032,
"learning_rate": 9.238578680203047e-06,
"loss": 0.4176,
"step": 1530
},
{
"epoch": 0.15327958594605354,
"grad_norm": 0.8119767308235168,
"learning_rate": 9.233602070269733e-06,
"loss": 0.4171,
"step": 1540
},
{
"epoch": 0.15427490793271623,
"grad_norm": 0.9051875472068787,
"learning_rate": 9.22862546033642e-06,
"loss": 0.4529,
"step": 1550
},
{
"epoch": 0.15527022991937892,
"grad_norm": 0.5972510576248169,
"learning_rate": 9.223648850403106e-06,
"loss": 0.4752,
"step": 1560
},
{
"epoch": 0.1562655519060416,
"grad_norm": 0.6712588667869568,
"learning_rate": 9.218672240469793e-06,
"loss": 0.4179,
"step": 1570
},
{
"epoch": 0.1572608738927043,
"grad_norm": 0.637656569480896,
"learning_rate": 9.213695630536478e-06,
"loss": 0.4624,
"step": 1580
},
{
"epoch": 0.15825619587936698,
"grad_norm": 0.7319675087928772,
"learning_rate": 9.208719020603166e-06,
"loss": 0.4149,
"step": 1590
},
{
"epoch": 0.15925151786602967,
"grad_norm": 0.6740835905075073,
"learning_rate": 9.203742410669853e-06,
"loss": 0.4348,
"step": 1600
},
{
"epoch": 0.15925151786602967,
"eval_loss": 0.4290333390235901,
"eval_runtime": 147.7478,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 1600
},
{
"epoch": 0.16024683985269234,
"grad_norm": 0.7110456824302673,
"learning_rate": 9.198765800736539e-06,
"loss": 0.3808,
"step": 1610
},
{
"epoch": 0.16124216183935502,
"grad_norm": 0.6934688091278076,
"learning_rate": 9.193789190803224e-06,
"loss": 0.4279,
"step": 1620
},
{
"epoch": 0.16223748382601771,
"grad_norm": 0.6783742308616638,
"learning_rate": 9.188812580869912e-06,
"loss": 0.413,
"step": 1630
},
{
"epoch": 0.1632328058126804,
"grad_norm": 0.5934478044509888,
"learning_rate": 9.183835970936599e-06,
"loss": 0.476,
"step": 1640
},
{
"epoch": 0.1642281277993431,
"grad_norm": 0.9043450951576233,
"learning_rate": 9.178859361003285e-06,
"loss": 0.392,
"step": 1650
},
{
"epoch": 0.16522344978600578,
"grad_norm": 0.4757988154888153,
"learning_rate": 9.173882751069972e-06,
"loss": 0.3812,
"step": 1660
},
{
"epoch": 0.16621877177266844,
"grad_norm": 0.7402971982955933,
"learning_rate": 9.16890614113666e-06,
"loss": 0.4293,
"step": 1670
},
{
"epoch": 0.16721409375933113,
"grad_norm": 0.6279808282852173,
"learning_rate": 9.163929531203345e-06,
"loss": 0.4453,
"step": 1680
},
{
"epoch": 0.16820941574599382,
"grad_norm": 0.6272904276847839,
"learning_rate": 9.15895292127003e-06,
"loss": 0.4215,
"step": 1690
},
{
"epoch": 0.1692047377326565,
"grad_norm": 0.806103527545929,
"learning_rate": 9.15397631133672e-06,
"loss": 0.4236,
"step": 1700
},
{
"epoch": 0.1692047377326565,
"eval_loss": 0.424538791179657,
"eval_runtime": 147.6192,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1700
},
{
"epoch": 0.1702000597193192,
"grad_norm": 0.7595136165618896,
"learning_rate": 9.148999701403405e-06,
"loss": 0.4473,
"step": 1710
},
{
"epoch": 0.1711953817059819,
"grad_norm": 0.5029250979423523,
"learning_rate": 9.14402309147009e-06,
"loss": 0.4248,
"step": 1720
},
{
"epoch": 0.17219070369264458,
"grad_norm": 0.7487345933914185,
"learning_rate": 9.139046481536778e-06,
"loss": 0.3795,
"step": 1730
},
{
"epoch": 0.17318602567930724,
"grad_norm": 1.122206211090088,
"learning_rate": 9.134069871603465e-06,
"loss": 0.4026,
"step": 1740
},
{
"epoch": 0.17418134766596993,
"grad_norm": 0.6429542899131775,
"learning_rate": 9.129093261670151e-06,
"loss": 0.4142,
"step": 1750
},
{
"epoch": 0.17517666965263262,
"grad_norm": 0.7902116775512695,
"learning_rate": 9.124116651736838e-06,
"loss": 0.4266,
"step": 1760
},
{
"epoch": 0.1761719916392953,
"grad_norm": 0.6928035020828247,
"learning_rate": 9.119140041803524e-06,
"loss": 0.4036,
"step": 1770
},
{
"epoch": 0.177167313625958,
"grad_norm": 0.637829601764679,
"learning_rate": 9.114163431870211e-06,
"loss": 0.4139,
"step": 1780
},
{
"epoch": 0.1781626356126207,
"grad_norm": 0.8418923616409302,
"learning_rate": 9.109186821936897e-06,
"loss": 0.4538,
"step": 1790
},
{
"epoch": 0.17915795759928338,
"grad_norm": 0.6597120761871338,
"learning_rate": 9.104210212003584e-06,
"loss": 0.428,
"step": 1800
},
{
"epoch": 0.17915795759928338,
"eval_loss": 0.4206041693687439,
"eval_runtime": 147.6714,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1800
},
{
"epoch": 0.18015327958594604,
"grad_norm": 0.9092034101486206,
"learning_rate": 9.099233602070271e-06,
"loss": 0.3827,
"step": 1810
},
{
"epoch": 0.18114860157260873,
"grad_norm": 0.7151809334754944,
"learning_rate": 9.094256992136957e-06,
"loss": 0.4096,
"step": 1820
},
{
"epoch": 0.18214392355927142,
"grad_norm": 0.812656819820404,
"learning_rate": 9.089280382203644e-06,
"loss": 0.398,
"step": 1830
},
{
"epoch": 0.1831392455459341,
"grad_norm": 0.6819058060646057,
"learning_rate": 9.08430377227033e-06,
"loss": 0.4289,
"step": 1840
},
{
"epoch": 0.1841345675325968,
"grad_norm": 0.6796212792396545,
"learning_rate": 9.079327162337017e-06,
"loss": 0.4107,
"step": 1850
},
{
"epoch": 0.1851298895192595,
"grad_norm": 0.604881227016449,
"learning_rate": 9.074350552403703e-06,
"loss": 0.3888,
"step": 1860
},
{
"epoch": 0.18612521150592218,
"grad_norm": 0.5823159217834473,
"learning_rate": 9.06937394247039e-06,
"loss": 0.4292,
"step": 1870
},
{
"epoch": 0.18712053349258484,
"grad_norm": 0.6591698527336121,
"learning_rate": 9.064397332537076e-06,
"loss": 0.4559,
"step": 1880
},
{
"epoch": 0.18811585547924753,
"grad_norm": 0.666591465473175,
"learning_rate": 9.059420722603763e-06,
"loss": 0.4486,
"step": 1890
},
{
"epoch": 0.18911117746591022,
"grad_norm": 0.8700873255729675,
"learning_rate": 9.05444411267045e-06,
"loss": 0.3934,
"step": 1900
},
{
"epoch": 0.18911117746591022,
"eval_loss": 0.41719409823417664,
"eval_runtime": 147.6671,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 1900
},
{
"epoch": 0.1901064994525729,
"grad_norm": 0.5683835744857788,
"learning_rate": 9.049467502737136e-06,
"loss": 0.4148,
"step": 1910
},
{
"epoch": 0.1911018214392356,
"grad_norm": 0.7323755621910095,
"learning_rate": 9.044490892803823e-06,
"loss": 0.4473,
"step": 1920
},
{
"epoch": 0.1920971434258983,
"grad_norm": 0.8059419393539429,
"learning_rate": 9.039514282870509e-06,
"loss": 0.4092,
"step": 1930
},
{
"epoch": 0.19309246541256098,
"grad_norm": 0.5238020420074463,
"learning_rate": 9.034537672937196e-06,
"loss": 0.4161,
"step": 1940
},
{
"epoch": 0.19408778739922364,
"grad_norm": 0.7691717147827148,
"learning_rate": 9.029561063003882e-06,
"loss": 0.3996,
"step": 1950
},
{
"epoch": 0.19508310938588633,
"grad_norm": 0.5275344848632812,
"learning_rate": 9.024584453070569e-06,
"loss": 0.3936,
"step": 1960
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.9201516509056091,
"learning_rate": 9.019607843137256e-06,
"loss": 0.4327,
"step": 1970
},
{
"epoch": 0.1970737533592117,
"grad_norm": 0.6645549535751343,
"learning_rate": 9.014631233203942e-06,
"loss": 0.439,
"step": 1980
},
{
"epoch": 0.1980690753458744,
"grad_norm": 0.4919885993003845,
"learning_rate": 9.009654623270628e-06,
"loss": 0.3584,
"step": 1990
},
{
"epoch": 0.19906439733253709,
"grad_norm": 0.7819716930389404,
"learning_rate": 9.004678013337315e-06,
"loss": 0.4258,
"step": 2000
},
{
"epoch": 0.19906439733253709,
"eval_loss": 0.4135349690914154,
"eval_runtime": 147.6676,
"eval_samples_per_second": 1.375,
"eval_steps_per_second": 0.691,
"step": 2000
},
{
"epoch": 0.20005971931919977,
"grad_norm": 0.6763346195220947,
"learning_rate": 8.999701403404002e-06,
"loss": 0.3734,
"step": 2010
},
{
"epoch": 0.20105504130586244,
"grad_norm": 0.974773108959198,
"learning_rate": 8.994724793470688e-06,
"loss": 0.4128,
"step": 2020
},
{
"epoch": 0.20205036329252513,
"grad_norm": 0.7922454476356506,
"learning_rate": 8.989748183537375e-06,
"loss": 0.4699,
"step": 2030
},
{
"epoch": 0.20304568527918782,
"grad_norm": 0.7217792272567749,
"learning_rate": 8.984771573604062e-06,
"loss": 0.4368,
"step": 2040
},
{
"epoch": 0.2040410072658505,
"grad_norm": 0.9531657695770264,
"learning_rate": 8.979794963670748e-06,
"loss": 0.4124,
"step": 2050
},
{
"epoch": 0.2050363292525132,
"grad_norm": 0.5895671248435974,
"learning_rate": 8.974818353737434e-06,
"loss": 0.4065,
"step": 2060
},
{
"epoch": 0.20603165123917588,
"grad_norm": 0.6587451100349426,
"learning_rate": 8.969841743804123e-06,
"loss": 0.4182,
"step": 2070
},
{
"epoch": 0.20702697322583855,
"grad_norm": 0.5056644678115845,
"learning_rate": 8.964865133870808e-06,
"loss": 0.4146,
"step": 2080
},
{
"epoch": 0.20802229521250123,
"grad_norm": 0.8369359374046326,
"learning_rate": 8.959888523937494e-06,
"loss": 0.4258,
"step": 2090
},
{
"epoch": 0.20901761719916392,
"grad_norm": 0.8079156279563904,
"learning_rate": 8.954911914004181e-06,
"loss": 0.4172,
"step": 2100
},
{
"epoch": 0.20901761719916392,
"eval_loss": 0.40956470370292664,
"eval_runtime": 147.7554,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2100
},
{
"epoch": 0.2100129391858266,
"grad_norm": 0.5938236117362976,
"learning_rate": 8.949935304070869e-06,
"loss": 0.4058,
"step": 2110
},
{
"epoch": 0.2110082611724893,
"grad_norm": 0.5103029608726501,
"learning_rate": 8.944958694137554e-06,
"loss": 0.3338,
"step": 2120
},
{
"epoch": 0.212003583159152,
"grad_norm": 0.8399671316146851,
"learning_rate": 8.939982084204241e-06,
"loss": 0.4135,
"step": 2130
},
{
"epoch": 0.21299890514581468,
"grad_norm": 0.8162589073181152,
"learning_rate": 8.935005474270927e-06,
"loss": 0.379,
"step": 2140
},
{
"epoch": 0.21399422713247734,
"grad_norm": 0.5345713496208191,
"learning_rate": 8.930028864337614e-06,
"loss": 0.4356,
"step": 2150
},
{
"epoch": 0.21498954911914003,
"grad_norm": 0.5709038972854614,
"learning_rate": 8.9250522544043e-06,
"loss": 0.3961,
"step": 2160
},
{
"epoch": 0.21598487110580272,
"grad_norm": 0.8017010688781738,
"learning_rate": 8.920075644470987e-06,
"loss": 0.3934,
"step": 2170
},
{
"epoch": 0.2169801930924654,
"grad_norm": 0.7133475542068481,
"learning_rate": 8.915099034537673e-06,
"loss": 0.386,
"step": 2180
},
{
"epoch": 0.2179755150791281,
"grad_norm": 0.861768901348114,
"learning_rate": 8.91012242460436e-06,
"loss": 0.3981,
"step": 2190
},
{
"epoch": 0.2189708370657908,
"grad_norm": 0.6387837529182434,
"learning_rate": 8.905145814671047e-06,
"loss": 0.4277,
"step": 2200
},
{
"epoch": 0.2189708370657908,
"eval_loss": 0.40670302510261536,
"eval_runtime": 147.76,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2200
},
{
"epoch": 0.21996615905245348,
"grad_norm": 0.9591347575187683,
"learning_rate": 8.900169204737733e-06,
"loss": 0.3809,
"step": 2210
},
{
"epoch": 0.22096148103911614,
"grad_norm": 0.6483083963394165,
"learning_rate": 8.89519259480442e-06,
"loss": 0.4071,
"step": 2220
},
{
"epoch": 0.22195680302577883,
"grad_norm": 1.0261069536209106,
"learning_rate": 8.890215984871106e-06,
"loss": 0.4145,
"step": 2230
},
{
"epoch": 0.22295212501244152,
"grad_norm": 0.6538086533546448,
"learning_rate": 8.885239374937793e-06,
"loss": 0.4322,
"step": 2240
},
{
"epoch": 0.2239474469991042,
"grad_norm": 0.4469331204891205,
"learning_rate": 8.880262765004479e-06,
"loss": 0.4052,
"step": 2250
},
{
"epoch": 0.2249427689857669,
"grad_norm": 0.5114856958389282,
"learning_rate": 8.875286155071166e-06,
"loss": 0.4143,
"step": 2260
},
{
"epoch": 0.2259380909724296,
"grad_norm": 0.7658188343048096,
"learning_rate": 8.870309545137854e-06,
"loss": 0.4345,
"step": 2270
},
{
"epoch": 0.22693341295909228,
"grad_norm": 0.6381837725639343,
"learning_rate": 8.86533293520454e-06,
"loss": 0.3868,
"step": 2280
},
{
"epoch": 0.22792873494575494,
"grad_norm": 0.5213243961334229,
"learning_rate": 8.860356325271225e-06,
"loss": 0.3849,
"step": 2290
},
{
"epoch": 0.22892405693241763,
"grad_norm": 0.7393907904624939,
"learning_rate": 8.855379715337912e-06,
"loss": 0.4282,
"step": 2300
},
{
"epoch": 0.22892405693241763,
"eval_loss": 0.4041208326816559,
"eval_runtime": 147.7723,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2300
},
{
"epoch": 0.22991937891908032,
"grad_norm": 0.5622240304946899,
"learning_rate": 8.8504031054046e-06,
"loss": 0.3818,
"step": 2310
},
{
"epoch": 0.230914700905743,
"grad_norm": 0.7211191654205322,
"learning_rate": 8.845426495471285e-06,
"loss": 0.3596,
"step": 2320
},
{
"epoch": 0.2319100228924057,
"grad_norm": 0.5431678295135498,
"learning_rate": 8.840449885537972e-06,
"loss": 0.3645,
"step": 2330
},
{
"epoch": 0.2329053448790684,
"grad_norm": 1.0264047384262085,
"learning_rate": 8.83547327560466e-06,
"loss": 0.4152,
"step": 2340
},
{
"epoch": 0.23390066686573108,
"grad_norm": 0.6439436078071594,
"learning_rate": 8.830496665671345e-06,
"loss": 0.4169,
"step": 2350
},
{
"epoch": 0.23489598885239374,
"grad_norm": 0.6291099786758423,
"learning_rate": 8.825520055738031e-06,
"loss": 0.4246,
"step": 2360
},
{
"epoch": 0.23589131083905643,
"grad_norm": 0.5020752549171448,
"learning_rate": 8.820543445804718e-06,
"loss": 0.3649,
"step": 2370
},
{
"epoch": 0.23688663282571912,
"grad_norm": 0.5813655257225037,
"learning_rate": 8.815566835871405e-06,
"loss": 0.403,
"step": 2380
},
{
"epoch": 0.2378819548123818,
"grad_norm": 0.7793263792991638,
"learning_rate": 8.810590225938091e-06,
"loss": 0.4044,
"step": 2390
},
{
"epoch": 0.2388772767990445,
"grad_norm": 1.0214496850967407,
"learning_rate": 8.805613616004778e-06,
"loss": 0.3804,
"step": 2400
},
{
"epoch": 0.2388772767990445,
"eval_loss": 0.4011123776435852,
"eval_runtime": 147.7863,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2400
},
{
"epoch": 0.2398725987857072,
"grad_norm": 0.8854981064796448,
"learning_rate": 8.800637006071466e-06,
"loss": 0.3915,
"step": 2410
},
{
"epoch": 0.24086792077236985,
"grad_norm": 0.6463388800621033,
"learning_rate": 8.795660396138151e-06,
"loss": 0.412,
"step": 2420
},
{
"epoch": 0.24186324275903254,
"grad_norm": 1.0134918689727783,
"learning_rate": 8.790683786204837e-06,
"loss": 0.4514,
"step": 2430
},
{
"epoch": 0.24285856474569523,
"grad_norm": 0.5260724425315857,
"learning_rate": 8.785707176271524e-06,
"loss": 0.393,
"step": 2440
},
{
"epoch": 0.24385388673235792,
"grad_norm": 0.7072359323501587,
"learning_rate": 8.780730566338212e-06,
"loss": 0.4061,
"step": 2450
},
{
"epoch": 0.2448492087190206,
"grad_norm": 0.505009114742279,
"learning_rate": 8.775753956404897e-06,
"loss": 0.4435,
"step": 2460
},
{
"epoch": 0.2458445307056833,
"grad_norm": 0.707790195941925,
"learning_rate": 8.770777346471584e-06,
"loss": 0.3803,
"step": 2470
},
{
"epoch": 0.24683985269234598,
"grad_norm": 1.0153621435165405,
"learning_rate": 8.765800736538272e-06,
"loss": 0.3942,
"step": 2480
},
{
"epoch": 0.24783517467900865,
"grad_norm": 0.6652597188949585,
"learning_rate": 8.760824126604957e-06,
"loss": 0.3481,
"step": 2490
},
{
"epoch": 0.24883049666567134,
"grad_norm": 0.49689826369285583,
"learning_rate": 8.755847516671645e-06,
"loss": 0.4101,
"step": 2500
},
{
"epoch": 0.24883049666567134,
"eval_loss": 0.39822638034820557,
"eval_runtime": 147.9245,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.69,
"step": 2500
},
{
"epoch": 0.24982581865233403,
"grad_norm": 0.7141602635383606,
"learning_rate": 8.75087090673833e-06,
"loss": 0.362,
"step": 2510
},
{
"epoch": 0.25082114063899674,
"grad_norm": 0.5883095264434814,
"learning_rate": 8.745894296805018e-06,
"loss": 0.4115,
"step": 2520
},
{
"epoch": 0.2518164626256594,
"grad_norm": 0.6165831685066223,
"learning_rate": 8.740917686871703e-06,
"loss": 0.3849,
"step": 2530
},
{
"epoch": 0.25281178461232207,
"grad_norm": 0.5670954585075378,
"learning_rate": 8.73594107693839e-06,
"loss": 0.3491,
"step": 2540
},
{
"epoch": 0.25380710659898476,
"grad_norm": 1.0700769424438477,
"learning_rate": 8.730964467005076e-06,
"loss": 0.4068,
"step": 2550
},
{
"epoch": 0.25480242858564744,
"grad_norm": 0.7089443206787109,
"learning_rate": 8.725987857071763e-06,
"loss": 0.4567,
"step": 2560
},
{
"epoch": 0.25579775057231013,
"grad_norm": 0.5670477747917175,
"learning_rate": 8.72101124713845e-06,
"loss": 0.4037,
"step": 2570
},
{
"epoch": 0.2567930725589728,
"grad_norm": 0.6892909407615662,
"learning_rate": 8.716034637205136e-06,
"loss": 0.3714,
"step": 2580
},
{
"epoch": 0.2577883945456355,
"grad_norm": 0.8213964104652405,
"learning_rate": 8.711058027271822e-06,
"loss": 0.4305,
"step": 2590
},
{
"epoch": 0.2587837165322982,
"grad_norm": 0.7234606146812439,
"learning_rate": 8.70608141733851e-06,
"loss": 0.4213,
"step": 2600
},
{
"epoch": 0.2587837165322982,
"eval_loss": 0.39483293890953064,
"eval_runtime": 147.915,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.69,
"step": 2600
},
{
"epoch": 0.2597790385189609,
"grad_norm": 0.6947128176689148,
"learning_rate": 8.701104807405197e-06,
"loss": 0.3851,
"step": 2610
},
{
"epoch": 0.2607743605056236,
"grad_norm": 0.8997359275817871,
"learning_rate": 8.696128197471882e-06,
"loss": 0.379,
"step": 2620
},
{
"epoch": 0.26176968249228627,
"grad_norm": 0.8184422254562378,
"learning_rate": 8.69115158753857e-06,
"loss": 0.3615,
"step": 2630
},
{
"epoch": 0.26276500447894896,
"grad_norm": 0.7109666466712952,
"learning_rate": 8.686174977605257e-06,
"loss": 0.4233,
"step": 2640
},
{
"epoch": 0.26376032646561165,
"grad_norm": 0.6844655275344849,
"learning_rate": 8.681198367671942e-06,
"loss": 0.4142,
"step": 2650
},
{
"epoch": 0.2647556484522743,
"grad_norm": 0.8344716429710388,
"learning_rate": 8.676221757738628e-06,
"loss": 0.3611,
"step": 2660
},
{
"epoch": 0.265750970438937,
"grad_norm": 0.7269201278686523,
"learning_rate": 8.671245147805315e-06,
"loss": 0.4397,
"step": 2670
},
{
"epoch": 0.26674629242559966,
"grad_norm": 0.5457523465156555,
"learning_rate": 8.666268537872003e-06,
"loss": 0.3724,
"step": 2680
},
{
"epoch": 0.26774161441226235,
"grad_norm": 0.7520753145217896,
"learning_rate": 8.661291927938688e-06,
"loss": 0.3882,
"step": 2690
},
{
"epoch": 0.26873693639892504,
"grad_norm": 0.49623236060142517,
"learning_rate": 8.656315318005376e-06,
"loss": 0.4115,
"step": 2700
},
{
"epoch": 0.26873693639892504,
"eval_loss": 0.39236727356910706,
"eval_runtime": 147.7377,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2700
},
{
"epoch": 0.26973225838558773,
"grad_norm": 0.6592463254928589,
"learning_rate": 8.651338708072063e-06,
"loss": 0.3628,
"step": 2710
},
{
"epoch": 0.2707275803722504,
"grad_norm": 0.9473317265510559,
"learning_rate": 8.646362098138749e-06,
"loss": 0.3842,
"step": 2720
},
{
"epoch": 0.2717229023589131,
"grad_norm": 0.7774178385734558,
"learning_rate": 8.641385488205434e-06,
"loss": 0.3643,
"step": 2730
},
{
"epoch": 0.2727182243455758,
"grad_norm": 0.6194160580635071,
"learning_rate": 8.636408878272121e-06,
"loss": 0.4647,
"step": 2740
},
{
"epoch": 0.2737135463322385,
"grad_norm": 0.5518766641616821,
"learning_rate": 8.631432268338809e-06,
"loss": 0.3755,
"step": 2750
},
{
"epoch": 0.2747088683189012,
"grad_norm": 0.9331585764884949,
"learning_rate": 8.626455658405494e-06,
"loss": 0.3881,
"step": 2760
},
{
"epoch": 0.27570419030556387,
"grad_norm": 0.6080964207649231,
"learning_rate": 8.621479048472182e-06,
"loss": 0.3965,
"step": 2770
},
{
"epoch": 0.27669951229222656,
"grad_norm": 0.8619922399520874,
"learning_rate": 8.616502438538869e-06,
"loss": 0.387,
"step": 2780
},
{
"epoch": 0.27769483427888925,
"grad_norm": 0.7429324984550476,
"learning_rate": 8.611525828605555e-06,
"loss": 0.3837,
"step": 2790
},
{
"epoch": 0.2786901562655519,
"grad_norm": 0.7918853759765625,
"learning_rate": 8.60654921867224e-06,
"loss": 0.3921,
"step": 2800
},
{
"epoch": 0.2786901562655519,
"eval_loss": 0.3901057541370392,
"eval_runtime": 147.7809,
"eval_samples_per_second": 1.374,
"eval_steps_per_second": 0.69,
"step": 2800
},
{
"epoch": 0.27968547825221457,
"grad_norm": 0.6200188398361206,
"learning_rate": 8.601572608738928e-06,
"loss": 0.398,
"step": 2810
},
{
"epoch": 0.28068080023887726,
"grad_norm": 0.6285167336463928,
"learning_rate": 8.596595998805615e-06,
"loss": 0.3676,
"step": 2820
},
{
"epoch": 0.28167612222553995,
"grad_norm": 0.7586702704429626,
"learning_rate": 8.5916193888723e-06,
"loss": 0.3658,
"step": 2830
},
{
"epoch": 0.28267144421220264,
"grad_norm": 0.915360152721405,
"learning_rate": 8.586642778938988e-06,
"loss": 0.3444,
"step": 2840
},
{
"epoch": 0.2836667661988653,
"grad_norm": 0.8675612807273865,
"learning_rate": 8.581666169005673e-06,
"loss": 0.3939,
"step": 2850
},
{
"epoch": 0.284662088185528,
"grad_norm": 0.8629066944122314,
"learning_rate": 8.57668955907236e-06,
"loss": 0.4055,
"step": 2860
},
{
"epoch": 0.2856574101721907,
"grad_norm": 0.8615571856498718,
"learning_rate": 8.571712949139048e-06,
"loss": 0.4392,
"step": 2870
},
{
"epoch": 0.2866527321588534,
"grad_norm": 0.675205409526825,
"learning_rate": 8.566736339205734e-06,
"loss": 0.3289,
"step": 2880
},
{
"epoch": 0.2876480541455161,
"grad_norm": 0.6187378764152527,
"learning_rate": 8.561759729272421e-06,
"loss": 0.4067,
"step": 2890
},
{
"epoch": 0.2886433761321788,
"grad_norm": 0.7826117277145386,
"learning_rate": 8.556783119339106e-06,
"loss": 0.367,
"step": 2900
},
{
"epoch": 0.2886433761321788,
"eval_loss": 0.38809624314308167,
"eval_runtime": 147.8617,
"eval_samples_per_second": 1.373,
"eval_steps_per_second": 0.69,
"step": 2900
},
{
"epoch": 0.28963869811884146,
"grad_norm": 0.6546410322189331,
"learning_rate": 8.551806509405794e-06,
"loss": 0.3727,
"step": 2910
},
{
"epoch": 0.29063402010550415,
"grad_norm": 0.8760982155799866,
"learning_rate": 8.54682989947248e-06,
"loss": 0.3967,
"step": 2920
},
{
"epoch": 0.29162934209216684,
"grad_norm": 0.64844810962677,
"learning_rate": 8.541853289539167e-06,
"loss": 0.4046,
"step": 2930
},
{
"epoch": 0.2926246640788295,
"grad_norm": 0.5126065015792847,
"learning_rate": 8.536876679605854e-06,
"loss": 0.3783,
"step": 2940
},
{
"epoch": 0.29361998606549217,
"grad_norm": 0.7168049216270447,
"learning_rate": 8.53190006967254e-06,
"loss": 0.3606,
"step": 2950
},
{
"epoch": 0.29461530805215486,
"grad_norm": 0.4847118854522705,
"learning_rate": 8.526923459739225e-06,
"loss": 0.3617,
"step": 2960
},
{
"epoch": 0.29561063003881755,
"grad_norm": 0.6937541365623474,
"learning_rate": 8.521946849805913e-06,
"loss": 0.3878,
"step": 2970
},
{
"epoch": 0.29660595202548024,
"grad_norm": 0.7482075095176697,
"learning_rate": 8.5169702398726e-06,
"loss": 0.4173,
"step": 2980
},
{
"epoch": 0.2976012740121429,
"grad_norm": 0.7130847573280334,
"learning_rate": 8.511993629939285e-06,
"loss": 0.3717,
"step": 2990
},
{
"epoch": 0.2985965959988056,
"grad_norm": 0.7087443470954895,
"learning_rate": 8.507017020005973e-06,
"loss": 0.3945,
"step": 3000
},
{
"epoch": 0.2985965959988056,
"eval_loss": 0.3846234977245331,
"eval_runtime": 147.9506,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.689,
"step": 3000
},
{
"epoch": 0.2995919179854683,
"grad_norm": 0.5839470624923706,
"learning_rate": 8.50204041007266e-06,
"loss": 0.3672,
"step": 3010
},
{
"epoch": 0.300587239972131,
"grad_norm": 0.5632269978523254,
"learning_rate": 8.497063800139346e-06,
"loss": 0.4038,
"step": 3020
},
{
"epoch": 0.3015825619587937,
"grad_norm": 0.9807242155075073,
"learning_rate": 8.492087190206031e-06,
"loss": 0.435,
"step": 3030
},
{
"epoch": 0.30257788394545637,
"grad_norm": 0.6134958267211914,
"learning_rate": 8.487110580272719e-06,
"loss": 0.3857,
"step": 3040
},
{
"epoch": 0.30357320593211906,
"grad_norm": 0.9714884757995605,
"learning_rate": 8.482133970339406e-06,
"loss": 0.3375,
"step": 3050
},
{
"epoch": 0.30456852791878175,
"grad_norm": 0.6158900856971741,
"learning_rate": 8.477157360406092e-06,
"loss": 0.3768,
"step": 3060
},
{
"epoch": 0.3055638499054444,
"grad_norm": 0.5510846376419067,
"learning_rate": 8.472180750472779e-06,
"loss": 0.3618,
"step": 3070
},
{
"epoch": 0.3065591718921071,
"grad_norm": 0.6374019384384155,
"learning_rate": 8.467204140539466e-06,
"loss": 0.3444,
"step": 3080
},
{
"epoch": 0.30755449387876976,
"grad_norm": 0.6322264075279236,
"learning_rate": 8.462227530606152e-06,
"loss": 0.3841,
"step": 3090
},
{
"epoch": 0.30854981586543245,
"grad_norm": 0.6326218843460083,
"learning_rate": 8.457250920672837e-06,
"loss": 0.3627,
"step": 3100
},
{
"epoch": 0.30854981586543245,
"eval_loss": 0.38287338614463806,
"eval_runtime": 147.987,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.689,
"step": 3100
},
{
"epoch": 0.30954513785209514,
"grad_norm": 0.8483834862709045,
"learning_rate": 8.452274310739525e-06,
"loss": 0.4364,
"step": 3110
},
{
"epoch": 0.31054045983875783,
"grad_norm": 0.9434365034103394,
"learning_rate": 8.447297700806212e-06,
"loss": 0.4027,
"step": 3120
},
{
"epoch": 0.3115357818254205,
"grad_norm": 0.7766565680503845,
"learning_rate": 8.442321090872898e-06,
"loss": 0.3241,
"step": 3130
},
{
"epoch": 0.3125311038120832,
"grad_norm": 0.7761719822883606,
"learning_rate": 8.437344480939585e-06,
"loss": 0.4041,
"step": 3140
},
{
"epoch": 0.3135264257987459,
"grad_norm": 0.8227534890174866,
"learning_rate": 8.432367871006272e-06,
"loss": 0.3915,
"step": 3150
},
{
"epoch": 0.3145217477854086,
"grad_norm": 0.6961987614631653,
"learning_rate": 8.427391261072958e-06,
"loss": 0.4119,
"step": 3160
},
{
"epoch": 0.3155170697720713,
"grad_norm": 0.725043773651123,
"learning_rate": 8.422414651139643e-06,
"loss": 0.3811,
"step": 3170
},
{
"epoch": 0.31651239175873397,
"grad_norm": 0.6801613569259644,
"learning_rate": 8.41743804120633e-06,
"loss": 0.3752,
"step": 3180
},
{
"epoch": 0.31750771374539666,
"grad_norm": 0.6735227108001709,
"learning_rate": 8.412461431273018e-06,
"loss": 0.3538,
"step": 3190
},
{
"epoch": 0.31850303573205935,
"grad_norm": 0.7424077391624451,
"learning_rate": 8.407484821339704e-06,
"loss": 0.3347,
"step": 3200
},
{
"epoch": 0.31850303573205935,
"eval_loss": 0.38142284750938416,
"eval_runtime": 148.3323,
"eval_samples_per_second": 1.369,
"eval_steps_per_second": 0.688,
"step": 3200
},
{
"epoch": 0.319498357718722,
"grad_norm": 0.6526059508323669,
"learning_rate": 8.402508211406391e-06,
"loss": 0.4098,
"step": 3210
},
{
"epoch": 0.32049367970538467,
"grad_norm": 0.8221137523651123,
"learning_rate": 8.397531601473077e-06,
"loss": 0.4044,
"step": 3220
},
{
"epoch": 0.32148900169204736,
"grad_norm": 0.7967231869697571,
"learning_rate": 8.392554991539764e-06,
"loss": 0.3989,
"step": 3230
},
{
"epoch": 0.32248432367871005,
"grad_norm": 0.8786621689796448,
"learning_rate": 8.387578381606451e-06,
"loss": 0.3113,
"step": 3240
},
{
"epoch": 0.32347964566537274,
"grad_norm": 1.084957480430603,
"learning_rate": 8.382601771673137e-06,
"loss": 0.3855,
"step": 3250
},
{
"epoch": 0.32447496765203543,
"grad_norm": 0.6978799104690552,
"learning_rate": 8.377625161739822e-06,
"loss": 0.3752,
"step": 3260
},
{
"epoch": 0.3254702896386981,
"grad_norm": 0.6280369162559509,
"learning_rate": 8.37264855180651e-06,
"loss": 0.3831,
"step": 3270
},
{
"epoch": 0.3264656116253608,
"grad_norm": 0.5700563192367554,
"learning_rate": 8.367671941873197e-06,
"loss": 0.3848,
"step": 3280
},
{
"epoch": 0.3274609336120235,
"grad_norm": 0.6714605093002319,
"learning_rate": 8.362695331939883e-06,
"loss": 0.3894,
"step": 3290
},
{
"epoch": 0.3284562555986862,
"grad_norm": 0.6634580492973328,
"learning_rate": 8.35771872200657e-06,
"loss": 0.4055,
"step": 3300
},
{
"epoch": 0.3284562555986862,
"eval_loss": 0.3794529438018799,
"eval_runtime": 147.906,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.69,
"step": 3300
},
{
"epoch": 0.3294515775853489,
"grad_norm": 0.6699293255805969,
"learning_rate": 8.352742112073257e-06,
"loss": 0.3997,
"step": 3310
},
{
"epoch": 0.33044689957201157,
"grad_norm": 0.5837434530258179,
"learning_rate": 8.347765502139943e-06,
"loss": 0.3506,
"step": 3320
},
{
"epoch": 0.33144222155867425,
"grad_norm": 0.7900473475456238,
"learning_rate": 8.342788892206629e-06,
"loss": 0.3712,
"step": 3330
},
{
"epoch": 0.3324375435453369,
"grad_norm": 0.5419691205024719,
"learning_rate": 8.337812282273316e-06,
"loss": 0.3755,
"step": 3340
},
{
"epoch": 0.3334328655319996,
"grad_norm": 0.635683536529541,
"learning_rate": 8.332835672340003e-06,
"loss": 0.3995,
"step": 3350
},
{
"epoch": 0.33442818751866227,
"grad_norm": 0.7266948223114014,
"learning_rate": 8.327859062406689e-06,
"loss": 0.398,
"step": 3360
},
{
"epoch": 0.33542350950532496,
"grad_norm": 0.8439323902130127,
"learning_rate": 8.322882452473376e-06,
"loss": 0.4093,
"step": 3370
},
{
"epoch": 0.33641883149198765,
"grad_norm": 0.6754797697067261,
"learning_rate": 8.317905842540063e-06,
"loss": 0.3638,
"step": 3380
},
{
"epoch": 0.33741415347865034,
"grad_norm": 0.7690572142601013,
"learning_rate": 8.312929232606749e-06,
"loss": 0.3408,
"step": 3390
},
{
"epoch": 0.338409475465313,
"grad_norm": 0.765877902507782,
"learning_rate": 8.307952622673435e-06,
"loss": 0.3418,
"step": 3400
},
{
"epoch": 0.338409475465313,
"eval_loss": 0.37782156467437744,
"eval_runtime": 147.8891,
"eval_samples_per_second": 1.373,
"eval_steps_per_second": 0.69,
"step": 3400
},
{
"epoch": 0.3394047974519757,
"grad_norm": 0.7344104051589966,
"learning_rate": 8.302976012740122e-06,
"loss": 0.3443,
"step": 3410
},
{
"epoch": 0.3404001194386384,
"grad_norm": 1.0199452638626099,
"learning_rate": 8.29799940280681e-06,
"loss": 0.4294,
"step": 3420
},
{
"epoch": 0.3413954414253011,
"grad_norm": 0.5666326880455017,
"learning_rate": 8.293022792873495e-06,
"loss": 0.3274,
"step": 3430
},
{
"epoch": 0.3423907634119638,
"grad_norm": 0.8385756611824036,
"learning_rate": 8.288046182940182e-06,
"loss": 0.4122,
"step": 3440
},
{
"epoch": 0.3433860853986265,
"grad_norm": 0.777019739151001,
"learning_rate": 8.28306957300687e-06,
"loss": 0.4089,
"step": 3450
},
{
"epoch": 0.34438140738528916,
"grad_norm": 0.682658851146698,
"learning_rate": 8.278092963073555e-06,
"loss": 0.3772,
"step": 3460
},
{
"epoch": 0.34537672937195185,
"grad_norm": 0.6811783313751221,
"learning_rate": 8.27311635314024e-06,
"loss": 0.3523,
"step": 3470
},
{
"epoch": 0.3463720513586145,
"grad_norm": 0.9056878685951233,
"learning_rate": 8.268139743206928e-06,
"loss": 0.3292,
"step": 3480
},
{
"epoch": 0.3473673733452772,
"grad_norm": 0.6763057708740234,
"learning_rate": 8.263163133273615e-06,
"loss": 0.3326,
"step": 3490
},
{
"epoch": 0.34836269533193986,
"grad_norm": 0.8847700953483582,
"learning_rate": 8.258186523340301e-06,
"loss": 0.4062,
"step": 3500
},
{
"epoch": 0.34836269533193986,
"eval_loss": 0.37572577595710754,
"eval_runtime": 147.9751,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.689,
"step": 3500
},
{
"epoch": 0.34935801731860255,
"grad_norm": 0.7903834581375122,
"learning_rate": 8.253209913406988e-06,
"loss": 0.3546,
"step": 3510
},
{
"epoch": 0.35035333930526524,
"grad_norm": 0.6501933336257935,
"learning_rate": 8.248233303473674e-06,
"loss": 0.3909,
"step": 3520
},
{
"epoch": 0.35134866129192793,
"grad_norm": 0.6443967819213867,
"learning_rate": 8.243256693540361e-06,
"loss": 0.3315,
"step": 3530
},
{
"epoch": 0.3523439832785906,
"grad_norm": 0.7020339965820312,
"learning_rate": 8.238280083607047e-06,
"loss": 0.383,
"step": 3540
},
{
"epoch": 0.3533393052652533,
"grad_norm": 0.8711917400360107,
"learning_rate": 8.233303473673734e-06,
"loss": 0.3771,
"step": 3550
},
{
"epoch": 0.354334627251916,
"grad_norm": 0.788311243057251,
"learning_rate": 8.228326863740421e-06,
"loss": 0.3299,
"step": 3560
},
{
"epoch": 0.3553299492385787,
"grad_norm": 0.43669214844703674,
"learning_rate": 8.223350253807107e-06,
"loss": 0.3659,
"step": 3570
},
{
"epoch": 0.3563252712252414,
"grad_norm": 0.550014078617096,
"learning_rate": 8.218373643873794e-06,
"loss": 0.3586,
"step": 3580
},
{
"epoch": 0.35732059321190407,
"grad_norm": 0.9948114156723022,
"learning_rate": 8.21339703394048e-06,
"loss": 0.3743,
"step": 3590
},
{
"epoch": 0.35831591519856676,
"grad_norm": 0.6710416078567505,
"learning_rate": 8.208420424007167e-06,
"loss": 0.3724,
"step": 3600
},
{
"epoch": 0.35831591519856676,
"eval_loss": 0.37383729219436646,
"eval_runtime": 147.9999,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.689,
"step": 3600
},
{
"epoch": 0.35931123718522945,
"grad_norm": 0.7629538774490356,
"learning_rate": 8.203443814073854e-06,
"loss": 0.3942,
"step": 3610
},
{
"epoch": 0.3603065591718921,
"grad_norm": 0.7567903399467468,
"learning_rate": 8.19846720414054e-06,
"loss": 0.3895,
"step": 3620
},
{
"epoch": 0.36130188115855477,
"grad_norm": 0.5209780335426331,
"learning_rate": 8.193490594207226e-06,
"loss": 0.3395,
"step": 3630
},
{
"epoch": 0.36229720314521746,
"grad_norm": 0.5655366778373718,
"learning_rate": 8.188513984273913e-06,
"loss": 0.3435,
"step": 3640
},
{
"epoch": 0.36329252513188015,
"grad_norm": 0.8822707533836365,
"learning_rate": 8.1835373743406e-06,
"loss": 0.3442,
"step": 3650
},
{
"epoch": 0.36428784711854284,
"grad_norm": 0.6264866590499878,
"learning_rate": 8.178560764407286e-06,
"loss": 0.3902,
"step": 3660
},
{
"epoch": 0.36528316910520553,
"grad_norm": 0.6163113713264465,
"learning_rate": 8.173584154473973e-06,
"loss": 0.301,
"step": 3670
},
{
"epoch": 0.3662784910918682,
"grad_norm": 0.7627054452896118,
"learning_rate": 8.16860754454066e-06,
"loss": 0.3504,
"step": 3680
},
{
"epoch": 0.3672738130785309,
"grad_norm": 0.7021706104278564,
"learning_rate": 8.163630934607346e-06,
"loss": 0.3761,
"step": 3690
},
{
"epoch": 0.3682691350651936,
"grad_norm": 0.8463016152381897,
"learning_rate": 8.158654324674032e-06,
"loss": 0.4096,
"step": 3700
},
{
"epoch": 0.3682691350651936,
"eval_loss": 0.3721456229686737,
"eval_runtime": 148.0333,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 3700
},
{
"epoch": 0.3692644570518563,
"grad_norm": 0.7081176042556763,
"learning_rate": 8.153677714740719e-06,
"loss": 0.3609,
"step": 3710
},
{
"epoch": 0.370259779038519,
"grad_norm": 0.6312963366508484,
"learning_rate": 8.148701104807406e-06,
"loss": 0.3964,
"step": 3720
},
{
"epoch": 0.37125510102518167,
"grad_norm": 0.5755221247673035,
"learning_rate": 8.143724494874092e-06,
"loss": 0.3701,
"step": 3730
},
{
"epoch": 0.37225042301184436,
"grad_norm": 0.584368884563446,
"learning_rate": 8.13874788494078e-06,
"loss": 0.3748,
"step": 3740
},
{
"epoch": 0.373245744998507,
"grad_norm": 0.588197648525238,
"learning_rate": 8.133771275007467e-06,
"loss": 0.3775,
"step": 3750
},
{
"epoch": 0.3742410669851697,
"grad_norm": 0.6824856996536255,
"learning_rate": 8.128794665074152e-06,
"loss": 0.3842,
"step": 3760
},
{
"epoch": 0.37523638897183237,
"grad_norm": 0.4867573082447052,
"learning_rate": 8.123818055140838e-06,
"loss": 0.3349,
"step": 3770
},
{
"epoch": 0.37623171095849506,
"grad_norm": 1.023980975151062,
"learning_rate": 8.118841445207525e-06,
"loss": 0.2991,
"step": 3780
},
{
"epoch": 0.37722703294515775,
"grad_norm": 0.8464593291282654,
"learning_rate": 8.113864835274212e-06,
"loss": 0.3673,
"step": 3790
},
{
"epoch": 0.37822235493182044,
"grad_norm": 0.7149996757507324,
"learning_rate": 8.108888225340898e-06,
"loss": 0.3913,
"step": 3800
},
{
"epoch": 0.37822235493182044,
"eval_loss": 0.37008264660835266,
"eval_runtime": 148.0619,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 3800
},
{
"epoch": 0.3792176769184831,
"grad_norm": 0.5620415210723877,
"learning_rate": 8.103911615407585e-06,
"loss": 0.3409,
"step": 3810
},
{
"epoch": 0.3802129989051458,
"grad_norm": 0.7163406014442444,
"learning_rate": 8.098935005474273e-06,
"loss": 0.3566,
"step": 3820
},
{
"epoch": 0.3812083208918085,
"grad_norm": 0.6729508638381958,
"learning_rate": 8.093958395540958e-06,
"loss": 0.3606,
"step": 3830
},
{
"epoch": 0.3822036428784712,
"grad_norm": 0.5905406475067139,
"learning_rate": 8.088981785607644e-06,
"loss": 0.3948,
"step": 3840
},
{
"epoch": 0.3831989648651339,
"grad_norm": 0.896960437297821,
"learning_rate": 8.084005175674331e-06,
"loss": 0.3881,
"step": 3850
},
{
"epoch": 0.3841942868517966,
"grad_norm": 0.6188758015632629,
"learning_rate": 8.079028565741019e-06,
"loss": 0.3632,
"step": 3860
},
{
"epoch": 0.38518960883845926,
"grad_norm": 0.7011315822601318,
"learning_rate": 8.074051955807704e-06,
"loss": 0.3768,
"step": 3870
},
{
"epoch": 0.38618493082512195,
"grad_norm": 0.546981930732727,
"learning_rate": 8.069075345874391e-06,
"loss": 0.3556,
"step": 3880
},
{
"epoch": 0.3871802528117846,
"grad_norm": 0.6722966432571411,
"learning_rate": 8.064098735941077e-06,
"loss": 0.4264,
"step": 3890
},
{
"epoch": 0.3881755747984473,
"grad_norm": 0.6407563090324402,
"learning_rate": 8.059122126007764e-06,
"loss": 0.3592,
"step": 3900
},
{
"epoch": 0.3881755747984473,
"eval_loss": 0.3688708245754242,
"eval_runtime": 148.1311,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.689,
"step": 3900
},
{
"epoch": 0.38917089678510997,
"grad_norm": 0.45177608728408813,
"learning_rate": 8.05414551607445e-06,
"loss": 0.3733,
"step": 3910
},
{
"epoch": 0.39016621877177265,
"grad_norm": 1.0299266576766968,
"learning_rate": 8.049168906141137e-06,
"loss": 0.351,
"step": 3920
},
{
"epoch": 0.39116154075843534,
"grad_norm": 0.6861090660095215,
"learning_rate": 8.044192296207823e-06,
"loss": 0.3899,
"step": 3930
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.6434109210968018,
"learning_rate": 8.03921568627451e-06,
"loss": 0.3285,
"step": 3940
},
{
"epoch": 0.3931521847317607,
"grad_norm": 0.6049661040306091,
"learning_rate": 8.034239076341198e-06,
"loss": 0.37,
"step": 3950
},
{
"epoch": 0.3941475067184234,
"grad_norm": 0.6799841523170471,
"learning_rate": 8.029262466407883e-06,
"loss": 0.381,
"step": 3960
},
{
"epoch": 0.3951428287050861,
"grad_norm": 0.7383856177330017,
"learning_rate": 8.02428585647457e-06,
"loss": 0.3707,
"step": 3970
},
{
"epoch": 0.3961381506917488,
"grad_norm": 0.8234820365905762,
"learning_rate": 8.019309246541258e-06,
"loss": 0.379,
"step": 3980
},
{
"epoch": 0.3971334726784115,
"grad_norm": 0.743027925491333,
"learning_rate": 8.014332636607943e-06,
"loss": 0.362,
"step": 3990
},
{
"epoch": 0.39812879466507417,
"grad_norm": 0.48385190963745117,
"learning_rate": 8.009356026674629e-06,
"loss": 0.3726,
"step": 4000
},
{
"epoch": 0.39812879466507417,
"eval_loss": 0.36677852272987366,
"eval_runtime": 148.1274,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.689,
"step": 4000
},
{
"epoch": 0.39912411665173686,
"grad_norm": 0.776292622089386,
"learning_rate": 8.004379416741316e-06,
"loss": 0.3258,
"step": 4010
},
{
"epoch": 0.40011943863839955,
"grad_norm": 0.7187590599060059,
"learning_rate": 7.999402806808004e-06,
"loss": 0.3639,
"step": 4020
},
{
"epoch": 0.4011147606250622,
"grad_norm": 0.6233355402946472,
"learning_rate": 7.99442619687469e-06,
"loss": 0.3418,
"step": 4030
},
{
"epoch": 0.4021100826117249,
"grad_norm": 0.9605082869529724,
"learning_rate": 7.989449586941377e-06,
"loss": 0.3686,
"step": 4040
},
{
"epoch": 0.40310540459838756,
"grad_norm": 0.7882612943649292,
"learning_rate": 7.984472977008064e-06,
"loss": 0.3386,
"step": 4050
},
{
"epoch": 0.40410072658505025,
"grad_norm": 0.8124802708625793,
"learning_rate": 7.97949636707475e-06,
"loss": 0.3412,
"step": 4060
},
{
"epoch": 0.40509604857171294,
"grad_norm": 0.6348981857299805,
"learning_rate": 7.974519757141435e-06,
"loss": 0.3624,
"step": 4070
},
{
"epoch": 0.40609137055837563,
"grad_norm": 0.8518906831741333,
"learning_rate": 7.969543147208122e-06,
"loss": 0.3494,
"step": 4080
},
{
"epoch": 0.4070866925450383,
"grad_norm": 0.979092538356781,
"learning_rate": 7.96456653727481e-06,
"loss": 0.3677,
"step": 4090
},
{
"epoch": 0.408082014531701,
"grad_norm": 0.6732219457626343,
"learning_rate": 7.959589927341495e-06,
"loss": 0.3395,
"step": 4100
},
{
"epoch": 0.408082014531701,
"eval_loss": 0.365203857421875,
"eval_runtime": 148.0813,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4100
},
{
"epoch": 0.4090773365183637,
"grad_norm": 0.9068031907081604,
"learning_rate": 7.954613317408183e-06,
"loss": 0.3715,
"step": 4110
},
{
"epoch": 0.4100726585050264,
"grad_norm": 0.8246614336967468,
"learning_rate": 7.94963670747487e-06,
"loss": 0.3661,
"step": 4120
},
{
"epoch": 0.4110679804916891,
"grad_norm": 0.5856474041938782,
"learning_rate": 7.944660097541556e-06,
"loss": 0.3567,
"step": 4130
},
{
"epoch": 0.41206330247835177,
"grad_norm": 0.4393113851547241,
"learning_rate": 7.939683487608241e-06,
"loss": 0.3469,
"step": 4140
},
{
"epoch": 0.41305862446501446,
"grad_norm": 1.0827676057815552,
"learning_rate": 7.934706877674928e-06,
"loss": 0.3318,
"step": 4150
},
{
"epoch": 0.4140539464516771,
"grad_norm": 0.6830149292945862,
"learning_rate": 7.929730267741616e-06,
"loss": 0.3726,
"step": 4160
},
{
"epoch": 0.4150492684383398,
"grad_norm": 0.563925564289093,
"learning_rate": 7.924753657808301e-06,
"loss": 0.3732,
"step": 4170
},
{
"epoch": 0.41604459042500247,
"grad_norm": 0.5630573034286499,
"learning_rate": 7.919777047874989e-06,
"loss": 0.3626,
"step": 4180
},
{
"epoch": 0.41703991241166516,
"grad_norm": 0.7267017960548401,
"learning_rate": 7.914800437941674e-06,
"loss": 0.3414,
"step": 4190
},
{
"epoch": 0.41803523439832785,
"grad_norm": 0.7420011758804321,
"learning_rate": 7.909823828008362e-06,
"loss": 0.379,
"step": 4200
},
{
"epoch": 0.41803523439832785,
"eval_loss": 0.3634182810783386,
"eval_runtime": 148.0601,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4200
},
{
"epoch": 0.41903055638499054,
"grad_norm": 0.6270275115966797,
"learning_rate": 7.904847218075047e-06,
"loss": 0.347,
"step": 4210
},
{
"epoch": 0.4200258783716532,
"grad_norm": 0.6264152526855469,
"learning_rate": 7.899870608141735e-06,
"loss": 0.3984,
"step": 4220
},
{
"epoch": 0.4210212003583159,
"grad_norm": 0.7452067136764526,
"learning_rate": 7.894893998208422e-06,
"loss": 0.392,
"step": 4230
},
{
"epoch": 0.4220165223449786,
"grad_norm": 0.5158396363258362,
"learning_rate": 7.889917388275107e-06,
"loss": 0.3624,
"step": 4240
},
{
"epoch": 0.4230118443316413,
"grad_norm": 0.6692706942558289,
"learning_rate": 7.884940778341795e-06,
"loss": 0.359,
"step": 4250
},
{
"epoch": 0.424007166318304,
"grad_norm": 1.1387830972671509,
"learning_rate": 7.87996416840848e-06,
"loss": 0.39,
"step": 4260
},
{
"epoch": 0.4250024883049667,
"grad_norm": 0.76036137342453,
"learning_rate": 7.874987558475168e-06,
"loss": 0.299,
"step": 4270
},
{
"epoch": 0.42599781029162936,
"grad_norm": 0.45447903871536255,
"learning_rate": 7.870010948541853e-06,
"loss": 0.3926,
"step": 4280
},
{
"epoch": 0.42699313227829205,
"grad_norm": 0.8221507668495178,
"learning_rate": 7.86503433860854e-06,
"loss": 0.3743,
"step": 4290
},
{
"epoch": 0.4279884542649547,
"grad_norm": 0.7328831553459167,
"learning_rate": 7.860057728675226e-06,
"loss": 0.3699,
"step": 4300
},
{
"epoch": 0.4279884542649547,
"eval_loss": 0.36196640133857727,
"eval_runtime": 148.0658,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4300
},
{
"epoch": 0.4289837762516174,
"grad_norm": 0.8411442637443542,
"learning_rate": 7.855081118741913e-06,
"loss": 0.4047,
"step": 4310
},
{
"epoch": 0.42997909823828007,
"grad_norm": 0.7502423524856567,
"learning_rate": 7.8501045088086e-06,
"loss": 0.3513,
"step": 4320
},
{
"epoch": 0.43097442022494276,
"grad_norm": 0.566929042339325,
"learning_rate": 7.845127898875286e-06,
"loss": 0.3935,
"step": 4330
},
{
"epoch": 0.43196974221160545,
"grad_norm": 0.7588290572166443,
"learning_rate": 7.840151288941972e-06,
"loss": 0.3324,
"step": 4340
},
{
"epoch": 0.43296506419826813,
"grad_norm": 0.7947611808776855,
"learning_rate": 7.835174679008661e-06,
"loss": 0.3506,
"step": 4350
},
{
"epoch": 0.4339603861849308,
"grad_norm": 0.6475954651832581,
"learning_rate": 7.830198069075347e-06,
"loss": 0.3103,
"step": 4360
},
{
"epoch": 0.4349557081715935,
"grad_norm": 0.5702581405639648,
"learning_rate": 7.825221459142032e-06,
"loss": 0.3373,
"step": 4370
},
{
"epoch": 0.4359510301582562,
"grad_norm": 0.7424353957176208,
"learning_rate": 7.82024484920872e-06,
"loss": 0.3593,
"step": 4380
},
{
"epoch": 0.4369463521449189,
"grad_norm": 0.5749756693840027,
"learning_rate": 7.815268239275407e-06,
"loss": 0.3133,
"step": 4390
},
{
"epoch": 0.4379416741315816,
"grad_norm": 0.5407712459564209,
"learning_rate": 7.810291629342092e-06,
"loss": 0.3584,
"step": 4400
},
{
"epoch": 0.4379416741315816,
"eval_loss": 0.360762357711792,
"eval_runtime": 148.1111,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4400
},
{
"epoch": 0.43893699611824427,
"grad_norm": 0.5194666981697083,
"learning_rate": 7.80531501940878e-06,
"loss": 0.2957,
"step": 4410
},
{
"epoch": 0.43993231810490696,
"grad_norm": 0.7961593866348267,
"learning_rate": 7.800338409475467e-06,
"loss": 0.3819,
"step": 4420
},
{
"epoch": 0.44092764009156965,
"grad_norm": 0.6336628198623657,
"learning_rate": 7.795361799542153e-06,
"loss": 0.3123,
"step": 4430
},
{
"epoch": 0.4419229620782323,
"grad_norm": 0.6935514211654663,
"learning_rate": 7.790385189608838e-06,
"loss": 0.3519,
"step": 4440
},
{
"epoch": 0.442918284064895,
"grad_norm": 0.6400023698806763,
"learning_rate": 7.785408579675526e-06,
"loss": 0.3806,
"step": 4450
},
{
"epoch": 0.44391360605155766,
"grad_norm": 0.9406591057777405,
"learning_rate": 7.780431969742213e-06,
"loss": 0.3282,
"step": 4460
},
{
"epoch": 0.44490892803822035,
"grad_norm": 0.6432562470436096,
"learning_rate": 7.775455359808899e-06,
"loss": 0.302,
"step": 4470
},
{
"epoch": 0.44590425002488304,
"grad_norm": 0.5700191259384155,
"learning_rate": 7.770478749875586e-06,
"loss": 0.3608,
"step": 4480
},
{
"epoch": 0.44689957201154573,
"grad_norm": 0.7987110614776611,
"learning_rate": 7.765502139942271e-06,
"loss": 0.3363,
"step": 4490
},
{
"epoch": 0.4478948939982084,
"grad_norm": 0.6581839323043823,
"learning_rate": 7.760525530008959e-06,
"loss": 0.3414,
"step": 4500
},
{
"epoch": 0.4478948939982084,
"eval_loss": 0.35966184735298157,
"eval_runtime": 148.1465,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.689,
"step": 4500
},
{
"epoch": 0.4488902159848711,
"grad_norm": 0.6311335563659668,
"learning_rate": 7.755548920075644e-06,
"loss": 0.3768,
"step": 4510
},
{
"epoch": 0.4498855379715338,
"grad_norm": 0.8850741982460022,
"learning_rate": 7.750572310142332e-06,
"loss": 0.3763,
"step": 4520
},
{
"epoch": 0.4508808599581965,
"grad_norm": 0.5066502094268799,
"learning_rate": 7.745595700209019e-06,
"loss": 0.3412,
"step": 4530
},
{
"epoch": 0.4518761819448592,
"grad_norm": 0.545430600643158,
"learning_rate": 7.740619090275705e-06,
"loss": 0.3737,
"step": 4540
},
{
"epoch": 0.45287150393152187,
"grad_norm": 0.7061020731925964,
"learning_rate": 7.735642480342392e-06,
"loss": 0.3218,
"step": 4550
},
{
"epoch": 0.45386682591818456,
"grad_norm": 0.5185464024543762,
"learning_rate": 7.730665870409078e-06,
"loss": 0.3489,
"step": 4560
},
{
"epoch": 0.4548621479048472,
"grad_norm": 0.9102675318717957,
"learning_rate": 7.725689260475765e-06,
"loss": 0.3515,
"step": 4570
},
{
"epoch": 0.4558574698915099,
"grad_norm": 0.7395256757736206,
"learning_rate": 7.72071265054245e-06,
"loss": 0.2873,
"step": 4580
},
{
"epoch": 0.45685279187817257,
"grad_norm": 0.9186689853668213,
"learning_rate": 7.715736040609138e-06,
"loss": 0.3705,
"step": 4590
},
{
"epoch": 0.45784811386483526,
"grad_norm": 0.6102734804153442,
"learning_rate": 7.710759430675823e-06,
"loss": 0.3389,
"step": 4600
},
{
"epoch": 0.45784811386483526,
"eval_loss": 0.35844776034355164,
"eval_runtime": 148.1097,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4600
},
{
"epoch": 0.45884343585149795,
"grad_norm": 0.418071985244751,
"learning_rate": 7.70578282074251e-06,
"loss": 0.3454,
"step": 4610
},
{
"epoch": 0.45983875783816064,
"grad_norm": 0.504802942276001,
"learning_rate": 7.700806210809198e-06,
"loss": 0.3419,
"step": 4620
},
{
"epoch": 0.46083407982482333,
"grad_norm": 0.7918646335601807,
"learning_rate": 7.695829600875884e-06,
"loss": 0.3992,
"step": 4630
},
{
"epoch": 0.461829401811486,
"grad_norm": 0.6944281458854675,
"learning_rate": 7.690852990942571e-06,
"loss": 0.3945,
"step": 4640
},
{
"epoch": 0.4628247237981487,
"grad_norm": 0.648303210735321,
"learning_rate": 7.685876381009257e-06,
"loss": 0.3401,
"step": 4650
},
{
"epoch": 0.4638200457848114,
"grad_norm": 0.812044084072113,
"learning_rate": 7.680899771075944e-06,
"loss": 0.3548,
"step": 4660
},
{
"epoch": 0.4648153677714741,
"grad_norm": 0.7709999680519104,
"learning_rate": 7.67592316114263e-06,
"loss": 0.3702,
"step": 4670
},
{
"epoch": 0.4658106897581368,
"grad_norm": 0.7904644012451172,
"learning_rate": 7.670946551209317e-06,
"loss": 0.3763,
"step": 4680
},
{
"epoch": 0.46680601174479947,
"grad_norm": 0.7763231992721558,
"learning_rate": 7.665969941276004e-06,
"loss": 0.3495,
"step": 4690
},
{
"epoch": 0.46780133373146215,
"grad_norm": 0.5270109176635742,
"learning_rate": 7.66099333134269e-06,
"loss": 0.3016,
"step": 4700
},
{
"epoch": 0.46780133373146215,
"eval_loss": 0.35714709758758545,
"eval_runtime": 148.1115,
"eval_samples_per_second": 1.371,
"eval_steps_per_second": 0.689,
"step": 4700
},
{
"epoch": 0.4687966557181248,
"grad_norm": 0.6368373036384583,
"learning_rate": 7.656016721409375e-06,
"loss": 0.3323,
"step": 4710
},
{
"epoch": 0.4697919777047875,
"grad_norm": 0.3973361551761627,
"learning_rate": 7.651040111476064e-06,
"loss": 0.3405,
"step": 4720
},
{
"epoch": 0.47078729969145017,
"grad_norm": 0.8075085878372192,
"learning_rate": 7.64606350154275e-06,
"loss": 0.3436,
"step": 4730
},
{
"epoch": 0.47178262167811286,
"grad_norm": 0.892672598361969,
"learning_rate": 7.641086891609436e-06,
"loss": 0.3662,
"step": 4740
},
{
"epoch": 0.47277794366477555,
"grad_norm": 0.6311262845993042,
"learning_rate": 7.636110281676123e-06,
"loss": 0.3559,
"step": 4750
},
{
"epoch": 0.47377326565143824,
"grad_norm": 0.7950363159179688,
"learning_rate": 7.63113367174281e-06,
"loss": 0.2974,
"step": 4760
},
{
"epoch": 0.4747685876381009,
"grad_norm": 0.6539332270622253,
"learning_rate": 7.626157061809496e-06,
"loss": 0.3312,
"step": 4770
},
{
"epoch": 0.4757639096247636,
"grad_norm": 0.7384660840034485,
"learning_rate": 7.621180451876182e-06,
"loss": 0.3825,
"step": 4780
},
{
"epoch": 0.4767592316114263,
"grad_norm": 0.43817830085754395,
"learning_rate": 7.6162038419428695e-06,
"loss": 0.3462,
"step": 4790
},
{
"epoch": 0.477754553598089,
"grad_norm": 0.7346156239509583,
"learning_rate": 7.611227232009556e-06,
"loss": 0.3377,
"step": 4800
},
{
"epoch": 0.477754553598089,
"eval_loss": 0.355719655752182,
"eval_runtime": 148.1914,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.688,
"step": 4800
},
{
"epoch": 0.4787498755847517,
"grad_norm": 0.8043003082275391,
"learning_rate": 7.6062506220762424e-06,
"loss": 0.3625,
"step": 4810
},
{
"epoch": 0.4797451975714144,
"grad_norm": 0.6644107103347778,
"learning_rate": 7.601274012142929e-06,
"loss": 0.3023,
"step": 4820
},
{
"epoch": 0.48074051955807706,
"grad_norm": 0.7794090509414673,
"learning_rate": 7.596297402209616e-06,
"loss": 0.3552,
"step": 4830
},
{
"epoch": 0.4817358415447397,
"grad_norm": 0.7449871301651001,
"learning_rate": 7.591320792276302e-06,
"loss": 0.3659,
"step": 4840
},
{
"epoch": 0.4827311635314024,
"grad_norm": 0.881610631942749,
"learning_rate": 7.586344182342988e-06,
"loss": 0.3184,
"step": 4850
},
{
"epoch": 0.4837264855180651,
"grad_norm": 0.8672296404838562,
"learning_rate": 7.581367572409675e-06,
"loss": 0.3324,
"step": 4860
},
{
"epoch": 0.48472180750472776,
"grad_norm": 0.4788852334022522,
"learning_rate": 7.576390962476362e-06,
"loss": 0.3406,
"step": 4870
},
{
"epoch": 0.48571712949139045,
"grad_norm": 0.6023631691932678,
"learning_rate": 7.5714143525430485e-06,
"loss": 0.3797,
"step": 4880
},
{
"epoch": 0.48671245147805314,
"grad_norm": 0.6595234870910645,
"learning_rate": 7.566437742609735e-06,
"loss": 0.3199,
"step": 4890
},
{
"epoch": 0.48770777346471583,
"grad_norm": 0.6189759969711304,
"learning_rate": 7.561461132676421e-06,
"loss": 0.373,
"step": 4900
},
{
"epoch": 0.48770777346471583,
"eval_loss": 0.35428938269615173,
"eval_runtime": 148.2777,
"eval_samples_per_second": 1.369,
"eval_steps_per_second": 0.688,
"step": 4900
},
{
"epoch": 0.4887030954513785,
"grad_norm": 0.71135413646698,
"learning_rate": 7.556484522743108e-06,
"loss": 0.3232,
"step": 4910
},
{
"epoch": 0.4896984174380412,
"grad_norm": 0.5228835940361023,
"learning_rate": 7.551507912809794e-06,
"loss": 0.3428,
"step": 4920
},
{
"epoch": 0.4906937394247039,
"grad_norm": 0.9015726447105408,
"learning_rate": 7.546531302876481e-06,
"loss": 0.3889,
"step": 4930
},
{
"epoch": 0.4916890614113666,
"grad_norm": 0.8351202011108398,
"learning_rate": 7.541554692943168e-06,
"loss": 0.3367,
"step": 4940
},
{
"epoch": 0.4926843833980293,
"grad_norm": 0.6578547954559326,
"learning_rate": 7.536578083009855e-06,
"loss": 0.3646,
"step": 4950
},
{
"epoch": 0.49367970538469197,
"grad_norm": 1.1061774492263794,
"learning_rate": 7.531601473076541e-06,
"loss": 0.351,
"step": 4960
},
{
"epoch": 0.49467502737135466,
"grad_norm": 0.636061429977417,
"learning_rate": 7.526624863143227e-06,
"loss": 0.3434,
"step": 4970
},
{
"epoch": 0.4956703493580173,
"grad_norm": 0.6666164994239807,
"learning_rate": 7.521648253209915e-06,
"loss": 0.3462,
"step": 4980
},
{
"epoch": 0.49666567134468,
"grad_norm": 0.8288053274154663,
"learning_rate": 7.5166716432766e-06,
"loss": 0.3862,
"step": 4990
},
{
"epoch": 0.49766099333134267,
"grad_norm": 0.5653735399246216,
"learning_rate": 7.511695033343287e-06,
"loss": 0.3559,
"step": 5000
},
{
"epoch": 0.49766099333134267,
"eval_loss": 0.35338979959487915,
"eval_runtime": 148.2313,
"eval_samples_per_second": 1.369,
"eval_steps_per_second": 0.688,
"step": 5000
},
{
"epoch": 0.49865631531800536,
"grad_norm": 1.083835482597351,
"learning_rate": 7.506718423409973e-06,
"loss": 0.3697,
"step": 5010
},
{
"epoch": 0.49965163730466805,
"grad_norm": 0.7271355986595154,
"learning_rate": 7.501741813476661e-06,
"loss": 0.2915,
"step": 5020
},
{
"epoch": 0.5006469592913307,
"grad_norm": 0.6525740027427673,
"learning_rate": 7.496765203543347e-06,
"loss": 0.3571,
"step": 5030
},
{
"epoch": 0.5016422812779935,
"grad_norm": 1.00348961353302,
"learning_rate": 7.4917885936100336e-06,
"loss": 0.3254,
"step": 5040
},
{
"epoch": 0.5026376032646561,
"grad_norm": 0.7707570195198059,
"learning_rate": 7.486811983676721e-06,
"loss": 0.3544,
"step": 5050
},
{
"epoch": 0.5036329252513188,
"grad_norm": 0.7804340720176697,
"learning_rate": 7.4818353737434065e-06,
"loss": 0.3346,
"step": 5060
},
{
"epoch": 0.5046282472379815,
"grad_norm": 1.0899609327316284,
"learning_rate": 7.476858763810093e-06,
"loss": 0.3296,
"step": 5070
},
{
"epoch": 0.5056235692246441,
"grad_norm": 0.6863502264022827,
"learning_rate": 7.471882153876779e-06,
"loss": 0.3569,
"step": 5080
},
{
"epoch": 0.5066188912113069,
"grad_norm": 1.15005362033844,
"learning_rate": 7.466905543943467e-06,
"loss": 0.2829,
"step": 5090
},
{
"epoch": 0.5076142131979695,
"grad_norm": 0.699102520942688,
"learning_rate": 7.461928934010153e-06,
"loss": 0.3727,
"step": 5100
},
{
"epoch": 0.5076142131979695,
"eval_loss": 0.35266318917274475,
"eval_runtime": 148.2339,
"eval_samples_per_second": 1.369,
"eval_steps_per_second": 0.688,
"step": 5100
},
{
"epoch": 0.5086095351846323,
"grad_norm": 0.9547719359397888,
"learning_rate": 7.45695232407684e-06,
"loss": 0.4042,
"step": 5110
},
{
"epoch": 0.5096048571712949,
"grad_norm": 0.9959189891815186,
"learning_rate": 7.451975714143525e-06,
"loss": 0.3115,
"step": 5120
},
{
"epoch": 0.5106001791579576,
"grad_norm": 0.6266285181045532,
"learning_rate": 7.446999104210213e-06,
"loss": 0.3485,
"step": 5130
},
{
"epoch": 0.5115955011446203,
"grad_norm": 0.711664617061615,
"learning_rate": 7.442022494276899e-06,
"loss": 0.3699,
"step": 5140
},
{
"epoch": 0.512590823131283,
"grad_norm": 1.0690807104110718,
"learning_rate": 7.4370458843435855e-06,
"loss": 0.3248,
"step": 5150
},
{
"epoch": 0.5135861451179456,
"grad_norm": 1.2619460821151733,
"learning_rate": 7.432069274410272e-06,
"loss": 0.3284,
"step": 5160
},
{
"epoch": 0.5145814671046084,
"grad_norm": 0.9510999917984009,
"learning_rate": 7.427092664476959e-06,
"loss": 0.3491,
"step": 5170
},
{
"epoch": 0.515576789091271,
"grad_norm": 1.012990117073059,
"learning_rate": 7.422116054543646e-06,
"loss": 0.3659,
"step": 5180
},
{
"epoch": 0.5165721110779337,
"grad_norm": 0.5469540953636169,
"learning_rate": 7.417139444610332e-06,
"loss": 0.2709,
"step": 5190
},
{
"epoch": 0.5175674330645964,
"grad_norm": 0.6974226236343384,
"learning_rate": 7.4121628346770195e-06,
"loss": 0.3668,
"step": 5200
},
{
"epoch": 0.5175674330645964,
"eval_loss": 0.35165390372276306,
"eval_runtime": 148.2087,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.688,
"step": 5200
},
{
"epoch": 0.518562755051259,
"grad_norm": 0.8949996829032898,
"learning_rate": 7.407186224743705e-06,
"loss": 0.3305,
"step": 5210
},
{
"epoch": 0.5195580770379218,
"grad_norm": 0.6786302328109741,
"learning_rate": 7.4022096148103915e-06,
"loss": 0.3312,
"step": 5220
},
{
"epoch": 0.5205533990245844,
"grad_norm": 0.6699957251548767,
"learning_rate": 7.397233004877078e-06,
"loss": 0.3429,
"step": 5230
},
{
"epoch": 0.5215487210112472,
"grad_norm": 0.5877237915992737,
"learning_rate": 7.392256394943765e-06,
"loss": 0.3214,
"step": 5240
},
{
"epoch": 0.5225440429979098,
"grad_norm": 0.7005926966667175,
"learning_rate": 7.387279785010452e-06,
"loss": 0.3816,
"step": 5250
},
{
"epoch": 0.5235393649845725,
"grad_norm": 0.7223731279373169,
"learning_rate": 7.382303175077138e-06,
"loss": 0.3773,
"step": 5260
},
{
"epoch": 0.5245346869712352,
"grad_norm": 0.9617743492126465,
"learning_rate": 7.377326565143824e-06,
"loss": 0.3441,
"step": 5270
},
{
"epoch": 0.5255300089578979,
"grad_norm": 0.6759951114654541,
"learning_rate": 7.372349955210511e-06,
"loss": 0.3464,
"step": 5280
},
{
"epoch": 0.5265253309445606,
"grad_norm": 0.600290834903717,
"learning_rate": 7.367373345277198e-06,
"loss": 0.3202,
"step": 5290
},
{
"epoch": 0.5275206529312233,
"grad_norm": 0.6212776899337769,
"learning_rate": 7.362396735343884e-06,
"loss": 0.3995,
"step": 5300
},
{
"epoch": 0.5275206529312233,
"eval_loss": 0.35058361291885376,
"eval_runtime": 148.2235,
"eval_samples_per_second": 1.37,
"eval_steps_per_second": 0.688,
"step": 5300
}
],
"logging_steps": 10,
"max_steps": 20094,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.109336661739546e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}