|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5275206529312233, |
|
"eval_steps": 100, |
|
"global_step": 5300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009953219866626853, |
|
"grad_norm": 1.912980556488037, |
|
"learning_rate": 9.995023390066686e-06, |
|
"loss": 1.8703, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0019906439733253707, |
|
"grad_norm": 1.866821050643921, |
|
"learning_rate": 9.990046780133374e-06, |
|
"loss": 1.8723, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002985965959988056, |
|
"grad_norm": 2.058809280395508, |
|
"learning_rate": 9.985070170200061e-06, |
|
"loss": 1.8097, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.003981287946650741, |
|
"grad_norm": 1.459013819694519, |
|
"learning_rate": 9.980093560266747e-06, |
|
"loss": 1.7456, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004976609933313427, |
|
"grad_norm": 0.9095586538314819, |
|
"learning_rate": 9.975116950333434e-06, |
|
"loss": 1.7195, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005971931919976112, |
|
"grad_norm": 1.1065226793289185, |
|
"learning_rate": 9.970140340400121e-06, |
|
"loss": 1.6502, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0069672539066387975, |
|
"grad_norm": 0.8301252126693726, |
|
"learning_rate": 9.965163730466807e-06, |
|
"loss": 1.5699, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.007962575893301483, |
|
"grad_norm": 1.0762828588485718, |
|
"learning_rate": 9.960187120533493e-06, |
|
"loss": 1.5072, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.008957897879964169, |
|
"grad_norm": 1.0814900398254395, |
|
"learning_rate": 9.95521051060018e-06, |
|
"loss": 1.4369, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.009953219866626855, |
|
"grad_norm": 1.3561326265335083, |
|
"learning_rate": 9.950233900666867e-06, |
|
"loss": 1.3467, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009953219866626855, |
|
"eval_loss": 1.2846794128417969, |
|
"eval_runtime": 147.6242, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.010948541853289539, |
|
"grad_norm": 1.438547968864441, |
|
"learning_rate": 9.945257290733553e-06, |
|
"loss": 1.2222, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.011943863839952225, |
|
"grad_norm": 1.402588963508606, |
|
"learning_rate": 9.94028068080024e-06, |
|
"loss": 1.1001, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.012939185826614909, |
|
"grad_norm": 1.4357985258102417, |
|
"learning_rate": 9.935304070866926e-06, |
|
"loss": 0.9657, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.013934507813277595, |
|
"grad_norm": 2.137953042984009, |
|
"learning_rate": 9.930327460933613e-06, |
|
"loss": 0.8211, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.014929829799940281, |
|
"grad_norm": 1.374299168586731, |
|
"learning_rate": 9.925350851000299e-06, |
|
"loss": 0.7142, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015925151786602965, |
|
"grad_norm": 1.1510456800460815, |
|
"learning_rate": 9.920374241066986e-06, |
|
"loss": 0.656, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01692047377326565, |
|
"grad_norm": 1.0226788520812988, |
|
"learning_rate": 9.915397631133673e-06, |
|
"loss": 0.6212, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.017915795759928337, |
|
"grad_norm": 0.9365411400794983, |
|
"learning_rate": 9.910421021200359e-06, |
|
"loss": 0.6069, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.018911117746591023, |
|
"grad_norm": 0.6880003213882446, |
|
"learning_rate": 9.905444411267046e-06, |
|
"loss": 0.6128, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01990643973325371, |
|
"grad_norm": 1.1190361976623535, |
|
"learning_rate": 9.900467801333732e-06, |
|
"loss": 0.5426, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01990643973325371, |
|
"eval_loss": 0.5788590908050537, |
|
"eval_runtime": 147.511, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02090176171991639, |
|
"grad_norm": 1.184279441833496, |
|
"learning_rate": 9.895491191400419e-06, |
|
"loss": 0.5887, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.021897083706579078, |
|
"grad_norm": 0.7627615928649902, |
|
"learning_rate": 9.890514581467106e-06, |
|
"loss": 0.5433, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.022892405693241764, |
|
"grad_norm": 0.7858164310455322, |
|
"learning_rate": 9.885537971533792e-06, |
|
"loss": 0.5843, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.02388772767990445, |
|
"grad_norm": 0.695697009563446, |
|
"learning_rate": 9.880561361600478e-06, |
|
"loss": 0.5365, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.024883049666567136, |
|
"grad_norm": 0.8994197845458984, |
|
"learning_rate": 9.875584751667165e-06, |
|
"loss": 0.5662, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.025878371653229818, |
|
"grad_norm": 0.8016309142112732, |
|
"learning_rate": 9.870608141733852e-06, |
|
"loss": 0.5592, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.026873693639892504, |
|
"grad_norm": 0.8534384369850159, |
|
"learning_rate": 9.865631531800538e-06, |
|
"loss": 0.5248, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.02786901562655519, |
|
"grad_norm": 0.9857029914855957, |
|
"learning_rate": 9.860654921867225e-06, |
|
"loss": 0.5294, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.028864337613217876, |
|
"grad_norm": 0.7766090631484985, |
|
"learning_rate": 9.855678311933912e-06, |
|
"loss": 0.5198, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.029859659599880562, |
|
"grad_norm": 0.6832401752471924, |
|
"learning_rate": 9.850701702000598e-06, |
|
"loss": 0.5844, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.029859659599880562, |
|
"eval_loss": 0.536589503288269, |
|
"eval_runtime": 147.4968, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.030854981586543248, |
|
"grad_norm": 0.7720848917961121, |
|
"learning_rate": 9.845725092067284e-06, |
|
"loss": 0.5365, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03185030357320593, |
|
"grad_norm": 0.7022100687026978, |
|
"learning_rate": 9.840748482133971e-06, |
|
"loss": 0.4841, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03284562555986862, |
|
"grad_norm": 1.0030310153961182, |
|
"learning_rate": 9.835771872200658e-06, |
|
"loss": 0.4635, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0338409475465313, |
|
"grad_norm": 0.8628882765769958, |
|
"learning_rate": 9.830795262267344e-06, |
|
"loss": 0.4932, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.034836269533193985, |
|
"grad_norm": 0.7178316712379456, |
|
"learning_rate": 9.825818652334031e-06, |
|
"loss": 0.6057, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.035831591519856675, |
|
"grad_norm": 0.9564626216888428, |
|
"learning_rate": 9.820842042400718e-06, |
|
"loss": 0.5371, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03682691350651936, |
|
"grad_norm": 0.7041760683059692, |
|
"learning_rate": 9.815865432467404e-06, |
|
"loss": 0.513, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.037822235493182046, |
|
"grad_norm": 1.0203750133514404, |
|
"learning_rate": 9.81088882253409e-06, |
|
"loss": 0.5118, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03881755747984473, |
|
"grad_norm": 0.8765382170677185, |
|
"learning_rate": 9.805912212600777e-06, |
|
"loss": 0.4529, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03981287946650742, |
|
"grad_norm": 0.9951983690261841, |
|
"learning_rate": 9.800935602667464e-06, |
|
"loss": 0.5336, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03981287946650742, |
|
"eval_loss": 0.5151349306106567, |
|
"eval_runtime": 147.6615, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0408082014531701, |
|
"grad_norm": 0.7691435813903809, |
|
"learning_rate": 9.79595899273415e-06, |
|
"loss": 0.506, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.04180352343983278, |
|
"grad_norm": 1.1955533027648926, |
|
"learning_rate": 9.790982382800837e-06, |
|
"loss": 0.4692, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04279884542649547, |
|
"grad_norm": 1.128085732460022, |
|
"learning_rate": 9.786005772867525e-06, |
|
"loss": 0.4608, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.043794167413158155, |
|
"grad_norm": 0.5518949627876282, |
|
"learning_rate": 9.78102916293421e-06, |
|
"loss": 0.5006, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.044789489399820845, |
|
"grad_norm": 0.7164484858512878, |
|
"learning_rate": 9.776052553000896e-06, |
|
"loss": 0.4996, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04578481138648353, |
|
"grad_norm": 0.5959630012512207, |
|
"learning_rate": 9.771075943067583e-06, |
|
"loss": 0.4843, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04678013337314621, |
|
"grad_norm": 0.743648111820221, |
|
"learning_rate": 9.76609933313427e-06, |
|
"loss": 0.4363, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0477754553598089, |
|
"grad_norm": 0.8757079243659973, |
|
"learning_rate": 9.761122723200956e-06, |
|
"loss": 0.4665, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04877077734647158, |
|
"grad_norm": 1.0122153759002686, |
|
"learning_rate": 9.756146113267643e-06, |
|
"loss": 0.492, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04976609933313427, |
|
"grad_norm": 0.6179729700088501, |
|
"learning_rate": 9.751169503334329e-06, |
|
"loss": 0.5022, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04976609933313427, |
|
"eval_loss": 0.4993921220302582, |
|
"eval_runtime": 147.7401, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.050761421319796954, |
|
"grad_norm": 0.952812671661377, |
|
"learning_rate": 9.746192893401016e-06, |
|
"loss": 0.4901, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.051756743306459636, |
|
"grad_norm": 0.6715916991233826, |
|
"learning_rate": 9.741216283467702e-06, |
|
"loss": 0.5055, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.052752065293122326, |
|
"grad_norm": 0.674640953540802, |
|
"learning_rate": 9.736239673534389e-06, |
|
"loss": 0.4874, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05374738727978501, |
|
"grad_norm": 0.7867962718009949, |
|
"learning_rate": 9.731263063601075e-06, |
|
"loss": 0.4956, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.0547427092664477, |
|
"grad_norm": 0.9035332202911377, |
|
"learning_rate": 9.726286453667762e-06, |
|
"loss": 0.499, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.05573803125311038, |
|
"grad_norm": 0.7009295225143433, |
|
"learning_rate": 9.72130984373445e-06, |
|
"loss": 0.5034, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05673335323977307, |
|
"grad_norm": 0.7018862366676331, |
|
"learning_rate": 9.716333233801135e-06, |
|
"loss": 0.5137, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05772867522643575, |
|
"grad_norm": 0.7812825441360474, |
|
"learning_rate": 9.711356623867822e-06, |
|
"loss": 0.4724, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.058723997213098435, |
|
"grad_norm": 0.6245225071907043, |
|
"learning_rate": 9.70638001393451e-06, |
|
"loss": 0.4446, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.059719319199761124, |
|
"grad_norm": 0.9083976149559021, |
|
"learning_rate": 9.701403404001195e-06, |
|
"loss": 0.4884, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.059719319199761124, |
|
"eval_loss": 0.4891846477985382, |
|
"eval_runtime": 147.5284, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06071464118642381, |
|
"grad_norm": 0.6195352673530579, |
|
"learning_rate": 9.69642679406788e-06, |
|
"loss": 0.5121, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.061709963173086496, |
|
"grad_norm": 0.8068727254867554, |
|
"learning_rate": 9.691450184134568e-06, |
|
"loss": 0.4689, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.06270528515974919, |
|
"grad_norm": 1.0427749156951904, |
|
"learning_rate": 9.686473574201255e-06, |
|
"loss": 0.4968, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.06370060714641186, |
|
"grad_norm": 0.698349118232727, |
|
"learning_rate": 9.681496964267941e-06, |
|
"loss": 0.4691, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.06469592913307455, |
|
"grad_norm": 0.9104384183883667, |
|
"learning_rate": 9.676520354334628e-06, |
|
"loss": 0.4775, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.06569125111973724, |
|
"grad_norm": 0.8729726076126099, |
|
"learning_rate": 9.671543744401316e-06, |
|
"loss": 0.5201, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06668657310639992, |
|
"grad_norm": 0.9858236908912659, |
|
"learning_rate": 9.666567134468001e-06, |
|
"loss": 0.4268, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.0676818950930626, |
|
"grad_norm": 2.322754383087158, |
|
"learning_rate": 9.661590524534687e-06, |
|
"loss": 0.4744, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.0686772170797253, |
|
"grad_norm": 0.9327623248100281, |
|
"learning_rate": 9.656613914601374e-06, |
|
"loss": 0.4355, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.06967253906638797, |
|
"grad_norm": 0.6949413418769836, |
|
"learning_rate": 9.651637304668062e-06, |
|
"loss": 0.465, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06967253906638797, |
|
"eval_loss": 0.4817120432853699, |
|
"eval_runtime": 147.5643, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07066786105305066, |
|
"grad_norm": 0.5208165049552917, |
|
"learning_rate": 9.646660694734747e-06, |
|
"loss": 0.4973, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.07166318303971335, |
|
"grad_norm": 0.8434884548187256, |
|
"learning_rate": 9.641684084801434e-06, |
|
"loss": 0.4721, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.07265850502637604, |
|
"grad_norm": 0.7161769866943359, |
|
"learning_rate": 9.636707474868122e-06, |
|
"loss": 0.498, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.07365382701303871, |
|
"grad_norm": 0.7036088705062866, |
|
"learning_rate": 9.631730864934807e-06, |
|
"loss": 0.4672, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.0746491489997014, |
|
"grad_norm": 0.9175013899803162, |
|
"learning_rate": 9.626754255001493e-06, |
|
"loss": 0.4781, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.07564447098636409, |
|
"grad_norm": 0.678519606590271, |
|
"learning_rate": 9.62177764506818e-06, |
|
"loss": 0.4048, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07663979297302677, |
|
"grad_norm": 0.6295528411865234, |
|
"learning_rate": 9.616801035134868e-06, |
|
"loss": 0.449, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.07763511495968946, |
|
"grad_norm": 0.5424385666847229, |
|
"learning_rate": 9.611824425201553e-06, |
|
"loss": 0.4394, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07863043694635215, |
|
"grad_norm": 0.508836030960083, |
|
"learning_rate": 9.60684781526824e-06, |
|
"loss": 0.4317, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.07962575893301484, |
|
"grad_norm": 0.6004147529602051, |
|
"learning_rate": 9.601871205334926e-06, |
|
"loss": 0.4308, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07962575893301484, |
|
"eval_loss": 0.47557342052459717, |
|
"eval_runtime": 147.5812, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08062108091967751, |
|
"grad_norm": 0.5553786754608154, |
|
"learning_rate": 9.596894595401613e-06, |
|
"loss": 0.4376, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0816164029063402, |
|
"grad_norm": 0.7254445552825928, |
|
"learning_rate": 9.591917985468299e-06, |
|
"loss": 0.4884, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.08261172489300289, |
|
"grad_norm": 0.7175013422966003, |
|
"learning_rate": 9.586941375534986e-06, |
|
"loss": 0.4167, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.08360704687966557, |
|
"grad_norm": 0.6464620232582092, |
|
"learning_rate": 9.581964765601674e-06, |
|
"loss": 0.4622, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.08460236886632826, |
|
"grad_norm": 0.6999176144599915, |
|
"learning_rate": 9.57698815566836e-06, |
|
"loss": 0.4708, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.08559769085299095, |
|
"grad_norm": 0.7939727306365967, |
|
"learning_rate": 9.572011545735047e-06, |
|
"loss": 0.4633, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.08659301283965362, |
|
"grad_norm": 0.473017156124115, |
|
"learning_rate": 9.567034935801732e-06, |
|
"loss": 0.4585, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.08758833482631631, |
|
"grad_norm": 0.7265183329582214, |
|
"learning_rate": 9.56205832586842e-06, |
|
"loss": 0.4485, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.088583656812979, |
|
"grad_norm": 0.539735734462738, |
|
"learning_rate": 9.557081715935105e-06, |
|
"loss": 0.475, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.08957897879964169, |
|
"grad_norm": 0.7587076425552368, |
|
"learning_rate": 9.552105106001792e-06, |
|
"loss": 0.4347, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08957897879964169, |
|
"eval_loss": 0.4690374732017517, |
|
"eval_runtime": 147.5672, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.09057430078630437, |
|
"grad_norm": 0.7549741864204407, |
|
"learning_rate": 9.547128496068478e-06, |
|
"loss": 0.4434, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.09156962277296705, |
|
"grad_norm": 0.686689555644989, |
|
"learning_rate": 9.542151886135165e-06, |
|
"loss": 0.4052, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.09256494475962974, |
|
"grad_norm": 1.02870512008667, |
|
"learning_rate": 9.537175276201853e-06, |
|
"loss": 0.4806, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.09356026674629242, |
|
"grad_norm": 0.7680675983428955, |
|
"learning_rate": 9.532198666268538e-06, |
|
"loss": 0.4609, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.09455558873295511, |
|
"grad_norm": 0.5478435754776001, |
|
"learning_rate": 9.527222056335224e-06, |
|
"loss": 0.4171, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0955509107196178, |
|
"grad_norm": 0.5974985361099243, |
|
"learning_rate": 9.522245446401913e-06, |
|
"loss": 0.4686, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.09654623270628049, |
|
"grad_norm": 0.997151792049408, |
|
"learning_rate": 9.517268836468598e-06, |
|
"loss": 0.4676, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.09754155469294316, |
|
"grad_norm": 0.6366075277328491, |
|
"learning_rate": 9.512292226535284e-06, |
|
"loss": 0.4467, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.09853687667960585, |
|
"grad_norm": 0.5682553052902222, |
|
"learning_rate": 9.507315616601971e-06, |
|
"loss": 0.4772, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.09953219866626854, |
|
"grad_norm": 0.5869882106781006, |
|
"learning_rate": 9.502339006668659e-06, |
|
"loss": 0.3976, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09953219866626854, |
|
"eval_loss": 0.46156319975852966, |
|
"eval_runtime": 147.6656, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10052752065293122, |
|
"grad_norm": 0.5758237838745117, |
|
"learning_rate": 9.497362396735344e-06, |
|
"loss": 0.4528, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 0.700281023979187, |
|
"learning_rate": 9.492385786802032e-06, |
|
"loss": 0.4545, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.1025181646262566, |
|
"grad_norm": 1.1320914030075073, |
|
"learning_rate": 9.487409176868719e-06, |
|
"loss": 0.4331, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.10351348661291927, |
|
"grad_norm": 0.6469867825508118, |
|
"learning_rate": 9.482432566935405e-06, |
|
"loss": 0.3759, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.10450880859958196, |
|
"grad_norm": 0.9471383094787598, |
|
"learning_rate": 9.47745595700209e-06, |
|
"loss": 0.4041, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.10550413058624465, |
|
"grad_norm": 0.5729160904884338, |
|
"learning_rate": 9.472479347068777e-06, |
|
"loss": 0.4871, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.10649945257290734, |
|
"grad_norm": 0.642436683177948, |
|
"learning_rate": 9.467502737135465e-06, |
|
"loss": 0.3893, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.10749477455957002, |
|
"grad_norm": 0.95659339427948, |
|
"learning_rate": 9.46252612720215e-06, |
|
"loss": 0.4486, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.1084900965462327, |
|
"grad_norm": 0.6642667055130005, |
|
"learning_rate": 9.457549517268838e-06, |
|
"loss": 0.5168, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.1094854185328954, |
|
"grad_norm": 0.5805796980857849, |
|
"learning_rate": 9.452572907335525e-06, |
|
"loss": 0.4019, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1094854185328954, |
|
"eval_loss": 0.4559178054332733, |
|
"eval_runtime": 147.5891, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.11048074051955807, |
|
"grad_norm": 0.7006909251213074, |
|
"learning_rate": 9.44759629740221e-06, |
|
"loss": 0.457, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.11147606250622076, |
|
"grad_norm": 1.1821540594100952, |
|
"learning_rate": 9.442619687468896e-06, |
|
"loss": 0.3484, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.11247138449288345, |
|
"grad_norm": 0.7232743501663208, |
|
"learning_rate": 9.437643077535584e-06, |
|
"loss": 0.417, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.11346670647954614, |
|
"grad_norm": 0.6104183197021484, |
|
"learning_rate": 9.43266646760227e-06, |
|
"loss": 0.4821, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.11446202846620881, |
|
"grad_norm": 0.5961386561393738, |
|
"learning_rate": 9.427689857668956e-06, |
|
"loss": 0.4834, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.1154573504528715, |
|
"grad_norm": 0.5530894994735718, |
|
"learning_rate": 9.422713247735644e-06, |
|
"loss": 0.443, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.1164526724395342, |
|
"grad_norm": 0.5148622393608093, |
|
"learning_rate": 9.41773663780233e-06, |
|
"loss": 0.4029, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.11744799442619687, |
|
"grad_norm": 0.6148583292961121, |
|
"learning_rate": 9.412760027869017e-06, |
|
"loss": 0.4308, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.11844331641285956, |
|
"grad_norm": 0.7840449213981628, |
|
"learning_rate": 9.407783417935702e-06, |
|
"loss": 0.499, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.11943863839952225, |
|
"grad_norm": 0.6757422089576721, |
|
"learning_rate": 9.40280680800239e-06, |
|
"loss": 0.4263, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11943863839952225, |
|
"eval_loss": 0.4505193829536438, |
|
"eval_runtime": 147.6664, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.12043396038618492, |
|
"grad_norm": 0.630874752998352, |
|
"learning_rate": 9.397830198069075e-06, |
|
"loss": 0.492, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.12142928237284761, |
|
"grad_norm": 0.7458256483078003, |
|
"learning_rate": 9.392853588135763e-06, |
|
"loss": 0.4612, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.1224246043595103, |
|
"grad_norm": 0.6903111934661865, |
|
"learning_rate": 9.38787697820245e-06, |
|
"loss": 0.4882, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.12341992634617299, |
|
"grad_norm": 1.0817712545394897, |
|
"learning_rate": 9.382900368269135e-06, |
|
"loss": 0.4658, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.12441524833283567, |
|
"grad_norm": 0.8182739615440369, |
|
"learning_rate": 9.377923758335823e-06, |
|
"loss": 0.4281, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.12541057031949837, |
|
"grad_norm": 0.5155394077301025, |
|
"learning_rate": 9.372947148402508e-06, |
|
"loss": 0.4312, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.12640589230616103, |
|
"grad_norm": 0.6190319657325745, |
|
"learning_rate": 9.367970538469196e-06, |
|
"loss": 0.4537, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.12740121429282372, |
|
"grad_norm": 0.7704219222068787, |
|
"learning_rate": 9.362993928535881e-06, |
|
"loss": 0.4873, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1283965362794864, |
|
"grad_norm": 0.6395025849342346, |
|
"learning_rate": 9.358017318602569e-06, |
|
"loss": 0.4374, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.1293918582661491, |
|
"grad_norm": 0.9248729944229126, |
|
"learning_rate": 9.353040708669256e-06, |
|
"loss": 0.4183, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1293918582661491, |
|
"eval_loss": 0.44450852274894714, |
|
"eval_runtime": 147.6747, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1303871802528118, |
|
"grad_norm": 0.6703208088874817, |
|
"learning_rate": 9.348064098735942e-06, |
|
"loss": 0.4017, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.13138250223947448, |
|
"grad_norm": 0.7091432213783264, |
|
"learning_rate": 9.343087488802627e-06, |
|
"loss": 0.4858, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.13237782422613714, |
|
"grad_norm": 0.6519076824188232, |
|
"learning_rate": 9.338110878869316e-06, |
|
"loss": 0.402, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.13337314621279983, |
|
"grad_norm": 0.7192474603652954, |
|
"learning_rate": 9.333134268936002e-06, |
|
"loss": 0.4275, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.13436846819946252, |
|
"grad_norm": 0.626981794834137, |
|
"learning_rate": 9.328157659002687e-06, |
|
"loss": 0.4276, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.1353637901861252, |
|
"grad_norm": 0.8239569664001465, |
|
"learning_rate": 9.323181049069375e-06, |
|
"loss": 0.4384, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1363591121727879, |
|
"grad_norm": 0.727737307548523, |
|
"learning_rate": 9.318204439136062e-06, |
|
"loss": 0.3892, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.1373544341594506, |
|
"grad_norm": 0.6430094242095947, |
|
"learning_rate": 9.313227829202748e-06, |
|
"loss": 0.3579, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.13834975614611328, |
|
"grad_norm": 0.7504476308822632, |
|
"learning_rate": 9.308251219269435e-06, |
|
"loss": 0.4585, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.13934507813277594, |
|
"grad_norm": 1.0239664316177368, |
|
"learning_rate": 9.303274609336122e-06, |
|
"loss": 0.4696, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.13934507813277594, |
|
"eval_loss": 0.43923673033714294, |
|
"eval_runtime": 147.7239, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.14034040011943863, |
|
"grad_norm": 0.6847706437110901, |
|
"learning_rate": 9.298297999402808e-06, |
|
"loss": 0.4823, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.14133572210610132, |
|
"grad_norm": 0.5733935832977295, |
|
"learning_rate": 9.293321389469493e-06, |
|
"loss": 0.4088, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.142331044092764, |
|
"grad_norm": 0.8858775496482849, |
|
"learning_rate": 9.28834477953618e-06, |
|
"loss": 0.3863, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1433263660794267, |
|
"grad_norm": 0.6404774785041809, |
|
"learning_rate": 9.283368169602868e-06, |
|
"loss": 0.3951, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.1443216880660894, |
|
"grad_norm": 0.6125516891479492, |
|
"learning_rate": 9.278391559669554e-06, |
|
"loss": 0.4408, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.14531701005275208, |
|
"grad_norm": 0.5629742741584778, |
|
"learning_rate": 9.273414949736241e-06, |
|
"loss": 0.4319, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.14631233203941474, |
|
"grad_norm": 0.6768545508384705, |
|
"learning_rate": 9.268438339802927e-06, |
|
"loss": 0.4002, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.14730765402607743, |
|
"grad_norm": 0.6743785738945007, |
|
"learning_rate": 9.263461729869614e-06, |
|
"loss": 0.4779, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.14830297601274012, |
|
"grad_norm": 0.5943326354026794, |
|
"learning_rate": 9.2584851199363e-06, |
|
"loss": 0.4406, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.1492982979994028, |
|
"grad_norm": 0.8586482405662537, |
|
"learning_rate": 9.253508510002987e-06, |
|
"loss": 0.4326, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1492982979994028, |
|
"eval_loss": 0.43489304184913635, |
|
"eval_runtime": 147.6747, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1502936199860655, |
|
"grad_norm": 0.862763524055481, |
|
"learning_rate": 9.248531900069674e-06, |
|
"loss": 0.4917, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.15128894197272819, |
|
"grad_norm": 0.6556192636489868, |
|
"learning_rate": 9.24355529013636e-06, |
|
"loss": 0.4333, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.15228426395939088, |
|
"grad_norm": 0.5479542016983032, |
|
"learning_rate": 9.238578680203047e-06, |
|
"loss": 0.4176, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.15327958594605354, |
|
"grad_norm": 0.8119767308235168, |
|
"learning_rate": 9.233602070269733e-06, |
|
"loss": 0.4171, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.15427490793271623, |
|
"grad_norm": 0.9051875472068787, |
|
"learning_rate": 9.22862546033642e-06, |
|
"loss": 0.4529, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.15527022991937892, |
|
"grad_norm": 0.5972510576248169, |
|
"learning_rate": 9.223648850403106e-06, |
|
"loss": 0.4752, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.1562655519060416, |
|
"grad_norm": 0.6712588667869568, |
|
"learning_rate": 9.218672240469793e-06, |
|
"loss": 0.4179, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.1572608738927043, |
|
"grad_norm": 0.637656569480896, |
|
"learning_rate": 9.213695630536478e-06, |
|
"loss": 0.4624, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.15825619587936698, |
|
"grad_norm": 0.7319675087928772, |
|
"learning_rate": 9.208719020603166e-06, |
|
"loss": 0.4149, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.15925151786602967, |
|
"grad_norm": 0.6740835905075073, |
|
"learning_rate": 9.203742410669853e-06, |
|
"loss": 0.4348, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.15925151786602967, |
|
"eval_loss": 0.4290333390235901, |
|
"eval_runtime": 147.7478, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.16024683985269234, |
|
"grad_norm": 0.7110456824302673, |
|
"learning_rate": 9.198765800736539e-06, |
|
"loss": 0.3808, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.16124216183935502, |
|
"grad_norm": 0.6934688091278076, |
|
"learning_rate": 9.193789190803224e-06, |
|
"loss": 0.4279, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.16223748382601771, |
|
"grad_norm": 0.6783742308616638, |
|
"learning_rate": 9.188812580869912e-06, |
|
"loss": 0.413, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.1632328058126804, |
|
"grad_norm": 0.5934478044509888, |
|
"learning_rate": 9.183835970936599e-06, |
|
"loss": 0.476, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.1642281277993431, |
|
"grad_norm": 0.9043450951576233, |
|
"learning_rate": 9.178859361003285e-06, |
|
"loss": 0.392, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.16522344978600578, |
|
"grad_norm": 0.4757988154888153, |
|
"learning_rate": 9.173882751069972e-06, |
|
"loss": 0.3812, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.16621877177266844, |
|
"grad_norm": 0.7402971982955933, |
|
"learning_rate": 9.16890614113666e-06, |
|
"loss": 0.4293, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.16721409375933113, |
|
"grad_norm": 0.6279808282852173, |
|
"learning_rate": 9.163929531203345e-06, |
|
"loss": 0.4453, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.16820941574599382, |
|
"grad_norm": 0.6272904276847839, |
|
"learning_rate": 9.15895292127003e-06, |
|
"loss": 0.4215, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"grad_norm": 0.806103527545929, |
|
"learning_rate": 9.15397631133672e-06, |
|
"loss": 0.4236, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"eval_loss": 0.424538791179657, |
|
"eval_runtime": 147.6192, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1702000597193192, |
|
"grad_norm": 0.7595136165618896, |
|
"learning_rate": 9.148999701403405e-06, |
|
"loss": 0.4473, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.1711953817059819, |
|
"grad_norm": 0.5029250979423523, |
|
"learning_rate": 9.14402309147009e-06, |
|
"loss": 0.4248, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.17219070369264458, |
|
"grad_norm": 0.7487345933914185, |
|
"learning_rate": 9.139046481536778e-06, |
|
"loss": 0.3795, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.17318602567930724, |
|
"grad_norm": 1.122206211090088, |
|
"learning_rate": 9.134069871603465e-06, |
|
"loss": 0.4026, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.17418134766596993, |
|
"grad_norm": 0.6429542899131775, |
|
"learning_rate": 9.129093261670151e-06, |
|
"loss": 0.4142, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.17517666965263262, |
|
"grad_norm": 0.7902116775512695, |
|
"learning_rate": 9.124116651736838e-06, |
|
"loss": 0.4266, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.1761719916392953, |
|
"grad_norm": 0.6928035020828247, |
|
"learning_rate": 9.119140041803524e-06, |
|
"loss": 0.4036, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.177167313625958, |
|
"grad_norm": 0.637829601764679, |
|
"learning_rate": 9.114163431870211e-06, |
|
"loss": 0.4139, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.1781626356126207, |
|
"grad_norm": 0.8418923616409302, |
|
"learning_rate": 9.109186821936897e-06, |
|
"loss": 0.4538, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.17915795759928338, |
|
"grad_norm": 0.6597120761871338, |
|
"learning_rate": 9.104210212003584e-06, |
|
"loss": 0.428, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.17915795759928338, |
|
"eval_loss": 0.4206041693687439, |
|
"eval_runtime": 147.6714, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.18015327958594604, |
|
"grad_norm": 0.9092034101486206, |
|
"learning_rate": 9.099233602070271e-06, |
|
"loss": 0.3827, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.18114860157260873, |
|
"grad_norm": 0.7151809334754944, |
|
"learning_rate": 9.094256992136957e-06, |
|
"loss": 0.4096, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.18214392355927142, |
|
"grad_norm": 0.812656819820404, |
|
"learning_rate": 9.089280382203644e-06, |
|
"loss": 0.398, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.1831392455459341, |
|
"grad_norm": 0.6819058060646057, |
|
"learning_rate": 9.08430377227033e-06, |
|
"loss": 0.4289, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.1841345675325968, |
|
"grad_norm": 0.6796212792396545, |
|
"learning_rate": 9.079327162337017e-06, |
|
"loss": 0.4107, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.1851298895192595, |
|
"grad_norm": 0.604881227016449, |
|
"learning_rate": 9.074350552403703e-06, |
|
"loss": 0.3888, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.18612521150592218, |
|
"grad_norm": 0.5823159217834473, |
|
"learning_rate": 9.06937394247039e-06, |
|
"loss": 0.4292, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.18712053349258484, |
|
"grad_norm": 0.6591698527336121, |
|
"learning_rate": 9.064397332537076e-06, |
|
"loss": 0.4559, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.18811585547924753, |
|
"grad_norm": 0.666591465473175, |
|
"learning_rate": 9.059420722603763e-06, |
|
"loss": 0.4486, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.18911117746591022, |
|
"grad_norm": 0.8700873255729675, |
|
"learning_rate": 9.05444411267045e-06, |
|
"loss": 0.3934, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.18911117746591022, |
|
"eval_loss": 0.41719409823417664, |
|
"eval_runtime": 147.6671, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1901064994525729, |
|
"grad_norm": 0.5683835744857788, |
|
"learning_rate": 9.049467502737136e-06, |
|
"loss": 0.4148, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.1911018214392356, |
|
"grad_norm": 0.7323755621910095, |
|
"learning_rate": 9.044490892803823e-06, |
|
"loss": 0.4473, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.1920971434258983, |
|
"grad_norm": 0.8059419393539429, |
|
"learning_rate": 9.039514282870509e-06, |
|
"loss": 0.4092, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.19309246541256098, |
|
"grad_norm": 0.5238020420074463, |
|
"learning_rate": 9.034537672937196e-06, |
|
"loss": 0.4161, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.19408778739922364, |
|
"grad_norm": 0.7691717147827148, |
|
"learning_rate": 9.029561063003882e-06, |
|
"loss": 0.3996, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.19508310938588633, |
|
"grad_norm": 0.5275344848632812, |
|
"learning_rate": 9.024584453070569e-06, |
|
"loss": 0.3936, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.19607843137254902, |
|
"grad_norm": 0.9201516509056091, |
|
"learning_rate": 9.019607843137256e-06, |
|
"loss": 0.4327, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.1970737533592117, |
|
"grad_norm": 0.6645549535751343, |
|
"learning_rate": 9.014631233203942e-06, |
|
"loss": 0.439, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1980690753458744, |
|
"grad_norm": 0.4919885993003845, |
|
"learning_rate": 9.009654623270628e-06, |
|
"loss": 0.3584, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.19906439733253709, |
|
"grad_norm": 0.7819716930389404, |
|
"learning_rate": 9.004678013337315e-06, |
|
"loss": 0.4258, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.19906439733253709, |
|
"eval_loss": 0.4135349690914154, |
|
"eval_runtime": 147.6676, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20005971931919977, |
|
"grad_norm": 0.6763346195220947, |
|
"learning_rate": 8.999701403404002e-06, |
|
"loss": 0.3734, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.20105504130586244, |
|
"grad_norm": 0.974773108959198, |
|
"learning_rate": 8.994724793470688e-06, |
|
"loss": 0.4128, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.20205036329252513, |
|
"grad_norm": 0.7922454476356506, |
|
"learning_rate": 8.989748183537375e-06, |
|
"loss": 0.4699, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 0.7217792272567749, |
|
"learning_rate": 8.984771573604062e-06, |
|
"loss": 0.4368, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.2040410072658505, |
|
"grad_norm": 0.9531657695770264, |
|
"learning_rate": 8.979794963670748e-06, |
|
"loss": 0.4124, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.2050363292525132, |
|
"grad_norm": 0.5895671248435974, |
|
"learning_rate": 8.974818353737434e-06, |
|
"loss": 0.4065, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.20603165123917588, |
|
"grad_norm": 0.6587451100349426, |
|
"learning_rate": 8.969841743804123e-06, |
|
"loss": 0.4182, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.20702697322583855, |
|
"grad_norm": 0.5056644678115845, |
|
"learning_rate": 8.964865133870808e-06, |
|
"loss": 0.4146, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.20802229521250123, |
|
"grad_norm": 0.8369359374046326, |
|
"learning_rate": 8.959888523937494e-06, |
|
"loss": 0.4258, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.20901761719916392, |
|
"grad_norm": 0.8079156279563904, |
|
"learning_rate": 8.954911914004181e-06, |
|
"loss": 0.4172, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.20901761719916392, |
|
"eval_loss": 0.40956470370292664, |
|
"eval_runtime": 147.7554, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2100129391858266, |
|
"grad_norm": 0.5938236117362976, |
|
"learning_rate": 8.949935304070869e-06, |
|
"loss": 0.4058, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.2110082611724893, |
|
"grad_norm": 0.5103029608726501, |
|
"learning_rate": 8.944958694137554e-06, |
|
"loss": 0.3338, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.212003583159152, |
|
"grad_norm": 0.8399671316146851, |
|
"learning_rate": 8.939982084204241e-06, |
|
"loss": 0.4135, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.21299890514581468, |
|
"grad_norm": 0.8162589073181152, |
|
"learning_rate": 8.935005474270927e-06, |
|
"loss": 0.379, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.21399422713247734, |
|
"grad_norm": 0.5345713496208191, |
|
"learning_rate": 8.930028864337614e-06, |
|
"loss": 0.4356, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.21498954911914003, |
|
"grad_norm": 0.5709038972854614, |
|
"learning_rate": 8.9250522544043e-06, |
|
"loss": 0.3961, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.21598487110580272, |
|
"grad_norm": 0.8017010688781738, |
|
"learning_rate": 8.920075644470987e-06, |
|
"loss": 0.3934, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.2169801930924654, |
|
"grad_norm": 0.7133475542068481, |
|
"learning_rate": 8.915099034537673e-06, |
|
"loss": 0.386, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.2179755150791281, |
|
"grad_norm": 0.861768901348114, |
|
"learning_rate": 8.91012242460436e-06, |
|
"loss": 0.3981, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.2189708370657908, |
|
"grad_norm": 0.6387837529182434, |
|
"learning_rate": 8.905145814671047e-06, |
|
"loss": 0.4277, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2189708370657908, |
|
"eval_loss": 0.40670302510261536, |
|
"eval_runtime": 147.76, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.21996615905245348, |
|
"grad_norm": 0.9591347575187683, |
|
"learning_rate": 8.900169204737733e-06, |
|
"loss": 0.3809, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.22096148103911614, |
|
"grad_norm": 0.6483083963394165, |
|
"learning_rate": 8.89519259480442e-06, |
|
"loss": 0.4071, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.22195680302577883, |
|
"grad_norm": 1.0261069536209106, |
|
"learning_rate": 8.890215984871106e-06, |
|
"loss": 0.4145, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.22295212501244152, |
|
"grad_norm": 0.6538086533546448, |
|
"learning_rate": 8.885239374937793e-06, |
|
"loss": 0.4322, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.2239474469991042, |
|
"grad_norm": 0.4469331204891205, |
|
"learning_rate": 8.880262765004479e-06, |
|
"loss": 0.4052, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.2249427689857669, |
|
"grad_norm": 0.5114856958389282, |
|
"learning_rate": 8.875286155071166e-06, |
|
"loss": 0.4143, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.2259380909724296, |
|
"grad_norm": 0.7658188343048096, |
|
"learning_rate": 8.870309545137854e-06, |
|
"loss": 0.4345, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.22693341295909228, |
|
"grad_norm": 0.6381837725639343, |
|
"learning_rate": 8.86533293520454e-06, |
|
"loss": 0.3868, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.22792873494575494, |
|
"grad_norm": 0.5213243961334229, |
|
"learning_rate": 8.860356325271225e-06, |
|
"loss": 0.3849, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.22892405693241763, |
|
"grad_norm": 0.7393907904624939, |
|
"learning_rate": 8.855379715337912e-06, |
|
"loss": 0.4282, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.22892405693241763, |
|
"eval_loss": 0.4041208326816559, |
|
"eval_runtime": 147.7723, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.22991937891908032, |
|
"grad_norm": 0.5622240304946899, |
|
"learning_rate": 8.8504031054046e-06, |
|
"loss": 0.3818, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.230914700905743, |
|
"grad_norm": 0.7211191654205322, |
|
"learning_rate": 8.845426495471285e-06, |
|
"loss": 0.3596, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.2319100228924057, |
|
"grad_norm": 0.5431678295135498, |
|
"learning_rate": 8.840449885537972e-06, |
|
"loss": 0.3645, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.2329053448790684, |
|
"grad_norm": 1.0264047384262085, |
|
"learning_rate": 8.83547327560466e-06, |
|
"loss": 0.4152, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.23390066686573108, |
|
"grad_norm": 0.6439436078071594, |
|
"learning_rate": 8.830496665671345e-06, |
|
"loss": 0.4169, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.23489598885239374, |
|
"grad_norm": 0.6291099786758423, |
|
"learning_rate": 8.825520055738031e-06, |
|
"loss": 0.4246, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.23589131083905643, |
|
"grad_norm": 0.5020752549171448, |
|
"learning_rate": 8.820543445804718e-06, |
|
"loss": 0.3649, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.23688663282571912, |
|
"grad_norm": 0.5813655257225037, |
|
"learning_rate": 8.815566835871405e-06, |
|
"loss": 0.403, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.2378819548123818, |
|
"grad_norm": 0.7793263792991638, |
|
"learning_rate": 8.810590225938091e-06, |
|
"loss": 0.4044, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.2388772767990445, |
|
"grad_norm": 1.0214496850967407, |
|
"learning_rate": 8.805613616004778e-06, |
|
"loss": 0.3804, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2388772767990445, |
|
"eval_loss": 0.4011123776435852, |
|
"eval_runtime": 147.7863, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2398725987857072, |
|
"grad_norm": 0.8854981064796448, |
|
"learning_rate": 8.800637006071466e-06, |
|
"loss": 0.3915, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.24086792077236985, |
|
"grad_norm": 0.6463388800621033, |
|
"learning_rate": 8.795660396138151e-06, |
|
"loss": 0.412, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.24186324275903254, |
|
"grad_norm": 1.0134918689727783, |
|
"learning_rate": 8.790683786204837e-06, |
|
"loss": 0.4514, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.24285856474569523, |
|
"grad_norm": 0.5260724425315857, |
|
"learning_rate": 8.785707176271524e-06, |
|
"loss": 0.393, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.24385388673235792, |
|
"grad_norm": 0.7072359323501587, |
|
"learning_rate": 8.780730566338212e-06, |
|
"loss": 0.4061, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.2448492087190206, |
|
"grad_norm": 0.505009114742279, |
|
"learning_rate": 8.775753956404897e-06, |
|
"loss": 0.4435, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.2458445307056833, |
|
"grad_norm": 0.707790195941925, |
|
"learning_rate": 8.770777346471584e-06, |
|
"loss": 0.3803, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.24683985269234598, |
|
"grad_norm": 1.0153621435165405, |
|
"learning_rate": 8.765800736538272e-06, |
|
"loss": 0.3942, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.24783517467900865, |
|
"grad_norm": 0.6652597188949585, |
|
"learning_rate": 8.760824126604957e-06, |
|
"loss": 0.3481, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.24883049666567134, |
|
"grad_norm": 0.49689826369285583, |
|
"learning_rate": 8.755847516671645e-06, |
|
"loss": 0.4101, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24883049666567134, |
|
"eval_loss": 0.39822638034820557, |
|
"eval_runtime": 147.9245, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24982581865233403, |
|
"grad_norm": 0.7141602635383606, |
|
"learning_rate": 8.75087090673833e-06, |
|
"loss": 0.362, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.25082114063899674, |
|
"grad_norm": 0.5883095264434814, |
|
"learning_rate": 8.745894296805018e-06, |
|
"loss": 0.4115, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.2518164626256594, |
|
"grad_norm": 0.6165831685066223, |
|
"learning_rate": 8.740917686871703e-06, |
|
"loss": 0.3849, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.25281178461232207, |
|
"grad_norm": 0.5670954585075378, |
|
"learning_rate": 8.73594107693839e-06, |
|
"loss": 0.3491, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.25380710659898476, |
|
"grad_norm": 1.0700769424438477, |
|
"learning_rate": 8.730964467005076e-06, |
|
"loss": 0.4068, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.25480242858564744, |
|
"grad_norm": 0.7089443206787109, |
|
"learning_rate": 8.725987857071763e-06, |
|
"loss": 0.4567, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.25579775057231013, |
|
"grad_norm": 0.5670477747917175, |
|
"learning_rate": 8.72101124713845e-06, |
|
"loss": 0.4037, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.2567930725589728, |
|
"grad_norm": 0.6892909407615662, |
|
"learning_rate": 8.716034637205136e-06, |
|
"loss": 0.3714, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.2577883945456355, |
|
"grad_norm": 0.8213964104652405, |
|
"learning_rate": 8.711058027271822e-06, |
|
"loss": 0.4305, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.2587837165322982, |
|
"grad_norm": 0.7234606146812439, |
|
"learning_rate": 8.70608141733851e-06, |
|
"loss": 0.4213, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2587837165322982, |
|
"eval_loss": 0.39483293890953064, |
|
"eval_runtime": 147.915, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2597790385189609, |
|
"grad_norm": 0.6947128176689148, |
|
"learning_rate": 8.701104807405197e-06, |
|
"loss": 0.3851, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.2607743605056236, |
|
"grad_norm": 0.8997359275817871, |
|
"learning_rate": 8.696128197471882e-06, |
|
"loss": 0.379, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.26176968249228627, |
|
"grad_norm": 0.8184422254562378, |
|
"learning_rate": 8.69115158753857e-06, |
|
"loss": 0.3615, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.26276500447894896, |
|
"grad_norm": 0.7109666466712952, |
|
"learning_rate": 8.686174977605257e-06, |
|
"loss": 0.4233, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.26376032646561165, |
|
"grad_norm": 0.6844655275344849, |
|
"learning_rate": 8.681198367671942e-06, |
|
"loss": 0.4142, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.2647556484522743, |
|
"grad_norm": 0.8344716429710388, |
|
"learning_rate": 8.676221757738628e-06, |
|
"loss": 0.3611, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.265750970438937, |
|
"grad_norm": 0.7269201278686523, |
|
"learning_rate": 8.671245147805315e-06, |
|
"loss": 0.4397, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.26674629242559966, |
|
"grad_norm": 0.5457523465156555, |
|
"learning_rate": 8.666268537872003e-06, |
|
"loss": 0.3724, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.26774161441226235, |
|
"grad_norm": 0.7520753145217896, |
|
"learning_rate": 8.661291927938688e-06, |
|
"loss": 0.3882, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.26873693639892504, |
|
"grad_norm": 0.49623236060142517, |
|
"learning_rate": 8.656315318005376e-06, |
|
"loss": 0.4115, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.26873693639892504, |
|
"eval_loss": 0.39236727356910706, |
|
"eval_runtime": 147.7377, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.26973225838558773, |
|
"grad_norm": 0.6592463254928589, |
|
"learning_rate": 8.651338708072063e-06, |
|
"loss": 0.3628, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.2707275803722504, |
|
"grad_norm": 0.9473317265510559, |
|
"learning_rate": 8.646362098138749e-06, |
|
"loss": 0.3842, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.2717229023589131, |
|
"grad_norm": 0.7774178385734558, |
|
"learning_rate": 8.641385488205434e-06, |
|
"loss": 0.3643, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.2727182243455758, |
|
"grad_norm": 0.6194160580635071, |
|
"learning_rate": 8.636408878272121e-06, |
|
"loss": 0.4647, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.2737135463322385, |
|
"grad_norm": 0.5518766641616821, |
|
"learning_rate": 8.631432268338809e-06, |
|
"loss": 0.3755, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2747088683189012, |
|
"grad_norm": 0.9331585764884949, |
|
"learning_rate": 8.626455658405494e-06, |
|
"loss": 0.3881, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.27570419030556387, |
|
"grad_norm": 0.6080964207649231, |
|
"learning_rate": 8.621479048472182e-06, |
|
"loss": 0.3965, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.27669951229222656, |
|
"grad_norm": 0.8619922399520874, |
|
"learning_rate": 8.616502438538869e-06, |
|
"loss": 0.387, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.27769483427888925, |
|
"grad_norm": 0.7429324984550476, |
|
"learning_rate": 8.611525828605555e-06, |
|
"loss": 0.3837, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.2786901562655519, |
|
"grad_norm": 0.7918853759765625, |
|
"learning_rate": 8.60654921867224e-06, |
|
"loss": 0.3921, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2786901562655519, |
|
"eval_loss": 0.3901057541370392, |
|
"eval_runtime": 147.7809, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.27968547825221457, |
|
"grad_norm": 0.6200188398361206, |
|
"learning_rate": 8.601572608738928e-06, |
|
"loss": 0.398, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.28068080023887726, |
|
"grad_norm": 0.6285167336463928, |
|
"learning_rate": 8.596595998805615e-06, |
|
"loss": 0.3676, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.28167612222553995, |
|
"grad_norm": 0.7586702704429626, |
|
"learning_rate": 8.5916193888723e-06, |
|
"loss": 0.3658, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.28267144421220264, |
|
"grad_norm": 0.915360152721405, |
|
"learning_rate": 8.586642778938988e-06, |
|
"loss": 0.3444, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2836667661988653, |
|
"grad_norm": 0.8675612807273865, |
|
"learning_rate": 8.581666169005673e-06, |
|
"loss": 0.3939, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.284662088185528, |
|
"grad_norm": 0.8629066944122314, |
|
"learning_rate": 8.57668955907236e-06, |
|
"loss": 0.4055, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.2856574101721907, |
|
"grad_norm": 0.8615571856498718, |
|
"learning_rate": 8.571712949139048e-06, |
|
"loss": 0.4392, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.2866527321588534, |
|
"grad_norm": 0.675205409526825, |
|
"learning_rate": 8.566736339205734e-06, |
|
"loss": 0.3289, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.2876480541455161, |
|
"grad_norm": 0.6187378764152527, |
|
"learning_rate": 8.561759729272421e-06, |
|
"loss": 0.4067, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.2886433761321788, |
|
"grad_norm": 0.7826117277145386, |
|
"learning_rate": 8.556783119339106e-06, |
|
"loss": 0.367, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2886433761321788, |
|
"eval_loss": 0.38809624314308167, |
|
"eval_runtime": 147.8617, |
|
"eval_samples_per_second": 1.373, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.28963869811884146, |
|
"grad_norm": 0.6546410322189331, |
|
"learning_rate": 8.551806509405794e-06, |
|
"loss": 0.3727, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.29063402010550415, |
|
"grad_norm": 0.8760982155799866, |
|
"learning_rate": 8.54682989947248e-06, |
|
"loss": 0.3967, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.29162934209216684, |
|
"grad_norm": 0.64844810962677, |
|
"learning_rate": 8.541853289539167e-06, |
|
"loss": 0.4046, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.2926246640788295, |
|
"grad_norm": 0.5126065015792847, |
|
"learning_rate": 8.536876679605854e-06, |
|
"loss": 0.3783, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.29361998606549217, |
|
"grad_norm": 0.7168049216270447, |
|
"learning_rate": 8.53190006967254e-06, |
|
"loss": 0.3606, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.29461530805215486, |
|
"grad_norm": 0.4847118854522705, |
|
"learning_rate": 8.526923459739225e-06, |
|
"loss": 0.3617, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.29561063003881755, |
|
"grad_norm": 0.6937541365623474, |
|
"learning_rate": 8.521946849805913e-06, |
|
"loss": 0.3878, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.29660595202548024, |
|
"grad_norm": 0.7482075095176697, |
|
"learning_rate": 8.5169702398726e-06, |
|
"loss": 0.4173, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.2976012740121429, |
|
"grad_norm": 0.7130847573280334, |
|
"learning_rate": 8.511993629939285e-06, |
|
"loss": 0.3717, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.2985965959988056, |
|
"grad_norm": 0.7087443470954895, |
|
"learning_rate": 8.507017020005973e-06, |
|
"loss": 0.3945, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2985965959988056, |
|
"eval_loss": 0.3846234977245331, |
|
"eval_runtime": 147.9506, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2995919179854683, |
|
"grad_norm": 0.5839470624923706, |
|
"learning_rate": 8.50204041007266e-06, |
|
"loss": 0.3672, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.300587239972131, |
|
"grad_norm": 0.5632269978523254, |
|
"learning_rate": 8.497063800139346e-06, |
|
"loss": 0.4038, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.3015825619587937, |
|
"grad_norm": 0.9807242155075073, |
|
"learning_rate": 8.492087190206031e-06, |
|
"loss": 0.435, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.30257788394545637, |
|
"grad_norm": 0.6134958267211914, |
|
"learning_rate": 8.487110580272719e-06, |
|
"loss": 0.3857, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.30357320593211906, |
|
"grad_norm": 0.9714884757995605, |
|
"learning_rate": 8.482133970339406e-06, |
|
"loss": 0.3375, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 0.6158900856971741, |
|
"learning_rate": 8.477157360406092e-06, |
|
"loss": 0.3768, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.3055638499054444, |
|
"grad_norm": 0.5510846376419067, |
|
"learning_rate": 8.472180750472779e-06, |
|
"loss": 0.3618, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.3065591718921071, |
|
"grad_norm": 0.6374019384384155, |
|
"learning_rate": 8.467204140539466e-06, |
|
"loss": 0.3444, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.30755449387876976, |
|
"grad_norm": 0.6322264075279236, |
|
"learning_rate": 8.462227530606152e-06, |
|
"loss": 0.3841, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.30854981586543245, |
|
"grad_norm": 0.6326218843460083, |
|
"learning_rate": 8.457250920672837e-06, |
|
"loss": 0.3627, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.30854981586543245, |
|
"eval_loss": 0.38287338614463806, |
|
"eval_runtime": 147.987, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.30954513785209514, |
|
"grad_norm": 0.8483834862709045, |
|
"learning_rate": 8.452274310739525e-06, |
|
"loss": 0.4364, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.31054045983875783, |
|
"grad_norm": 0.9434365034103394, |
|
"learning_rate": 8.447297700806212e-06, |
|
"loss": 0.4027, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.3115357818254205, |
|
"grad_norm": 0.7766565680503845, |
|
"learning_rate": 8.442321090872898e-06, |
|
"loss": 0.3241, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.3125311038120832, |
|
"grad_norm": 0.7761719822883606, |
|
"learning_rate": 8.437344480939585e-06, |
|
"loss": 0.4041, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.3135264257987459, |
|
"grad_norm": 0.8227534890174866, |
|
"learning_rate": 8.432367871006272e-06, |
|
"loss": 0.3915, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.3145217477854086, |
|
"grad_norm": 0.6961987614631653, |
|
"learning_rate": 8.427391261072958e-06, |
|
"loss": 0.4119, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.3155170697720713, |
|
"grad_norm": 0.725043773651123, |
|
"learning_rate": 8.422414651139643e-06, |
|
"loss": 0.3811, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.31651239175873397, |
|
"grad_norm": 0.6801613569259644, |
|
"learning_rate": 8.41743804120633e-06, |
|
"loss": 0.3752, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.31750771374539666, |
|
"grad_norm": 0.6735227108001709, |
|
"learning_rate": 8.412461431273018e-06, |
|
"loss": 0.3538, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.31850303573205935, |
|
"grad_norm": 0.7424077391624451, |
|
"learning_rate": 8.407484821339704e-06, |
|
"loss": 0.3347, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.31850303573205935, |
|
"eval_loss": 0.38142284750938416, |
|
"eval_runtime": 148.3323, |
|
"eval_samples_per_second": 1.369, |
|
"eval_steps_per_second": 0.688, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.319498357718722, |
|
"grad_norm": 0.6526059508323669, |
|
"learning_rate": 8.402508211406391e-06, |
|
"loss": 0.4098, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.32049367970538467, |
|
"grad_norm": 0.8221137523651123, |
|
"learning_rate": 8.397531601473077e-06, |
|
"loss": 0.4044, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.32148900169204736, |
|
"grad_norm": 0.7967231869697571, |
|
"learning_rate": 8.392554991539764e-06, |
|
"loss": 0.3989, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.32248432367871005, |
|
"grad_norm": 0.8786621689796448, |
|
"learning_rate": 8.387578381606451e-06, |
|
"loss": 0.3113, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.32347964566537274, |
|
"grad_norm": 1.084957480430603, |
|
"learning_rate": 8.382601771673137e-06, |
|
"loss": 0.3855, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.32447496765203543, |
|
"grad_norm": 0.6978799104690552, |
|
"learning_rate": 8.377625161739822e-06, |
|
"loss": 0.3752, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.3254702896386981, |
|
"grad_norm": 0.6280369162559509, |
|
"learning_rate": 8.37264855180651e-06, |
|
"loss": 0.3831, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.3264656116253608, |
|
"grad_norm": 0.5700563192367554, |
|
"learning_rate": 8.367671941873197e-06, |
|
"loss": 0.3848, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.3274609336120235, |
|
"grad_norm": 0.6714605093002319, |
|
"learning_rate": 8.362695331939883e-06, |
|
"loss": 0.3894, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.3284562555986862, |
|
"grad_norm": 0.6634580492973328, |
|
"learning_rate": 8.35771872200657e-06, |
|
"loss": 0.4055, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.3284562555986862, |
|
"eval_loss": 0.3794529438018799, |
|
"eval_runtime": 147.906, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.69, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.3294515775853489, |
|
"grad_norm": 0.6699293255805969, |
|
"learning_rate": 8.352742112073257e-06, |
|
"loss": 0.3997, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.33044689957201157, |
|
"grad_norm": 0.5837434530258179, |
|
"learning_rate": 8.347765502139943e-06, |
|
"loss": 0.3506, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.33144222155867425, |
|
"grad_norm": 0.7900473475456238, |
|
"learning_rate": 8.342788892206629e-06, |
|
"loss": 0.3712, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.3324375435453369, |
|
"grad_norm": 0.5419691205024719, |
|
"learning_rate": 8.337812282273316e-06, |
|
"loss": 0.3755, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.3334328655319996, |
|
"grad_norm": 0.635683536529541, |
|
"learning_rate": 8.332835672340003e-06, |
|
"loss": 0.3995, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.33442818751866227, |
|
"grad_norm": 0.7266948223114014, |
|
"learning_rate": 8.327859062406689e-06, |
|
"loss": 0.398, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.33542350950532496, |
|
"grad_norm": 0.8439323902130127, |
|
"learning_rate": 8.322882452473376e-06, |
|
"loss": 0.4093, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.33641883149198765, |
|
"grad_norm": 0.6754797697067261, |
|
"learning_rate": 8.317905842540063e-06, |
|
"loss": 0.3638, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.33741415347865034, |
|
"grad_norm": 0.7690572142601013, |
|
"learning_rate": 8.312929232606749e-06, |
|
"loss": 0.3408, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"grad_norm": 0.765877902507782, |
|
"learning_rate": 8.307952622673435e-06, |
|
"loss": 0.3418, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"eval_loss": 0.37782156467437744, |
|
"eval_runtime": 147.8891, |
|
"eval_samples_per_second": 1.373, |
|
"eval_steps_per_second": 0.69, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3394047974519757, |
|
"grad_norm": 0.7344104051589966, |
|
"learning_rate": 8.302976012740122e-06, |
|
"loss": 0.3443, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.3404001194386384, |
|
"grad_norm": 1.0199452638626099, |
|
"learning_rate": 8.29799940280681e-06, |
|
"loss": 0.4294, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.3413954414253011, |
|
"grad_norm": 0.5666326880455017, |
|
"learning_rate": 8.293022792873495e-06, |
|
"loss": 0.3274, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.3423907634119638, |
|
"grad_norm": 0.8385756611824036, |
|
"learning_rate": 8.288046182940182e-06, |
|
"loss": 0.4122, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3433860853986265, |
|
"grad_norm": 0.777019739151001, |
|
"learning_rate": 8.28306957300687e-06, |
|
"loss": 0.4089, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.34438140738528916, |
|
"grad_norm": 0.682658851146698, |
|
"learning_rate": 8.278092963073555e-06, |
|
"loss": 0.3772, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.34537672937195185, |
|
"grad_norm": 0.6811783313751221, |
|
"learning_rate": 8.27311635314024e-06, |
|
"loss": 0.3523, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.3463720513586145, |
|
"grad_norm": 0.9056878685951233, |
|
"learning_rate": 8.268139743206928e-06, |
|
"loss": 0.3292, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.3473673733452772, |
|
"grad_norm": 0.6763057708740234, |
|
"learning_rate": 8.263163133273615e-06, |
|
"loss": 0.3326, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.34836269533193986, |
|
"grad_norm": 0.8847700953483582, |
|
"learning_rate": 8.258186523340301e-06, |
|
"loss": 0.4062, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.34836269533193986, |
|
"eval_loss": 0.37572577595710754, |
|
"eval_runtime": 147.9751, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.34935801731860255, |
|
"grad_norm": 0.7903834581375122, |
|
"learning_rate": 8.253209913406988e-06, |
|
"loss": 0.3546, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.35035333930526524, |
|
"grad_norm": 0.6501933336257935, |
|
"learning_rate": 8.248233303473674e-06, |
|
"loss": 0.3909, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.35134866129192793, |
|
"grad_norm": 0.6443967819213867, |
|
"learning_rate": 8.243256693540361e-06, |
|
"loss": 0.3315, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.3523439832785906, |
|
"grad_norm": 0.7020339965820312, |
|
"learning_rate": 8.238280083607047e-06, |
|
"loss": 0.383, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.3533393052652533, |
|
"grad_norm": 0.8711917400360107, |
|
"learning_rate": 8.233303473673734e-06, |
|
"loss": 0.3771, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.354334627251916, |
|
"grad_norm": 0.788311243057251, |
|
"learning_rate": 8.228326863740421e-06, |
|
"loss": 0.3299, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.3553299492385787, |
|
"grad_norm": 0.43669214844703674, |
|
"learning_rate": 8.223350253807107e-06, |
|
"loss": 0.3659, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.3563252712252414, |
|
"grad_norm": 0.550014078617096, |
|
"learning_rate": 8.218373643873794e-06, |
|
"loss": 0.3586, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.35732059321190407, |
|
"grad_norm": 0.9948114156723022, |
|
"learning_rate": 8.21339703394048e-06, |
|
"loss": 0.3743, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.35831591519856676, |
|
"grad_norm": 0.6710416078567505, |
|
"learning_rate": 8.208420424007167e-06, |
|
"loss": 0.3724, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.35831591519856676, |
|
"eval_loss": 0.37383729219436646, |
|
"eval_runtime": 147.9999, |
|
"eval_samples_per_second": 1.372, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.35931123718522945, |
|
"grad_norm": 0.7629538774490356, |
|
"learning_rate": 8.203443814073854e-06, |
|
"loss": 0.3942, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.3603065591718921, |
|
"grad_norm": 0.7567903399467468, |
|
"learning_rate": 8.19846720414054e-06, |
|
"loss": 0.3895, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.36130188115855477, |
|
"grad_norm": 0.5209780335426331, |
|
"learning_rate": 8.193490594207226e-06, |
|
"loss": 0.3395, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.36229720314521746, |
|
"grad_norm": 0.5655366778373718, |
|
"learning_rate": 8.188513984273913e-06, |
|
"loss": 0.3435, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.36329252513188015, |
|
"grad_norm": 0.8822707533836365, |
|
"learning_rate": 8.1835373743406e-06, |
|
"loss": 0.3442, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.36428784711854284, |
|
"grad_norm": 0.6264866590499878, |
|
"learning_rate": 8.178560764407286e-06, |
|
"loss": 0.3902, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.36528316910520553, |
|
"grad_norm": 0.6163113713264465, |
|
"learning_rate": 8.173584154473973e-06, |
|
"loss": 0.301, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.3662784910918682, |
|
"grad_norm": 0.7627054452896118, |
|
"learning_rate": 8.16860754454066e-06, |
|
"loss": 0.3504, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3672738130785309, |
|
"grad_norm": 0.7021706104278564, |
|
"learning_rate": 8.163630934607346e-06, |
|
"loss": 0.3761, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.3682691350651936, |
|
"grad_norm": 0.8463016152381897, |
|
"learning_rate": 8.158654324674032e-06, |
|
"loss": 0.4096, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3682691350651936, |
|
"eval_loss": 0.3721456229686737, |
|
"eval_runtime": 148.0333, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3692644570518563, |
|
"grad_norm": 0.7081176042556763, |
|
"learning_rate": 8.153677714740719e-06, |
|
"loss": 0.3609, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.370259779038519, |
|
"grad_norm": 0.6312963366508484, |
|
"learning_rate": 8.148701104807406e-06, |
|
"loss": 0.3964, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.37125510102518167, |
|
"grad_norm": 0.5755221247673035, |
|
"learning_rate": 8.143724494874092e-06, |
|
"loss": 0.3701, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.37225042301184436, |
|
"grad_norm": 0.584368884563446, |
|
"learning_rate": 8.13874788494078e-06, |
|
"loss": 0.3748, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.373245744998507, |
|
"grad_norm": 0.588197648525238, |
|
"learning_rate": 8.133771275007467e-06, |
|
"loss": 0.3775, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3742410669851697, |
|
"grad_norm": 0.6824856996536255, |
|
"learning_rate": 8.128794665074152e-06, |
|
"loss": 0.3842, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.37523638897183237, |
|
"grad_norm": 0.4867573082447052, |
|
"learning_rate": 8.123818055140838e-06, |
|
"loss": 0.3349, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.37623171095849506, |
|
"grad_norm": 1.023980975151062, |
|
"learning_rate": 8.118841445207525e-06, |
|
"loss": 0.2991, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.37722703294515775, |
|
"grad_norm": 0.8464593291282654, |
|
"learning_rate": 8.113864835274212e-06, |
|
"loss": 0.3673, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.37822235493182044, |
|
"grad_norm": 0.7149996757507324, |
|
"learning_rate": 8.108888225340898e-06, |
|
"loss": 0.3913, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.37822235493182044, |
|
"eval_loss": 0.37008264660835266, |
|
"eval_runtime": 148.0619, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3792176769184831, |
|
"grad_norm": 0.5620415210723877, |
|
"learning_rate": 8.103911615407585e-06, |
|
"loss": 0.3409, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.3802129989051458, |
|
"grad_norm": 0.7163406014442444, |
|
"learning_rate": 8.098935005474273e-06, |
|
"loss": 0.3566, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.3812083208918085, |
|
"grad_norm": 0.6729508638381958, |
|
"learning_rate": 8.093958395540958e-06, |
|
"loss": 0.3606, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3822036428784712, |
|
"grad_norm": 0.5905406475067139, |
|
"learning_rate": 8.088981785607644e-06, |
|
"loss": 0.3948, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3831989648651339, |
|
"grad_norm": 0.896960437297821, |
|
"learning_rate": 8.084005175674331e-06, |
|
"loss": 0.3881, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.3841942868517966, |
|
"grad_norm": 0.6188758015632629, |
|
"learning_rate": 8.079028565741019e-06, |
|
"loss": 0.3632, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.38518960883845926, |
|
"grad_norm": 0.7011315822601318, |
|
"learning_rate": 8.074051955807704e-06, |
|
"loss": 0.3768, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.38618493082512195, |
|
"grad_norm": 0.546981930732727, |
|
"learning_rate": 8.069075345874391e-06, |
|
"loss": 0.3556, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3871802528117846, |
|
"grad_norm": 0.6722966432571411, |
|
"learning_rate": 8.064098735941077e-06, |
|
"loss": 0.4264, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3881755747984473, |
|
"grad_norm": 0.6407563090324402, |
|
"learning_rate": 8.059122126007764e-06, |
|
"loss": 0.3592, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3881755747984473, |
|
"eval_loss": 0.3688708245754242, |
|
"eval_runtime": 148.1311, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.689, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.38917089678510997, |
|
"grad_norm": 0.45177608728408813, |
|
"learning_rate": 8.05414551607445e-06, |
|
"loss": 0.3733, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.39016621877177265, |
|
"grad_norm": 1.0299266576766968, |
|
"learning_rate": 8.049168906141137e-06, |
|
"loss": 0.351, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.39116154075843534, |
|
"grad_norm": 0.6861090660095215, |
|
"learning_rate": 8.044192296207823e-06, |
|
"loss": 0.3899, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.39215686274509803, |
|
"grad_norm": 0.6434109210968018, |
|
"learning_rate": 8.03921568627451e-06, |
|
"loss": 0.3285, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3931521847317607, |
|
"grad_norm": 0.6049661040306091, |
|
"learning_rate": 8.034239076341198e-06, |
|
"loss": 0.37, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3941475067184234, |
|
"grad_norm": 0.6799841523170471, |
|
"learning_rate": 8.029262466407883e-06, |
|
"loss": 0.381, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.3951428287050861, |
|
"grad_norm": 0.7383856177330017, |
|
"learning_rate": 8.02428585647457e-06, |
|
"loss": 0.3707, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.3961381506917488, |
|
"grad_norm": 0.8234820365905762, |
|
"learning_rate": 8.019309246541258e-06, |
|
"loss": 0.379, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3971334726784115, |
|
"grad_norm": 0.743027925491333, |
|
"learning_rate": 8.014332636607943e-06, |
|
"loss": 0.362, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.39812879466507417, |
|
"grad_norm": 0.48385190963745117, |
|
"learning_rate": 8.009356026674629e-06, |
|
"loss": 0.3726, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.39812879466507417, |
|
"eval_loss": 0.36677852272987366, |
|
"eval_runtime": 148.1274, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.39912411665173686, |
|
"grad_norm": 0.776292622089386, |
|
"learning_rate": 8.004379416741316e-06, |
|
"loss": 0.3258, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.40011943863839955, |
|
"grad_norm": 0.7187590599060059, |
|
"learning_rate": 7.999402806808004e-06, |
|
"loss": 0.3639, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.4011147606250622, |
|
"grad_norm": 0.6233355402946472, |
|
"learning_rate": 7.99442619687469e-06, |
|
"loss": 0.3418, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.4021100826117249, |
|
"grad_norm": 0.9605082869529724, |
|
"learning_rate": 7.989449586941377e-06, |
|
"loss": 0.3686, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.40310540459838756, |
|
"grad_norm": 0.7882612943649292, |
|
"learning_rate": 7.984472977008064e-06, |
|
"loss": 0.3386, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.40410072658505025, |
|
"grad_norm": 0.8124802708625793, |
|
"learning_rate": 7.97949636707475e-06, |
|
"loss": 0.3412, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.40509604857171294, |
|
"grad_norm": 0.6348981857299805, |
|
"learning_rate": 7.974519757141435e-06, |
|
"loss": 0.3624, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 0.8518906831741333, |
|
"learning_rate": 7.969543147208122e-06, |
|
"loss": 0.3494, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.4070866925450383, |
|
"grad_norm": 0.979092538356781, |
|
"learning_rate": 7.96456653727481e-06, |
|
"loss": 0.3677, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.408082014531701, |
|
"grad_norm": 0.6732219457626343, |
|
"learning_rate": 7.959589927341495e-06, |
|
"loss": 0.3395, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.408082014531701, |
|
"eval_loss": 0.365203857421875, |
|
"eval_runtime": 148.0813, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4090773365183637, |
|
"grad_norm": 0.9068031907081604, |
|
"learning_rate": 7.954613317408183e-06, |
|
"loss": 0.3715, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.4100726585050264, |
|
"grad_norm": 0.8246614336967468, |
|
"learning_rate": 7.94963670747487e-06, |
|
"loss": 0.3661, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.4110679804916891, |
|
"grad_norm": 0.5856474041938782, |
|
"learning_rate": 7.944660097541556e-06, |
|
"loss": 0.3567, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.41206330247835177, |
|
"grad_norm": 0.4393113851547241, |
|
"learning_rate": 7.939683487608241e-06, |
|
"loss": 0.3469, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.41305862446501446, |
|
"grad_norm": 1.0827676057815552, |
|
"learning_rate": 7.934706877674928e-06, |
|
"loss": 0.3318, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.4140539464516771, |
|
"grad_norm": 0.6830149292945862, |
|
"learning_rate": 7.929730267741616e-06, |
|
"loss": 0.3726, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.4150492684383398, |
|
"grad_norm": 0.563925564289093, |
|
"learning_rate": 7.924753657808301e-06, |
|
"loss": 0.3732, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.41604459042500247, |
|
"grad_norm": 0.5630573034286499, |
|
"learning_rate": 7.919777047874989e-06, |
|
"loss": 0.3626, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.41703991241166516, |
|
"grad_norm": 0.7267017960548401, |
|
"learning_rate": 7.914800437941674e-06, |
|
"loss": 0.3414, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.41803523439832785, |
|
"grad_norm": 0.7420011758804321, |
|
"learning_rate": 7.909823828008362e-06, |
|
"loss": 0.379, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.41803523439832785, |
|
"eval_loss": 0.3634182810783386, |
|
"eval_runtime": 148.0601, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.41903055638499054, |
|
"grad_norm": 0.6270275115966797, |
|
"learning_rate": 7.904847218075047e-06, |
|
"loss": 0.347, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.4200258783716532, |
|
"grad_norm": 0.6264152526855469, |
|
"learning_rate": 7.899870608141735e-06, |
|
"loss": 0.3984, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.4210212003583159, |
|
"grad_norm": 0.7452067136764526, |
|
"learning_rate": 7.894893998208422e-06, |
|
"loss": 0.392, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.4220165223449786, |
|
"grad_norm": 0.5158396363258362, |
|
"learning_rate": 7.889917388275107e-06, |
|
"loss": 0.3624, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.4230118443316413, |
|
"grad_norm": 0.6692706942558289, |
|
"learning_rate": 7.884940778341795e-06, |
|
"loss": 0.359, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.424007166318304, |
|
"grad_norm": 1.1387830972671509, |
|
"learning_rate": 7.87996416840848e-06, |
|
"loss": 0.39, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.4250024883049667, |
|
"grad_norm": 0.76036137342453, |
|
"learning_rate": 7.874987558475168e-06, |
|
"loss": 0.299, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.42599781029162936, |
|
"grad_norm": 0.45447903871536255, |
|
"learning_rate": 7.870010948541853e-06, |
|
"loss": 0.3926, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.42699313227829205, |
|
"grad_norm": 0.8221507668495178, |
|
"learning_rate": 7.86503433860854e-06, |
|
"loss": 0.3743, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.4279884542649547, |
|
"grad_norm": 0.7328831553459167, |
|
"learning_rate": 7.860057728675226e-06, |
|
"loss": 0.3699, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.4279884542649547, |
|
"eval_loss": 0.36196640133857727, |
|
"eval_runtime": 148.0658, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.4289837762516174, |
|
"grad_norm": 0.8411442637443542, |
|
"learning_rate": 7.855081118741913e-06, |
|
"loss": 0.4047, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.42997909823828007, |
|
"grad_norm": 0.7502423524856567, |
|
"learning_rate": 7.8501045088086e-06, |
|
"loss": 0.3513, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.43097442022494276, |
|
"grad_norm": 0.566929042339325, |
|
"learning_rate": 7.845127898875286e-06, |
|
"loss": 0.3935, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.43196974221160545, |
|
"grad_norm": 0.7588290572166443, |
|
"learning_rate": 7.840151288941972e-06, |
|
"loss": 0.3324, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.43296506419826813, |
|
"grad_norm": 0.7947611808776855, |
|
"learning_rate": 7.835174679008661e-06, |
|
"loss": 0.3506, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.4339603861849308, |
|
"grad_norm": 0.6475954651832581, |
|
"learning_rate": 7.830198069075347e-06, |
|
"loss": 0.3103, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.4349557081715935, |
|
"grad_norm": 0.5702581405639648, |
|
"learning_rate": 7.825221459142032e-06, |
|
"loss": 0.3373, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.4359510301582562, |
|
"grad_norm": 0.7424353957176208, |
|
"learning_rate": 7.82024484920872e-06, |
|
"loss": 0.3593, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.4369463521449189, |
|
"grad_norm": 0.5749756693840027, |
|
"learning_rate": 7.815268239275407e-06, |
|
"loss": 0.3133, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.4379416741315816, |
|
"grad_norm": 0.5407712459564209, |
|
"learning_rate": 7.810291629342092e-06, |
|
"loss": 0.3584, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.4379416741315816, |
|
"eval_loss": 0.360762357711792, |
|
"eval_runtime": 148.1111, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.43893699611824427, |
|
"grad_norm": 0.5194666981697083, |
|
"learning_rate": 7.80531501940878e-06, |
|
"loss": 0.2957, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.43993231810490696, |
|
"grad_norm": 0.7961593866348267, |
|
"learning_rate": 7.800338409475467e-06, |
|
"loss": 0.3819, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.44092764009156965, |
|
"grad_norm": 0.6336628198623657, |
|
"learning_rate": 7.795361799542153e-06, |
|
"loss": 0.3123, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.4419229620782323, |
|
"grad_norm": 0.6935514211654663, |
|
"learning_rate": 7.790385189608838e-06, |
|
"loss": 0.3519, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.442918284064895, |
|
"grad_norm": 0.6400023698806763, |
|
"learning_rate": 7.785408579675526e-06, |
|
"loss": 0.3806, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.44391360605155766, |
|
"grad_norm": 0.9406591057777405, |
|
"learning_rate": 7.780431969742213e-06, |
|
"loss": 0.3282, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.44490892803822035, |
|
"grad_norm": 0.6432562470436096, |
|
"learning_rate": 7.775455359808899e-06, |
|
"loss": 0.302, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.44590425002488304, |
|
"grad_norm": 0.5700191259384155, |
|
"learning_rate": 7.770478749875586e-06, |
|
"loss": 0.3608, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.44689957201154573, |
|
"grad_norm": 0.7987110614776611, |
|
"learning_rate": 7.765502139942271e-06, |
|
"loss": 0.3363, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.4478948939982084, |
|
"grad_norm": 0.6581839323043823, |
|
"learning_rate": 7.760525530008959e-06, |
|
"loss": 0.3414, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4478948939982084, |
|
"eval_loss": 0.35966184735298157, |
|
"eval_runtime": 148.1465, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4488902159848711, |
|
"grad_norm": 0.6311335563659668, |
|
"learning_rate": 7.755548920075644e-06, |
|
"loss": 0.3768, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.4498855379715338, |
|
"grad_norm": 0.8850741982460022, |
|
"learning_rate": 7.750572310142332e-06, |
|
"loss": 0.3763, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.4508808599581965, |
|
"grad_norm": 0.5066502094268799, |
|
"learning_rate": 7.745595700209019e-06, |
|
"loss": 0.3412, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.4518761819448592, |
|
"grad_norm": 0.545430600643158, |
|
"learning_rate": 7.740619090275705e-06, |
|
"loss": 0.3737, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.45287150393152187, |
|
"grad_norm": 0.7061020731925964, |
|
"learning_rate": 7.735642480342392e-06, |
|
"loss": 0.3218, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.45386682591818456, |
|
"grad_norm": 0.5185464024543762, |
|
"learning_rate": 7.730665870409078e-06, |
|
"loss": 0.3489, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4548621479048472, |
|
"grad_norm": 0.9102675318717957, |
|
"learning_rate": 7.725689260475765e-06, |
|
"loss": 0.3515, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.4558574698915099, |
|
"grad_norm": 0.7395256757736206, |
|
"learning_rate": 7.72071265054245e-06, |
|
"loss": 0.2873, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.45685279187817257, |
|
"grad_norm": 0.9186689853668213, |
|
"learning_rate": 7.715736040609138e-06, |
|
"loss": 0.3705, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.45784811386483526, |
|
"grad_norm": 0.6102734804153442, |
|
"learning_rate": 7.710759430675823e-06, |
|
"loss": 0.3389, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.45784811386483526, |
|
"eval_loss": 0.35844776034355164, |
|
"eval_runtime": 148.1097, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.45884343585149795, |
|
"grad_norm": 0.418071985244751, |
|
"learning_rate": 7.70578282074251e-06, |
|
"loss": 0.3454, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.45983875783816064, |
|
"grad_norm": 0.504802942276001, |
|
"learning_rate": 7.700806210809198e-06, |
|
"loss": 0.3419, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.46083407982482333, |
|
"grad_norm": 0.7918646335601807, |
|
"learning_rate": 7.695829600875884e-06, |
|
"loss": 0.3992, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.461829401811486, |
|
"grad_norm": 0.6944281458854675, |
|
"learning_rate": 7.690852990942571e-06, |
|
"loss": 0.3945, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.4628247237981487, |
|
"grad_norm": 0.648303210735321, |
|
"learning_rate": 7.685876381009257e-06, |
|
"loss": 0.3401, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.4638200457848114, |
|
"grad_norm": 0.812044084072113, |
|
"learning_rate": 7.680899771075944e-06, |
|
"loss": 0.3548, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.4648153677714741, |
|
"grad_norm": 0.7709999680519104, |
|
"learning_rate": 7.67592316114263e-06, |
|
"loss": 0.3702, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.4658106897581368, |
|
"grad_norm": 0.7904644012451172, |
|
"learning_rate": 7.670946551209317e-06, |
|
"loss": 0.3763, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.46680601174479947, |
|
"grad_norm": 0.7763231992721558, |
|
"learning_rate": 7.665969941276004e-06, |
|
"loss": 0.3495, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.46780133373146215, |
|
"grad_norm": 0.5270109176635742, |
|
"learning_rate": 7.66099333134269e-06, |
|
"loss": 0.3016, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.46780133373146215, |
|
"eval_loss": 0.35714709758758545, |
|
"eval_runtime": 148.1115, |
|
"eval_samples_per_second": 1.371, |
|
"eval_steps_per_second": 0.689, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4687966557181248, |
|
"grad_norm": 0.6368373036384583, |
|
"learning_rate": 7.656016721409375e-06, |
|
"loss": 0.3323, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.4697919777047875, |
|
"grad_norm": 0.3973361551761627, |
|
"learning_rate": 7.651040111476064e-06, |
|
"loss": 0.3405, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.47078729969145017, |
|
"grad_norm": 0.8075085878372192, |
|
"learning_rate": 7.64606350154275e-06, |
|
"loss": 0.3436, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.47178262167811286, |
|
"grad_norm": 0.892672598361969, |
|
"learning_rate": 7.641086891609436e-06, |
|
"loss": 0.3662, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.47277794366477555, |
|
"grad_norm": 0.6311262845993042, |
|
"learning_rate": 7.636110281676123e-06, |
|
"loss": 0.3559, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.47377326565143824, |
|
"grad_norm": 0.7950363159179688, |
|
"learning_rate": 7.63113367174281e-06, |
|
"loss": 0.2974, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.4747685876381009, |
|
"grad_norm": 0.6539332270622253, |
|
"learning_rate": 7.626157061809496e-06, |
|
"loss": 0.3312, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.4757639096247636, |
|
"grad_norm": 0.7384660840034485, |
|
"learning_rate": 7.621180451876182e-06, |
|
"loss": 0.3825, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.4767592316114263, |
|
"grad_norm": 0.43817830085754395, |
|
"learning_rate": 7.6162038419428695e-06, |
|
"loss": 0.3462, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.477754553598089, |
|
"grad_norm": 0.7346156239509583, |
|
"learning_rate": 7.611227232009556e-06, |
|
"loss": 0.3377, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.477754553598089, |
|
"eval_loss": 0.355719655752182, |
|
"eval_runtime": 148.1914, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.688, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.4787498755847517, |
|
"grad_norm": 0.8043003082275391, |
|
"learning_rate": 7.6062506220762424e-06, |
|
"loss": 0.3625, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.4797451975714144, |
|
"grad_norm": 0.6644107103347778, |
|
"learning_rate": 7.601274012142929e-06, |
|
"loss": 0.3023, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.48074051955807706, |
|
"grad_norm": 0.7794090509414673, |
|
"learning_rate": 7.596297402209616e-06, |
|
"loss": 0.3552, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.4817358415447397, |
|
"grad_norm": 0.7449871301651001, |
|
"learning_rate": 7.591320792276302e-06, |
|
"loss": 0.3659, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.4827311635314024, |
|
"grad_norm": 0.881610631942749, |
|
"learning_rate": 7.586344182342988e-06, |
|
"loss": 0.3184, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.4837264855180651, |
|
"grad_norm": 0.8672296404838562, |
|
"learning_rate": 7.581367572409675e-06, |
|
"loss": 0.3324, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.48472180750472776, |
|
"grad_norm": 0.4788852334022522, |
|
"learning_rate": 7.576390962476362e-06, |
|
"loss": 0.3406, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.48571712949139045, |
|
"grad_norm": 0.6023631691932678, |
|
"learning_rate": 7.5714143525430485e-06, |
|
"loss": 0.3797, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.48671245147805314, |
|
"grad_norm": 0.6595234870910645, |
|
"learning_rate": 7.566437742609735e-06, |
|
"loss": 0.3199, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.48770777346471583, |
|
"grad_norm": 0.6189759969711304, |
|
"learning_rate": 7.561461132676421e-06, |
|
"loss": 0.373, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.48770777346471583, |
|
"eval_loss": 0.35428938269615173, |
|
"eval_runtime": 148.2777, |
|
"eval_samples_per_second": 1.369, |
|
"eval_steps_per_second": 0.688, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4887030954513785, |
|
"grad_norm": 0.71135413646698, |
|
"learning_rate": 7.556484522743108e-06, |
|
"loss": 0.3232, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.4896984174380412, |
|
"grad_norm": 0.5228835940361023, |
|
"learning_rate": 7.551507912809794e-06, |
|
"loss": 0.3428, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.4906937394247039, |
|
"grad_norm": 0.9015726447105408, |
|
"learning_rate": 7.546531302876481e-06, |
|
"loss": 0.3889, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.4916890614113666, |
|
"grad_norm": 0.8351202011108398, |
|
"learning_rate": 7.541554692943168e-06, |
|
"loss": 0.3367, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.4926843833980293, |
|
"grad_norm": 0.6578547954559326, |
|
"learning_rate": 7.536578083009855e-06, |
|
"loss": 0.3646, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.49367970538469197, |
|
"grad_norm": 1.1061774492263794, |
|
"learning_rate": 7.531601473076541e-06, |
|
"loss": 0.351, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.49467502737135466, |
|
"grad_norm": 0.636061429977417, |
|
"learning_rate": 7.526624863143227e-06, |
|
"loss": 0.3434, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.4956703493580173, |
|
"grad_norm": 0.6666164994239807, |
|
"learning_rate": 7.521648253209915e-06, |
|
"loss": 0.3462, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.49666567134468, |
|
"grad_norm": 0.8288053274154663, |
|
"learning_rate": 7.5166716432766e-06, |
|
"loss": 0.3862, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.49766099333134267, |
|
"grad_norm": 0.5653735399246216, |
|
"learning_rate": 7.511695033343287e-06, |
|
"loss": 0.3559, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.49766099333134267, |
|
"eval_loss": 0.35338979959487915, |
|
"eval_runtime": 148.2313, |
|
"eval_samples_per_second": 1.369, |
|
"eval_steps_per_second": 0.688, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.49865631531800536, |
|
"grad_norm": 1.083835482597351, |
|
"learning_rate": 7.506718423409973e-06, |
|
"loss": 0.3697, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.49965163730466805, |
|
"grad_norm": 0.7271355986595154, |
|
"learning_rate": 7.501741813476661e-06, |
|
"loss": 0.2915, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.5006469592913307, |
|
"grad_norm": 0.6525740027427673, |
|
"learning_rate": 7.496765203543347e-06, |
|
"loss": 0.3571, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.5016422812779935, |
|
"grad_norm": 1.00348961353302, |
|
"learning_rate": 7.4917885936100336e-06, |
|
"loss": 0.3254, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.5026376032646561, |
|
"grad_norm": 0.7707570195198059, |
|
"learning_rate": 7.486811983676721e-06, |
|
"loss": 0.3544, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.5036329252513188, |
|
"grad_norm": 0.7804340720176697, |
|
"learning_rate": 7.4818353737434065e-06, |
|
"loss": 0.3346, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.5046282472379815, |
|
"grad_norm": 1.0899609327316284, |
|
"learning_rate": 7.476858763810093e-06, |
|
"loss": 0.3296, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.5056235692246441, |
|
"grad_norm": 0.6863502264022827, |
|
"learning_rate": 7.471882153876779e-06, |
|
"loss": 0.3569, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.5066188912113069, |
|
"grad_norm": 1.15005362033844, |
|
"learning_rate": 7.466905543943467e-06, |
|
"loss": 0.2829, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 0.699102520942688, |
|
"learning_rate": 7.461928934010153e-06, |
|
"loss": 0.3727, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"eval_loss": 0.35266318917274475, |
|
"eval_runtime": 148.2339, |
|
"eval_samples_per_second": 1.369, |
|
"eval_steps_per_second": 0.688, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5086095351846323, |
|
"grad_norm": 0.9547719359397888, |
|
"learning_rate": 7.45695232407684e-06, |
|
"loss": 0.4042, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.5096048571712949, |
|
"grad_norm": 0.9959189891815186, |
|
"learning_rate": 7.451975714143525e-06, |
|
"loss": 0.3115, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.5106001791579576, |
|
"grad_norm": 0.6266285181045532, |
|
"learning_rate": 7.446999104210213e-06, |
|
"loss": 0.3485, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.5115955011446203, |
|
"grad_norm": 0.711664617061615, |
|
"learning_rate": 7.442022494276899e-06, |
|
"loss": 0.3699, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.512590823131283, |
|
"grad_norm": 1.0690807104110718, |
|
"learning_rate": 7.4370458843435855e-06, |
|
"loss": 0.3248, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.5135861451179456, |
|
"grad_norm": 1.2619460821151733, |
|
"learning_rate": 7.432069274410272e-06, |
|
"loss": 0.3284, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.5145814671046084, |
|
"grad_norm": 0.9510999917984009, |
|
"learning_rate": 7.427092664476959e-06, |
|
"loss": 0.3491, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.515576789091271, |
|
"grad_norm": 1.012990117073059, |
|
"learning_rate": 7.422116054543646e-06, |
|
"loss": 0.3659, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.5165721110779337, |
|
"grad_norm": 0.5469540953636169, |
|
"learning_rate": 7.417139444610332e-06, |
|
"loss": 0.2709, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.5175674330645964, |
|
"grad_norm": 0.6974226236343384, |
|
"learning_rate": 7.4121628346770195e-06, |
|
"loss": 0.3668, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5175674330645964, |
|
"eval_loss": 0.35165390372276306, |
|
"eval_runtime": 148.2087, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.688, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.518562755051259, |
|
"grad_norm": 0.8949996829032898, |
|
"learning_rate": 7.407186224743705e-06, |
|
"loss": 0.3305, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.5195580770379218, |
|
"grad_norm": 0.6786302328109741, |
|
"learning_rate": 7.4022096148103915e-06, |
|
"loss": 0.3312, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.5205533990245844, |
|
"grad_norm": 0.6699957251548767, |
|
"learning_rate": 7.397233004877078e-06, |
|
"loss": 0.3429, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.5215487210112472, |
|
"grad_norm": 0.5877237915992737, |
|
"learning_rate": 7.392256394943765e-06, |
|
"loss": 0.3214, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.5225440429979098, |
|
"grad_norm": 0.7005926966667175, |
|
"learning_rate": 7.387279785010452e-06, |
|
"loss": 0.3816, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.5235393649845725, |
|
"grad_norm": 0.7223731279373169, |
|
"learning_rate": 7.382303175077138e-06, |
|
"loss": 0.3773, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.5245346869712352, |
|
"grad_norm": 0.9617743492126465, |
|
"learning_rate": 7.377326565143824e-06, |
|
"loss": 0.3441, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.5255300089578979, |
|
"grad_norm": 0.6759951114654541, |
|
"learning_rate": 7.372349955210511e-06, |
|
"loss": 0.3464, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.5265253309445606, |
|
"grad_norm": 0.600290834903717, |
|
"learning_rate": 7.367373345277198e-06, |
|
"loss": 0.3202, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.5275206529312233, |
|
"grad_norm": 0.6212776899337769, |
|
"learning_rate": 7.362396735343884e-06, |
|
"loss": 0.3995, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.5275206529312233, |
|
"eval_loss": 0.35058361291885376, |
|
"eval_runtime": 148.2235, |
|
"eval_samples_per_second": 1.37, |
|
"eval_steps_per_second": 0.688, |
|
"step": 5300 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 20094, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.109336661739546e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|