|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.11943863839952225, |
|
"eval_steps": 100, |
|
"global_step": 1200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009953219866626853, |
|
"grad_norm": 1.912980556488037, |
|
"learning_rate": 9.995023390066686e-06, |
|
"loss": 1.8703, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0019906439733253707, |
|
"grad_norm": 1.866821050643921, |
|
"learning_rate": 9.990046780133374e-06, |
|
"loss": 1.8723, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002985965959988056, |
|
"grad_norm": 2.058809280395508, |
|
"learning_rate": 9.985070170200061e-06, |
|
"loss": 1.8097, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.003981287946650741, |
|
"grad_norm": 1.459013819694519, |
|
"learning_rate": 9.980093560266747e-06, |
|
"loss": 1.7456, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004976609933313427, |
|
"grad_norm": 0.9095586538314819, |
|
"learning_rate": 9.975116950333434e-06, |
|
"loss": 1.7195, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005971931919976112, |
|
"grad_norm": 1.1065226793289185, |
|
"learning_rate": 9.970140340400121e-06, |
|
"loss": 1.6502, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0069672539066387975, |
|
"grad_norm": 0.8301252126693726, |
|
"learning_rate": 9.965163730466807e-06, |
|
"loss": 1.5699, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.007962575893301483, |
|
"grad_norm": 1.0762828588485718, |
|
"learning_rate": 9.960187120533493e-06, |
|
"loss": 1.5072, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.008957897879964169, |
|
"grad_norm": 1.0814900398254395, |
|
"learning_rate": 9.95521051060018e-06, |
|
"loss": 1.4369, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.009953219866626855, |
|
"grad_norm": 1.3561326265335083, |
|
"learning_rate": 9.950233900666867e-06, |
|
"loss": 1.3467, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009953219866626855, |
|
"eval_loss": 1.2846794128417969, |
|
"eval_runtime": 147.6242, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.010948541853289539, |
|
"grad_norm": 1.438547968864441, |
|
"learning_rate": 9.945257290733553e-06, |
|
"loss": 1.2222, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.011943863839952225, |
|
"grad_norm": 1.402588963508606, |
|
"learning_rate": 9.94028068080024e-06, |
|
"loss": 1.1001, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.012939185826614909, |
|
"grad_norm": 1.4357985258102417, |
|
"learning_rate": 9.935304070866926e-06, |
|
"loss": 0.9657, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.013934507813277595, |
|
"grad_norm": 2.137953042984009, |
|
"learning_rate": 9.930327460933613e-06, |
|
"loss": 0.8211, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.014929829799940281, |
|
"grad_norm": 1.374299168586731, |
|
"learning_rate": 9.925350851000299e-06, |
|
"loss": 0.7142, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015925151786602965, |
|
"grad_norm": 1.1510456800460815, |
|
"learning_rate": 9.920374241066986e-06, |
|
"loss": 0.656, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01692047377326565, |
|
"grad_norm": 1.0226788520812988, |
|
"learning_rate": 9.915397631133673e-06, |
|
"loss": 0.6212, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.017915795759928337, |
|
"grad_norm": 0.9365411400794983, |
|
"learning_rate": 9.910421021200359e-06, |
|
"loss": 0.6069, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.018911117746591023, |
|
"grad_norm": 0.6880003213882446, |
|
"learning_rate": 9.905444411267046e-06, |
|
"loss": 0.6128, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01990643973325371, |
|
"grad_norm": 1.1190361976623535, |
|
"learning_rate": 9.900467801333732e-06, |
|
"loss": 0.5426, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01990643973325371, |
|
"eval_loss": 0.5788590908050537, |
|
"eval_runtime": 147.511, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02090176171991639, |
|
"grad_norm": 1.184279441833496, |
|
"learning_rate": 9.895491191400419e-06, |
|
"loss": 0.5887, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.021897083706579078, |
|
"grad_norm": 0.7627615928649902, |
|
"learning_rate": 9.890514581467106e-06, |
|
"loss": 0.5433, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.022892405693241764, |
|
"grad_norm": 0.7858164310455322, |
|
"learning_rate": 9.885537971533792e-06, |
|
"loss": 0.5843, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.02388772767990445, |
|
"grad_norm": 0.695697009563446, |
|
"learning_rate": 9.880561361600478e-06, |
|
"loss": 0.5365, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.024883049666567136, |
|
"grad_norm": 0.8994197845458984, |
|
"learning_rate": 9.875584751667165e-06, |
|
"loss": 0.5662, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.025878371653229818, |
|
"grad_norm": 0.8016309142112732, |
|
"learning_rate": 9.870608141733852e-06, |
|
"loss": 0.5592, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.026873693639892504, |
|
"grad_norm": 0.8534384369850159, |
|
"learning_rate": 9.865631531800538e-06, |
|
"loss": 0.5248, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.02786901562655519, |
|
"grad_norm": 0.9857029914855957, |
|
"learning_rate": 9.860654921867225e-06, |
|
"loss": 0.5294, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.028864337613217876, |
|
"grad_norm": 0.7766090631484985, |
|
"learning_rate": 9.855678311933912e-06, |
|
"loss": 0.5198, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.029859659599880562, |
|
"grad_norm": 0.6832401752471924, |
|
"learning_rate": 9.850701702000598e-06, |
|
"loss": 0.5844, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.029859659599880562, |
|
"eval_loss": 0.536589503288269, |
|
"eval_runtime": 147.4968, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.030854981586543248, |
|
"grad_norm": 0.7720848917961121, |
|
"learning_rate": 9.845725092067284e-06, |
|
"loss": 0.5365, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03185030357320593, |
|
"grad_norm": 0.7022100687026978, |
|
"learning_rate": 9.840748482133971e-06, |
|
"loss": 0.4841, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03284562555986862, |
|
"grad_norm": 1.0030310153961182, |
|
"learning_rate": 9.835771872200658e-06, |
|
"loss": 0.4635, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0338409475465313, |
|
"grad_norm": 0.8628882765769958, |
|
"learning_rate": 9.830795262267344e-06, |
|
"loss": 0.4932, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.034836269533193985, |
|
"grad_norm": 0.7178316712379456, |
|
"learning_rate": 9.825818652334031e-06, |
|
"loss": 0.6057, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.035831591519856675, |
|
"grad_norm": 0.9564626216888428, |
|
"learning_rate": 9.820842042400718e-06, |
|
"loss": 0.5371, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03682691350651936, |
|
"grad_norm": 0.7041760683059692, |
|
"learning_rate": 9.815865432467404e-06, |
|
"loss": 0.513, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.037822235493182046, |
|
"grad_norm": 1.0203750133514404, |
|
"learning_rate": 9.81088882253409e-06, |
|
"loss": 0.5118, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03881755747984473, |
|
"grad_norm": 0.8765382170677185, |
|
"learning_rate": 9.805912212600777e-06, |
|
"loss": 0.4529, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03981287946650742, |
|
"grad_norm": 0.9951983690261841, |
|
"learning_rate": 9.800935602667464e-06, |
|
"loss": 0.5336, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03981287946650742, |
|
"eval_loss": 0.5151349306106567, |
|
"eval_runtime": 147.6615, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0408082014531701, |
|
"grad_norm": 0.7691435813903809, |
|
"learning_rate": 9.79595899273415e-06, |
|
"loss": 0.506, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.04180352343983278, |
|
"grad_norm": 1.1955533027648926, |
|
"learning_rate": 9.790982382800837e-06, |
|
"loss": 0.4692, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04279884542649547, |
|
"grad_norm": 1.128085732460022, |
|
"learning_rate": 9.786005772867525e-06, |
|
"loss": 0.4608, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.043794167413158155, |
|
"grad_norm": 0.5518949627876282, |
|
"learning_rate": 9.78102916293421e-06, |
|
"loss": 0.5006, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.044789489399820845, |
|
"grad_norm": 0.7164484858512878, |
|
"learning_rate": 9.776052553000896e-06, |
|
"loss": 0.4996, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04578481138648353, |
|
"grad_norm": 0.5959630012512207, |
|
"learning_rate": 9.771075943067583e-06, |
|
"loss": 0.4843, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04678013337314621, |
|
"grad_norm": 0.743648111820221, |
|
"learning_rate": 9.76609933313427e-06, |
|
"loss": 0.4363, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0477754553598089, |
|
"grad_norm": 0.8757079243659973, |
|
"learning_rate": 9.761122723200956e-06, |
|
"loss": 0.4665, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04877077734647158, |
|
"grad_norm": 1.0122153759002686, |
|
"learning_rate": 9.756146113267643e-06, |
|
"loss": 0.492, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04976609933313427, |
|
"grad_norm": 0.6179729700088501, |
|
"learning_rate": 9.751169503334329e-06, |
|
"loss": 0.5022, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04976609933313427, |
|
"eval_loss": 0.4993921220302582, |
|
"eval_runtime": 147.7401, |
|
"eval_samples_per_second": 1.374, |
|
"eval_steps_per_second": 0.69, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.050761421319796954, |
|
"grad_norm": 0.952812671661377, |
|
"learning_rate": 9.746192893401016e-06, |
|
"loss": 0.4901, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.051756743306459636, |
|
"grad_norm": 0.6715916991233826, |
|
"learning_rate": 9.741216283467702e-06, |
|
"loss": 0.5055, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.052752065293122326, |
|
"grad_norm": 0.674640953540802, |
|
"learning_rate": 9.736239673534389e-06, |
|
"loss": 0.4874, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05374738727978501, |
|
"grad_norm": 0.7867962718009949, |
|
"learning_rate": 9.731263063601075e-06, |
|
"loss": 0.4956, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.0547427092664477, |
|
"grad_norm": 0.9035332202911377, |
|
"learning_rate": 9.726286453667762e-06, |
|
"loss": 0.499, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.05573803125311038, |
|
"grad_norm": 0.7009295225143433, |
|
"learning_rate": 9.72130984373445e-06, |
|
"loss": 0.5034, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05673335323977307, |
|
"grad_norm": 0.7018862366676331, |
|
"learning_rate": 9.716333233801135e-06, |
|
"loss": 0.5137, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05772867522643575, |
|
"grad_norm": 0.7812825441360474, |
|
"learning_rate": 9.711356623867822e-06, |
|
"loss": 0.4724, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.058723997213098435, |
|
"grad_norm": 0.6245225071907043, |
|
"learning_rate": 9.70638001393451e-06, |
|
"loss": 0.4446, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.059719319199761124, |
|
"grad_norm": 0.9083976149559021, |
|
"learning_rate": 9.701403404001195e-06, |
|
"loss": 0.4884, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.059719319199761124, |
|
"eval_loss": 0.4891846477985382, |
|
"eval_runtime": 147.5284, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06071464118642381, |
|
"grad_norm": 0.6195352673530579, |
|
"learning_rate": 9.69642679406788e-06, |
|
"loss": 0.5121, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.061709963173086496, |
|
"grad_norm": 0.8068727254867554, |
|
"learning_rate": 9.691450184134568e-06, |
|
"loss": 0.4689, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.06270528515974919, |
|
"grad_norm": 1.0427749156951904, |
|
"learning_rate": 9.686473574201255e-06, |
|
"loss": 0.4968, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.06370060714641186, |
|
"grad_norm": 0.698349118232727, |
|
"learning_rate": 9.681496964267941e-06, |
|
"loss": 0.4691, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.06469592913307455, |
|
"grad_norm": 0.9104384183883667, |
|
"learning_rate": 9.676520354334628e-06, |
|
"loss": 0.4775, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.06569125111973724, |
|
"grad_norm": 0.8729726076126099, |
|
"learning_rate": 9.671543744401316e-06, |
|
"loss": 0.5201, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06668657310639992, |
|
"grad_norm": 0.9858236908912659, |
|
"learning_rate": 9.666567134468001e-06, |
|
"loss": 0.4268, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.0676818950930626, |
|
"grad_norm": 2.322754383087158, |
|
"learning_rate": 9.661590524534687e-06, |
|
"loss": 0.4744, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.0686772170797253, |
|
"grad_norm": 0.9327623248100281, |
|
"learning_rate": 9.656613914601374e-06, |
|
"loss": 0.4355, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.06967253906638797, |
|
"grad_norm": 0.6949413418769836, |
|
"learning_rate": 9.651637304668062e-06, |
|
"loss": 0.465, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06967253906638797, |
|
"eval_loss": 0.4817120432853699, |
|
"eval_runtime": 147.5643, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07066786105305066, |
|
"grad_norm": 0.5208165049552917, |
|
"learning_rate": 9.646660694734747e-06, |
|
"loss": 0.4973, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.07166318303971335, |
|
"grad_norm": 0.8434884548187256, |
|
"learning_rate": 9.641684084801434e-06, |
|
"loss": 0.4721, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.07265850502637604, |
|
"grad_norm": 0.7161769866943359, |
|
"learning_rate": 9.636707474868122e-06, |
|
"loss": 0.498, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.07365382701303871, |
|
"grad_norm": 0.7036088705062866, |
|
"learning_rate": 9.631730864934807e-06, |
|
"loss": 0.4672, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.0746491489997014, |
|
"grad_norm": 0.9175013899803162, |
|
"learning_rate": 9.626754255001493e-06, |
|
"loss": 0.4781, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.07564447098636409, |
|
"grad_norm": 0.678519606590271, |
|
"learning_rate": 9.62177764506818e-06, |
|
"loss": 0.4048, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07663979297302677, |
|
"grad_norm": 0.6295528411865234, |
|
"learning_rate": 9.616801035134868e-06, |
|
"loss": 0.449, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.07763511495968946, |
|
"grad_norm": 0.5424385666847229, |
|
"learning_rate": 9.611824425201553e-06, |
|
"loss": 0.4394, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07863043694635215, |
|
"grad_norm": 0.508836030960083, |
|
"learning_rate": 9.60684781526824e-06, |
|
"loss": 0.4317, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.07962575893301484, |
|
"grad_norm": 0.6004147529602051, |
|
"learning_rate": 9.601871205334926e-06, |
|
"loss": 0.4308, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07962575893301484, |
|
"eval_loss": 0.47557342052459717, |
|
"eval_runtime": 147.5812, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08062108091967751, |
|
"grad_norm": 0.5553786754608154, |
|
"learning_rate": 9.596894595401613e-06, |
|
"loss": 0.4376, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0816164029063402, |
|
"grad_norm": 0.7254445552825928, |
|
"learning_rate": 9.591917985468299e-06, |
|
"loss": 0.4884, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.08261172489300289, |
|
"grad_norm": 0.7175013422966003, |
|
"learning_rate": 9.586941375534986e-06, |
|
"loss": 0.4167, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.08360704687966557, |
|
"grad_norm": 0.6464620232582092, |
|
"learning_rate": 9.581964765601674e-06, |
|
"loss": 0.4622, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.08460236886632826, |
|
"grad_norm": 0.6999176144599915, |
|
"learning_rate": 9.57698815566836e-06, |
|
"loss": 0.4708, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.08559769085299095, |
|
"grad_norm": 0.7939727306365967, |
|
"learning_rate": 9.572011545735047e-06, |
|
"loss": 0.4633, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.08659301283965362, |
|
"grad_norm": 0.473017156124115, |
|
"learning_rate": 9.567034935801732e-06, |
|
"loss": 0.4585, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.08758833482631631, |
|
"grad_norm": 0.7265183329582214, |
|
"learning_rate": 9.56205832586842e-06, |
|
"loss": 0.4485, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.088583656812979, |
|
"grad_norm": 0.539735734462738, |
|
"learning_rate": 9.557081715935105e-06, |
|
"loss": 0.475, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.08957897879964169, |
|
"grad_norm": 0.7587076425552368, |
|
"learning_rate": 9.552105106001792e-06, |
|
"loss": 0.4347, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08957897879964169, |
|
"eval_loss": 0.4690374732017517, |
|
"eval_runtime": 147.5672, |
|
"eval_samples_per_second": 1.376, |
|
"eval_steps_per_second": 0.691, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.09057430078630437, |
|
"grad_norm": 0.7549741864204407, |
|
"learning_rate": 9.547128496068478e-06, |
|
"loss": 0.4434, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.09156962277296705, |
|
"grad_norm": 0.686689555644989, |
|
"learning_rate": 9.542151886135165e-06, |
|
"loss": 0.4052, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.09256494475962974, |
|
"grad_norm": 1.02870512008667, |
|
"learning_rate": 9.537175276201853e-06, |
|
"loss": 0.4806, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.09356026674629242, |
|
"grad_norm": 0.7680675983428955, |
|
"learning_rate": 9.532198666268538e-06, |
|
"loss": 0.4609, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.09455558873295511, |
|
"grad_norm": 0.5478435754776001, |
|
"learning_rate": 9.527222056335224e-06, |
|
"loss": 0.4171, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0955509107196178, |
|
"grad_norm": 0.5974985361099243, |
|
"learning_rate": 9.522245446401913e-06, |
|
"loss": 0.4686, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.09654623270628049, |
|
"grad_norm": 0.997151792049408, |
|
"learning_rate": 9.517268836468598e-06, |
|
"loss": 0.4676, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.09754155469294316, |
|
"grad_norm": 0.6366075277328491, |
|
"learning_rate": 9.512292226535284e-06, |
|
"loss": 0.4467, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.09853687667960585, |
|
"grad_norm": 0.5682553052902222, |
|
"learning_rate": 9.507315616601971e-06, |
|
"loss": 0.4772, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.09953219866626854, |
|
"grad_norm": 0.5869882106781006, |
|
"learning_rate": 9.502339006668659e-06, |
|
"loss": 0.3976, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09953219866626854, |
|
"eval_loss": 0.46156319975852966, |
|
"eval_runtime": 147.6656, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10052752065293122, |
|
"grad_norm": 0.5758237838745117, |
|
"learning_rate": 9.497362396735344e-06, |
|
"loss": 0.4528, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 0.700281023979187, |
|
"learning_rate": 9.492385786802032e-06, |
|
"loss": 0.4545, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.1025181646262566, |
|
"grad_norm": 1.1320914030075073, |
|
"learning_rate": 9.487409176868719e-06, |
|
"loss": 0.4331, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.10351348661291927, |
|
"grad_norm": 0.6469867825508118, |
|
"learning_rate": 9.482432566935405e-06, |
|
"loss": 0.3759, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.10450880859958196, |
|
"grad_norm": 0.9471383094787598, |
|
"learning_rate": 9.47745595700209e-06, |
|
"loss": 0.4041, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.10550413058624465, |
|
"grad_norm": 0.5729160904884338, |
|
"learning_rate": 9.472479347068777e-06, |
|
"loss": 0.4871, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.10649945257290734, |
|
"grad_norm": 0.642436683177948, |
|
"learning_rate": 9.467502737135465e-06, |
|
"loss": 0.3893, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.10749477455957002, |
|
"grad_norm": 0.95659339427948, |
|
"learning_rate": 9.46252612720215e-06, |
|
"loss": 0.4486, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.1084900965462327, |
|
"grad_norm": 0.6642667055130005, |
|
"learning_rate": 9.457549517268838e-06, |
|
"loss": 0.5168, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.1094854185328954, |
|
"grad_norm": 0.5805796980857849, |
|
"learning_rate": 9.452572907335525e-06, |
|
"loss": 0.4019, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1094854185328954, |
|
"eval_loss": 0.4559178054332733, |
|
"eval_runtime": 147.5891, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.11048074051955807, |
|
"grad_norm": 0.7006909251213074, |
|
"learning_rate": 9.44759629740221e-06, |
|
"loss": 0.457, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.11147606250622076, |
|
"grad_norm": 1.1821540594100952, |
|
"learning_rate": 9.442619687468896e-06, |
|
"loss": 0.3484, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.11247138449288345, |
|
"grad_norm": 0.7232743501663208, |
|
"learning_rate": 9.437643077535584e-06, |
|
"loss": 0.417, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.11346670647954614, |
|
"grad_norm": 0.6104183197021484, |
|
"learning_rate": 9.43266646760227e-06, |
|
"loss": 0.4821, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.11446202846620881, |
|
"grad_norm": 0.5961386561393738, |
|
"learning_rate": 9.427689857668956e-06, |
|
"loss": 0.4834, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.1154573504528715, |
|
"grad_norm": 0.5530894994735718, |
|
"learning_rate": 9.422713247735644e-06, |
|
"loss": 0.443, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.1164526724395342, |
|
"grad_norm": 0.5148622393608093, |
|
"learning_rate": 9.41773663780233e-06, |
|
"loss": 0.4029, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.11744799442619687, |
|
"grad_norm": 0.6148583292961121, |
|
"learning_rate": 9.412760027869017e-06, |
|
"loss": 0.4308, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.11844331641285956, |
|
"grad_norm": 0.7840449213981628, |
|
"learning_rate": 9.407783417935702e-06, |
|
"loss": 0.499, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.11943863839952225, |
|
"grad_norm": 0.6757422089576721, |
|
"learning_rate": 9.40280680800239e-06, |
|
"loss": 0.4263, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11943863839952225, |
|
"eval_loss": 0.4505193829536438, |
|
"eval_runtime": 147.6664, |
|
"eval_samples_per_second": 1.375, |
|
"eval_steps_per_second": 0.691, |
|
"step": 1200 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 20094, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.756627798568276e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|