{
  "best_metric": 0.7685801386833191,
  "best_model_checkpoint": "./output/checkpoint-4800",
  "epoch": 0.6153654898060666,
  "eval_steps": 150,
  "global_step": 4950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012431626056688214,
      "grad_norm": 3.163724184036255,
      "learning_rate": 1.25e-05,
      "loss": 1.9463,
      "step": 10
    },
    {
      "epoch": 0.002486325211337643,
      "grad_norm": 3.8644485473632812,
      "learning_rate": 2.5e-05,
      "loss": 2.054,
      "step": 20
    },
    {
      "epoch": 0.0037294878170064643,
      "grad_norm": 2.5856993198394775,
      "learning_rate": 3.75e-05,
      "loss": 1.6479,
      "step": 30
    },
    {
      "epoch": 0.004972650422675286,
      "grad_norm": 3.1547975540161133,
      "learning_rate": 5e-05,
      "loss": 1.3341,
      "step": 40
    },
    {
      "epoch": 0.006215813028344107,
      "grad_norm": 2.9259073734283447,
      "learning_rate": 6.25e-05,
      "loss": 1.2614,
      "step": 50
    },
    {
      "epoch": 0.007458975634012929,
      "grad_norm": 2.072551727294922,
      "learning_rate": 7.5e-05,
      "loss": 1.2827,
      "step": 60
    },
    {
      "epoch": 0.00870213823968175,
      "grad_norm": 2.1959753036499023,
      "learning_rate": 8.75e-05,
      "loss": 1.1541,
      "step": 70
    },
    {
      "epoch": 0.009945300845350571,
      "grad_norm": 2.231361150741577,
      "learning_rate": 0.0001,
      "loss": 1.1048,
      "step": 80
    },
    {
      "epoch": 0.011188463451019393,
      "grad_norm": 1.8180471658706665,
      "learning_rate": 0.00011250000000000001,
      "loss": 1.1412,
      "step": 90
    },
    {
      "epoch": 0.012431626056688214,
      "grad_norm": 2.6011712551116943,
      "learning_rate": 0.000125,
      "loss": 0.9483,
      "step": 100
    },
    {
      "epoch": 0.013674788662357036,
      "grad_norm": 1.6959333419799805,
      "learning_rate": 0.00012499871543489787,
      "loss": 1.0523,
      "step": 110
    },
    {
      "epoch": 0.014917951268025857,
      "grad_norm": 2.7038402557373047,
      "learning_rate": 0.00012499486179239495,
      "loss": 1.0785,
      "step": 120
    },
    {
      "epoch": 0.01616111387369468,
      "grad_norm": 2.327219009399414,
      "learning_rate": 0.00012498843923089938,
      "loss": 1.028,
      "step": 130
    },
    {
      "epoch": 0.0174042764793635,
      "grad_norm": 2.2055954933166504,
      "learning_rate": 0.0001249794480144175,
      "loss": 0.9735,
      "step": 140
    },
    {
      "epoch": 0.018647439085032323,
      "grad_norm": 1.949844241142273,
      "learning_rate": 0.000124967888512543,
      "loss": 1.0077,
      "step": 150
    },
    {
      "epoch": 0.018647439085032323,
      "eval_loss": 1.273397445678711,
      "eval_runtime": 54.5001,
      "eval_samples_per_second": 9.174,
      "eval_steps_per_second": 9.174,
      "step": 150
    },
    {
      "epoch": 0.019890601690701143,
      "grad_norm": 2.5000596046447754,
      "learning_rate": 0.00012495376120044173,
      "loss": 0.8882,
      "step": 160
    },
    {
      "epoch": 0.021133764296369966,
      "grad_norm": 1.6136529445648193,
      "learning_rate": 0.00012493706665883217,
      "loss": 0.9579,
      "step": 170
    },
    {
      "epoch": 0.022376926902038786,
      "grad_norm": 2.2783761024475098,
      "learning_rate": 0.00012491780557396154,
      "loss": 1.0162,
      "step": 180
    },
    {
      "epoch": 0.02362008950770761,
      "grad_norm": 2.086787700653076,
      "learning_rate": 0.00012489597873757756,
      "loss": 0.9505,
      "step": 190
    },
    {
      "epoch": 0.02486325211337643,
      "grad_norm": 2.1386966705322266,
      "learning_rate": 0.00012487158704689602,
      "loss": 0.8124,
      "step": 200
    },
    {
      "epoch": 0.026106414719045252,
      "grad_norm": 2.209433078765869,
      "learning_rate": 0.0001248446315045638,
      "loss": 0.9005,
      "step": 210
    },
    {
      "epoch": 0.02734957732471407,
      "grad_norm": 1.7136540412902832,
      "learning_rate": 0.00012481511321861763,
      "loss": 0.8635,
      "step": 220
    },
    {
      "epoch": 0.028592739930382895,
      "grad_norm": 1.6844110488891602,
      "learning_rate": 0.00012478303340243864,
      "loss": 0.9517,
      "step": 230
    },
    {
      "epoch": 0.029835902536051714,
      "grad_norm": 1.9923568964004517,
      "learning_rate": 0.00012474839337470246,
      "loss": 0.8701,
      "step": 240
    },
    {
      "epoch": 0.031079065141720538,
      "grad_norm": 1.8591444492340088,
      "learning_rate": 0.0001247111945593249,
      "loss": 0.9421,
      "step": 250
    },
    {
      "epoch": 0.03232222774738936,
      "grad_norm": 1.571029543876648,
      "learning_rate": 0.00012467143848540359,
      "loss": 0.9207,
      "step": 260
    },
    {
      "epoch": 0.03356539035305818,
      "grad_norm": 2.1113922595977783,
      "learning_rate": 0.000124629126787155,
      "loss": 0.7869,
      "step": 270
    },
    {
      "epoch": 0.034808552958727,
      "grad_norm": 2.252993106842041,
      "learning_rate": 0.00012458426120384738,
      "loss": 0.9157,
      "step": 280
    },
    {
      "epoch": 0.03605171556439582,
      "grad_norm": 1.6023743152618408,
      "learning_rate": 0.00012453684357972906,
      "loss": 0.8904,
      "step": 290
    },
    {
      "epoch": 0.037294878170064646,
      "grad_norm": 1.2541028261184692,
      "learning_rate": 0.00012448687586395289,
      "loss": 0.855,
      "step": 300
    },
    {
      "epoch": 0.037294878170064646,
      "eval_loss": 1.1304936408996582,
      "eval_runtime": 54.356,
      "eval_samples_per_second": 9.199,
      "eval_steps_per_second": 9.199,
      "step": 300
    },
    {
      "epoch": 0.038538040775733466,
      "grad_norm": 2.082008123397827,
      "learning_rate": 0.00012443436011049593,
      "loss": 1.0134,
      "step": 310
    },
    {
      "epoch": 0.039781203381402286,
      "grad_norm": 1.3970550298690796,
      "learning_rate": 0.0001243792984780751,
      "loss": 0.8233,
      "step": 320
    },
    {
      "epoch": 0.041024365987071106,
      "grad_norm": 1.6384402513504028,
      "learning_rate": 0.00012432169323005853,
      "loss": 0.7816,
      "step": 330
    },
    {
      "epoch": 0.04226752859273993,
      "grad_norm": 2.04970645904541,
      "learning_rate": 0.00012426154673437223,
      "loss": 0.8563,
      "step": 340
    },
    {
      "epoch": 0.04351069119840875,
      "grad_norm": 1.8425785303115845,
      "learning_rate": 0.00012419886146340314,
      "loss": 0.8697,
      "step": 350
    },
    {
      "epoch": 0.04475385380407757,
      "grad_norm": 1.3543235063552856,
      "learning_rate": 0.0001241336399938972,
      "loss": 0.8055,
      "step": 360
    },
    {
      "epoch": 0.0459970164097464,
      "grad_norm": 1.8363412618637085,
      "learning_rate": 0.00012406588500685355,
      "loss": 0.7834,
      "step": 370
    },
    {
      "epoch": 0.04724017901541522,
      "grad_norm": 1.7371594905853271,
      "learning_rate": 0.00012399559928741435,
      "loss": 0.8563,
      "step": 380
    },
    {
      "epoch": 0.04848334162108404,
      "grad_norm": 1.7979482412338257,
      "learning_rate": 0.00012392278572475023,
      "loss": 0.9205,
      "step": 390
    },
    {
      "epoch": 0.04972650422675286,
      "grad_norm": 1.9040799140930176,
      "learning_rate": 0.0001238474473119416,
      "loss": 0.7881,
      "step": 400
    },
    {
      "epoch": 0.050969666832421684,
      "grad_norm": 1.9962204694747925,
      "learning_rate": 0.00012376958714585545,
      "loss": 0.8942,
      "step": 410
    },
    {
      "epoch": 0.052212829438090504,
      "grad_norm": 1.4779837131500244,
      "learning_rate": 0.0001236892084270183,
      "loss": 0.8128,
      "step": 420
    },
    {
      "epoch": 0.05345599204375932,
      "grad_norm": 1.712457299232483,
      "learning_rate": 0.00012360631445948448,
      "loss": 0.8101,
      "step": 430
    },
    {
      "epoch": 0.05469915464942814,
      "grad_norm": 1.750348448753357,
      "learning_rate": 0.00012352090865070026,
      "loss": 0.8743,
      "step": 440
    },
    {
      "epoch": 0.05594231725509697,
      "grad_norm": 2.143127202987671,
      "learning_rate": 0.00012343299451136397,
      "loss": 0.8286,
      "step": 450
    },
    {
      "epoch": 0.05594231725509697,
      "eval_loss": 1.080597162246704,
      "eval_runtime": 56.9902,
      "eval_samples_per_second": 8.773,
      "eval_steps_per_second": 8.773,
      "step": 450
    },
    {
      "epoch": 0.05718547986076579,
      "grad_norm": 1.626365065574646,
      "learning_rate": 0.00012334257565528155,
      "loss": 0.8354,
      "step": 460
    },
    {
      "epoch": 0.05842864246643461,
      "grad_norm": 1.8055260181427002,
      "learning_rate": 0.000123249655799218,
      "loss": 0.801,
      "step": 470
    },
    {
      "epoch": 0.05967180507210343,
      "grad_norm": 1.6423254013061523,
      "learning_rate": 0.00012315423876274468,
      "loss": 0.752,
      "step": 480
    },
    {
      "epoch": 0.060914967677772255,
      "grad_norm": 1.7773467302322388,
      "learning_rate": 0.0001230563284680822,
      "loss": 0.8418,
      "step": 490
    },
    {
      "epoch": 0.062158130283441075,
      "grad_norm": 1.448588490486145,
      "learning_rate": 0.00012295592893993935,
      "loss": 0.8632,
      "step": 500
    },
    {
      "epoch": 0.0634012928891099,
      "grad_norm": 1.3413039445877075,
      "learning_rate": 0.00012285304430534745,
      "loss": 0.7983,
      "step": 510
    },
    {
      "epoch": 0.06464445549477872,
      "grad_norm": 1.3094663619995117,
      "learning_rate": 0.00012274767879349083,
      "loss": 0.7309,
      "step": 520
    },
    {
      "epoch": 0.06588761810044753,
      "grad_norm": 1.427276611328125,
      "learning_rate": 0.00012263983673553306,
      "loss": 0.8253,
      "step": 530
    },
    {
      "epoch": 0.06713078070611636,
      "grad_norm": 1.9115201234817505,
      "learning_rate": 0.0001225295225644387,
      "loss": 0.7826,
      "step": 540
    },
    {
      "epoch": 0.06837394331178519,
      "grad_norm": 1.8499016761779785,
      "learning_rate": 0.0001224167408147913,
      "loss": 0.7889,
      "step": 550
    },
    {
      "epoch": 0.069617105917454,
      "grad_norm": 1.753057837486267,
      "learning_rate": 0.0001223014961226068,
      "loss": 0.8468,
      "step": 560
    },
    {
      "epoch": 0.07086026852312283,
      "grad_norm": 1.4970792531967163,
      "learning_rate": 0.00012218379322514317,
      "loss": 0.8043,
      "step": 570
    },
    {
      "epoch": 0.07210343112879164,
      "grad_norm": 1.355022668838501,
      "learning_rate": 0.00012206363696070545,
      "loss": 0.7553,
      "step": 580
    },
    {
      "epoch": 0.07334659373446047,
      "grad_norm": 1.5447806119918823,
      "learning_rate": 0.0001219410322684471,
      "loss": 0.7947,
      "step": 590
    },
    {
      "epoch": 0.07458975634012929,
      "grad_norm": 1.6613497734069824,
      "learning_rate": 0.0001218159841881668,
      "loss": 0.7285,
      "step": 600
    },
    {
      "epoch": 0.07458975634012929,
      "eval_loss": 1.035021185874939,
      "eval_runtime": 56.3558,
      "eval_samples_per_second": 8.872,
      "eval_steps_per_second": 8.872,
      "step": 600
    },
    {
      "epoch": 0.0758329189457981,
      "grad_norm": 1.778586506843567,
      "learning_rate": 0.00012168849786010133,
      "loss": 0.7333,
      "step": 610
    },
    {
      "epoch": 0.07707608155146693,
      "grad_norm": 1.4905791282653809,
      "learning_rate": 0.00012155857852471433,
      "loss": 0.7691,
      "step": 620
    },
    {
      "epoch": 0.07831924415713576,
      "grad_norm": 1.5982129573822021,
      "learning_rate": 0.0001214262315224808,
      "loss": 0.7232,
      "step": 630
    },
    {
      "epoch": 0.07956240676280457,
      "grad_norm": 2.0250298976898193,
      "learning_rate": 0.00012129146229366766,
      "loss": 0.7468,
      "step": 640
    },
    {
      "epoch": 0.0808055693684734,
      "grad_norm": 1.4550626277923584,
      "learning_rate": 0.00012115427637811003,
      "loss": 0.7719,
      "step": 650
    },
    {
      "epoch": 0.08204873197414221,
      "grad_norm": 1.4702717065811157,
      "learning_rate": 0.00012101467941498357,
      "loss": 0.8111,
      "step": 660
    },
    {
      "epoch": 0.08329189457981104,
      "grad_norm": 1.379180908203125,
      "learning_rate": 0.0001208726771425727,
      "loss": 0.7651,
      "step": 670
    },
    {
      "epoch": 0.08453505718547986,
      "grad_norm": 1.971276044845581,
      "learning_rate": 0.00012072827539803463,
      "loss": 0.8005,
      "step": 680
    },
    {
      "epoch": 0.08577821979114868,
      "grad_norm": 1.7520999908447266,
      "learning_rate": 0.00012058148011715949,
      "loss": 0.6388,
      "step": 690
    },
    {
      "epoch": 0.0870213823968175,
      "grad_norm": 7.61739444732666,
      "learning_rate": 0.00012043229733412636,
      "loss": 0.8455,
      "step": 700
    },
    {
      "epoch": 0.08826454500248633,
      "grad_norm": 1.2994366884231567,
      "learning_rate": 0.0001202807331812551,
      "loss": 0.7365,
      "step": 710
    },
    {
      "epoch": 0.08950770760815514,
      "grad_norm": 1.3976930379867554,
      "learning_rate": 0.00012012679388875441,
      "loss": 0.7674,
      "step": 720
    },
    {
      "epoch": 0.09075087021382397,
      "grad_norm": 1.8187024593353271,
      "learning_rate": 0.00011997048578446568,
      "loss": 0.8233,
      "step": 730
    },
    {
      "epoch": 0.0919940328194928,
      "grad_norm": 1.5645136833190918,
      "learning_rate": 0.00011981181529360282,
      "loss": 0.7395,
      "step": 740
    },
    {
      "epoch": 0.09323719542516161,
      "grad_norm": 2.0647048950195312,
      "learning_rate": 0.00011965078893848828,
      "loss": 0.7704,
      "step": 750
    },
    {
      "epoch": 0.09323719542516161,
      "eval_loss": 0.9952675700187683,
      "eval_runtime": 55.6428,
      "eval_samples_per_second": 8.986,
      "eval_steps_per_second": 8.986,
      "step": 750
    },
    {
      "epoch": 0.09448035803083044,
      "grad_norm": 1.6342475414276123,
      "learning_rate": 0.00011948741333828481,
      "loss": 0.7924,
      "step": 760
    },
    {
      "epoch": 0.09572352063649925,
      "grad_norm": 1.773032307624817,
      "learning_rate": 0.00011932169520872344,
      "loss": 0.7737,
      "step": 770
    },
    {
      "epoch": 0.09696668324216808,
      "grad_norm": 1.3042093515396118,
      "learning_rate": 0.00011915364136182738,
      "loss": 0.7554,
      "step": 780
    },
    {
      "epoch": 0.0982098458478369,
      "grad_norm": 1.9383268356323242,
      "learning_rate": 0.0001189832587056321,
      "loss": 0.793,
      "step": 790
    },
    {
      "epoch": 0.09945300845350571,
      "grad_norm": 0.9794479608535767,
      "learning_rate": 0.00011881055424390119,
      "loss": 0.7059,
      "step": 800
    },
    {
      "epoch": 0.10069617105917454,
      "grad_norm": 1.8000845909118652,
      "learning_rate": 0.00011863553507583869,
      "loss": 0.7037,
      "step": 810
    },
    {
      "epoch": 0.10193933366484337,
      "grad_norm": 1.6130144596099854,
      "learning_rate": 0.00011845820839579708,
      "loss": 0.9073,
      "step": 820
    },
    {
      "epoch": 0.10318249627051218,
      "grad_norm": 1.7219104766845703,
      "learning_rate": 0.00011827858149298162,
      "loss": 0.7089,
      "step": 830
    },
    {
      "epoch": 0.10442565887618101,
      "grad_norm": 1.8444559574127197,
      "learning_rate": 0.00011809666175115075,
      "loss": 0.6576,
      "step": 840
    },
    {
      "epoch": 0.10566882148184982,
      "grad_norm": 2.564314126968384,
      "learning_rate": 0.00011791245664831251,
      "loss": 0.7686,
      "step": 850
    },
    {
      "epoch": 0.10691198408751865,
      "grad_norm": 1.1209467649459839,
      "learning_rate": 0.0001177259737564172,
      "loss": 0.7148,
      "step": 860
    },
    {
      "epoch": 0.10815514669318747,
      "grad_norm": 1.5384615659713745,
      "learning_rate": 0.00011753722074104613,
      "loss": 0.7689,
      "step": 870
    },
    {
      "epoch": 0.10939830929885629,
      "grad_norm": 1.2957892417907715,
      "learning_rate": 0.00011734620536109644,
      "loss": 0.7372,
      "step": 880
    },
    {
      "epoch": 0.11064147190452511,
      "grad_norm": 31.243797302246094,
      "learning_rate": 0.00011715293546846223,
      "loss": 0.725,
      "step": 890
    },
    {
      "epoch": 0.11188463451019394,
      "grad_norm": 1.338306188583374,
      "learning_rate": 0.00011695741900771184,
      "loss": 0.6768,
      "step": 900
    },
    {
      "epoch": 0.11188463451019394,
      "eval_loss": 0.9783537983894348,
      "eval_runtime": 57.0411,
      "eval_samples_per_second": 8.766,
      "eval_steps_per_second": 8.766,
      "step": 900
    },
    {
      "epoch": 0.11312779711586275,
      "grad_norm": 1.0017350912094116,
      "learning_rate": 0.00011675966401576116,
      "loss": 0.71,
      "step": 910
    },
    {
      "epoch": 0.11437095972153158,
      "grad_norm": 1.6244183778762817,
      "learning_rate": 0.00011655967862154335,
      "loss": 0.7,
      "step": 920
    },
    {
      "epoch": 0.11561412232720039,
      "grad_norm": 1.6065330505371094,
      "learning_rate": 0.0001163574710456747,
      "loss": 0.7206,
      "step": 930
    },
    {
      "epoch": 0.11685728493286922,
      "grad_norm": 1.042508602142334,
      "learning_rate": 0.00011615304960011663,
      "loss": 0.6664,
      "step": 940
    },
    {
      "epoch": 0.11810044753853804,
      "grad_norm": 1.2049801349639893,
      "learning_rate": 0.00011594642268783415,
      "loss": 0.7894,
      "step": 950
    },
    {
      "epoch": 0.11934361014420686,
      "grad_norm": 1.6910526752471924,
      "learning_rate": 0.00011573759880245027,
      "loss": 0.7269,
      "step": 960
    },
    {
      "epoch": 0.12058677274987568,
      "grad_norm": 2.0689308643341064,
      "learning_rate": 0.00011552658652789703,
      "loss": 0.7058,
      "step": 970
    },
    {
      "epoch": 0.12182993535554451,
      "grad_norm": 1.326037883758545,
      "learning_rate": 0.00011531339453806258,
      "loss": 0.6103,
      "step": 980
    },
    {
      "epoch": 0.12307309796121332,
      "grad_norm": 1.0418630838394165,
      "learning_rate": 0.00011509803159643458,
      "loss": 0.741,
      "step": 990
    },
    {
      "epoch": 0.12431626056688215,
      "grad_norm": 1.9915788173675537,
      "learning_rate": 0.00011488050655574003,
      "loss": 0.6941,
      "step": 1000
    },
    {
      "epoch": 0.12555942317255098,
      "grad_norm": 1.4139434099197388,
      "learning_rate": 0.00011466082835758141,
      "loss": 0.79,
      "step": 1010
    },
    {
      "epoch": 0.1268025857782198,
      "grad_norm": 2.1607091426849365,
      "learning_rate": 0.000114439006032069,
      "loss": 0.7007,
      "step": 1020
    },
    {
      "epoch": 0.1280457483838886,
      "grad_norm": 1.401469111442566,
      "learning_rate": 0.00011421504869744978,
      "loss": 0.6854,
      "step": 1030
    },
    {
      "epoch": 0.12928891098955744,
      "grad_norm": 1.8763591051101685,
      "learning_rate": 0.0001139889655597326,
      "loss": 0.7929,
      "step": 1040
    },
    {
      "epoch": 0.13053207359522626,
      "grad_norm": 1.3231312036514282,
      "learning_rate": 0.00011376076591230974,
      "loss": 0.834,
      "step": 1050
    },
    {
      "epoch": 0.13053207359522626,
      "eval_loss": 0.9482272267341614,
      "eval_runtime": 54.6775,
      "eval_samples_per_second": 9.145,
      "eval_steps_per_second": 9.145,
      "step": 1050
    },
    {
      "epoch": 0.13177523620089507,
      "grad_norm": 1.7060565948486328,
      "learning_rate": 0.00011353045913557492,
      "loss": 0.737,
      "step": 1060
    },
    {
      "epoch": 0.1330183988065639,
      "grad_norm": 1.3965402841567993,
      "learning_rate": 0.00011329805469653768,
      "loss": 0.7084,
      "step": 1070
    },
    {
      "epoch": 0.13426156141223272,
      "grad_norm": 1.6514191627502441,
      "learning_rate": 0.00011306356214843422,
      "loss": 0.6474,
      "step": 1080
    },
    {
      "epoch": 0.13550472401790153,
      "grad_norm": 1.554608702659607,
      "learning_rate": 0.00011282699113033477,
      "loss": 0.6597,
      "step": 1090
    },
    {
      "epoch": 0.13674788662357037,
      "grad_norm": 1.4712316989898682,
      "learning_rate": 0.00011258835136674729,
      "loss": 0.6732,
      "step": 1100
    },
    {
      "epoch": 0.1379910492292392,
      "grad_norm": 1.4330312013626099,
      "learning_rate": 0.00011234765266721778,
      "loss": 0.7318,
      "step": 1110
    },
    {
      "epoch": 0.139234211834908,
      "grad_norm": 1.700324296951294,
      "learning_rate": 0.00011210490492592703,
      "loss": 0.711,
      "step": 1120
    },
    {
      "epoch": 0.14047737444057684,
      "grad_norm": 1.3977235555648804,
      "learning_rate": 0.0001118601181212839,
      "loss": 0.7372,
      "step": 1130
    },
    {
      "epoch": 0.14172053704624565,
      "grad_norm": 1.6782617568969727,
      "learning_rate": 0.00011161330231551515,
      "loss": 0.6542,
      "step": 1140
    },
    {
      "epoch": 0.14296369965191447,
      "grad_norm": 1.953229546546936,
      "learning_rate": 0.00011136446765425187,
      "loss": 0.7917,
      "step": 1150
    },
    {
      "epoch": 0.14420686225758328,
      "grad_norm": 1.23630690574646,
      "learning_rate": 0.00011111362436611234,
      "loss": 0.7396,
      "step": 1160
    },
    {
      "epoch": 0.14545002486325212,
      "grad_norm": 1.2555770874023438,
      "learning_rate": 0.00011086078276228167,
      "loss": 0.6994,
      "step": 1170
    },
    {
      "epoch": 0.14669318746892093,
      "grad_norm": 1.9462257623672485,
      "learning_rate": 0.00011060595323608789,
      "loss": 0.7422,
      "step": 1180
    },
    {
      "epoch": 0.14793635007458975,
      "grad_norm": 1.3382525444030762,
      "learning_rate": 0.00011034914626257467,
      "loss": 0.6999,
      "step": 1190
    },
    {
      "epoch": 0.14917951268025859,
      "grad_norm": 1.6739270687103271,
      "learning_rate": 0.0001100903723980709,
      "loss": 0.684,
      "step": 1200
    },
    {
      "epoch": 0.14917951268025859,
      "eval_loss": 0.9281342625617981,
      "eval_runtime": 53.7337,
      "eval_samples_per_second": 9.305,
      "eval_steps_per_second": 9.305,
      "step": 1200
    },
    {
      "epoch": 0.1504226752859274,
      "grad_norm": 1.2842893600463867,
      "learning_rate": 0.00010982964227975658,
      "loss": 0.7362,
      "step": 1210
    },
    {
      "epoch": 0.1516658378915962,
      "grad_norm": 0.9873863458633423,
      "learning_rate": 0.00010956696662522569,
      "loss": 0.7374,
      "step": 1220
    },
    {
      "epoch": 0.15290900049726505,
      "grad_norm": 1.2325960397720337,
      "learning_rate": 0.00010930235623204551,
      "loss": 0.6258,
      "step": 1230
    },
    {
      "epoch": 0.15415216310293386,
      "grad_norm": 1.4576919078826904,
      "learning_rate": 0.00010903582197731294,
      "loss": 0.7175,
      "step": 1240
    },
    {
      "epoch": 0.15539532570860268,
      "grad_norm": 1.1568377017974854,
      "learning_rate": 0.00010876737481720722,
      "loss": 0.606,
      "step": 1250
    },
    {
      "epoch": 0.15663848831427152,
      "grad_norm": 1.2811295986175537,
      "learning_rate": 0.0001084970257865397,
      "loss": 0.6926,
      "step": 1260
    },
    {
      "epoch": 0.15788165091994033,
      "grad_norm": 1.1385823488235474,
      "learning_rate": 0.00010822478599830008,
      "loss": 0.6335,
      "step": 1270
    },
    {
      "epoch": 0.15912481352560914,
      "grad_norm": 1.5843505859375,
      "learning_rate": 0.00010795066664319983,
      "loss": 0.7607,
      "step": 1280
    },
    {
      "epoch": 0.16036797613127798,
      "grad_norm": 1.7240452766418457,
      "learning_rate": 0.00010767467898921197,
      "loss": 0.6379,
      "step": 1290
    },
    {
      "epoch": 0.1616111387369468,
      "grad_norm": 1.7185726165771484,
      "learning_rate": 0.00010739683438110797,
      "loss": 0.6669,
      "step": 1300
    },
    {
      "epoch": 0.1628543013426156,
      "grad_norm": 1.4889787435531616,
      "learning_rate": 0.00010711714423999145,
      "loss": 0.6905,
      "step": 1310
    },
    {
      "epoch": 0.16409746394828442,
      "grad_norm": 1.56671142578125,
      "learning_rate": 0.00010683562006282861,
      "loss": 0.6471,
      "step": 1320
    },
    {
      "epoch": 0.16534062655395326,
      "grad_norm": 1.1389793157577515,
      "learning_rate": 0.00010655227342197574,
      "loss": 0.6481,
      "step": 1330
    },
    {
      "epoch": 0.16658378915962208,
      "grad_norm": 1.2167776823043823,
      "learning_rate": 0.00010626711596470343,
      "loss": 0.7018,
      "step": 1340
    },
    {
      "epoch": 0.1678269517652909,
      "grad_norm": 1.3152663707733154,
      "learning_rate": 0.0001059801594127179,
      "loss": 0.6356,
      "step": 1350
    },
    {
      "epoch": 0.1678269517652909,
      "eval_loss": 0.9194591045379639,
      "eval_runtime": 53.4488,
      "eval_samples_per_second": 9.355,
      "eval_steps_per_second": 9.355,
      "step": 1350
    },
    {
      "epoch": 0.16907011437095973,
      "grad_norm": 1.5011839866638184,
      "learning_rate": 0.00010569141556167905,
      "loss": 0.6192,
      "step": 1360
    },
    {
      "epoch": 0.17031327697662854,
      "grad_norm": 2.1147801876068115,
      "learning_rate": 0.00010540089628071566,
      "loss": 0.6289,
      "step": 1370
    },
    {
      "epoch": 0.17155643958229735,
      "grad_norm": 1.8639715909957886,
      "learning_rate": 0.00010510861351193747,
      "loss": 0.6891,
      "step": 1380
    },
    {
      "epoch": 0.1727996021879662,
      "grad_norm": 1.4501938819885254,
      "learning_rate": 0.00010481457926994435,
      "loss": 0.7117,
      "step": 1390
    },
    {
      "epoch": 0.174042764793635,
      "grad_norm": 1.1600079536437988,
      "learning_rate": 0.0001045188056413323,
      "loss": 0.652,
      "step": 1400
    },
    {
      "epoch": 0.17528592739930382,
      "grad_norm": 1.4674696922302246,
      "learning_rate": 0.00010422130478419676,
      "loss": 0.7558,
      "step": 1410
    },
    {
      "epoch": 0.17652909000497266,
      "grad_norm": 1.767659068107605,
      "learning_rate": 0.00010392208892763269,
      "loss": 0.6438,
      "step": 1420
    },
    {
      "epoch": 0.17777225261064147,
      "grad_norm": 1.4168020486831665,
      "learning_rate": 0.00010362117037123204,
      "loss": 0.6173,
      "step": 1430
    },
    {
      "epoch": 0.1790154152163103,
      "grad_norm": 1.6601725816726685,
      "learning_rate": 0.00010331856148457803,
      "loss": 0.68,
      "step": 1440
    },
    {
      "epoch": 0.18025857782197913,
      "grad_norm": 1.2710611820220947,
      "learning_rate": 0.00010301427470673678,
      "loss": 0.6924,
      "step": 1450
    },
    {
      "epoch": 0.18150174042764794,
      "grad_norm": 2.1576950550079346,
      "learning_rate": 0.00010270832254574588,
      "loss": 0.6917,
      "step": 1460
    },
    {
      "epoch": 0.18274490303331675,
      "grad_norm": 1.6391758918762207,
      "learning_rate": 0.00010240071757810036,
      "loss": 0.6717,
      "step": 1470
    },
    {
      "epoch": 0.1839880656389856,
      "grad_norm": 1.4594990015029907,
      "learning_rate": 0.00010209147244823564,
      "loss": 0.7148,
      "step": 1480
    },
    {
      "epoch": 0.1852312282446544,
      "grad_norm": 1.277106523513794,
      "learning_rate": 0.00010178059986800773,
      "loss": 0.6752,
      "step": 1490
    },
    {
      "epoch": 0.18647439085032322,
      "grad_norm": 1.34278404712677,
      "learning_rate": 0.00010146811261617085,
      "loss": 0.7066,
      "step": 1500
    },
    {
      "epoch": 0.18647439085032322,
      "eval_loss": 0.897224485874176,
      "eval_runtime": 55.354,
      "eval_samples_per_second": 9.033,
      "eval_steps_per_second": 9.033,
      "step": 1500
    },
    {
      "epoch": 0.18771755345599203,
      "grad_norm": 1.5729053020477295,
      "learning_rate": 0.00010115402353785197,
      "loss": 0.6708,
      "step": 1510
    },
    {
      "epoch": 0.18896071606166087,
      "grad_norm": 1.7002263069152832,
      "learning_rate": 0.00010083834554402292,
      "loss": 0.6601,
      "step": 1520
    },
    {
      "epoch": 0.19020387866732968,
      "grad_norm": 1.1837941408157349,
      "learning_rate": 0.00010052109161096958,
      "loss": 0.623,
      "step": 1530
    },
    {
      "epoch": 0.1914470412729985,
      "grad_norm": 1.0333565473556519,
      "learning_rate": 0.00010020227477975852,
      "loss": 0.6087,
      "step": 1540
    },
    {
      "epoch": 0.19269020387866734,
      "grad_norm": 1.9234379529953003,
      "learning_rate": 9.9881908155701e-05,
      "loss": 0.7849,
      "step": 1550
    },
    {
      "epoch": 0.19393336648433615,
      "grad_norm": 1.6405690908432007,
      "learning_rate": 9.956000490781411e-05,
      "loss": 0.6642,
      "step": 1560
    },
    {
      "epoch": 0.19517652909000496,
      "grad_norm": 1.6141093969345093,
      "learning_rate": 9.923657826827957e-05,
      "loss": 0.7581,
      "step": 1570
    },
    {
      "epoch": 0.1964196916956738,
      "grad_norm": 1.091253399848938,
      "learning_rate": 9.891164153189976e-05,
      "loss": 0.7226,
      "step": 1580
    },
    {
      "epoch": 0.19766285430134262,
      "grad_norm": 1.7699875831604004,
      "learning_rate": 9.858520805555123e-05,
      "loss": 0.8105,
      "step": 1590
    },
    {
      "epoch": 0.19890601690701143,
      "grad_norm": 1.3844791650772095,
      "learning_rate": 9.825729125763561e-05,
      "loss": 0.6536,
      "step": 1600
    },
    {
      "epoch": 0.20014917951268027,
      "grad_norm": 1.7255157232284546,
      "learning_rate": 9.792790461752813e-05,
      "loss": 0.8065,
      "step": 1610
    },
    {
      "epoch": 0.20139234211834908,
      "grad_norm": 1.0022765398025513,
      "learning_rate": 9.759706167502343e-05,
      "loss": 0.5987,
      "step": 1620
    },
    {
      "epoch": 0.2026355047240179,
      "grad_norm": 1.8386592864990234,
      "learning_rate": 9.726477602977905e-05,
      "loss": 0.8187,
      "step": 1630
    },
    {
      "epoch": 0.20387866732968674,
      "grad_norm": 1.4216880798339844,
      "learning_rate": 9.69310613407564e-05,
      "loss": 0.6245,
      "step": 1640
    },
    {
      "epoch": 0.20512182993535555,
      "grad_norm": 0.9029594659805298,
      "learning_rate": 9.659593132565929e-05,
      "loss": 0.6008,
      "step": 1650
    },
    {
      "epoch": 0.20512182993535555,
      "eval_loss": 0.8831911683082581,
      "eval_runtime": 56.5558,
      "eval_samples_per_second": 8.841,
      "eval_steps_per_second": 8.841,
      "step": 1650
    },
    {
      "epoch": 0.20636499254102436,
      "grad_norm": 1.572769284248352,
      "learning_rate": 9.625939976037002e-05,
      "loss": 0.6875,
      "step": 1660
    },
    {
      "epoch": 0.20760815514669317,
      "grad_norm": 1.2906407117843628,
      "learning_rate": 9.59214804783831e-05,
      "loss": 0.6527,
      "step": 1670
    },
    {
      "epoch": 0.20885131775236201,
      "grad_norm": 1.4845420122146606,
      "learning_rate": 9.558218737023671e-05,
      "loss": 0.6984,
      "step": 1680
    },
    {
      "epoch": 0.21009448035803083,
      "grad_norm": 1.5461996793746948,
      "learning_rate": 9.524153438294159e-05,
      "loss": 0.6153,
      "step": 1690
    },
    {
      "epoch": 0.21133764296369964,
      "grad_norm": 1.531815767288208,
      "learning_rate": 9.489953551940783e-05,
      "loss": 0.563,
      "step": 1700
    },
    {
      "epoch": 0.21258080556936848,
      "grad_norm": 1.5427440404891968,
      "learning_rate": 9.455620483786914e-05,
      "loss": 0.6713,
      "step": 1710
    },
    {
      "epoch": 0.2138239681750373,
      "grad_norm": 1.5302445888519287,
      "learning_rate": 9.421155645130514e-05,
      "loss": 0.6464,
      "step": 1720
    },
    {
      "epoch": 0.2150671307807061,
      "grad_norm": 1.5195375680923462,
      "learning_rate": 9.38656045268611e-05,
      "loss": 0.6482,
      "step": 1730
    },
    {
      "epoch": 0.21631029338637495,
      "grad_norm": 1.5557074546813965,
      "learning_rate": 9.351836328526563e-05,
      "loss": 0.6748,
      "step": 1740
    },
    {
      "epoch": 0.21755345599204376,
      "grad_norm": 1.5321686267852783,
      "learning_rate": 9.316984700024612e-05,
      "loss": 0.6679,
      "step": 1750
    },
    {
      "epoch": 0.21879661859771257,
      "grad_norm": 1.157923698425293,
      "learning_rate": 9.2820069997942e-05,
      "loss": 0.7281,
      "step": 1760
    },
    {
      "epoch": 0.2200397812033814,
      "grad_norm": 1.592752456665039,
      "learning_rate": 9.246904665631588e-05,
      "loss": 0.6926,
      "step": 1770
    },
    {
      "epoch": 0.22128294380905023,
      "grad_norm": 1.5591671466827393,
      "learning_rate": 9.211679140456242e-05,
      "loss": 0.6389,
      "step": 1780
    },
    {
      "epoch": 0.22252610641471904,
      "grad_norm": 1.146720290184021,
      "learning_rate": 9.176331872251536e-05,
      "loss": 0.6265,
      "step": 1790
    },
    {
      "epoch": 0.22376926902038788,
      "grad_norm": 0.9858022928237915,
      "learning_rate": 9.140864314005222e-05,
      "loss": 0.7098,
      "step": 1800
    },
    {
      "epoch": 0.22376926902038788,
      "eval_loss": 0.8798418045043945,
      "eval_runtime": 56.1677,
      "eval_samples_per_second": 8.902,
      "eval_steps_per_second": 8.902,
      "step": 1800
    },
    {
      "epoch": 0.2250124316260567,
      "grad_norm": 1.6383726596832275,
      "learning_rate": 9.105277923649698e-05,
      "loss": 0.6029,
      "step": 1810
    },
    {
      "epoch": 0.2262555942317255,
      "grad_norm": 0.863681435585022,
      "learning_rate": 9.06957416400209e-05,
      "loss": 0.6647,
      "step": 1820
    },
    {
      "epoch": 0.22749875683739434,
      "grad_norm": 1.0826516151428223,
      "learning_rate": 9.03375450270412e-05,
      "loss": 0.6775,
      "step": 1830
    },
    {
      "epoch": 0.22874191944306316,
      "grad_norm": 1.2367980480194092,
      "learning_rate": 8.997820412161764e-05,
      "loss": 0.7778,
      "step": 1840
    },
    {
      "epoch": 0.22998508204873197,
      "grad_norm": 1.4348937273025513,
      "learning_rate": 8.961773369484738e-05,
      "loss": 0.6699,
      "step": 1850
    },
    {
      "epoch": 0.23122824465440078,
      "grad_norm": 0.9706162810325623,
      "learning_rate": 8.925614856425786e-05,
      "loss": 0.684,
      "step": 1860
    },
    {
      "epoch": 0.23247140726006962,
      "grad_norm": 1.4127984046936035,
      "learning_rate": 8.88934635931975e-05,
      "loss": 0.6667,
      "step": 1870
    },
    {
      "epoch": 0.23371456986573844,
      "grad_norm": 1.4040454626083374,
      "learning_rate": 8.852969369022494e-05,
      "loss": 0.6014,
      "step": 1880
    },
    {
      "epoch": 0.23495773247140725,
      "grad_norm": 1.3731218576431274,
      "learning_rate": 8.816485380849613e-05,
      "loss": 0.7063,
      "step": 1890
    },
    {
      "epoch": 0.2362008950770761,
      "grad_norm": 1.5299303531646729,
      "learning_rate": 8.779895894514961e-05,
      "loss": 0.6177,
      "step": 1900
    },
    {
      "epoch": 0.2374440576827449,
      "grad_norm": 1.3770627975463867,
      "learning_rate": 8.743202414069011e-05,
      "loss": 0.6487,
      "step": 1910
    },
    {
      "epoch": 0.23868722028841372,
      "grad_norm": 1.1185230016708374,
      "learning_rate": 8.706406447837023e-05,
      "loss": 0.6612,
      "step": 1920
    },
    {
      "epoch": 0.23993038289408256,
      "grad_norm": 1.027255654335022,
      "learning_rate": 8.669509508357052e-05,
      "loss": 0.6125,
      "step": 1930
    },
    {
      "epoch": 0.24117354549975137,
      "grad_norm": 1.053480625152588,
      "learning_rate": 8.632513112317761e-05,
      "loss": 0.6614,
      "step": 1940
    },
    {
      "epoch": 0.24241670810542018,
      "grad_norm": 1.3678585290908813,
      "learning_rate": 8.59541878049609e-05,
      "loss": 0.5761,
      "step": 1950
    },
    {
      "epoch": 0.24241670810542018,
      "eval_loss": 0.8656662106513977,
      "eval_runtime": 56.0937,
      "eval_samples_per_second": 8.914,
      "eval_steps_per_second": 8.914,
      "step": 1950
    },
    {
      "epoch": 0.24365987071108902,
      "grad_norm": 1.3336479663848877,
      "learning_rate": 8.558228037694728e-05,
      "loss": 0.7036,
      "step": 1960
    },
    {
      "epoch": 0.24490303331675783,
      "grad_norm": 1.3345288038253784,
      "learning_rate": 8.520942412679447e-05,
      "loss": 0.6508,
      "step": 1970
    },
    {
      "epoch": 0.24614619592242665,
      "grad_norm": 1.4155631065368652,
      "learning_rate": 8.483563438116257e-05,
      "loss": 0.6446,
      "step": 1980
    },
    {
      "epoch": 0.2473893585280955,
      "grad_norm": 1.3814365863800049,
      "learning_rate": 8.446092650508393e-05,
      "loss": 0.6557,
      "step": 1990
    },
    {
      "epoch": 0.2486325211337643,
      "grad_norm": 1.9659488201141357,
      "learning_rate": 8.408531590133172e-05,
      "loss": 0.693,
      "step": 2000
    },
    {
      "epoch": 0.2498756837394331,
      "grad_norm": 1.5568230152130127,
      "learning_rate": 8.370881800978673e-05,
      "loss": 0.5812,
      "step": 2010
    },
    {
      "epoch": 0.25111884634510195,
      "grad_norm": 1.5361123085021973,
      "learning_rate": 8.333144830680262e-05,
      "loss": 0.6411,
      "step": 2020
    },
    {
      "epoch": 0.25236200895077077,
      "grad_norm": 1.9059866666793823,
      "learning_rate": 8.29532223045698e-05,
      "loss": 0.7285,
      "step": 2030
    },
    {
      "epoch": 0.2536051715564396,
      "grad_norm": 1.3273738622665405,
      "learning_rate": 8.257415555047785e-05,
      "loss": 0.6589,
      "step": 2040
    },
    {
      "epoch": 0.2548483341621084,
      "grad_norm": 1.4303077459335327,
      "learning_rate": 8.21942636264763e-05,
      "loss": 0.6544,
      "step": 2050
    },
    {
      "epoch": 0.2560914967677772,
      "grad_norm": 1.2462729215621948,
      "learning_rate": 8.181356214843422e-05,
      "loss": 0.5755,
      "step": 2060
    },
    {
      "epoch": 0.2573346593734461,
      "grad_norm": 1.2266968488693237,
      "learning_rate": 8.143206676549826e-05,
      "loss": 0.5514,
      "step": 2070
    },
    {
      "epoch": 0.2585778219791149,
      "grad_norm": 1.5586425065994263,
      "learning_rate": 8.10497931594494e-05,
      "loss": 0.6775,
      "step": 2080
    },
    {
      "epoch": 0.2598209845847837,
      "grad_norm": 1.5792748928070068,
      "learning_rate": 8.066675704405836e-05,
      "loss": 0.6575,
      "step": 2090
    },
    {
      "epoch": 0.2610641471904525,
      "grad_norm": 1.0587018728256226,
      "learning_rate": 8.028297416443952e-05,
      "loss": 0.586,
      "step": 2100
    },
    {
      "epoch": 0.2610641471904525,
      "eval_loss": 0.8552118539810181,
      "eval_runtime": 54.3671,
      "eval_samples_per_second": 9.197,
      "eval_steps_per_second": 9.197,
      "step": 2100
    },
    {
      "epoch": 0.2623073097961213,
      "grad_norm": 1.3237509727478027,
      "learning_rate": 7.989846029640397e-05,
      "loss": 0.6647,
      "step": 2110
    },
    {
      "epoch": 0.26355047240179014,
      "grad_norm": 1.246394157409668,
      "learning_rate": 7.951323124581069e-05,
      "loss": 0.6031,
      "step": 2120
    },
    {
      "epoch": 0.26479363500745895,
      "grad_norm": 1.697279930114746,
      "learning_rate": 7.91273028479172e-05,
      "loss": 0.5719,
      "step": 2130
    },
    {
      "epoch": 0.2660367976131278,
      "grad_norm": 1.3424289226531982,
      "learning_rate": 7.874069096672831e-05,
      "loss": 0.6554,
      "step": 2140
    },
    {
      "epoch": 0.26727996021879663,
      "grad_norm": 1.3016737699508667,
      "learning_rate": 7.83534114943442e-05,
      "loss": 0.6112,
      "step": 2150
    },
    {
      "epoch": 0.26852312282446544,
      "grad_norm": 1.566977858543396,
      "learning_rate": 7.796548035030715e-05,
      "loss": 0.6176,
      "step": 2160
    },
    {
      "epoch": 0.26976628543013426,
      "grad_norm": 1.2994070053100586,
      "learning_rate": 7.757691348094703e-05,
      "loss": 0.6126,
      "step": 2170
    },
    {
      "epoch": 0.27100944803580307,
      "grad_norm": 0.9738805890083313,
      "learning_rate": 7.718772685872595e-05,
      "loss": 0.6169,
      "step": 2180
    },
    {
      "epoch": 0.2722526106414719,
      "grad_norm": 1.048270344734192,
      "learning_rate": 7.679793648158159e-05,
      "loss": 0.5741,
      "step": 2190
    },
    {
      "epoch": 0.27349577324714075,
      "grad_norm": 2.1474382877349854,
      "learning_rate": 7.640755837226965e-05,
      "loss": 0.6018,
      "step": 2200
    },
    {
      "epoch": 0.27473893585280956,
      "grad_norm": 2.463155746459961,
      "learning_rate": 7.601660857770522e-05,
      "loss": 0.6934,
      "step": 2210
    },
    {
      "epoch": 0.2759820984584784,
      "grad_norm": 1.2787517309188843,
      "learning_rate": 7.562510316830308e-05,
      "loss": 0.7,
      "step": 2220
    },
    {
      "epoch": 0.2772252610641472,
      "grad_norm": 1.377604603767395,
      "learning_rate": 7.523305823731723e-05,
      "loss": 0.5736,
      "step": 2230
    },
    {
      "epoch": 0.278468423669816,
      "grad_norm": 1.2263590097427368,
      "learning_rate": 7.484048990017919e-05,
      "loss": 0.6463,
      "step": 2240
    },
    {
      "epoch": 0.2797115862754848,
      "grad_norm": 1.2096174955368042,
      "learning_rate": 7.444741429383578e-05,
      "loss": 0.6445,
      "step": 2250
    },
    {
      "epoch": 0.2797115862754848,
      "eval_loss": 0.8472199440002441,
      "eval_runtime": 54.839,
      "eval_samples_per_second": 9.118,
      "eval_steps_per_second": 9.118,
      "step": 2250
    },
    {
      "epoch": 0.2809547488811537,
      "grad_norm": 1.337813138961792,
      "learning_rate": 7.405384757608555e-05,
      "loss": 0.571,
      "step": 2260
    },
    {
      "epoch": 0.2821979114868225,
      "grad_norm": 1.363511323928833,
      "learning_rate": 7.36598059249148e-05,
      "loss": 0.5797,
      "step": 2270
    },
    {
      "epoch": 0.2834410740924913,
      "grad_norm": 1.3352985382080078,
      "learning_rate": 7.326530553783243e-05,
      "loss": 0.6532,
      "step": 2280
    },
    {
      "epoch": 0.2846842366981601,
      "grad_norm": 1.102362871170044,
      "learning_rate": 7.287036263120425e-05,
      "loss": 0.4812,
      "step": 2290
    },
    {
      "epoch": 0.28592739930382893,
      "grad_norm": 0.9128854274749756,
      "learning_rate": 7.247499343958621e-05,
      "loss": 0.6263,
      "step": 2300
    },
    {
      "epoch": 0.28717056190949775,
      "grad_norm": 1.369809865951538,
      "learning_rate": 7.207921421505724e-05,
      "loss": 0.5442,
      "step": 2310
    },
    {
      "epoch": 0.28841372451516656,
      "grad_norm": 1.3508280515670776,
      "learning_rate": 7.168304122655113e-05,
      "loss": 0.6703,
      "step": 2320
    },
    {
      "epoch": 0.2896568871208354,
      "grad_norm": 1.2335124015808105,
      "learning_rate": 7.128649075918768e-05,
      "loss": 0.733,
      "step": 2330
    },
    {
      "epoch": 0.29090004972650424,
      "grad_norm": 1.152423620223999,
      "learning_rate": 7.088957911360347e-05,
      "loss": 0.7101,
      "step": 2340
    },
    {
      "epoch": 0.29214321233217305,
      "grad_norm": 1.1230435371398926,
      "learning_rate": 7.049232260528163e-05,
      "loss": 0.6501,
      "step": 2350
    },
    {
      "epoch": 0.29338637493784187,
      "grad_norm": 1.2065176963806152,
      "learning_rate": 7.009473756388128e-05,
      "loss": 0.5584,
      "step": 2360
    },
    {
      "epoch": 0.2946295375435107,
      "grad_norm": 1.258360743522644,
      "learning_rate": 6.969684033256622e-05,
      "loss": 0.6351,
      "step": 2370
    },
    {
      "epoch": 0.2958727001491795,
      "grad_norm": 1.4752328395843506,
      "learning_rate": 6.92986472673332e-05,
      "loss": 0.5295,
      "step": 2380
    },
    {
      "epoch": 0.29711586275484836,
      "grad_norm": 1.4005481004714966,
      "learning_rate": 6.890017473633946e-05,
      "loss": 0.6168,
      "step": 2390
    },
    {
      "epoch": 0.29835902536051717,
      "grad_norm": 1.4845457077026367,
      "learning_rate": 6.850143911923011e-05,
      "loss": 0.6611,
      "step": 2400
    },
    {
      "epoch": 0.29835902536051717,
      "eval_loss": 0.842125415802002,
      "eval_runtime": 54.7426,
      "eval_samples_per_second": 9.134,
      "eval_steps_per_second": 9.134,
      "step": 2400
    },
    {
      "epoch": 0.299602187966186,
      "grad_norm": 1.1345279216766357,
      "learning_rate": 6.81024568064646e-05,
      "loss": 0.647,
      "step": 2410
    },
    {
      "epoch": 0.3008453505718548,
      "grad_norm": 1.0600401163101196,
      "learning_rate": 6.770324419864309e-05,
      "loss": 0.5967,
      "step": 2420
    },
    {
      "epoch": 0.3020885131775236,
      "grad_norm": 1.600447177886963,
      "learning_rate": 6.73038177058323e-05,
      "loss": 0.6442,
      "step": 2430
    },
    {
      "epoch": 0.3033316757831924,
      "grad_norm": 1.2468013763427734,
      "learning_rate": 6.690419374689087e-05,
      "loss": 0.6526,
      "step": 2440
    },
    {
      "epoch": 0.30457483838886124,
      "grad_norm": 1.0898563861846924,
      "learning_rate": 6.650438874879456e-05,
      "loss": 0.641,
      "step": 2450
    },
    {
      "epoch": 0.3058180009945301,
      "grad_norm": 0.772791862487793,
      "learning_rate": 6.61044191459609e-05,
      "loss": 0.5612,
      "step": 2460
    },
    {
      "epoch": 0.3070611636001989,
      "grad_norm": 1.353593349456787,
      "learning_rate": 6.57043013795737e-05,
      "loss": 0.6442,
      "step": 2470
    },
    {
      "epoch": 0.30830432620586773,
      "grad_norm": 1.1054726839065552,
      "learning_rate": 6.530405189690719e-05,
      "loss": 0.5728,
      "step": 2480
    },
    {
      "epoch": 0.30954748881153654,
      "grad_norm": 1.6124745607376099,
      "learning_rate": 6.49036871506499e-05,
      "loss": 0.5805,
      "step": 2490
    },
    {
      "epoch": 0.31079065141720535,
      "grad_norm": 1.458871603012085,
      "learning_rate": 6.450322359822846e-05,
      "loss": 0.554,
      "step": 2500
    },
    {
      "epoch": 0.31203381402287417,
      "grad_norm": 1.366873860359192,
      "learning_rate": 6.410267770113098e-05,
      "loss": 0.5137,
      "step": 2510
    },
    {
      "epoch": 0.31327697662854304,
      "grad_norm": 1.7523047924041748,
      "learning_rate": 6.370206592423045e-05,
      "loss": 0.59,
      "step": 2520
    },
    {
      "epoch": 0.31452013923421185,
      "grad_norm": 2.6855154037475586,
      "learning_rate": 6.330140473510796e-05,
      "loss": 0.5745,
      "step": 2530
    },
    {
      "epoch": 0.31576330183988066,
      "grad_norm": 1.4660212993621826,
      "learning_rate": 6.29007106033757e-05,
      "loss": 0.5887,
      "step": 2540
    },
    {
      "epoch": 0.3170064644455495,
      "grad_norm": 1.741590142250061,
      "learning_rate": 6.25e-05,
      "loss": 0.6324,
      "step": 2550
    },
    {
      "epoch": 0.3170064644455495,
      "eval_loss": 0.826242208480835,
      "eval_runtime": 54.5046,
      "eval_samples_per_second": 9.174,
      "eval_steps_per_second": 9.174,
      "step": 2550
    },
    {
      "epoch": 0.3182496270512183,
      "grad_norm": 1.1369116306304932,
      "learning_rate": 6.20992893966243e-05,
      "loss": 0.5283,
      "step": 2560
    },
    {
      "epoch": 0.3194927896568871,
      "grad_norm": 0.9932198524475098,
      "learning_rate": 6.169859526489204e-05,
      "loss": 0.6021,
      "step": 2570
    },
    {
      "epoch": 0.32073595226255597,
      "grad_norm": 1.0104336738586426,
      "learning_rate": 6.129793407576955e-05,
      "loss": 0.6471,
      "step": 2580
    },
    {
      "epoch": 0.3219791148682248,
      "grad_norm": 1.2468757629394531,
      "learning_rate": 6.089732229886904e-05,
      "loss": 0.698,
      "step": 2590
    },
    {
      "epoch": 0.3232222774738936,
      "grad_norm": 1.2286524772644043,
      "learning_rate": 6.049677640177155e-05,
      "loss": 0.5984,
      "step": 2600
    },
    {
      "epoch": 0.3244654400795624,
      "grad_norm": 1.4735106229782104,
      "learning_rate": 6.00963128493501e-05,
      "loss": 0.5562,
      "step": 2610
    },
    {
      "epoch": 0.3257086026852312,
      "grad_norm": 1.5603587627410889,
      "learning_rate": 5.969594810309284e-05,
      "loss": 0.6013,
      "step": 2620
    },
    {
      "epoch": 0.32695176529090003,
      "grad_norm": 1.1900887489318848,
      "learning_rate": 5.929569862042631e-05,
      "loss": 0.6304,
      "step": 2630
    },
    {
      "epoch": 0.32819492789656884,
      "grad_norm": 1.482504963874817,
      "learning_rate": 5.889558085403911e-05,
      "loss": 0.5655,
      "step": 2640
    },
    {
      "epoch": 0.3294380905022377,
      "grad_norm": 1.5434118509292603,
      "learning_rate": 5.849561125120545e-05,
      "loss": 0.6596,
      "step": 2650
    },
    {
      "epoch": 0.3306812531079065,
      "grad_norm": 1.8465694189071655,
      "learning_rate": 5.809580625310912e-05,
      "loss": 0.6457,
      "step": 2660
    },
    {
      "epoch": 0.33192441571357534,
      "grad_norm": 1.7762593030929565,
      "learning_rate": 5.769618229416773e-05,
      "loss": 0.5508,
      "step": 2670
    },
    {
      "epoch": 0.33316757831924415,
      "grad_norm": 1.78951895236969,
      "learning_rate": 5.7296755801356926e-05,
      "loss": 0.5979,
      "step": 2680
    },
    {
      "epoch": 0.33441074092491296,
      "grad_norm": 1.1297141313552856,
      "learning_rate": 5.6897543193535414e-05,
      "loss": 0.5565,
      "step": 2690
    },
    {
      "epoch": 0.3356539035305818,
      "grad_norm": 1.264514684677124,
      "learning_rate": 5.649856088076989e-05,
      "loss": 0.5706,
      "step": 2700
    },
    {
      "epoch": 0.3356539035305818,
      "eval_loss": 0.8196865320205688,
      "eval_runtime": 54.3418,
      "eval_samples_per_second": 9.201,
      "eval_steps_per_second": 9.201,
      "step": 2700
    },
    {
      "epoch": 0.33689706613625064,
      "grad_norm": 1.7647141218185425,
      "learning_rate": 5.609982526366054e-05,
      "loss": 0.5757,
      "step": 2710
    },
    {
      "epoch": 0.33814022874191946,
      "grad_norm": 1.655874252319336,
      "learning_rate": 5.570135273266683e-05,
      "loss": 0.6219,
      "step": 2720
    },
    {
      "epoch": 0.33938339134758827,
      "grad_norm": 1.6094146966934204,
      "learning_rate": 5.53031596674338e-05,
      "loss": 0.6181,
      "step": 2730
    },
    {
      "epoch": 0.3406265539532571,
      "grad_norm": 1.2176001071929932,
      "learning_rate": 5.490526243611873e-05,
      "loss": 0.5794,
      "step": 2740
    },
    {
      "epoch": 0.3418697165589259,
      "grad_norm": 1.6560986042022705,
      "learning_rate": 5.450767739471837e-05,
      "loss": 0.5676,
      "step": 2750
    },
    {
      "epoch": 0.3431128791645947,
      "grad_norm": 1.5764137506484985,
      "learning_rate": 5.411042088639655e-05,
      "loss": 0.5182,
      "step": 2760
    },
    {
      "epoch": 0.3443560417702636,
      "grad_norm": 0.9934073686599731,
      "learning_rate": 5.371350924081234e-05,
      "loss": 0.5327,
      "step": 2770
    },
    {
      "epoch": 0.3455992043759324,
      "grad_norm": 1.5813066959381104,
      "learning_rate": 5.331695877344888e-05,
      "loss": 0.6101,
      "step": 2780
    },
    {
      "epoch": 0.3468423669816012,
      "grad_norm": 1.0345252752304077,
      "learning_rate": 5.292078578494275e-05,
      "loss": 0.5376,
      "step": 2790
    },
    {
      "epoch": 0.34808552958727,
      "grad_norm": 1.1901750564575195,
      "learning_rate": 5.2525006560413816e-05,
      "loss": 0.5869,
      "step": 2800
    },
    {
      "epoch": 0.34932869219293883,
      "grad_norm": 1.1224308013916016,
      "learning_rate": 5.212963736879578e-05,
      "loss": 0.5736,
      "step": 2810
    },
    {
      "epoch": 0.35057185479860764,
      "grad_norm": 1.1819744110107422,
      "learning_rate": 5.173469446216757e-05,
      "loss": 0.4877,
      "step": 2820
    },
    {
      "epoch": 0.35181501740427645,
      "grad_norm": 1.3149231672286987,
      "learning_rate": 5.134019407508521e-05,
      "loss": 0.6002,
      "step": 2830
    },
    {
      "epoch": 0.3530581800099453,
      "grad_norm": 1.876297950744629,
      "learning_rate": 5.0946152423914456e-05,
      "loss": 0.6038,
      "step": 2840
    },
    {
      "epoch": 0.35430134261561413,
      "grad_norm": 1.6570100784301758,
      "learning_rate": 5.0552585706164246e-05,
      "loss": 0.5392,
      "step": 2850
    },
    {
      "epoch": 0.35430134261561413,
      "eval_loss": 0.813827633857727,
      "eval_runtime": 56.4795,
      "eval_samples_per_second": 8.853,
      "eval_steps_per_second": 8.853,
      "step": 2850
    },
    {
      "epoch": 0.35554450522128295,
      "grad_norm": 1.4959031343460083,
      "learning_rate": 5.015951009982081e-05,
      "loss": 0.6377,
      "step": 2860
    },
    {
      "epoch": 0.35678766782695176,
      "grad_norm": 1.524834394454956,
      "learning_rate": 4.976694176268278e-05,
      "loss": 0.6132,
      "step": 2870
    },
    {
      "epoch": 0.3580308304326206,
      "grad_norm": 1.1156436204910278,
      "learning_rate": 4.937489683169692e-05,
      "loss": 0.5865,
      "step": 2880
    },
    {
      "epoch": 0.3592739930382894,
      "grad_norm": 1.4648617506027222,
      "learning_rate": 4.8983391422294786e-05,
      "loss": 0.5489,
      "step": 2890
    },
    {
      "epoch": 0.36051715564395825,
      "grad_norm": 0.9886593222618103,
      "learning_rate": 4.8592441627730355e-05,
      "loss": 0.5844,
      "step": 2900
    },
    {
      "epoch": 0.36176031824962707,
      "grad_norm": 1.7449545860290527,
      "learning_rate": 4.820206351841842e-05,
      "loss": 0.617,
      "step": 2910
    },
    {
      "epoch": 0.3630034808552959,
      "grad_norm": 1.375961184501648,
      "learning_rate": 4.781227314127405e-05,
      "loss": 0.6068,
      "step": 2920
    },
    {
      "epoch": 0.3642466434609647,
      "grad_norm": 1.08450448513031,
      "learning_rate": 4.7423086519052966e-05,
      "loss": 0.6064,
      "step": 2930
    },
    {
      "epoch": 0.3654898060666335,
      "grad_norm": 1.6093776226043701,
      "learning_rate": 4.703451964969287e-05,
      "loss": 0.6449,
      "step": 2940
    },
    {
      "epoch": 0.3667329686723023,
      "grad_norm": 1.2619701623916626,
      "learning_rate": 4.66465885056558e-05,
      "loss": 0.5688,
      "step": 2950
    },
    {
      "epoch": 0.3679761312779712,
      "grad_norm": 1.2892088890075684,
      "learning_rate": 4.62593090332717e-05,
      "loss": 0.5941,
      "step": 2960
    },
    {
      "epoch": 0.36921929388364,
      "grad_norm": 1.0506901741027832,
      "learning_rate": 4.587269715208281e-05,
      "loss": 0.5454,
      "step": 2970
    },
    {
      "epoch": 0.3704624564893088,
      "grad_norm": 1.6574184894561768,
      "learning_rate": 4.5486768754189305e-05,
      "loss": 0.5409,
      "step": 2980
    },
    {
      "epoch": 0.3717056190949776,
      "grad_norm": 1.1367279291152954,
      "learning_rate": 4.510153970359606e-05,
      "loss": 0.6502,
      "step": 2990
    },
    {
      "epoch": 0.37294878170064644,
      "grad_norm": 1.1561517715454102,
      "learning_rate": 4.4717025835560476e-05,
      "loss": 0.5962,
      "step": 3000
    },
    {
      "epoch": 0.37294878170064644,
      "eval_loss": 0.8055340647697449,
      "eval_runtime": 54.6339,
      "eval_samples_per_second": 9.152,
      "eval_steps_per_second": 9.152,
      "step": 3000
    },
    {
      "epoch": 0.37419194430631525,
      "grad_norm": 1.3308610916137695,
      "learning_rate": 4.433324295594166e-05,
      "loss": 0.6302,
      "step": 3010
    },
    {
      "epoch": 0.37543510691198406,
      "grad_norm": 1.3117073774337769,
      "learning_rate": 4.3950206840550585e-05,
      "loss": 0.5286,
      "step": 3020
    },
    {
      "epoch": 0.37667826951765293,
      "grad_norm": 1.3257042169570923,
      "learning_rate": 4.3567933234501746e-05,
      "loss": 0.6227,
      "step": 3030
    },
    {
      "epoch": 0.37792143212332174,
      "grad_norm": 1.614931583404541,
      "learning_rate": 4.318643785156579e-05,
      "loss": 0.5531,
      "step": 3040
    },
    {
      "epoch": 0.37916459472899056,
      "grad_norm": 1.4024949073791504,
      "learning_rate": 4.280573637352371e-05,
      "loss": 0.6107,
      "step": 3050
    },
    {
      "epoch": 0.38040775733465937,
      "grad_norm": 1.3442318439483643,
      "learning_rate": 4.242584444952216e-05,
      "loss": 0.619,
      "step": 3060
    },
    {
      "epoch": 0.3816509199403282,
      "grad_norm": 1.6472797393798828,
      "learning_rate": 4.204677769543019e-05,
      "loss": 0.6219,
      "step": 3070
    },
    {
      "epoch": 0.382894082545997,
      "grad_norm": 1.226382851600647,
      "learning_rate": 4.16685516931974e-05,
      "loss": 0.5669,
      "step": 3080
    },
    {
      "epoch": 0.38413724515166586,
      "grad_norm": 1.2685925960540771,
      "learning_rate": 4.1291181990213286e-05,
      "loss": 0.5875,
      "step": 3090
    },
    {
      "epoch": 0.3853804077573347,
      "grad_norm": 2.1690385341644287,
      "learning_rate": 4.0914684098668286e-05,
      "loss": 0.6366,
      "step": 3100
    },
    {
      "epoch": 0.3866235703630035,
      "grad_norm": 1.2350751161575317,
      "learning_rate": 4.053907349491608e-05,
      "loss": 0.6034,
      "step": 3110
    },
    {
      "epoch": 0.3878667329686723,
      "grad_norm": 1.4489704370498657,
      "learning_rate": 4.016436561883746e-05,
      "loss": 0.6346,
      "step": 3120
    },
    {
      "epoch": 0.3891098955743411,
      "grad_norm": 1.332980990409851,
      "learning_rate": 3.979057587320554e-05,
      "loss": 0.5858,
      "step": 3130
    },
    {
      "epoch": 0.3903530581800099,
      "grad_norm": 1.391718864440918,
      "learning_rate": 3.941771962305274e-05,
      "loss": 0.7218,
      "step": 3140
    },
    {
      "epoch": 0.39159622078567874,
      "grad_norm": 1.269722819328308,
      "learning_rate": 3.9045812195039125e-05,
      "loss": 0.5846,
      "step": 3150
    },
    {
      "epoch": 0.39159622078567874,
      "eval_loss": 0.801069974899292,
      "eval_runtime": 56.6006,
      "eval_samples_per_second": 8.834,
      "eval_steps_per_second": 8.834,
      "step": 3150
    },
    {
      "epoch": 0.3928393833913476,
      "grad_norm": 1.8093242645263672,
      "learning_rate": 3.8674868876822395e-05,
      "loss": 0.5608,
      "step": 3160
    },
    {
      "epoch": 0.3940825459970164,
      "grad_norm": 1.6383775472640991,
      "learning_rate": 3.83049049164295e-05,
      "loss": 0.5706,
      "step": 3170
    },
    {
      "epoch": 0.39532570860268523,
      "grad_norm": 1.7762494087219238,
      "learning_rate": 3.793593552162978e-05,
      "loss": 0.6272,
      "step": 3180
    },
    {
      "epoch": 0.39656887120835405,
      "grad_norm": 1.989702582359314,
      "learning_rate": 3.75679758593099e-05,
      "loss": 0.6268,
      "step": 3190
    },
    {
      "epoch": 0.39781203381402286,
      "grad_norm": 1.2394602298736572,
      "learning_rate": 3.720104105485039e-05,
      "loss": 0.5745,
      "step": 3200
    },
    {
      "epoch": 0.39905519641969167,
      "grad_norm": 1.6666808128356934,
      "learning_rate": 3.6835146191503885e-05,
      "loss": 0.6287,
      "step": 3210
    },
    {
      "epoch": 0.40029835902536054,
      "grad_norm": 0.926642119884491,
      "learning_rate": 3.647030630977508e-05,
      "loss": 0.6038,
      "step": 3220
    },
    {
      "epoch": 0.40154152163102935,
      "grad_norm": 1.3358100652694702,
      "learning_rate": 3.6106536406802524e-05,
      "loss": 0.5941,
      "step": 3230
    },
    {
      "epoch": 0.40278468423669817,
      "grad_norm": 1.339179277420044,
      "learning_rate": 3.5743851435742176e-05,
      "loss": 0.5888,
      "step": 3240
    },
    {
      "epoch": 0.404027846842367,
      "grad_norm": 1.4704395532608032,
      "learning_rate": 3.538226630515262e-05,
      "loss": 0.5113,
      "step": 3250
    },
    {
      "epoch": 0.4052710094480358,
      "grad_norm": 1.2576725482940674,
      "learning_rate": 3.502179587838238e-05,
      "loss": 0.5874,
      "step": 3260
    },
    {
      "epoch": 0.4065141720537046,
      "grad_norm": 1.1804664134979248,
      "learning_rate": 3.46624549729588e-05,
      "loss": 0.6054,
      "step": 3270
    },
    {
      "epoch": 0.40775733465937347,
      "grad_norm": 1.6472457647323608,
      "learning_rate": 3.430425835997908e-05,
      "loss": 0.6168,
      "step": 3280
    },
    {
      "epoch": 0.4090004972650423,
      "grad_norm": 1.3699522018432617,
      "learning_rate": 3.394722076350302e-05,
      "loss": 0.5227,
      "step": 3290
    },
    {
      "epoch": 0.4102436598707111,
      "grad_norm": 0.9297524690628052,
      "learning_rate": 3.359135685994781e-05,
      "loss": 0.5818,
      "step": 3300
    },
    {
      "epoch": 0.4102436598707111,
      "eval_loss": 0.7923147678375244,
      "eval_runtime": 53.7845,
      "eval_samples_per_second": 9.296,
      "eval_steps_per_second": 9.296,
      "step": 3300
    },
    {
      "epoch": 0.4114868224763799,
      "grad_norm": 1.4288136959075928,
      "learning_rate": 3.3236681277484654e-05,
      "loss": 0.5647,
      "step": 3310
    },
    {
      "epoch": 0.4127299850820487,
      "grad_norm": 1.5457046031951904,
      "learning_rate": 3.2883208595437584e-05,
      "loss": 0.6427,
      "step": 3320
    },
    {
      "epoch": 0.41397314768771754,
      "grad_norm": 1.0546395778656006,
      "learning_rate": 3.2530953343684136e-05,
      "loss": 0.6108,
      "step": 3330
    },
    {
      "epoch": 0.41521631029338635,
      "grad_norm": 1.3705288171768188,
      "learning_rate": 3.217993000205799e-05,
      "loss": 0.6249,
      "step": 3340
    },
    {
      "epoch": 0.4164594728990552,
      "grad_norm": 1.1312577724456787,
      "learning_rate": 3.1830152999753903e-05,
      "loss": 0.6543,
      "step": 3350
    },
    {
      "epoch": 0.41770263550472403,
      "grad_norm": 1.5398489236831665,
      "learning_rate": 3.148163671473439e-05,
      "loss": 0.6304,
      "step": 3360
    },
    {
      "epoch": 0.41894579811039284,
      "grad_norm": 1.4748462438583374,
      "learning_rate": 3.113439547313892e-05,
      "loss": 0.5638,
      "step": 3370
    },
    {
      "epoch": 0.42018896071606165,
      "grad_norm": 0.9427064061164856,
      "learning_rate": 3.0788443548694874e-05,
      "loss": 0.6171,
      "step": 3380
    },
    {
      "epoch": 0.42143212332173047,
      "grad_norm": 1.718713402748108,
      "learning_rate": 3.0443795162130876e-05,
      "loss": 0.6358,
      "step": 3390
    },
    {
      "epoch": 0.4226752859273993,
      "grad_norm": 1.058449387550354,
      "learning_rate": 3.0100464480592185e-05,
|
"loss": 0.6129, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.42391844853306815, |
|
"grad_norm": 1.0851000547409058, |
|
"learning_rate": 2.9758465617058404e-05, |
|
"loss": 0.6621, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.42516161113873696, |
|
"grad_norm": 1.3645485639572144, |
|
"learning_rate": 2.9417812629763285e-05, |
|
"loss": 0.6181, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.4264047737444058, |
|
"grad_norm": 1.682470679283142, |
|
"learning_rate": 2.9078519521616894e-05, |
|
"loss": 0.595, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.4276479363500746, |
|
"grad_norm": 1.1072993278503418, |
|
"learning_rate": 2.8740600239630002e-05, |
|
"loss": 0.564, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.4288910989557434, |
|
"grad_norm": 2.1429033279418945, |
|
"learning_rate": 2.8404068674340714e-05, |
|
"loss": 0.5779, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.4288910989557434, |
|
"eval_loss": 0.7892328500747681, |
|
"eval_runtime": 54.8609, |
|
"eval_samples_per_second": 9.114, |
|
"eval_steps_per_second": 9.114, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.4301342615614122, |
|
"grad_norm": 1.1970041990280151, |
|
"learning_rate": 2.80689386592436e-05, |
|
"loss": 0.5467, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.4313774241670811, |
|
"grad_norm": 1.570540189743042, |
|
"learning_rate": 2.7735223970220955e-05, |
|
"loss": 0.5794, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.4326205867727499, |
|
"grad_norm": 1.2686924934387207, |
|
"learning_rate": 2.7402938324976576e-05, |
|
"loss": 0.4993, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.4338637493784187, |
|
"grad_norm": 1.3892414569854736, |
|
"learning_rate": 2.70720953824719e-05, |
|
"loss": 0.515, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.4351069119840875, |
|
"grad_norm": 1.051397681236267, |
|
"learning_rate": 2.674270874236441e-05, |
|
"loss": 0.542, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.43635007458975633, |
|
"grad_norm": 0.9418326020240784, |
|
"learning_rate": 2.64147919444488e-05, |
|
"loss": 0.5283, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.43759323719542514, |
|
"grad_norm": 1.1212824583053589, |
|
"learning_rate": 2.6088358468100247e-05, |
|
"loss": 0.4912, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.43883639980109396, |
|
"grad_norm": 1.3727294206619263, |
|
"learning_rate": 2.5763421731720435e-05, |
|
"loss": 0.5601, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.4400795624067628, |
|
"grad_norm": 1.0249736309051514, |
|
"learning_rate": 2.5439995092185892e-05, |
|
"loss": 0.4856, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.44132272501243164, |
|
"grad_norm": 1.2564889192581177, |
|
"learning_rate": 2.5118091844299e-05, |
|
"loss": 0.5795, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.44256588761810045, |
|
"grad_norm": 3.1751506328582764, |
|
"learning_rate": 2.479772522024147e-05, |
|
"loss": 0.5902, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.44380905022376926, |
|
"grad_norm": 1.3278981447219849, |
|
"learning_rate": 2.4478908389030427e-05, |
|
"loss": 0.5305, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.4450522128294381, |
|
"grad_norm": 1.1867163181304932, |
|
"learning_rate": 2.41616544559771e-05, |
|
"loss": 0.6074, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.4462953754351069, |
|
"grad_norm": 1.357256293296814, |
|
"learning_rate": 2.3845976462148033e-05, |
|
"loss": 0.5541, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.44753853804077576, |
|
"grad_norm": 1.0771019458770752, |
|
"learning_rate": 2.3531887383829157e-05, |
|
"loss": 0.4935, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.44753853804077576, |
|
"eval_loss": 0.7831114530563354, |
|
"eval_runtime": 55.7555, |
|
"eval_samples_per_second": 8.968, |
|
"eval_steps_per_second": 8.968, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.44878170064644457, |
|
"grad_norm": 1.402942419052124, |
|
"learning_rate": 2.3219400131992273e-05, |
|
"loss": 0.6078, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.4500248632521134, |
|
"grad_norm": 0.9987258315086365, |
|
"learning_rate": 2.2908527551764404e-05, |
|
"loss": 0.5269, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.4512680258577822, |
|
"grad_norm": 1.655179500579834, |
|
"learning_rate": 2.259928242189966e-05, |
|
"loss": 0.5955, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.452511188463451, |
|
"grad_norm": 1.3401436805725098, |
|
"learning_rate": 2.2291677454254136e-05, |
|
"loss": 0.6061, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.4537543510691198, |
|
"grad_norm": 1.766615629196167, |
|
"learning_rate": 2.1985725293263237e-05, |
|
"loss": 0.6185, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.4549975136747887, |
|
"grad_norm": 1.7541744709014893, |
|
"learning_rate": 2.1681438515421953e-05, |
|
"loss": 0.5724, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.4562406762804575, |
|
"grad_norm": 1.9028109312057495, |
|
"learning_rate": 2.1378829628767965e-05, |
|
"loss": 0.5688, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.4574838388861263, |
|
"grad_norm": 1.54623281955719, |
|
"learning_rate": 2.1077911072367317e-05, |
|
"loss": 0.6044, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.4587270014917951, |
|
"grad_norm": 1.4844456911087036, |
|
"learning_rate": 2.077869521580325e-05, |
|
"loss": 0.5635, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.45997016409746394, |
|
"grad_norm": 1.464686632156372, |
|
"learning_rate": 2.0481194358667695e-05, |
|
"loss": 0.5237, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.46121332670313275, |
|
"grad_norm": 1.3379572629928589, |
|
"learning_rate": 2.018542073005567e-05, |
|
"loss": 0.5913, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.46245648930880157, |
|
"grad_norm": 1.292743444442749, |
|
"learning_rate": 1.9891386488062538e-05, |
|
"loss": 0.5878, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.46369965191447043, |
|
"grad_norm": 1.7692592144012451, |
|
"learning_rate": 1.959910371928436e-05, |
|
"loss": 0.5772, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.46494281452013925, |
|
"grad_norm": 1.5741891860961914, |
|
"learning_rate": 1.930858443832096e-05, |
|
"loss": 0.5899, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.46618597712580806, |
|
"grad_norm": 2.228027105331421, |
|
"learning_rate": 1.90198405872821e-05, |
|
"loss": 0.6182, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.46618597712580806, |
|
"eval_loss": 0.7807769775390625, |
|
"eval_runtime": 54.8116, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 9.122, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.4674291397314769, |
|
"grad_norm": 1.6490607261657715, |
|
"learning_rate": 1.8732884035296582e-05, |
|
"loss": 0.5878, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.4686723023371457, |
|
"grad_norm": 1.3176835775375366, |
|
"learning_rate": 1.844772657802428e-05, |
|
"loss": 0.6422, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.4699154649428145, |
|
"grad_norm": 0.9882899522781372, |
|
"learning_rate": 1.8164379937171382e-05, |
|
"loss": 0.5721, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.47115862754848337, |
|
"grad_norm": 1.3539491891860962, |
|
"learning_rate": 1.7882855760008547e-05, |
|
"loss": 0.5397, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.4724017901541522, |
|
"grad_norm": 1.472594141960144, |
|
"learning_rate": 1.760316561889203e-05, |
|
"loss": 0.5665, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.473644952759821, |
|
"grad_norm": 1.2908227443695068, |
|
"learning_rate": 1.7325321010788034e-05, |
|
"loss": 0.5835, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.4748881153654898, |
|
"grad_norm": 1.5640655755996704, |
|
"learning_rate": 1.7049333356800167e-05, |
|
"loss": 0.6635, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.4761312779711586, |
|
"grad_norm": 1.3457403182983398, |
|
"learning_rate": 1.6775214001699914e-05, |
|
"loss": 0.5415, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.47737444057682743, |
|
"grad_norm": 1.859492540359497, |
|
"learning_rate": 1.6502974213460316e-05, |
|
"loss": 0.5344, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.47861760318249624, |
|
"grad_norm": 1.5213356018066406, |
|
"learning_rate": 1.623262518279279e-05, |
|
"loss": 0.5805, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.4798607657881651, |
|
"grad_norm": 1.5838193893432617, |
|
"learning_rate": 1.596417802268707e-05, |
|
"loss": 0.5148, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.4811039283938339, |
|
"grad_norm": 1.2510637044906616, |
|
"learning_rate": 1.5697643767954488e-05, |
|
"loss": 0.561, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.48234709099950274, |
|
"grad_norm": 1.6863198280334473, |
|
"learning_rate": 1.543303337477432e-05, |
|
"loss": 0.5264, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.48359025360517155, |
|
"grad_norm": 1.6266530752182007, |
|
"learning_rate": 1.517035772024343e-05, |
|
"loss": 0.6313, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.48483341621084036, |
|
"grad_norm": 1.2105517387390137, |
|
"learning_rate": 1.49096276019291e-05, |
|
"loss": 0.5565, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.48483341621084036, |
|
"eval_loss": 0.7774006724357605, |
|
"eval_runtime": 57.9814, |
|
"eval_samples_per_second": 8.623, |
|
"eval_steps_per_second": 8.623, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.4860765788165092, |
|
"grad_norm": 1.2891868352890015, |
|
"learning_rate": 1.4650853737425327e-05, |
|
"loss": 0.5731, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.48731974142217804, |
|
"grad_norm": 0.8313778042793274, |
|
"learning_rate": 1.4394046763912122e-05, |
|
"loss": 0.6025, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.48856290402784686, |
|
"grad_norm": 1.1668204069137573, |
|
"learning_rate": 1.413921723771832e-05, |
|
"loss": 0.6046, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.48980606663351567, |
|
"grad_norm": 1.5604908466339111, |
|
"learning_rate": 1.3886375633887665e-05, |
|
"loss": 0.5574, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.4910492292391845, |
|
"grad_norm": 1.4268012046813965, |
|
"learning_rate": 1.3635532345748137e-05, |
|
"loss": 0.5466, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.4922923918448533, |
|
"grad_norm": 1.3794260025024414, |
|
"learning_rate": 1.3386697684484853e-05, |
|
"loss": 0.649, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.4935355544505221, |
|
"grad_norm": 1.2648028135299683, |
|
"learning_rate": 1.3139881878716107e-05, |
|
"loss": 0.5955, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.494778717056191, |
|
"grad_norm": 1.508892297744751, |
|
"learning_rate": 1.2895095074072986e-05, |
|
"loss": 0.5831, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.4960218796618598, |
|
"grad_norm": 1.3273192644119263, |
|
"learning_rate": 1.2652347332782227e-05, |
|
"loss": 0.5269, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.4972650422675286, |
|
"grad_norm": 1.6496832370758057, |
|
"learning_rate": 1.2411648633252719e-05, |
|
"loss": 0.59, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4985082048731974, |
|
"grad_norm": 1.8715214729309082, |
|
"learning_rate": 1.2173008869665241e-05, |
|
"loss": 0.6478, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.4997513674788662, |
|
"grad_norm": 1.4751086235046387, |
|
"learning_rate": 1.1936437851565791e-05, |
|
"loss": 0.5649, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.5009945300845351, |
|
"grad_norm": 1.407470464706421, |
|
"learning_rate": 1.1701945303462337e-05, |
|
"loss": 0.6002, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.5022376926902039, |
|
"grad_norm": 1.1696547269821167, |
|
"learning_rate": 1.146954086442508e-05, |
|
"loss": 0.5095, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.5034808552958727, |
|
"grad_norm": 2.332303285598755, |
|
"learning_rate": 1.1239234087690252e-05, |
|
"loss": 0.5586, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.5034808552958727, |
|
"eval_loss": 0.7748836278915405, |
|
"eval_runtime": 54.7049, |
|
"eval_samples_per_second": 9.14, |
|
"eval_steps_per_second": 9.14, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.5047240179015415, |
|
"grad_norm": 1.2329920530319214, |
|
"learning_rate": 1.1011034440267395e-05, |
|
"loss": 0.4973, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.5059671805072103, |
|
"grad_norm": 1.2258820533752441, |
|
"learning_rate": 1.078495130255023e-05, |
|
"loss": 0.5507, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.5072103431128792, |
|
"grad_norm": 1.7650001049041748, |
|
"learning_rate": 1.0560993967931004e-05, |
|
"loss": 0.6073, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.508453505718548, |
|
"grad_norm": 1.2635862827301025, |
|
"learning_rate": 1.0339171642418585e-05, |
|
"loss": 0.5028, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.5096966683242168, |
|
"grad_norm": 1.1583187580108643, |
|
"learning_rate": 1.0119493444259963e-05, |
|
"loss": 0.5461, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.5109398309298856, |
|
"grad_norm": 1.1466636657714844, |
|
"learning_rate": 9.901968403565428e-06, |
|
"loss": 0.5874, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.5121829935355544, |
|
"grad_norm": 1.3626190423965454, |
|
"learning_rate": 9.686605461937441e-06, |
|
"loss": 0.5841, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.5134261561412232, |
|
"grad_norm": 1.1523593664169312, |
|
"learning_rate": 9.473413472102982e-06, |
|
"loss": 0.5545, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.5146693187468921, |
|
"grad_norm": 4.49442195892334, |
|
"learning_rate": 9.262401197549744e-06, |
|
"loss": 0.6379, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.515912481352561, |
|
"grad_norm": 1.3444253206253052, |
|
"learning_rate": 9.05357731216587e-06, |
|
"loss": 0.5708, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.5171556439582298, |
|
"grad_norm": 1.5777453184127808, |
|
"learning_rate": 8.846950399883368e-06, |
|
"loss": 0.5973, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.5183988065638986, |
|
"grad_norm": 1.251243233680725, |
|
"learning_rate": 8.64252895432531e-06, |
|
"loss": 0.5389, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.5196419691695674, |
|
"grad_norm": 2.3069584369659424, |
|
"learning_rate": 8.440321378456656e-06, |
|
"loss": 0.5431, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.5208851317752362, |
|
"grad_norm": 1.4895597696304321, |
|
"learning_rate": 8.240335984238844e-06, |
|
"loss": 0.5687, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.522128294380905, |
|
"grad_norm": 1.2962337732315063, |
|
"learning_rate": 8.042580992288163e-06, |
|
"loss": 0.5778, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.522128294380905, |
|
"eval_loss": 0.7723983526229858, |
|
"eval_runtime": 56.0371, |
|
"eval_samples_per_second": 8.923, |
|
"eval_steps_per_second": 8.923, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5233714569865738, |
|
"grad_norm": 1.3925156593322754, |
|
"learning_rate": 7.847064531537774e-06, |
|
"loss": 0.5649, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.5246146195922426, |
|
"grad_norm": 1.3647059202194214, |
|
"learning_rate": 7.653794638903574e-06, |
|
"loss": 0.5655, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.5258577821979115, |
|
"grad_norm": 2.4528896808624268, |
|
"learning_rate": 7.462779258953875e-06, |
|
"loss": 0.528, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.5271009448035803, |
|
"grad_norm": 0.9853927493095398, |
|
"learning_rate": 7.274026243582796e-06, |
|
"loss": 0.4958, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.5283441074092491, |
|
"grad_norm": 1.48464035987854, |
|
"learning_rate": 7.087543351687493e-06, |
|
"loss": 0.4987, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.5295872700149179, |
|
"grad_norm": 1.827588677406311, |
|
"learning_rate": 6.903338248849269e-06, |
|
"loss": 0.6239, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.5308304326205868, |
|
"grad_norm": 1.2086472511291504, |
|
"learning_rate": 6.7214185070183925e-06, |
|
"loss": 0.5193, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.5320735952262556, |
|
"grad_norm": 1.1530331373214722, |
|
"learning_rate": 6.541791604202936e-06, |
|
"loss": 0.6089, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.5333167578319244, |
|
"grad_norm": 1.198806881904602, |
|
"learning_rate": 6.364464924161311e-06, |
|
"loss": 0.5195, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.5345599204375933, |
|
"grad_norm": 1.753954291343689, |
|
"learning_rate": 6.1894457560988106e-06, |
|
"loss": 0.624, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.5358030830432621, |
|
"grad_norm": 1.1597492694854736, |
|
"learning_rate": 6.016741294367911e-06, |
|
"loss": 0.479, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.5370462456489309, |
|
"grad_norm": 1.3531755208969116, |
|
"learning_rate": 5.846358638172615e-06, |
|
"loss": 0.5792, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.5382894082545997, |
|
"grad_norm": 1.2098314762115479, |
|
"learning_rate": 5.678304791276567e-06, |
|
"loss": 0.5247, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.5395325708602685, |
|
"grad_norm": 1.73563551902771, |
|
"learning_rate": 5.51258666171519e-06, |
|
"loss": 0.4989, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.5407757334659373, |
|
"grad_norm": 1.3523006439208984, |
|
"learning_rate": 5.349211061511726e-06, |
|
"loss": 0.6063, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.5407757334659373, |
|
"eval_loss": 0.7707881927490234, |
|
"eval_runtime": 57.1975, |
|
"eval_samples_per_second": 8.742, |
|
"eval_steps_per_second": 8.742, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.5420188960716061, |
|
"grad_norm": 1.6083730459213257, |
|
"learning_rate": 5.188184706397182e-06, |
|
"loss": 0.6078, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.543262058677275, |
|
"grad_norm": 1.7973166704177856, |
|
"learning_rate": 5.029514215534339e-06, |
|
"loss": 0.5657, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.5445052212829438, |
|
"grad_norm": 1.397307276725769, |
|
"learning_rate": 4.873206111245594e-06, |
|
"loss": 0.5861, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.5457483838886126, |
|
"grad_norm": 1.766788363456726, |
|
"learning_rate": 4.719266818744912e-06, |
|
"loss": 0.5335, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.5469915464942815, |
|
"grad_norm": 2.368110418319702, |
|
"learning_rate": 4.567702665873648e-06, |
|
"loss": 0.6134, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5482347090999503, |
|
"grad_norm": 1.4024748802185059, |
|
"learning_rate": 4.418519882840505e-06, |
|
"loss": 0.5903, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.5494778717056191, |
|
"grad_norm": 1.45235013961792, |
|
"learning_rate": 4.271724601965371e-06, |
|
"loss": 0.6008, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.5507210343112879, |
|
"grad_norm": 1.5979630947113037, |
|
"learning_rate": 4.127322857427306e-06, |
|
"loss": 0.5718, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.5519641969169568, |
|
"grad_norm": 1.427748203277588, |
|
"learning_rate": 3.985320585016425e-06, |
|
"loss": 0.5916, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.5532073595226256, |
|
"grad_norm": 1.756362795829773, |
|
"learning_rate": 3.845723621889973e-06, |
|
"loss": 0.5969, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.5544505221282944, |
|
"grad_norm": 1.447805404663086, |
|
"learning_rate": 3.7085377063323447e-06, |
|
"loss": 0.5322, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.5556936847339632, |
|
"grad_norm": 1.3792946338653564, |
|
"learning_rate": 3.5737684775191887e-06, |
|
"loss": 0.5492, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.556936847339632, |
|
"grad_norm": 1.699859619140625, |
|
"learning_rate": 3.441421475285679e-06, |
|
"loss": 0.5606, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.5581800099453008, |
|
"grad_norm": 1.4851022958755493, |
|
"learning_rate": 3.3115021398986768e-06, |
|
"loss": 0.6094, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.5594231725509696, |
|
"grad_norm": 0.9180851578712463, |
|
"learning_rate": 3.18401581183321e-06, |
|
"loss": 0.5229, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5594231725509696, |
|
"eval_loss": 0.7698713541030884, |
|
"eval_runtime": 55.7444, |
|
"eval_samples_per_second": 8.97, |
|
"eval_steps_per_second": 8.97, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5606663351566384, |
|
"grad_norm": 1.226762056350708, |
|
"learning_rate": 3.0589677315529044e-06, |
|
"loss": 0.5444, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.5619094977623074, |
|
"grad_norm": 1.1591328382492065, |
|
"learning_rate": 2.9363630392945513e-06, |
|
"loss": 0.6316, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.5631526603679762, |
|
"grad_norm": 1.0757044553756714, |
|
"learning_rate": 2.816206774856854e-06, |
|
"loss": 0.4764, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.564395822973645, |
|
"grad_norm": 1.269648551940918, |
|
"learning_rate": 2.6985038773932046e-06, |
|
"loss": 0.5399, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.5656389855793138, |
|
"grad_norm": 1.5957112312316895, |
|
"learning_rate": 2.583259185208714e-06, |
|
"loss": 0.5419, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.5668821481849826, |
|
"grad_norm": 1.3144357204437256, |
|
"learning_rate": 2.4704774355612943e-06, |
|
"loss": 0.527, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.5681253107906514, |
|
"grad_norm": 1.456000804901123, |
|
"learning_rate": 2.3601632644669536e-06, |
|
"loss": 0.5787, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.5693684733963202, |
|
"grad_norm": 1.3255828619003296, |
|
"learning_rate": 2.2523212065091723e-06, |
|
"loss": 0.5483, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.570611636001989, |
|
"grad_norm": 1.271060585975647, |
|
"learning_rate": 2.1469556946525706e-06, |
|
"loss": 0.5318, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.5718547986076579, |
|
"grad_norm": 1.491672396659851, |
|
"learning_rate": 2.0440710600606595e-06, |
|
"loss": 0.5108, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5730979612133267, |
|
"grad_norm": 1.744455337524414, |
|
"learning_rate": 1.9436715319177956e-06, |
|
"loss": 0.6084, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.5743411238189955, |
|
"grad_norm": 1.216773271560669, |
|
"learning_rate": 1.8457612372553348e-06, |
|
"loss": 0.5647, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.5755842864246643, |
|
"grad_norm": 1.0653576850891113, |
|
"learning_rate": 1.75034420078201e-06, |
|
"loss": 0.5191, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.5768274490303331, |
|
"grad_norm": 1.4697630405426025, |
|
"learning_rate": 1.6574243447184597e-06, |
|
"loss": 0.5554, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.578070611636002, |
|
"grad_norm": 1.4933724403381348, |
|
"learning_rate": 1.567005488636024e-06, |
|
"loss": 0.5825, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.578070611636002, |
|
"eval_loss": 0.7693516612052917, |
|
"eval_runtime": 55.3708, |
|
"eval_samples_per_second": 9.03, |
|
"eval_steps_per_second": 9.03, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.5793137742416709, |
|
"grad_norm": 1.5786610841751099, |
|
"learning_rate": 1.4790913492997438e-06, |
|
"loss": 0.5274, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.5805569368473397, |
|
"grad_norm": 1.6399073600769043, |
|
"learning_rate": 1.3936855405155408e-06, |
|
"loss": 0.6191, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.5818000994530085, |
|
"grad_norm": 2.071617841720581, |
|
"learning_rate": 1.3107915729816954e-06, |
|
"loss": 0.4883, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.5830432620586773, |
|
"grad_norm": 1.687066912651062, |
|
"learning_rate": 1.230412854144547e-06, |
|
"loss": 0.5191, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.5842864246643461, |
|
"grad_norm": 1.2871911525726318, |
|
"learning_rate": 1.15255268805841e-06, |
|
"loss": 0.5855, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5855295872700149, |
|
"grad_norm": 1.2647926807403564, |
|
"learning_rate": 1.0772142752497604e-06, |
|
"loss": 0.5283, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.5867727498756837, |
|
"grad_norm": 1.6415475606918335, |
|
"learning_rate": 1.004400712585646e-06, |
|
"loss": 0.6313, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.5880159124813525, |
|
"grad_norm": 1.8416297435760498, |
|
"learning_rate": 9.341149931464537e-07, |
|
"loss": 0.5707, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.5892590750870214, |
|
"grad_norm": 1.3439534902572632, |
|
"learning_rate": 8.663600061028162e-07, |
|
"loss": 0.5725, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.5905022376926902, |
|
"grad_norm": 1.181318759918213, |
|
"learning_rate": 8.011385365968641e-07, |
|
"loss": 0.537, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.591745400298359, |
|
"grad_norm": 1.5585083961486816, |
|
"learning_rate": 7.384532656277698e-07, |
|
"loss": 0.5584, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.5929885629040278, |
|
"grad_norm": 1.2982670068740845, |
|
"learning_rate": 6.783067699414891e-07, |
|
"loss": 0.5202, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.5942317255096967, |
|
"grad_norm": 1.5101556777954102, |
|
"learning_rate": 6.207015219248866e-07, |
|
"loss": 0.5231, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.5954748881153655, |
|
"grad_norm": 1.1590073108673096, |
|
"learning_rate": 5.656398895040813e-07, |
|
"loss": 0.5544, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.5967180507210343, |
|
"grad_norm": 1.7406432628631592, |
|
"learning_rate": 5.131241360471217e-07, |
|
"loss": 0.4726, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5967180507210343, |
|
"eval_loss": 0.7685801386833191, |
|
"eval_runtime": 56.8319, |
|
"eval_samples_per_second": 8.798, |
|
"eval_steps_per_second": 8.798, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5979612133267032, |
|
"grad_norm": 1.2891184091567993, |
|
"learning_rate": 4.631564202709354e-07, |
|
"loss": 0.5094, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.599204375932372, |
|
"grad_norm": 1.6769993305206299, |
|
"learning_rate": 4.1573879615262185e-07, |
|
"loss": 0.5776, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.6004475385380408, |
|
"grad_norm": 1.0175931453704834, |
|
"learning_rate": 3.708732128449785e-07, |
|
"loss": 0.5417, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.6016907011437096, |
|
"grad_norm": 1.0494272708892822, |
|
"learning_rate": 3.2856151459641216e-07, |
|
"loss": 0.5291, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.6029338637493784, |
|
"grad_norm": 0.9337966442108154, |
|
"learning_rate": 2.888054406751106e-07, |
|
"loss": 0.4692, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.6041770263550472, |
|
"grad_norm": 2.697531223297119, |
|
"learning_rate": 2.5160662529755823e-07, |
|
"loss": 0.588, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.605420188960716, |
|
"grad_norm": 1.2500678300857544, |
|
"learning_rate": 2.169665975613605e-07, |
|
"loss": 0.5474, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.6066633515663848, |
|
"grad_norm": 1.5010889768600464, |
|
"learning_rate": 1.8488678138238456e-07, |
|
"loss": 0.5865, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.6079065141720537, |
|
"grad_norm": 1.6108850240707397, |
|
"learning_rate": 1.5536849543621584e-07, |
|
"loss": 0.583, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.6091496767777225, |
|
"grad_norm": 1.4152370691299438, |
|
"learning_rate": 1.2841295310397905e-07, |
|
"loss": 0.5705, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.6103928393833914, |
|
"grad_norm": 1.315041184425354, |
|
"learning_rate": 1.0402126242244764e-07, |
|
"loss": 0.566, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.6116360019890602, |
|
"grad_norm": 1.2925786972045898, |
|
"learning_rate": 8.219442603847605e-08, |
|
"loss": 0.5201, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.612879164594729, |
|
"grad_norm": 1.1554069519042969, |
|
"learning_rate": 6.293334116783817e-08, |
|
"loss": 0.6017, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.6141223272003978, |
|
"grad_norm": 1.682599663734436, |
|
"learning_rate": 4.623879955827082e-08, |
|
"loss": 0.5932, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.6153654898060666, |
|
"grad_norm": 1.2782875299453735, |
|
"learning_rate": 3.211148745700665e-08, |
|
"loss": 0.5791, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.6153654898060666, |
|
"eval_loss": 0.768585741519928, |
|
"eval_runtime": 55.9999, |
|
"eval_samples_per_second": 8.929, |
|
"eval_steps_per_second": 8.929, |
|
"step": 4950 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.724336235597312e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|