{ "best_metric": 0.8012369099843738, "best_model_checkpoint": "/data/hungnm/unisentiment/modernBERT-base-sentiment/checkpoint-4611", "epoch": 5.0, "eval_steps": 500, "global_step": 7685, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032530904359141183, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.4862, "step": 5 }, { "epoch": 0.006506180871828237, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.4619, "step": 10 }, { "epoch": 0.009759271307742356, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.474, "step": 15 }, { "epoch": 0.013012361743656473, "grad_norm": 5.975010395050049, "learning_rate": 2.5974025974025976e-06, "loss": 2.4748, "step": 20 }, { "epoch": 0.01626545217957059, "grad_norm": 4.729438781738281, "learning_rate": 5.194805194805195e-06, "loss": 2.4383, "step": 25 }, { "epoch": 0.01951854261548471, "grad_norm": 4.140679359436035, "learning_rate": 8.441558441558442e-06, "loss": 2.2384, "step": 30 }, { "epoch": 0.02277163305139883, "grad_norm": 2.7495357990264893, "learning_rate": 1.1688311688311688e-05, "loss": 2.16, "step": 35 }, { "epoch": 0.026024723487312947, "grad_norm": 1.4239497184753418, "learning_rate": 1.4935064935064936e-05, "loss": 2.0898, "step": 40 }, { "epoch": 0.029277813923227064, "grad_norm": 1.3778964281082153, "learning_rate": 1.8181818181818182e-05, "loss": 2.037, "step": 45 }, { "epoch": 0.03253090435914118, "grad_norm": 1.6160250902175903, "learning_rate": 2.1428571428571428e-05, "loss": 2.0056, "step": 50 }, { "epoch": 0.035783994795055306, "grad_norm": 1.090104579925537, "learning_rate": 2.4675324675324678e-05, "loss": 1.9513, "step": 55 }, { "epoch": 0.03903708523096942, "grad_norm": 2.1062819957733154, "learning_rate": 2.792207792207792e-05, "loss": 1.9023, "step": 60 }, { "epoch": 0.04229017566688354, "grad_norm": 3.310304880142212, "learning_rate": 3.1168831168831166e-05, "loss": 1.877, "step": 65 }, { "epoch": 0.04554326610279766, "grad_norm": 5.446138858795166, "learning_rate": 3.4415584415584416e-05, "loss": 1.822, "step": 70 }, { "epoch": 0.048796356538711776, "grad_norm": 1.910844087600708, "learning_rate": 3.7662337662337665e-05, "loss": 1.7707, "step": 75 }, { "epoch": 0.05204944697462589, "grad_norm": 5.207052707672119, "learning_rate": 4.0909090909090915e-05, "loss": 1.7986, "step": 80 }, { "epoch": 0.05530253741054001, "grad_norm": 4.687050819396973, "learning_rate": 4.415584415584416e-05, "loss": 1.7189, "step": 85 }, { "epoch": 0.05855562784645413, "grad_norm": 4.655097961425781, "learning_rate": 4.740259740259741e-05, "loss": 1.7185, "step": 90 }, { "epoch": 0.06180871828236825, "grad_norm": 5.834875106811523, "learning_rate": 4.999999786858144e-05, "loss": 1.6804, "step": 95 }, { "epoch": 0.06506180871828236, "grad_norm": 2.986246109008789, "learning_rate": 4.99999232689698e-05, "loss": 1.6772, "step": 100 }, { "epoch": 0.06831489915419649, "grad_norm": 1.4194883108139038, "learning_rate": 4.999974209879331e-05, "loss": 1.602, "step": 105 }, { "epoch": 0.07156798959011061, "grad_norm": 3.983574628829956, "learning_rate": 4.999945435882428e-05, "loss": 1.5656, "step": 110 }, { "epoch": 0.07482108002602472, "grad_norm": 1.342112421989441, "learning_rate": 4.9999060050289286e-05, "loss": 1.511, "step": 115 }, { "epoch": 0.07807417046193885, "grad_norm": 2.197117805480957, "learning_rate": 4.999855917486921e-05, "loss": 1.4768, "step": 120 }, { "epoch": 0.08132726089785296, "grad_norm": 1.8786858320236206, "learning_rate": 4.999795173469919e-05, "loss": 
1.473, "step": 125 }, { "epoch": 0.08458035133376708, "grad_norm": 2.5618531703948975, "learning_rate": 4.9997237732368645e-05, "loss": 1.4527, "step": 130 }, { "epoch": 0.08783344176968119, "grad_norm": 1.8612209558486938, "learning_rate": 4.999641717092126e-05, "loss": 1.4092, "step": 135 }, { "epoch": 0.09108653220559532, "grad_norm": 1.912489891052246, "learning_rate": 4.999549005385494e-05, "loss": 1.3939, "step": 140 }, { "epoch": 0.09433962264150944, "grad_norm": 2.8550467491149902, "learning_rate": 4.999445638512185e-05, "loss": 1.3562, "step": 145 }, { "epoch": 0.09759271307742355, "grad_norm": 1.902714729309082, "learning_rate": 4.9993316169128334e-05, "loss": 1.3427, "step": 150 }, { "epoch": 0.10084580351333768, "grad_norm": 3.12044620513916, "learning_rate": 4.999206941073496e-05, "loss": 1.3634, "step": 155 }, { "epoch": 0.10409889394925179, "grad_norm": 2.6095197200775146, "learning_rate": 4.999071611525643e-05, "loss": 1.3605, "step": 160 }, { "epoch": 0.10735198438516591, "grad_norm": 2.5530121326446533, "learning_rate": 4.998925628846164e-05, "loss": 1.3444, "step": 165 }, { "epoch": 0.11060507482108002, "grad_norm": 1.9909695386886597, "learning_rate": 4.99876899365736e-05, "loss": 1.3192, "step": 170 }, { "epoch": 0.11385816525699415, "grad_norm": 1.21974515914917, "learning_rate": 4.998601706626938e-05, "loss": 1.3085, "step": 175 }, { "epoch": 0.11711125569290826, "grad_norm": 1.2985081672668457, "learning_rate": 4.9984237684680194e-05, "loss": 1.2848, "step": 180 }, { "epoch": 0.12036434612882238, "grad_norm": 2.141941785812378, "learning_rate": 4.998235179939122e-05, "loss": 1.2729, "step": 185 }, { "epoch": 0.1236174365647365, "grad_norm": 1.9323813915252686, "learning_rate": 4.998035941844167e-05, "loss": 1.275, "step": 190 }, { "epoch": 0.12687052700065063, "grad_norm": 2.6978371143341064, "learning_rate": 4.997826055032476e-05, "loss": 1.2825, "step": 195 }, { "epoch": 0.13012361743656473, "grad_norm": 2.018090009689331, "learning_rate": 4.997605520398762e-05, "loss": 1.2656, "step": 200 }, { "epoch": 0.13337670787247885, "grad_norm": 1.0469837188720703, "learning_rate": 4.997374338883127e-05, "loss": 1.2584, "step": 205 }, { "epoch": 0.13662979830839297, "grad_norm": 1.2959955930709839, "learning_rate": 4.99713251147106e-05, "loss": 1.2494, "step": 210 }, { "epoch": 0.1398828887443071, "grad_norm": 2.215878486633301, "learning_rate": 4.996880039193431e-05, "loss": 1.2482, "step": 215 }, { "epoch": 0.14313597918022122, "grad_norm": 1.711484432220459, "learning_rate": 4.996616923126488e-05, "loss": 1.2258, "step": 220 }, { "epoch": 0.14638906961613532, "grad_norm": 1.5809857845306396, "learning_rate": 4.996343164391853e-05, "loss": 1.223, "step": 225 }, { "epoch": 0.14964216005204944, "grad_norm": 1.6745812892913818, "learning_rate": 4.9960587641565125e-05, "loss": 1.2151, "step": 230 }, { "epoch": 0.15289525048796357, "grad_norm": 1.5372675657272339, "learning_rate": 4.9957637236328195e-05, "loss": 1.1983, "step": 235 }, { "epoch": 0.1561483409238777, "grad_norm": 1.5290815830230713, "learning_rate": 4.995458044078482e-05, "loss": 1.24, "step": 240 }, { "epoch": 0.1594014313597918, "grad_norm": 1.4023972749710083, "learning_rate": 4.9951417267965626e-05, "loss": 1.1897, "step": 245 }, { "epoch": 0.16265452179570591, "grad_norm": 1.8283660411834717, "learning_rate": 4.99481477313547e-05, "loss": 1.2029, "step": 250 }, { "epoch": 0.16590761223162004, "grad_norm": 1.8741523027420044, "learning_rate": 4.9944771844889524e-05, "loss": 1.19, "step": 255 }, { "epoch": 
0.16916070266753416, "grad_norm": 1.552556037902832, "learning_rate": 4.994128962296097e-05, "loss": 1.1946, "step": 260 }, { "epoch": 0.1724137931034483, "grad_norm": 2.1094107627868652, "learning_rate": 4.9937701080413165e-05, "loss": 1.1756, "step": 265 }, { "epoch": 0.17566688353936238, "grad_norm": 1.7123149633407593, "learning_rate": 4.993400623254347e-05, "loss": 1.1789, "step": 270 }, { "epoch": 0.1789199739752765, "grad_norm": 1.2891788482666016, "learning_rate": 4.993020509510243e-05, "loss": 1.1833, "step": 275 }, { "epoch": 0.18217306441119063, "grad_norm": 1.2659103870391846, "learning_rate": 4.992629768429367e-05, "loss": 1.1697, "step": 280 }, { "epoch": 0.18542615484710476, "grad_norm": 1.602931022644043, "learning_rate": 4.992228401677382e-05, "loss": 1.16, "step": 285 }, { "epoch": 0.18867924528301888, "grad_norm": 1.1984357833862305, "learning_rate": 4.99181641096525e-05, "loss": 1.1415, "step": 290 }, { "epoch": 0.19193233571893298, "grad_norm": 2.036529302597046, "learning_rate": 4.991393798049219e-05, "loss": 1.168, "step": 295 }, { "epoch": 0.1951854261548471, "grad_norm": 1.9513144493103027, "learning_rate": 4.990960564730819e-05, "loss": 1.1623, "step": 300 }, { "epoch": 0.19843851659076123, "grad_norm": 1.2966268062591553, "learning_rate": 4.9905167128568516e-05, "loss": 1.143, "step": 305 }, { "epoch": 0.20169160702667535, "grad_norm": 1.3897426128387451, "learning_rate": 4.990062244319387e-05, "loss": 1.1431, "step": 310 }, { "epoch": 0.20494469746258945, "grad_norm": 1.7485623359680176, "learning_rate": 4.989597161055746e-05, "loss": 1.1507, "step": 315 }, { "epoch": 0.20819778789850357, "grad_norm": 1.1369644403457642, "learning_rate": 4.989121465048505e-05, "loss": 1.1447, "step": 320 }, { "epoch": 0.2114508783344177, "grad_norm": 1.292037844657898, "learning_rate": 4.988635158325476e-05, "loss": 1.1289, "step": 325 }, { "epoch": 0.21470396877033182, "grad_norm": 1.1460140943527222, "learning_rate": 4.988138242959707e-05, "loss": 1.1314, "step": 330 }, { "epoch": 0.21795705920624595, "grad_norm": 1.9661816358566284, "learning_rate": 4.987630721069465e-05, "loss": 1.147, "step": 335 }, { "epoch": 0.22121014964216004, "grad_norm": 1.3988662958145142, "learning_rate": 4.987112594818232e-05, "loss": 1.1443, "step": 340 }, { "epoch": 0.22446324007807417, "grad_norm": 1.6520105600357056, "learning_rate": 4.986583866414696e-05, "loss": 1.1089, "step": 345 }, { "epoch": 0.2277163305139883, "grad_norm": 1.6153268814086914, "learning_rate": 4.9860445381127385e-05, "loss": 1.1279, "step": 350 }, { "epoch": 0.23096942094990242, "grad_norm": 1.0572576522827148, "learning_rate": 4.985494612211429e-05, "loss": 1.1073, "step": 355 }, { "epoch": 0.2342225113858165, "grad_norm": 1.1980561017990112, "learning_rate": 4.984934091055009e-05, "loss": 1.1161, "step": 360 }, { "epoch": 0.23747560182173064, "grad_norm": 3.1612489223480225, "learning_rate": 4.98436297703289e-05, "loss": 1.1473, "step": 365 }, { "epoch": 0.24072869225764476, "grad_norm": 1.7351305484771729, "learning_rate": 4.983781272579636e-05, "loss": 1.1282, "step": 370 }, { "epoch": 0.24398178269355889, "grad_norm": 1.4272353649139404, "learning_rate": 4.983188980174958e-05, "loss": 1.1486, "step": 375 }, { "epoch": 0.247234873129473, "grad_norm": 1.6868839263916016, "learning_rate": 4.9825861023437016e-05, "loss": 1.1224, "step": 380 }, { "epoch": 0.2504879635653871, "grad_norm": 1.1032485961914062, "learning_rate": 4.981972641655835e-05, "loss": 1.1186, "step": 385 }, { "epoch": 0.25374105400130126, "grad_norm": 
1.0825129747390747, "learning_rate": 4.981348600726441e-05, "loss": 1.093, "step": 390 }, { "epoch": 0.25699414443721535, "grad_norm": 1.0156402587890625, "learning_rate": 4.980713982215703e-05, "loss": 1.0873, "step": 395 }, { "epoch": 0.26024723487312945, "grad_norm": 2.106105089187622, "learning_rate": 4.9800687888288964e-05, "loss": 1.0924, "step": 400 }, { "epoch": 0.2635003253090436, "grad_norm": 1.6301723718643188, "learning_rate": 4.9794130233163735e-05, "loss": 1.1063, "step": 405 }, { "epoch": 0.2667534157449577, "grad_norm": 1.30489981174469, "learning_rate": 4.978746688473556e-05, "loss": 1.0993, "step": 410 }, { "epoch": 0.27000650618087185, "grad_norm": 1.1064469814300537, "learning_rate": 4.978069787140919e-05, "loss": 1.093, "step": 415 }, { "epoch": 0.27325959661678595, "grad_norm": 1.1742445230484009, "learning_rate": 4.977382322203982e-05, "loss": 1.0848, "step": 420 }, { "epoch": 0.27651268705270005, "grad_norm": 1.0716508626937866, "learning_rate": 4.976684296593295e-05, "loss": 1.1157, "step": 425 }, { "epoch": 0.2797657774886142, "grad_norm": 1.4256720542907715, "learning_rate": 4.9759757132844256e-05, "loss": 1.0835, "step": 430 }, { "epoch": 0.2830188679245283, "grad_norm": 1.2922230958938599, "learning_rate": 4.975256575297949e-05, "loss": 1.0804, "step": 435 }, { "epoch": 0.28627195836044245, "grad_norm": 1.5222572088241577, "learning_rate": 4.974526885699432e-05, "loss": 1.077, "step": 440 }, { "epoch": 0.28952504879635654, "grad_norm": 1.023868441581726, "learning_rate": 4.973786647599422e-05, "loss": 1.0782, "step": 445 }, { "epoch": 0.29277813923227064, "grad_norm": 1.7092077732086182, "learning_rate": 4.9730358641534324e-05, "loss": 1.1011, "step": 450 }, { "epoch": 0.2960312296681848, "grad_norm": 1.0816203355789185, "learning_rate": 4.9722745385619285e-05, "loss": 1.0857, "step": 455 }, { "epoch": 0.2992843201040989, "grad_norm": 0.9598567485809326, "learning_rate": 4.971502674070317e-05, "loss": 1.0874, "step": 460 }, { "epoch": 0.302537410540013, "grad_norm": 1.1397418975830078, "learning_rate": 4.970720273968929e-05, "loss": 1.0743, "step": 465 }, { "epoch": 0.30579050097592714, "grad_norm": 1.6813876628875732, "learning_rate": 4.969927341593008e-05, "loss": 1.0587, "step": 470 }, { "epoch": 0.30904359141184123, "grad_norm": 1.4590063095092773, "learning_rate": 4.9691238803226944e-05, "loss": 1.0706, "step": 475 }, { "epoch": 0.3122966818477554, "grad_norm": 0.988750696182251, "learning_rate": 4.9683098935830115e-05, "loss": 1.0569, "step": 480 }, { "epoch": 0.3155497722836695, "grad_norm": 1.0971347093582153, "learning_rate": 4.9674853848438506e-05, "loss": 1.0441, "step": 485 }, { "epoch": 0.3188028627195836, "grad_norm": 1.0693708658218384, "learning_rate": 4.9666503576199574e-05, "loss": 1.0644, "step": 490 }, { "epoch": 0.32205595315549773, "grad_norm": 1.2514370679855347, "learning_rate": 4.965804815470916e-05, "loss": 1.0609, "step": 495 }, { "epoch": 0.32530904359141183, "grad_norm": 1.5080784559249878, "learning_rate": 4.964948762001133e-05, "loss": 1.0682, "step": 500 }, { "epoch": 0.328562134027326, "grad_norm": 1.1908406019210815, "learning_rate": 4.964082200859824e-05, "loss": 1.0418, "step": 505 }, { "epoch": 0.3318152244632401, "grad_norm": 1.6586133241653442, "learning_rate": 4.963205135740997e-05, "loss": 1.0668, "step": 510 }, { "epoch": 0.3350683148991542, "grad_norm": 0.7452509999275208, "learning_rate": 4.962317570383436e-05, "loss": 1.0508, "step": 515 }, { "epoch": 0.3383214053350683, "grad_norm": 1.3133275508880615, 
"learning_rate": 4.961419508570686e-05, "loss": 1.0543, "step": 520 }, { "epoch": 0.3415744957709824, "grad_norm": 1.1373653411865234, "learning_rate": 4.960510954131038e-05, "loss": 1.0711, "step": 525 }, { "epoch": 0.3448275862068966, "grad_norm": 1.12503981590271, "learning_rate": 4.95959191093751e-05, "loss": 1.0486, "step": 530 }, { "epoch": 0.34808067664281067, "grad_norm": 0.921503484249115, "learning_rate": 4.95866238290783e-05, "loss": 1.0543, "step": 535 }, { "epoch": 0.35133376707872477, "grad_norm": 0.9198605418205261, "learning_rate": 4.957722374004427e-05, "loss": 1.0438, "step": 540 }, { "epoch": 0.3545868575146389, "grad_norm": 1.630878210067749, "learning_rate": 4.9567718882344015e-05, "loss": 1.0544, "step": 545 }, { "epoch": 0.357839947950553, "grad_norm": 2.2188167572021484, "learning_rate": 4.95581092964952e-05, "loss": 1.0541, "step": 550 }, { "epoch": 0.36109303838646717, "grad_norm": 0.9371961355209351, "learning_rate": 4.95483950234619e-05, "loss": 1.0723, "step": 555 }, { "epoch": 0.36434612882238127, "grad_norm": 1.0933233499526978, "learning_rate": 4.9538576104654466e-05, "loss": 1.052, "step": 560 }, { "epoch": 0.36759921925829536, "grad_norm": 1.1232990026474, "learning_rate": 4.9528652581929335e-05, "loss": 1.0354, "step": 565 }, { "epoch": 0.3708523096942095, "grad_norm": 1.000786542892456, "learning_rate": 4.951862449758885e-05, "loss": 1.0407, "step": 570 }, { "epoch": 0.3741054001301236, "grad_norm": 0.939582884311676, "learning_rate": 4.9508491894381104e-05, "loss": 1.0206, "step": 575 }, { "epoch": 0.37735849056603776, "grad_norm": 1.264381766319275, "learning_rate": 4.9498254815499694e-05, "loss": 1.0362, "step": 580 }, { "epoch": 0.38061158100195186, "grad_norm": 0.673314094543457, "learning_rate": 4.948791330458363e-05, "loss": 1.0381, "step": 585 }, { "epoch": 0.38386467143786596, "grad_norm": 1.441362738609314, "learning_rate": 4.947746740571706e-05, "loss": 1.0354, "step": 590 }, { "epoch": 0.3871177618737801, "grad_norm": 1.1851030588150024, "learning_rate": 4.9466917163429124e-05, "loss": 1.0146, "step": 595 }, { "epoch": 0.3903708523096942, "grad_norm": 0.9171844124794006, "learning_rate": 4.94562626226938e-05, "loss": 1.0103, "step": 600 }, { "epoch": 0.3936239427456083, "grad_norm": 1.5662965774536133, "learning_rate": 4.944550382892962e-05, "loss": 1.0466, "step": 605 }, { "epoch": 0.39687703318152245, "grad_norm": 1.1077489852905273, "learning_rate": 4.943464082799955e-05, "loss": 1.0458, "step": 610 }, { "epoch": 0.40013012361743655, "grad_norm": 1.5997633934020996, "learning_rate": 4.942367366621081e-05, "loss": 1.0464, "step": 615 }, { "epoch": 0.4033832140533507, "grad_norm": 1.0540611743927002, "learning_rate": 4.9412602390314585e-05, "loss": 1.0242, "step": 620 }, { "epoch": 0.4066363044892648, "grad_norm": 1.1247586011886597, "learning_rate": 4.94014270475059e-05, "loss": 1.0232, "step": 625 }, { "epoch": 0.4098893949251789, "grad_norm": 1.065820336341858, "learning_rate": 4.939014768542342e-05, "loss": 1.0137, "step": 630 }, { "epoch": 0.41314248536109305, "grad_norm": 0.8374763131141663, "learning_rate": 4.93787643521492e-05, "loss": 1.0203, "step": 635 }, { "epoch": 0.41639557579700714, "grad_norm": 0.7515140771865845, "learning_rate": 4.936727709620853e-05, "loss": 1.0176, "step": 640 }, { "epoch": 0.4196486662329213, "grad_norm": 0.8034088015556335, "learning_rate": 4.9355685966569684e-05, "loss": 1.0322, "step": 645 }, { "epoch": 0.4229017566688354, "grad_norm": 1.2314985990524292, "learning_rate": 4.934399101264375e-05, 
"loss": 1.0198, "step": 650 }, { "epoch": 0.4261548471047495, "grad_norm": 1.342058539390564, "learning_rate": 4.93321922842844e-05, "loss": 1.0133, "step": 655 }, { "epoch": 0.42940793754066364, "grad_norm": 0.8881794214248657, "learning_rate": 4.932028983178766e-05, "loss": 1.0255, "step": 660 }, { "epoch": 0.43266102797657774, "grad_norm": 1.3695508241653442, "learning_rate": 4.9308283705891736e-05, "loss": 1.0293, "step": 665 }, { "epoch": 0.4359141184124919, "grad_norm": 0.9350308179855347, "learning_rate": 4.9296173957776776e-05, "loss": 1.03, "step": 670 }, { "epoch": 0.439167208848406, "grad_norm": 0.9181856513023376, "learning_rate": 4.928396063906463e-05, "loss": 1.0234, "step": 675 }, { "epoch": 0.4424202992843201, "grad_norm": 1.352927803993225, "learning_rate": 4.927164380181869e-05, "loss": 1.0474, "step": 680 }, { "epoch": 0.44567338972023424, "grad_norm": 1.176147222518921, "learning_rate": 4.9259223498543597e-05, "loss": 1.0329, "step": 685 }, { "epoch": 0.44892648015614833, "grad_norm": 1.0797678232192993, "learning_rate": 4.9246699782185055e-05, "loss": 1.0141, "step": 690 }, { "epoch": 0.4521795705920625, "grad_norm": 0.9696300029754639, "learning_rate": 4.9234072706129627e-05, "loss": 0.999, "step": 695 }, { "epoch": 0.4554326610279766, "grad_norm": 0.9436845779418945, "learning_rate": 4.922134232420445e-05, "loss": 1.0003, "step": 700 }, { "epoch": 0.4586857514638907, "grad_norm": 1.1857705116271973, "learning_rate": 4.920850869067706e-05, "loss": 0.9831, "step": 705 }, { "epoch": 0.46193884189980483, "grad_norm": 0.9158900380134583, "learning_rate": 4.919557186025512e-05, "loss": 1.0201, "step": 710 }, { "epoch": 0.4651919323357189, "grad_norm": 0.8820152282714844, "learning_rate": 4.9182531888086205e-05, "loss": 0.9852, "step": 715 }, { "epoch": 0.468445022771633, "grad_norm": 1.5595647096633911, "learning_rate": 4.916938882975759e-05, "loss": 1.0002, "step": 720 }, { "epoch": 0.4716981132075472, "grad_norm": 1.1958764791488647, "learning_rate": 4.915614274129597e-05, "loss": 1.0375, "step": 725 }, { "epoch": 0.4749512036434613, "grad_norm": 1.1134103536605835, "learning_rate": 4.914279367916724e-05, "loss": 1.0208, "step": 730 }, { "epoch": 0.4782042940793754, "grad_norm": 0.8463726043701172, "learning_rate": 4.9129341700276266e-05, "loss": 0.9955, "step": 735 }, { "epoch": 0.4814573845152895, "grad_norm": 0.8405961394309998, "learning_rate": 4.911578686196661e-05, "loss": 0.9754, "step": 740 }, { "epoch": 0.4847104749512036, "grad_norm": 1.0310126543045044, "learning_rate": 4.9102129222020324e-05, "loss": 1.0213, "step": 745 }, { "epoch": 0.48796356538711777, "grad_norm": 1.058269739151001, "learning_rate": 4.908836883865768e-05, "loss": 0.9966, "step": 750 }, { "epoch": 0.49121665582303187, "grad_norm": 0.9762022495269775, "learning_rate": 4.907450577053694e-05, "loss": 1.0059, "step": 755 }, { "epoch": 0.494469746258946, "grad_norm": 0.8593292832374573, "learning_rate": 4.906054007675408e-05, "loss": 0.9922, "step": 760 }, { "epoch": 0.4977228366948601, "grad_norm": 1.3241448402404785, "learning_rate": 4.9046471816842565e-05, "loss": 1.007, "step": 765 }, { "epoch": 0.5009759271307742, "grad_norm": 0.9241655468940735, "learning_rate": 4.903230105077306e-05, "loss": 1.0204, "step": 770 }, { "epoch": 0.5042290175666884, "grad_norm": 0.8068680763244629, "learning_rate": 4.9018027838953226e-05, "loss": 0.9932, "step": 775 }, { "epoch": 0.5074821080026025, "grad_norm": 1.2541546821594238, "learning_rate": 4.900365224222742e-05, "loss": 0.9945, "step": 780 }, { 
"epoch": 0.5107351984385166, "grad_norm": 0.925835907459259, "learning_rate": 4.898917432187644e-05, "loss": 0.9745, "step": 785 }, { "epoch": 0.5139882888744307, "grad_norm": 0.7561518549919128, "learning_rate": 4.897459413961729e-05, "loss": 1.0065, "step": 790 }, { "epoch": 0.5172413793103449, "grad_norm": 1.056420922279358, "learning_rate": 4.8959911757602885e-05, "loss": 0.974, "step": 795 }, { "epoch": 0.5204944697462589, "grad_norm": 1.219141960144043, "learning_rate": 4.89451272384218e-05, "loss": 0.9926, "step": 800 }, { "epoch": 0.523747560182173, "grad_norm": 0.9372319579124451, "learning_rate": 4.8930240645098027e-05, "loss": 1.0141, "step": 805 }, { "epoch": 0.5270006506180872, "grad_norm": 1.0118193626403809, "learning_rate": 4.891525204109065e-05, "loss": 0.9996, "step": 810 }, { "epoch": 0.5302537410540012, "grad_norm": 0.91470867395401, "learning_rate": 4.890016149029365e-05, "loss": 0.9851, "step": 815 }, { "epoch": 0.5335068314899154, "grad_norm": 0.787122368812561, "learning_rate": 4.888496905703554e-05, "loss": 0.9969, "step": 820 }, { "epoch": 0.5367599219258296, "grad_norm": 0.8628039956092834, "learning_rate": 4.886967480607918e-05, "loss": 1.0024, "step": 825 }, { "epoch": 0.5400130123617437, "grad_norm": 1.450460433959961, "learning_rate": 4.885427880262144e-05, "loss": 0.9743, "step": 830 }, { "epoch": 0.5432661027976577, "grad_norm": 1.0362318754196167, "learning_rate": 4.883878111229296e-05, "loss": 0.9723, "step": 835 }, { "epoch": 0.5465191932335719, "grad_norm": 0.9855751991271973, "learning_rate": 4.8823181801157844e-05, "loss": 0.9898, "step": 840 }, { "epoch": 0.549772283669486, "grad_norm": 1.0782288312911987, "learning_rate": 4.880748093571339e-05, "loss": 0.9727, "step": 845 }, { "epoch": 0.5530253741054001, "grad_norm": 1.5194872617721558, "learning_rate": 4.879167858288982e-05, "loss": 0.9922, "step": 850 }, { "epoch": 0.5562784645413142, "grad_norm": 1.5501078367233276, "learning_rate": 4.877577481004995e-05, "loss": 0.9705, "step": 855 }, { "epoch": 0.5595315549772284, "grad_norm": 1.5971125364303589, "learning_rate": 4.875976968498895e-05, "loss": 1.0078, "step": 860 }, { "epoch": 0.5627846454131424, "grad_norm": 0.9124265313148499, "learning_rate": 4.874366327593406e-05, "loss": 0.9737, "step": 865 }, { "epoch": 0.5660377358490566, "grad_norm": 0.8439720273017883, "learning_rate": 4.872745565154424e-05, "loss": 0.9967, "step": 870 }, { "epoch": 0.5692908262849707, "grad_norm": 0.9340474009513855, "learning_rate": 4.871114688090992e-05, "loss": 0.9934, "step": 875 }, { "epoch": 0.5725439167208849, "grad_norm": 0.8820469975471497, "learning_rate": 4.869473703355273e-05, "loss": 0.9917, "step": 880 }, { "epoch": 0.5757970071567989, "grad_norm": 0.8724156618118286, "learning_rate": 4.867822617942514e-05, "loss": 0.9762, "step": 885 }, { "epoch": 0.5790500975927131, "grad_norm": 0.9085761308670044, "learning_rate": 4.866161438891022e-05, "loss": 0.9686, "step": 890 }, { "epoch": 0.5823031880286272, "grad_norm": 0.7215405106544495, "learning_rate": 4.864490173282128e-05, "loss": 0.9858, "step": 895 }, { "epoch": 0.5855562784645413, "grad_norm": 1.0854041576385498, "learning_rate": 4.862808828240164e-05, "loss": 0.9935, "step": 900 }, { "epoch": 0.5888093689004554, "grad_norm": 0.8779392242431641, "learning_rate": 4.861117410932429e-05, "loss": 0.9816, "step": 905 }, { "epoch": 0.5920624593363696, "grad_norm": 1.2866002321243286, "learning_rate": 4.8594159285691546e-05, "loss": 0.9818, "step": 910 }, { "epoch": 0.5953155497722836, "grad_norm": 
0.7991343140602112, "learning_rate": 4.8577043884034826e-05, "loss": 0.9592, "step": 915 }, { "epoch": 0.5985686402081978, "grad_norm": 0.9553494453430176, "learning_rate": 4.8559827977314254e-05, "loss": 0.9943, "step": 920 }, { "epoch": 0.6018217306441119, "grad_norm": 1.2053009271621704, "learning_rate": 4.854251163891843e-05, "loss": 0.946, "step": 925 }, { "epoch": 0.605074821080026, "grad_norm": 0.744791567325592, "learning_rate": 4.852509494266405e-05, "loss": 0.9804, "step": 930 }, { "epoch": 0.6083279115159401, "grad_norm": 1.2371433973312378, "learning_rate": 4.850757796279563e-05, "loss": 0.9902, "step": 935 }, { "epoch": 0.6115810019518543, "grad_norm": 0.723250150680542, "learning_rate": 4.8489960773985174e-05, "loss": 0.9839, "step": 940 }, { "epoch": 0.6148340923877684, "grad_norm": 0.7003908753395081, "learning_rate": 4.847224345133188e-05, "loss": 0.9712, "step": 945 }, { "epoch": 0.6180871828236825, "grad_norm": 0.8090314865112305, "learning_rate": 4.845442607036176e-05, "loss": 0.9631, "step": 950 }, { "epoch": 0.6213402732595966, "grad_norm": 0.7971912622451782, "learning_rate": 4.8436508707027384e-05, "loss": 0.9722, "step": 955 }, { "epoch": 0.6245933636955108, "grad_norm": 0.7696447968482971, "learning_rate": 4.841849143770754e-05, "loss": 0.9712, "step": 960 }, { "epoch": 0.6278464541314248, "grad_norm": 0.9497612714767456, "learning_rate": 4.840037433920688e-05, "loss": 0.9653, "step": 965 }, { "epoch": 0.631099544567339, "grad_norm": 1.1326346397399902, "learning_rate": 4.838215748875562e-05, "loss": 0.9648, "step": 970 }, { "epoch": 0.6343526350032531, "grad_norm": 0.8858407139778137, "learning_rate": 4.83638409640092e-05, "loss": 0.9765, "step": 975 }, { "epoch": 0.6376057254391672, "grad_norm": 0.9079559445381165, "learning_rate": 4.834542484304795e-05, "loss": 0.958, "step": 980 }, { "epoch": 0.6408588158750813, "grad_norm": 0.9221760630607605, "learning_rate": 4.8326909204376776e-05, "loss": 0.9675, "step": 985 }, { "epoch": 0.6441119063109955, "grad_norm": 0.8072174787521362, "learning_rate": 4.8308294126924794e-05, "loss": 0.9745, "step": 990 }, { "epoch": 0.6473649967469096, "grad_norm": 0.9354230165481567, "learning_rate": 4.828957969004502e-05, "loss": 0.9581, "step": 995 }, { "epoch": 0.6506180871828237, "grad_norm": 0.8067158460617065, "learning_rate": 4.827076597351403e-05, "loss": 0.9669, "step": 1000 }, { "epoch": 0.6538711776187378, "grad_norm": 1.0591189861297607, "learning_rate": 4.825185305753161e-05, "loss": 0.9682, "step": 1005 }, { "epoch": 0.657124268054652, "grad_norm": 0.7701990604400635, "learning_rate": 4.823284102272041e-05, "loss": 0.9756, "step": 1010 }, { "epoch": 0.660377358490566, "grad_norm": 0.9886049628257751, "learning_rate": 4.82137299501256e-05, "loss": 0.9646, "step": 1015 }, { "epoch": 0.6636304489264802, "grad_norm": 0.966618537902832, "learning_rate": 4.819451992121454e-05, "loss": 0.9673, "step": 1020 }, { "epoch": 0.6668835393623943, "grad_norm": 0.987940788269043, "learning_rate": 4.817521101787646e-05, "loss": 0.9647, "step": 1025 }, { "epoch": 0.6701366297983083, "grad_norm": 0.752627432346344, "learning_rate": 4.815580332242199e-05, "loss": 0.9545, "step": 1030 }, { "epoch": 0.6733897202342225, "grad_norm": 1.0263205766677856, "learning_rate": 4.813629691758299e-05, "loss": 0.9479, "step": 1035 }, { "epoch": 0.6766428106701367, "grad_norm": 0.8434374332427979, "learning_rate": 4.811669188651204e-05, "loss": 0.9747, "step": 1040 }, { "epoch": 0.6798959011060507, "grad_norm": 0.8626881837844849, "learning_rate": 
4.8096988312782174e-05, "loss": 0.9713, "step": 1045 }, { "epoch": 0.6831489915419648, "grad_norm": 0.8781446814537048, "learning_rate": 4.8077186280386475e-05, "loss": 0.964, "step": 1050 }, { "epoch": 0.686402081977879, "grad_norm": 0.8338606953620911, "learning_rate": 4.8057285873737765e-05, "loss": 0.9916, "step": 1055 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8619135022163391, "learning_rate": 4.803728717766821e-05, "loss": 0.9562, "step": 1060 }, { "epoch": 0.6929082628497072, "grad_norm": 0.8325028419494629, "learning_rate": 4.8017190277428956e-05, "loss": 0.9494, "step": 1065 }, { "epoch": 0.6961613532856213, "grad_norm": 0.772607684135437, "learning_rate": 4.799699525868979e-05, "loss": 0.9783, "step": 1070 }, { "epoch": 0.6994144437215355, "grad_norm": 0.7735521793365479, "learning_rate": 4.797670220753876e-05, "loss": 0.966, "step": 1075 }, { "epoch": 0.7026675341574495, "grad_norm": 0.8032121062278748, "learning_rate": 4.79563112104818e-05, "loss": 0.9569, "step": 1080 }, { "epoch": 0.7059206245933637, "grad_norm": 0.9248620271682739, "learning_rate": 4.7935822354442397e-05, "loss": 0.9676, "step": 1085 }, { "epoch": 0.7091737150292778, "grad_norm": 0.6317049264907837, "learning_rate": 4.7915235726761154e-05, "loss": 0.9443, "step": 1090 }, { "epoch": 0.7124268054651919, "grad_norm": 0.9738350510597229, "learning_rate": 4.789455141519551e-05, "loss": 0.9693, "step": 1095 }, { "epoch": 0.715679895901106, "grad_norm": 0.7499257922172546, "learning_rate": 4.7873769507919266e-05, "loss": 0.958, "step": 1100 }, { "epoch": 0.7189329863370202, "grad_norm": 0.8857749700546265, "learning_rate": 4.785289009352227e-05, "loss": 0.9596, "step": 1105 }, { "epoch": 0.7221860767729343, "grad_norm": 0.7081575393676758, "learning_rate": 4.7831913261010066e-05, "loss": 0.9454, "step": 1110 }, { "epoch": 0.7254391672088484, "grad_norm": 0.8387717604637146, "learning_rate": 4.781083909980342e-05, "loss": 0.9472, "step": 1115 }, { "epoch": 0.7286922576447625, "grad_norm": 0.9755154848098755, "learning_rate": 4.778966769973802e-05, "loss": 0.9668, "step": 1120 }, { "epoch": 0.7319453480806767, "grad_norm": 0.7101641893386841, "learning_rate": 4.7768399151064076e-05, "loss": 0.9457, "step": 1125 }, { "epoch": 0.7351984385165907, "grad_norm": 0.9372628331184387, "learning_rate": 4.774703354444591e-05, "loss": 0.9709, "step": 1130 }, { "epoch": 0.7384515289525049, "grad_norm": 0.9276643991470337, "learning_rate": 4.7725570970961586e-05, "loss": 0.9586, "step": 1135 }, { "epoch": 0.741704619388419, "grad_norm": 0.7329192757606506, "learning_rate": 4.770401152210253e-05, "loss": 0.9608, "step": 1140 }, { "epoch": 0.7449577098243331, "grad_norm": 0.7759012579917908, "learning_rate": 4.768235528977314e-05, "loss": 0.9469, "step": 1145 }, { "epoch": 0.7482108002602472, "grad_norm": 1.2127937078475952, "learning_rate": 4.766060236629037e-05, "loss": 0.9542, "step": 1150 }, { "epoch": 0.7514638906961614, "grad_norm": 0.7369085550308228, "learning_rate": 4.763875284438336e-05, "loss": 0.9643, "step": 1155 }, { "epoch": 0.7547169811320755, "grad_norm": 0.7963067293167114, "learning_rate": 4.7616806817193024e-05, "loss": 0.9678, "step": 1160 }, { "epoch": 0.7579700715679896, "grad_norm": 0.7773886919021606, "learning_rate": 4.759476437827168e-05, "loss": 0.9603, "step": 1165 }, { "epoch": 0.7612231620039037, "grad_norm": 0.8198060393333435, "learning_rate": 4.757262562158262e-05, "loss": 0.9759, "step": 1170 }, { "epoch": 0.7644762524398179, "grad_norm": 0.7127149701118469, "learning_rate": 
4.7550390641499715e-05, "loss": 0.9244, "step": 1175 }, { "epoch": 0.7677293428757319, "grad_norm": 1.236286997795105, "learning_rate": 4.7528059532807045e-05, "loss": 0.9313, "step": 1180 }, { "epoch": 0.7709824333116461, "grad_norm": 0.6795628070831299, "learning_rate": 4.750563239069845e-05, "loss": 0.9586, "step": 1185 }, { "epoch": 0.7742355237475602, "grad_norm": 0.8040820956230164, "learning_rate": 4.7483109310777165e-05, "loss": 0.9483, "step": 1190 }, { "epoch": 0.7774886141834743, "grad_norm": 0.8001431226730347, "learning_rate": 4.7460490389055355e-05, "loss": 0.9408, "step": 1195 }, { "epoch": 0.7807417046193884, "grad_norm": 0.969782292842865, "learning_rate": 4.743777572195378e-05, "loss": 0.9778, "step": 1200 }, { "epoch": 0.7839947950553026, "grad_norm": 1.0955541133880615, "learning_rate": 4.741496540630134e-05, "loss": 0.9385, "step": 1205 }, { "epoch": 0.7872478854912166, "grad_norm": 0.7429236173629761, "learning_rate": 4.739205953933464e-05, "loss": 0.9642, "step": 1210 }, { "epoch": 0.7905009759271308, "grad_norm": 1.0475250482559204, "learning_rate": 4.736905821869765e-05, "loss": 0.9437, "step": 1215 }, { "epoch": 0.7937540663630449, "grad_norm": 0.7216660380363464, "learning_rate": 4.734596154244121e-05, "loss": 0.9289, "step": 1220 }, { "epoch": 0.7970071567989591, "grad_norm": 0.8584089279174805, "learning_rate": 4.732276960902267e-05, "loss": 0.9246, "step": 1225 }, { "epoch": 0.8002602472348731, "grad_norm": 0.8769578337669373, "learning_rate": 4.7299482517305404e-05, "loss": 0.9298, "step": 1230 }, { "epoch": 0.8035133376707873, "grad_norm": 0.7453442811965942, "learning_rate": 4.7276100366558474e-05, "loss": 0.9491, "step": 1235 }, { "epoch": 0.8067664281067014, "grad_norm": 0.906287431716919, "learning_rate": 4.7252623256456144e-05, "loss": 0.9539, "step": 1240 }, { "epoch": 0.8100195185426154, "grad_norm": 1.0656296014785767, "learning_rate": 4.722905128707749e-05, "loss": 0.9405, "step": 1245 }, { "epoch": 0.8132726089785296, "grad_norm": 0.6985450983047485, "learning_rate": 4.720538455890591e-05, "loss": 0.9369, "step": 1250 }, { "epoch": 0.8165256994144438, "grad_norm": 0.6577023267745972, "learning_rate": 4.718162317282882e-05, "loss": 0.9346, "step": 1255 }, { "epoch": 0.8197787898503578, "grad_norm": 0.7832421064376831, "learning_rate": 4.7157767230137064e-05, "loss": 0.9256, "step": 1260 }, { "epoch": 0.8230318802862719, "grad_norm": 0.7928493618965149, "learning_rate": 4.713381683252463e-05, "loss": 0.9477, "step": 1265 }, { "epoch": 0.8262849707221861, "grad_norm": 0.8775043487548828, "learning_rate": 4.710977208208812e-05, "loss": 0.9313, "step": 1270 }, { "epoch": 0.8295380611581002, "grad_norm": 0.7714875936508179, "learning_rate": 4.708563308132636e-05, "loss": 0.9469, "step": 1275 }, { "epoch": 0.8327911515940143, "grad_norm": 0.7258083820343018, "learning_rate": 4.706139993313994e-05, "loss": 0.9294, "step": 1280 }, { "epoch": 0.8360442420299284, "grad_norm": 0.7745918035507202, "learning_rate": 4.7037072740830785e-05, "loss": 0.9365, "step": 1285 }, { "epoch": 0.8392973324658426, "grad_norm": 0.7213959097862244, "learning_rate": 4.701265160810172e-05, "loss": 0.947, "step": 1290 }, { "epoch": 0.8425504229017566, "grad_norm": 0.825713038444519, "learning_rate": 4.6988136639056025e-05, "loss": 0.9404, "step": 1295 }, { "epoch": 0.8458035133376708, "grad_norm": 0.6750174164772034, "learning_rate": 4.696352793819698e-05, "loss": 0.9364, "step": 1300 }, { "epoch": 0.8490566037735849, "grad_norm": 0.8314560055732727, "learning_rate": 
4.693882561042743e-05, "loss": 0.9521, "step": 1305 }, { "epoch": 0.852309694209499, "grad_norm": 1.0009961128234863, "learning_rate": 4.6914029761049357e-05, "loss": 0.9297, "step": 1310 }, { "epoch": 0.8555627846454131, "grad_norm": 0.7527256011962891, "learning_rate": 4.688914049576337e-05, "loss": 0.9269, "step": 1315 }, { "epoch": 0.8588158750813273, "grad_norm": 0.9169411659240723, "learning_rate": 4.686415792066833e-05, "loss": 0.9312, "step": 1320 }, { "epoch": 0.8620689655172413, "grad_norm": 0.9165216088294983, "learning_rate": 4.683908214226084e-05, "loss": 0.9524, "step": 1325 }, { "epoch": 0.8653220559531555, "grad_norm": 0.9357953071594238, "learning_rate": 4.6813913267434835e-05, "loss": 0.9245, "step": 1330 }, { "epoch": 0.8685751463890696, "grad_norm": 0.6473081707954407, "learning_rate": 4.678865140348108e-05, "loss": 0.9584, "step": 1335 }, { "epoch": 0.8718282368249838, "grad_norm": 0.884191632270813, "learning_rate": 4.676329665808677e-05, "loss": 0.9569, "step": 1340 }, { "epoch": 0.8750813272608978, "grad_norm": 1.0534435510635376, "learning_rate": 4.673784913933499e-05, "loss": 0.9178, "step": 1345 }, { "epoch": 0.878334417696812, "grad_norm": 0.8140066266059875, "learning_rate": 4.6712308955704346e-05, "loss": 0.9536, "step": 1350 }, { "epoch": 0.8815875081327261, "grad_norm": 0.71702641248703, "learning_rate": 4.668667621606845e-05, "loss": 0.947, "step": 1355 }, { "epoch": 0.8848405985686402, "grad_norm": 0.6529531478881836, "learning_rate": 4.666095102969544e-05, "loss": 0.9107, "step": 1360 }, { "epoch": 0.8880936890045543, "grad_norm": 0.9059852957725525, "learning_rate": 4.6635133506247585e-05, "loss": 0.9399, "step": 1365 }, { "epoch": 0.8913467794404685, "grad_norm": 0.8972651958465576, "learning_rate": 4.660922375578073e-05, "loss": 0.9511, "step": 1370 }, { "epoch": 0.8945998698763825, "grad_norm": 1.0316717624664307, "learning_rate": 4.658322188874388e-05, "loss": 0.9335, "step": 1375 }, { "epoch": 0.8978529603122967, "grad_norm": 0.7475149035453796, "learning_rate": 4.6557128015978726e-05, "loss": 0.9262, "step": 1380 }, { "epoch": 0.9011060507482108, "grad_norm": 1.035979986190796, "learning_rate": 4.653094224871916e-05, "loss": 0.9115, "step": 1385 }, { "epoch": 0.904359141184125, "grad_norm": 0.8210706114768982, "learning_rate": 4.650466469859079e-05, "loss": 0.9535, "step": 1390 }, { "epoch": 0.907612231620039, "grad_norm": 0.9931228160858154, "learning_rate": 4.647829547761053e-05, "loss": 0.9335, "step": 1395 }, { "epoch": 0.9108653220559532, "grad_norm": 0.7681549191474915, "learning_rate": 4.6451834698186e-05, "loss": 0.9434, "step": 1400 }, { "epoch": 0.9141184124918673, "grad_norm": 0.7461596727371216, "learning_rate": 4.642528247311518e-05, "loss": 0.9487, "step": 1405 }, { "epoch": 0.9173715029277814, "grad_norm": 1.4867486953735352, "learning_rate": 4.6398638915585835e-05, "loss": 0.9074, "step": 1410 }, { "epoch": 0.9206245933636955, "grad_norm": 0.890620231628418, "learning_rate": 4.637190413917506e-05, "loss": 0.9467, "step": 1415 }, { "epoch": 0.9238776837996097, "grad_norm": 0.6205281615257263, "learning_rate": 4.634507825784882e-05, "loss": 0.9242, "step": 1420 }, { "epoch": 0.9271307742355237, "grad_norm": 0.8957470655441284, "learning_rate": 4.631816138596145e-05, "loss": 0.94, "step": 1425 }, { "epoch": 0.9303838646714379, "grad_norm": 0.8642396330833435, "learning_rate": 4.629115363825514e-05, "loss": 0.9142, "step": 1430 }, { "epoch": 0.933636955107352, "grad_norm": 0.6721086502075195, "learning_rate": 4.626405512985948e-05, 
"loss": 0.9205, "step": 1435 }, { "epoch": 0.936890045543266, "grad_norm": 0.8930765986442566, "learning_rate": 4.623686597629098e-05, "loss": 0.9235, "step": 1440 }, { "epoch": 0.9401431359791802, "grad_norm": 0.9480865597724915, "learning_rate": 4.62095862934525e-05, "loss": 0.9309, "step": 1445 }, { "epoch": 0.9433962264150944, "grad_norm": 0.9130436778068542, "learning_rate": 4.618221619763287e-05, "loss": 0.9257, "step": 1450 }, { "epoch": 0.9466493168510085, "grad_norm": 0.63996821641922, "learning_rate": 4.6154755805506294e-05, "loss": 0.9364, "step": 1455 }, { "epoch": 0.9499024072869225, "grad_norm": 0.786276638507843, "learning_rate": 4.612720523413193e-05, "loss": 0.9389, "step": 1460 }, { "epoch": 0.9531554977228367, "grad_norm": 0.8122700452804565, "learning_rate": 4.609956460095332e-05, "loss": 0.9296, "step": 1465 }, { "epoch": 0.9564085881587508, "grad_norm": 1.0054434537887573, "learning_rate": 4.607183402379794e-05, "loss": 0.9118, "step": 1470 }, { "epoch": 0.9596616785946649, "grad_norm": 0.9399415850639343, "learning_rate": 4.6044013620876706e-05, "loss": 0.9311, "step": 1475 }, { "epoch": 0.962914769030579, "grad_norm": 0.6693314909934998, "learning_rate": 4.60161035107834e-05, "loss": 0.9322, "step": 1480 }, { "epoch": 0.9661678594664932, "grad_norm": 0.7549735903739929, "learning_rate": 4.598810381249425e-05, "loss": 0.9246, "step": 1485 }, { "epoch": 0.9694209499024072, "grad_norm": 0.8314823508262634, "learning_rate": 4.596001464536737e-05, "loss": 0.9335, "step": 1490 }, { "epoch": 0.9726740403383214, "grad_norm": 0.7478086948394775, "learning_rate": 4.593183612914225e-05, "loss": 0.9341, "step": 1495 }, { "epoch": 0.9759271307742355, "grad_norm": 0.9777085185050964, "learning_rate": 4.5903568383939284e-05, "loss": 0.9323, "step": 1500 }, { "epoch": 0.9791802212101497, "grad_norm": 0.893374502658844, "learning_rate": 4.587521153025922e-05, "loss": 0.939, "step": 1505 }, { "epoch": 0.9824333116460637, "grad_norm": 0.6938668489456177, "learning_rate": 4.584676568898267e-05, "loss": 0.9437, "step": 1510 }, { "epoch": 0.9856864020819779, "grad_norm": 0.6903214454650879, "learning_rate": 4.5818230981369584e-05, "loss": 0.9332, "step": 1515 }, { "epoch": 0.988939492517892, "grad_norm": 0.817034125328064, "learning_rate": 4.5789607529058715e-05, "loss": 0.9375, "step": 1520 }, { "epoch": 0.9921925829538061, "grad_norm": 0.8222942352294922, "learning_rate": 4.5760895454067154e-05, "loss": 0.9316, "step": 1525 }, { "epoch": 0.9954456733897202, "grad_norm": 0.7549692392349243, "learning_rate": 4.5732094878789756e-05, "loss": 0.9221, "step": 1530 }, { "epoch": 0.9986987638256344, "grad_norm": 0.8544319868087769, "learning_rate": 4.570320592599863e-05, "loss": 0.9287, "step": 1535 }, { "epoch": 1.0, "eval_f1": 0.7910057808991992, "eval_loss": 0.462646484375, "eval_precision": 0.7940469727119374, "eval_recall": 0.7896973937143991, "eval_runtime": 247.1562, "eval_samples_per_second": 1591.847, "eval_steps_per_second": 1.558, "step": 1537 }, { "epoch": 1.0019518542615484, "grad_norm": 0.7457589507102966, "learning_rate": 4.567422871884265e-05, "loss": 0.9279, "step": 1540 }, { "epoch": 1.0052049446974627, "grad_norm": 0.8609625697135925, "learning_rate": 4.564516338084688e-05, "loss": 0.8765, "step": 1545 }, { "epoch": 1.0084580351333767, "grad_norm": 0.8822636008262634, "learning_rate": 4.561601003591208e-05, "loss": 0.8427, "step": 1550 }, { "epoch": 1.0117111255692908, "grad_norm": 0.7266067266464233, "learning_rate": 4.558676880831417e-05, "loss": 0.8828, "step": 1555 }, { 
"epoch": 1.014964216005205, "grad_norm": 0.6970102787017822, "learning_rate": 4.555743982270369e-05, "loss": 0.8842, "step": 1560 }, { "epoch": 1.018217306441119, "grad_norm": 0.6802201867103577, "learning_rate": 4.5528023204105306e-05, "loss": 0.872, "step": 1565 }, { "epoch": 1.0214703968770331, "grad_norm": 0.7830452919006348, "learning_rate": 4.549851907791722e-05, "loss": 0.8624, "step": 1570 }, { "epoch": 1.0247234873129474, "grad_norm": 0.6845102906227112, "learning_rate": 4.5468927569910663e-05, "loss": 0.8744, "step": 1575 }, { "epoch": 1.0279765777488614, "grad_norm": 0.8832181692123413, "learning_rate": 4.5439248806229386e-05, "loss": 0.8722, "step": 1580 }, { "epoch": 1.0312296681847755, "grad_norm": 0.7359802722930908, "learning_rate": 4.5409482913389065e-05, "loss": 0.8567, "step": 1585 }, { "epoch": 1.0344827586206897, "grad_norm": 0.7686721086502075, "learning_rate": 4.5379630018276834e-05, "loss": 0.8509, "step": 1590 }, { "epoch": 1.0377358490566038, "grad_norm": 0.77400141954422, "learning_rate": 4.534969024815066e-05, "loss": 0.8676, "step": 1595 }, { "epoch": 1.0409889394925178, "grad_norm": 0.8024744987487793, "learning_rate": 4.531966373063886e-05, "loss": 0.8772, "step": 1600 }, { "epoch": 1.044242029928432, "grad_norm": 0.7155640721321106, "learning_rate": 4.528955059373956e-05, "loss": 0.8608, "step": 1605 }, { "epoch": 1.047495120364346, "grad_norm": 0.8553564548492432, "learning_rate": 4.52593509658201e-05, "loss": 0.8614, "step": 1610 }, { "epoch": 1.0507482108002602, "grad_norm": 0.6926222443580627, "learning_rate": 4.522906497561655e-05, "loss": 0.8582, "step": 1615 }, { "epoch": 1.0540013012361744, "grad_norm": 0.8300968408584595, "learning_rate": 4.519869275223309e-05, "loss": 0.8838, "step": 1620 }, { "epoch": 1.0572543916720885, "grad_norm": 0.8907480835914612, "learning_rate": 4.516823442514153e-05, "loss": 0.8656, "step": 1625 }, { "epoch": 1.0605074821080025, "grad_norm": 1.035863995552063, "learning_rate": 4.513769012418071e-05, "loss": 0.8814, "step": 1630 }, { "epoch": 1.0637605725439168, "grad_norm": 0.9308491945266724, "learning_rate": 4.510705997955596e-05, "loss": 0.8831, "step": 1635 }, { "epoch": 1.0670136629798308, "grad_norm": 1.0290710926055908, "learning_rate": 4.507634412183856e-05, "loss": 0.8566, "step": 1640 }, { "epoch": 1.070266753415745, "grad_norm": 0.9163823127746582, "learning_rate": 4.504554268196516e-05, "loss": 0.8646, "step": 1645 }, { "epoch": 1.073519843851659, "grad_norm": 0.7528260946273804, "learning_rate": 4.5014655791237245e-05, "loss": 0.8681, "step": 1650 }, { "epoch": 1.0767729342875731, "grad_norm": 0.9018992781639099, "learning_rate": 4.498368358132055e-05, "loss": 0.8667, "step": 1655 }, { "epoch": 1.0800260247234874, "grad_norm": 1.000990390777588, "learning_rate": 4.4952626184244504e-05, "loss": 0.8627, "step": 1660 }, { "epoch": 1.0832791151594015, "grad_norm": 1.1555023193359375, "learning_rate": 4.492148373240171e-05, "loss": 0.8488, "step": 1665 }, { "epoch": 1.0865322055953155, "grad_norm": 0.9759275913238525, "learning_rate": 4.4890256358547304e-05, "loss": 0.8775, "step": 1670 }, { "epoch": 1.0897852960312298, "grad_norm": 0.7439780235290527, "learning_rate": 4.485894419579846e-05, "loss": 0.8758, "step": 1675 }, { "epoch": 1.0930383864671438, "grad_norm": 0.8394938111305237, "learning_rate": 4.482754737763378e-05, "loss": 0.8797, "step": 1680 }, { "epoch": 1.0962914769030578, "grad_norm": 0.8299522399902344, "learning_rate": 4.4796066037892734e-05, "loss": 0.864, "step": 1685 }, { "epoch": 
1.099544567338972, "grad_norm": 0.8585712909698486, "learning_rate": 4.4764500310775116e-05, "loss": 0.8586, "step": 1690 }, { "epoch": 1.1027976577748861, "grad_norm": 1.0859423875808716, "learning_rate": 4.473285033084043e-05, "loss": 0.8773, "step": 1695 }, { "epoch": 1.1060507482108002, "grad_norm": 0.7827959060668945, "learning_rate": 4.4701116233007314e-05, "loss": 0.8423, "step": 1700 }, { "epoch": 1.1093038386467144, "grad_norm": 0.7498010993003845, "learning_rate": 4.466929815255304e-05, "loss": 0.884, "step": 1705 }, { "epoch": 1.1125569290826285, "grad_norm": 0.7543908357620239, "learning_rate": 4.4637396225112846e-05, "loss": 0.8606, "step": 1710 }, { "epoch": 1.1158100195185425, "grad_norm": 1.3613898754119873, "learning_rate": 4.460541058667942e-05, "loss": 0.8909, "step": 1715 }, { "epoch": 1.1190631099544568, "grad_norm": 0.8409460783004761, "learning_rate": 4.457334137360226e-05, "loss": 0.8892, "step": 1720 }, { "epoch": 1.1223162003903708, "grad_norm": 0.9072450995445251, "learning_rate": 4.4541188722587165e-05, "loss": 0.8714, "step": 1725 }, { "epoch": 1.1255692908262849, "grad_norm": 1.02306067943573, "learning_rate": 4.450895277069561e-05, "loss": 0.8813, "step": 1730 }, { "epoch": 1.1288223812621991, "grad_norm": 1.0199263095855713, "learning_rate": 4.4476633655344144e-05, "loss": 0.8693, "step": 1735 }, { "epoch": 1.1320754716981132, "grad_norm": 0.7447525262832642, "learning_rate": 4.444423151430386e-05, "loss": 0.8894, "step": 1740 }, { "epoch": 1.1353285621340272, "grad_norm": 1.062179446220398, "learning_rate": 4.4411746485699744e-05, "loss": 0.8425, "step": 1745 }, { "epoch": 1.1385816525699415, "grad_norm": 0.7509242296218872, "learning_rate": 4.437917870801015e-05, "loss": 0.8666, "step": 1750 }, { "epoch": 1.1418347430058555, "grad_norm": 1.1955047845840454, "learning_rate": 4.434652832006616e-05, "loss": 0.8798, "step": 1755 }, { "epoch": 1.1450878334417696, "grad_norm": 1.1089417934417725, "learning_rate": 4.431379546105101e-05, "loss": 0.8808, "step": 1760 }, { "epoch": 1.1483409238776838, "grad_norm": 0.7296579480171204, "learning_rate": 4.4280980270499494e-05, "loss": 0.854, "step": 1765 }, { "epoch": 1.1515940143135979, "grad_norm": 1.0274302959442139, "learning_rate": 4.424808288829739e-05, "loss": 0.8775, "step": 1770 }, { "epoch": 1.1548471047495121, "grad_norm": 0.8249827027320862, "learning_rate": 4.421510345468082e-05, "loss": 0.8825, "step": 1775 }, { "epoch": 1.1581001951854262, "grad_norm": 0.814564049243927, "learning_rate": 4.4182042110235686e-05, "loss": 0.8354, "step": 1780 }, { "epoch": 1.1613532856213402, "grad_norm": 0.8738640546798706, "learning_rate": 4.414889899589709e-05, "loss": 0.8667, "step": 1785 }, { "epoch": 1.1646063760572545, "grad_norm": 0.873928427696228, "learning_rate": 4.411567425294867e-05, "loss": 0.8589, "step": 1790 }, { "epoch": 1.1678594664931685, "grad_norm": 1.0771477222442627, "learning_rate": 4.408236802302203e-05, "loss": 0.8677, "step": 1795 }, { "epoch": 1.1711125569290826, "grad_norm": 1.026843786239624, "learning_rate": 4.404898044809618e-05, "loss": 0.8613, "step": 1800 }, { "epoch": 1.1743656473649968, "grad_norm": 1.2807365655899048, "learning_rate": 4.401551167049686e-05, "loss": 0.8612, "step": 1805 }, { "epoch": 1.1776187378009109, "grad_norm": 1.086053729057312, "learning_rate": 4.398196183289595e-05, "loss": 0.8679, "step": 1810 }, { "epoch": 1.180871828236825, "grad_norm": 1.2245922088623047, "learning_rate": 4.394833107831091e-05, "loss": 0.8666, "step": 1815 }, { "epoch": 1.1841249186727392, 
"grad_norm": 0.788972020149231, "learning_rate": 4.3914619550104125e-05, "loss": 0.8549, "step": 1820 }, { "epoch": 1.1873780091086532, "grad_norm": 0.7560495734214783, "learning_rate": 4.388082739198229e-05, "loss": 0.8689, "step": 1825 }, { "epoch": 1.1906310995445673, "grad_norm": 0.9753955006599426, "learning_rate": 4.3846954747995825e-05, "loss": 0.8676, "step": 1830 }, { "epoch": 1.1938841899804815, "grad_norm": 0.7910217642784119, "learning_rate": 4.381300176253825e-05, "loss": 0.872, "step": 1835 }, { "epoch": 1.1971372804163956, "grad_norm": 0.9588011503219604, "learning_rate": 4.377896858034557e-05, "loss": 0.8903, "step": 1840 }, { "epoch": 1.2003903708523098, "grad_norm": 0.9886934757232666, "learning_rate": 4.374485534649562e-05, "loss": 0.879, "step": 1845 }, { "epoch": 1.2036434612882239, "grad_norm": 0.896848738193512, "learning_rate": 4.371066220640754e-05, "loss": 0.854, "step": 1850 }, { "epoch": 1.206896551724138, "grad_norm": 1.7082849740982056, "learning_rate": 4.367638930584105e-05, "loss": 0.8877, "step": 1855 }, { "epoch": 1.2101496421600522, "grad_norm": 1.307518482208252, "learning_rate": 4.36420367908959e-05, "loss": 0.8637, "step": 1860 }, { "epoch": 1.2134027325959662, "grad_norm": 0.9649641513824463, "learning_rate": 4.3607604808011213e-05, "loss": 0.8644, "step": 1865 }, { "epoch": 1.2166558230318802, "grad_norm": 0.958816409111023, "learning_rate": 4.357309350396488e-05, "loss": 0.8771, "step": 1870 }, { "epoch": 1.2199089134677945, "grad_norm": 0.7665415406227112, "learning_rate": 4.353850302587291e-05, "loss": 0.8559, "step": 1875 }, { "epoch": 1.2231620039037086, "grad_norm": 0.8145641088485718, "learning_rate": 4.3503833521188844e-05, "loss": 0.8776, "step": 1880 }, { "epoch": 1.2264150943396226, "grad_norm": 1.0663881301879883, "learning_rate": 4.346908513770306e-05, "loss": 0.8643, "step": 1885 }, { "epoch": 1.2296681847755369, "grad_norm": 0.7401409149169922, "learning_rate": 4.343425802354222e-05, "loss": 0.8646, "step": 1890 }, { "epoch": 1.232921275211451, "grad_norm": 0.7239570021629333, "learning_rate": 4.3399352327168595e-05, "loss": 0.8885, "step": 1895 }, { "epoch": 1.236174365647365, "grad_norm": 1.0525251626968384, "learning_rate": 4.3364368197379426e-05, "loss": 0.8817, "step": 1900 }, { "epoch": 1.2394274560832792, "grad_norm": 0.8934289813041687, "learning_rate": 4.33293057833063e-05, "loss": 0.8699, "step": 1905 }, { "epoch": 1.2426805465191932, "grad_norm": 0.8614199757575989, "learning_rate": 4.329416523441454e-05, "loss": 0.866, "step": 1910 }, { "epoch": 1.2459336369551073, "grad_norm": 0.884955644607544, "learning_rate": 4.3258946700502535e-05, "loss": 0.8641, "step": 1915 }, { "epoch": 1.2491867273910215, "grad_norm": 0.8655734062194824, "learning_rate": 4.322365033170109e-05, "loss": 0.8393, "step": 1920 }, { "epoch": 1.2524398178269356, "grad_norm": 1.0718590021133423, "learning_rate": 4.318827627847284e-05, "loss": 0.8788, "step": 1925 }, { "epoch": 1.2556929082628496, "grad_norm": 0.9467219710350037, "learning_rate": 4.315282469161156e-05, "loss": 0.8758, "step": 1930 }, { "epoch": 1.258945998698764, "grad_norm": 1.0598018169403076, "learning_rate": 4.311729572224153e-05, "loss": 0.8872, "step": 1935 }, { "epoch": 1.262199089134678, "grad_norm": 0.7586490511894226, "learning_rate": 4.308168952181691e-05, "loss": 0.8749, "step": 1940 }, { "epoch": 1.265452179570592, "grad_norm": 0.8791137933731079, "learning_rate": 4.304600624212109e-05, "loss": 0.8833, "step": 1945 }, { "epoch": 1.2687052700065062, "grad_norm": 
1.0280482769012451, "learning_rate": 4.3017404223497385e-05, "loss": 0.893, "step": 1950 }, { "epoch": 1.2719583604424203, "grad_norm": 0.8759311437606812, "learning_rate": 4.298158258465592e-05, "loss": 0.8833, "step": 1955 }, { "epoch": 1.2752114508783343, "grad_norm": 0.8623502850532532, "learning_rate": 4.2945684293282685e-05, "loss": 0.8533, "step": 1960 }, { "epoch": 1.2784645413142486, "grad_norm": 0.9812124967575073, "learning_rate": 4.290970950240617e-05, "loss": 0.8832, "step": 1965 }, { "epoch": 1.2817176317501626, "grad_norm": 0.8114174008369446, "learning_rate": 4.2873658365381026e-05, "loss": 0.8657, "step": 1970 }, { "epoch": 1.2849707221860767, "grad_norm": 0.7681922912597656, "learning_rate": 4.2837531035887305e-05, "loss": 0.8563, "step": 1975 }, { "epoch": 1.288223812621991, "grad_norm": 0.9911778569221497, "learning_rate": 4.280132766792989e-05, "loss": 0.8401, "step": 1980 }, { "epoch": 1.291476903057905, "grad_norm": 0.7618448138237, "learning_rate": 4.276504841583778e-05, "loss": 0.8727, "step": 1985 }, { "epoch": 1.294729993493819, "grad_norm": 0.7748595476150513, "learning_rate": 4.2728693434263476e-05, "loss": 0.8726, "step": 1990 }, { "epoch": 1.2979830839297333, "grad_norm": 0.995187520980835, "learning_rate": 4.269226287818228e-05, "loss": 0.8606, "step": 1995 }, { "epoch": 1.3012361743656473, "grad_norm": 0.9184800386428833, "learning_rate": 4.2655756902891665e-05, "loss": 0.8881, "step": 2000 }, { "epoch": 1.3044892648015614, "grad_norm": 0.6605210304260254, "learning_rate": 4.261917566401061e-05, "loss": 0.8452, "step": 2005 }, { "epoch": 1.3077423552374756, "grad_norm": 0.9930521249771118, "learning_rate": 4.258251931747893e-05, "loss": 0.8661, "step": 2010 }, { "epoch": 1.3109954456733897, "grad_norm": 0.6971027255058289, "learning_rate": 4.25457880195566e-05, "loss": 0.8607, "step": 2015 }, { "epoch": 1.3142485361093037, "grad_norm": 0.8052083253860474, "learning_rate": 4.250898192682311e-05, "loss": 0.8407, "step": 2020 }, { "epoch": 1.317501626545218, "grad_norm": 0.7318537831306458, "learning_rate": 4.247210119617679e-05, "loss": 0.8703, "step": 2025 }, { "epoch": 1.320754716981132, "grad_norm": 1.0614877939224243, "learning_rate": 4.243514598483412e-05, "loss": 0.854, "step": 2030 }, { "epoch": 1.3240078074170463, "grad_norm": 1.2773613929748535, "learning_rate": 4.23981164503291e-05, "loss": 0.8728, "step": 2035 }, { "epoch": 1.3272608978529603, "grad_norm": 1.41408371925354, "learning_rate": 4.236101275051256e-05, "loss": 0.859, "step": 2040 }, { "epoch": 1.3305139882888743, "grad_norm": 0.7571334838867188, "learning_rate": 4.232383504355147e-05, "loss": 0.8588, "step": 2045 }, { "epoch": 1.3337670787247886, "grad_norm": 0.7090466618537903, "learning_rate": 4.228658348792828e-05, "loss": 0.8672, "step": 2050 }, { "epoch": 1.3370201691607027, "grad_norm": 0.826134204864502, "learning_rate": 4.224925824244025e-05, "loss": 0.8552, "step": 2055 }, { "epoch": 1.340273259596617, "grad_norm": 0.8876454830169678, "learning_rate": 4.2211859466198785e-05, "loss": 0.8733, "step": 2060 }, { "epoch": 1.343526350032531, "grad_norm": 0.7836646437644958, "learning_rate": 4.217438731862871e-05, "loss": 0.8643, "step": 2065 }, { "epoch": 1.346779440468445, "grad_norm": 0.795116662979126, "learning_rate": 4.213684195946762e-05, "loss": 0.8759, "step": 2070 }, { "epoch": 1.3500325309043593, "grad_norm": 0.9851782321929932, "learning_rate": 4.2099223548765224e-05, "loss": 0.872, "step": 2075 }, { "epoch": 1.3532856213402733, "grad_norm": 0.9454843997955322, 
"learning_rate": 4.206153224688264e-05, "loss": 0.8709, "step": 2080 }, { "epoch": 1.3565387117761873, "grad_norm": 0.7972314953804016, "learning_rate": 4.202376821449167e-05, "loss": 0.881, "step": 2085 }, { "epoch": 1.3597918022121016, "grad_norm": 0.7645969390869141, "learning_rate": 4.1985931612574186e-05, "loss": 0.8729, "step": 2090 }, { "epoch": 1.3630448926480156, "grad_norm": 1.1820120811462402, "learning_rate": 4.194802260242141e-05, "loss": 0.8556, "step": 2095 }, { "epoch": 1.3662979830839297, "grad_norm": 0.9157008528709412, "learning_rate": 4.191004134563322e-05, "loss": 0.8721, "step": 2100 }, { "epoch": 1.369551073519844, "grad_norm": 0.8286409974098206, "learning_rate": 4.187198800411748e-05, "loss": 0.8756, "step": 2105 }, { "epoch": 1.372804163955758, "grad_norm": 0.8742622137069702, "learning_rate": 4.183386274008932e-05, "loss": 0.8592, "step": 2110 }, { "epoch": 1.376057254391672, "grad_norm": 0.8968034386634827, "learning_rate": 4.1795665716070474e-05, "loss": 0.8641, "step": 2115 }, { "epoch": 1.3793103448275863, "grad_norm": 0.8291420340538025, "learning_rate": 4.1757397094888594e-05, "loss": 0.8529, "step": 2120 }, { "epoch": 1.3825634352635003, "grad_norm": 0.919009268283844, "learning_rate": 4.1719057039676515e-05, "loss": 0.8636, "step": 2125 }, { "epoch": 1.3858165256994144, "grad_norm": 1.0421229600906372, "learning_rate": 4.168064571387159e-05, "loss": 0.8681, "step": 2130 }, { "epoch": 1.3890696161353286, "grad_norm": 0.7388564944267273, "learning_rate": 4.1642163281214984e-05, "loss": 0.8513, "step": 2135 }, { "epoch": 1.3923227065712427, "grad_norm": 0.6921651363372803, "learning_rate": 4.160360990575099e-05, "loss": 0.8723, "step": 2140 }, { "epoch": 1.3955757970071567, "grad_norm": 0.7668315768241882, "learning_rate": 4.156498575182633e-05, "loss": 0.8621, "step": 2145 }, { "epoch": 1.398828887443071, "grad_norm": 0.7497116327285767, "learning_rate": 4.152629098408939e-05, "loss": 0.8604, "step": 2150 }, { "epoch": 1.402081977878985, "grad_norm": 0.7256556749343872, "learning_rate": 4.1487525767489635e-05, "loss": 0.8638, "step": 2155 }, { "epoch": 1.405335068314899, "grad_norm": 1.1155390739440918, "learning_rate": 4.144869026727681e-05, "loss": 0.8547, "step": 2160 }, { "epoch": 1.4085881587508133, "grad_norm": 0.9044195413589478, "learning_rate": 4.140978464900025e-05, "loss": 0.8792, "step": 2165 }, { "epoch": 1.4118412491867274, "grad_norm": 0.7881206274032593, "learning_rate": 4.137080907850823e-05, "loss": 0.874, "step": 2170 }, { "epoch": 1.4150943396226414, "grad_norm": 0.851743757724762, "learning_rate": 4.13317637219472e-05, "loss": 0.8551, "step": 2175 }, { "epoch": 1.4183474300585557, "grad_norm": 0.8619376420974731, "learning_rate": 4.129264874576111e-05, "loss": 0.8757, "step": 2180 }, { "epoch": 1.4216005204944697, "grad_norm": 1.2099318504333496, "learning_rate": 4.125346431669065e-05, "loss": 0.8567, "step": 2185 }, { "epoch": 1.4248536109303838, "grad_norm": 0.8172369599342346, "learning_rate": 4.121421060177263e-05, "loss": 0.8625, "step": 2190 }, { "epoch": 1.428106701366298, "grad_norm": 1.1485086679458618, "learning_rate": 4.1174887768339164e-05, "loss": 0.8681, "step": 2195 }, { "epoch": 1.431359791802212, "grad_norm": 0.8006755709648132, "learning_rate": 4.113549598401704e-05, "loss": 0.8657, "step": 2200 }, { "epoch": 1.434612882238126, "grad_norm": 0.7858587503433228, "learning_rate": 4.1096035416726966e-05, "loss": 0.8681, "step": 2205 }, { "epoch": 1.4378659726740404, "grad_norm": 1.0397981405258179, "learning_rate": 
4.105650623468284e-05, "loss": 0.871, "step": 2210 }, { "epoch": 1.4411190631099544, "grad_norm": 1.409725546836853, "learning_rate": 4.101690860639108e-05, "loss": 0.8525, "step": 2215 }, { "epoch": 1.4443721535458685, "grad_norm": 1.0374292135238647, "learning_rate": 4.097724270064988e-05, "loss": 0.8561, "step": 2220 }, { "epoch": 1.4476252439817827, "grad_norm": 1.10367751121521, "learning_rate": 4.0937508686548455e-05, "loss": 0.8608, "step": 2225 }, { "epoch": 1.4508783344176968, "grad_norm": 0.9354111552238464, "learning_rate": 4.089770673346639e-05, "loss": 0.8556, "step": 2230 }, { "epoch": 1.4541314248536108, "grad_norm": 0.7732600569725037, "learning_rate": 4.085783701107288e-05, "loss": 0.8664, "step": 2235 }, { "epoch": 1.457384515289525, "grad_norm": 0.7464646697044373, "learning_rate": 4.0817899689325975e-05, "loss": 0.8544, "step": 2240 }, { "epoch": 1.460637605725439, "grad_norm": 0.7917648553848267, "learning_rate": 4.077789493847194e-05, "loss": 0.849, "step": 2245 }, { "epoch": 1.4638906961613534, "grad_norm": 0.8593052625656128, "learning_rate": 4.073782292904445e-05, "loss": 0.905, "step": 2250 }, { "epoch": 1.4671437865972674, "grad_norm": 0.7432965636253357, "learning_rate": 4.0697683831863877e-05, "loss": 0.8606, "step": 2255 }, { "epoch": 1.4703968770331814, "grad_norm": 1.0467164516448975, "learning_rate": 4.065747781803662e-05, "loss": 0.8733, "step": 2260 }, { "epoch": 1.4736499674690957, "grad_norm": 0.8533846735954285, "learning_rate": 4.06172050589543e-05, "loss": 0.8411, "step": 2265 }, { "epoch": 1.4769030579050098, "grad_norm": 0.7896531224250793, "learning_rate": 4.057686572629307e-05, "loss": 0.8732, "step": 2270 }, { "epoch": 1.480156148340924, "grad_norm": 0.7728810906410217, "learning_rate": 4.053645999201287e-05, "loss": 0.8822, "step": 2275 }, { "epoch": 1.483409238776838, "grad_norm": 0.791527271270752, "learning_rate": 4.0495988028356725e-05, "loss": 0.8692, "step": 2280 }, { "epoch": 1.486662329212752, "grad_norm": 1.7369199991226196, "learning_rate": 4.0455450007849945e-05, "loss": 0.878, "step": 2285 }, { "epoch": 1.4899154196486664, "grad_norm": 0.8174150586128235, "learning_rate": 4.041484610329945e-05, "loss": 0.8843, "step": 2290 }, { "epoch": 1.4931685100845804, "grad_norm": 0.8122901916503906, "learning_rate": 4.037417648779304e-05, "loss": 0.8511, "step": 2295 }, { "epoch": 1.4964216005204944, "grad_norm": 0.856270968914032, "learning_rate": 4.033344133469857e-05, "loss": 0.8576, "step": 2300 }, { "epoch": 1.4996746909564087, "grad_norm": 0.7714033126831055, "learning_rate": 4.029264081766333e-05, "loss": 0.8563, "step": 2305 }, { "epoch": 1.5029277813923227, "grad_norm": 0.7557379007339478, "learning_rate": 4.02517751106132e-05, "loss": 0.8632, "step": 2310 }, { "epoch": 1.5061808718282368, "grad_norm": 0.9310267567634583, "learning_rate": 4.021084438775199e-05, "loss": 0.8756, "step": 2315 }, { "epoch": 1.509433962264151, "grad_norm": 1.1613460779190063, "learning_rate": 4.016984882356063e-05, "loss": 0.8581, "step": 2320 }, { "epoch": 1.512687052700065, "grad_norm": 0.8737664222717285, "learning_rate": 4.0128788592796484e-05, "loss": 0.8463, "step": 2325 }, { "epoch": 1.5159401431359791, "grad_norm": 1.137432336807251, "learning_rate": 4.008766387049257e-05, "loss": 0.8668, "step": 2330 }, { "epoch": 1.5191932335718934, "grad_norm": 1.205127239227295, "learning_rate": 4.004647483195682e-05, "loss": 0.854, "step": 2335 }, { "epoch": 1.5224463240078074, "grad_norm": 1.2103711366653442, "learning_rate": 4.0005221652771326e-05, "loss": 
0.8599, "step": 2340 }, { "epoch": 1.5256994144437215, "grad_norm": 0.8847302794456482, "learning_rate": 3.996390450879163e-05, "loss": 0.8902, "step": 2345 }, { "epoch": 1.5289525048796357, "grad_norm": 0.9139837622642517, "learning_rate": 3.992252357614591e-05, "loss": 0.8537, "step": 2350 }, { "epoch": 1.5322055953155498, "grad_norm": 0.6250112056732178, "learning_rate": 3.9881079031234295e-05, "loss": 0.8625, "step": 2355 }, { "epoch": 1.5354586857514638, "grad_norm": 1.3147530555725098, "learning_rate": 3.983957105072806e-05, "loss": 0.8594, "step": 2360 }, { "epoch": 1.538711776187378, "grad_norm": 0.8052361607551575, "learning_rate": 3.9797999811568916e-05, "loss": 0.8613, "step": 2365 }, { "epoch": 1.5419648666232921, "grad_norm": 0.963198721408844, "learning_rate": 3.9756365490968216e-05, "loss": 0.8846, "step": 2370 }, { "epoch": 1.5452179570592062, "grad_norm": 0.7471247911453247, "learning_rate": 3.971466826640622e-05, "loss": 0.8559, "step": 2375 }, { "epoch": 1.5484710474951204, "grad_norm": 0.9139803051948547, "learning_rate": 3.967290831563137e-05, "loss": 0.8734, "step": 2380 }, { "epoch": 1.5517241379310345, "grad_norm": 0.8502246141433716, "learning_rate": 3.963108581665945e-05, "loss": 0.8517, "step": 2385 }, { "epoch": 1.5549772283669485, "grad_norm": 1.010526418685913, "learning_rate": 3.958920094777292e-05, "loss": 0.8699, "step": 2390 }, { "epoch": 1.5582303188028628, "grad_norm": 0.9621404409408569, "learning_rate": 3.954725388752006e-05, "loss": 0.8715, "step": 2395 }, { "epoch": 1.5614834092387768, "grad_norm": 0.931891679763794, "learning_rate": 3.950524481471434e-05, "loss": 0.8639, "step": 2400 }, { "epoch": 1.5647364996746909, "grad_norm": 0.9025523066520691, "learning_rate": 3.94631739084335e-05, "loss": 0.8407, "step": 2405 }, { "epoch": 1.5679895901106051, "grad_norm": 0.7679696679115295, "learning_rate": 3.942104134801892e-05, "loss": 0.8703, "step": 2410 }, { "epoch": 1.5712426805465192, "grad_norm": 0.7461057901382446, "learning_rate": 3.937884731307477e-05, "loss": 0.8508, "step": 2415 }, { "epoch": 1.5744957709824332, "grad_norm": 0.8891671895980835, "learning_rate": 3.9336591983467296e-05, "loss": 0.8392, "step": 2420 }, { "epoch": 1.5777488614183475, "grad_norm": 0.7495052218437195, "learning_rate": 3.929427553932402e-05, "loss": 0.8617, "step": 2425 }, { "epoch": 1.5810019518542615, "grad_norm": 0.8563068509101868, "learning_rate": 3.925189816103298e-05, "loss": 0.8682, "step": 2430 }, { "epoch": 1.5842550422901756, "grad_norm": 0.8730781674385071, "learning_rate": 3.9209460029242e-05, "loss": 0.8634, "step": 2435 }, { "epoch": 1.5875081327260898, "grad_norm": 1.0046974420547485, "learning_rate": 3.916696132485783e-05, "loss": 0.8423, "step": 2440 }, { "epoch": 1.5907612231620039, "grad_norm": 0.8691470623016357, "learning_rate": 3.9124402229045495e-05, "loss": 0.8443, "step": 2445 }, { "epoch": 1.594014313597918, "grad_norm": 0.7887680530548096, "learning_rate": 3.90817829232274e-05, "loss": 0.8796, "step": 2450 }, { "epoch": 1.5972674040338322, "grad_norm": 0.8779820203781128, "learning_rate": 3.903910358908267e-05, "loss": 0.8808, "step": 2455 }, { "epoch": 1.6005204944697464, "grad_norm": 0.9116110801696777, "learning_rate": 3.8996364408546284e-05, "loss": 0.8539, "step": 2460 }, { "epoch": 1.6037735849056602, "grad_norm": 0.8549916744232178, "learning_rate": 3.895356556380833e-05, "loss": 0.8714, "step": 2465 }, { "epoch": 1.6070266753415745, "grad_norm": 0.7568048238754272, "learning_rate": 3.8910707237313274e-05, "loss": 0.8545, "step": 2470 
}, { "epoch": 1.6102797657774888, "grad_norm": 0.873261034488678, "learning_rate": 3.886778961175909e-05, "loss": 0.861, "step": 2475 }, { "epoch": 1.6135328562134026, "grad_norm": 0.8435690999031067, "learning_rate": 3.8824812870096585e-05, "loss": 0.849, "step": 2480 }, { "epoch": 1.6167859466493169, "grad_norm": 0.7543259263038635, "learning_rate": 3.878177719552854e-05, "loss": 0.8389, "step": 2485 }, { "epoch": 1.6200390370852311, "grad_norm": 0.6784664392471313, "learning_rate": 3.8738682771508975e-05, "loss": 0.862, "step": 2490 }, { "epoch": 1.623292127521145, "grad_norm": 0.735149085521698, "learning_rate": 3.869552978174232e-05, "loss": 0.86, "step": 2495 }, { "epoch": 1.6265452179570592, "grad_norm": 1.1492180824279785, "learning_rate": 3.8652318410182696e-05, "loss": 0.8682, "step": 2500 }, { "epoch": 1.6297983083929735, "grad_norm": 1.2123005390167236, "learning_rate": 3.860904884103307e-05, "loss": 0.8767, "step": 2505 }, { "epoch": 1.6330513988288873, "grad_norm": 1.0573855638504028, "learning_rate": 3.85657212587445e-05, "loss": 0.8784, "step": 2510 }, { "epoch": 1.6363044892648015, "grad_norm": 0.7657274603843689, "learning_rate": 3.8522335848015354e-05, "loss": 0.8614, "step": 2515 }, { "epoch": 1.6395575797007158, "grad_norm": 0.7586051225662231, "learning_rate": 3.847889279379052e-05, "loss": 0.8522, "step": 2520 }, { "epoch": 1.6428106701366298, "grad_norm": 0.8660874366760254, "learning_rate": 3.843539228126058e-05, "loss": 0.8491, "step": 2525 }, { "epoch": 1.6460637605725439, "grad_norm": 0.8181445002555847, "learning_rate": 3.8391834495861104e-05, "loss": 0.8774, "step": 2530 }, { "epoch": 1.6493168510084582, "grad_norm": 0.8161119222640991, "learning_rate": 3.834821962327173e-05, "loss": 0.8446, "step": 2535 }, { "epoch": 1.6525699414443722, "grad_norm": 0.7471867203712463, "learning_rate": 3.830454784941552e-05, "loss": 0.8743, "step": 2540 }, { "epoch": 1.6558230318802862, "grad_norm": 0.8243322372436523, "learning_rate": 3.8260819360458066e-05, "loss": 0.8582, "step": 2545 }, { "epoch": 1.6590761223162005, "grad_norm": 0.7759085297584534, "learning_rate": 3.8217034342806726e-05, "loss": 0.8634, "step": 2550 }, { "epoch": 1.6623292127521145, "grad_norm": 0.7820890545845032, "learning_rate": 3.817319298310984e-05, "loss": 0.849, "step": 2555 }, { "epoch": 1.6655823031880286, "grad_norm": 0.7369856238365173, "learning_rate": 3.812929546825591e-05, "loss": 0.851, "step": 2560 }, { "epoch": 1.6688353936239428, "grad_norm": 0.6760427355766296, "learning_rate": 3.8085341985372847e-05, "loss": 0.8526, "step": 2565 }, { "epoch": 1.6720884840598569, "grad_norm": 0.7964663505554199, "learning_rate": 3.804133272182711e-05, "loss": 0.8369, "step": 2570 }, { "epoch": 1.675341574495771, "grad_norm": 0.7458584308624268, "learning_rate": 3.7997267865222966e-05, "loss": 0.858, "step": 2575 }, { "epoch": 1.6785946649316852, "grad_norm": 0.7713748812675476, "learning_rate": 3.795314760340165e-05, "loss": 0.8422, "step": 2580 }, { "epoch": 1.6818477553675992, "grad_norm": 1.1121766567230225, "learning_rate": 3.79089721244406e-05, "loss": 0.8564, "step": 2585 }, { "epoch": 1.6851008458035133, "grad_norm": 0.7054054141044617, "learning_rate": 3.786474161665261e-05, "loss": 0.8503, "step": 2590 }, { "epoch": 1.6883539362394275, "grad_norm": 0.8231985569000244, "learning_rate": 3.782045626858508e-05, "loss": 0.8459, "step": 2595 }, { "epoch": 1.6916070266753416, "grad_norm": 0.8120073676109314, "learning_rate": 3.7776116269019164e-05, "loss": 0.8579, "step": 2600 }, { "epoch": 
1.6948601171112556, "grad_norm": 0.7463471293449402, "learning_rate": 3.773172180696899e-05, "loss": 0.8685, "step": 2605 }, { "epoch": 1.6981132075471699, "grad_norm": 0.9310842752456665, "learning_rate": 3.7687273071680875e-05, "loss": 0.8657, "step": 2610 }, { "epoch": 1.701366297983084, "grad_norm": 0.7997697591781616, "learning_rate": 3.7642770252632445e-05, "loss": 0.8536, "step": 2615 }, { "epoch": 1.704619388418998, "grad_norm": 0.9354361295700073, "learning_rate": 3.7598213539531924e-05, "loss": 0.8584, "step": 2620 }, { "epoch": 1.7078724788549122, "grad_norm": 0.8442994356155396, "learning_rate": 3.755360312231726e-05, "loss": 0.8509, "step": 2625 }, { "epoch": 1.7111255692908263, "grad_norm": 0.7156201601028442, "learning_rate": 3.7508939191155315e-05, "loss": 0.8587, "step": 2630 }, { "epoch": 1.7143786597267403, "grad_norm": 0.8114856481552124, "learning_rate": 3.7464221936441094e-05, "loss": 0.8575, "step": 2635 }, { "epoch": 1.7176317501626546, "grad_norm": 0.9958142042160034, "learning_rate": 3.741945154879691e-05, "loss": 0.8291, "step": 2640 }, { "epoch": 1.7208848405985686, "grad_norm": 0.8814706206321716, "learning_rate": 3.7374628219071576e-05, "loss": 0.8756, "step": 2645 }, { "epoch": 1.7241379310344827, "grad_norm": 0.9752816557884216, "learning_rate": 3.732975213833957e-05, "loss": 0.8526, "step": 2650 }, { "epoch": 1.727391021470397, "grad_norm": 1.069827914237976, "learning_rate": 3.728482349790025e-05, "loss": 0.85, "step": 2655 }, { "epoch": 1.730644111906311, "grad_norm": 0.7829200029373169, "learning_rate": 3.723984248927704e-05, "loss": 0.8775, "step": 2660 }, { "epoch": 1.733897202342225, "grad_norm": 0.9264289140701294, "learning_rate": 3.719480930421657e-05, "loss": 0.8561, "step": 2665 }, { "epoch": 1.7371502927781393, "grad_norm": 1.0062094926834106, "learning_rate": 3.7149724134687915e-05, "loss": 0.8734, "step": 2670 }, { "epoch": 1.7404033832140533, "grad_norm": 1.15998375415802, "learning_rate": 3.710458717288176e-05, "loss": 0.8817, "step": 2675 }, { "epoch": 1.7436564736499673, "grad_norm": 0.8632653951644897, "learning_rate": 3.705939861120952e-05, "loss": 0.8467, "step": 2680 }, { "epoch": 1.7469095640858816, "grad_norm": 0.9579365849494934, "learning_rate": 3.7014158642302645e-05, "loss": 0.8516, "step": 2685 }, { "epoch": 1.7501626545217959, "grad_norm": 0.7893072962760925, "learning_rate": 3.6968867459011675e-05, "loss": 0.8533, "step": 2690 }, { "epoch": 1.7534157449577097, "grad_norm": 0.8436265587806702, "learning_rate": 3.692352525440548e-05, "loss": 0.8661, "step": 2695 }, { "epoch": 1.756668835393624, "grad_norm": 0.7928500175476074, "learning_rate": 3.687813222177042e-05, "loss": 0.8617, "step": 2700 }, { "epoch": 1.7599219258295382, "grad_norm": 1.0979465246200562, "learning_rate": 3.683268855460955e-05, "loss": 0.8457, "step": 2705 }, { "epoch": 1.763175016265452, "grad_norm": 0.9280642867088318, "learning_rate": 3.678719444664174e-05, "loss": 0.8698, "step": 2710 }, { "epoch": 1.7664281067013663, "grad_norm": 0.7560756206512451, "learning_rate": 3.674165009180091e-05, "loss": 0.8476, "step": 2715 }, { "epoch": 1.7696811971372806, "grad_norm": 1.6937271356582642, "learning_rate": 3.669605568423515e-05, "loss": 0.8601, "step": 2720 }, { "epoch": 1.7729342875731944, "grad_norm": 0.7721190452575684, "learning_rate": 3.665041141830594e-05, "loss": 0.8479, "step": 2725 }, { "epoch": 1.7761873780091086, "grad_norm": 0.691184401512146, "learning_rate": 3.660471748858728e-05, "loss": 0.846, "step": 2730 }, { "epoch": 1.779440468445023, 
"grad_norm": 0.8458099961280823, "learning_rate": 3.655897408986487e-05, "loss": 0.8543, "step": 2735 }, { "epoch": 1.7826935588809367, "grad_norm": 0.7717384696006775, "learning_rate": 3.651318141713532e-05, "loss": 0.8555, "step": 2740 }, { "epoch": 1.785946649316851, "grad_norm": 0.7364319562911987, "learning_rate": 3.646733966560527e-05, "loss": 0.8693, "step": 2745 }, { "epoch": 1.7891997397527653, "grad_norm": 0.7715139389038086, "learning_rate": 3.642144903069055e-05, "loss": 0.8575, "step": 2750 }, { "epoch": 1.7924528301886793, "grad_norm": 0.7801803350448608, "learning_rate": 3.637550970801543e-05, "loss": 0.8832, "step": 2755 }, { "epoch": 1.7957059206245933, "grad_norm": 0.8797639012336731, "learning_rate": 3.632952189341166e-05, "loss": 0.8787, "step": 2760 }, { "epoch": 1.7989590110605076, "grad_norm": 0.8655262589454651, "learning_rate": 3.628348578291776e-05, "loss": 0.8527, "step": 2765 }, { "epoch": 1.8022121014964216, "grad_norm": 0.7039540410041809, "learning_rate": 3.623740157277811e-05, "loss": 0.8023, "step": 2770 }, { "epoch": 1.8054651919323357, "grad_norm": 0.8364835977554321, "learning_rate": 3.619126945944209e-05, "loss": 0.8428, "step": 2775 }, { "epoch": 1.80871828236825, "grad_norm": 0.8477578163146973, "learning_rate": 3.614508963956335e-05, "loss": 0.8364, "step": 2780 }, { "epoch": 1.811971372804164, "grad_norm": 0.790069043636322, "learning_rate": 3.609886230999886e-05, "loss": 0.8557, "step": 2785 }, { "epoch": 1.815224463240078, "grad_norm": 1.1685853004455566, "learning_rate": 3.605258766780815e-05, "loss": 0.8639, "step": 2790 }, { "epoch": 1.8184775536759923, "grad_norm": 0.6820409297943115, "learning_rate": 3.600626591025239e-05, "loss": 0.8561, "step": 2795 }, { "epoch": 1.8217306441119063, "grad_norm": 0.6816509366035461, "learning_rate": 3.595989723479363e-05, "loss": 0.8595, "step": 2800 }, { "epoch": 1.8249837345478204, "grad_norm": 0.6458393335342407, "learning_rate": 3.591348183909391e-05, "loss": 0.852, "step": 2805 }, { "epoch": 1.8282368249837346, "grad_norm": 0.8720667958259583, "learning_rate": 3.586701992101446e-05, "loss": 0.8493, "step": 2810 }, { "epoch": 1.8314899154196487, "grad_norm": 0.8076214790344238, "learning_rate": 3.582051167861477e-05, "loss": 0.8399, "step": 2815 }, { "epoch": 1.8347430058555627, "grad_norm": 1.1117894649505615, "learning_rate": 3.577395731015184e-05, "loss": 0.8462, "step": 2820 }, { "epoch": 1.837996096291477, "grad_norm": 0.8749067783355713, "learning_rate": 3.57273570140793e-05, "loss": 0.8484, "step": 2825 }, { "epoch": 1.841249186727391, "grad_norm": 0.9115192890167236, "learning_rate": 3.5680710989046565e-05, "loss": 0.8379, "step": 2830 }, { "epoch": 1.844502277163305, "grad_norm": 0.7345873117446899, "learning_rate": 3.5634019433897964e-05, "loss": 0.8521, "step": 2835 }, { "epoch": 1.8477553675992193, "grad_norm": 0.8665250539779663, "learning_rate": 3.558728254767192e-05, "loss": 0.8591, "step": 2840 }, { "epoch": 1.8510084580351334, "grad_norm": 0.6966584324836731, "learning_rate": 3.5540500529600096e-05, "loss": 0.8633, "step": 2845 }, { "epoch": 1.8542615484710474, "grad_norm": 0.9217740893363953, "learning_rate": 3.5493673579106555e-05, "loss": 0.8581, "step": 2850 }, { "epoch": 1.8575146389069617, "grad_norm": 1.1653602123260498, "learning_rate": 3.5446801895806904e-05, "loss": 0.8429, "step": 2855 }, { "epoch": 1.8607677293428757, "grad_norm": 1.0861412286758423, "learning_rate": 3.539988567950741e-05, "loss": 0.8385, "step": 2860 }, { "epoch": 1.8640208197787898, "grad_norm": 
0.9099658727645874, "learning_rate": 3.53529251302042e-05, "loss": 0.8727, "step": 2865 }, { "epoch": 1.867273910214704, "grad_norm": 0.8507881760597229, "learning_rate": 3.530592044808237e-05, "loss": 0.8601, "step": 2870 }, { "epoch": 1.870527000650618, "grad_norm": 0.7487595677375793, "learning_rate": 3.525887183351517e-05, "loss": 0.8453, "step": 2875 }, { "epoch": 1.873780091086532, "grad_norm": 0.7527421116828918, "learning_rate": 3.521177948706311e-05, "loss": 0.856, "step": 2880 }, { "epoch": 1.8770331815224464, "grad_norm": 1.198721170425415, "learning_rate": 3.5164643609473114e-05, "loss": 0.8322, "step": 2885 }, { "epoch": 1.8802862719583604, "grad_norm": 0.7312609553337097, "learning_rate": 3.51174644016777e-05, "loss": 0.8571, "step": 2890 }, { "epoch": 1.8835393623942744, "grad_norm": 0.813762903213501, "learning_rate": 3.507024206479406e-05, "loss": 0.8485, "step": 2895 }, { "epoch": 1.8867924528301887, "grad_norm": 0.6589996814727783, "learning_rate": 3.502297680012327e-05, "loss": 0.8199, "step": 2900 }, { "epoch": 1.8900455432661027, "grad_norm": 0.8973954319953918, "learning_rate": 3.4975668809149375e-05, "loss": 0.8595, "step": 2905 }, { "epoch": 1.8932986337020168, "grad_norm": 0.8979359269142151, "learning_rate": 3.492831829353857e-05, "loss": 0.8637, "step": 2910 }, { "epoch": 1.896551724137931, "grad_norm": 0.7665019035339355, "learning_rate": 3.488092545513833e-05, "loss": 0.8753, "step": 2915 }, { "epoch": 1.8998048145738453, "grad_norm": 1.2857329845428467, "learning_rate": 3.483349049597653e-05, "loss": 0.8394, "step": 2920 }, { "epoch": 1.9030579050097591, "grad_norm": 0.7651403546333313, "learning_rate": 3.4786013618260615e-05, "loss": 0.846, "step": 2925 }, { "epoch": 1.9063109954456734, "grad_norm": 0.818390429019928, "learning_rate": 3.47384950243767e-05, "loss": 0.8919, "step": 2930 }, { "epoch": 1.9095640858815877, "grad_norm": 0.8343967795372009, "learning_rate": 3.4690934916888754e-05, "loss": 0.8451, "step": 2935 }, { "epoch": 1.9128171763175015, "grad_norm": 0.8200094699859619, "learning_rate": 3.464333349853769e-05, "loss": 0.8468, "step": 2940 }, { "epoch": 1.9160702667534157, "grad_norm": 0.8766981959342957, "learning_rate": 3.459569097224054e-05, "loss": 0.8455, "step": 2945 }, { "epoch": 1.91932335718933, "grad_norm": 0.7592107057571411, "learning_rate": 3.454800754108957e-05, "loss": 0.8564, "step": 2950 }, { "epoch": 1.9225764476252438, "grad_norm": 0.7694371938705444, "learning_rate": 3.45002834083514e-05, "loss": 0.8579, "step": 2955 }, { "epoch": 1.925829538061158, "grad_norm": 0.9310813546180725, "learning_rate": 3.445251877746616e-05, "loss": 0.853, "step": 2960 }, { "epoch": 1.9290826284970723, "grad_norm": 0.7357284426689148, "learning_rate": 3.440471385204664e-05, "loss": 0.843, "step": 2965 }, { "epoch": 1.9323357189329864, "grad_norm": 1.0630100965499878, "learning_rate": 3.4356868835877376e-05, "loss": 0.8656, "step": 2970 }, { "epoch": 1.9355888093689004, "grad_norm": 1.3015029430389404, "learning_rate": 3.430898393291381e-05, "loss": 0.8681, "step": 2975 }, { "epoch": 1.9388418998048147, "grad_norm": 0.941599428653717, "learning_rate": 3.426105934728141e-05, "loss": 0.8374, "step": 2980 }, { "epoch": 1.9420949902407287, "grad_norm": 0.827949583530426, "learning_rate": 3.4213095283274807e-05, "loss": 0.8342, "step": 2985 }, { "epoch": 1.9453480806766428, "grad_norm": 0.7155514359474182, "learning_rate": 3.416509194535693e-05, "loss": 0.8604, "step": 2990 }, { "epoch": 1.948601171112557, "grad_norm": 0.6395983099937439, 
"learning_rate": 3.411704953815813e-05, "loss": 0.8545, "step": 2995 }, { "epoch": 1.951854261548471, "grad_norm": 1.0403225421905518, "learning_rate": 3.406896826647528e-05, "loss": 0.8317, "step": 3000 }, { "epoch": 1.9551073519843851, "grad_norm": 0.809688925743103, "learning_rate": 3.4020848335270944e-05, "loss": 0.8459, "step": 3005 }, { "epoch": 1.9583604424202994, "grad_norm": 0.7284942865371704, "learning_rate": 3.397268994967248e-05, "loss": 0.8609, "step": 3010 }, { "epoch": 1.9616135328562134, "grad_norm": 0.8415728807449341, "learning_rate": 3.392449331497117e-05, "loss": 0.8421, "step": 3015 }, { "epoch": 1.9648666232921275, "grad_norm": 0.7867475152015686, "learning_rate": 3.387625863662137e-05, "loss": 0.8537, "step": 3020 }, { "epoch": 1.9681197137280417, "grad_norm": 0.8730093240737915, "learning_rate": 3.3827986120239556e-05, "loss": 0.8453, "step": 3025 }, { "epoch": 1.9713728041639558, "grad_norm": 1.0075076818466187, "learning_rate": 3.377967597160355e-05, "loss": 0.8485, "step": 3030 }, { "epoch": 1.9746258945998698, "grad_norm": 0.7558779716491699, "learning_rate": 3.373132839665159e-05, "loss": 0.8283, "step": 3035 }, { "epoch": 1.977878985035784, "grad_norm": 0.8635545969009399, "learning_rate": 3.368294360148141e-05, "loss": 0.8445, "step": 3040 }, { "epoch": 1.9811320754716981, "grad_norm": 0.7366521954536438, "learning_rate": 3.363452179234946e-05, "loss": 0.8377, "step": 3045 }, { "epoch": 1.9843851659076122, "grad_norm": 0.895798921585083, "learning_rate": 3.3586063175669957e-05, "loss": 0.8517, "step": 3050 }, { "epoch": 1.9876382563435264, "grad_norm": 0.8703877329826355, "learning_rate": 3.353756795801402e-05, "loss": 0.8635, "step": 3055 }, { "epoch": 1.9908913467794405, "grad_norm": 0.8399415612220764, "learning_rate": 3.348903634610879e-05, "loss": 0.8469, "step": 3060 }, { "epoch": 1.9941444372153545, "grad_norm": 0.6633405685424805, "learning_rate": 3.344046854683656e-05, "loss": 0.8265, "step": 3065 }, { "epoch": 1.9973975276512688, "grad_norm": 0.8422790765762329, "learning_rate": 3.3391864767233874e-05, "loss": 0.8356, "step": 3070 }, { "epoch": 2.0, "eval_f1": 0.8011475160594294, "eval_loss": 0.444091796875, "eval_precision": 0.8009366991425545, "eval_recall": 0.8015108608319047, "eval_runtime": 238.6273, "eval_samples_per_second": 1648.743, "eval_steps_per_second": 1.613, "step": 3074 }, { "epoch": 2.000650618087183, "grad_norm": 0.9484532475471497, "learning_rate": 3.334322521449066e-05, "loss": 0.8414, "step": 3075 }, { "epoch": 2.003903708523097, "grad_norm": 1.058498740196228, "learning_rate": 3.3294550095949325e-05, "loss": 0.7647, "step": 3080 }, { "epoch": 2.007156798959011, "grad_norm": 1.1817635297775269, "learning_rate": 3.3245839619103916e-05, "loss": 0.7739, "step": 3085 }, { "epoch": 2.0104098893949254, "grad_norm": 0.9960103034973145, "learning_rate": 3.319709399159919e-05, "loss": 0.7627, "step": 3090 }, { "epoch": 2.013662979830839, "grad_norm": 0.7337830066680908, "learning_rate": 3.314831342122974e-05, "loss": 0.7736, "step": 3095 }, { "epoch": 2.0169160702667535, "grad_norm": 0.8539023995399475, "learning_rate": 3.309949811593914e-05, "loss": 0.7677, "step": 3100 }, { "epoch": 2.0201691607026677, "grad_norm": 0.812573254108429, "learning_rate": 3.3050648283818985e-05, "loss": 0.7688, "step": 3105 }, { "epoch": 2.0234222511385815, "grad_norm": 0.8771811127662659, "learning_rate": 3.30017641331081e-05, "loss": 0.7873, "step": 3110 }, { "epoch": 2.026675341574496, "grad_norm": 0.8817070126533508, "learning_rate": 
3.295284587219159e-05, "loss": 0.7516, "step": 3115 }, { "epoch": 2.02992843201041, "grad_norm": 0.8555654287338257, "learning_rate": 3.290389370959995e-05, "loss": 0.7245, "step": 3120 }, { "epoch": 2.033181522446324, "grad_norm": 0.9785915017127991, "learning_rate": 3.285490785400822e-05, "loss": 0.7591, "step": 3125 }, { "epoch": 2.036434612882238, "grad_norm": 1.1170217990875244, "learning_rate": 3.280588851423504e-05, "loss": 0.7545, "step": 3130 }, { "epoch": 2.0396877033181524, "grad_norm": 0.889552652835846, "learning_rate": 3.275683589924181e-05, "loss": 0.7509, "step": 3135 }, { "epoch": 2.0429407937540662, "grad_norm": 0.9748543500900269, "learning_rate": 3.270775021813177e-05, "loss": 0.7419, "step": 3140 }, { "epoch": 2.0461938841899805, "grad_norm": 0.9157707691192627, "learning_rate": 3.26586316801491e-05, "loss": 0.7476, "step": 3145 }, { "epoch": 2.0494469746258948, "grad_norm": 1.3593250513076782, "learning_rate": 3.2609480494678055e-05, "loss": 0.778, "step": 3150 }, { "epoch": 2.0527000650618086, "grad_norm": 0.8584513664245605, "learning_rate": 3.256029687124209e-05, "loss": 0.7634, "step": 3155 }, { "epoch": 2.055953155497723, "grad_norm": 1.1206103563308716, "learning_rate": 3.2511081019502875e-05, "loss": 0.7612, "step": 3160 }, { "epoch": 2.059206245933637, "grad_norm": 1.1010791063308716, "learning_rate": 3.2461833149259516e-05, "loss": 0.7631, "step": 3165 }, { "epoch": 2.062459336369551, "grad_norm": 1.0924779176712036, "learning_rate": 3.241255347044759e-05, "loss": 0.7592, "step": 3170 }, { "epoch": 2.065712426805465, "grad_norm": 0.9586931467056274, "learning_rate": 3.236324219313826e-05, "loss": 0.7591, "step": 3175 }, { "epoch": 2.0689655172413794, "grad_norm": 1.0838814973831177, "learning_rate": 3.231389952753742e-05, "loss": 0.7724, "step": 3180 }, { "epoch": 2.0722186076772933, "grad_norm": 0.9030594229698181, "learning_rate": 3.226452568398471e-05, "loss": 0.7627, "step": 3185 }, { "epoch": 2.0754716981132075, "grad_norm": 1.0417284965515137, "learning_rate": 3.221512087295275e-05, "loss": 0.765, "step": 3190 }, { "epoch": 2.078724788549122, "grad_norm": 1.3411697149276733, "learning_rate": 3.216568530504611e-05, "loss": 0.7718, "step": 3195 }, { "epoch": 2.0819778789850356, "grad_norm": 1.1210920810699463, "learning_rate": 3.21162191910005e-05, "loss": 0.7578, "step": 3200 }, { "epoch": 2.08523096942095, "grad_norm": 1.0522574186325073, "learning_rate": 3.2066722741681845e-05, "loss": 0.7645, "step": 3205 }, { "epoch": 2.088484059856864, "grad_norm": 0.9024161100387573, "learning_rate": 3.2017196168085345e-05, "loss": 0.7542, "step": 3210 }, { "epoch": 2.091737150292778, "grad_norm": 0.93799889087677, "learning_rate": 3.196763968133466e-05, "loss": 0.7675, "step": 3215 }, { "epoch": 2.094990240728692, "grad_norm": 0.9059098362922668, "learning_rate": 3.191805349268097e-05, "loss": 0.774, "step": 3220 }, { "epoch": 2.0982433311646065, "grad_norm": 0.954647958278656, "learning_rate": 3.1868437813502026e-05, "loss": 0.7591, "step": 3225 }, { "epoch": 2.1014964216005203, "grad_norm": 0.956679105758667, "learning_rate": 3.1818792855301316e-05, "loss": 0.7585, "step": 3230 }, { "epoch": 2.1047495120364346, "grad_norm": 0.8911952376365662, "learning_rate": 3.1769118829707156e-05, "loss": 0.7736, "step": 3235 }, { "epoch": 2.108002602472349, "grad_norm": 1.1105453968048096, "learning_rate": 3.171941594847173e-05, "loss": 0.746, "step": 3240 }, { "epoch": 2.1112556929082626, "grad_norm": 1.0151236057281494, "learning_rate": 3.1669684423470275e-05, "loss": 
0.7628, "step": 3245 }, { "epoch": 2.114508783344177, "grad_norm": 1.0137097835540771, "learning_rate": 3.16199244667001e-05, "loss": 0.7611, "step": 3250 }, { "epoch": 2.117761873780091, "grad_norm": 0.9404064416885376, "learning_rate": 3.157013629027972e-05, "loss": 0.7601, "step": 3255 }, { "epoch": 2.121014964216005, "grad_norm": 1.3806120157241821, "learning_rate": 3.152032010644796e-05, "loss": 0.7647, "step": 3260 }, { "epoch": 2.1242680546519193, "grad_norm": 0.9700812697410583, "learning_rate": 3.147047612756302e-05, "loss": 0.766, "step": 3265 }, { "epoch": 2.1275211450878335, "grad_norm": 1.1779789924621582, "learning_rate": 3.142060456610159e-05, "loss": 0.7571, "step": 3270 }, { "epoch": 2.130774235523748, "grad_norm": 1.1766470670700073, "learning_rate": 3.137070563465796e-05, "loss": 0.7587, "step": 3275 }, { "epoch": 2.1340273259596616, "grad_norm": 1.1181317567825317, "learning_rate": 3.1320779545943034e-05, "loss": 0.7514, "step": 3280 }, { "epoch": 2.137280416395576, "grad_norm": 1.520752191543579, "learning_rate": 3.127082651278357e-05, "loss": 0.7383, "step": 3285 }, { "epoch": 2.14053350683149, "grad_norm": 1.1578936576843262, "learning_rate": 3.1220846748121105e-05, "loss": 0.7736, "step": 3290 }, { "epoch": 2.143786597267404, "grad_norm": 1.3091363906860352, "learning_rate": 3.117084046501119e-05, "loss": 0.7615, "step": 3295 }, { "epoch": 2.147039687703318, "grad_norm": 0.9620407223701477, "learning_rate": 3.112080787662237e-05, "loss": 0.7924, "step": 3300 }, { "epoch": 2.1502927781392325, "grad_norm": 0.9089716672897339, "learning_rate": 3.107074919623536e-05, "loss": 0.7455, "step": 3305 }, { "epoch": 2.1535458685751463, "grad_norm": 1.1510998010635376, "learning_rate": 3.102066463724209e-05, "loss": 0.765, "step": 3310 }, { "epoch": 2.1567989590110606, "grad_norm": 1.8722169399261475, "learning_rate": 3.0970554413144805e-05, "loss": 0.7627, "step": 3315 }, { "epoch": 2.160052049446975, "grad_norm": 1.0691964626312256, "learning_rate": 3.0920418737555144e-05, "loss": 0.7753, "step": 3320 }, { "epoch": 2.1633051398828886, "grad_norm": 0.9641361832618713, "learning_rate": 3.0870257824193263e-05, "loss": 0.7516, "step": 3325 }, { "epoch": 2.166558230318803, "grad_norm": 1.0590273141860962, "learning_rate": 3.08200718868869e-05, "loss": 0.7859, "step": 3330 }, { "epoch": 2.169811320754717, "grad_norm": 1.2373055219650269, "learning_rate": 3.076986113957044e-05, "loss": 0.772, "step": 3335 }, { "epoch": 2.173064411190631, "grad_norm": 1.160982608795166, "learning_rate": 3.071962579628408e-05, "loss": 0.7673, "step": 3340 }, { "epoch": 2.1763175016265452, "grad_norm": 0.8511375188827515, "learning_rate": 3.066936607117279e-05, "loss": 0.7558, "step": 3345 }, { "epoch": 2.1795705920624595, "grad_norm": 0.9551635384559631, "learning_rate": 3.061908217848556e-05, "loss": 0.7641, "step": 3350 }, { "epoch": 2.1828236824983733, "grad_norm": 0.9262502789497375, "learning_rate": 3.056877433257434e-05, "loss": 0.7667, "step": 3355 }, { "epoch": 2.1860767729342876, "grad_norm": 1.2747892141342163, "learning_rate": 3.051844274789321e-05, "loss": 0.7497, "step": 3360 }, { "epoch": 2.189329863370202, "grad_norm": 1.2817254066467285, "learning_rate": 3.046808763899745e-05, "loss": 0.7743, "step": 3365 }, { "epoch": 2.1925829538061157, "grad_norm": 1.3123672008514404, "learning_rate": 3.041770922054262e-05, "loss": 0.7681, "step": 3370 }, { "epoch": 2.19583604424203, "grad_norm": 1.0206502676010132, "learning_rate": 3.0367307707283626e-05, "loss": 0.7833, "step": 3375 }, { "epoch": 
2.199089134677944, "grad_norm": 1.0204437971115112, "learning_rate": 3.0326970012795626e-05, "loss": 0.7575, "step": 3380 }, { "epoch": 2.202342225113858, "grad_norm": 1.0020246505737305, "learning_rate": 3.027652747038522e-05, "loss": 0.7702, "step": 3385 }, { "epoch": 2.2055953155497723, "grad_norm": 1.045996904373169, "learning_rate": 3.022606243500526e-05, "loss": 0.7609, "step": 3390 }, { "epoch": 2.2088484059856865, "grad_norm": 0.9325571060180664, "learning_rate": 3.0175575121779886e-05, "loss": 0.7363, "step": 3395 }, { "epoch": 2.2121014964216004, "grad_norm": 1.2504099607467651, "learning_rate": 3.012506574592825e-05, "loss": 0.7742, "step": 3400 }, { "epoch": 2.2153545868575146, "grad_norm": 1.0567350387573242, "learning_rate": 3.007453452276349e-05, "loss": 0.7544, "step": 3405 }, { "epoch": 2.218607677293429, "grad_norm": 0.9951023459434509, "learning_rate": 3.0023981667691926e-05, "loss": 0.7432, "step": 3410 }, { "epoch": 2.2218607677293427, "grad_norm": 1.0222620964050293, "learning_rate": 2.997340739621206e-05, "loss": 0.794, "step": 3415 }, { "epoch": 2.225113858165257, "grad_norm": 0.8401185870170593, "learning_rate": 2.9922811923913714e-05, "loss": 0.751, "step": 3420 }, { "epoch": 2.2283669486011712, "grad_norm": 1.1666043996810913, "learning_rate": 2.9872195466477054e-05, "loss": 0.7592, "step": 3425 }, { "epoch": 2.231620039037085, "grad_norm": 0.95232754945755, "learning_rate": 2.9821558239671744e-05, "loss": 0.7639, "step": 3430 }, { "epoch": 2.2348731294729993, "grad_norm": 0.8971825242042542, "learning_rate": 2.977090045935594e-05, "loss": 0.7553, "step": 3435 }, { "epoch": 2.2381262199089136, "grad_norm": 1.0237399339675903, "learning_rate": 2.9720222341475445e-05, "loss": 0.7504, "step": 3440 }, { "epoch": 2.2413793103448274, "grad_norm": 1.1775766611099243, "learning_rate": 2.966952410206275e-05, "loss": 0.7449, "step": 3445 }, { "epoch": 2.2446324007807417, "grad_norm": 0.885957658290863, "learning_rate": 2.9618805957236113e-05, "loss": 0.7631, "step": 3450 }, { "epoch": 2.247885491216656, "grad_norm": 1.3709341287612915, "learning_rate": 2.956806812319865e-05, "loss": 0.7589, "step": 3455 }, { "epoch": 2.2511385816525697, "grad_norm": 1.204150676727295, "learning_rate": 2.951731081623742e-05, "loss": 0.7662, "step": 3460 }, { "epoch": 2.254391672088484, "grad_norm": 1.6271796226501465, "learning_rate": 2.946653425272247e-05, "loss": 0.7821, "step": 3465 }, { "epoch": 2.2576447625243983, "grad_norm": 1.0852000713348389, "learning_rate": 2.9415738649105963e-05, "loss": 0.7408, "step": 3470 }, { "epoch": 2.260897852960312, "grad_norm": 1.0353608131408691, "learning_rate": 2.9364924221921185e-05, "loss": 0.7478, "step": 3475 }, { "epoch": 2.2641509433962264, "grad_norm": 1.881262183189392, "learning_rate": 2.9314091187781715e-05, "loss": 0.7584, "step": 3480 }, { "epoch": 2.2674040338321406, "grad_norm": 1.2990703582763672, "learning_rate": 2.9263239763380412e-05, "loss": 0.7566, "step": 3485 }, { "epoch": 2.2706571242680544, "grad_norm": 0.9985173940658569, "learning_rate": 2.921237016548854e-05, "loss": 0.7676, "step": 3490 }, { "epoch": 2.2739102147039687, "grad_norm": 0.9522629976272583, "learning_rate": 2.9161482610954842e-05, "loss": 0.7475, "step": 3495 }, { "epoch": 2.277163305139883, "grad_norm": 0.9219643473625183, "learning_rate": 2.9110577316704602e-05, "loss": 0.7613, "step": 3500 }, { "epoch": 2.280416395575797, "grad_norm": 0.9594421982765198, "learning_rate": 2.905965449973871e-05, "loss": 0.768, "step": 3505 }, { "epoch": 2.283669486011711, 
"grad_norm": 1.0452098846435547, "learning_rate": 2.900871437713279e-05, "loss": 0.7699, "step": 3510 }, { "epoch": 2.2869225764476253, "grad_norm": 0.9670342803001404, "learning_rate": 2.8957757166036193e-05, "loss": 0.7573, "step": 3515 }, { "epoch": 2.290175666883539, "grad_norm": 1.147403597831726, "learning_rate": 2.890678308367115e-05, "loss": 0.7688, "step": 3520 }, { "epoch": 2.2934287573194534, "grad_norm": 1.086470603942871, "learning_rate": 2.8855792347331793e-05, "loss": 0.7671, "step": 3525 }, { "epoch": 2.2966818477553677, "grad_norm": 1.6733858585357666, "learning_rate": 2.8804785174383248e-05, "loss": 0.7753, "step": 3530 }, { "epoch": 2.2999349381912815, "grad_norm": 1.0693230628967285, "learning_rate": 2.8753761782260723e-05, "loss": 0.7457, "step": 3535 }, { "epoch": 2.3031880286271957, "grad_norm": 1.079010009765625, "learning_rate": 2.8702722388468546e-05, "loss": 0.7701, "step": 3540 }, { "epoch": 2.30644111906311, "grad_norm": 0.9620556235313416, "learning_rate": 2.8651667210579257e-05, "loss": 0.759, "step": 3545 }, { "epoch": 2.3096942094990243, "grad_norm": 1.1349847316741943, "learning_rate": 2.8600596466232715e-05, "loss": 0.7776, "step": 3550 }, { "epoch": 2.312947299934938, "grad_norm": 1.4847538471221924, "learning_rate": 2.8549510373135092e-05, "loss": 0.7566, "step": 3555 }, { "epoch": 2.3162003903708523, "grad_norm": 1.657256007194519, "learning_rate": 2.8498409149058008e-05, "loss": 0.762, "step": 3560 }, { "epoch": 2.3194534808067666, "grad_norm": 1.0619240999221802, "learning_rate": 2.8447293011837596e-05, "loss": 0.771, "step": 3565 }, { "epoch": 2.3227065712426804, "grad_norm": 0.8844910264015198, "learning_rate": 2.8396162179373535e-05, "loss": 0.7573, "step": 3570 }, { "epoch": 2.3259596616785947, "grad_norm": 1.3543357849121094, "learning_rate": 2.8345016869628175e-05, "loss": 0.7736, "step": 3575 }, { "epoch": 2.329212752114509, "grad_norm": 0.9610804319381714, "learning_rate": 2.8293857300625555e-05, "loss": 0.7536, "step": 3580 }, { "epoch": 2.3324658425504228, "grad_norm": 1.2407771348953247, "learning_rate": 2.8242683690450518e-05, "loss": 0.7584, "step": 3585 }, { "epoch": 2.335718932986337, "grad_norm": 1.388168215751648, "learning_rate": 2.8191496257247764e-05, "loss": 0.7426, "step": 3590 }, { "epoch": 2.3389720234222513, "grad_norm": 1.1140729188919067, "learning_rate": 2.814029521922088e-05, "loss": 0.7418, "step": 3595 }, { "epoch": 2.342225113858165, "grad_norm": 1.0877522230148315, "learning_rate": 2.8089080794631512e-05, "loss": 0.7531, "step": 3600 }, { "epoch": 2.3454782042940794, "grad_norm": 1.0917423963546753, "learning_rate": 2.803785320179832e-05, "loss": 0.7435, "step": 3605 }, { "epoch": 2.3487312947299936, "grad_norm": 1.3571592569351196, "learning_rate": 2.7986612659096113e-05, "loss": 0.7594, "step": 3610 }, { "epoch": 2.3519843851659075, "grad_norm": 1.0520139932632446, "learning_rate": 2.7935359384954914e-05, "loss": 0.758, "step": 3615 }, { "epoch": 2.3552374756018217, "grad_norm": 1.271592617034912, "learning_rate": 2.7884093597858996e-05, "loss": 0.7457, "step": 3620 }, { "epoch": 2.358490566037736, "grad_norm": 0.9961024522781372, "learning_rate": 2.783281551634599e-05, "loss": 0.7626, "step": 3625 }, { "epoch": 2.36174365647365, "grad_norm": 1.3508564233779907, "learning_rate": 2.7781525359005943e-05, "loss": 0.734, "step": 3630 }, { "epoch": 2.364996746909564, "grad_norm": 1.0961614847183228, "learning_rate": 2.7730223344480348e-05, "loss": 0.7553, "step": 3635 }, { "epoch": 2.3682498373454783, "grad_norm": 
1.032395839691162, "learning_rate": 2.7678909691461274e-05, "loss": 0.7915, "step": 3640 }, { "epoch": 2.371502927781392, "grad_norm": 1.1500605344772339, "learning_rate": 2.7627584618690394e-05, "loss": 0.7539, "step": 3645 }, { "epoch": 2.3747560182173064, "grad_norm": 1.0203113555908203, "learning_rate": 2.7576248344958054e-05, "loss": 0.7771, "step": 3650 }, { "epoch": 2.3780091086532207, "grad_norm": 2.247779607772827, "learning_rate": 2.7524901089102358e-05, "loss": 0.764, "step": 3655 }, { "epoch": 2.3812621990891345, "grad_norm": 1.131200909614563, "learning_rate": 2.7473543070008213e-05, "loss": 0.742, "step": 3660 }, { "epoch": 2.3845152895250488, "grad_norm": 1.2509359121322632, "learning_rate": 2.7422174506606413e-05, "loss": 0.7461, "step": 3665 }, { "epoch": 2.387768379960963, "grad_norm": 0.864366352558136, "learning_rate": 2.737079561787272e-05, "loss": 0.7405, "step": 3670 }, { "epoch": 2.391021470396877, "grad_norm": 0.9416084885597229, "learning_rate": 2.7319406622826878e-05, "loss": 0.7439, "step": 3675 }, { "epoch": 2.394274560832791, "grad_norm": 1.7094473838806152, "learning_rate": 2.726800774053173e-05, "loss": 0.7698, "step": 3680 }, { "epoch": 2.3975276512687054, "grad_norm": 0.9964091777801514, "learning_rate": 2.7216599190092273e-05, "loss": 0.7536, "step": 3685 }, { "epoch": 2.4007807417046196, "grad_norm": 1.1519944667816162, "learning_rate": 2.7165181190654702e-05, "loss": 0.7459, "step": 3690 }, { "epoch": 2.4040338321405335, "grad_norm": 1.2240533828735352, "learning_rate": 2.7113753961405515e-05, "loss": 0.7434, "step": 3695 }, { "epoch": 2.4072869225764477, "grad_norm": 1.122253656387329, "learning_rate": 2.7062317721570512e-05, "loss": 0.7471, "step": 3700 }, { "epoch": 2.410540013012362, "grad_norm": 1.0433543920516968, "learning_rate": 2.7010872690413956e-05, "loss": 0.7429, "step": 3705 }, { "epoch": 2.413793103448276, "grad_norm": 1.092159628868103, "learning_rate": 2.6959419087237553e-05, "loss": 0.7506, "step": 3710 }, { "epoch": 2.41704619388419, "grad_norm": 0.9082927107810974, "learning_rate": 2.6907957131379553e-05, "loss": 0.7666, "step": 3715 }, { "epoch": 2.4202992843201043, "grad_norm": 0.8798219561576843, "learning_rate": 2.6856487042213822e-05, "loss": 0.7637, "step": 3720 }, { "epoch": 2.423552374756018, "grad_norm": 0.8654388189315796, "learning_rate": 2.6805009039148897e-05, "loss": 0.7541, "step": 3725 }, { "epoch": 2.4268054651919324, "grad_norm": 1.0439229011535645, "learning_rate": 2.675352334162704e-05, "loss": 0.7618, "step": 3730 }, { "epoch": 2.4300585556278467, "grad_norm": 0.9634140729904175, "learning_rate": 2.6702030169123316e-05, "loss": 0.737, "step": 3735 }, { "epoch": 2.4333116460637605, "grad_norm": 0.8647895455360413, "learning_rate": 2.6650529741144665e-05, "loss": 0.7485, "step": 3740 }, { "epoch": 2.4365647364996748, "grad_norm": 1.984215259552002, "learning_rate": 2.6599022277228948e-05, "loss": 0.7541, "step": 3745 }, { "epoch": 2.439817826935589, "grad_norm": 1.074607014656067, "learning_rate": 2.6547507996944022e-05, "loss": 0.7595, "step": 3750 }, { "epoch": 2.443070917371503, "grad_norm": 0.9121082425117493, "learning_rate": 2.649598711988679e-05, "loss": 0.7741, "step": 3755 }, { "epoch": 2.446324007807417, "grad_norm": 1.6042678356170654, "learning_rate": 2.6444459865682297e-05, "loss": 0.7699, "step": 3760 }, { "epoch": 2.4495770982433314, "grad_norm": 0.9366397857666016, "learning_rate": 2.6392926453982748e-05, "loss": 0.7525, "step": 3765 }, { "epoch": 2.452830188679245, "grad_norm": 1.0728055238723755, 
"learning_rate": 2.6341387104466612e-05, "loss": 0.749, "step": 3770 }, { "epoch": 2.4560832791151594, "grad_norm": 0.988258957862854, "learning_rate": 2.6289842036837675e-05, "loss": 0.7563, "step": 3775 }, { "epoch": 2.4593363695510737, "grad_norm": 1.2626458406448364, "learning_rate": 2.6238291470824085e-05, "loss": 0.7367, "step": 3780 }, { "epoch": 2.4625894599869875, "grad_norm": 0.8835701942443848, "learning_rate": 2.6186735626177428e-05, "loss": 0.7534, "step": 3785 }, { "epoch": 2.465842550422902, "grad_norm": 0.8948650360107422, "learning_rate": 2.6135174722671813e-05, "loss": 0.7975, "step": 3790 }, { "epoch": 2.469095640858816, "grad_norm": 1.0557647943496704, "learning_rate": 2.608360898010288e-05, "loss": 0.7542, "step": 3795 }, { "epoch": 2.47234873129473, "grad_norm": 1.1379538774490356, "learning_rate": 2.603203861828693e-05, "loss": 0.7569, "step": 3800 }, { "epoch": 2.475601821730644, "grad_norm": 1.1298165321350098, "learning_rate": 2.598046385705994e-05, "loss": 0.7662, "step": 3805 }, { "epoch": 2.4788549121665584, "grad_norm": 0.9936167001724243, "learning_rate": 2.5928884916276635e-05, "loss": 0.7427, "step": 3810 }, { "epoch": 2.482108002602472, "grad_norm": 1.055421233177185, "learning_rate": 2.5877302015809574e-05, "loss": 0.741, "step": 3815 }, { "epoch": 2.4853610930383865, "grad_norm": 1.0035120248794556, "learning_rate": 2.5825715375548175e-05, "loss": 0.7495, "step": 3820 }, { "epoch": 2.4886141834743007, "grad_norm": 1.5768109560012817, "learning_rate": 2.5774125215397815e-05, "loss": 0.7677, "step": 3825 }, { "epoch": 2.4918672739102146, "grad_norm": 1.1085072755813599, "learning_rate": 2.5722531755278874e-05, "loss": 0.7693, "step": 3830 }, { "epoch": 2.495120364346129, "grad_norm": 0.9290764927864075, "learning_rate": 2.567093521512578e-05, "loss": 0.7734, "step": 3835 }, { "epoch": 2.498373454782043, "grad_norm": 1.2003841400146484, "learning_rate": 2.561933581488612e-05, "loss": 0.7529, "step": 3840 }, { "epoch": 2.501626545217957, "grad_norm": 0.9982072114944458, "learning_rate": 2.556773377451965e-05, "loss": 0.7555, "step": 3845 }, { "epoch": 2.504879635653871, "grad_norm": 0.9454076886177063, "learning_rate": 2.5516129313997388e-05, "loss": 0.7726, "step": 3850 }, { "epoch": 2.5081327260897854, "grad_norm": 0.9885278940200806, "learning_rate": 2.5464522653300676e-05, "loss": 0.7585, "step": 3855 }, { "epoch": 2.5113858165256993, "grad_norm": 1.0617841482162476, "learning_rate": 2.541291401242022e-05, "loss": 0.7613, "step": 3860 }, { "epoch": 2.5146389069616135, "grad_norm": 0.9445372223854065, "learning_rate": 2.536130361135518e-05, "loss": 0.7867, "step": 3865 }, { "epoch": 2.517891997397528, "grad_norm": 1.2932319641113281, "learning_rate": 2.5309691670112218e-05, "loss": 0.7509, "step": 3870 }, { "epoch": 2.5211450878334416, "grad_norm": 1.1702325344085693, "learning_rate": 2.525807840870455e-05, "loss": 0.7772, "step": 3875 }, { "epoch": 2.524398178269356, "grad_norm": 1.0334542989730835, "learning_rate": 2.5206464047151046e-05, "loss": 0.7478, "step": 3880 }, { "epoch": 2.52765126870527, "grad_norm": 2.0176279544830322, "learning_rate": 2.5154848805475224e-05, "loss": 0.759, "step": 3885 }, { "epoch": 2.530904359141184, "grad_norm": 1.1288046836853027, "learning_rate": 2.5103232903704393e-05, "loss": 0.7529, "step": 3890 }, { "epoch": 2.534157449577098, "grad_norm": 1.0248112678527832, "learning_rate": 2.5051616561868663e-05, "loss": 0.7748, "step": 3895 }, { "epoch": 2.5374105400130125, "grad_norm": 0.8906844258308411, "learning_rate": 
2.5e-05, "loss": 0.7369, "step": 3900 }, { "epoch": 2.5406636304489263, "grad_norm": 1.1588047742843628, "learning_rate": 2.4948383438131346e-05, "loss": 0.7465, "step": 3905 }, { "epoch": 2.5439167208848406, "grad_norm": 1.0166900157928467, "learning_rate": 2.4896767096295613e-05, "loss": 0.7576, "step": 3910 }, { "epoch": 2.547169811320755, "grad_norm": 1.0682686567306519, "learning_rate": 2.484515119452478e-05, "loss": 0.7884, "step": 3915 }, { "epoch": 2.5504229017566686, "grad_norm": 0.9026442766189575, "learning_rate": 2.4793535952848963e-05, "loss": 0.7311, "step": 3920 }, { "epoch": 2.553675992192583, "grad_norm": 0.8642654418945312, "learning_rate": 2.4741921591295454e-05, "loss": 0.7547, "step": 3925 }, { "epoch": 2.556929082628497, "grad_norm": 1.1124982833862305, "learning_rate": 2.4690308329887788e-05, "loss": 0.7523, "step": 3930 }, { "epoch": 2.560182173064411, "grad_norm": 1.664115309715271, "learning_rate": 2.463869638864483e-05, "loss": 0.7249, "step": 3935 }, { "epoch": 2.5634352635003252, "grad_norm": 0.9926962852478027, "learning_rate": 2.458708598757979e-05, "loss": 0.7318, "step": 3940 }, { "epoch": 2.5666883539362395, "grad_norm": 1.076627254486084, "learning_rate": 2.4535477346699333e-05, "loss": 0.7586, "step": 3945 }, { "epoch": 2.5699414443721533, "grad_norm": 1.7046575546264648, "learning_rate": 2.4483870686002625e-05, "loss": 0.7482, "step": 3950 }, { "epoch": 2.5731945348080676, "grad_norm": 1.0066241025924683, "learning_rate": 2.443226622548036e-05, "loss": 0.7636, "step": 3955 }, { "epoch": 2.576447625243982, "grad_norm": 2.010552406311035, "learning_rate": 2.4380664185113887e-05, "loss": 0.7661, "step": 3960 }, { "epoch": 2.5797007156798957, "grad_norm": 1.1133430004119873, "learning_rate": 2.432906478487423e-05, "loss": 0.7597, "step": 3965 }, { "epoch": 2.58295380611581, "grad_norm": 1.1634178161621094, "learning_rate": 2.427746824472113e-05, "loss": 0.76, "step": 3970 }, { "epoch": 2.586206896551724, "grad_norm": 0.9780275821685791, "learning_rate": 2.4225874784602184e-05, "loss": 0.7688, "step": 3975 }, { "epoch": 2.589459986987638, "grad_norm": 1.2186133861541748, "learning_rate": 2.4174284624451824e-05, "loss": 0.7309, "step": 3980 }, { "epoch": 2.5927130774235523, "grad_norm": 0.9547963738441467, "learning_rate": 2.4122697984190428e-05, "loss": 0.7593, "step": 3985 }, { "epoch": 2.5959661678594665, "grad_norm": 0.943261444568634, "learning_rate": 2.4071115083723364e-05, "loss": 0.7562, "step": 3990 }, { "epoch": 2.5992192582953804, "grad_norm": 0.9355084896087646, "learning_rate": 2.401953614294006e-05, "loss": 0.7294, "step": 3995 }, { "epoch": 2.6024723487312946, "grad_norm": 1.0167070627212524, "learning_rate": 2.396796138171307e-05, "loss": 0.7578, "step": 4000 }, { "epoch": 2.605725439167209, "grad_norm": 0.9536129832267761, "learning_rate": 2.391639101989712e-05, "loss": 0.7363, "step": 4005 }, { "epoch": 2.6089785296031227, "grad_norm": 0.9292064309120178, "learning_rate": 2.3864825277328193e-05, "loss": 0.7517, "step": 4010 }, { "epoch": 2.612231620039037, "grad_norm": 1.1821918487548828, "learning_rate": 2.3813264373822578e-05, "loss": 0.7627, "step": 4015 }, { "epoch": 2.6154847104749512, "grad_norm": 0.9278668165206909, "learning_rate": 2.376170852917592e-05, "loss": 0.7673, "step": 4020 }, { "epoch": 2.618737800910865, "grad_norm": 0.9061160683631897, "learning_rate": 2.3710157963162328e-05, "loss": 0.774, "step": 4025 }, { "epoch": 2.6219908913467793, "grad_norm": 1.2330580949783325, "learning_rate": 2.3658612895533393e-05, "loss": 
0.7514, "step": 4030 }, { "epoch": 2.6252439817826936, "grad_norm": 0.9609399437904358, "learning_rate": 2.3607073546017258e-05, "loss": 0.7373, "step": 4035 }, { "epoch": 2.6284970722186074, "grad_norm": 1.5064210891723633, "learning_rate": 2.3555540134317712e-05, "loss": 0.7487, "step": 4040 }, { "epoch": 2.6317501626545217, "grad_norm": 1.0178202390670776, "learning_rate": 2.3504012880113216e-05, "loss": 0.7789, "step": 4045 }, { "epoch": 2.635003253090436, "grad_norm": 0.8506657481193542, "learning_rate": 2.3452492003055984e-05, "loss": 0.7316, "step": 4050 }, { "epoch": 2.63825634352635, "grad_norm": 0.9458078145980835, "learning_rate": 2.3400977722771058e-05, "loss": 0.7703, "step": 4055 }, { "epoch": 2.641509433962264, "grad_norm": 1.1263021230697632, "learning_rate": 2.3349470258855337e-05, "loss": 0.7579, "step": 4060 }, { "epoch": 2.6447625243981783, "grad_norm": 0.8372018933296204, "learning_rate": 2.3297969830876686e-05, "loss": 0.76, "step": 4065 }, { "epoch": 2.6480156148340925, "grad_norm": 0.8701651692390442, "learning_rate": 2.3246476658372973e-05, "loss": 0.7476, "step": 4070 }, { "epoch": 2.6512687052700064, "grad_norm": 1.3167948722839355, "learning_rate": 2.3194990960851112e-05, "loss": 0.7628, "step": 4075 }, { "epoch": 2.6545217957059206, "grad_norm": 1.0400781631469727, "learning_rate": 2.3143512957786184e-05, "loss": 0.7773, "step": 4080 }, { "epoch": 2.657774886141835, "grad_norm": 0.9622422456741333, "learning_rate": 2.309204286862046e-05, "loss": 0.7469, "step": 4085 }, { "epoch": 2.6610279765777487, "grad_norm": 0.929834246635437, "learning_rate": 2.3040580912762456e-05, "loss": 0.7544, "step": 4090 }, { "epoch": 2.664281067013663, "grad_norm": 1.018149495124817, "learning_rate": 2.298912730958605e-05, "loss": 0.7746, "step": 4095 }, { "epoch": 2.6675341574495772, "grad_norm": 1.0057318210601807, "learning_rate": 2.2937682278429494e-05, "loss": 0.7352, "step": 4100 }, { "epoch": 2.6707872478854915, "grad_norm": 0.9973504543304443, "learning_rate": 2.288624603859449e-05, "loss": 0.721, "step": 4105 }, { "epoch": 2.6740403383214053, "grad_norm": 1.0883572101593018, "learning_rate": 2.2834818809345297e-05, "loss": 0.7474, "step": 4110 }, { "epoch": 2.6772934287573196, "grad_norm": 1.337254524230957, "learning_rate": 2.2783400809907726e-05, "loss": 0.7701, "step": 4115 }, { "epoch": 2.680546519193234, "grad_norm": 1.1612261533737183, "learning_rate": 2.2731992259468272e-05, "loss": 0.7547, "step": 4120 }, { "epoch": 2.6837996096291477, "grad_norm": 1.0043455362319946, "learning_rate": 2.2680593377173124e-05, "loss": 0.7576, "step": 4125 }, { "epoch": 2.687052700065062, "grad_norm": 1.180498719215393, "learning_rate": 2.2629204382127284e-05, "loss": 0.7533, "step": 4130 }, { "epoch": 2.690305790500976, "grad_norm": 1.0349406003952026, "learning_rate": 2.257782549339359e-05, "loss": 0.7636, "step": 4135 }, { "epoch": 2.69355888093689, "grad_norm": 1.073776125907898, "learning_rate": 2.2526456929991793e-05, "loss": 0.7718, "step": 4140 }, { "epoch": 2.6968119713728043, "grad_norm": 1.114530324935913, "learning_rate": 2.2475098910897645e-05, "loss": 0.7445, "step": 4145 }, { "epoch": 2.7000650618087185, "grad_norm": 0.9346311092376709, "learning_rate": 2.2423751655041952e-05, "loss": 0.7294, "step": 4150 }, { "epoch": 2.7033181522446323, "grad_norm": 1.086501955986023, "learning_rate": 2.237241538130961e-05, "loss": 0.7507, "step": 4155 }, { "epoch": 2.7065712426805466, "grad_norm": 0.9763929843902588, "learning_rate": 2.2321090308538732e-05, "loss": 0.743, "step": 
4160 }, { "epoch": 2.709824333116461, "grad_norm": 0.8880870938301086, "learning_rate": 2.2269776655519658e-05, "loss": 0.7418, "step": 4165 }, { "epoch": 2.7130774235523747, "grad_norm": 0.9564589858055115, "learning_rate": 2.2218474640994063e-05, "loss": 0.765, "step": 4170 }, { "epoch": 2.716330513988289, "grad_norm": 1.169952630996704, "learning_rate": 2.2167184483654013e-05, "loss": 0.7531, "step": 4175 }, { "epoch": 2.719583604424203, "grad_norm": 0.9627036452293396, "learning_rate": 2.211590640214101e-05, "loss": 0.7623, "step": 4180 }, { "epoch": 2.722836694860117, "grad_norm": 0.9291010499000549, "learning_rate": 2.2064640615045092e-05, "loss": 0.7641, "step": 4185 }, { "epoch": 2.7260897852960313, "grad_norm": 1.0236008167266846, "learning_rate": 2.2013387340903893e-05, "loss": 0.7703, "step": 4190 }, { "epoch": 2.7293428757319456, "grad_norm": 1.2711366415023804, "learning_rate": 2.1962146798201684e-05, "loss": 0.7454, "step": 4195 }, { "epoch": 2.7325959661678594, "grad_norm": 1.1424434185028076, "learning_rate": 2.191091920536849e-05, "loss": 0.7559, "step": 4200 }, { "epoch": 2.7358490566037736, "grad_norm": 1.4138892889022827, "learning_rate": 2.1859704780779126e-05, "loss": 0.7569, "step": 4205 }, { "epoch": 2.739102147039688, "grad_norm": 0.967829704284668, "learning_rate": 2.1808503742752252e-05, "loss": 0.7432, "step": 4210 }, { "epoch": 2.7423552374756017, "grad_norm": 0.8999619483947754, "learning_rate": 2.175731630954949e-05, "loss": 0.7457, "step": 4215 }, { "epoch": 2.745608327911516, "grad_norm": 1.0657751560211182, "learning_rate": 2.1706142699374454e-05, "loss": 0.786, "step": 4220 }, { "epoch": 2.7488614183474303, "grad_norm": 1.5017127990722656, "learning_rate": 2.1654983130371837e-05, "loss": 0.7516, "step": 4225 }, { "epoch": 2.752114508783344, "grad_norm": 1.0914252996444702, "learning_rate": 2.1603837820626478e-05, "loss": 0.7616, "step": 4230 }, { "epoch": 2.7553675992192583, "grad_norm": 1.1397154331207275, "learning_rate": 2.1552706988162417e-05, "loss": 0.761, "step": 4235 }, { "epoch": 2.7586206896551726, "grad_norm": 1.162166714668274, "learning_rate": 2.1501590850941994e-05, "loss": 0.7353, "step": 4240 }, { "epoch": 2.7618737800910864, "grad_norm": 1.0100218057632446, "learning_rate": 2.1450489626864907e-05, "loss": 0.7446, "step": 4245 }, { "epoch": 2.7651268705270007, "grad_norm": 0.9108495116233826, "learning_rate": 2.139940353376728e-05, "loss": 0.7644, "step": 4250 }, { "epoch": 2.768379960962915, "grad_norm": 0.9544759392738342, "learning_rate": 2.134833278942074e-05, "loss": 0.7693, "step": 4255 }, { "epoch": 2.7716330513988288, "grad_norm": 1.6715203523635864, "learning_rate": 2.1297277611531456e-05, "loss": 0.764, "step": 4260 }, { "epoch": 2.774886141834743, "grad_norm": 1.0044587850570679, "learning_rate": 2.1246238217739283e-05, "loss": 0.7593, "step": 4265 }, { "epoch": 2.7781392322706573, "grad_norm": 0.9041277766227722, "learning_rate": 2.119521482561675e-05, "loss": 0.7427, "step": 4270 }, { "epoch": 2.781392322706571, "grad_norm": 0.8890901803970337, "learning_rate": 2.114420765266821e-05, "loss": 0.7462, "step": 4275 }, { "epoch": 2.7846454131424854, "grad_norm": 0.9522978663444519, "learning_rate": 2.1093216916328855e-05, "loss": 0.7398, "step": 4280 }, { "epoch": 2.7878985035783996, "grad_norm": 1.2829575538635254, "learning_rate": 2.104224283396381e-05, "loss": 0.7632, "step": 4285 }, { "epoch": 2.7911515940143135, "grad_norm": 0.9626341462135315, "learning_rate": 2.0991285622867215e-05, "loss": 0.7681, "step": 4290 }, { 
"epoch": 2.7944046844502277, "grad_norm": 0.952867865562439, "learning_rate": 2.0940345500261294e-05, "loss": 0.7518, "step": 4295 }, { "epoch": 2.797657774886142, "grad_norm": 1.0598902702331543, "learning_rate": 2.0889422683295407e-05, "loss": 0.7884, "step": 4300 }, { "epoch": 2.800910865322056, "grad_norm": 1.0540211200714111, "learning_rate": 2.083851738904516e-05, "loss": 0.7518, "step": 4305 }, { "epoch": 2.80416395575797, "grad_norm": 0.9470973014831543, "learning_rate": 2.0787629834511466e-05, "loss": 0.764, "step": 4310 }, { "epoch": 2.8074170461938843, "grad_norm": 1.127659559249878, "learning_rate": 2.0736760236619594e-05, "loss": 0.7332, "step": 4315 }, { "epoch": 2.810670136629798, "grad_norm": 1.0755411386489868, "learning_rate": 2.0685908812218287e-05, "loss": 0.7622, "step": 4320 }, { "epoch": 2.8139232270657124, "grad_norm": 1.1209520101547241, "learning_rate": 2.0635075778078817e-05, "loss": 0.7416, "step": 4325 }, { "epoch": 2.8171763175016267, "grad_norm": 1.0491728782653809, "learning_rate": 2.0584261350894046e-05, "loss": 0.7802, "step": 4330 }, { "epoch": 2.8204294079375405, "grad_norm": 1.025694727897644, "learning_rate": 2.0533465747277535e-05, "loss": 0.7487, "step": 4335 }, { "epoch": 2.8236824983734548, "grad_norm": 0.9486551880836487, "learning_rate": 2.0482689183762588e-05, "loss": 0.7594, "step": 4340 }, { "epoch": 2.826935588809369, "grad_norm": 0.9839990139007568, "learning_rate": 2.0431931876801352e-05, "loss": 0.7431, "step": 4345 }, { "epoch": 2.830188679245283, "grad_norm": 1.0050575733184814, "learning_rate": 2.03811940427639e-05, "loss": 0.7527, "step": 4350 }, { "epoch": 2.833441769681197, "grad_norm": 0.9743004441261292, "learning_rate": 2.033047589793726e-05, "loss": 0.7307, "step": 4355 }, { "epoch": 2.8366948601171114, "grad_norm": 1.0488122701644897, "learning_rate": 2.027977765852456e-05, "loss": 0.7598, "step": 4360 }, { "epoch": 2.839947950553025, "grad_norm": 1.074271321296692, "learning_rate": 2.022909954064407e-05, "loss": 0.7571, "step": 4365 }, { "epoch": 2.8432010409889394, "grad_norm": 0.9306830167770386, "learning_rate": 2.0178441760328268e-05, "loss": 0.735, "step": 4370 }, { "epoch": 2.8464541314248537, "grad_norm": 0.8995447754859924, "learning_rate": 2.0127804533522948e-05, "loss": 0.7519, "step": 4375 }, { "epoch": 2.8497072218607675, "grad_norm": 0.9495101571083069, "learning_rate": 2.0077188076086288e-05, "loss": 0.7544, "step": 4380 }, { "epoch": 2.852960312296682, "grad_norm": 1.3610079288482666, "learning_rate": 2.002659260378794e-05, "loss": 0.7573, "step": 4385 }, { "epoch": 2.856213402732596, "grad_norm": 0.9668116569519043, "learning_rate": 1.9976018332308077e-05, "loss": 0.7332, "step": 4390 }, { "epoch": 2.85946649316851, "grad_norm": 1.128670334815979, "learning_rate": 1.992546547723651e-05, "loss": 0.7512, "step": 4395 }, { "epoch": 2.862719583604424, "grad_norm": 1.276426911354065, "learning_rate": 1.987493425407176e-05, "loss": 0.7449, "step": 4400 }, { "epoch": 2.8659726740403384, "grad_norm": 0.9716594815254211, "learning_rate": 1.982442487822011e-05, "loss": 0.7432, "step": 4405 }, { "epoch": 2.869225764476252, "grad_norm": 0.9533106088638306, "learning_rate": 1.9773937564994745e-05, "loss": 0.7423, "step": 4410 }, { "epoch": 2.8724788549121665, "grad_norm": 1.0256469249725342, "learning_rate": 1.972347252961479e-05, "loss": 0.7614, "step": 4415 }, { "epoch": 2.8757319453480807, "grad_norm": 1.1626900434494019, "learning_rate": 1.967302998720438e-05, "loss": 0.7392, "step": 4420 }, { "epoch": 
2.8789850357839946, "grad_norm": 0.9739611744880676, "learning_rate": 1.9622610152791792e-05, "loss": 0.7622, "step": 4425 }, { "epoch": 2.882238126219909, "grad_norm": 1.0657685995101929, "learning_rate": 1.9572213241308507e-05, "loss": 0.7507, "step": 4430 }, { "epoch": 2.885491216655823, "grad_norm": 1.029432773590088, "learning_rate": 1.952183946758826e-05, "loss": 0.7723, "step": 4435 }, { "epoch": 2.888744307091737, "grad_norm": 1.1281373500823975, "learning_rate": 1.9471489046366185e-05, "loss": 0.7479, "step": 4440 }, { "epoch": 2.891997397527651, "grad_norm": 1.1470041275024414, "learning_rate": 1.942116219227784e-05, "loss": 0.7341, "step": 4445 }, { "epoch": 2.8952504879635654, "grad_norm": 1.0326032638549805, "learning_rate": 1.937085911985834e-05, "loss": 0.7571, "step": 4450 }, { "epoch": 2.8985035783994793, "grad_norm": 0.9806135296821594, "learning_rate": 1.9320580043541425e-05, "loss": 0.734, "step": 4455 }, { "epoch": 2.9017566688353935, "grad_norm": 1.063024878501892, "learning_rate": 1.9270325177658523e-05, "loss": 0.7521, "step": 4460 }, { "epoch": 2.905009759271308, "grad_norm": 4.5842156410217285, "learning_rate": 1.922009473643787e-05, "loss": 0.7563, "step": 4465 }, { "epoch": 2.9082628497072216, "grad_norm": 1.3341448307037354, "learning_rate": 1.9169888934003598e-05, "loss": 0.7528, "step": 4470 }, { "epoch": 2.911515940143136, "grad_norm": 1.3391072750091553, "learning_rate": 1.9119707984374774e-05, "loss": 0.737, "step": 4475 }, { "epoch": 2.91476903057905, "grad_norm": 0.985970139503479, "learning_rate": 1.9069552101464552e-05, "loss": 0.7657, "step": 4480 }, { "epoch": 2.918022121014964, "grad_norm": 1.069992184638977, "learning_rate": 1.901942149907922e-05, "loss": 0.7526, "step": 4485 }, { "epoch": 2.921275211450878, "grad_norm": 0.8812434077262878, "learning_rate": 1.8969316390917288e-05, "loss": 0.7664, "step": 4490 }, { "epoch": 2.9245283018867925, "grad_norm": 1.2932692766189575, "learning_rate": 1.891923699056861e-05, "loss": 0.7553, "step": 4495 }, { "epoch": 2.9277813923227067, "grad_norm": 0.935070276260376, "learning_rate": 1.886918351151343e-05, "loss": 0.7583, "step": 4500 }, { "epoch": 2.9310344827586206, "grad_norm": 0.9840937852859497, "learning_rate": 1.881915616712151e-05, "loss": 0.748, "step": 4505 }, { "epoch": 2.934287573194535, "grad_norm": 1.0583505630493164, "learning_rate": 1.8769155170651203e-05, "loss": 0.7482, "step": 4510 }, { "epoch": 2.937540663630449, "grad_norm": 1.0253130197525024, "learning_rate": 1.8719180735248522e-05, "loss": 0.751, "step": 4515 }, { "epoch": 2.940793754066363, "grad_norm": 1.0491794347763062, "learning_rate": 1.8669233073946303e-05, "loss": 0.7533, "step": 4520 }, { "epoch": 2.944046844502277, "grad_norm": 1.1201449632644653, "learning_rate": 1.86193123996632e-05, "loss": 0.7486, "step": 4525 }, { "epoch": 2.9472999349381914, "grad_norm": 1.3683768510818481, "learning_rate": 1.856941892520284e-05, "loss": 0.7584, "step": 4530 }, { "epoch": 2.9505530253741052, "grad_norm": 1.0555903911590576, "learning_rate": 1.851955286325292e-05, "loss": 0.7554, "step": 4535 }, { "epoch": 2.9538061158100195, "grad_norm": 1.5055445432662964, "learning_rate": 1.846971442638426e-05, "loss": 0.7418, "step": 4540 }, { "epoch": 2.9570592062459338, "grad_norm": 1.222474455833435, "learning_rate": 1.841990382704993e-05, "loss": 0.7455, "step": 4545 }, { "epoch": 2.960312296681848, "grad_norm": 1.0359810590744019, "learning_rate": 1.8370121277584325e-05, "loss": 0.7404, "step": 4550 }, { "epoch": 2.963565387117762, "grad_norm": 
1.2511727809906006, "learning_rate": 1.8320366990202276e-05, "loss": 0.7228, "step": 4555 }, { "epoch": 2.966818477553676, "grad_norm": 0.8730882406234741, "learning_rate": 1.827064117699814e-05, "loss": 0.7586, "step": 4560 }, { "epoch": 2.9700715679895904, "grad_norm": 1.5805312395095825, "learning_rate": 1.822094404994487e-05, "loss": 0.7499, "step": 4565 }, { "epoch": 2.973324658425504, "grad_norm": 1.1607098579406738, "learning_rate": 1.817127582089317e-05, "loss": 0.7637, "step": 4570 }, { "epoch": 2.9765777488614185, "grad_norm": 0.9193926453590393, "learning_rate": 1.8121636701570537e-05, "loss": 0.7532, "step": 4575 }, { "epoch": 2.9798308392973327, "grad_norm": 1.0218764543533325, "learning_rate": 1.807202690358037e-05, "loss": 0.7503, "step": 4580 }, { "epoch": 2.9830839297332465, "grad_norm": 1.0876221656799316, "learning_rate": 1.802244663840109e-05, "loss": 0.7707, "step": 4585 }, { "epoch": 2.986337020169161, "grad_norm": 1.0459486246109009, "learning_rate": 1.797289611738523e-05, "loss": 0.7397, "step": 4590 }, { "epoch": 2.989590110605075, "grad_norm": 1.0498055219650269, "learning_rate": 1.7923375551758505e-05, "loss": 0.7691, "step": 4595 }, { "epoch": 2.992843201040989, "grad_norm": 0.9780749082565308, "learning_rate": 1.7873885152618956e-05, "loss": 0.7525, "step": 4600 }, { "epoch": 2.996096291476903, "grad_norm": 1.0338603258132935, "learning_rate": 1.7824425130936023e-05, "loss": 0.7459, "step": 4605 }, { "epoch": 2.9993493819128174, "grad_norm": 0.9098593592643738, "learning_rate": 1.7774995697549645e-05, "loss": 0.7488, "step": 4610 }, { "epoch": 3.0, "eval_f1": 0.8012369099843738, "eval_loss": 0.45166015625, "eval_precision": 0.8020338050069477, "eval_recall": 0.8006626052475169, "eval_runtime": 238.3932, "eval_samples_per_second": 1650.361, "eval_steps_per_second": 1.615, "step": 4611 }, { "epoch": 3.0026024723487312, "grad_norm": 1.3282872438430786, "learning_rate": 1.7725597063169386e-05, "loss": 0.6622, "step": 4615 }, { "epoch": 3.0058555627846455, "grad_norm": 1.3152724504470825, "learning_rate": 1.767622943837349e-05, "loss": 0.6352, "step": 4620 }, { "epoch": 3.0091086532205593, "grad_norm": 1.105705976486206, "learning_rate": 1.7626893033608038e-05, "loss": 0.6291, "step": 4625 }, { "epoch": 3.0123617436564736, "grad_norm": 1.0462555885314941, "learning_rate": 1.7577588059186027e-05, "loss": 0.6476, "step": 4630 }, { "epoch": 3.015614834092388, "grad_norm": 1.0921547412872314, "learning_rate": 1.7528314725286443e-05, "loss": 0.6358, "step": 4635 }, { "epoch": 3.018867924528302, "grad_norm": 1.1877232789993286, "learning_rate": 1.747907324195342e-05, "loss": 0.6434, "step": 4640 }, { "epoch": 3.022121014964216, "grad_norm": 1.1791988611221313, "learning_rate": 1.7429863819095313e-05, "loss": 0.6372, "step": 4645 }, { "epoch": 3.02537410540013, "grad_norm": 1.23057222366333, "learning_rate": 1.738068666648379e-05, "loss": 0.6521, "step": 4650 }, { "epoch": 3.0286271958360445, "grad_norm": 1.0966289043426514, "learning_rate": 1.7331541993752993e-05, "loss": 0.6337, "step": 4655 }, { "epoch": 3.0318802862719583, "grad_norm": 1.108396291732788, "learning_rate": 1.7282430010398577e-05, "loss": 0.6394, "step": 4660 }, { "epoch": 3.0351333767078725, "grad_norm": 1.2432180643081665, "learning_rate": 1.723335092577686e-05, "loss": 0.6319, "step": 4665 }, { "epoch": 3.038386467143787, "grad_norm": 1.5450379848480225, "learning_rate": 1.718430494910391e-05, "loss": 0.632, "step": 4670 }, { "epoch": 3.0416395575797006, "grad_norm": 1.3607127666473389, 
"learning_rate": 1.713529228945466e-05, "loss": 0.6608, "step": 4675 }, { "epoch": 3.044892648015615, "grad_norm": 1.0697190761566162, "learning_rate": 1.7086313155762046e-05, "loss": 0.6263, "step": 4680 }, { "epoch": 3.048145738451529, "grad_norm": 1.3838845491409302, "learning_rate": 1.703736775681604e-05, "loss": 0.6367, "step": 4685 }, { "epoch": 3.051398828887443, "grad_norm": 1.324628233909607, "learning_rate": 1.6988456301262854e-05, "loss": 0.6435, "step": 4690 }, { "epoch": 3.0546519193233572, "grad_norm": 1.2009634971618652, "learning_rate": 1.6939578997603983e-05, "loss": 0.6467, "step": 4695 }, { "epoch": 3.0579050097592715, "grad_norm": 1.2275351285934448, "learning_rate": 1.689073605419533e-05, "loss": 0.6403, "step": 4700 }, { "epoch": 3.0611581001951853, "grad_norm": 1.9216879606246948, "learning_rate": 1.6841927679246345e-05, "loss": 0.6186, "step": 4705 }, { "epoch": 3.0644111906310996, "grad_norm": 2.3563551902770996, "learning_rate": 1.679315408081911e-05, "loss": 0.6202, "step": 4710 }, { "epoch": 3.067664281067014, "grad_norm": 1.435333490371704, "learning_rate": 1.6744415466827463e-05, "loss": 0.6273, "step": 4715 }, { "epoch": 3.0709173715029277, "grad_norm": 1.315987229347229, "learning_rate": 1.6695712045036104e-05, "loss": 0.6318, "step": 4720 }, { "epoch": 3.074170461938842, "grad_norm": 1.5982025861740112, "learning_rate": 1.6647044023059712e-05, "loss": 0.6384, "step": 4725 }, { "epoch": 3.077423552374756, "grad_norm": 1.998374104499817, "learning_rate": 1.659841160836207e-05, "loss": 0.6286, "step": 4730 }, { "epoch": 3.08067664281067, "grad_norm": 1.3811148405075073, "learning_rate": 1.6549815008255176e-05, "loss": 0.6482, "step": 4735 }, { "epoch": 3.0839297332465843, "grad_norm": 1.2464516162872314, "learning_rate": 1.6501254429898343e-05, "loss": 0.6433, "step": 4740 }, { "epoch": 3.0871828236824985, "grad_norm": 1.2944623231887817, "learning_rate": 1.6452730080297342e-05, "loss": 0.6328, "step": 4745 }, { "epoch": 3.0904359141184123, "grad_norm": 1.1027922630310059, "learning_rate": 1.6404242166303507e-05, "loss": 0.6357, "step": 4750 }, { "epoch": 3.0936890045543266, "grad_norm": 3.5568132400512695, "learning_rate": 1.6355790894612834e-05, "loss": 0.6081, "step": 4755 }, { "epoch": 3.096942094990241, "grad_norm": 1.588714838027954, "learning_rate": 1.630737647176514e-05, "loss": 0.6601, "step": 4760 }, { "epoch": 3.1001951854261547, "grad_norm": 1.1922274827957153, "learning_rate": 1.6258999104143157e-05, "loss": 0.6145, "step": 4765 }, { "epoch": 3.103448275862069, "grad_norm": 1.3667454719543457, "learning_rate": 1.621065899797165e-05, "loss": 0.6372, "step": 4770 }, { "epoch": 3.106701366297983, "grad_norm": 1.8918445110321045, "learning_rate": 1.616235635931655e-05, "loss": 0.6152, "step": 4775 }, { "epoch": 3.109954456733897, "grad_norm": 1.293562650680542, "learning_rate": 1.611409139408406e-05, "loss": 0.6211, "step": 4780 }, { "epoch": 3.1132075471698113, "grad_norm": 1.446754813194275, "learning_rate": 1.6065864308019807e-05, "loss": 0.6453, "step": 4785 }, { "epoch": 3.1164606376057256, "grad_norm": 1.1851979494094849, "learning_rate": 1.6017675306707926e-05, "loss": 0.631, "step": 4790 }, { "epoch": 3.1197137280416394, "grad_norm": 1.3031965494155884, "learning_rate": 1.5969524595570216e-05, "loss": 0.6184, "step": 4795 }, { "epoch": 3.1229668184775536, "grad_norm": 2.6355156898498535, "learning_rate": 1.5921412379865257e-05, "loss": 0.6451, "step": 4800 }, { "epoch": 3.126219908913468, "grad_norm": 1.4367573261260986, "learning_rate": 
1.58733388646875e-05, "loss": 0.6466, "step": 4805 }, { "epoch": 3.1294729993493817, "grad_norm": 1.4838011264801025, "learning_rate": 1.5825304254966445e-05, "loss": 0.6181, "step": 4810 }, { "epoch": 3.132726089785296, "grad_norm": 1.2338780164718628, "learning_rate": 1.577730875546575e-05, "loss": 0.6179, "step": 4815 }, { "epoch": 3.1359791802212102, "grad_norm": 1.4179608821868896, "learning_rate": 1.5729352570782324e-05, "loss": 0.6362, "step": 4820 }, { "epoch": 3.139232270657124, "grad_norm": 1.2671458721160889, "learning_rate": 1.5681435905345522e-05, "loss": 0.6365, "step": 4825 }, { "epoch": 3.1424853610930383, "grad_norm": 1.368369221687317, "learning_rate": 1.5643131164122626e-05, "loss": 0.6102, "step": 4830 }, { "epoch": 3.1457384515289526, "grad_norm": 1.341280460357666, "learning_rate": 1.5595286147953364e-05, "loss": 0.637, "step": 4835 }, { "epoch": 3.1489915419648664, "grad_norm": 1.5806121826171875, "learning_rate": 1.5547481222533846e-05, "loss": 0.6296, "step": 4840 }, { "epoch": 3.1522446324007807, "grad_norm": 1.505342721939087, "learning_rate": 1.549971659164861e-05, "loss": 0.6284, "step": 4845 }, { "epoch": 3.155497722836695, "grad_norm": 1.2677946090698242, "learning_rate": 1.5451992458910442e-05, "loss": 0.6134, "step": 4850 }, { "epoch": 3.1587508132726088, "grad_norm": 1.2727744579315186, "learning_rate": 1.540430902775946e-05, "loss": 0.626, "step": 4855 }, { "epoch": 3.162003903708523, "grad_norm": 1.258187174797058, "learning_rate": 1.5356666501462314e-05, "loss": 0.6085, "step": 4860 }, { "epoch": 3.1652569941444373, "grad_norm": 1.589736819267273, "learning_rate": 1.5309065083111255e-05, "loss": 0.6247, "step": 4865 }, { "epoch": 3.168510084580351, "grad_norm": 1.2900131940841675, "learning_rate": 1.5261504975623306e-05, "loss": 0.624, "step": 4870 }, { "epoch": 3.1717631750162654, "grad_norm": 2.3252532482147217, "learning_rate": 1.5213986381739393e-05, "loss": 0.6295, "step": 4875 }, { "epoch": 3.1750162654521796, "grad_norm": 1.3652303218841553, "learning_rate": 1.5166509504023473e-05, "loss": 0.6274, "step": 4880 }, { "epoch": 3.178269355888094, "grad_norm": 1.8075648546218872, "learning_rate": 1.5119074544861678e-05, "loss": 0.6375, "step": 4885 }, { "epoch": 3.1815224463240077, "grad_norm": 1.2221382856369019, "learning_rate": 1.5071681706461438e-05, "loss": 0.6273, "step": 4890 }, { "epoch": 3.184775536759922, "grad_norm": 1.5147900581359863, "learning_rate": 1.5024331190850637e-05, "loss": 0.6381, "step": 4895 }, { "epoch": 3.1880286271958362, "grad_norm": 2.4453020095825195, "learning_rate": 1.4977023199876743e-05, "loss": 0.6552, "step": 4900 }, { "epoch": 3.19128171763175, "grad_norm": 2.3050053119659424, "learning_rate": 1.4929757935205951e-05, "loss": 0.6176, "step": 4905 }, { "epoch": 3.1945348080676643, "grad_norm": 1.289581060409546, "learning_rate": 1.4882535598322311e-05, "loss": 0.6253, "step": 4910 }, { "epoch": 3.1977878985035786, "grad_norm": 1.5076651573181152, "learning_rate": 1.4835356390526888e-05, "loss": 0.6194, "step": 4915 }, { "epoch": 3.2010409889394924, "grad_norm": 1.4202001094818115, "learning_rate": 1.478822051293689e-05, "loss": 0.6081, "step": 4920 }, { "epoch": 3.2042940793754067, "grad_norm": 1.287611961364746, "learning_rate": 1.4741128166484824e-05, "loss": 0.6429, "step": 4925 }, { "epoch": 3.207547169811321, "grad_norm": 1.2236043214797974, "learning_rate": 1.4694079551917629e-05, "loss": 0.6176, "step": 4930 }, { "epoch": 3.2108002602472347, "grad_norm": 1.3410075902938843, "learning_rate": 
1.4656472282003922e-05, "loss": 0.6209, "step": 4935 }, { "epoch": 3.214053350683149, "grad_norm": 1.419541835784912, "learning_rate": 1.4609502890116145e-05, "loss": 0.6436, "step": 4940 }, { "epoch": 3.2173064411190633, "grad_norm": 1.7478810548782349, "learning_rate": 1.4562577791210158e-05, "loss": 0.6023, "step": 4945 }, { "epoch": 3.220559531554977, "grad_norm": 1.8083374500274658, "learning_rate": 1.4515697185319946e-05, "loss": 0.6166, "step": 4950 }, { "epoch": 3.2238126219908914, "grad_norm": 2.203806161880493, "learning_rate": 1.4468861272289818e-05, "loss": 0.636, "step": 4955 }, { "epoch": 3.2270657124268056, "grad_norm": 1.3574259281158447, "learning_rate": 1.4422070251773594e-05, "loss": 0.6012, "step": 4960 }, { "epoch": 3.2303188028627194, "grad_norm": 1.4441782236099243, "learning_rate": 1.4375324323233697e-05, "loss": 0.6197, "step": 4965 }, { "epoch": 3.2335718932986337, "grad_norm": 1.7502111196517944, "learning_rate": 1.4328623685940335e-05, "loss": 0.6354, "step": 4970 }, { "epoch": 3.236824983734548, "grad_norm": 1.5651460886001587, "learning_rate": 1.4281968538970646e-05, "loss": 0.6257, "step": 4975 }, { "epoch": 3.240078074170462, "grad_norm": 1.3271369934082031, "learning_rate": 1.4235359081207871e-05, "loss": 0.6378, "step": 4980 }, { "epoch": 3.243331164606376, "grad_norm": 1.354906678199768, "learning_rate": 1.4188795511340461e-05, "loss": 0.6324, "step": 4985 }, { "epoch": 3.2465842550422903, "grad_norm": 1.295578956604004, "learning_rate": 1.4142278027861253e-05, "loss": 0.6176, "step": 4990 }, { "epoch": 3.249837345478204, "grad_norm": 1.4495329856872559, "learning_rate": 1.4095806829066655e-05, "loss": 0.6387, "step": 4995 }, { "epoch": 3.2530904359141184, "grad_norm": 1.3459370136260986, "learning_rate": 1.404938211305574e-05, "loss": 0.6343, "step": 5000 }, { "epoch": 3.2563435263500327, "grad_norm": 1.299459457397461, "learning_rate": 1.4003004077729438e-05, "loss": 0.6394, "step": 5005 }, { "epoch": 3.2595966167859465, "grad_norm": 1.3181241750717163, "learning_rate": 1.3956672920789705e-05, "loss": 0.6135, "step": 5010 }, { "epoch": 3.2628497072218607, "grad_norm": 1.5811583995819092, "learning_rate": 1.3910388839738647e-05, "loss": 0.6377, "step": 5015 }, { "epoch": 3.266102797657775, "grad_norm": 1.3512473106384277, "learning_rate": 1.386415203187768e-05, "loss": 0.6293, "step": 5020 }, { "epoch": 3.269355888093689, "grad_norm": 1.8290486335754395, "learning_rate": 1.3817962694306747e-05, "loss": 0.635, "step": 5025 }, { "epoch": 3.272608978529603, "grad_norm": 1.5076416730880737, "learning_rate": 1.3771821023923383e-05, "loss": 0.6027, "step": 5030 }, { "epoch": 3.2758620689655173, "grad_norm": 1.5753469467163086, "learning_rate": 1.3725727217421947e-05, "loss": 0.6165, "step": 5035 }, { "epoch": 3.279115159401431, "grad_norm": 1.5028088092803955, "learning_rate": 1.3679681471292776e-05, "loss": 0.621, "step": 5040 }, { "epoch": 3.2823682498373454, "grad_norm": 1.4654455184936523, "learning_rate": 1.363368398182131e-05, "loss": 0.6266, "step": 5045 }, { "epoch": 3.2856213402732597, "grad_norm": 1.7276520729064941, "learning_rate": 1.3587734945087277e-05, "loss": 0.6258, "step": 5050 }, { "epoch": 3.288874430709174, "grad_norm": 1.710095763206482, "learning_rate": 1.3541834556963895e-05, "loss": 0.6388, "step": 5055 }, { "epoch": 3.2921275211450878, "grad_norm": 1.6146140098571777, "learning_rate": 1.3495983013116953e-05, "loss": 0.6466, "step": 5060 }, { "epoch": 3.295380611581002, "grad_norm": 1.3169276714324951, "learning_rate": 
1.3450180509004066e-05, "loss": 0.6389, "step": 5065 }, { "epoch": 3.2986337020169163, "grad_norm": 2.564819574356079, "learning_rate": 1.3404427239873763e-05, "loss": 0.6158, "step": 5070 }, { "epoch": 3.30188679245283, "grad_norm": 1.6384319067001343, "learning_rate": 1.335872340076474e-05, "loss": 0.6241, "step": 5075 }, { "epoch": 3.3051398828887444, "grad_norm": 1.4620628356933594, "learning_rate": 1.3313069186504929e-05, "loss": 0.6203, "step": 5080 }, { "epoch": 3.3083929733246586, "grad_norm": 1.7426296472549438, "learning_rate": 1.3267464791710747e-05, "loss": 0.6238, "step": 5085 }, { "epoch": 3.3116460637605725, "grad_norm": 2.093579053878784, "learning_rate": 1.3221910410786248e-05, "loss": 0.6144, "step": 5090 }, { "epoch": 3.3148991541964867, "grad_norm": 1.4141899347305298, "learning_rate": 1.3176406237922262e-05, "loss": 0.6145, "step": 5095 }, { "epoch": 3.318152244632401, "grad_norm": 1.2416197061538696, "learning_rate": 1.3130952467095593e-05, "loss": 0.6134, "step": 5100 }, { "epoch": 3.321405335068315, "grad_norm": 1.6651731729507446, "learning_rate": 1.3085549292068213e-05, "loss": 0.6366, "step": 5105 }, { "epoch": 3.324658425504229, "grad_norm": 1.4123419523239136, "learning_rate": 1.3040196906386392e-05, "loss": 0.6363, "step": 5110 }, { "epoch": 3.3279115159401433, "grad_norm": 1.5788094997406006, "learning_rate": 1.2994895503379886e-05, "loss": 0.6463, "step": 5115 }, { "epoch": 3.331164606376057, "grad_norm": 1.9464671611785889, "learning_rate": 1.2949645276161149e-05, "loss": 0.6193, "step": 5120 }, { "epoch": 3.3344176968119714, "grad_norm": 1.3868358135223389, "learning_rate": 1.2904446417624457e-05, "loss": 0.6182, "step": 5125 }, { "epoch": 3.3376707872478857, "grad_norm": 7.827129364013672, "learning_rate": 1.2859299120445107e-05, "loss": 0.615, "step": 5130 }, { "epoch": 3.3409238776837995, "grad_norm": 1.3248870372772217, "learning_rate": 1.2814203577078626e-05, "loss": 0.6286, "step": 5135 }, { "epoch": 3.3441769681197138, "grad_norm": 1.3587925434112549, "learning_rate": 1.2769159979759899e-05, "loss": 0.6285, "step": 5140 }, { "epoch": 3.347430058555628, "grad_norm": 1.518294095993042, "learning_rate": 1.2724168520502371e-05, "loss": 0.6304, "step": 5145 }, { "epoch": 3.350683148991542, "grad_norm": 1.2859338521957397, "learning_rate": 1.2679229391097241e-05, "loss": 0.6299, "step": 5150 }, { "epoch": 3.353936239427456, "grad_norm": 1.3024553060531616, "learning_rate": 1.2634342783112646e-05, "loss": 0.6177, "step": 5155 }, { "epoch": 3.3571893298633704, "grad_norm": 3.6768040657043457, "learning_rate": 1.258950888789281e-05, "loss": 0.6385, "step": 5160 }, { "epoch": 3.360442420299284, "grad_norm": 1.476014256477356, "learning_rate": 1.2544727896557257e-05, "loss": 0.6313, "step": 5165 }, { "epoch": 3.3636955107351985, "grad_norm": 2.193185806274414, "learning_rate": 1.2500000000000006e-05, "loss": 0.6386, "step": 5170 }, { "epoch": 3.3669486011711127, "grad_norm": 1.4634368419647217, "learning_rate": 1.2455325388888726e-05, "loss": 0.617, "step": 5175 }, { "epoch": 3.3702016916070265, "grad_norm": 1.770553708076477, "learning_rate": 1.2410704253663932e-05, "loss": 0.637, "step": 5180 }, { "epoch": 3.373454782042941, "grad_norm": 1.7664306163787842, "learning_rate": 1.236613678453821e-05, "loss": 0.6203, "step": 5185 }, { "epoch": 3.376707872478855, "grad_norm": 1.4499051570892334, "learning_rate": 1.232162317149535e-05, "loss": 0.6417, "step": 5190 }, { "epoch": 3.379960962914769, "grad_norm": 2.710038661956787, "learning_rate": 
1.2277163604289558e-05, "loss": 0.6246, "step": 5195 }, { "epoch": 3.383214053350683, "grad_norm": 1.9992517232894897, "learning_rate": 1.2232758272444672e-05, "loss": 0.6188, "step": 5200 }, { "epoch": 3.3864671437865974, "grad_norm": 1.1757420301437378, "learning_rate": 1.2188407365253337e-05, "loss": 0.6232, "step": 5205 }, { "epoch": 3.3897202342225112, "grad_norm": 1.3049498796463013, "learning_rate": 1.2144111071776174e-05, "loss": 0.6314, "step": 5210 }, { "epoch": 3.3929733246584255, "grad_norm": 1.2970354557037354, "learning_rate": 1.209986958084099e-05, "loss": 0.6361, "step": 5215 }, { "epoch": 3.3962264150943398, "grad_norm": 1.4407247304916382, "learning_rate": 1.205568308104201e-05, "loss": 0.6246, "step": 5220 }, { "epoch": 3.3994795055302536, "grad_norm": 1.673065185546875, "learning_rate": 1.2011551760739014e-05, "loss": 0.6318, "step": 5225 }, { "epoch": 3.402732595966168, "grad_norm": 1.4697465896606445, "learning_rate": 1.196747580805656e-05, "loss": 0.6417, "step": 5230 }, { "epoch": 3.405985686402082, "grad_norm": 1.6552962064743042, "learning_rate": 1.1923455410883212e-05, "loss": 0.6343, "step": 5235 }, { "epoch": 3.409238776837996, "grad_norm": 1.5813676118850708, "learning_rate": 1.1879490756870674e-05, "loss": 0.6352, "step": 5240 }, { "epoch": 3.41249186727391, "grad_norm": 3.213158130645752, "learning_rate": 1.1835582033433037e-05, "loss": 0.6352, "step": 5245 }, { "epoch": 3.4157449577098244, "grad_norm": 1.2842360734939575, "learning_rate": 1.1791729427745992e-05, "loss": 0.6416, "step": 5250 }, { "epoch": 3.4189980481457383, "grad_norm": 1.6811124086380005, "learning_rate": 1.1747933126745983e-05, "loss": 0.651, "step": 5255 }, { "epoch": 3.4222511385816525, "grad_norm": 1.2236487865447998, "learning_rate": 1.170419331712943e-05, "loss": 0.641, "step": 5260 }, { "epoch": 3.425504229017567, "grad_norm": 1.3968175649642944, "learning_rate": 1.1660510185351978e-05, "loss": 0.6271, "step": 5265 }, { "epoch": 3.4287573194534806, "grad_norm": 2.152369976043701, "learning_rate": 1.161688391762763e-05, "loss": 0.633, "step": 5270 }, { "epoch": 3.432010409889395, "grad_norm": 1.5563530921936035, "learning_rate": 1.1573314699927985e-05, "loss": 0.6429, "step": 5275 }, { "epoch": 3.435263500325309, "grad_norm": 1.4173344373703003, "learning_rate": 1.1529802717981475e-05, "loss": 0.6344, "step": 5280 }, { "epoch": 3.438516590761223, "grad_norm": 1.8149155378341675, "learning_rate": 1.1486348157272526e-05, "loss": 0.6278, "step": 5285 }, { "epoch": 3.441769681197137, "grad_norm": 1.4700722694396973, "learning_rate": 1.1442951203040775e-05, "loss": 0.607, "step": 5290 }, { "epoch": 3.4450227716330515, "grad_norm": 1.4950767755508423, "learning_rate": 1.139961204028033e-05, "loss": 0.6298, "step": 5295 }, { "epoch": 3.4482758620689653, "grad_norm": 1.702974796295166, "learning_rate": 1.1356330853738906e-05, "loss": 0.6599, "step": 5300 }, { "epoch": 3.4515289525048796, "grad_norm": 1.7694127559661865, "learning_rate": 1.1313107827917083e-05, "loss": 0.6235, "step": 5305 }, { "epoch": 3.454782042940794, "grad_norm": 1.2292397022247314, "learning_rate": 1.1269943147067535e-05, "loss": 0.6264, "step": 5310 }, { "epoch": 3.4580351333767076, "grad_norm": 1.3355427980422974, "learning_rate": 1.1226836995194196e-05, "loss": 0.6274, "step": 5315 }, { "epoch": 3.461288223812622, "grad_norm": 1.313506841659546, "learning_rate": 1.1183789556051508e-05, "loss": 0.6075, "step": 5320 }, { "epoch": 3.464541314248536, "grad_norm": 1.3950237035751343, "learning_rate": 
1.1140801013143618e-05, "loss": 0.606, "step": 5325 }, { "epoch": 3.46779440468445, "grad_norm": 1.4222460985183716, "learning_rate": 1.1097871549723629e-05, "loss": 0.6238, "step": 5330 }, { "epoch": 3.4710474951203643, "grad_norm": 1.701815128326416, "learning_rate": 1.1055001348792807e-05, "loss": 0.6227, "step": 5335 }, { "epoch": 3.4743005855562785, "grad_norm": 1.5569487810134888, "learning_rate": 1.1012190593099744e-05, "loss": 0.643, "step": 5340 }, { "epoch": 3.4775536759921923, "grad_norm": 1.3712338209152222, "learning_rate": 1.0969439465139687e-05, "loss": 0.6167, "step": 5345 }, { "epoch": 3.4808067664281066, "grad_norm": 1.3950178623199463, "learning_rate": 1.0926748147153648e-05, "loss": 0.6318, "step": 5350 }, { "epoch": 3.484059856864021, "grad_norm": 1.347066044807434, "learning_rate": 1.088411682112771e-05, "loss": 0.6225, "step": 5355 }, { "epoch": 3.487312947299935, "grad_norm": 1.347697138786316, "learning_rate": 1.08415456687922e-05, "loss": 0.6225, "step": 5360 }, { "epoch": 3.490566037735849, "grad_norm": 1.5315964221954346, "learning_rate": 1.0799034871620958e-05, "loss": 0.6067, "step": 5365 }, { "epoch": 3.493819128171763, "grad_norm": 1.3384947776794434, "learning_rate": 1.0756584610830523e-05, "loss": 0.6235, "step": 5370 }, { "epoch": 3.4970722186076775, "grad_norm": 1.3656494617462158, "learning_rate": 1.071419506737937e-05, "loss": 0.6347, "step": 5375 }, { "epoch": 3.5003253090435913, "grad_norm": 1.3071860074996948, "learning_rate": 1.0671866421967175e-05, "loss": 0.6108, "step": 5380 }, { "epoch": 3.5035783994795056, "grad_norm": 1.3579492568969727, "learning_rate": 1.062959885503399e-05, "loss": 0.6354, "step": 5385 }, { "epoch": 3.5068314899154194, "grad_norm": 1.52472722530365, "learning_rate": 1.0587392546759498e-05, "loss": 0.6177, "step": 5390 }, { "epoch": 3.5100845803513336, "grad_norm": 1.7216352224349976, "learning_rate": 1.0545247677062273e-05, "loss": 0.6225, "step": 5395 }, { "epoch": 3.513337670787248, "grad_norm": 1.3169187307357788, "learning_rate": 1.050316442559896e-05, "loss": 0.6196, "step": 5400 }, { "epoch": 3.516590761223162, "grad_norm": 1.7447690963745117, "learning_rate": 1.0461142971763535e-05, "loss": 0.6338, "step": 5405 }, { "epoch": 3.519843851659076, "grad_norm": 1.4032801389694214, "learning_rate": 1.0419183494686574e-05, "loss": 0.6261, "step": 5410 }, { "epoch": 3.5230969420949902, "grad_norm": 1.6217771768569946, "learning_rate": 1.0377286173234416e-05, "loss": 0.6306, "step": 5415 }, { "epoch": 3.5263500325309045, "grad_norm": 1.2982110977172852, "learning_rate": 1.0335451186008454e-05, "loss": 0.6242, "step": 5420 }, { "epoch": 3.5296031229668183, "grad_norm": 1.2958654165267944, "learning_rate": 1.0293678711344382e-05, "loss": 0.6292, "step": 5425 }, { "epoch": 3.5328562134027326, "grad_norm": 1.7522900104522705, "learning_rate": 1.0251968927311384e-05, "loss": 0.6541, "step": 5430 }, { "epoch": 3.536109303838647, "grad_norm": 1.435259222984314, "learning_rate": 1.0210322011711408e-05, "loss": 0.6064, "step": 5435 }, { "epoch": 3.5393623942745607, "grad_norm": 1.3290374279022217, "learning_rate": 1.0168738142078429e-05, "loss": 0.6255, "step": 5440 }, { "epoch": 3.542615484710475, "grad_norm": 1.3328436613082886, "learning_rate": 1.012721749567764e-05, "loss": 0.6006, "step": 5445 }, { "epoch": 3.545868575146389, "grad_norm": 1.3372770547866821, "learning_rate": 1.0085760249504728e-05, "loss": 0.6194, "step": 5450 }, { "epoch": 3.5491216655823035, "grad_norm": 1.7760313749313354, "learning_rate": 
1.0044366580285137e-05, "loss": 0.6067, "step": 5455 }, { "epoch": 3.5523747560182173, "grad_norm": 1.7420598268508911, "learning_rate": 1.0003036664473267e-05, "loss": 0.6071, "step": 5460 }, { "epoch": 3.5556278464541315, "grad_norm": 1.498193621635437, "learning_rate": 9.96177067825175e-06, "loss": 0.6146, "step": 5465 }, { "epoch": 3.558880936890046, "grad_norm": 1.8063032627105713, "learning_rate": 9.920568797530716e-06, "loss": 0.626, "step": 5470 }, { "epoch": 3.5621340273259596, "grad_norm": 1.2613329887390137, "learning_rate": 9.879431197947014e-06, "loss": 0.6049, "step": 5475 }, { "epoch": 3.565387117761874, "grad_norm": 1.34530770778656, "learning_rate": 9.83835805486347e-06, "loss": 0.6197, "step": 5480 }, { "epoch": 3.568640208197788, "grad_norm": 1.9523491859436035, "learning_rate": 9.797349543368128e-06, "loss": 0.6342, "step": 5485 }, { "epoch": 3.571893298633702, "grad_norm": 1.8784916400909424, "learning_rate": 9.756405838273558e-06, "loss": 0.64, "step": 5490 }, { "epoch": 3.5751463890696162, "grad_norm": 1.5533080101013184, "learning_rate": 9.715527114116035e-06, "loss": 0.6243, "step": 5495 }, { "epoch": 3.5783994795055305, "grad_norm": 1.385695219039917, "learning_rate": 9.674713545154831e-06, "loss": 0.6264, "step": 5500 }, { "epoch": 3.5816525699414443, "grad_norm": 1.3538482189178467, "learning_rate": 9.633965305371506e-06, "loss": 0.621, "step": 5505 }, { "epoch": 3.5849056603773586, "grad_norm": 1.6445493698120117, "learning_rate": 9.5932825684691e-06, "loss": 0.6239, "step": 5510 }, { "epoch": 3.588158750813273, "grad_norm": 1.803451657295227, "learning_rate": 9.552665507871428e-06, "loss": 0.6311, "step": 5515 }, { "epoch": 3.5914118412491867, "grad_norm": 1.3346718549728394, "learning_rate": 9.51211429672236e-06, "loss": 0.6396, "step": 5520 }, { "epoch": 3.594664931685101, "grad_norm": 2.1071603298187256, "learning_rate": 9.471629107885038e-06, "loss": 0.6238, "step": 5525 }, { "epoch": 3.597918022121015, "grad_norm": 1.4250411987304688, "learning_rate": 9.431210113941169e-06, "loss": 0.6063, "step": 5530 }, { "epoch": 3.601171112556929, "grad_norm": 1.3815439939498901, "learning_rate": 9.390857487190274e-06, "loss": 0.5978, "step": 5535 }, { "epoch": 3.6044242029928433, "grad_norm": 1.6549842357635498, "learning_rate": 9.350571399648988e-06, "loss": 0.6094, "step": 5540 }, { "epoch": 3.6076772934287575, "grad_norm": 1.4034509658813477, "learning_rate": 9.310352023050272e-06, "loss": 0.6187, "step": 5545 }, { "epoch": 3.6109303838646714, "grad_norm": 1.6350473165512085, "learning_rate": 9.270199528842715e-06, "loss": 0.6076, "step": 5550 }, { "epoch": 3.6141834743005856, "grad_norm": 1.4474992752075195, "learning_rate": 9.230114088189814e-06, "loss": 0.6507, "step": 5555 }, { "epoch": 3.6174365647365, "grad_norm": 1.4828194379806519, "learning_rate": 9.19009587196921e-06, "loss": 0.6264, "step": 5560 }, { "epoch": 3.6206896551724137, "grad_norm": 1.7121607065200806, "learning_rate": 9.150145050771972e-06, "loss": 0.6383, "step": 5565 }, { "epoch": 3.623942745608328, "grad_norm": 1.8459277153015137, "learning_rate": 9.110261794901903e-06, "loss": 0.6436, "step": 5570 }, { "epoch": 3.6271958360442422, "grad_norm": 1.4332444667816162, "learning_rate": 9.070446274374766e-06, "loss": 0.6313, "step": 5575 }, { "epoch": 3.630448926480156, "grad_norm": 1.2665612697601318, "learning_rate": 9.030698658917566e-06, "loss": 0.6003, "step": 5580 }, { "epoch": 3.6337020169160703, "grad_norm": 1.5076160430908203, "learning_rate": 8.99101911796788e-06, "loss": 0.6203, 
"step": 5585 }, { "epoch": 3.6369551073519846, "grad_norm": 1.567221999168396, "learning_rate": 8.951407820673058e-06, "loss": 0.6252, "step": 5590 }, { "epoch": 3.6402081977878984, "grad_norm": 1.504109263420105, "learning_rate": 8.911864935889544e-06, "loss": 0.6332, "step": 5595 }, { "epoch": 3.6434612882238127, "grad_norm": 1.6598913669586182, "learning_rate": 8.872390632182175e-06, "loss": 0.6258, "step": 5600 }, { "epoch": 3.646714378659727, "grad_norm": 1.3711302280426025, "learning_rate": 8.832985077823406e-06, "loss": 0.6273, "step": 5605 }, { "epoch": 3.6499674690956407, "grad_norm": 1.293453574180603, "learning_rate": 8.793648440792654e-06, "loss": 0.6041, "step": 5610 }, { "epoch": 3.653220559531555, "grad_norm": 1.6621414422988892, "learning_rate": 8.754380888775523e-06, "loss": 0.6177, "step": 5615 }, { "epoch": 3.6564736499674693, "grad_norm": 1.2931593656539917, "learning_rate": 8.715182589163153e-06, "loss": 0.6084, "step": 5620 }, { "epoch": 3.659726740403383, "grad_norm": 1.4701381921768188, "learning_rate": 8.676053709051446e-06, "loss": 0.6235, "step": 5625 }, { "epoch": 3.6629798308392973, "grad_norm": 2.272709369659424, "learning_rate": 8.636994415240376e-06, "loss": 0.6326, "step": 5630 }, { "epoch": 3.6662329212752116, "grad_norm": 1.3057537078857422, "learning_rate": 8.598004874233315e-06, "loss": 0.616, "step": 5635 }, { "epoch": 3.6694860117111254, "grad_norm": 1.6016069650650024, "learning_rate": 8.559085252236259e-06, "loss": 0.6126, "step": 5640 }, { "epoch": 3.6727391021470397, "grad_norm": 1.38706636428833, "learning_rate": 8.520235715157152e-06, "loss": 0.6424, "step": 5645 }, { "epoch": 3.675992192582954, "grad_norm": 1.403805136680603, "learning_rate": 8.481456428605205e-06, "loss": 0.6328, "step": 5650 }, { "epoch": 3.6792452830188678, "grad_norm": 2.8022546768188477, "learning_rate": 8.442747557890138e-06, "loss": 0.6225, "step": 5655 }, { "epoch": 3.682498373454782, "grad_norm": 1.2923667430877686, "learning_rate": 8.404109268021493e-06, "loss": 0.6068, "step": 5660 }, { "epoch": 3.6857514638906963, "grad_norm": 1.327010154724121, "learning_rate": 8.365541723707971e-06, "loss": 0.6032, "step": 5665 }, { "epoch": 3.68900455432661, "grad_norm": 3.022547960281372, "learning_rate": 8.327045089356663e-06, "loss": 0.6202, "step": 5670 }, { "epoch": 3.6922576447625244, "grad_norm": 1.7190786600112915, "learning_rate": 8.288619529072394e-06, "loss": 0.6136, "step": 5675 }, { "epoch": 3.6955107351984386, "grad_norm": 1.8883839845657349, "learning_rate": 8.250265206657025e-06, "loss": 0.626, "step": 5680 }, { "epoch": 3.6987638256343525, "grad_norm": 1.216133952140808, "learning_rate": 8.211982285608721e-06, "loss": 0.6084, "step": 5685 }, { "epoch": 3.7020169160702667, "grad_norm": 1.4318759441375732, "learning_rate": 8.17377092912128e-06, "loss": 0.6252, "step": 5690 }, { "epoch": 3.705270006506181, "grad_norm": 1.3429824113845825, "learning_rate": 8.135631300083448e-06, "loss": 0.6421, "step": 5695 }, { "epoch": 3.708523096942095, "grad_norm": 1.563573956489563, "learning_rate": 8.097563561078193e-06, "loss": 0.6426, "step": 5700 }, { "epoch": 3.711776187378009, "grad_norm": 1.3186182975769043, "learning_rate": 8.059567874382023e-06, "loss": 0.6148, "step": 5705 }, { "epoch": 3.7150292778139233, "grad_norm": 1.4381370544433594, "learning_rate": 8.021644401964305e-06, "loss": 0.6206, "step": 5710 }, { "epoch": 3.718282368249837, "grad_norm": 1.6375632286071777, "learning_rate": 7.983793305486583e-06, "loss": 0.6169, "step": 5715 }, { "epoch": 
3.7215354586857514, "grad_norm": 1.426100730895996, "learning_rate": 7.946014746301858e-06, "loss": 0.6299, "step": 5720 }, { "epoch": 3.7247885491216657, "grad_norm": 1.6016979217529297, "learning_rate": 7.908308885453908e-06, "loss": 0.6039, "step": 5725 }, { "epoch": 3.7280416395575795, "grad_norm": 1.8250033855438232, "learning_rate": 7.87067588367664e-06, "loss": 0.6375, "step": 5730 }, { "epoch": 3.7312947299934938, "grad_norm": 1.6048786640167236, "learning_rate": 7.833115901393347e-06, "loss": 0.6469, "step": 5735 }, { "epoch": 3.734547820429408, "grad_norm": 1.473156213760376, "learning_rate": 7.795629098716045e-06, "loss": 0.6291, "step": 5740 }, { "epoch": 3.737800910865322, "grad_norm": 1.4616464376449585, "learning_rate": 7.758215635444848e-06, "loss": 0.6418, "step": 5745 }, { "epoch": 3.741054001301236, "grad_norm": 1.3316526412963867, "learning_rate": 7.720875671067188e-06, "loss": 0.6052, "step": 5750 }, { "epoch": 3.7443070917371504, "grad_norm": 2.7276248931884766, "learning_rate": 7.683609364757192e-06, "loss": 0.6311, "step": 5755 }, { "epoch": 3.747560182173064, "grad_norm": 1.4057763814926147, "learning_rate": 7.646416875374992e-06, "loss": 0.6262, "step": 5760 }, { "epoch": 3.7508132726089785, "grad_norm": 1.7808401584625244, "learning_rate": 7.609298361466083e-06, "loss": 0.6372, "step": 5765 }, { "epoch": 3.7540663630448927, "grad_norm": 1.5597418546676636, "learning_rate": 7.572253981260571e-06, "loss": 0.6181, "step": 5770 }, { "epoch": 3.7573194534808065, "grad_norm": 1.6378741264343262, "learning_rate": 7.535283892672562e-06, "loss": 0.6247, "step": 5775 }, { "epoch": 3.760572543916721, "grad_norm": 2.498858690261841, "learning_rate": 7.498388253299482e-06, "loss": 0.643, "step": 5780 }, { "epoch": 3.763825634352635, "grad_norm": 1.9484217166900635, "learning_rate": 7.46156722042137e-06, "loss": 0.6223, "step": 5785 }, { "epoch": 3.767078724788549, "grad_norm": 1.3782168626785278, "learning_rate": 7.424820951000233e-06, "loss": 0.6148, "step": 5790 }, { "epoch": 3.770331815224463, "grad_norm": 1.3748527765274048, "learning_rate": 7.388149601679392e-06, "loss": 0.6242, "step": 5795 }, { "epoch": 3.7735849056603774, "grad_norm": 1.4963568449020386, "learning_rate": 7.351553328782779e-06, "loss": 0.6014, "step": 5800 }, { "epoch": 3.7768379960962912, "grad_norm": 1.708061695098877, "learning_rate": 7.31503228831428e-06, "loss": 0.6154, "step": 5805 }, { "epoch": 3.7800910865322055, "grad_norm": 1.8436424732208252, "learning_rate": 7.278586635957107e-06, "loss": 0.6263, "step": 5810 }, { "epoch": 3.7833441769681198, "grad_norm": 1.9801384210586548, "learning_rate": 7.242216527073079e-06, "loss": 0.5955, "step": 5815 }, { "epoch": 3.7865972674040336, "grad_norm": 1.4177374839782715, "learning_rate": 7.205922116701985e-06, "loss": 0.6255, "step": 5820 }, { "epoch": 3.789850357839948, "grad_norm": 1.4929031133651733, "learning_rate": 7.169703559560953e-06, "loss": 0.6046, "step": 5825 }, { "epoch": 3.793103448275862, "grad_norm": 2.4425814151763916, "learning_rate": 7.133561010043724e-06, "loss": 0.6072, "step": 5830 }, { "epoch": 3.796356538711776, "grad_norm": 1.5860954523086548, "learning_rate": 7.097494622220049e-06, "loss": 0.6173, "step": 5835 }, { "epoch": 3.79960962914769, "grad_norm": 1.4166280031204224, "learning_rate": 7.0615045498350215e-06, "loss": 0.5985, "step": 5840 }, { "epoch": 3.8028627195836044, "grad_norm": 1.7926712036132812, "learning_rate": 7.025590946308402e-06, "loss": 0.6077, "step": 5845 }, { "epoch": 3.8061158100195187, "grad_norm": 
1.411357045173645, "learning_rate": 6.9897539647339725e-06, "loss": 0.6126, "step": 5850 }, { "epoch": 3.8093689004554325, "grad_norm": 1.4378728866577148, "learning_rate": 6.95399375787891e-06, "loss": 0.6217, "step": 5855 }, { "epoch": 3.812621990891347, "grad_norm": 1.630339503288269, "learning_rate": 6.918310478183093e-06, "loss": 0.6081, "step": 5860 }, { "epoch": 3.815875081327261, "grad_norm": 1.4536669254302979, "learning_rate": 6.882704277758475e-06, "loss": 0.631, "step": 5865 }, { "epoch": 3.819128171763175, "grad_norm": 1.369432806968689, "learning_rate": 6.847175308388451e-06, "loss": 0.6023, "step": 5870 }, { "epoch": 3.822381262199089, "grad_norm": 1.8251979351043701, "learning_rate": 6.811723721527161e-06, "loss": 0.6088, "step": 5875 }, { "epoch": 3.8256343526350034, "grad_norm": 1.4121100902557373, "learning_rate": 6.776349668298912e-06, "loss": 0.6393, "step": 5880 }, { "epoch": 3.828887443070917, "grad_norm": 1.4803780317306519, "learning_rate": 6.741053299497468e-06, "loss": 0.601, "step": 5885 }, { "epoch": 3.8321405335068315, "grad_norm": 1.5110501050949097, "learning_rate": 6.705834765585459e-06, "loss": 0.6299, "step": 5890 }, { "epoch": 3.8353936239427457, "grad_norm": 1.8608803749084473, "learning_rate": 6.670694216693701e-06, "loss": 0.6394, "step": 5895 }, { "epoch": 3.8386467143786596, "grad_norm": 1.4101976156234741, "learning_rate": 6.635631802620576e-06, "loss": 0.6149, "step": 5900 }, { "epoch": 3.841899804814574, "grad_norm": 1.5235905647277832, "learning_rate": 6.600647672831406e-06, "loss": 0.6377, "step": 5905 }, { "epoch": 3.845152895250488, "grad_norm": 2.4760963916778564, "learning_rate": 6.565741976457782e-06, "loss": 0.6315, "step": 5910 }, { "epoch": 3.8484059856864024, "grad_norm": 1.4764820337295532, "learning_rate": 6.530914862296947e-06, "loss": 0.6148, "step": 5915 }, { "epoch": 3.851659076122316, "grad_norm": 1.408517599105835, "learning_rate": 6.496166478811164e-06, "loss": 0.629, "step": 5920 }, { "epoch": 3.8549121665582304, "grad_norm": 2.276674509048462, "learning_rate": 6.461496974127093e-06, "loss": 0.613, "step": 5925 }, { "epoch": 3.8581652569941447, "grad_norm": 1.5643647909164429, "learning_rate": 6.426906496035129e-06, "loss": 0.6063, "step": 5930 }, { "epoch": 3.8614183474300585, "grad_norm": 1.3531688451766968, "learning_rate": 6.39239519198879e-06, "loss": 0.6135, "step": 5935 }, { "epoch": 3.864671437865973, "grad_norm": 1.4261928796768188, "learning_rate": 6.357963209104106e-06, "loss": 0.6206, "step": 5940 }, { "epoch": 3.867924528301887, "grad_norm": 1.3013157844543457, "learning_rate": 6.32361069415896e-06, "loss": 0.6153, "step": 5945 }, { "epoch": 3.871177618737801, "grad_norm": 1.520578145980835, "learning_rate": 6.289337793592468e-06, "loss": 0.629, "step": 5950 }, { "epoch": 3.874430709173715, "grad_norm": 1.5987921953201294, "learning_rate": 6.255144653504382e-06, "loss": 0.645, "step": 5955 }, { "epoch": 3.8776837996096294, "grad_norm": 2.1227879524230957, "learning_rate": 6.221031419654444e-06, "loss": 0.6333, "step": 5960 }, { "epoch": 3.880936890045543, "grad_norm": 1.5177706480026245, "learning_rate": 6.1869982374617495e-06, "loss": 0.629, "step": 5965 }, { "epoch": 3.8841899804814575, "grad_norm": 1.3354036808013916, "learning_rate": 6.153045252004177e-06, "loss": 0.6055, "step": 5970 }, { "epoch": 3.8874430709173717, "grad_norm": 1.8337645530700684, "learning_rate": 6.119172608017718e-06, "loss": 0.623, "step": 5975 }, { "epoch": 3.8906961613532856, "grad_norm": 1.2876662015914917, "learning_rate": 
6.08538044989588e-06, "loss": 0.6064, "step": 5980 }, { "epoch": 3.8939492517892, "grad_norm": 1.3676327466964722, "learning_rate": 6.051668921689094e-06, "loss": 0.6219, "step": 5985 }, { "epoch": 3.897202342225114, "grad_norm": 1.5804736614227295, "learning_rate": 6.0180381671040596e-06, "loss": 0.6135, "step": 5990 }, { "epoch": 3.900455432661028, "grad_norm": 2.2858810424804688, "learning_rate": 5.9844883295031515e-06, "loss": 0.6393, "step": 5995 }, { "epoch": 3.903708523096942, "grad_norm": 1.8066788911819458, "learning_rate": 5.9510195519038245e-06, "loss": 0.6056, "step": 6000 }, { "epoch": 3.9069616135328564, "grad_norm": 1.3947362899780273, "learning_rate": 5.917631976977975e-06, "loss": 0.6138, "step": 6005 }, { "epoch": 3.9102147039687702, "grad_norm": 1.551949381828308, "learning_rate": 5.884325747051336e-06, "loss": 0.614, "step": 6010 }, { "epoch": 3.9134677944046845, "grad_norm": 1.3901867866516113, "learning_rate": 5.851101004102907e-06, "loss": 0.6375, "step": 6015 }, { "epoch": 3.9167208848405988, "grad_norm": 1.4056464433670044, "learning_rate": 5.817957889764308e-06, "loss": 0.6141, "step": 6020 }, { "epoch": 3.9199739752765126, "grad_norm": 1.499922752380371, "learning_rate": 5.784896545319187e-06, "loss": 0.6074, "step": 6025 }, { "epoch": 3.923227065712427, "grad_norm": 1.2578163146972656, "learning_rate": 5.751917111702612e-06, "loss": 0.6143, "step": 6030 }, { "epoch": 3.926480156148341, "grad_norm": 1.2877789735794067, "learning_rate": 5.719019729500508e-06, "loss": 0.5956, "step": 6035 }, { "epoch": 3.929733246584255, "grad_norm": 1.576788067817688, "learning_rate": 5.686204538948997e-06, "loss": 0.6141, "step": 6040 }, { "epoch": 3.932986337020169, "grad_norm": 1.8292930126190186, "learning_rate": 5.653471679933839e-06, "loss": 0.5909, "step": 6045 }, { "epoch": 3.9362394274560835, "grad_norm": 1.5432319641113281, "learning_rate": 5.62082129198985e-06, "loss": 0.6199, "step": 6050 }, { "epoch": 3.9394925178919973, "grad_norm": 1.739689826965332, "learning_rate": 5.58825351430026e-06, "loss": 0.6035, "step": 6055 }, { "epoch": 3.9427456083279115, "grad_norm": 1.3205852508544922, "learning_rate": 5.555768485696144e-06, "loss": 0.6169, "step": 6060 }, { "epoch": 3.945998698763826, "grad_norm": 1.6433742046356201, "learning_rate": 5.523366344655856e-06, "loss": 0.6404, "step": 6065 }, { "epoch": 3.9492517891997396, "grad_norm": 1.6137924194335938, "learning_rate": 5.491047229304397e-06, "loss": 0.6219, "step": 6070 }, { "epoch": 3.952504879635654, "grad_norm": 1.5387951135635376, "learning_rate": 5.4588112774128314e-06, "loss": 0.5937, "step": 6075 }, { "epoch": 3.955757970071568, "grad_norm": 1.4663158655166626, "learning_rate": 5.42665862639774e-06, "loss": 0.6066, "step": 6080 }, { "epoch": 3.959011060507482, "grad_norm": 4.082248210906982, "learning_rate": 5.394589413320589e-06, "loss": 0.6311, "step": 6085 }, { "epoch": 3.9622641509433962, "grad_norm": 1.4563738107681274, "learning_rate": 5.3626037748871565e-06, "loss": 0.6142, "step": 6090 }, { "epoch": 3.9655172413793105, "grad_norm": 1.569101095199585, "learning_rate": 5.330701847446962e-06, "loss": 0.6014, "step": 6095 }, { "epoch": 3.9687703318152243, "grad_norm": 1.567270278930664, "learning_rate": 5.29888376699269e-06, "loss": 0.6155, "step": 6100 }, { "epoch": 3.9720234222511386, "grad_norm": 1.668445110321045, "learning_rate": 5.267149669159588e-06, "loss": 0.6171, "step": 6105 }, { "epoch": 3.975276512687053, "grad_norm": 1.7854609489440918, "learning_rate": 5.235499689224885e-06, "loss": 0.6135, 
"step": 6110 }, { "epoch": 3.9785296031229667, "grad_norm": 1.8517600297927856, "learning_rate": 5.203933962107266e-06, "loss": 0.6207, "step": 6115 }, { "epoch": 3.981782693558881, "grad_norm": 1.5116204023361206, "learning_rate": 5.172452622366228e-06, "loss": 0.614, "step": 6120 }, { "epoch": 3.985035783994795, "grad_norm": 1.4917980432510376, "learning_rate": 5.141055804201541e-06, "loss": 0.6118, "step": 6125 }, { "epoch": 3.988288874430709, "grad_norm": 1.527981162071228, "learning_rate": 5.109743641452699e-06, "loss": 0.6083, "step": 6130 }, { "epoch": 3.9915419648666233, "grad_norm": 1.3188831806182861, "learning_rate": 5.078516267598299e-06, "loss": 0.6141, "step": 6135 }, { "epoch": 3.9947950553025375, "grad_norm": 1.4134242534637451, "learning_rate": 5.047373815755496e-06, "loss": 0.6234, "step": 6140 }, { "epoch": 3.9980481457384514, "grad_norm": 1.5778809785842896, "learning_rate": 5.016316418679454e-06, "loss": 0.6177, "step": 6145 }, { "epoch": 4.0, "eval_f1": 0.7989837428748611, "eval_loss": 0.491455078125, "eval_precision": 0.7989192926261178, "eval_recall": 0.7990541428374994, "eval_runtime": 238.1189, "eval_samples_per_second": 1652.263, "eval_steps_per_second": 1.617, "step": 6148 }, { "epoch": 4.001301236174366, "grad_norm": 1.376760721206665, "learning_rate": 4.985344208762757e-06, "loss": 0.5954, "step": 6150 }, { "epoch": 4.00455432661028, "grad_norm": 1.2846732139587402, "learning_rate": 4.954457318034841e-06, "loss": 0.533, "step": 6155 }, { "epoch": 4.007807417046194, "grad_norm": 1.16463303565979, "learning_rate": 4.92365587816144e-06, "loss": 0.533, "step": 6160 }, { "epoch": 4.011060507482108, "grad_norm": 1.4882513284683228, "learning_rate": 4.892940020444043e-06, "loss": 0.5236, "step": 6165 }, { "epoch": 4.014313597918022, "grad_norm": 3.275876998901367, "learning_rate": 4.862309875819299e-06, "loss": 0.5213, "step": 6170 }, { "epoch": 4.017566688353936, "grad_norm": 1.5742096900939941, "learning_rate": 4.837867561302392e-06, "loss": 0.5295, "step": 6175 }, { "epoch": 4.020819778789851, "grad_norm": 5.1677422523498535, "learning_rate": 4.807392029038138e-06, "loss": 0.5301, "step": 6180 }, { "epoch": 4.024072869225765, "grad_norm": 1.7716647386550903, "learning_rate": 4.77700257454356e-06, "loss": 0.5366, "step": 6185 }, { "epoch": 4.027325959661678, "grad_norm": 1.8003216981887817, "learning_rate": 4.746699327363918e-06, "loss": 0.5209, "step": 6190 }, { "epoch": 4.030579050097593, "grad_norm": 1.7417036294937134, "learning_rate": 4.7164824166769735e-06, "loss": 0.5335, "step": 6195 }, { "epoch": 4.033832140533507, "grad_norm": 1.7009021043777466, "learning_rate": 4.686351971292443e-06, "loss": 0.5222, "step": 6200 }, { "epoch": 4.037085230969421, "grad_norm": 2.0051186084747314, "learning_rate": 4.6563081196514786e-06, "loss": 0.5516, "step": 6205 }, { "epoch": 4.040338321405335, "grad_norm": 1.5723603963851929, "learning_rate": 4.626350989826075e-06, "loss": 0.5263, "step": 6210 }, { "epoch": 4.043591411841249, "grad_norm": 1.8875335454940796, "learning_rate": 4.596480709518547e-06, "loss": 0.5346, "step": 6215 }, { "epoch": 4.046844502277163, "grad_norm": 1.5543326139450073, "learning_rate": 4.566697406061005e-06, "loss": 0.5344, "step": 6220 }, { "epoch": 4.050097592713078, "grad_norm": 1.6131196022033691, "learning_rate": 4.53700120641477e-06, "loss": 0.5318, "step": 6225 }, { "epoch": 4.053350683148992, "grad_norm": 1.3502036333084106, "learning_rate": 4.5073922371698554e-06, "loss": 0.5234, "step": 6230 }, { "epoch": 4.056603773584905, "grad_norm": 
2.2002179622650146, "learning_rate": 4.4778706245444475e-06, "loss": 0.5422, "step": 6235 }, { "epoch": 4.05985686402082, "grad_norm": 1.62948477268219, "learning_rate": 4.44843649438432e-06, "loss": 0.5136, "step": 6240 }, { "epoch": 4.063109954456734, "grad_norm": 1.563274621963501, "learning_rate": 4.419089972162327e-06, "loss": 0.5087, "step": 6245 }, { "epoch": 4.066363044892648, "grad_norm": 1.5413563251495361, "learning_rate": 4.389831182977882e-06, "loss": 0.535, "step": 6250 }, { "epoch": 4.0696161353285625, "grad_norm": 1.6265994310379028, "learning_rate": 4.360660251556395e-06, "loss": 0.5291, "step": 6255 }, { "epoch": 4.072869225764476, "grad_norm": 1.6212644577026367, "learning_rate": 4.331577302248746e-06, "loss": 0.5165, "step": 6260 }, { "epoch": 4.07612231620039, "grad_norm": 1.5618913173675537, "learning_rate": 4.302582459030769e-06, "loss": 0.5301, "step": 6265 }, { "epoch": 4.079375406636305, "grad_norm": 1.7876514196395874, "learning_rate": 4.273675845502722e-06, "loss": 0.5282, "step": 6270 }, { "epoch": 4.082628497072219, "grad_norm": 1.6155240535736084, "learning_rate": 4.244857584888748e-06, "loss": 0.5219, "step": 6275 }, { "epoch": 4.0858815875081325, "grad_norm": 1.826150894165039, "learning_rate": 4.2161278000363456e-06, "loss": 0.5254, "step": 6280 }, { "epoch": 4.089134677944047, "grad_norm": 1.569254755973816, "learning_rate": 4.187486613415878e-06, "loss": 0.5563, "step": 6285 }, { "epoch": 4.092387768379961, "grad_norm": 1.651341438293457, "learning_rate": 4.158934147120019e-06, "loss": 0.5196, "step": 6290 }, { "epoch": 4.095640858815875, "grad_norm": 1.960835337638855, "learning_rate": 4.130470522863231e-06, "loss": 0.5233, "step": 6295 }, { "epoch": 4.0988939492517895, "grad_norm": 1.762459397315979, "learning_rate": 4.102095861981275e-06, "loss": 0.5101, "step": 6300 }, { "epoch": 4.102147039687703, "grad_norm": 1.7269344329833984, "learning_rate": 4.073810285430668e-06, "loss": 0.5283, "step": 6305 }, { "epoch": 4.105400130123617, "grad_norm": 2.420794725418091, "learning_rate": 4.045613913788171e-06, "loss": 0.5168, "step": 6310 }, { "epoch": 4.108653220559532, "grad_norm": 1.5948150157928467, "learning_rate": 4.0175068672502784e-06, "loss": 0.535, "step": 6315 }, { "epoch": 4.111906310995446, "grad_norm": 2.1127867698669434, "learning_rate": 3.9894892656327235e-06, "loss": 0.5181, "step": 6320 }, { "epoch": 4.1151594014313595, "grad_norm": 2.1554746627807617, "learning_rate": 3.961561228369928e-06, "loss": 0.5314, "step": 6325 }, { "epoch": 4.118412491867274, "grad_norm": 1.7790179252624512, "learning_rate": 3.933722874514526e-06, "loss": 0.5327, "step": 6330 }, { "epoch": 4.121665582303188, "grad_norm": 1.5885546207427979, "learning_rate": 3.905974322736849e-06, "loss": 0.5221, "step": 6335 }, { "epoch": 4.124918672739102, "grad_norm": 1.4991848468780518, "learning_rate": 3.878315691324416e-06, "loss": 0.5134, "step": 6340 }, { "epoch": 4.1281717631750166, "grad_norm": 1.57703697681427, "learning_rate": 3.850747098181421e-06, "loss": 0.5239, "step": 6345 }, { "epoch": 4.13142485361093, "grad_norm": 3.0852479934692383, "learning_rate": 3.82326866082825e-06, "loss": 0.5216, "step": 6350 }, { "epoch": 4.134677944046844, "grad_norm": 1.6248340606689453, "learning_rate": 3.7958804964009692e-06, "loss": 0.5195, "step": 6355 }, { "epoch": 4.137931034482759, "grad_norm": 1.69948410987854, "learning_rate": 3.7685827216508124e-06, "loss": 0.507, "step": 6360 }, { "epoch": 4.141184124918673, "grad_norm": 1.6397584676742554, "learning_rate": 
3.741375452943724e-06, "loss": 0.5353, "step": 6365 }, { "epoch": 4.1444372153545865, "grad_norm": 1.4918780326843262, "learning_rate": 3.714258806259807e-06, "loss": 0.5013, "step": 6370 }, { "epoch": 4.147690305790501, "grad_norm": 2.1283321380615234, "learning_rate": 3.6872328971928718e-06, "loss": 0.5289, "step": 6375 }, { "epoch": 4.150943396226415, "grad_norm": 2.7849512100219727, "learning_rate": 3.660297840949933e-06, "loss": 0.5289, "step": 6380 }, { "epoch": 4.154196486662329, "grad_norm": 1.7255409955978394, "learning_rate": 3.633453752350707e-06, "loss": 0.5174, "step": 6385 }, { "epoch": 4.157449577098244, "grad_norm": 1.7871309518814087, "learning_rate": 3.606700745827127e-06, "loss": 0.5231, "step": 6390 }, { "epoch": 4.160702667534157, "grad_norm": 1.5307867527008057, "learning_rate": 3.5800389354228748e-06, "loss": 0.524, "step": 6395 }, { "epoch": 4.163955757970071, "grad_norm": 1.9164159297943115, "learning_rate": 3.553468434792859e-06, "loss": 0.5321, "step": 6400 }, { "epoch": 4.167208848405986, "grad_norm": 1.539781093597412, "learning_rate": 3.526989357202756e-06, "loss": 0.5223, "step": 6405 }, { "epoch": 4.1704619388419, "grad_norm": 1.5751947164535522, "learning_rate": 3.5006018155285286e-06, "loss": 0.5302, "step": 6410 }, { "epoch": 4.173715029277814, "grad_norm": 1.7798151969909668, "learning_rate": 3.4743059222559298e-06, "loss": 0.5295, "step": 6415 }, { "epoch": 4.176968119713728, "grad_norm": 2.035566568374634, "learning_rate": 3.448101789480024e-06, "loss": 0.5249, "step": 6420 }, { "epoch": 4.180221210149642, "grad_norm": 1.6014204025268555, "learning_rate": 3.4219895289047317e-06, "loss": 0.5236, "step": 6425 }, { "epoch": 4.183474300585556, "grad_norm": 1.9151594638824463, "learning_rate": 3.395969251842329e-06, "loss": 0.5146, "step": 6430 }, { "epoch": 4.186727391021471, "grad_norm": 1.543568730354309, "learning_rate": 3.3700410692129815e-06, "loss": 0.518, "step": 6435 }, { "epoch": 4.189980481457384, "grad_norm": 1.5983526706695557, "learning_rate": 3.3442050915442615e-06, "loss": 0.5047, "step": 6440 }, { "epoch": 4.193233571893298, "grad_norm": 1.5908766984939575, "learning_rate": 3.318461428970707e-06, "loss": 0.5273, "step": 6445 }, { "epoch": 4.196486662329213, "grad_norm": 1.7272975444793701, "learning_rate": 3.2928101912333197e-06, "loss": 0.5143, "step": 6450 }, { "epoch": 4.199739752765127, "grad_norm": 1.6854057312011719, "learning_rate": 3.2672514876791044e-06, "loss": 0.5412, "step": 6455 }, { "epoch": 4.202992843201041, "grad_norm": 1.7159767150878906, "learning_rate": 3.2417854272606212e-06, "loss": 0.5328, "step": 6460 }, { "epoch": 4.206245933636955, "grad_norm": 2.0293431282043457, "learning_rate": 3.2164121185355026e-06, "loss": 0.5207, "step": 6465 }, { "epoch": 4.209499024072869, "grad_norm": 1.4942529201507568, "learning_rate": 3.1911316696659837e-06, "loss": 0.5098, "step": 6470 }, { "epoch": 4.212752114508783, "grad_norm": 1.5757249593734741, "learning_rate": 3.165944188418474e-06, "loss": 0.5075, "step": 6475 }, { "epoch": 4.216005204944698, "grad_norm": 1.6114063262939453, "learning_rate": 3.140849782163066e-06, "loss": 0.5283, "step": 6480 }, { "epoch": 4.2192582953806115, "grad_norm": 1.791574478149414, "learning_rate": 3.1158485578730883e-06, "loss": 0.5116, "step": 6485 }, { "epoch": 4.222511385816525, "grad_norm": 1.4832271337509155, "learning_rate": 3.090940622124644e-06, "loss": 0.5187, "step": 6490 }, { "epoch": 4.22576447625244, "grad_norm": 1.5384358167648315, "learning_rate": 3.066126081096185e-06, "loss": 0.5158, 
"step": 6495 }, { "epoch": 4.229017566688354, "grad_norm": 1.766423225402832, "learning_rate": 3.0414050405680155e-06, "loss": 0.5196, "step": 6500 }, { "epoch": 4.232270657124268, "grad_norm": 2.07438325881958, "learning_rate": 3.016777605921861e-06, "loss": 0.5062, "step": 6505 }, { "epoch": 4.235523747560182, "grad_norm": 4.485304355621338, "learning_rate": 2.9922438821404415e-06, "loss": 0.4975, "step": 6510 }, { "epoch": 4.238776837996096, "grad_norm": 1.6027443408966064, "learning_rate": 2.9678039738069845e-06, "loss": 0.5211, "step": 6515 }, { "epoch": 4.24202992843201, "grad_norm": 2.2789571285247803, "learning_rate": 2.9434579851047973e-06, "loss": 0.5084, "step": 6520 }, { "epoch": 4.245283018867925, "grad_norm": 1.481426477432251, "learning_rate": 2.919206019816842e-06, "loss": 0.5417, "step": 6525 }, { "epoch": 4.2485361093038385, "grad_norm": 1.6203233003616333, "learning_rate": 2.895048181325252e-06, "loss": 0.5114, "step": 6530 }, { "epoch": 4.251789199739752, "grad_norm": 1.5848479270935059, "learning_rate": 2.8709845726109243e-06, "loss": 0.5028, "step": 6535 }, { "epoch": 4.255042290175667, "grad_norm": 1.80342435836792, "learning_rate": 2.8470152962530723e-06, "loss": 0.5122, "step": 6540 }, { "epoch": 4.258295380611581, "grad_norm": 2.087617874145508, "learning_rate": 2.8231404544287796e-06, "loss": 0.506, "step": 6545 }, { "epoch": 4.261548471047496, "grad_norm": 1.7649626731872559, "learning_rate": 2.7993601489125693e-06, "loss": 0.5166, "step": 6550 }, { "epoch": 4.264801561483409, "grad_norm": 3.1642332077026367, "learning_rate": 2.7756744810759823e-06, "loss": 0.5107, "step": 6555 }, { "epoch": 4.268054651919323, "grad_norm": 1.9564752578735352, "learning_rate": 2.7520835518871302e-06, "loss": 0.5112, "step": 6560 }, { "epoch": 4.271307742355237, "grad_norm": 1.6043564081192017, "learning_rate": 2.7285874619102675e-06, "loss": 0.5084, "step": 6565 }, { "epoch": 4.274560832791152, "grad_norm": 1.9543806314468384, "learning_rate": 2.705186311305355e-06, "loss": 0.5135, "step": 6570 }, { "epoch": 4.2778139232270656, "grad_norm": 1.6966253519058228, "learning_rate": 2.6818801998276634e-06, "loss": 0.525, "step": 6575 }, { "epoch": 4.28106701366298, "grad_norm": 2.0935935974121094, "learning_rate": 2.658669226827315e-06, "loss": 0.5216, "step": 6580 }, { "epoch": 4.284320104098894, "grad_norm": 1.7863517999649048, "learning_rate": 2.6355534912488627e-06, "loss": 0.5271, "step": 6585 }, { "epoch": 4.287573194534808, "grad_norm": 1.611092448234558, "learning_rate": 2.612533091630903e-06, "loss": 0.5142, "step": 6590 }, { "epoch": 4.290826284970722, "grad_norm": 1.709322452545166, "learning_rate": 2.5896081261056138e-06, "loss": 0.5292, "step": 6595 }, { "epoch": 4.294079375406636, "grad_norm": 1.7398649454116821, "learning_rate": 2.5667786923983443e-06, "loss": 0.5253, "step": 6600 }, { "epoch": 4.29733246584255, "grad_norm": 1.5445489883422852, "learning_rate": 2.544044887827235e-06, "loss": 0.5443, "step": 6605 }, { "epoch": 4.300585556278465, "grad_norm": 1.763914704322815, "learning_rate": 2.5214068093027484e-06, "loss": 0.5301, "step": 6610 }, { "epoch": 4.303838646714379, "grad_norm": 2.1207916736602783, "learning_rate": 2.498864553327296e-06, "loss": 0.5351, "step": 6615 }, { "epoch": 4.307091737150293, "grad_norm": 1.8002142906188965, "learning_rate": 2.4764182159948133e-06, "loss": 0.5043, "step": 6620 }, { "epoch": 4.310344827586207, "grad_norm": 1.4603972434997559, "learning_rate": 2.454067892990347e-06, "loss": 0.5032, "step": 6625 }, { "epoch": 
4.313597918022121, "grad_norm": 1.6874291896820068, "learning_rate": 2.431813679589645e-06, "loss": 0.5232, "step": 6630 }, { "epoch": 4.316851008458035, "grad_norm": 1.7689220905303955, "learning_rate": 2.4096556706587726e-06, "loss": 0.5218, "step": 6635 }, { "epoch": 4.32010409889395, "grad_norm": 1.5644956827163696, "learning_rate": 2.387593960653675e-06, "loss": 0.5164, "step": 6640 }, { "epoch": 4.3233571893298635, "grad_norm": 2.199660301208496, "learning_rate": 2.3656286436197965e-06, "loss": 0.538, "step": 6645 }, { "epoch": 4.326610279765777, "grad_norm": 2.4460320472717285, "learning_rate": 2.343759813191676e-06, "loss": 0.5197, "step": 6650 }, { "epoch": 4.329863370201692, "grad_norm": 1.8965719938278198, "learning_rate": 2.3219875625925452e-06, "loss": 0.5399, "step": 6655 }, { "epoch": 4.333116460637606, "grad_norm": 1.7241499423980713, "learning_rate": 2.3003119846339293e-06, "loss": 0.514, "step": 6660 }, { "epoch": 4.33636955107352, "grad_norm": 1.776291847229004, "learning_rate": 2.27873317171525e-06, "loss": 0.5217, "step": 6665 }, { "epoch": 4.339622641509434, "grad_norm": 1.6230307817459106, "learning_rate": 2.25725121582345e-06, "loss": 0.5208, "step": 6670 }, { "epoch": 4.342875731945348, "grad_norm": 1.5767405033111572, "learning_rate": 2.2358662085325723e-06, "loss": 0.5064, "step": 6675 }, { "epoch": 4.346128822381262, "grad_norm": 1.785072922706604, "learning_rate": 2.2145782410033844e-06, "loss": 0.5195, "step": 6680 }, { "epoch": 4.349381912817177, "grad_norm": 2.802659034729004, "learning_rate": 2.1933874039830078e-06, "loss": 0.5178, "step": 6685 }, { "epoch": 4.3526350032530905, "grad_norm": 1.8929702043533325, "learning_rate": 2.172293787804483e-06, "loss": 0.5281, "step": 6690 }, { "epoch": 4.355888093689004, "grad_norm": 2.050996780395508, "learning_rate": 2.1512974823864414e-06, "loss": 0.5432, "step": 6695 }, { "epoch": 4.359141184124919, "grad_norm": 1.6718263626098633, "learning_rate": 2.130398577232673e-06, "loss": 0.5267, "step": 6700 }, { "epoch": 4.362394274560833, "grad_norm": 1.8539758920669556, "learning_rate": 2.109597161431784e-06, "loss": 0.5334, "step": 6705 }, { "epoch": 4.365647364996747, "grad_norm": 1.541066288948059, "learning_rate": 2.088893323656793e-06, "loss": 0.5235, "step": 6710 }, { "epoch": 4.368900455432661, "grad_norm": 1.5558756589889526, "learning_rate": 2.068287152164747e-06, "loss": 0.5157, "step": 6715 }, { "epoch": 4.372153545868575, "grad_norm": 1.825431227684021, "learning_rate": 2.0477787347963823e-06, "loss": 0.521, "step": 6720 }, { "epoch": 4.375406636304489, "grad_norm": 1.558396816253662, "learning_rate": 2.0273681589757063e-06, "loss": 0.5082, "step": 6725 }, { "epoch": 4.378659726740404, "grad_norm": 1.8559561967849731, "learning_rate": 2.007055511709646e-06, "loss": 0.526, "step": 6730 }, { "epoch": 4.3819128171763175, "grad_norm": 1.8222005367279053, "learning_rate": 1.986840879587687e-06, "loss": 0.522, "step": 6735 }, { "epoch": 4.385165907612231, "grad_norm": 4.778210639953613, "learning_rate": 1.966724348781479e-06, "loss": 0.5089, "step": 6740 }, { "epoch": 4.388418998048146, "grad_norm": 1.7374241352081299, "learning_rate": 1.9467060050444824e-06, "loss": 0.5166, "step": 6745 }, { "epoch": 4.39167208848406, "grad_norm": 1.846447467803955, "learning_rate": 1.9267859337116195e-06, "loss": 0.5255, "step": 6750 }, { "epoch": 4.394925178919974, "grad_norm": 1.6373209953308105, "learning_rate": 1.9069642196988757e-06, "loss": 0.5103, "step": 6755 }, { "epoch": 4.398178269355888, "grad_norm": 
2.6573219299316406, "learning_rate": 1.8872409475029524e-06, "loss": 0.5192, "step": 6760 }, { "epoch": 4.401431359791802, "grad_norm": 3.289806365966797, "learning_rate": 1.8676162012009307e-06, "loss": 0.5195, "step": 6765 }, { "epoch": 4.404684450227716, "grad_norm": 2.3919076919555664, "learning_rate": 1.8480900644498756e-06, "loss": 0.5139, "step": 6770 }, { "epoch": 4.407937540663631, "grad_norm": 2.7541277408599854, "learning_rate": 1.8286626204864903e-06, "loss": 0.5285, "step": 6775 }, { "epoch": 4.411190631099545, "grad_norm": 2.060319423675537, "learning_rate": 1.8093339521267876e-06, "loss": 0.5211, "step": 6780 }, { "epoch": 4.414443721535458, "grad_norm": 1.9002997875213623, "learning_rate": 1.7901041417657027e-06, "loss": 0.5189, "step": 6785 }, { "epoch": 4.417696811971373, "grad_norm": 2.1053810119628906, "learning_rate": 1.7709732713767497e-06, "loss": 0.5107, "step": 6790 }, { "epoch": 4.420949902407287, "grad_norm": 1.6905279159545898, "learning_rate": 1.7519414225116937e-06, "loss": 0.5147, "step": 6795 }, { "epoch": 4.424202992843201, "grad_norm": 2.2751264572143555, "learning_rate": 1.733008676300177e-06, "loss": 0.5065, "step": 6800 }, { "epoch": 4.427456083279115, "grad_norm": 1.9138133525848389, "learning_rate": 1.7141751134493815e-06, "loss": 0.5144, "step": 6805 }, { "epoch": 4.430709173715029, "grad_norm": 1.75284743309021, "learning_rate": 1.6954408142436955e-06, "loss": 0.5164, "step": 6810 }, { "epoch": 4.433962264150943, "grad_norm": 1.6290788650512695, "learning_rate": 1.6768058585443585e-06, "loss": 0.5197, "step": 6815 }, { "epoch": 4.437215354586858, "grad_norm": 2.135432243347168, "learning_rate": 1.6582703257891214e-06, "loss": 0.5252, "step": 6820 }, { "epoch": 4.440468445022772, "grad_norm": 1.6389341354370117, "learning_rate": 1.63983429499191e-06, "loss": 0.5217, "step": 6825 }, { "epoch": 4.443721535458685, "grad_norm": 1.6227918863296509, "learning_rate": 1.6214978447425062e-06, "loss": 0.5178, "step": 6830 }, { "epoch": 4.4469746258946, "grad_norm": 1.907899022102356, "learning_rate": 1.603261053206176e-06, "loss": 0.5235, "step": 6835 }, { "epoch": 4.450227716330514, "grad_norm": 2.548617362976074, "learning_rate": 1.5851239981233639e-06, "loss": 0.5238, "step": 6840 }, { "epoch": 4.453480806766428, "grad_norm": 1.8666588068008423, "learning_rate": 1.5670867568093633e-06, "loss": 0.5378, "step": 6845 }, { "epoch": 4.4567338972023425, "grad_norm": 1.6732510328292847, "learning_rate": 1.5491494061539658e-06, "loss": 0.5101, "step": 6850 }, { "epoch": 4.459986987638256, "grad_norm": 1.560084342956543, "learning_rate": 1.5313120226211452e-06, "loss": 0.5318, "step": 6855 }, { "epoch": 4.46324007807417, "grad_norm": 4.673284530639648, "learning_rate": 1.5135746822487419e-06, "loss": 0.5279, "step": 6860 }, { "epoch": 4.466493168510085, "grad_norm": 1.600279450416565, "learning_rate": 1.4959374606481251e-06, "loss": 0.4943, "step": 6865 }, { "epoch": 4.469746258945999, "grad_norm": 2.073321580886841, "learning_rate": 1.4784004330038653e-06, "loss": 0.5204, "step": 6870 }, { "epoch": 4.4729993493819125, "grad_norm": 3.2433438301086426, "learning_rate": 1.4609636740734316e-06, "loss": 0.5174, "step": 6875 }, { "epoch": 4.476252439817827, "grad_norm": 2.53226637840271, "learning_rate": 1.4436272581868665e-06, "loss": 0.54, "step": 6880 }, { "epoch": 4.479505530253741, "grad_norm": 1.7645595073699951, "learning_rate": 1.4263912592464597e-06, "loss": 0.5271, "step": 6885 }, { "epoch": 4.482758620689655, "grad_norm": 1.7925113439559937, "learning_rate": 
1.4092557507264375e-06, "loss": 0.5169, "step": 6890 }, { "epoch": 4.4860117111255695, "grad_norm": 2.9148597717285156, "learning_rate": 1.3922208056726644e-06, "loss": 0.525, "step": 6895 }, { "epoch": 4.489264801561483, "grad_norm": 3.2308194637298584, "learning_rate": 1.3752864967023105e-06, "loss": 0.5341, "step": 6900 }, { "epoch": 4.492517891997397, "grad_norm": 1.633375644683838, "learning_rate": 1.358452896003548e-06, "loss": 0.5249, "step": 6905 }, { "epoch": 4.495770982433312, "grad_norm": 1.7651923894882202, "learning_rate": 1.3417200753352538e-06, "loss": 0.5211, "step": 6910 }, { "epoch": 4.499024072869226, "grad_norm": 1.584030032157898, "learning_rate": 1.3250881060266952e-06, "loss": 0.5164, "step": 6915 }, { "epoch": 4.5022771633051395, "grad_norm": 2.4326541423797607, "learning_rate": 1.3085570589772168e-06, "loss": 0.5306, "step": 6920 }, { "epoch": 4.505530253741054, "grad_norm": 1.5874032974243164, "learning_rate": 1.2921270046559658e-06, "loss": 0.5374, "step": 6925 }, { "epoch": 4.508783344176968, "grad_norm": 2.053276300430298, "learning_rate": 1.2757980131015563e-06, "loss": 0.5294, "step": 6930 }, { "epoch": 4.512036434612883, "grad_norm": 1.5977790355682373, "learning_rate": 1.2595701539217963e-06, "loss": 0.515, "step": 6935 }, { "epoch": 4.5152895250487965, "grad_norm": 1.5569490194320679, "learning_rate": 1.2434434962933866e-06, "loss": 0.5178, "step": 6940 }, { "epoch": 4.51854261548471, "grad_norm": 1.8135985136032104, "learning_rate": 1.2274181089616172e-06, "loss": 0.5268, "step": 6945 }, { "epoch": 4.521795705920624, "grad_norm": 1.5852515697479248, "learning_rate": 1.2114940602400788e-06, "loss": 0.5192, "step": 6950 }, { "epoch": 4.525048796356539, "grad_norm": 2.1236679553985596, "learning_rate": 1.19567141801038e-06, "loss": 0.527, "step": 6955 }, { "epoch": 4.528301886792453, "grad_norm": 2.562978744506836, "learning_rate": 1.1799502497218368e-06, "loss": 0.5379, "step": 6960 }, { "epoch": 4.531554977228367, "grad_norm": 1.632822871208191, "learning_rate": 1.164330622391213e-06, "loss": 0.5162, "step": 6965 }, { "epoch": 4.534808067664281, "grad_norm": 2.966524124145508, "learning_rate": 1.1488126026024087e-06, "loss": 0.5399, "step": 6970 }, { "epoch": 4.538061158100195, "grad_norm": 1.8411732912063599, "learning_rate": 1.1333962565061973e-06, "loss": 0.5232, "step": 6975 }, { "epoch": 4.541314248536109, "grad_norm": 1.5464459657669067, "learning_rate": 1.118081649819927e-06, "loss": 0.5168, "step": 6980 }, { "epoch": 4.544567338972024, "grad_norm": 1.7210750579833984, "learning_rate": 1.1028688478272459e-06, "loss": 0.5327, "step": 6985 }, { "epoch": 4.547820429407937, "grad_norm": 2.1294288635253906, "learning_rate": 1.0877579153778323e-06, "loss": 0.4963, "step": 6990 }, { "epoch": 4.551073519843852, "grad_norm": 1.5896108150482178, "learning_rate": 1.0727489168871092e-06, "loss": 0.537, "step": 6995 }, { "epoch": 4.554326610279766, "grad_norm": 1.6593022346496582, "learning_rate": 1.0578419163359666e-06, "loss": 0.5164, "step": 7000 }, { "epoch": 4.55757970071568, "grad_norm": 1.6132862567901611, "learning_rate": 1.0430369772705034e-06, "loss": 0.5246, "step": 7005 }, { "epoch": 4.560832791151594, "grad_norm": 1.6968963146209717, "learning_rate": 1.028334162801739e-06, "loss": 0.5169, "step": 7010 }, { "epoch": 4.564085881587508, "grad_norm": 3.422121524810791, "learning_rate": 1.0137335356053545e-06, "loss": 0.5306, "step": 7015 }, { "epoch": 4.567338972023422, "grad_norm": 2.2838146686553955, "learning_rate": 9.99235157921427e-07, "loss": 
0.536, "step": 7020 }, { "epoch": 4.570592062459337, "grad_norm": 1.923091173171997, "learning_rate": 9.8483909155416e-07, "loss": 0.5165, "step": 7025 }, { "epoch": 4.573845152895251, "grad_norm": 1.5500158071517944, "learning_rate": 9.705453978716112e-07, "loss": 0.5086, "step": 7030 }, { "epoch": 4.577098243331164, "grad_norm": 1.948114037513733, "learning_rate": 9.56354137805457e-07, "loss": 0.5262, "step": 7035 }, { "epoch": 4.580351333767078, "grad_norm": 2.5097603797912598, "learning_rate": 9.422653718507007e-07, "loss": 0.5353, "step": 7040 }, { "epoch": 4.583604424202993, "grad_norm": 1.757633090019226, "learning_rate": 9.282791600654428e-07, "loss": 0.5167, "step": 7045 }, { "epoch": 4.586857514638907, "grad_norm": 2.2960455417633057, "learning_rate": 9.14395562070594e-07, "loss": 0.5264, "step": 7050 }, { "epoch": 4.5901106050748215, "grad_norm": 1.556706428527832, "learning_rate": 9.006146370496654e-07, "loss": 0.5177, "step": 7055 }, { "epoch": 4.593363695510735, "grad_norm": 1.7054029703140259, "learning_rate": 8.869364437484678e-07, "loss": 0.4893, "step": 7060 }, { "epoch": 4.596616785946649, "grad_norm": 1.746472716331482, "learning_rate": 8.733610404748904e-07, "loss": 0.5093, "step": 7065 }, { "epoch": 4.599869876382563, "grad_norm": 2.1942458152770996, "learning_rate": 8.598884850986533e-07, "loss": 0.5299, "step": 7070 }, { "epoch": 4.603122966818478, "grad_norm": 2.43866229057312, "learning_rate": 8.465188350510411e-07, "loss": 0.5282, "step": 7075 }, { "epoch": 4.6063760572543915, "grad_norm": 1.625575304031372, "learning_rate": 8.332521473246758e-07, "loss": 0.5189, "step": 7080 }, { "epoch": 4.609629147690306, "grad_norm": 2.3699636459350586, "learning_rate": 8.200884784732688e-07, "loss": 0.5249, "step": 7085 }, { "epoch": 4.61288223812622, "grad_norm": 1.750931739807129, "learning_rate": 8.070278846113749e-07, "loss": 0.5165, "step": 7090 }, { "epoch": 4.616135328562134, "grad_norm": 1.8055213689804077, "learning_rate": 7.940704214141614e-07, "loss": 0.5315, "step": 7095 }, { "epoch": 4.6193884189980485, "grad_norm": 2.2767059803009033, "learning_rate": 7.812161441171611e-07, "loss": 0.5232, "step": 7100 }, { "epoch": 4.622641509433962, "grad_norm": 1.4966483116149902, "learning_rate": 7.684651075160531e-07, "loss": 0.5045, "step": 7105 }, { "epoch": 4.625894599869876, "grad_norm": 2.188704490661621, "learning_rate": 7.558173659664075e-07, "loss": 0.5201, "step": 7110 }, { "epoch": 4.629147690305791, "grad_norm": 2.934805154800415, "learning_rate": 7.432729733834631e-07, "loss": 0.5247, "step": 7115 }, { "epoch": 4.632400780741705, "grad_norm": 1.9948830604553223, "learning_rate": 7.308319832419141e-07, "loss": 0.5247, "step": 7120 }, { "epoch": 4.6356538711776185, "grad_norm": 1.8401069641113281, "learning_rate": 7.18494448575649e-07, "loss": 0.5364, "step": 7125 }, { "epoch": 4.638906961613533, "grad_norm": 1.45015549659729, "learning_rate": 7.062604219775531e-07, "loss": 0.5106, "step": 7130 }, { "epoch": 4.642160052049447, "grad_norm": 1.7785407304763794, "learning_rate": 6.941299555992737e-07, "loss": 0.5117, "step": 7135 }, { "epoch": 4.645413142485361, "grad_norm": 2.026643753051758, "learning_rate": 6.821031011509937e-07, "loss": 0.5039, "step": 7140 }, { "epoch": 4.648666232921276, "grad_norm": 1.6481338739395142, "learning_rate": 6.701799099012141e-07, "loss": 0.5385, "step": 7145 }, { "epoch": 4.651919323357189, "grad_norm": 2.9961116313934326, "learning_rate": 6.583604326765496e-07, "loss": 0.5148, "step": 7150 }, { "epoch": 4.655172413793103, 
"grad_norm": 1.7340404987335205, "learning_rate": 6.466447198614806e-07, "loss": 0.4913, "step": 7155 }, { "epoch": 4.658425504229018, "grad_norm": 1.569608211517334, "learning_rate": 6.350328213981654e-07, "loss": 0.5052, "step": 7160 }, { "epoch": 4.661678594664932, "grad_norm": 1.9746705293655396, "learning_rate": 6.235247867862226e-07, "loss": 0.4885, "step": 7165 }, { "epoch": 4.6649316851008455, "grad_norm": 1.7358078956604004, "learning_rate": 6.121206650825162e-07, "loss": 0.5256, "step": 7170 }, { "epoch": 4.66818477553676, "grad_norm": 1.609820008277893, "learning_rate": 6.008205049009341e-07, "loss": 0.5275, "step": 7175 }, { "epoch": 4.671437865972674, "grad_norm": 1.8338040113449097, "learning_rate": 5.896243544122076e-07, "loss": 0.5019, "step": 7180 }, { "epoch": 4.674690956408588, "grad_norm": 1.7695443630218506, "learning_rate": 5.785322613436894e-07, "loss": 0.5287, "step": 7185 }, { "epoch": 4.677944046844503, "grad_norm": 1.9566013813018799, "learning_rate": 5.675442729791425e-07, "loss": 0.5262, "step": 7190 }, { "epoch": 4.681197137280416, "grad_norm": 1.9720107316970825, "learning_rate": 5.566604361585626e-07, "loss": 0.5327, "step": 7195 }, { "epoch": 4.68445022771633, "grad_norm": 2.7521474361419678, "learning_rate": 5.458807972779534e-07, "loss": 0.5002, "step": 7200 }, { "epoch": 4.687703318152245, "grad_norm": 6.726840019226074, "learning_rate": 5.352054022891406e-07, "loss": 0.52, "step": 7205 }, { "epoch": 4.690956408588159, "grad_norm": 1.8968901634216309, "learning_rate": 5.246342966995888e-07, "loss": 0.5259, "step": 7210 }, { "epoch": 4.694209499024073, "grad_norm": 1.647226333618164, "learning_rate": 5.141675255721762e-07, "loss": 0.532, "step": 7215 }, { "epoch": 4.697462589459987, "grad_norm": 2.063908576965332, "learning_rate": 5.038051335250316e-07, "loss": 0.5132, "step": 7220 }, { "epoch": 4.700715679895901, "grad_norm": 1.5827159881591797, "learning_rate": 4.935471647313284e-07, "loss": 0.515, "step": 7225 }, { "epoch": 4.703968770331815, "grad_norm": 1.9684885740280151, "learning_rate": 4.833936629191016e-07, "loss": 0.5054, "step": 7230 }, { "epoch": 4.70722186076773, "grad_norm": 2.0594069957733154, "learning_rate": 4.7334467137105933e-07, "loss": 0.5235, "step": 7235 }, { "epoch": 4.7104749512036435, "grad_norm": 1.9911025762557983, "learning_rate": 4.634002329244047e-07, "loss": 0.5146, "step": 7240 }, { "epoch": 4.713728041639557, "grad_norm": 1.7078765630722046, "learning_rate": 4.535603899706448e-07, "loss": 0.5174, "step": 7245 }, { "epoch": 4.716981132075472, "grad_norm": 1.6561988592147827, "learning_rate": 4.438251844554098e-07, "loss": 0.5201, "step": 7250 }, { "epoch": 4.720234222511386, "grad_norm": 1.7727643251419067, "learning_rate": 4.341946578782868e-07, "loss": 0.5185, "step": 7255 }, { "epoch": 4.7234873129473, "grad_norm": 1.8667991161346436, "learning_rate": 4.2466885129262004e-07, "loss": 0.5033, "step": 7260 }, { "epoch": 4.726740403383214, "grad_norm": 1.5502984523773193, "learning_rate": 4.152478053053632e-07, "loss": 0.5328, "step": 7265 }, { "epoch": 4.729993493819128, "grad_norm": 1.9481128454208374, "learning_rate": 4.059315600768887e-07, "loss": 0.5151, "step": 7270 }, { "epoch": 4.733246584255042, "grad_norm": 2.2370522022247314, "learning_rate": 3.967201553208122e-07, "loss": 0.5126, "step": 7275 }, { "epoch": 4.736499674690957, "grad_norm": 1.9233421087265015, "learning_rate": 3.876136303038458e-07, "loss": 0.5224, "step": 7280 }, { "epoch": 4.7397527651268705, "grad_norm": 1.6725999116897583, "learning_rate": 
3.7861202384560644e-07, "loss": 0.5343, "step": 7285 }, { "epoch": 4.743005855562784, "grad_norm": 1.5591987371444702, "learning_rate": 3.6971537431846057e-07, "loss": 0.5073, "step": 7290 }, { "epoch": 4.746258945998699, "grad_norm": 1.721091866493225, "learning_rate": 3.609237196473658e-07, "loss": 0.5274, "step": 7295 }, { "epoch": 4.749512036434613, "grad_norm": 1.7789474725723267, "learning_rate": 3.5223709730970446e-07, "loss": 0.5072, "step": 7300 }, { "epoch": 4.752765126870527, "grad_norm": 1.7836154699325562, "learning_rate": 3.4365554433511416e-07, "loss": 0.5126, "step": 7305 }, { "epoch": 4.756018217306441, "grad_norm": 1.6633206605911255, "learning_rate": 3.3517909730534926e-07, "loss": 0.5137, "step": 7310 }, { "epoch": 4.759271307742355, "grad_norm": 3.098612070083618, "learning_rate": 3.268077923541085e-07, "loss": 0.5061, "step": 7315 }, { "epoch": 4.762524398178269, "grad_norm": 1.939909815788269, "learning_rate": 3.185416651668882e-07, "loss": 0.5349, "step": 7320 }, { "epoch": 4.765777488614184, "grad_norm": 1.7502937316894531, "learning_rate": 3.1038075098083485e-07, "loss": 0.5032, "step": 7325 }, { "epoch": 4.7690305790500975, "grad_norm": 1.7100647687911987, "learning_rate": 3.023250845845815e-07, "loss": 0.5133, "step": 7330 }, { "epoch": 4.772283669486011, "grad_norm": 1.752884030342102, "learning_rate": 2.943747003181091e-07, "loss": 0.5358, "step": 7335 }, { "epoch": 4.775536759921926, "grad_norm": 1.700315237045288, "learning_rate": 2.8652963207260184e-07, "loss": 0.5048, "step": 7340 }, { "epoch": 4.77878985035784, "grad_norm": 2.1294970512390137, "learning_rate": 2.787899132902949e-07, "loss": 0.4829, "step": 7345 }, { "epoch": 4.782042940793754, "grad_norm": 1.8150845766067505, "learning_rate": 2.711555769643381e-07, "loss": 0.512, "step": 7350 }, { "epoch": 4.785296031229668, "grad_norm": 2.119196653366089, "learning_rate": 2.636266556386546e-07, "loss": 0.5267, "step": 7355 }, { "epoch": 4.788549121665582, "grad_norm": 2.050795793533325, "learning_rate": 2.562031814077964e-07, "loss": 0.5089, "step": 7360 }, { "epoch": 4.791802212101496, "grad_norm": 1.6425533294677734, "learning_rate": 2.488851859168112e-07, "loss": 0.5168, "step": 7365 }, { "epoch": 4.795055302537411, "grad_norm": 1.8162248134613037, "learning_rate": 2.4167270036111743e-07, "loss": 0.5028, "step": 7370 }, { "epoch": 4.798308392973325, "grad_norm": 1.7280550003051758, "learning_rate": 2.345657554863545e-07, "loss": 0.5127, "step": 7375 }, { "epoch": 4.801561483409239, "grad_norm": 1.8187389373779297, "learning_rate": 2.2756438158826053e-07, "loss": 0.5349, "step": 7380 }, { "epoch": 4.804814573845153, "grad_norm": 1.7695400714874268, "learning_rate": 2.2066860851253922e-07, "loss": 0.5211, "step": 7385 }, { "epoch": 4.808067664281067, "grad_norm": 3.8797385692596436, "learning_rate": 2.1387846565474045e-07, "loss": 0.5189, "step": 7390 }, { "epoch": 4.811320754716981, "grad_norm": 1.7038609981536865, "learning_rate": 2.0719398196012707e-07, "loss": 0.5342, "step": 7395 }, { "epoch": 4.814573845152895, "grad_norm": 1.7032898664474487, "learning_rate": 2.0061518592355277e-07, "loss": 0.5139, "step": 7400 }, { "epoch": 4.817826935588809, "grad_norm": 1.818298101425171, "learning_rate": 1.9414210558934554e-07, "loss": 0.5198, "step": 7405 }, { "epoch": 4.821080026024724, "grad_norm": 1.8034625053405762, "learning_rate": 1.8777476855118547e-07, "loss": 0.5314, "step": 7410 }, { "epoch": 4.824333116460638, "grad_norm": 3.0345141887664795, "learning_rate": 1.8151320195197997e-07, "loss": 
0.5387, "step": 7415 }, { "epoch": 4.827586206896552, "grad_norm": 1.8238238096237183, "learning_rate": 1.753574324837609e-07, "loss": 0.5219, "step": 7420 }, { "epoch": 4.830839297332465, "grad_norm": 1.8523563146591187, "learning_rate": 1.6930748638756266e-07, "loss": 0.5075, "step": 7425 }, { "epoch": 4.83409238776838, "grad_norm": 1.592421531677246, "learning_rate": 1.6336338945331098e-07, "loss": 0.512, "step": 7430 }, { "epoch": 4.837345478204294, "grad_norm": 1.459957480430603, "learning_rate": 1.57525167019712e-07, "loss": 0.5154, "step": 7435 }, { "epoch": 4.840598568640209, "grad_norm": 1.7186057567596436, "learning_rate": 1.517928439741495e-07, "loss": 0.5316, "step": 7440 }, { "epoch": 4.8438516590761225, "grad_norm": 1.5618077516555786, "learning_rate": 1.461664447525768e-07, "loss": 0.4997, "step": 7445 }, { "epoch": 4.847104749512036, "grad_norm": 1.9501081705093384, "learning_rate": 1.4064599333940555e-07, "loss": 0.5115, "step": 7450 }, { "epoch": 4.85035783994795, "grad_norm": 1.593405842781067, "learning_rate": 1.3523151326741702e-07, "loss": 0.5062, "step": 7455 }, { "epoch": 4.853610930383865, "grad_norm": 1.6193232536315918, "learning_rate": 1.299230276176483e-07, "loss": 0.5096, "step": 7460 }, { "epoch": 4.856864020819779, "grad_norm": 1.7456111907958984, "learning_rate": 1.247205590192979e-07, "loss": 0.5154, "step": 7465 }, { "epoch": 4.860117111255693, "grad_norm": 1.7586069107055664, "learning_rate": 1.1962412964964254e-07, "loss": 0.5285, "step": 7470 }, { "epoch": 4.863370201691607, "grad_norm": 2.715386152267456, "learning_rate": 1.1463376123391766e-07, "loss": 0.4909, "step": 7475 }, { "epoch": 4.866623292127521, "grad_norm": 2.343010902404785, "learning_rate": 1.0974947504524269e-07, "loss": 0.5142, "step": 7480 }, { "epoch": 4.869876382563435, "grad_norm": 1.7289972305297852, "learning_rate": 1.0497129190452926e-07, "loss": 0.5191, "step": 7485 }, { "epoch": 4.8731294729993495, "grad_norm": 1.742447018623352, "learning_rate": 1.0029923218038972e-07, "loss": 0.5248, "step": 7490 }, { "epoch": 4.876382563435263, "grad_norm": 1.901174545288086, "learning_rate": 9.573331578904e-08, "loss": 0.5213, "step": 7495 }, { "epoch": 4.879635653871178, "grad_norm": 2.5633485317230225, "learning_rate": 9.127356219423843e-08, "loss": 0.5136, "step": 7500 }, { "epoch": 4.882888744307092, "grad_norm": 1.8684697151184082, "learning_rate": 8.691999040717491e-08, "loss": 0.5188, "step": 7505 }, { "epoch": 4.886141834743006, "grad_norm": 2.1927521228790283, "learning_rate": 8.267261898641798e-08, "loss": 0.5119, "step": 7510 }, { "epoch": 4.8893949251789195, "grad_norm": 1.949514627456665, "learning_rate": 7.853146603780947e-08, "loss": 0.5147, "step": 7515 }, { "epoch": 4.892648015614834, "grad_norm": 2.0919501781463623, "learning_rate": 7.449654921440618e-08, "loss": 0.5064, "step": 7520 }, { "epoch": 4.895901106050748, "grad_norm": 1.5901788473129272, "learning_rate": 7.056788571639105e-08, "loss": 0.5109, "step": 7525 }, { "epoch": 4.899154196486663, "grad_norm": 1.8604559898376465, "learning_rate": 6.674549229101767e-08, "loss": 0.526, "step": 7530 }, { "epoch": 4.9024072869225765, "grad_norm": 1.8954790830612183, "learning_rate": 6.302938523251589e-08, "loss": 0.5039, "step": 7535 }, { "epoch": 4.90566037735849, "grad_norm": 1.7178046703338623, "learning_rate": 5.941958038204187e-08, "loss": 0.5219, "step": 7540 }, { "epoch": 4.908913467794405, "grad_norm": 5.730696201324463, "learning_rate": 5.59160931275976e-08, "loss": 0.5004, "step": 7545 }, { "epoch": 
4.912166558230319, "grad_norm": 1.6460310220718384, "learning_rate": 5.2518938403978145e-08, "loss": 0.5319, "step": 7550 }, { "epoch": 4.915419648666233, "grad_norm": 1.6308308839797974, "learning_rate": 4.922813069269394e-08, "loss": 0.5214, "step": 7555 }, { "epoch": 4.918672739102147, "grad_norm": 1.5023380517959595, "learning_rate": 4.604368402191528e-08, "loss": 0.5008, "step": 7560 }, { "epoch": 4.921925829538061, "grad_norm": 1.7468260526657104, "learning_rate": 4.2965611966416796e-08, "loss": 0.5007, "step": 7565 }, { "epoch": 4.925178919973975, "grad_norm": 1.9006001949310303, "learning_rate": 3.9993927647516415e-08, "loss": 0.51, "step": 7570 }, { "epoch": 4.92843201040989, "grad_norm": 1.9338369369506836, "learning_rate": 3.71286437330115e-08, "loss": 0.5216, "step": 7575 }, { "epoch": 4.931685100845804, "grad_norm": 1.6572381258010864, "learning_rate": 3.4369772437137236e-08, "loss": 0.542, "step": 7580 }, { "epoch": 4.934938191281717, "grad_norm": 2.3215434551239014, "learning_rate": 3.1717325520513876e-08, "loss": 0.513, "step": 7585 }, { "epoch": 4.938191281717632, "grad_norm": 1.6987948417663574, "learning_rate": 2.9171314290080132e-08, "loss": 0.5284, "step": 7590 }, { "epoch": 4.941444372153546, "grad_norm": 1.7099159955978394, "learning_rate": 2.6731749599065435e-08, "loss": 0.5267, "step": 7595 }, { "epoch": 4.94469746258946, "grad_norm": 1.620719075202942, "learning_rate": 2.4398641846937187e-08, "loss": 0.5248, "step": 7600 }, { "epoch": 4.9479505530253745, "grad_norm": 1.8238213062286377, "learning_rate": 2.2172000979345242e-08, "loss": 0.5268, "step": 7605 }, { "epoch": 4.951203643461288, "grad_norm": 2.011178970336914, "learning_rate": 2.0051836488094167e-08, "loss": 0.5184, "step": 7610 }, { "epoch": 4.954456733897202, "grad_norm": 1.6856626272201538, "learning_rate": 1.8038157411101597e-08, "loss": 0.5102, "step": 7615 }, { "epoch": 4.957709824333117, "grad_norm": 2.0251147747039795, "learning_rate": 1.6130972332345505e-08, "loss": 0.5112, "step": 7620 }, { "epoch": 4.960962914769031, "grad_norm": 2.00230073928833, "learning_rate": 1.4330289381844775e-08, "loss": 0.5224, "step": 7625 }, { "epoch": 4.964216005204944, "grad_norm": 1.570320963859558, "learning_rate": 1.2636116235612005e-08, "loss": 0.5315, "step": 7630 }, { "epoch": 4.967469095640859, "grad_norm": 1.6889746189117432, "learning_rate": 1.1048460115634096e-08, "loss": 0.5193, "step": 7635 }, { "epoch": 4.970722186076773, "grad_norm": 2.201413631439209, "learning_rate": 9.567327789825054e-09, "loss": 0.5286, "step": 7640 }, { "epoch": 4.973975276512687, "grad_norm": 1.7942432165145874, "learning_rate": 8.192725572006565e-09, "loss": 0.5211, "step": 7645 }, { "epoch": 4.9772283669486015, "grad_norm": 1.7889728546142578, "learning_rate": 6.924659321888571e-09, "loss": 0.5164, "step": 7650 }, { "epoch": 4.980481457384515, "grad_norm": 2.543469190597534, "learning_rate": 5.763134445022078e-09, "loss": 0.5054, "step": 7655 }, { "epoch": 4.983734547820429, "grad_norm": 5.216160297393799, "learning_rate": 4.7081558927991594e-09, "loss": 0.4954, "step": 7660 }, { "epoch": 4.986987638256344, "grad_norm": 2.649937868118286, "learning_rate": 3.759728162422427e-09, "loss": 0.5127, "step": 7665 }, { "epoch": 4.990240728692258, "grad_norm": 1.9266875982284546, "learning_rate": 2.9178552968800454e-09, "loss": 0.5304, "step": 7670 }, { "epoch": 4.9934938191281715, "grad_norm": 1.6233766078948975, "learning_rate": 2.1825408849401873e-09, "loss": 0.5277, "step": 7675 }, { "epoch": 4.996746909564086, "grad_norm": 
1.7894667387008667, "learning_rate": 1.5537880611260491e-09, "loss": 0.5239, "step": 7680 }, { "epoch": 5.0, "grad_norm": 1.5771998167037964, "learning_rate": 1.0315995057075256e-09, "loss": 0.5174, "step": 7685 }, { "epoch": 5.0, "eval_f1": 0.7944165410554209, "eval_loss": 0.54638671875, "eval_precision": 0.7945063287994906, "eval_recall": 0.7943501451962497, "eval_runtime": 257.0765, "eval_samples_per_second": 1530.42, "eval_steps_per_second": 1.498, "step": 7685 }, { "epoch": 5.0, "step": 7685, "total_flos": 5.363134814553637e+18, "train_loss": 0.7729351929743412, "train_runtime": 35402.7725, "train_samples_per_second": 444.524, "train_steps_per_second": 0.217 } ], "logging_steps": 5, "max_steps": 7685, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.363134814553637e+18, "train_batch_size": 512, "trial_name": null, "trial_params": null }
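
A minimal sketch (not part of the original log) of how this trainer_state.json might be consumed after training, assuming the file sits in the checkpoint output directory and that matplotlib is available; the STATE_PATH below is hypothetical. It reads log_history, separates the per-step training entries (which carry a "loss" key) from the evaluation entries (which carry "eval_*" keys), prints the reported best and final F1, and plots the training-loss curve.

# Sketch only: parse the trainer_state.json shown above and plot its loss curve.
import json

import matplotlib.pyplot as plt

# Hypothetical path; adjust to wherever the checkpoint directory actually lives.
STATE_PATH = "modernBERT-base-sentiment/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Training entries log "loss" every logging_steps steps; eval entries log "eval_*" keys;
# the final summary entry logs "train_loss"/"train_runtime" and is skipped by both filters.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_f1" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

print(f"best F1 = {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
print(f"final eval F1 = {eval_logs[-1]['eval_f1']:.4f}, "
      f"eval loss = {eval_logs[-1]['eval_loss']:.4f}")

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("ModernBERT sentiment fine-tuning (logging_steps = 5)")
plt.savefig("loss_curve.png")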