diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,70 +1,10841 @@ { - "best_metric": 0.12910686958067819, - "best_model_checkpoint": "./output/modernBERT-base-sentiment-v2/checkpoint-8", - "epoch": 2.0, + "best_metric": 0.8012369099843738, + "best_model_checkpoint": "/data/hungnm/unisentiment/modernBERT-base-sentiment/checkpoint-4611", + "epoch": 5.0, "eval_steps": 500, - "global_step": 16, + "global_step": 7685, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.625, + "epoch": 0.0032530904359141183, "grad_norm": 0.0, "learning_rate": 0.0, - "loss": 1.8373, + "loss": 2.4862, "step": 5 }, { - "epoch": 1.0, - "eval_f1": 0.12910686958067819, - "eval_loss": 1.8330078125, - "eval_precision": 0.16504066117321736, - "eval_recall": 0.1890018282051825, - "eval_runtime": 8.3441, - "eval_samples_per_second": 239.691, - "eval_steps_per_second": 0.12, - "step": 8 - }, - { - "epoch": 1.25, + "epoch": 0.006506180871828237, "grad_norm": 0.0, "learning_rate": 0.0, - "loss": 1.8336, + "loss": 2.4619, "step": 10 }, { - "epoch": 1.875, + "epoch": 0.009759271307742356, "grad_norm": 0.0, "learning_rate": 0.0, - "loss": 1.8364, + "loss": 2.474, "step": 15 }, { - "epoch": 2.0, - "eval_f1": 0.12910686958067819, - "eval_loss": 1.8330078125, - "eval_precision": 0.16504066117321736, - "eval_recall": 0.1890018282051825, - "eval_runtime": 0.2749, - "eval_samples_per_second": 7276.302, - "eval_steps_per_second": 3.638, - "step": 16 + "epoch": 0.013012361743656473, + "grad_norm": 5.975010395050049, + "learning_rate": 2.5974025974025976e-06, + "loss": 2.4748, + "step": 20 }, { - "epoch": 2.0, - "step": 16, - "total_flos": 1489153141243904.0, - "train_loss": 1.836273193359375, - "train_runtime": 55.8529, - "train_samples_per_second": 572.934, - "train_steps_per_second": 0.286 + "epoch": 0.01626545217957059, + "grad_norm": 4.729438781738281, + "learning_rate": 5.194805194805195e-06, + "loss": 2.4383, + "step": 25 + }, + { + "epoch": 0.01951854261548471, + "grad_norm": 4.140679359436035, + "learning_rate": 8.441558441558442e-06, + "loss": 2.2384, + "step": 30 + }, + { + "epoch": 0.02277163305139883, + "grad_norm": 2.7495357990264893, + "learning_rate": 1.1688311688311688e-05, + "loss": 2.16, + "step": 35 + }, + { + "epoch": 0.026024723487312947, + "grad_norm": 1.4239497184753418, + "learning_rate": 1.4935064935064936e-05, + "loss": 2.0898, + "step": 40 + }, + { + "epoch": 0.029277813923227064, + "grad_norm": 1.3778964281082153, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.037, + "step": 45 + }, + { + "epoch": 0.03253090435914118, + "grad_norm": 1.6160250902175903, + "learning_rate": 2.1428571428571428e-05, + "loss": 2.0056, + "step": 50 + }, + { + "epoch": 0.035783994795055306, + "grad_norm": 1.090104579925537, + "learning_rate": 2.4675324675324678e-05, + "loss": 1.9513, + "step": 55 + }, + { + "epoch": 0.03903708523096942, + "grad_norm": 2.1062819957733154, + "learning_rate": 2.792207792207792e-05, + "loss": 1.9023, + "step": 60 + }, + { + "epoch": 0.04229017566688354, + "grad_norm": 3.310304880142212, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.877, + "step": 65 + }, + { + "epoch": 0.04554326610279766, + "grad_norm": 5.446138858795166, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.822, + "step": 70 + }, + { + "epoch": 0.048796356538711776, + "grad_norm": 1.910844087600708, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.7707, + "step": 75 + }, + { + "epoch": 0.05204944697462589, + "grad_norm": 5.207052707672119, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.7986, + "step": 80 + }, + { + "epoch": 0.05530253741054001, + "grad_norm": 4.687050819396973, + "learning_rate": 4.415584415584416e-05, + "loss": 1.7189, + "step": 85 + }, + { + "epoch": 0.05855562784645413, + "grad_norm": 4.655097961425781, + "learning_rate": 4.740259740259741e-05, + "loss": 1.7185, + "step": 90 + }, + { + "epoch": 0.06180871828236825, + "grad_norm": 5.834875106811523, + "learning_rate": 4.999999786858144e-05, + "loss": 1.6804, + "step": 95 + }, + { + "epoch": 0.06506180871828236, + "grad_norm": 2.986246109008789, + "learning_rate": 4.99999232689698e-05, + "loss": 1.6772, + "step": 100 + }, + { + "epoch": 0.06831489915419649, + "grad_norm": 1.4194883108139038, + "learning_rate": 4.999974209879331e-05, + "loss": 1.602, + "step": 105 + }, + { + "epoch": 0.07156798959011061, + "grad_norm": 3.983574628829956, + "learning_rate": 4.999945435882428e-05, + "loss": 1.5656, + "step": 110 + }, + { + "epoch": 0.07482108002602472, + "grad_norm": 1.342112421989441, + "learning_rate": 4.9999060050289286e-05, + "loss": 1.511, + "step": 115 + }, + { + "epoch": 0.07807417046193885, + "grad_norm": 2.197117805480957, + "learning_rate": 4.999855917486921e-05, + "loss": 1.4768, + "step": 120 + }, + { + "epoch": 0.08132726089785296, + "grad_norm": 1.8786858320236206, + "learning_rate": 4.999795173469919e-05, + "loss": 1.473, + "step": 125 + }, + { + "epoch": 0.08458035133376708, + "grad_norm": 2.5618531703948975, + "learning_rate": 4.9997237732368645e-05, + "loss": 1.4527, + "step": 130 + }, + { + "epoch": 0.08783344176968119, + "grad_norm": 1.8612209558486938, + "learning_rate": 4.999641717092126e-05, + "loss": 1.4092, + "step": 135 + }, + { + "epoch": 0.09108653220559532, + "grad_norm": 1.912489891052246, + "learning_rate": 4.999549005385494e-05, + "loss": 1.3939, + "step": 140 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 2.8550467491149902, + "learning_rate": 4.999445638512185e-05, + "loss": 1.3562, + "step": 145 + }, + { + "epoch": 0.09759271307742355, + "grad_norm": 1.902714729309082, + "learning_rate": 4.9993316169128334e-05, + "loss": 1.3427, + "step": 150 + }, + { + "epoch": 0.10084580351333768, + "grad_norm": 3.12044620513916, + "learning_rate": 4.999206941073496e-05, + "loss": 1.3634, + "step": 155 + }, + { + "epoch": 0.10409889394925179, + "grad_norm": 2.6095197200775146, + "learning_rate": 4.999071611525643e-05, + "loss": 1.3605, + "step": 160 + }, + { + "epoch": 0.10735198438516591, + "grad_norm": 2.5530121326446533, + "learning_rate": 4.998925628846164e-05, + "loss": 1.3444, + "step": 165 + }, + { + "epoch": 0.11060507482108002, + "grad_norm": 1.9909695386886597, + "learning_rate": 4.99876899365736e-05, + "loss": 1.3192, + "step": 170 + }, + { + "epoch": 0.11385816525699415, + "grad_norm": 1.21974515914917, + "learning_rate": 4.998601706626938e-05, + "loss": 1.3085, + "step": 175 + }, + { + "epoch": 0.11711125569290826, + "grad_norm": 1.2985081672668457, + "learning_rate": 4.9984237684680194e-05, + "loss": 1.2848, + "step": 180 + }, + { + "epoch": 0.12036434612882238, + "grad_norm": 2.141941785812378, + "learning_rate": 4.998235179939122e-05, + "loss": 1.2729, + "step": 185 + }, + { + "epoch": 0.1236174365647365, + "grad_norm": 1.9323813915252686, + "learning_rate": 4.998035941844167e-05, + "loss": 1.275, + "step": 190 + }, + { + "epoch": 0.12687052700065063, + "grad_norm": 2.6978371143341064, + "learning_rate": 4.997826055032476e-05, + "loss": 1.2825, + "step": 195 + }, + { + "epoch": 0.13012361743656473, + "grad_norm": 2.018090009689331, + "learning_rate": 4.997605520398762e-05, + "loss": 1.2656, + "step": 200 + }, + { + "epoch": 0.13337670787247885, + "grad_norm": 1.0469837188720703, + "learning_rate": 4.997374338883127e-05, + "loss": 1.2584, + "step": 205 + }, + { + "epoch": 0.13662979830839297, + "grad_norm": 1.2959955930709839, + "learning_rate": 4.99713251147106e-05, + "loss": 1.2494, + "step": 210 + }, + { + "epoch": 0.1398828887443071, + "grad_norm": 2.215878486633301, + "learning_rate": 4.996880039193431e-05, + "loss": 1.2482, + "step": 215 + }, + { + "epoch": 0.14313597918022122, + "grad_norm": 1.711484432220459, + "learning_rate": 4.996616923126488e-05, + "loss": 1.2258, + "step": 220 + }, + { + "epoch": 0.14638906961613532, + "grad_norm": 1.5809857845306396, + "learning_rate": 4.996343164391853e-05, + "loss": 1.223, + "step": 225 + }, + { + "epoch": 0.14964216005204944, + "grad_norm": 1.6745812892913818, + "learning_rate": 4.9960587641565125e-05, + "loss": 1.2151, + "step": 230 + }, + { + "epoch": 0.15289525048796357, + "grad_norm": 1.5372675657272339, + "learning_rate": 4.9957637236328195e-05, + "loss": 1.1983, + "step": 235 + }, + { + "epoch": 0.1561483409238777, + "grad_norm": 1.5290815830230713, + "learning_rate": 4.995458044078482e-05, + "loss": 1.24, + "step": 240 + }, + { + "epoch": 0.1594014313597918, + "grad_norm": 1.4023972749710083, + "learning_rate": 4.9951417267965626e-05, + "loss": 1.1897, + "step": 245 + }, + { + "epoch": 0.16265452179570591, + "grad_norm": 1.8283660411834717, + "learning_rate": 4.99481477313547e-05, + "loss": 1.2029, + "step": 250 + }, + { + "epoch": 0.16590761223162004, + "grad_norm": 1.8741523027420044, + "learning_rate": 4.9944771844889524e-05, + "loss": 1.19, + "step": 255 + }, + { + "epoch": 0.16916070266753416, + "grad_norm": 1.552556037902832, + "learning_rate": 4.994128962296097e-05, + "loss": 1.1946, + "step": 260 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.1094107627868652, + "learning_rate": 4.9937701080413165e-05, + "loss": 1.1756, + "step": 265 + }, + { + "epoch": 0.17566688353936238, + "grad_norm": 1.7123149633407593, + "learning_rate": 4.993400623254347e-05, + "loss": 1.1789, + "step": 270 + }, + { + "epoch": 0.1789199739752765, + "grad_norm": 1.2891788482666016, + "learning_rate": 4.993020509510243e-05, + "loss": 1.1833, + "step": 275 + }, + { + "epoch": 0.18217306441119063, + "grad_norm": 1.2659103870391846, + "learning_rate": 4.992629768429367e-05, + "loss": 1.1697, + "step": 280 + }, + { + "epoch": 0.18542615484710476, + "grad_norm": 1.602931022644043, + "learning_rate": 4.992228401677382e-05, + "loss": 1.16, + "step": 285 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 1.1984357833862305, + "learning_rate": 4.99181641096525e-05, + "loss": 1.1415, + "step": 290 + }, + { + "epoch": 0.19193233571893298, + "grad_norm": 2.036529302597046, + "learning_rate": 4.991393798049219e-05, + "loss": 1.168, + "step": 295 + }, + { + "epoch": 0.1951854261548471, + "grad_norm": 1.9513144493103027, + "learning_rate": 4.990960564730819e-05, + "loss": 1.1623, + "step": 300 + }, + { + "epoch": 0.19843851659076123, + "grad_norm": 1.2966268062591553, + "learning_rate": 4.9905167128568516e-05, + "loss": 1.143, + "step": 305 + }, + { + "epoch": 0.20169160702667535, + "grad_norm": 1.3897426128387451, + "learning_rate": 4.990062244319387e-05, + "loss": 1.1431, + "step": 310 + }, + { + "epoch": 0.20494469746258945, + "grad_norm": 1.7485623359680176, + "learning_rate": 4.989597161055746e-05, + "loss": 1.1507, + "step": 315 + }, + { + "epoch": 0.20819778789850357, + "grad_norm": 1.1369644403457642, + "learning_rate": 4.989121465048505e-05, + "loss": 1.1447, + "step": 320 + }, + { + "epoch": 0.2114508783344177, + "grad_norm": 1.292037844657898, + "learning_rate": 4.988635158325476e-05, + "loss": 1.1289, + "step": 325 + }, + { + "epoch": 0.21470396877033182, + "grad_norm": 1.1460140943527222, + "learning_rate": 4.988138242959707e-05, + "loss": 1.1314, + "step": 330 + }, + { + "epoch": 0.21795705920624595, + "grad_norm": 1.9661816358566284, + "learning_rate": 4.987630721069465e-05, + "loss": 1.147, + "step": 335 + }, + { + "epoch": 0.22121014964216004, + "grad_norm": 1.3988662958145142, + "learning_rate": 4.987112594818232e-05, + "loss": 1.1443, + "step": 340 + }, + { + "epoch": 0.22446324007807417, + "grad_norm": 1.6520105600357056, + "learning_rate": 4.986583866414696e-05, + "loss": 1.1089, + "step": 345 + }, + { + "epoch": 0.2277163305139883, + "grad_norm": 1.6153268814086914, + "learning_rate": 4.9860445381127385e-05, + "loss": 1.1279, + "step": 350 + }, + { + "epoch": 0.23096942094990242, + "grad_norm": 1.0572576522827148, + "learning_rate": 4.985494612211429e-05, + "loss": 1.1073, + "step": 355 + }, + { + "epoch": 0.2342225113858165, + "grad_norm": 1.1980561017990112, + "learning_rate": 4.984934091055009e-05, + "loss": 1.1161, + "step": 360 + }, + { + "epoch": 0.23747560182173064, + "grad_norm": 3.1612489223480225, + "learning_rate": 4.98436297703289e-05, + "loss": 1.1473, + "step": 365 + }, + { + "epoch": 0.24072869225764476, + "grad_norm": 1.7351305484771729, + "learning_rate": 4.983781272579636e-05, + "loss": 1.1282, + "step": 370 + }, + { + "epoch": 0.24398178269355889, + "grad_norm": 1.4272353649139404, + "learning_rate": 4.983188980174958e-05, + "loss": 1.1486, + "step": 375 + }, + { + "epoch": 0.247234873129473, + "grad_norm": 1.6868839263916016, + "learning_rate": 4.9825861023437016e-05, + "loss": 1.1224, + "step": 380 + }, + { + "epoch": 0.2504879635653871, + "grad_norm": 1.1032485961914062, + "learning_rate": 4.981972641655835e-05, + "loss": 1.1186, + "step": 385 + }, + { + "epoch": 0.25374105400130126, + "grad_norm": 1.0825129747390747, + "learning_rate": 4.981348600726441e-05, + "loss": 1.093, + "step": 390 + }, + { + "epoch": 0.25699414443721535, + "grad_norm": 1.0156402587890625, + "learning_rate": 4.980713982215703e-05, + "loss": 1.0873, + "step": 395 + }, + { + "epoch": 0.26024723487312945, + "grad_norm": 2.106105089187622, + "learning_rate": 4.9800687888288964e-05, + "loss": 1.0924, + "step": 400 + }, + { + "epoch": 0.2635003253090436, + "grad_norm": 1.6301723718643188, + "learning_rate": 4.9794130233163735e-05, + "loss": 1.1063, + "step": 405 + }, + { + "epoch": 0.2667534157449577, + "grad_norm": 1.30489981174469, + "learning_rate": 4.978746688473556e-05, + "loss": 1.0993, + "step": 410 + }, + { + "epoch": 0.27000650618087185, + "grad_norm": 1.1064469814300537, + "learning_rate": 4.978069787140919e-05, + "loss": 1.093, + "step": 415 + }, + { + "epoch": 0.27325959661678595, + "grad_norm": 1.1742445230484009, + "learning_rate": 4.977382322203982e-05, + "loss": 1.0848, + "step": 420 + }, + { + "epoch": 0.27651268705270005, + "grad_norm": 1.0716508626937866, + "learning_rate": 4.976684296593295e-05, + "loss": 1.1157, + "step": 425 + }, + { + "epoch": 0.2797657774886142, + "grad_norm": 1.4256720542907715, + "learning_rate": 4.9759757132844256e-05, + "loss": 1.0835, + "step": 430 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 1.2922230958938599, + "learning_rate": 4.975256575297949e-05, + "loss": 1.0804, + "step": 435 + }, + { + "epoch": 0.28627195836044245, + "grad_norm": 1.5222572088241577, + "learning_rate": 4.974526885699432e-05, + "loss": 1.077, + "step": 440 + }, + { + "epoch": 0.28952504879635654, + "grad_norm": 1.023868441581726, + "learning_rate": 4.973786647599422e-05, + "loss": 1.0782, + "step": 445 + }, + { + "epoch": 0.29277813923227064, + "grad_norm": 1.7092077732086182, + "learning_rate": 4.9730358641534324e-05, + "loss": 1.1011, + "step": 450 + }, + { + "epoch": 0.2960312296681848, + "grad_norm": 1.0816203355789185, + "learning_rate": 4.9722745385619285e-05, + "loss": 1.0857, + "step": 455 + }, + { + "epoch": 0.2992843201040989, + "grad_norm": 0.9598567485809326, + "learning_rate": 4.971502674070317e-05, + "loss": 1.0874, + "step": 460 + }, + { + "epoch": 0.302537410540013, + "grad_norm": 1.1397418975830078, + "learning_rate": 4.970720273968929e-05, + "loss": 1.0743, + "step": 465 + }, + { + "epoch": 0.30579050097592714, + "grad_norm": 1.6813876628875732, + "learning_rate": 4.969927341593008e-05, + "loss": 1.0587, + "step": 470 + }, + { + "epoch": 0.30904359141184123, + "grad_norm": 1.4590063095092773, + "learning_rate": 4.9691238803226944e-05, + "loss": 1.0706, + "step": 475 + }, + { + "epoch": 0.3122966818477554, + "grad_norm": 0.988750696182251, + "learning_rate": 4.9683098935830115e-05, + "loss": 1.0569, + "step": 480 + }, + { + "epoch": 0.3155497722836695, + "grad_norm": 1.0971347093582153, + "learning_rate": 4.9674853848438506e-05, + "loss": 1.0441, + "step": 485 + }, + { + "epoch": 0.3188028627195836, + "grad_norm": 1.0693708658218384, + "learning_rate": 4.9666503576199574e-05, + "loss": 1.0644, + "step": 490 + }, + { + "epoch": 0.32205595315549773, + "grad_norm": 1.2514370679855347, + "learning_rate": 4.965804815470916e-05, + "loss": 1.0609, + "step": 495 + }, + { + "epoch": 0.32530904359141183, + "grad_norm": 1.5080784559249878, + "learning_rate": 4.964948762001133e-05, + "loss": 1.0682, + "step": 500 + }, + { + "epoch": 0.328562134027326, + "grad_norm": 1.1908406019210815, + "learning_rate": 4.964082200859824e-05, + "loss": 1.0418, + "step": 505 + }, + { + "epoch": 0.3318152244632401, + "grad_norm": 1.6586133241653442, + "learning_rate": 4.963205135740997e-05, + "loss": 1.0668, + "step": 510 + }, + { + "epoch": 0.3350683148991542, + "grad_norm": 0.7452509999275208, + "learning_rate": 4.962317570383436e-05, + "loss": 1.0508, + "step": 515 + }, + { + "epoch": 0.3383214053350683, + "grad_norm": 1.3133275508880615, + "learning_rate": 4.961419508570686e-05, + "loss": 1.0543, + "step": 520 + }, + { + "epoch": 0.3415744957709824, + "grad_norm": 1.1373653411865234, + "learning_rate": 4.960510954131038e-05, + "loss": 1.0711, + "step": 525 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 1.12503981590271, + "learning_rate": 4.95959191093751e-05, + "loss": 1.0486, + "step": 530 + }, + { + "epoch": 0.34808067664281067, + "grad_norm": 0.921503484249115, + "learning_rate": 4.95866238290783e-05, + "loss": 1.0543, + "step": 535 + }, + { + "epoch": 0.35133376707872477, + "grad_norm": 0.9198605418205261, + "learning_rate": 4.957722374004427e-05, + "loss": 1.0438, + "step": 540 + }, + { + "epoch": 0.3545868575146389, + "grad_norm": 1.630878210067749, + "learning_rate": 4.9567718882344015e-05, + "loss": 1.0544, + "step": 545 + }, + { + "epoch": 0.357839947950553, + "grad_norm": 2.2188167572021484, + "learning_rate": 4.95581092964952e-05, + "loss": 1.0541, + "step": 550 + }, + { + "epoch": 0.36109303838646717, + "grad_norm": 0.9371961355209351, + "learning_rate": 4.95483950234619e-05, + "loss": 1.0723, + "step": 555 + }, + { + "epoch": 0.36434612882238127, + "grad_norm": 1.0933233499526978, + "learning_rate": 4.9538576104654466e-05, + "loss": 1.052, + "step": 560 + }, + { + "epoch": 0.36759921925829536, + "grad_norm": 1.1232990026474, + "learning_rate": 4.9528652581929335e-05, + "loss": 1.0354, + "step": 565 + }, + { + "epoch": 0.3708523096942095, + "grad_norm": 1.000786542892456, + "learning_rate": 4.951862449758885e-05, + "loss": 1.0407, + "step": 570 + }, + { + "epoch": 0.3741054001301236, + "grad_norm": 0.939582884311676, + "learning_rate": 4.9508491894381104e-05, + "loss": 1.0206, + "step": 575 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 1.264381766319275, + "learning_rate": 4.9498254815499694e-05, + "loss": 1.0362, + "step": 580 + }, + { + "epoch": 0.38061158100195186, + "grad_norm": 0.673314094543457, + "learning_rate": 4.948791330458363e-05, + "loss": 1.0381, + "step": 585 + }, + { + "epoch": 0.38386467143786596, + "grad_norm": 1.441362738609314, + "learning_rate": 4.947746740571706e-05, + "loss": 1.0354, + "step": 590 + }, + { + "epoch": 0.3871177618737801, + "grad_norm": 1.1851030588150024, + "learning_rate": 4.9466917163429124e-05, + "loss": 1.0146, + "step": 595 + }, + { + "epoch": 0.3903708523096942, + "grad_norm": 0.9171844124794006, + "learning_rate": 4.94562626226938e-05, + "loss": 1.0103, + "step": 600 + }, + { + "epoch": 0.3936239427456083, + "grad_norm": 1.5662965774536133, + "learning_rate": 4.944550382892962e-05, + "loss": 1.0466, + "step": 605 + }, + { + "epoch": 0.39687703318152245, + "grad_norm": 1.1077489852905273, + "learning_rate": 4.943464082799955e-05, + "loss": 1.0458, + "step": 610 + }, + { + "epoch": 0.40013012361743655, + "grad_norm": 1.5997633934020996, + "learning_rate": 4.942367366621081e-05, + "loss": 1.0464, + "step": 615 + }, + { + "epoch": 0.4033832140533507, + "grad_norm": 1.0540611743927002, + "learning_rate": 4.9412602390314585e-05, + "loss": 1.0242, + "step": 620 + }, + { + "epoch": 0.4066363044892648, + "grad_norm": 1.1247586011886597, + "learning_rate": 4.94014270475059e-05, + "loss": 1.0232, + "step": 625 + }, + { + "epoch": 0.4098893949251789, + "grad_norm": 1.065820336341858, + "learning_rate": 4.939014768542342e-05, + "loss": 1.0137, + "step": 630 + }, + { + "epoch": 0.41314248536109305, + "grad_norm": 0.8374763131141663, + "learning_rate": 4.93787643521492e-05, + "loss": 1.0203, + "step": 635 + }, + { + "epoch": 0.41639557579700714, + "grad_norm": 0.7515140771865845, + "learning_rate": 4.936727709620853e-05, + "loss": 1.0176, + "step": 640 + }, + { + "epoch": 0.4196486662329213, + "grad_norm": 0.8034088015556335, + "learning_rate": 4.9355685966569684e-05, + "loss": 1.0322, + "step": 645 + }, + { + "epoch": 0.4229017566688354, + "grad_norm": 1.2314985990524292, + "learning_rate": 4.934399101264375e-05, + "loss": 1.0198, + "step": 650 + }, + { + "epoch": 0.4261548471047495, + "grad_norm": 1.342058539390564, + "learning_rate": 4.93321922842844e-05, + "loss": 1.0133, + "step": 655 + }, + { + "epoch": 0.42940793754066364, + "grad_norm": 0.8881794214248657, + "learning_rate": 4.932028983178766e-05, + "loss": 1.0255, + "step": 660 + }, + { + "epoch": 0.43266102797657774, + "grad_norm": 1.3695508241653442, + "learning_rate": 4.9308283705891736e-05, + "loss": 1.0293, + "step": 665 + }, + { + "epoch": 0.4359141184124919, + "grad_norm": 0.9350308179855347, + "learning_rate": 4.9296173957776776e-05, + "loss": 1.03, + "step": 670 + }, + { + "epoch": 0.439167208848406, + "grad_norm": 0.9181856513023376, + "learning_rate": 4.928396063906463e-05, + "loss": 1.0234, + "step": 675 + }, + { + "epoch": 0.4424202992843201, + "grad_norm": 1.352927803993225, + "learning_rate": 4.927164380181869e-05, + "loss": 1.0474, + "step": 680 + }, + { + "epoch": 0.44567338972023424, + "grad_norm": 1.176147222518921, + "learning_rate": 4.9259223498543597e-05, + "loss": 1.0329, + "step": 685 + }, + { + "epoch": 0.44892648015614833, + "grad_norm": 1.0797678232192993, + "learning_rate": 4.9246699782185055e-05, + "loss": 1.0141, + "step": 690 + }, + { + "epoch": 0.4521795705920625, + "grad_norm": 0.9696300029754639, + "learning_rate": 4.9234072706129627e-05, + "loss": 0.999, + "step": 695 + }, + { + "epoch": 0.4554326610279766, + "grad_norm": 0.9436845779418945, + "learning_rate": 4.922134232420445e-05, + "loss": 1.0003, + "step": 700 + }, + { + "epoch": 0.4586857514638907, + "grad_norm": 1.1857705116271973, + "learning_rate": 4.920850869067706e-05, + "loss": 0.9831, + "step": 705 + }, + { + "epoch": 0.46193884189980483, + "grad_norm": 0.9158900380134583, + "learning_rate": 4.919557186025512e-05, + "loss": 1.0201, + "step": 710 + }, + { + "epoch": 0.4651919323357189, + "grad_norm": 0.8820152282714844, + "learning_rate": 4.9182531888086205e-05, + "loss": 0.9852, + "step": 715 + }, + { + "epoch": 0.468445022771633, + "grad_norm": 1.5595647096633911, + "learning_rate": 4.916938882975759e-05, + "loss": 1.0002, + "step": 720 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 1.1958764791488647, + "learning_rate": 4.915614274129597e-05, + "loss": 1.0375, + "step": 725 + }, + { + "epoch": 0.4749512036434613, + "grad_norm": 1.1134103536605835, + "learning_rate": 4.914279367916724e-05, + "loss": 1.0208, + "step": 730 + }, + { + "epoch": 0.4782042940793754, + "grad_norm": 0.8463726043701172, + "learning_rate": 4.9129341700276266e-05, + "loss": 0.9955, + "step": 735 + }, + { + "epoch": 0.4814573845152895, + "grad_norm": 0.8405961394309998, + "learning_rate": 4.911578686196661e-05, + "loss": 0.9754, + "step": 740 + }, + { + "epoch": 0.4847104749512036, + "grad_norm": 1.0310126543045044, + "learning_rate": 4.9102129222020324e-05, + "loss": 1.0213, + "step": 745 + }, + { + "epoch": 0.48796356538711777, + "grad_norm": 1.058269739151001, + "learning_rate": 4.908836883865768e-05, + "loss": 0.9966, + "step": 750 + }, + { + "epoch": 0.49121665582303187, + "grad_norm": 0.9762022495269775, + "learning_rate": 4.907450577053694e-05, + "loss": 1.0059, + "step": 755 + }, + { + "epoch": 0.494469746258946, + "grad_norm": 0.8593292832374573, + "learning_rate": 4.906054007675408e-05, + "loss": 0.9922, + "step": 760 + }, + { + "epoch": 0.4977228366948601, + "grad_norm": 1.3241448402404785, + "learning_rate": 4.9046471816842565e-05, + "loss": 1.007, + "step": 765 + }, + { + "epoch": 0.5009759271307742, + "grad_norm": 0.9241655468940735, + "learning_rate": 4.903230105077306e-05, + "loss": 1.0204, + "step": 770 + }, + { + "epoch": 0.5042290175666884, + "grad_norm": 0.8068680763244629, + "learning_rate": 4.9018027838953226e-05, + "loss": 0.9932, + "step": 775 + }, + { + "epoch": 0.5074821080026025, + "grad_norm": 1.2541546821594238, + "learning_rate": 4.900365224222742e-05, + "loss": 0.9945, + "step": 780 + }, + { + "epoch": 0.5107351984385166, + "grad_norm": 0.925835907459259, + "learning_rate": 4.898917432187644e-05, + "loss": 0.9745, + "step": 785 + }, + { + "epoch": 0.5139882888744307, + "grad_norm": 0.7561518549919128, + "learning_rate": 4.897459413961729e-05, + "loss": 1.0065, + "step": 790 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 1.056420922279358, + "learning_rate": 4.8959911757602885e-05, + "loss": 0.974, + "step": 795 + }, + { + "epoch": 0.5204944697462589, + "grad_norm": 1.219141960144043, + "learning_rate": 4.89451272384218e-05, + "loss": 0.9926, + "step": 800 + }, + { + "epoch": 0.523747560182173, + "grad_norm": 0.9372319579124451, + "learning_rate": 4.8930240645098027e-05, + "loss": 1.0141, + "step": 805 + }, + { + "epoch": 0.5270006506180872, + "grad_norm": 1.0118193626403809, + "learning_rate": 4.891525204109065e-05, + "loss": 0.9996, + "step": 810 + }, + { + "epoch": 0.5302537410540012, + "grad_norm": 0.91470867395401, + "learning_rate": 4.890016149029365e-05, + "loss": 0.9851, + "step": 815 + }, + { + "epoch": 0.5335068314899154, + "grad_norm": 0.787122368812561, + "learning_rate": 4.888496905703554e-05, + "loss": 0.9969, + "step": 820 + }, + { + "epoch": 0.5367599219258296, + "grad_norm": 0.8628039956092834, + "learning_rate": 4.886967480607918e-05, + "loss": 1.0024, + "step": 825 + }, + { + "epoch": 0.5400130123617437, + "grad_norm": 1.450460433959961, + "learning_rate": 4.885427880262144e-05, + "loss": 0.9743, + "step": 830 + }, + { + "epoch": 0.5432661027976577, + "grad_norm": 1.0362318754196167, + "learning_rate": 4.883878111229296e-05, + "loss": 0.9723, + "step": 835 + }, + { + "epoch": 0.5465191932335719, + "grad_norm": 0.9855751991271973, + "learning_rate": 4.8823181801157844e-05, + "loss": 0.9898, + "step": 840 + }, + { + "epoch": 0.549772283669486, + "grad_norm": 1.0782288312911987, + "learning_rate": 4.880748093571339e-05, + "loss": 0.9727, + "step": 845 + }, + { + "epoch": 0.5530253741054001, + "grad_norm": 1.5194872617721558, + "learning_rate": 4.879167858288982e-05, + "loss": 0.9922, + "step": 850 + }, + { + "epoch": 0.5562784645413142, + "grad_norm": 1.5501078367233276, + "learning_rate": 4.877577481004995e-05, + "loss": 0.9705, + "step": 855 + }, + { + "epoch": 0.5595315549772284, + "grad_norm": 1.5971125364303589, + "learning_rate": 4.875976968498895e-05, + "loss": 1.0078, + "step": 860 + }, + { + "epoch": 0.5627846454131424, + "grad_norm": 0.9124265313148499, + "learning_rate": 4.874366327593406e-05, + "loss": 0.9737, + "step": 865 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.8439720273017883, + "learning_rate": 4.872745565154424e-05, + "loss": 0.9967, + "step": 870 + }, + { + "epoch": 0.5692908262849707, + "grad_norm": 0.9340474009513855, + "learning_rate": 4.871114688090992e-05, + "loss": 0.9934, + "step": 875 + }, + { + "epoch": 0.5725439167208849, + "grad_norm": 0.8820469975471497, + "learning_rate": 4.869473703355273e-05, + "loss": 0.9917, + "step": 880 + }, + { + "epoch": 0.5757970071567989, + "grad_norm": 0.8724156618118286, + "learning_rate": 4.867822617942514e-05, + "loss": 0.9762, + "step": 885 + }, + { + "epoch": 0.5790500975927131, + "grad_norm": 0.9085761308670044, + "learning_rate": 4.866161438891022e-05, + "loss": 0.9686, + "step": 890 + }, + { + "epoch": 0.5823031880286272, + "grad_norm": 0.7215405106544495, + "learning_rate": 4.864490173282128e-05, + "loss": 0.9858, + "step": 895 + }, + { + "epoch": 0.5855562784645413, + "grad_norm": 1.0854041576385498, + "learning_rate": 4.862808828240164e-05, + "loss": 0.9935, + "step": 900 + }, + { + "epoch": 0.5888093689004554, + "grad_norm": 0.8779392242431641, + "learning_rate": 4.861117410932429e-05, + "loss": 0.9816, + "step": 905 + }, + { + "epoch": 0.5920624593363696, + "grad_norm": 1.2866002321243286, + "learning_rate": 4.8594159285691546e-05, + "loss": 0.9818, + "step": 910 + }, + { + "epoch": 0.5953155497722836, + "grad_norm": 0.7991343140602112, + "learning_rate": 4.8577043884034826e-05, + "loss": 0.9592, + "step": 915 + }, + { + "epoch": 0.5985686402081978, + "grad_norm": 0.9553494453430176, + "learning_rate": 4.8559827977314254e-05, + "loss": 0.9943, + "step": 920 + }, + { + "epoch": 0.6018217306441119, + "grad_norm": 1.2053009271621704, + "learning_rate": 4.854251163891843e-05, + "loss": 0.946, + "step": 925 + }, + { + "epoch": 0.605074821080026, + "grad_norm": 0.744791567325592, + "learning_rate": 4.852509494266405e-05, + "loss": 0.9804, + "step": 930 + }, + { + "epoch": 0.6083279115159401, + "grad_norm": 1.2371433973312378, + "learning_rate": 4.850757796279563e-05, + "loss": 0.9902, + "step": 935 + }, + { + "epoch": 0.6115810019518543, + "grad_norm": 0.723250150680542, + "learning_rate": 4.8489960773985174e-05, + "loss": 0.9839, + "step": 940 + }, + { + "epoch": 0.6148340923877684, + "grad_norm": 0.7003908753395081, + "learning_rate": 4.847224345133188e-05, + "loss": 0.9712, + "step": 945 + }, + { + "epoch": 0.6180871828236825, + "grad_norm": 0.8090314865112305, + "learning_rate": 4.845442607036176e-05, + "loss": 0.9631, + "step": 950 + }, + { + "epoch": 0.6213402732595966, + "grad_norm": 0.7971912622451782, + "learning_rate": 4.8436508707027384e-05, + "loss": 0.9722, + "step": 955 + }, + { + "epoch": 0.6245933636955108, + "grad_norm": 0.7696447968482971, + "learning_rate": 4.841849143770754e-05, + "loss": 0.9712, + "step": 960 + }, + { + "epoch": 0.6278464541314248, + "grad_norm": 0.9497612714767456, + "learning_rate": 4.840037433920688e-05, + "loss": 0.9653, + "step": 965 + }, + { + "epoch": 0.631099544567339, + "grad_norm": 1.1326346397399902, + "learning_rate": 4.838215748875562e-05, + "loss": 0.9648, + "step": 970 + }, + { + "epoch": 0.6343526350032531, + "grad_norm": 0.8858407139778137, + "learning_rate": 4.83638409640092e-05, + "loss": 0.9765, + "step": 975 + }, + { + "epoch": 0.6376057254391672, + "grad_norm": 0.9079559445381165, + "learning_rate": 4.834542484304795e-05, + "loss": 0.958, + "step": 980 + }, + { + "epoch": 0.6408588158750813, + "grad_norm": 0.9221760630607605, + "learning_rate": 4.8326909204376776e-05, + "loss": 0.9675, + "step": 985 + }, + { + "epoch": 0.6441119063109955, + "grad_norm": 0.8072174787521362, + "learning_rate": 4.8308294126924794e-05, + "loss": 0.9745, + "step": 990 + }, + { + "epoch": 0.6473649967469096, + "grad_norm": 0.9354230165481567, + "learning_rate": 4.828957969004502e-05, + "loss": 0.9581, + "step": 995 + }, + { + "epoch": 0.6506180871828237, + "grad_norm": 0.8067158460617065, + "learning_rate": 4.827076597351403e-05, + "loss": 0.9669, + "step": 1000 + }, + { + "epoch": 0.6538711776187378, + "grad_norm": 1.0591189861297607, + "learning_rate": 4.825185305753161e-05, + "loss": 0.9682, + "step": 1005 + }, + { + "epoch": 0.657124268054652, + "grad_norm": 0.7701990604400635, + "learning_rate": 4.823284102272041e-05, + "loss": 0.9756, + "step": 1010 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 0.9886049628257751, + "learning_rate": 4.82137299501256e-05, + "loss": 0.9646, + "step": 1015 + }, + { + "epoch": 0.6636304489264802, + "grad_norm": 0.966618537902832, + "learning_rate": 4.819451992121454e-05, + "loss": 0.9673, + "step": 1020 + }, + { + "epoch": 0.6668835393623943, + "grad_norm": 0.987940788269043, + "learning_rate": 4.817521101787646e-05, + "loss": 0.9647, + "step": 1025 + }, + { + "epoch": 0.6701366297983083, + "grad_norm": 0.752627432346344, + "learning_rate": 4.815580332242199e-05, + "loss": 0.9545, + "step": 1030 + }, + { + "epoch": 0.6733897202342225, + "grad_norm": 1.0263205766677856, + "learning_rate": 4.813629691758299e-05, + "loss": 0.9479, + "step": 1035 + }, + { + "epoch": 0.6766428106701367, + "grad_norm": 0.8434374332427979, + "learning_rate": 4.811669188651204e-05, + "loss": 0.9747, + "step": 1040 + }, + { + "epoch": 0.6798959011060507, + "grad_norm": 0.8626881837844849, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.9713, + "step": 1045 + }, + { + "epoch": 0.6831489915419648, + "grad_norm": 0.8781446814537048, + "learning_rate": 4.8077186280386475e-05, + "loss": 0.964, + "step": 1050 + }, + { + "epoch": 0.686402081977879, + "grad_norm": 0.8338606953620911, + "learning_rate": 4.8057285873737765e-05, + "loss": 0.9916, + "step": 1055 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.8619135022163391, + "learning_rate": 4.803728717766821e-05, + "loss": 0.9562, + "step": 1060 + }, + { + "epoch": 0.6929082628497072, + "grad_norm": 0.8325028419494629, + "learning_rate": 4.8017190277428956e-05, + "loss": 0.9494, + "step": 1065 + }, + { + "epoch": 0.6961613532856213, + "grad_norm": 0.772607684135437, + "learning_rate": 4.799699525868979e-05, + "loss": 0.9783, + "step": 1070 + }, + { + "epoch": 0.6994144437215355, + "grad_norm": 0.7735521793365479, + "learning_rate": 4.797670220753876e-05, + "loss": 0.966, + "step": 1075 + }, + { + "epoch": 0.7026675341574495, + "grad_norm": 0.8032121062278748, + "learning_rate": 4.79563112104818e-05, + "loss": 0.9569, + "step": 1080 + }, + { + "epoch": 0.7059206245933637, + "grad_norm": 0.9248620271682739, + "learning_rate": 4.7935822354442397e-05, + "loss": 0.9676, + "step": 1085 + }, + { + "epoch": 0.7091737150292778, + "grad_norm": 0.6317049264907837, + "learning_rate": 4.7915235726761154e-05, + "loss": 0.9443, + "step": 1090 + }, + { + "epoch": 0.7124268054651919, + "grad_norm": 0.9738350510597229, + "learning_rate": 4.789455141519551e-05, + "loss": 0.9693, + "step": 1095 + }, + { + "epoch": 0.715679895901106, + "grad_norm": 0.7499257922172546, + "learning_rate": 4.7873769507919266e-05, + "loss": 0.958, + "step": 1100 + }, + { + "epoch": 0.7189329863370202, + "grad_norm": 0.8857749700546265, + "learning_rate": 4.785289009352227e-05, + "loss": 0.9596, + "step": 1105 + }, + { + "epoch": 0.7221860767729343, + "grad_norm": 0.7081575393676758, + "learning_rate": 4.7831913261010066e-05, + "loss": 0.9454, + "step": 1110 + }, + { + "epoch": 0.7254391672088484, + "grad_norm": 0.8387717604637146, + "learning_rate": 4.781083909980342e-05, + "loss": 0.9472, + "step": 1115 + }, + { + "epoch": 0.7286922576447625, + "grad_norm": 0.9755154848098755, + "learning_rate": 4.778966769973802e-05, + "loss": 0.9668, + "step": 1120 + }, + { + "epoch": 0.7319453480806767, + "grad_norm": 0.7101641893386841, + "learning_rate": 4.7768399151064076e-05, + "loss": 0.9457, + "step": 1125 + }, + { + "epoch": 0.7351984385165907, + "grad_norm": 0.9372628331184387, + "learning_rate": 4.774703354444591e-05, + "loss": 0.9709, + "step": 1130 + }, + { + "epoch": 0.7384515289525049, + "grad_norm": 0.9276643991470337, + "learning_rate": 4.7725570970961586e-05, + "loss": 0.9586, + "step": 1135 + }, + { + "epoch": 0.741704619388419, + "grad_norm": 0.7329192757606506, + "learning_rate": 4.770401152210253e-05, + "loss": 0.9608, + "step": 1140 + }, + { + "epoch": 0.7449577098243331, + "grad_norm": 0.7759012579917908, + "learning_rate": 4.768235528977314e-05, + "loss": 0.9469, + "step": 1145 + }, + { + "epoch": 0.7482108002602472, + "grad_norm": 1.2127937078475952, + "learning_rate": 4.766060236629037e-05, + "loss": 0.9542, + "step": 1150 + }, + { + "epoch": 0.7514638906961614, + "grad_norm": 0.7369085550308228, + "learning_rate": 4.763875284438336e-05, + "loss": 0.9643, + "step": 1155 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.7963067293167114, + "learning_rate": 4.7616806817193024e-05, + "loss": 0.9678, + "step": 1160 + }, + { + "epoch": 0.7579700715679896, + "grad_norm": 0.7773886919021606, + "learning_rate": 4.759476437827168e-05, + "loss": 0.9603, + "step": 1165 + }, + { + "epoch": 0.7612231620039037, + "grad_norm": 0.8198060393333435, + "learning_rate": 4.757262562158262e-05, + "loss": 0.9759, + "step": 1170 + }, + { + "epoch": 0.7644762524398179, + "grad_norm": 0.7127149701118469, + "learning_rate": 4.7550390641499715e-05, + "loss": 0.9244, + "step": 1175 + }, + { + "epoch": 0.7677293428757319, + "grad_norm": 1.236286997795105, + "learning_rate": 4.7528059532807045e-05, + "loss": 0.9313, + "step": 1180 + }, + { + "epoch": 0.7709824333116461, + "grad_norm": 0.6795628070831299, + "learning_rate": 4.750563239069845e-05, + "loss": 0.9586, + "step": 1185 + }, + { + "epoch": 0.7742355237475602, + "grad_norm": 0.8040820956230164, + "learning_rate": 4.7483109310777165e-05, + "loss": 0.9483, + "step": 1190 + }, + { + "epoch": 0.7774886141834743, + "grad_norm": 0.8001431226730347, + "learning_rate": 4.7460490389055355e-05, + "loss": 0.9408, + "step": 1195 + }, + { + "epoch": 0.7807417046193884, + "grad_norm": 0.969782292842865, + "learning_rate": 4.743777572195378e-05, + "loss": 0.9778, + "step": 1200 + }, + { + "epoch": 0.7839947950553026, + "grad_norm": 1.0955541133880615, + "learning_rate": 4.741496540630134e-05, + "loss": 0.9385, + "step": 1205 + }, + { + "epoch": 0.7872478854912166, + "grad_norm": 0.7429236173629761, + "learning_rate": 4.739205953933464e-05, + "loss": 0.9642, + "step": 1210 + }, + { + "epoch": 0.7905009759271308, + "grad_norm": 1.0475250482559204, + "learning_rate": 4.736905821869765e-05, + "loss": 0.9437, + "step": 1215 + }, + { + "epoch": 0.7937540663630449, + "grad_norm": 0.7216660380363464, + "learning_rate": 4.734596154244121e-05, + "loss": 0.9289, + "step": 1220 + }, + { + "epoch": 0.7970071567989591, + "grad_norm": 0.8584089279174805, + "learning_rate": 4.732276960902267e-05, + "loss": 0.9246, + "step": 1225 + }, + { + "epoch": 0.8002602472348731, + "grad_norm": 0.8769578337669373, + "learning_rate": 4.7299482517305404e-05, + "loss": 0.9298, + "step": 1230 + }, + { + "epoch": 0.8035133376707873, + "grad_norm": 0.7453442811965942, + "learning_rate": 4.7276100366558474e-05, + "loss": 0.9491, + "step": 1235 + }, + { + "epoch": 0.8067664281067014, + "grad_norm": 0.906287431716919, + "learning_rate": 4.7252623256456144e-05, + "loss": 0.9539, + "step": 1240 + }, + { + "epoch": 0.8100195185426154, + "grad_norm": 1.0656296014785767, + "learning_rate": 4.722905128707749e-05, + "loss": 0.9405, + "step": 1245 + }, + { + "epoch": 0.8132726089785296, + "grad_norm": 0.6985450983047485, + "learning_rate": 4.720538455890591e-05, + "loss": 0.9369, + "step": 1250 + }, + { + "epoch": 0.8165256994144438, + "grad_norm": 0.6577023267745972, + "learning_rate": 4.718162317282882e-05, + "loss": 0.9346, + "step": 1255 + }, + { + "epoch": 0.8197787898503578, + "grad_norm": 0.7832421064376831, + "learning_rate": 4.7157767230137064e-05, + "loss": 0.9256, + "step": 1260 + }, + { + "epoch": 0.8230318802862719, + "grad_norm": 0.7928493618965149, + "learning_rate": 4.713381683252463e-05, + "loss": 0.9477, + "step": 1265 + }, + { + "epoch": 0.8262849707221861, + "grad_norm": 0.8775043487548828, + "learning_rate": 4.710977208208812e-05, + "loss": 0.9313, + "step": 1270 + }, + { + "epoch": 0.8295380611581002, + "grad_norm": 0.7714875936508179, + "learning_rate": 4.708563308132636e-05, + "loss": 0.9469, + "step": 1275 + }, + { + "epoch": 0.8327911515940143, + "grad_norm": 0.7258083820343018, + "learning_rate": 4.706139993313994e-05, + "loss": 0.9294, + "step": 1280 + }, + { + "epoch": 0.8360442420299284, + "grad_norm": 0.7745918035507202, + "learning_rate": 4.7037072740830785e-05, + "loss": 0.9365, + "step": 1285 + }, + { + "epoch": 0.8392973324658426, + "grad_norm": 0.7213959097862244, + "learning_rate": 4.701265160810172e-05, + "loss": 0.947, + "step": 1290 + }, + { + "epoch": 0.8425504229017566, + "grad_norm": 0.825713038444519, + "learning_rate": 4.6988136639056025e-05, + "loss": 0.9404, + "step": 1295 + }, + { + "epoch": 0.8458035133376708, + "grad_norm": 0.6750174164772034, + "learning_rate": 4.696352793819698e-05, + "loss": 0.9364, + "step": 1300 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.8314560055732727, + "learning_rate": 4.693882561042743e-05, + "loss": 0.9521, + "step": 1305 + }, + { + "epoch": 0.852309694209499, + "grad_norm": 1.0009961128234863, + "learning_rate": 4.6914029761049357e-05, + "loss": 0.9297, + "step": 1310 + }, + { + "epoch": 0.8555627846454131, + "grad_norm": 0.7527256011962891, + "learning_rate": 4.688914049576337e-05, + "loss": 0.9269, + "step": 1315 + }, + { + "epoch": 0.8588158750813273, + "grad_norm": 0.9169411659240723, + "learning_rate": 4.686415792066833e-05, + "loss": 0.9312, + "step": 1320 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.9165216088294983, + "learning_rate": 4.683908214226084e-05, + "loss": 0.9524, + "step": 1325 + }, + { + "epoch": 0.8653220559531555, + "grad_norm": 0.9357953071594238, + "learning_rate": 4.6813913267434835e-05, + "loss": 0.9245, + "step": 1330 + }, + { + "epoch": 0.8685751463890696, + "grad_norm": 0.6473081707954407, + "learning_rate": 4.678865140348108e-05, + "loss": 0.9584, + "step": 1335 + }, + { + "epoch": 0.8718282368249838, + "grad_norm": 0.884191632270813, + "learning_rate": 4.676329665808677e-05, + "loss": 0.9569, + "step": 1340 + }, + { + "epoch": 0.8750813272608978, + "grad_norm": 1.0534435510635376, + "learning_rate": 4.673784913933499e-05, + "loss": 0.9178, + "step": 1345 + }, + { + "epoch": 0.878334417696812, + "grad_norm": 0.8140066266059875, + "learning_rate": 4.6712308955704346e-05, + "loss": 0.9536, + "step": 1350 + }, + { + "epoch": 0.8815875081327261, + "grad_norm": 0.71702641248703, + "learning_rate": 4.668667621606845e-05, + "loss": 0.947, + "step": 1355 + }, + { + "epoch": 0.8848405985686402, + "grad_norm": 0.6529531478881836, + "learning_rate": 4.666095102969544e-05, + "loss": 0.9107, + "step": 1360 + }, + { + "epoch": 0.8880936890045543, + "grad_norm": 0.9059852957725525, + "learning_rate": 4.6635133506247585e-05, + "loss": 0.9399, + "step": 1365 + }, + { + "epoch": 0.8913467794404685, + "grad_norm": 0.8972651958465576, + "learning_rate": 4.660922375578073e-05, + "loss": 0.9511, + "step": 1370 + }, + { + "epoch": 0.8945998698763825, + "grad_norm": 1.0316717624664307, + "learning_rate": 4.658322188874388e-05, + "loss": 0.9335, + "step": 1375 + }, + { + "epoch": 0.8978529603122967, + "grad_norm": 0.7475149035453796, + "learning_rate": 4.6557128015978726e-05, + "loss": 0.9262, + "step": 1380 + }, + { + "epoch": 0.9011060507482108, + "grad_norm": 1.035979986190796, + "learning_rate": 4.653094224871916e-05, + "loss": 0.9115, + "step": 1385 + }, + { + "epoch": 0.904359141184125, + "grad_norm": 0.8210706114768982, + "learning_rate": 4.650466469859079e-05, + "loss": 0.9535, + "step": 1390 + }, + { + "epoch": 0.907612231620039, + "grad_norm": 0.9931228160858154, + "learning_rate": 4.647829547761053e-05, + "loss": 0.9335, + "step": 1395 + }, + { + "epoch": 0.9108653220559532, + "grad_norm": 0.7681549191474915, + "learning_rate": 4.6451834698186e-05, + "loss": 0.9434, + "step": 1400 + }, + { + "epoch": 0.9141184124918673, + "grad_norm": 0.7461596727371216, + "learning_rate": 4.642528247311518e-05, + "loss": 0.9487, + "step": 1405 + }, + { + "epoch": 0.9173715029277814, + "grad_norm": 1.4867486953735352, + "learning_rate": 4.6398638915585835e-05, + "loss": 0.9074, + "step": 1410 + }, + { + "epoch": 0.9206245933636955, + "grad_norm": 0.890620231628418, + "learning_rate": 4.637190413917506e-05, + "loss": 0.9467, + "step": 1415 + }, + { + "epoch": 0.9238776837996097, + "grad_norm": 0.6205281615257263, + "learning_rate": 4.634507825784882e-05, + "loss": 0.9242, + "step": 1420 + }, + { + "epoch": 0.9271307742355237, + "grad_norm": 0.8957470655441284, + "learning_rate": 4.631816138596145e-05, + "loss": 0.94, + "step": 1425 + }, + { + "epoch": 0.9303838646714379, + "grad_norm": 0.8642396330833435, + "learning_rate": 4.629115363825514e-05, + "loss": 0.9142, + "step": 1430 + }, + { + "epoch": 0.933636955107352, + "grad_norm": 0.6721086502075195, + "learning_rate": 4.626405512985948e-05, + "loss": 0.9205, + "step": 1435 + }, + { + "epoch": 0.936890045543266, + "grad_norm": 0.8930765986442566, + "learning_rate": 4.623686597629098e-05, + "loss": 0.9235, + "step": 1440 + }, + { + "epoch": 0.9401431359791802, + "grad_norm": 0.9480865597724915, + "learning_rate": 4.62095862934525e-05, + "loss": 0.9309, + "step": 1445 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.9130436778068542, + "learning_rate": 4.618221619763287e-05, + "loss": 0.9257, + "step": 1450 + }, + { + "epoch": 0.9466493168510085, + "grad_norm": 0.63996821641922, + "learning_rate": 4.6154755805506294e-05, + "loss": 0.9364, + "step": 1455 + }, + { + "epoch": 0.9499024072869225, + "grad_norm": 0.786276638507843, + "learning_rate": 4.612720523413193e-05, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 0.9531554977228367, + "grad_norm": 0.8122700452804565, + "learning_rate": 4.609956460095332e-05, + "loss": 0.9296, + "step": 1465 + }, + { + "epoch": 0.9564085881587508, + "grad_norm": 1.0054434537887573, + "learning_rate": 4.607183402379794e-05, + "loss": 0.9118, + "step": 1470 + }, + { + "epoch": 0.9596616785946649, + "grad_norm": 0.9399415850639343, + "learning_rate": 4.6044013620876706e-05, + "loss": 0.9311, + "step": 1475 + }, + { + "epoch": 0.962914769030579, + "grad_norm": 0.6693314909934998, + "learning_rate": 4.60161035107834e-05, + "loss": 0.9322, + "step": 1480 + }, + { + "epoch": 0.9661678594664932, + "grad_norm": 0.7549735903739929, + "learning_rate": 4.598810381249425e-05, + "loss": 0.9246, + "step": 1485 + }, + { + "epoch": 0.9694209499024072, + "grad_norm": 0.8314823508262634, + "learning_rate": 4.596001464536737e-05, + "loss": 0.9335, + "step": 1490 + }, + { + "epoch": 0.9726740403383214, + "grad_norm": 0.7478086948394775, + "learning_rate": 4.593183612914225e-05, + "loss": 0.9341, + "step": 1495 + }, + { + "epoch": 0.9759271307742355, + "grad_norm": 0.9777085185050964, + "learning_rate": 4.5903568383939284e-05, + "loss": 0.9323, + "step": 1500 + }, + { + "epoch": 0.9791802212101497, + "grad_norm": 0.893374502658844, + "learning_rate": 4.587521153025922e-05, + "loss": 0.939, + "step": 1505 + }, + { + "epoch": 0.9824333116460637, + "grad_norm": 0.6938668489456177, + "learning_rate": 4.584676568898267e-05, + "loss": 0.9437, + "step": 1510 + }, + { + "epoch": 0.9856864020819779, + "grad_norm": 0.6903214454650879, + "learning_rate": 4.5818230981369584e-05, + "loss": 0.9332, + "step": 1515 + }, + { + "epoch": 0.988939492517892, + "grad_norm": 0.817034125328064, + "learning_rate": 4.5789607529058715e-05, + "loss": 0.9375, + "step": 1520 + }, + { + "epoch": 0.9921925829538061, + "grad_norm": 0.8222942352294922, + "learning_rate": 4.5760895454067154e-05, + "loss": 0.9316, + "step": 1525 + }, + { + "epoch": 0.9954456733897202, + "grad_norm": 0.7549692392349243, + "learning_rate": 4.5732094878789756e-05, + "loss": 0.9221, + "step": 1530 + }, + { + "epoch": 0.9986987638256344, + "grad_norm": 0.8544319868087769, + "learning_rate": 4.570320592599863e-05, + "loss": 0.9287, + "step": 1535 + }, + { + "epoch": 1.0, + "eval_f1": 0.7910057808991992, + "eval_loss": 0.462646484375, + "eval_precision": 0.7940469727119374, + "eval_recall": 0.7896973937143991, + "eval_runtime": 247.1562, + "eval_samples_per_second": 1591.847, + "eval_steps_per_second": 1.558, + "step": 1537 + }, + { + "epoch": 1.0019518542615484, + "grad_norm": 0.7457589507102966, + "learning_rate": 4.567422871884265e-05, + "loss": 0.9279, + "step": 1540 + }, + { + "epoch": 1.0052049446974627, + "grad_norm": 0.8609625697135925, + "learning_rate": 4.564516338084688e-05, + "loss": 0.8765, + "step": 1545 + }, + { + "epoch": 1.0084580351333767, + "grad_norm": 0.8822636008262634, + "learning_rate": 4.561601003591208e-05, + "loss": 0.8427, + "step": 1550 + }, + { + "epoch": 1.0117111255692908, + "grad_norm": 0.7266067266464233, + "learning_rate": 4.558676880831417e-05, + "loss": 0.8828, + "step": 1555 + }, + { + "epoch": 1.014964216005205, + "grad_norm": 0.6970102787017822, + "learning_rate": 4.555743982270369e-05, + "loss": 0.8842, + "step": 1560 + }, + { + "epoch": 1.018217306441119, + "grad_norm": 0.6802201867103577, + "learning_rate": 4.5528023204105306e-05, + "loss": 0.872, + "step": 1565 + }, + { + "epoch": 1.0214703968770331, + "grad_norm": 0.7830452919006348, + "learning_rate": 4.549851907791722e-05, + "loss": 0.8624, + "step": 1570 + }, + { + "epoch": 1.0247234873129474, + "grad_norm": 0.6845102906227112, + "learning_rate": 4.5468927569910663e-05, + "loss": 0.8744, + "step": 1575 + }, + { + "epoch": 1.0279765777488614, + "grad_norm": 0.8832181692123413, + "learning_rate": 4.5439248806229386e-05, + "loss": 0.8722, + "step": 1580 + }, + { + "epoch": 1.0312296681847755, + "grad_norm": 0.7359802722930908, + "learning_rate": 4.5409482913389065e-05, + "loss": 0.8567, + "step": 1585 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 0.7686721086502075, + "learning_rate": 4.5379630018276834e-05, + "loss": 0.8509, + "step": 1590 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 0.77400141954422, + "learning_rate": 4.534969024815066e-05, + "loss": 0.8676, + "step": 1595 + }, + { + "epoch": 1.0409889394925178, + "grad_norm": 0.8024744987487793, + "learning_rate": 4.531966373063886e-05, + "loss": 0.8772, + "step": 1600 + }, + { + "epoch": 1.044242029928432, + "grad_norm": 0.7155640721321106, + "learning_rate": 4.528955059373956e-05, + "loss": 0.8608, + "step": 1605 + }, + { + "epoch": 1.047495120364346, + "grad_norm": 0.8553564548492432, + "learning_rate": 4.52593509658201e-05, + "loss": 0.8614, + "step": 1610 + }, + { + "epoch": 1.0507482108002602, + "grad_norm": 0.6926222443580627, + "learning_rate": 4.522906497561655e-05, + "loss": 0.8582, + "step": 1615 + }, + { + "epoch": 1.0540013012361744, + "grad_norm": 0.8300968408584595, + "learning_rate": 4.519869275223309e-05, + "loss": 0.8838, + "step": 1620 + }, + { + "epoch": 1.0572543916720885, + "grad_norm": 0.8907480835914612, + "learning_rate": 4.516823442514153e-05, + "loss": 0.8656, + "step": 1625 + }, + { + "epoch": 1.0605074821080025, + "grad_norm": 1.035863995552063, + "learning_rate": 4.513769012418071e-05, + "loss": 0.8814, + "step": 1630 + }, + { + "epoch": 1.0637605725439168, + "grad_norm": 0.9308491945266724, + "learning_rate": 4.510705997955596e-05, + "loss": 0.8831, + "step": 1635 + }, + { + "epoch": 1.0670136629798308, + "grad_norm": 1.0290710926055908, + "learning_rate": 4.507634412183856e-05, + "loss": 0.8566, + "step": 1640 + }, + { + "epoch": 1.070266753415745, + "grad_norm": 0.9163823127746582, + "learning_rate": 4.504554268196516e-05, + "loss": 0.8646, + "step": 1645 + }, + { + "epoch": 1.073519843851659, + "grad_norm": 0.7528260946273804, + "learning_rate": 4.5014655791237245e-05, + "loss": 0.8681, + "step": 1650 + }, + { + "epoch": 1.0767729342875731, + "grad_norm": 0.9018992781639099, + "learning_rate": 4.498368358132055e-05, + "loss": 0.8667, + "step": 1655 + }, + { + "epoch": 1.0800260247234874, + "grad_norm": 1.000990390777588, + "learning_rate": 4.4952626184244504e-05, + "loss": 0.8627, + "step": 1660 + }, + { + "epoch": 1.0832791151594015, + "grad_norm": 1.1555023193359375, + "learning_rate": 4.492148373240171e-05, + "loss": 0.8488, + "step": 1665 + }, + { + "epoch": 1.0865322055953155, + "grad_norm": 0.9759275913238525, + "learning_rate": 4.4890256358547304e-05, + "loss": 0.8775, + "step": 1670 + }, + { + "epoch": 1.0897852960312298, + "grad_norm": 0.7439780235290527, + "learning_rate": 4.485894419579846e-05, + "loss": 0.8758, + "step": 1675 + }, + { + "epoch": 1.0930383864671438, + "grad_norm": 0.8394938111305237, + "learning_rate": 4.482754737763378e-05, + "loss": 0.8797, + "step": 1680 + }, + { + "epoch": 1.0962914769030578, + "grad_norm": 0.8299522399902344, + "learning_rate": 4.4796066037892734e-05, + "loss": 0.864, + "step": 1685 + }, + { + "epoch": 1.099544567338972, + "grad_norm": 0.8585712909698486, + "learning_rate": 4.4764500310775116e-05, + "loss": 0.8586, + "step": 1690 + }, + { + "epoch": 1.1027976577748861, + "grad_norm": 1.0859423875808716, + "learning_rate": 4.473285033084043e-05, + "loss": 0.8773, + "step": 1695 + }, + { + "epoch": 1.1060507482108002, + "grad_norm": 0.7827959060668945, + "learning_rate": 4.4701116233007314e-05, + "loss": 0.8423, + "step": 1700 + }, + { + "epoch": 1.1093038386467144, + "grad_norm": 0.7498010993003845, + "learning_rate": 4.466929815255304e-05, + "loss": 0.884, + "step": 1705 + }, + { + "epoch": 1.1125569290826285, + "grad_norm": 0.7543908357620239, + "learning_rate": 4.4637396225112846e-05, + "loss": 0.8606, + "step": 1710 + }, + { + "epoch": 1.1158100195185425, + "grad_norm": 1.3613898754119873, + "learning_rate": 4.460541058667942e-05, + "loss": 0.8909, + "step": 1715 + }, + { + "epoch": 1.1190631099544568, + "grad_norm": 0.8409460783004761, + "learning_rate": 4.457334137360226e-05, + "loss": 0.8892, + "step": 1720 + }, + { + "epoch": 1.1223162003903708, + "grad_norm": 0.9072450995445251, + "learning_rate": 4.4541188722587165e-05, + "loss": 0.8714, + "step": 1725 + }, + { + "epoch": 1.1255692908262849, + "grad_norm": 1.02306067943573, + "learning_rate": 4.450895277069561e-05, + "loss": 0.8813, + "step": 1730 + }, + { + "epoch": 1.1288223812621991, + "grad_norm": 1.0199263095855713, + "learning_rate": 4.4476633655344144e-05, + "loss": 0.8693, + "step": 1735 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.7447525262832642, + "learning_rate": 4.444423151430386e-05, + "loss": 0.8894, + "step": 1740 + }, + { + "epoch": 1.1353285621340272, + "grad_norm": 1.062179446220398, + "learning_rate": 4.4411746485699744e-05, + "loss": 0.8425, + "step": 1745 + }, + { + "epoch": 1.1385816525699415, + "grad_norm": 0.7509242296218872, + "learning_rate": 4.437917870801015e-05, + "loss": 0.8666, + "step": 1750 + }, + { + "epoch": 1.1418347430058555, + "grad_norm": 1.1955047845840454, + "learning_rate": 4.434652832006616e-05, + "loss": 0.8798, + "step": 1755 + }, + { + "epoch": 1.1450878334417696, + "grad_norm": 1.1089417934417725, + "learning_rate": 4.431379546105101e-05, + "loss": 0.8808, + "step": 1760 + }, + { + "epoch": 1.1483409238776838, + "grad_norm": 0.7296579480171204, + "learning_rate": 4.4280980270499494e-05, + "loss": 0.854, + "step": 1765 + }, + { + "epoch": 1.1515940143135979, + "grad_norm": 1.0274302959442139, + "learning_rate": 4.424808288829739e-05, + "loss": 0.8775, + "step": 1770 + }, + { + "epoch": 1.1548471047495121, + "grad_norm": 0.8249827027320862, + "learning_rate": 4.421510345468082e-05, + "loss": 0.8825, + "step": 1775 + }, + { + "epoch": 1.1581001951854262, + "grad_norm": 0.814564049243927, + "learning_rate": 4.4182042110235686e-05, + "loss": 0.8354, + "step": 1780 + }, + { + "epoch": 1.1613532856213402, + "grad_norm": 0.8738640546798706, + "learning_rate": 4.414889899589709e-05, + "loss": 0.8667, + "step": 1785 + }, + { + "epoch": 1.1646063760572545, + "grad_norm": 0.873928427696228, + "learning_rate": 4.411567425294867e-05, + "loss": 0.8589, + "step": 1790 + }, + { + "epoch": 1.1678594664931685, + "grad_norm": 1.0771477222442627, + "learning_rate": 4.408236802302203e-05, + "loss": 0.8677, + "step": 1795 + }, + { + "epoch": 1.1711125569290826, + "grad_norm": 1.026843786239624, + "learning_rate": 4.404898044809618e-05, + "loss": 0.8613, + "step": 1800 + }, + { + "epoch": 1.1743656473649968, + "grad_norm": 1.2807365655899048, + "learning_rate": 4.401551167049686e-05, + "loss": 0.8612, + "step": 1805 + }, + { + "epoch": 1.1776187378009109, + "grad_norm": 1.086053729057312, + "learning_rate": 4.398196183289595e-05, + "loss": 0.8679, + "step": 1810 + }, + { + "epoch": 1.180871828236825, + "grad_norm": 1.2245922088623047, + "learning_rate": 4.394833107831091e-05, + "loss": 0.8666, + "step": 1815 + }, + { + "epoch": 1.1841249186727392, + "grad_norm": 0.788972020149231, + "learning_rate": 4.3914619550104125e-05, + "loss": 0.8549, + "step": 1820 + }, + { + "epoch": 1.1873780091086532, + "grad_norm": 0.7560495734214783, + "learning_rate": 4.388082739198229e-05, + "loss": 0.8689, + "step": 1825 + }, + { + "epoch": 1.1906310995445673, + "grad_norm": 0.9753955006599426, + "learning_rate": 4.3846954747995825e-05, + "loss": 0.8676, + "step": 1830 + }, + { + "epoch": 1.1938841899804815, + "grad_norm": 0.7910217642784119, + "learning_rate": 4.381300176253825e-05, + "loss": 0.872, + "step": 1835 + }, + { + "epoch": 1.1971372804163956, + "grad_norm": 0.9588011503219604, + "learning_rate": 4.377896858034557e-05, + "loss": 0.8903, + "step": 1840 + }, + { + "epoch": 1.2003903708523098, + "grad_norm": 0.9886934757232666, + "learning_rate": 4.374485534649562e-05, + "loss": 0.879, + "step": 1845 + }, + { + "epoch": 1.2036434612882239, + "grad_norm": 0.896848738193512, + "learning_rate": 4.371066220640754e-05, + "loss": 0.854, + "step": 1850 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.7082849740982056, + "learning_rate": 4.367638930584105e-05, + "loss": 0.8877, + "step": 1855 + }, + { + "epoch": 1.2101496421600522, + "grad_norm": 1.307518482208252, + "learning_rate": 4.36420367908959e-05, + "loss": 0.8637, + "step": 1860 + }, + { + "epoch": 1.2134027325959662, + "grad_norm": 0.9649641513824463, + "learning_rate": 4.3607604808011213e-05, + "loss": 0.8644, + "step": 1865 + }, + { + "epoch": 1.2166558230318802, + "grad_norm": 0.958816409111023, + "learning_rate": 4.357309350396488e-05, + "loss": 0.8771, + "step": 1870 + }, + { + "epoch": 1.2199089134677945, + "grad_norm": 0.7665415406227112, + "learning_rate": 4.353850302587291e-05, + "loss": 0.8559, + "step": 1875 + }, + { + "epoch": 1.2231620039037086, + "grad_norm": 0.8145641088485718, + "learning_rate": 4.3503833521188844e-05, + "loss": 0.8776, + "step": 1880 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 1.0663881301879883, + "learning_rate": 4.346908513770306e-05, + "loss": 0.8643, + "step": 1885 + }, + { + "epoch": 1.2296681847755369, + "grad_norm": 0.7401409149169922, + "learning_rate": 4.343425802354222e-05, + "loss": 0.8646, + "step": 1890 + }, + { + "epoch": 1.232921275211451, + "grad_norm": 0.7239570021629333, + "learning_rate": 4.3399352327168595e-05, + "loss": 0.8885, + "step": 1895 + }, + { + "epoch": 1.236174365647365, + "grad_norm": 1.0525251626968384, + "learning_rate": 4.3364368197379426e-05, + "loss": 0.8817, + "step": 1900 + }, + { + "epoch": 1.2394274560832792, + "grad_norm": 0.8934289813041687, + "learning_rate": 4.33293057833063e-05, + "loss": 0.8699, + "step": 1905 + }, + { + "epoch": 1.2426805465191932, + "grad_norm": 0.8614199757575989, + "learning_rate": 4.329416523441454e-05, + "loss": 0.866, + "step": 1910 + }, + { + "epoch": 1.2459336369551073, + "grad_norm": 0.884955644607544, + "learning_rate": 4.3258946700502535e-05, + "loss": 0.8641, + "step": 1915 + }, + { + "epoch": 1.2491867273910215, + "grad_norm": 0.8655734062194824, + "learning_rate": 4.322365033170109e-05, + "loss": 0.8393, + "step": 1920 + }, + { + "epoch": 1.2524398178269356, + "grad_norm": 1.0718590021133423, + "learning_rate": 4.318827627847284e-05, + "loss": 0.8788, + "step": 1925 + }, + { + "epoch": 1.2556929082628496, + "grad_norm": 0.9467219710350037, + "learning_rate": 4.315282469161156e-05, + "loss": 0.8758, + "step": 1930 + }, + { + "epoch": 1.258945998698764, + "grad_norm": 1.0598018169403076, + "learning_rate": 4.311729572224153e-05, + "loss": 0.8872, + "step": 1935 + }, + { + "epoch": 1.262199089134678, + "grad_norm": 0.7586490511894226, + "learning_rate": 4.308168952181691e-05, + "loss": 0.8749, + "step": 1940 + }, + { + "epoch": 1.265452179570592, + "grad_norm": 0.8791137933731079, + "learning_rate": 4.304600624212109e-05, + "loss": 0.8833, + "step": 1945 + }, + { + "epoch": 1.2687052700065062, + "grad_norm": 1.0280482769012451, + "learning_rate": 4.3017404223497385e-05, + "loss": 0.893, + "step": 1950 + }, + { + "epoch": 1.2719583604424203, + "grad_norm": 0.8759311437606812, + "learning_rate": 4.298158258465592e-05, + "loss": 0.8833, + "step": 1955 + }, + { + "epoch": 1.2752114508783343, + "grad_norm": 0.8623502850532532, + "learning_rate": 4.2945684293282685e-05, + "loss": 0.8533, + "step": 1960 + }, + { + "epoch": 1.2784645413142486, + "grad_norm": 0.9812124967575073, + "learning_rate": 4.290970950240617e-05, + "loss": 0.8832, + "step": 1965 + }, + { + "epoch": 1.2817176317501626, + "grad_norm": 0.8114174008369446, + "learning_rate": 4.2873658365381026e-05, + "loss": 0.8657, + "step": 1970 + }, + { + "epoch": 1.2849707221860767, + "grad_norm": 0.7681922912597656, + "learning_rate": 4.2837531035887305e-05, + "loss": 0.8563, + "step": 1975 + }, + { + "epoch": 1.288223812621991, + "grad_norm": 0.9911778569221497, + "learning_rate": 4.280132766792989e-05, + "loss": 0.8401, + "step": 1980 + }, + { + "epoch": 1.291476903057905, + "grad_norm": 0.7618448138237, + "learning_rate": 4.276504841583778e-05, + "loss": 0.8727, + "step": 1985 + }, + { + "epoch": 1.294729993493819, + "grad_norm": 0.7748595476150513, + "learning_rate": 4.2728693434263476e-05, + "loss": 0.8726, + "step": 1990 + }, + { + "epoch": 1.2979830839297333, + "grad_norm": 0.995187520980835, + "learning_rate": 4.269226287818228e-05, + "loss": 0.8606, + "step": 1995 + }, + { + "epoch": 1.3012361743656473, + "grad_norm": 0.9184800386428833, + "learning_rate": 4.2655756902891665e-05, + "loss": 0.8881, + "step": 2000 + }, + { + "epoch": 1.3044892648015614, + "grad_norm": 0.6605210304260254, + "learning_rate": 4.261917566401061e-05, + "loss": 0.8452, + "step": 2005 + }, + { + "epoch": 1.3077423552374756, + "grad_norm": 0.9930521249771118, + "learning_rate": 4.258251931747893e-05, + "loss": 0.8661, + "step": 2010 + }, + { + "epoch": 1.3109954456733897, + "grad_norm": 0.6971027255058289, + "learning_rate": 4.25457880195566e-05, + "loss": 0.8607, + "step": 2015 + }, + { + "epoch": 1.3142485361093037, + "grad_norm": 0.8052083253860474, + "learning_rate": 4.250898192682311e-05, + "loss": 0.8407, + "step": 2020 + }, + { + "epoch": 1.317501626545218, + "grad_norm": 0.7318537831306458, + "learning_rate": 4.247210119617679e-05, + "loss": 0.8703, + "step": 2025 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 1.0614877939224243, + "learning_rate": 4.243514598483412e-05, + "loss": 0.854, + "step": 2030 + }, + { + "epoch": 1.3240078074170463, + "grad_norm": 1.2773613929748535, + "learning_rate": 4.23981164503291e-05, + "loss": 0.8728, + "step": 2035 + }, + { + "epoch": 1.3272608978529603, + "grad_norm": 1.41408371925354, + "learning_rate": 4.236101275051256e-05, + "loss": 0.859, + "step": 2040 + }, + { + "epoch": 1.3305139882888743, + "grad_norm": 0.7571334838867188, + "learning_rate": 4.232383504355147e-05, + "loss": 0.8588, + "step": 2045 + }, + { + "epoch": 1.3337670787247886, + "grad_norm": 0.7090466618537903, + "learning_rate": 4.228658348792828e-05, + "loss": 0.8672, + "step": 2050 + }, + { + "epoch": 1.3370201691607027, + "grad_norm": 0.826134204864502, + "learning_rate": 4.224925824244025e-05, + "loss": 0.8552, + "step": 2055 + }, + { + "epoch": 1.340273259596617, + "grad_norm": 0.8876454830169678, + "learning_rate": 4.2211859466198785e-05, + "loss": 0.8733, + "step": 2060 + }, + { + "epoch": 1.343526350032531, + "grad_norm": 0.7836646437644958, + "learning_rate": 4.217438731862871e-05, + "loss": 0.8643, + "step": 2065 + }, + { + "epoch": 1.346779440468445, + "grad_norm": 0.795116662979126, + "learning_rate": 4.213684195946762e-05, + "loss": 0.8759, + "step": 2070 + }, + { + "epoch": 1.3500325309043593, + "grad_norm": 0.9851782321929932, + "learning_rate": 4.2099223548765224e-05, + "loss": 0.872, + "step": 2075 + }, + { + "epoch": 1.3532856213402733, + "grad_norm": 0.9454843997955322, + "learning_rate": 4.206153224688264e-05, + "loss": 0.8709, + "step": 2080 + }, + { + "epoch": 1.3565387117761873, + "grad_norm": 0.7972314953804016, + "learning_rate": 4.202376821449167e-05, + "loss": 0.881, + "step": 2085 + }, + { + "epoch": 1.3597918022121016, + "grad_norm": 0.7645969390869141, + "learning_rate": 4.1985931612574186e-05, + "loss": 0.8729, + "step": 2090 + }, + { + "epoch": 1.3630448926480156, + "grad_norm": 1.1820120811462402, + "learning_rate": 4.194802260242141e-05, + "loss": 0.8556, + "step": 2095 + }, + { + "epoch": 1.3662979830839297, + "grad_norm": 0.9157008528709412, + "learning_rate": 4.191004134563322e-05, + "loss": 0.8721, + "step": 2100 + }, + { + "epoch": 1.369551073519844, + "grad_norm": 0.8286409974098206, + "learning_rate": 4.187198800411748e-05, + "loss": 0.8756, + "step": 2105 + }, + { + "epoch": 1.372804163955758, + "grad_norm": 0.8742622137069702, + "learning_rate": 4.183386274008932e-05, + "loss": 0.8592, + "step": 2110 + }, + { + "epoch": 1.376057254391672, + "grad_norm": 0.8968034386634827, + "learning_rate": 4.1795665716070474e-05, + "loss": 0.8641, + "step": 2115 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.8291420340538025, + "learning_rate": 4.1757397094888594e-05, + "loss": 0.8529, + "step": 2120 + }, + { + "epoch": 1.3825634352635003, + "grad_norm": 0.919009268283844, + "learning_rate": 4.1719057039676515e-05, + "loss": 0.8636, + "step": 2125 + }, + { + "epoch": 1.3858165256994144, + "grad_norm": 1.0421229600906372, + "learning_rate": 4.168064571387159e-05, + "loss": 0.8681, + "step": 2130 + }, + { + "epoch": 1.3890696161353286, + "grad_norm": 0.7388564944267273, + "learning_rate": 4.1642163281214984e-05, + "loss": 0.8513, + "step": 2135 + }, + { + "epoch": 1.3923227065712427, + "grad_norm": 0.6921651363372803, + "learning_rate": 4.160360990575099e-05, + "loss": 0.8723, + "step": 2140 + }, + { + "epoch": 1.3955757970071567, + "grad_norm": 0.7668315768241882, + "learning_rate": 4.156498575182633e-05, + "loss": 0.8621, + "step": 2145 + }, + { + "epoch": 1.398828887443071, + "grad_norm": 0.7497116327285767, + "learning_rate": 4.152629098408939e-05, + "loss": 0.8604, + "step": 2150 + }, + { + "epoch": 1.402081977878985, + "grad_norm": 0.7256556749343872, + "learning_rate": 4.1487525767489635e-05, + "loss": 0.8638, + "step": 2155 + }, + { + "epoch": 1.405335068314899, + "grad_norm": 1.1155390739440918, + "learning_rate": 4.144869026727681e-05, + "loss": 0.8547, + "step": 2160 + }, + { + "epoch": 1.4085881587508133, + "grad_norm": 0.9044195413589478, + "learning_rate": 4.140978464900025e-05, + "loss": 0.8792, + "step": 2165 + }, + { + "epoch": 1.4118412491867274, + "grad_norm": 0.7881206274032593, + "learning_rate": 4.137080907850823e-05, + "loss": 0.874, + "step": 2170 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 0.851743757724762, + "learning_rate": 4.13317637219472e-05, + "loss": 0.8551, + "step": 2175 + }, + { + "epoch": 1.4183474300585557, + "grad_norm": 0.8619376420974731, + "learning_rate": 4.129264874576111e-05, + "loss": 0.8757, + "step": 2180 + }, + { + "epoch": 1.4216005204944697, + "grad_norm": 1.2099318504333496, + "learning_rate": 4.125346431669065e-05, + "loss": 0.8567, + "step": 2185 + }, + { + "epoch": 1.4248536109303838, + "grad_norm": 0.8172369599342346, + "learning_rate": 4.121421060177263e-05, + "loss": 0.8625, + "step": 2190 + }, + { + "epoch": 1.428106701366298, + "grad_norm": 1.1485086679458618, + "learning_rate": 4.1174887768339164e-05, + "loss": 0.8681, + "step": 2195 + }, + { + "epoch": 1.431359791802212, + "grad_norm": 0.8006755709648132, + "learning_rate": 4.113549598401704e-05, + "loss": 0.8657, + "step": 2200 + }, + { + "epoch": 1.434612882238126, + "grad_norm": 0.7858587503433228, + "learning_rate": 4.1096035416726966e-05, + "loss": 0.8681, + "step": 2205 + }, + { + "epoch": 1.4378659726740404, + "grad_norm": 1.0397981405258179, + "learning_rate": 4.105650623468284e-05, + "loss": 0.871, + "step": 2210 + }, + { + "epoch": 1.4411190631099544, + "grad_norm": 1.409725546836853, + "learning_rate": 4.101690860639108e-05, + "loss": 0.8525, + "step": 2215 + }, + { + "epoch": 1.4443721535458685, + "grad_norm": 1.0374292135238647, + "learning_rate": 4.097724270064988e-05, + "loss": 0.8561, + "step": 2220 + }, + { + "epoch": 1.4476252439817827, + "grad_norm": 1.10367751121521, + "learning_rate": 4.0937508686548455e-05, + "loss": 0.8608, + "step": 2225 + }, + { + "epoch": 1.4508783344176968, + "grad_norm": 0.9354111552238464, + "learning_rate": 4.089770673346639e-05, + "loss": 0.8556, + "step": 2230 + }, + { + "epoch": 1.4541314248536108, + "grad_norm": 0.7732600569725037, + "learning_rate": 4.085783701107288e-05, + "loss": 0.8664, + "step": 2235 + }, + { + "epoch": 1.457384515289525, + "grad_norm": 0.7464646697044373, + "learning_rate": 4.0817899689325975e-05, + "loss": 0.8544, + "step": 2240 + }, + { + "epoch": 1.460637605725439, + "grad_norm": 0.7917648553848267, + "learning_rate": 4.077789493847194e-05, + "loss": 0.849, + "step": 2245 + }, + { + "epoch": 1.4638906961613534, + "grad_norm": 0.8593052625656128, + "learning_rate": 4.073782292904445e-05, + "loss": 0.905, + "step": 2250 + }, + { + "epoch": 1.4671437865972674, + "grad_norm": 0.7432965636253357, + "learning_rate": 4.0697683831863877e-05, + "loss": 0.8606, + "step": 2255 + }, + { + "epoch": 1.4703968770331814, + "grad_norm": 1.0467164516448975, + "learning_rate": 4.065747781803662e-05, + "loss": 0.8733, + "step": 2260 + }, + { + "epoch": 1.4736499674690957, + "grad_norm": 0.8533846735954285, + "learning_rate": 4.06172050589543e-05, + "loss": 0.8411, + "step": 2265 + }, + { + "epoch": 1.4769030579050098, + "grad_norm": 0.7896531224250793, + "learning_rate": 4.057686572629307e-05, + "loss": 0.8732, + "step": 2270 + }, + { + "epoch": 1.480156148340924, + "grad_norm": 0.7728810906410217, + "learning_rate": 4.053645999201287e-05, + "loss": 0.8822, + "step": 2275 + }, + { + "epoch": 1.483409238776838, + "grad_norm": 0.791527271270752, + "learning_rate": 4.0495988028356725e-05, + "loss": 0.8692, + "step": 2280 + }, + { + "epoch": 1.486662329212752, + "grad_norm": 1.7369199991226196, + "learning_rate": 4.0455450007849945e-05, + "loss": 0.878, + "step": 2285 + }, + { + "epoch": 1.4899154196486664, + "grad_norm": 0.8174150586128235, + "learning_rate": 4.041484610329945e-05, + "loss": 0.8843, + "step": 2290 + }, + { + "epoch": 1.4931685100845804, + "grad_norm": 0.8122901916503906, + "learning_rate": 4.037417648779304e-05, + "loss": 0.8511, + "step": 2295 + }, + { + "epoch": 1.4964216005204944, + "grad_norm": 0.856270968914032, + "learning_rate": 4.033344133469857e-05, + "loss": 0.8576, + "step": 2300 + }, + { + "epoch": 1.4996746909564087, + "grad_norm": 0.7714033126831055, + "learning_rate": 4.029264081766333e-05, + "loss": 0.8563, + "step": 2305 + }, + { + "epoch": 1.5029277813923227, + "grad_norm": 0.7557379007339478, + "learning_rate": 4.02517751106132e-05, + "loss": 0.8632, + "step": 2310 + }, + { + "epoch": 1.5061808718282368, + "grad_norm": 0.9310267567634583, + "learning_rate": 4.021084438775199e-05, + "loss": 0.8756, + "step": 2315 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 1.1613460779190063, + "learning_rate": 4.016984882356063e-05, + "loss": 0.8581, + "step": 2320 + }, + { + "epoch": 1.512687052700065, + "grad_norm": 0.8737664222717285, + "learning_rate": 4.0128788592796484e-05, + "loss": 0.8463, + "step": 2325 + }, + { + "epoch": 1.5159401431359791, + "grad_norm": 1.137432336807251, + "learning_rate": 4.008766387049257e-05, + "loss": 0.8668, + "step": 2330 + }, + { + "epoch": 1.5191932335718934, + "grad_norm": 1.205127239227295, + "learning_rate": 4.004647483195682e-05, + "loss": 0.854, + "step": 2335 + }, + { + "epoch": 1.5224463240078074, + "grad_norm": 1.2103711366653442, + "learning_rate": 4.0005221652771326e-05, + "loss": 0.8599, + "step": 2340 + }, + { + "epoch": 1.5256994144437215, + "grad_norm": 0.8847302794456482, + "learning_rate": 3.996390450879163e-05, + "loss": 0.8902, + "step": 2345 + }, + { + "epoch": 1.5289525048796357, + "grad_norm": 0.9139837622642517, + "learning_rate": 3.992252357614591e-05, + "loss": 0.8537, + "step": 2350 + }, + { + "epoch": 1.5322055953155498, + "grad_norm": 0.6250112056732178, + "learning_rate": 3.9881079031234295e-05, + "loss": 0.8625, + "step": 2355 + }, + { + "epoch": 1.5354586857514638, + "grad_norm": 1.3147530555725098, + "learning_rate": 3.983957105072806e-05, + "loss": 0.8594, + "step": 2360 + }, + { + "epoch": 1.538711776187378, + "grad_norm": 0.8052361607551575, + "learning_rate": 3.9797999811568916e-05, + "loss": 0.8613, + "step": 2365 + }, + { + "epoch": 1.5419648666232921, + "grad_norm": 0.963198721408844, + "learning_rate": 3.9756365490968216e-05, + "loss": 0.8846, + "step": 2370 + }, + { + "epoch": 1.5452179570592062, + "grad_norm": 0.7471247911453247, + "learning_rate": 3.971466826640622e-05, + "loss": 0.8559, + "step": 2375 + }, + { + "epoch": 1.5484710474951204, + "grad_norm": 0.9139803051948547, + "learning_rate": 3.967290831563137e-05, + "loss": 0.8734, + "step": 2380 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.8502246141433716, + "learning_rate": 3.963108581665945e-05, + "loss": 0.8517, + "step": 2385 + }, + { + "epoch": 1.5549772283669485, + "grad_norm": 1.010526418685913, + "learning_rate": 3.958920094777292e-05, + "loss": 0.8699, + "step": 2390 + }, + { + "epoch": 1.5582303188028628, + "grad_norm": 0.9621404409408569, + "learning_rate": 3.954725388752006e-05, + "loss": 0.8715, + "step": 2395 + }, + { + "epoch": 1.5614834092387768, + "grad_norm": 0.931891679763794, + "learning_rate": 3.950524481471434e-05, + "loss": 0.8639, + "step": 2400 + }, + { + "epoch": 1.5647364996746909, + "grad_norm": 0.9025523066520691, + "learning_rate": 3.94631739084335e-05, + "loss": 0.8407, + "step": 2405 + }, + { + "epoch": 1.5679895901106051, + "grad_norm": 0.7679696679115295, + "learning_rate": 3.942104134801892e-05, + "loss": 0.8703, + "step": 2410 + }, + { + "epoch": 1.5712426805465192, + "grad_norm": 0.7461057901382446, + "learning_rate": 3.937884731307477e-05, + "loss": 0.8508, + "step": 2415 + }, + { + "epoch": 1.5744957709824332, + "grad_norm": 0.8891671895980835, + "learning_rate": 3.9336591983467296e-05, + "loss": 0.8392, + "step": 2420 + }, + { + "epoch": 1.5777488614183475, + "grad_norm": 0.7495052218437195, + "learning_rate": 3.929427553932402e-05, + "loss": 0.8617, + "step": 2425 + }, + { + "epoch": 1.5810019518542615, + "grad_norm": 0.8563068509101868, + "learning_rate": 3.925189816103298e-05, + "loss": 0.8682, + "step": 2430 + }, + { + "epoch": 1.5842550422901756, + "grad_norm": 0.8730781674385071, + "learning_rate": 3.9209460029242e-05, + "loss": 0.8634, + "step": 2435 + }, + { + "epoch": 1.5875081327260898, + "grad_norm": 1.0046974420547485, + "learning_rate": 3.916696132485783e-05, + "loss": 0.8423, + "step": 2440 + }, + { + "epoch": 1.5907612231620039, + "grad_norm": 0.8691470623016357, + "learning_rate": 3.9124402229045495e-05, + "loss": 0.8443, + "step": 2445 + }, + { + "epoch": 1.594014313597918, + "grad_norm": 0.7887680530548096, + "learning_rate": 3.90817829232274e-05, + "loss": 0.8796, + "step": 2450 + }, + { + "epoch": 1.5972674040338322, + "grad_norm": 0.8779820203781128, + "learning_rate": 3.903910358908267e-05, + "loss": 0.8808, + "step": 2455 + }, + { + "epoch": 1.6005204944697464, + "grad_norm": 0.9116110801696777, + "learning_rate": 3.8996364408546284e-05, + "loss": 0.8539, + "step": 2460 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 0.8549916744232178, + "learning_rate": 3.895356556380833e-05, + "loss": 0.8714, + "step": 2465 + }, + { + "epoch": 1.6070266753415745, + "grad_norm": 0.7568048238754272, + "learning_rate": 3.8910707237313274e-05, + "loss": 0.8545, + "step": 2470 + }, + { + "epoch": 1.6102797657774888, + "grad_norm": 0.873261034488678, + "learning_rate": 3.886778961175909e-05, + "loss": 0.861, + "step": 2475 + }, + { + "epoch": 1.6135328562134026, + "grad_norm": 0.8435690999031067, + "learning_rate": 3.8824812870096585e-05, + "loss": 0.849, + "step": 2480 + }, + { + "epoch": 1.6167859466493169, + "grad_norm": 0.7543259263038635, + "learning_rate": 3.878177719552854e-05, + "loss": 0.8389, + "step": 2485 + }, + { + "epoch": 1.6200390370852311, + "grad_norm": 0.6784664392471313, + "learning_rate": 3.8738682771508975e-05, + "loss": 0.862, + "step": 2490 + }, + { + "epoch": 1.623292127521145, + "grad_norm": 0.735149085521698, + "learning_rate": 3.869552978174232e-05, + "loss": 0.86, + "step": 2495 + }, + { + "epoch": 1.6265452179570592, + "grad_norm": 1.1492180824279785, + "learning_rate": 3.8652318410182696e-05, + "loss": 0.8682, + "step": 2500 + }, + { + "epoch": 1.6297983083929735, + "grad_norm": 1.2123005390167236, + "learning_rate": 3.860904884103307e-05, + "loss": 0.8767, + "step": 2505 + }, + { + "epoch": 1.6330513988288873, + "grad_norm": 1.0573855638504028, + "learning_rate": 3.85657212587445e-05, + "loss": 0.8784, + "step": 2510 + }, + { + "epoch": 1.6363044892648015, + "grad_norm": 0.7657274603843689, + "learning_rate": 3.8522335848015354e-05, + "loss": 0.8614, + "step": 2515 + }, + { + "epoch": 1.6395575797007158, + "grad_norm": 0.7586051225662231, + "learning_rate": 3.847889279379052e-05, + "loss": 0.8522, + "step": 2520 + }, + { + "epoch": 1.6428106701366298, + "grad_norm": 0.8660874366760254, + "learning_rate": 3.843539228126058e-05, + "loss": 0.8491, + "step": 2525 + }, + { + "epoch": 1.6460637605725439, + "grad_norm": 0.8181445002555847, + "learning_rate": 3.8391834495861104e-05, + "loss": 0.8774, + "step": 2530 + }, + { + "epoch": 1.6493168510084582, + "grad_norm": 0.8161119222640991, + "learning_rate": 3.834821962327173e-05, + "loss": 0.8446, + "step": 2535 + }, + { + "epoch": 1.6525699414443722, + "grad_norm": 0.7471867203712463, + "learning_rate": 3.830454784941552e-05, + "loss": 0.8743, + "step": 2540 + }, + { + "epoch": 1.6558230318802862, + "grad_norm": 0.8243322372436523, + "learning_rate": 3.8260819360458066e-05, + "loss": 0.8582, + "step": 2545 + }, + { + "epoch": 1.6590761223162005, + "grad_norm": 0.7759085297584534, + "learning_rate": 3.8217034342806726e-05, + "loss": 0.8634, + "step": 2550 + }, + { + "epoch": 1.6623292127521145, + "grad_norm": 0.7820890545845032, + "learning_rate": 3.817319298310984e-05, + "loss": 0.849, + "step": 2555 + }, + { + "epoch": 1.6655823031880286, + "grad_norm": 0.7369856238365173, + "learning_rate": 3.812929546825591e-05, + "loss": 0.851, + "step": 2560 + }, + { + "epoch": 1.6688353936239428, + "grad_norm": 0.6760427355766296, + "learning_rate": 3.8085341985372847e-05, + "loss": 0.8526, + "step": 2565 + }, + { + "epoch": 1.6720884840598569, + "grad_norm": 0.7964663505554199, + "learning_rate": 3.804133272182711e-05, + "loss": 0.8369, + "step": 2570 + }, + { + "epoch": 1.675341574495771, + "grad_norm": 0.7458584308624268, + "learning_rate": 3.7997267865222966e-05, + "loss": 0.858, + "step": 2575 + }, + { + "epoch": 1.6785946649316852, + "grad_norm": 0.7713748812675476, + "learning_rate": 3.795314760340165e-05, + "loss": 0.8422, + "step": 2580 + }, + { + "epoch": 1.6818477553675992, + "grad_norm": 1.1121766567230225, + "learning_rate": 3.79089721244406e-05, + "loss": 0.8564, + "step": 2585 + }, + { + "epoch": 1.6851008458035133, + "grad_norm": 0.7054054141044617, + "learning_rate": 3.786474161665261e-05, + "loss": 0.8503, + "step": 2590 + }, + { + "epoch": 1.6883539362394275, + "grad_norm": 0.8231985569000244, + "learning_rate": 3.782045626858508e-05, + "loss": 0.8459, + "step": 2595 + }, + { + "epoch": 1.6916070266753416, + "grad_norm": 0.8120073676109314, + "learning_rate": 3.7776116269019164e-05, + "loss": 0.8579, + "step": 2600 + }, + { + "epoch": 1.6948601171112556, + "grad_norm": 0.7463471293449402, + "learning_rate": 3.773172180696899e-05, + "loss": 0.8685, + "step": 2605 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.9310842752456665, + "learning_rate": 3.7687273071680875e-05, + "loss": 0.8657, + "step": 2610 + }, + { + "epoch": 1.701366297983084, + "grad_norm": 0.7997697591781616, + "learning_rate": 3.7642770252632445e-05, + "loss": 0.8536, + "step": 2615 + }, + { + "epoch": 1.704619388418998, + "grad_norm": 0.9354361295700073, + "learning_rate": 3.7598213539531924e-05, + "loss": 0.8584, + "step": 2620 + }, + { + "epoch": 1.7078724788549122, + "grad_norm": 0.8442994356155396, + "learning_rate": 3.755360312231726e-05, + "loss": 0.8509, + "step": 2625 + }, + { + "epoch": 1.7111255692908263, + "grad_norm": 0.7156201601028442, + "learning_rate": 3.7508939191155315e-05, + "loss": 0.8587, + "step": 2630 + }, + { + "epoch": 1.7143786597267403, + "grad_norm": 0.8114856481552124, + "learning_rate": 3.7464221936441094e-05, + "loss": 0.8575, + "step": 2635 + }, + { + "epoch": 1.7176317501626546, + "grad_norm": 0.9958142042160034, + "learning_rate": 3.741945154879691e-05, + "loss": 0.8291, + "step": 2640 + }, + { + "epoch": 1.7208848405985686, + "grad_norm": 0.8814706206321716, + "learning_rate": 3.7374628219071576e-05, + "loss": 0.8756, + "step": 2645 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 0.9752816557884216, + "learning_rate": 3.732975213833957e-05, + "loss": 0.8526, + "step": 2650 + }, + { + "epoch": 1.727391021470397, + "grad_norm": 1.069827914237976, + "learning_rate": 3.728482349790025e-05, + "loss": 0.85, + "step": 2655 + }, + { + "epoch": 1.730644111906311, + "grad_norm": 0.7829200029373169, + "learning_rate": 3.723984248927704e-05, + "loss": 0.8775, + "step": 2660 + }, + { + "epoch": 1.733897202342225, + "grad_norm": 0.9264289140701294, + "learning_rate": 3.719480930421657e-05, + "loss": 0.8561, + "step": 2665 + }, + { + "epoch": 1.7371502927781393, + "grad_norm": 1.0062094926834106, + "learning_rate": 3.7149724134687915e-05, + "loss": 0.8734, + "step": 2670 + }, + { + "epoch": 1.7404033832140533, + "grad_norm": 1.15998375415802, + "learning_rate": 3.710458717288176e-05, + "loss": 0.8817, + "step": 2675 + }, + { + "epoch": 1.7436564736499673, + "grad_norm": 0.8632653951644897, + "learning_rate": 3.705939861120952e-05, + "loss": 0.8467, + "step": 2680 + }, + { + "epoch": 1.7469095640858816, + "grad_norm": 0.9579365849494934, + "learning_rate": 3.7014158642302645e-05, + "loss": 0.8516, + "step": 2685 + }, + { + "epoch": 1.7501626545217959, + "grad_norm": 0.7893072962760925, + "learning_rate": 3.6968867459011675e-05, + "loss": 0.8533, + "step": 2690 + }, + { + "epoch": 1.7534157449577097, + "grad_norm": 0.8436265587806702, + "learning_rate": 3.692352525440548e-05, + "loss": 0.8661, + "step": 2695 + }, + { + "epoch": 1.756668835393624, + "grad_norm": 0.7928500175476074, + "learning_rate": 3.687813222177042e-05, + "loss": 0.8617, + "step": 2700 + }, + { + "epoch": 1.7599219258295382, + "grad_norm": 1.0979465246200562, + "learning_rate": 3.683268855460955e-05, + "loss": 0.8457, + "step": 2705 + }, + { + "epoch": 1.763175016265452, + "grad_norm": 0.9280642867088318, + "learning_rate": 3.678719444664174e-05, + "loss": 0.8698, + "step": 2710 + }, + { + "epoch": 1.7664281067013663, + "grad_norm": 0.7560756206512451, + "learning_rate": 3.674165009180091e-05, + "loss": 0.8476, + "step": 2715 + }, + { + "epoch": 1.7696811971372806, + "grad_norm": 1.6937271356582642, + "learning_rate": 3.669605568423515e-05, + "loss": 0.8601, + "step": 2720 + }, + { + "epoch": 1.7729342875731944, + "grad_norm": 0.7721190452575684, + "learning_rate": 3.665041141830594e-05, + "loss": 0.8479, + "step": 2725 + }, + { + "epoch": 1.7761873780091086, + "grad_norm": 0.691184401512146, + "learning_rate": 3.660471748858728e-05, + "loss": 0.846, + "step": 2730 + }, + { + "epoch": 1.779440468445023, + "grad_norm": 0.8458099961280823, + "learning_rate": 3.655897408986487e-05, + "loss": 0.8543, + "step": 2735 + }, + { + "epoch": 1.7826935588809367, + "grad_norm": 0.7717384696006775, + "learning_rate": 3.651318141713532e-05, + "loss": 0.8555, + "step": 2740 + }, + { + "epoch": 1.785946649316851, + "grad_norm": 0.7364319562911987, + "learning_rate": 3.646733966560527e-05, + "loss": 0.8693, + "step": 2745 + }, + { + "epoch": 1.7891997397527653, + "grad_norm": 0.7715139389038086, + "learning_rate": 3.642144903069055e-05, + "loss": 0.8575, + "step": 2750 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 0.7801803350448608, + "learning_rate": 3.637550970801543e-05, + "loss": 0.8832, + "step": 2755 + }, + { + "epoch": 1.7957059206245933, + "grad_norm": 0.8797639012336731, + "learning_rate": 3.632952189341166e-05, + "loss": 0.8787, + "step": 2760 + }, + { + "epoch": 1.7989590110605076, + "grad_norm": 0.8655262589454651, + "learning_rate": 3.628348578291776e-05, + "loss": 0.8527, + "step": 2765 + }, + { + "epoch": 1.8022121014964216, + "grad_norm": 0.7039540410041809, + "learning_rate": 3.623740157277811e-05, + "loss": 0.8023, + "step": 2770 + }, + { + "epoch": 1.8054651919323357, + "grad_norm": 0.8364835977554321, + "learning_rate": 3.619126945944209e-05, + "loss": 0.8428, + "step": 2775 + }, + { + "epoch": 1.80871828236825, + "grad_norm": 0.8477578163146973, + "learning_rate": 3.614508963956335e-05, + "loss": 0.8364, + "step": 2780 + }, + { + "epoch": 1.811971372804164, + "grad_norm": 0.790069043636322, + "learning_rate": 3.609886230999886e-05, + "loss": 0.8557, + "step": 2785 + }, + { + "epoch": 1.815224463240078, + "grad_norm": 1.1685853004455566, + "learning_rate": 3.605258766780815e-05, + "loss": 0.8639, + "step": 2790 + }, + { + "epoch": 1.8184775536759923, + "grad_norm": 0.6820409297943115, + "learning_rate": 3.600626591025239e-05, + "loss": 0.8561, + "step": 2795 + }, + { + "epoch": 1.8217306441119063, + "grad_norm": 0.6816509366035461, + "learning_rate": 3.595989723479363e-05, + "loss": 0.8595, + "step": 2800 + }, + { + "epoch": 1.8249837345478204, + "grad_norm": 0.6458393335342407, + "learning_rate": 3.591348183909391e-05, + "loss": 0.852, + "step": 2805 + }, + { + "epoch": 1.8282368249837346, + "grad_norm": 0.8720667958259583, + "learning_rate": 3.586701992101446e-05, + "loss": 0.8493, + "step": 2810 + }, + { + "epoch": 1.8314899154196487, + "grad_norm": 0.8076214790344238, + "learning_rate": 3.582051167861477e-05, + "loss": 0.8399, + "step": 2815 + }, + { + "epoch": 1.8347430058555627, + "grad_norm": 1.1117894649505615, + "learning_rate": 3.577395731015184e-05, + "loss": 0.8462, + "step": 2820 + }, + { + "epoch": 1.837996096291477, + "grad_norm": 0.8749067783355713, + "learning_rate": 3.57273570140793e-05, + "loss": 0.8484, + "step": 2825 + }, + { + "epoch": 1.841249186727391, + "grad_norm": 0.9115192890167236, + "learning_rate": 3.5680710989046565e-05, + "loss": 0.8379, + "step": 2830 + }, + { + "epoch": 1.844502277163305, + "grad_norm": 0.7345873117446899, + "learning_rate": 3.5634019433897964e-05, + "loss": 0.8521, + "step": 2835 + }, + { + "epoch": 1.8477553675992193, + "grad_norm": 0.8665250539779663, + "learning_rate": 3.558728254767192e-05, + "loss": 0.8591, + "step": 2840 + }, + { + "epoch": 1.8510084580351334, + "grad_norm": 0.6966584324836731, + "learning_rate": 3.5540500529600096e-05, + "loss": 0.8633, + "step": 2845 + }, + { + "epoch": 1.8542615484710474, + "grad_norm": 0.9217740893363953, + "learning_rate": 3.5493673579106555e-05, + "loss": 0.8581, + "step": 2850 + }, + { + "epoch": 1.8575146389069617, + "grad_norm": 1.1653602123260498, + "learning_rate": 3.5446801895806904e-05, + "loss": 0.8429, + "step": 2855 + }, + { + "epoch": 1.8607677293428757, + "grad_norm": 1.0861412286758423, + "learning_rate": 3.539988567950741e-05, + "loss": 0.8385, + "step": 2860 + }, + { + "epoch": 1.8640208197787898, + "grad_norm": 0.9099658727645874, + "learning_rate": 3.53529251302042e-05, + "loss": 0.8727, + "step": 2865 + }, + { + "epoch": 1.867273910214704, + "grad_norm": 0.8507881760597229, + "learning_rate": 3.530592044808237e-05, + "loss": 0.8601, + "step": 2870 + }, + { + "epoch": 1.870527000650618, + "grad_norm": 0.7487595677375793, + "learning_rate": 3.525887183351517e-05, + "loss": 0.8453, + "step": 2875 + }, + { + "epoch": 1.873780091086532, + "grad_norm": 0.7527421116828918, + "learning_rate": 3.521177948706311e-05, + "loss": 0.856, + "step": 2880 + }, + { + "epoch": 1.8770331815224464, + "grad_norm": 1.198721170425415, + "learning_rate": 3.5164643609473114e-05, + "loss": 0.8322, + "step": 2885 + }, + { + "epoch": 1.8802862719583604, + "grad_norm": 0.7312609553337097, + "learning_rate": 3.51174644016777e-05, + "loss": 0.8571, + "step": 2890 + }, + { + "epoch": 1.8835393623942744, + "grad_norm": 0.813762903213501, + "learning_rate": 3.507024206479406e-05, + "loss": 0.8485, + "step": 2895 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.6589996814727783, + "learning_rate": 3.502297680012327e-05, + "loss": 0.8199, + "step": 2900 + }, + { + "epoch": 1.8900455432661027, + "grad_norm": 0.8973954319953918, + "learning_rate": 3.4975668809149375e-05, + "loss": 0.8595, + "step": 2905 + }, + { + "epoch": 1.8932986337020168, + "grad_norm": 0.8979359269142151, + "learning_rate": 3.492831829353857e-05, + "loss": 0.8637, + "step": 2910 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.7665019035339355, + "learning_rate": 3.488092545513833e-05, + "loss": 0.8753, + "step": 2915 + }, + { + "epoch": 1.8998048145738453, + "grad_norm": 1.2857329845428467, + "learning_rate": 3.483349049597653e-05, + "loss": 0.8394, + "step": 2920 + }, + { + "epoch": 1.9030579050097591, + "grad_norm": 0.7651403546333313, + "learning_rate": 3.4786013618260615e-05, + "loss": 0.846, + "step": 2925 + }, + { + "epoch": 1.9063109954456734, + "grad_norm": 0.818390429019928, + "learning_rate": 3.47384950243767e-05, + "loss": 0.8919, + "step": 2930 + }, + { + "epoch": 1.9095640858815877, + "grad_norm": 0.8343967795372009, + "learning_rate": 3.4690934916888754e-05, + "loss": 0.8451, + "step": 2935 + }, + { + "epoch": 1.9128171763175015, + "grad_norm": 0.8200094699859619, + "learning_rate": 3.464333349853769e-05, + "loss": 0.8468, + "step": 2940 + }, + { + "epoch": 1.9160702667534157, + "grad_norm": 0.8766981959342957, + "learning_rate": 3.459569097224054e-05, + "loss": 0.8455, + "step": 2945 + }, + { + "epoch": 1.91932335718933, + "grad_norm": 0.7592107057571411, + "learning_rate": 3.454800754108957e-05, + "loss": 0.8564, + "step": 2950 + }, + { + "epoch": 1.9225764476252438, + "grad_norm": 0.7694371938705444, + "learning_rate": 3.45002834083514e-05, + "loss": 0.8579, + "step": 2955 + }, + { + "epoch": 1.925829538061158, + "grad_norm": 0.9310813546180725, + "learning_rate": 3.445251877746616e-05, + "loss": 0.853, + "step": 2960 + }, + { + "epoch": 1.9290826284970723, + "grad_norm": 0.7357284426689148, + "learning_rate": 3.440471385204664e-05, + "loss": 0.843, + "step": 2965 + }, + { + "epoch": 1.9323357189329864, + "grad_norm": 1.0630100965499878, + "learning_rate": 3.4356868835877376e-05, + "loss": 0.8656, + "step": 2970 + }, + { + "epoch": 1.9355888093689004, + "grad_norm": 1.3015029430389404, + "learning_rate": 3.430898393291381e-05, + "loss": 0.8681, + "step": 2975 + }, + { + "epoch": 1.9388418998048147, + "grad_norm": 0.941599428653717, + "learning_rate": 3.426105934728141e-05, + "loss": 0.8374, + "step": 2980 + }, + { + "epoch": 1.9420949902407287, + "grad_norm": 0.827949583530426, + "learning_rate": 3.4213095283274807e-05, + "loss": 0.8342, + "step": 2985 + }, + { + "epoch": 1.9453480806766428, + "grad_norm": 0.7155514359474182, + "learning_rate": 3.416509194535693e-05, + "loss": 0.8604, + "step": 2990 + }, + { + "epoch": 1.948601171112557, + "grad_norm": 0.6395983099937439, + "learning_rate": 3.411704953815813e-05, + "loss": 0.8545, + "step": 2995 + }, + { + "epoch": 1.951854261548471, + "grad_norm": 1.0403225421905518, + "learning_rate": 3.406896826647528e-05, + "loss": 0.8317, + "step": 3000 + }, + { + "epoch": 1.9551073519843851, + "grad_norm": 0.809688925743103, + "learning_rate": 3.4020848335270944e-05, + "loss": 0.8459, + "step": 3005 + }, + { + "epoch": 1.9583604424202994, + "grad_norm": 0.7284942865371704, + "learning_rate": 3.397268994967248e-05, + "loss": 0.8609, + "step": 3010 + }, + { + "epoch": 1.9616135328562134, + "grad_norm": 0.8415728807449341, + "learning_rate": 3.392449331497117e-05, + "loss": 0.8421, + "step": 3015 + }, + { + "epoch": 1.9648666232921275, + "grad_norm": 0.7867475152015686, + "learning_rate": 3.387625863662137e-05, + "loss": 0.8537, + "step": 3020 + }, + { + "epoch": 1.9681197137280417, + "grad_norm": 0.8730093240737915, + "learning_rate": 3.3827986120239556e-05, + "loss": 0.8453, + "step": 3025 + }, + { + "epoch": 1.9713728041639558, + "grad_norm": 1.0075076818466187, + "learning_rate": 3.377967597160355e-05, + "loss": 0.8485, + "step": 3030 + }, + { + "epoch": 1.9746258945998698, + "grad_norm": 0.7558779716491699, + "learning_rate": 3.373132839665159e-05, + "loss": 0.8283, + "step": 3035 + }, + { + "epoch": 1.977878985035784, + "grad_norm": 0.8635545969009399, + "learning_rate": 3.368294360148141e-05, + "loss": 0.8445, + "step": 3040 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 0.7366521954536438, + "learning_rate": 3.363452179234946e-05, + "loss": 0.8377, + "step": 3045 + }, + { + "epoch": 1.9843851659076122, + "grad_norm": 0.895798921585083, + "learning_rate": 3.3586063175669957e-05, + "loss": 0.8517, + "step": 3050 + }, + { + "epoch": 1.9876382563435264, + "grad_norm": 0.8703877329826355, + "learning_rate": 3.353756795801402e-05, + "loss": 0.8635, + "step": 3055 + }, + { + "epoch": 1.9908913467794405, + "grad_norm": 0.8399415612220764, + "learning_rate": 3.348903634610879e-05, + "loss": 0.8469, + "step": 3060 + }, + { + "epoch": 1.9941444372153545, + "grad_norm": 0.6633405685424805, + "learning_rate": 3.344046854683656e-05, + "loss": 0.8265, + "step": 3065 + }, + { + "epoch": 1.9973975276512688, + "grad_norm": 0.8422790765762329, + "learning_rate": 3.3391864767233874e-05, + "loss": 0.8356, + "step": 3070 + }, + { + "epoch": 2.0, + "eval_f1": 0.8011475160594294, + "eval_loss": 0.444091796875, + "eval_precision": 0.8009366991425545, + "eval_recall": 0.8015108608319047, + "eval_runtime": 238.6273, + "eval_samples_per_second": 1648.743, + "eval_steps_per_second": 1.613, + "step": 3074 + }, + { + "epoch": 2.000650618087183, + "grad_norm": 0.9484532475471497, + "learning_rate": 3.334322521449066e-05, + "loss": 0.8414, + "step": 3075 + }, + { + "epoch": 2.003903708523097, + "grad_norm": 1.058498740196228, + "learning_rate": 3.3294550095949325e-05, + "loss": 0.7647, + "step": 3080 + }, + { + "epoch": 2.007156798959011, + "grad_norm": 1.1817635297775269, + "learning_rate": 3.3245839619103916e-05, + "loss": 0.7739, + "step": 3085 + }, + { + "epoch": 2.0104098893949254, + "grad_norm": 0.9960103034973145, + "learning_rate": 3.319709399159919e-05, + "loss": 0.7627, + "step": 3090 + }, + { + "epoch": 2.013662979830839, + "grad_norm": 0.7337830066680908, + "learning_rate": 3.314831342122974e-05, + "loss": 0.7736, + "step": 3095 + }, + { + "epoch": 2.0169160702667535, + "grad_norm": 0.8539023995399475, + "learning_rate": 3.309949811593914e-05, + "loss": 0.7677, + "step": 3100 + }, + { + "epoch": 2.0201691607026677, + "grad_norm": 0.812573254108429, + "learning_rate": 3.3050648283818985e-05, + "loss": 0.7688, + "step": 3105 + }, + { + "epoch": 2.0234222511385815, + "grad_norm": 0.8771811127662659, + "learning_rate": 3.30017641331081e-05, + "loss": 0.7873, + "step": 3110 + }, + { + "epoch": 2.026675341574496, + "grad_norm": 0.8817070126533508, + "learning_rate": 3.295284587219159e-05, + "loss": 0.7516, + "step": 3115 + }, + { + "epoch": 2.02992843201041, + "grad_norm": 0.8555654287338257, + "learning_rate": 3.290389370959995e-05, + "loss": 0.7245, + "step": 3120 + }, + { + "epoch": 2.033181522446324, + "grad_norm": 0.9785915017127991, + "learning_rate": 3.285490785400822e-05, + "loss": 0.7591, + "step": 3125 + }, + { + "epoch": 2.036434612882238, + "grad_norm": 1.1170217990875244, + "learning_rate": 3.280588851423504e-05, + "loss": 0.7545, + "step": 3130 + }, + { + "epoch": 2.0396877033181524, + "grad_norm": 0.889552652835846, + "learning_rate": 3.275683589924181e-05, + "loss": 0.7509, + "step": 3135 + }, + { + "epoch": 2.0429407937540662, + "grad_norm": 0.9748543500900269, + "learning_rate": 3.270775021813177e-05, + "loss": 0.7419, + "step": 3140 + }, + { + "epoch": 2.0461938841899805, + "grad_norm": 0.9157707691192627, + "learning_rate": 3.26586316801491e-05, + "loss": 0.7476, + "step": 3145 + }, + { + "epoch": 2.0494469746258948, + "grad_norm": 1.3593250513076782, + "learning_rate": 3.2609480494678055e-05, + "loss": 0.778, + "step": 3150 + }, + { + "epoch": 2.0527000650618086, + "grad_norm": 0.8584513664245605, + "learning_rate": 3.256029687124209e-05, + "loss": 0.7634, + "step": 3155 + }, + { + "epoch": 2.055953155497723, + "grad_norm": 1.1206103563308716, + "learning_rate": 3.2511081019502875e-05, + "loss": 0.7612, + "step": 3160 + }, + { + "epoch": 2.059206245933637, + "grad_norm": 1.1010791063308716, + "learning_rate": 3.2461833149259516e-05, + "loss": 0.7631, + "step": 3165 + }, + { + "epoch": 2.062459336369551, + "grad_norm": 1.0924779176712036, + "learning_rate": 3.241255347044759e-05, + "loss": 0.7592, + "step": 3170 + }, + { + "epoch": 2.065712426805465, + "grad_norm": 0.9586931467056274, + "learning_rate": 3.236324219313826e-05, + "loss": 0.7591, + "step": 3175 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.0838814973831177, + "learning_rate": 3.231389952753742e-05, + "loss": 0.7724, + "step": 3180 + }, + { + "epoch": 2.0722186076772933, + "grad_norm": 0.9030594229698181, + "learning_rate": 3.226452568398471e-05, + "loss": 0.7627, + "step": 3185 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 1.0417284965515137, + "learning_rate": 3.221512087295275e-05, + "loss": 0.765, + "step": 3190 + }, + { + "epoch": 2.078724788549122, + "grad_norm": 1.3411697149276733, + "learning_rate": 3.216568530504611e-05, + "loss": 0.7718, + "step": 3195 + }, + { + "epoch": 2.0819778789850356, + "grad_norm": 1.1210920810699463, + "learning_rate": 3.21162191910005e-05, + "loss": 0.7578, + "step": 3200 + }, + { + "epoch": 2.08523096942095, + "grad_norm": 1.0522574186325073, + "learning_rate": 3.2066722741681845e-05, + "loss": 0.7645, + "step": 3205 + }, + { + "epoch": 2.088484059856864, + "grad_norm": 0.9024161100387573, + "learning_rate": 3.2017196168085345e-05, + "loss": 0.7542, + "step": 3210 + }, + { + "epoch": 2.091737150292778, + "grad_norm": 0.93799889087677, + "learning_rate": 3.196763968133466e-05, + "loss": 0.7675, + "step": 3215 + }, + { + "epoch": 2.094990240728692, + "grad_norm": 0.9059098362922668, + "learning_rate": 3.191805349268097e-05, + "loss": 0.774, + "step": 3220 + }, + { + "epoch": 2.0982433311646065, + "grad_norm": 0.954647958278656, + "learning_rate": 3.1868437813502026e-05, + "loss": 0.7591, + "step": 3225 + }, + { + "epoch": 2.1014964216005203, + "grad_norm": 0.956679105758667, + "learning_rate": 3.1818792855301316e-05, + "loss": 0.7585, + "step": 3230 + }, + { + "epoch": 2.1047495120364346, + "grad_norm": 0.8911952376365662, + "learning_rate": 3.1769118829707156e-05, + "loss": 0.7736, + "step": 3235 + }, + { + "epoch": 2.108002602472349, + "grad_norm": 1.1105453968048096, + "learning_rate": 3.171941594847173e-05, + "loss": 0.746, + "step": 3240 + }, + { + "epoch": 2.1112556929082626, + "grad_norm": 1.0151236057281494, + "learning_rate": 3.1669684423470275e-05, + "loss": 0.7628, + "step": 3245 + }, + { + "epoch": 2.114508783344177, + "grad_norm": 1.0137097835540771, + "learning_rate": 3.16199244667001e-05, + "loss": 0.7611, + "step": 3250 + }, + { + "epoch": 2.117761873780091, + "grad_norm": 0.9404064416885376, + "learning_rate": 3.157013629027972e-05, + "loss": 0.7601, + "step": 3255 + }, + { + "epoch": 2.121014964216005, + "grad_norm": 1.3806120157241821, + "learning_rate": 3.152032010644796e-05, + "loss": 0.7647, + "step": 3260 + }, + { + "epoch": 2.1242680546519193, + "grad_norm": 0.9700812697410583, + "learning_rate": 3.147047612756302e-05, + "loss": 0.766, + "step": 3265 + }, + { + "epoch": 2.1275211450878335, + "grad_norm": 1.1779789924621582, + "learning_rate": 3.142060456610159e-05, + "loss": 0.7571, + "step": 3270 + }, + { + "epoch": 2.130774235523748, + "grad_norm": 1.1766470670700073, + "learning_rate": 3.137070563465796e-05, + "loss": 0.7587, + "step": 3275 + }, + { + "epoch": 2.1340273259596616, + "grad_norm": 1.1181317567825317, + "learning_rate": 3.1320779545943034e-05, + "loss": 0.7514, + "step": 3280 + }, + { + "epoch": 2.137280416395576, + "grad_norm": 1.520752191543579, + "learning_rate": 3.127082651278357e-05, + "loss": 0.7383, + "step": 3285 + }, + { + "epoch": 2.14053350683149, + "grad_norm": 1.1578936576843262, + "learning_rate": 3.1220846748121105e-05, + "loss": 0.7736, + "step": 3290 + }, + { + "epoch": 2.143786597267404, + "grad_norm": 1.3091363906860352, + "learning_rate": 3.117084046501119e-05, + "loss": 0.7615, + "step": 3295 + }, + { + "epoch": 2.147039687703318, + "grad_norm": 0.9620407223701477, + "learning_rate": 3.112080787662237e-05, + "loss": 0.7924, + "step": 3300 + }, + { + "epoch": 2.1502927781392325, + "grad_norm": 0.9089716672897339, + "learning_rate": 3.107074919623536e-05, + "loss": 0.7455, + "step": 3305 + }, + { + "epoch": 2.1535458685751463, + "grad_norm": 1.1510998010635376, + "learning_rate": 3.102066463724209e-05, + "loss": 0.765, + "step": 3310 + }, + { + "epoch": 2.1567989590110606, + "grad_norm": 1.8722169399261475, + "learning_rate": 3.0970554413144805e-05, + "loss": 0.7627, + "step": 3315 + }, + { + "epoch": 2.160052049446975, + "grad_norm": 1.0691964626312256, + "learning_rate": 3.0920418737555144e-05, + "loss": 0.7753, + "step": 3320 + }, + { + "epoch": 2.1633051398828886, + "grad_norm": 0.9641361832618713, + "learning_rate": 3.0870257824193263e-05, + "loss": 0.7516, + "step": 3325 + }, + { + "epoch": 2.166558230318803, + "grad_norm": 1.0590273141860962, + "learning_rate": 3.08200718868869e-05, + "loss": 0.7859, + "step": 3330 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 1.2373055219650269, + "learning_rate": 3.076986113957044e-05, + "loss": 0.772, + "step": 3335 + }, + { + "epoch": 2.173064411190631, + "grad_norm": 1.160982608795166, + "learning_rate": 3.071962579628408e-05, + "loss": 0.7673, + "step": 3340 + }, + { + "epoch": 2.1763175016265452, + "grad_norm": 0.8511375188827515, + "learning_rate": 3.066936607117279e-05, + "loss": 0.7558, + "step": 3345 + }, + { + "epoch": 2.1795705920624595, + "grad_norm": 0.9551635384559631, + "learning_rate": 3.061908217848556e-05, + "loss": 0.7641, + "step": 3350 + }, + { + "epoch": 2.1828236824983733, + "grad_norm": 0.9262502789497375, + "learning_rate": 3.056877433257434e-05, + "loss": 0.7667, + "step": 3355 + }, + { + "epoch": 2.1860767729342876, + "grad_norm": 1.2747892141342163, + "learning_rate": 3.051844274789321e-05, + "loss": 0.7497, + "step": 3360 + }, + { + "epoch": 2.189329863370202, + "grad_norm": 1.2817254066467285, + "learning_rate": 3.046808763899745e-05, + "loss": 0.7743, + "step": 3365 + }, + { + "epoch": 2.1925829538061157, + "grad_norm": 1.3123672008514404, + "learning_rate": 3.041770922054262e-05, + "loss": 0.7681, + "step": 3370 + }, + { + "epoch": 2.19583604424203, + "grad_norm": 1.0206502676010132, + "learning_rate": 3.0367307707283626e-05, + "loss": 0.7833, + "step": 3375 + }, + { + "epoch": 2.199089134677944, + "grad_norm": 1.0204437971115112, + "learning_rate": 3.0326970012795626e-05, + "loss": 0.7575, + "step": 3380 + }, + { + "epoch": 2.202342225113858, + "grad_norm": 1.0020246505737305, + "learning_rate": 3.027652747038522e-05, + "loss": 0.7702, + "step": 3385 + }, + { + "epoch": 2.2055953155497723, + "grad_norm": 1.045996904373169, + "learning_rate": 3.022606243500526e-05, + "loss": 0.7609, + "step": 3390 + }, + { + "epoch": 2.2088484059856865, + "grad_norm": 0.9325571060180664, + "learning_rate": 3.0175575121779886e-05, + "loss": 0.7363, + "step": 3395 + }, + { + "epoch": 2.2121014964216004, + "grad_norm": 1.2504099607467651, + "learning_rate": 3.012506574592825e-05, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 2.2153545868575146, + "grad_norm": 1.0567350387573242, + "learning_rate": 3.007453452276349e-05, + "loss": 0.7544, + "step": 3405 + }, + { + "epoch": 2.218607677293429, + "grad_norm": 0.9951023459434509, + "learning_rate": 3.0023981667691926e-05, + "loss": 0.7432, + "step": 3410 + }, + { + "epoch": 2.2218607677293427, + "grad_norm": 1.0222620964050293, + "learning_rate": 2.997340739621206e-05, + "loss": 0.794, + "step": 3415 + }, + { + "epoch": 2.225113858165257, + "grad_norm": 0.8401185870170593, + "learning_rate": 2.9922811923913714e-05, + "loss": 0.751, + "step": 3420 + }, + { + "epoch": 2.2283669486011712, + "grad_norm": 1.1666043996810913, + "learning_rate": 2.9872195466477054e-05, + "loss": 0.7592, + "step": 3425 + }, + { + "epoch": 2.231620039037085, + "grad_norm": 0.95232754945755, + "learning_rate": 2.9821558239671744e-05, + "loss": 0.7639, + "step": 3430 + }, + { + "epoch": 2.2348731294729993, + "grad_norm": 0.8971825242042542, + "learning_rate": 2.977090045935594e-05, + "loss": 0.7553, + "step": 3435 + }, + { + "epoch": 2.2381262199089136, + "grad_norm": 1.0237399339675903, + "learning_rate": 2.9720222341475445e-05, + "loss": 0.7504, + "step": 3440 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 1.1775766611099243, + "learning_rate": 2.966952410206275e-05, + "loss": 0.7449, + "step": 3445 + }, + { + "epoch": 2.2446324007807417, + "grad_norm": 0.885957658290863, + "learning_rate": 2.9618805957236113e-05, + "loss": 0.7631, + "step": 3450 + }, + { + "epoch": 2.247885491216656, + "grad_norm": 1.3709341287612915, + "learning_rate": 2.956806812319865e-05, + "loss": 0.7589, + "step": 3455 + }, + { + "epoch": 2.2511385816525697, + "grad_norm": 1.204150676727295, + "learning_rate": 2.951731081623742e-05, + "loss": 0.7662, + "step": 3460 + }, + { + "epoch": 2.254391672088484, + "grad_norm": 1.6271796226501465, + "learning_rate": 2.946653425272247e-05, + "loss": 0.7821, + "step": 3465 + }, + { + "epoch": 2.2576447625243983, + "grad_norm": 1.0852000713348389, + "learning_rate": 2.9415738649105963e-05, + "loss": 0.7408, + "step": 3470 + }, + { + "epoch": 2.260897852960312, + "grad_norm": 1.0353608131408691, + "learning_rate": 2.9364924221921185e-05, + "loss": 0.7478, + "step": 3475 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 1.881262183189392, + "learning_rate": 2.9314091187781715e-05, + "loss": 0.7584, + "step": 3480 + }, + { + "epoch": 2.2674040338321406, + "grad_norm": 1.2990703582763672, + "learning_rate": 2.9263239763380412e-05, + "loss": 0.7566, + "step": 3485 + }, + { + "epoch": 2.2706571242680544, + "grad_norm": 0.9985173940658569, + "learning_rate": 2.921237016548854e-05, + "loss": 0.7676, + "step": 3490 + }, + { + "epoch": 2.2739102147039687, + "grad_norm": 0.9522629976272583, + "learning_rate": 2.9161482610954842e-05, + "loss": 0.7475, + "step": 3495 + }, + { + "epoch": 2.277163305139883, + "grad_norm": 0.9219643473625183, + "learning_rate": 2.9110577316704602e-05, + "loss": 0.7613, + "step": 3500 + }, + { + "epoch": 2.280416395575797, + "grad_norm": 0.9594421982765198, + "learning_rate": 2.905965449973871e-05, + "loss": 0.768, + "step": 3505 + }, + { + "epoch": 2.283669486011711, + "grad_norm": 1.0452098846435547, + "learning_rate": 2.900871437713279e-05, + "loss": 0.7699, + "step": 3510 + }, + { + "epoch": 2.2869225764476253, + "grad_norm": 0.9670342803001404, + "learning_rate": 2.8957757166036193e-05, + "loss": 0.7573, + "step": 3515 + }, + { + "epoch": 2.290175666883539, + "grad_norm": 1.147403597831726, + "learning_rate": 2.890678308367115e-05, + "loss": 0.7688, + "step": 3520 + }, + { + "epoch": 2.2934287573194534, + "grad_norm": 1.086470603942871, + "learning_rate": 2.8855792347331793e-05, + "loss": 0.7671, + "step": 3525 + }, + { + "epoch": 2.2966818477553677, + "grad_norm": 1.6733858585357666, + "learning_rate": 2.8804785174383248e-05, + "loss": 0.7753, + "step": 3530 + }, + { + "epoch": 2.2999349381912815, + "grad_norm": 1.0693230628967285, + "learning_rate": 2.8753761782260723e-05, + "loss": 0.7457, + "step": 3535 + }, + { + "epoch": 2.3031880286271957, + "grad_norm": 1.079010009765625, + "learning_rate": 2.8702722388468546e-05, + "loss": 0.7701, + "step": 3540 + }, + { + "epoch": 2.30644111906311, + "grad_norm": 0.9620556235313416, + "learning_rate": 2.8651667210579257e-05, + "loss": 0.759, + "step": 3545 + }, + { + "epoch": 2.3096942094990243, + "grad_norm": 1.1349847316741943, + "learning_rate": 2.8600596466232715e-05, + "loss": 0.7776, + "step": 3550 + }, + { + "epoch": 2.312947299934938, + "grad_norm": 1.4847538471221924, + "learning_rate": 2.8549510373135092e-05, + "loss": 0.7566, + "step": 3555 + }, + { + "epoch": 2.3162003903708523, + "grad_norm": 1.657256007194519, + "learning_rate": 2.8498409149058008e-05, + "loss": 0.762, + "step": 3560 + }, + { + "epoch": 2.3194534808067666, + "grad_norm": 1.0619240999221802, + "learning_rate": 2.8447293011837596e-05, + "loss": 0.771, + "step": 3565 + }, + { + "epoch": 2.3227065712426804, + "grad_norm": 0.8844910264015198, + "learning_rate": 2.8396162179373535e-05, + "loss": 0.7573, + "step": 3570 + }, + { + "epoch": 2.3259596616785947, + "grad_norm": 1.3543357849121094, + "learning_rate": 2.8345016869628175e-05, + "loss": 0.7736, + "step": 3575 + }, + { + "epoch": 2.329212752114509, + "grad_norm": 0.9610804319381714, + "learning_rate": 2.8293857300625555e-05, + "loss": 0.7536, + "step": 3580 + }, + { + "epoch": 2.3324658425504228, + "grad_norm": 1.2407771348953247, + "learning_rate": 2.8242683690450518e-05, + "loss": 0.7584, + "step": 3585 + }, + { + "epoch": 2.335718932986337, + "grad_norm": 1.388168215751648, + "learning_rate": 2.8191496257247764e-05, + "loss": 0.7426, + "step": 3590 + }, + { + "epoch": 2.3389720234222513, + "grad_norm": 1.1140729188919067, + "learning_rate": 2.814029521922088e-05, + "loss": 0.7418, + "step": 3595 + }, + { + "epoch": 2.342225113858165, + "grad_norm": 1.0877522230148315, + "learning_rate": 2.8089080794631512e-05, + "loss": 0.7531, + "step": 3600 + }, + { + "epoch": 2.3454782042940794, + "grad_norm": 1.0917423963546753, + "learning_rate": 2.803785320179832e-05, + "loss": 0.7435, + "step": 3605 + }, + { + "epoch": 2.3487312947299936, + "grad_norm": 1.3571592569351196, + "learning_rate": 2.7986612659096113e-05, + "loss": 0.7594, + "step": 3610 + }, + { + "epoch": 2.3519843851659075, + "grad_norm": 1.0520139932632446, + "learning_rate": 2.7935359384954914e-05, + "loss": 0.758, + "step": 3615 + }, + { + "epoch": 2.3552374756018217, + "grad_norm": 1.271592617034912, + "learning_rate": 2.7884093597858996e-05, + "loss": 0.7457, + "step": 3620 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 0.9961024522781372, + "learning_rate": 2.783281551634599e-05, + "loss": 0.7626, + "step": 3625 + }, + { + "epoch": 2.36174365647365, + "grad_norm": 1.3508564233779907, + "learning_rate": 2.7781525359005943e-05, + "loss": 0.734, + "step": 3630 + }, + { + "epoch": 2.364996746909564, + "grad_norm": 1.0961614847183228, + "learning_rate": 2.7730223344480348e-05, + "loss": 0.7553, + "step": 3635 + }, + { + "epoch": 2.3682498373454783, + "grad_norm": 1.032395839691162, + "learning_rate": 2.7678909691461274e-05, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 2.371502927781392, + "grad_norm": 1.1500605344772339, + "learning_rate": 2.7627584618690394e-05, + "loss": 0.7539, + "step": 3645 + }, + { + "epoch": 2.3747560182173064, + "grad_norm": 1.0203113555908203, + "learning_rate": 2.7576248344958054e-05, + "loss": 0.7771, + "step": 3650 + }, + { + "epoch": 2.3780091086532207, + "grad_norm": 2.247779607772827, + "learning_rate": 2.7524901089102358e-05, + "loss": 0.764, + "step": 3655 + }, + { + "epoch": 2.3812621990891345, + "grad_norm": 1.131200909614563, + "learning_rate": 2.7473543070008213e-05, + "loss": 0.742, + "step": 3660 + }, + { + "epoch": 2.3845152895250488, + "grad_norm": 1.2509359121322632, + "learning_rate": 2.7422174506606413e-05, + "loss": 0.7461, + "step": 3665 + }, + { + "epoch": 2.387768379960963, + "grad_norm": 0.864366352558136, + "learning_rate": 2.737079561787272e-05, + "loss": 0.7405, + "step": 3670 + }, + { + "epoch": 2.391021470396877, + "grad_norm": 0.9416084885597229, + "learning_rate": 2.7319406622826878e-05, + "loss": 0.7439, + "step": 3675 + }, + { + "epoch": 2.394274560832791, + "grad_norm": 1.7094473838806152, + "learning_rate": 2.726800774053173e-05, + "loss": 0.7698, + "step": 3680 + }, + { + "epoch": 2.3975276512687054, + "grad_norm": 0.9964091777801514, + "learning_rate": 2.7216599190092273e-05, + "loss": 0.7536, + "step": 3685 + }, + { + "epoch": 2.4007807417046196, + "grad_norm": 1.1519944667816162, + "learning_rate": 2.7165181190654702e-05, + "loss": 0.7459, + "step": 3690 + }, + { + "epoch": 2.4040338321405335, + "grad_norm": 1.2240533828735352, + "learning_rate": 2.7113753961405515e-05, + "loss": 0.7434, + "step": 3695 + }, + { + "epoch": 2.4072869225764477, + "grad_norm": 1.122253656387329, + "learning_rate": 2.7062317721570512e-05, + "loss": 0.7471, + "step": 3700 + }, + { + "epoch": 2.410540013012362, + "grad_norm": 1.0433543920516968, + "learning_rate": 2.7010872690413956e-05, + "loss": 0.7429, + "step": 3705 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 1.092159628868103, + "learning_rate": 2.6959419087237553e-05, + "loss": 0.7506, + "step": 3710 + }, + { + "epoch": 2.41704619388419, + "grad_norm": 0.9082927107810974, + "learning_rate": 2.6907957131379553e-05, + "loss": 0.7666, + "step": 3715 + }, + { + "epoch": 2.4202992843201043, + "grad_norm": 0.8798219561576843, + "learning_rate": 2.6856487042213822e-05, + "loss": 0.7637, + "step": 3720 + }, + { + "epoch": 2.423552374756018, + "grad_norm": 0.8654388189315796, + "learning_rate": 2.6805009039148897e-05, + "loss": 0.7541, + "step": 3725 + }, + { + "epoch": 2.4268054651919324, + "grad_norm": 1.0439229011535645, + "learning_rate": 2.675352334162704e-05, + "loss": 0.7618, + "step": 3730 + }, + { + "epoch": 2.4300585556278467, + "grad_norm": 0.9634140729904175, + "learning_rate": 2.6702030169123316e-05, + "loss": 0.737, + "step": 3735 + }, + { + "epoch": 2.4333116460637605, + "grad_norm": 0.8647895455360413, + "learning_rate": 2.6650529741144665e-05, + "loss": 0.7485, + "step": 3740 + }, + { + "epoch": 2.4365647364996748, + "grad_norm": 1.984215259552002, + "learning_rate": 2.6599022277228948e-05, + "loss": 0.7541, + "step": 3745 + }, + { + "epoch": 2.439817826935589, + "grad_norm": 1.074607014656067, + "learning_rate": 2.6547507996944022e-05, + "loss": 0.7595, + "step": 3750 + }, + { + "epoch": 2.443070917371503, + "grad_norm": 0.9121082425117493, + "learning_rate": 2.649598711988679e-05, + "loss": 0.7741, + "step": 3755 + }, + { + "epoch": 2.446324007807417, + "grad_norm": 1.6042678356170654, + "learning_rate": 2.6444459865682297e-05, + "loss": 0.7699, + "step": 3760 + }, + { + "epoch": 2.4495770982433314, + "grad_norm": 0.9366397857666016, + "learning_rate": 2.6392926453982748e-05, + "loss": 0.7525, + "step": 3765 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 1.0728055238723755, + "learning_rate": 2.6341387104466612e-05, + "loss": 0.749, + "step": 3770 + }, + { + "epoch": 2.4560832791151594, + "grad_norm": 0.988258957862854, + "learning_rate": 2.6289842036837675e-05, + "loss": 0.7563, + "step": 3775 + }, + { + "epoch": 2.4593363695510737, + "grad_norm": 1.2626458406448364, + "learning_rate": 2.6238291470824085e-05, + "loss": 0.7367, + "step": 3780 + }, + { + "epoch": 2.4625894599869875, + "grad_norm": 0.8835701942443848, + "learning_rate": 2.6186735626177428e-05, + "loss": 0.7534, + "step": 3785 + }, + { + "epoch": 2.465842550422902, + "grad_norm": 0.8948650360107422, + "learning_rate": 2.6135174722671813e-05, + "loss": 0.7975, + "step": 3790 + }, + { + "epoch": 2.469095640858816, + "grad_norm": 1.0557647943496704, + "learning_rate": 2.608360898010288e-05, + "loss": 0.7542, + "step": 3795 + }, + { + "epoch": 2.47234873129473, + "grad_norm": 1.1379538774490356, + "learning_rate": 2.603203861828693e-05, + "loss": 0.7569, + "step": 3800 + }, + { + "epoch": 2.475601821730644, + "grad_norm": 1.1298165321350098, + "learning_rate": 2.598046385705994e-05, + "loss": 0.7662, + "step": 3805 + }, + { + "epoch": 2.4788549121665584, + "grad_norm": 0.9936167001724243, + "learning_rate": 2.5928884916276635e-05, + "loss": 0.7427, + "step": 3810 + }, + { + "epoch": 2.482108002602472, + "grad_norm": 1.055421233177185, + "learning_rate": 2.5877302015809574e-05, + "loss": 0.741, + "step": 3815 + }, + { + "epoch": 2.4853610930383865, + "grad_norm": 1.0035120248794556, + "learning_rate": 2.5825715375548175e-05, + "loss": 0.7495, + "step": 3820 + }, + { + "epoch": 2.4886141834743007, + "grad_norm": 1.5768109560012817, + "learning_rate": 2.5774125215397815e-05, + "loss": 0.7677, + "step": 3825 + }, + { + "epoch": 2.4918672739102146, + "grad_norm": 1.1085072755813599, + "learning_rate": 2.5722531755278874e-05, + "loss": 0.7693, + "step": 3830 + }, + { + "epoch": 2.495120364346129, + "grad_norm": 0.9290764927864075, + "learning_rate": 2.567093521512578e-05, + "loss": 0.7734, + "step": 3835 + }, + { + "epoch": 2.498373454782043, + "grad_norm": 1.2003841400146484, + "learning_rate": 2.561933581488612e-05, + "loss": 0.7529, + "step": 3840 + }, + { + "epoch": 2.501626545217957, + "grad_norm": 0.9982072114944458, + "learning_rate": 2.556773377451965e-05, + "loss": 0.7555, + "step": 3845 + }, + { + "epoch": 2.504879635653871, + "grad_norm": 0.9454076886177063, + "learning_rate": 2.5516129313997388e-05, + "loss": 0.7726, + "step": 3850 + }, + { + "epoch": 2.5081327260897854, + "grad_norm": 0.9885278940200806, + "learning_rate": 2.5464522653300676e-05, + "loss": 0.7585, + "step": 3855 + }, + { + "epoch": 2.5113858165256993, + "grad_norm": 1.0617841482162476, + "learning_rate": 2.541291401242022e-05, + "loss": 0.7613, + "step": 3860 + }, + { + "epoch": 2.5146389069616135, + "grad_norm": 0.9445372223854065, + "learning_rate": 2.536130361135518e-05, + "loss": 0.7867, + "step": 3865 + }, + { + "epoch": 2.517891997397528, + "grad_norm": 1.2932319641113281, + "learning_rate": 2.5309691670112218e-05, + "loss": 0.7509, + "step": 3870 + }, + { + "epoch": 2.5211450878334416, + "grad_norm": 1.1702325344085693, + "learning_rate": 2.525807840870455e-05, + "loss": 0.7772, + "step": 3875 + }, + { + "epoch": 2.524398178269356, + "grad_norm": 1.0334542989730835, + "learning_rate": 2.5206464047151046e-05, + "loss": 0.7478, + "step": 3880 + }, + { + "epoch": 2.52765126870527, + "grad_norm": 2.0176279544830322, + "learning_rate": 2.5154848805475224e-05, + "loss": 0.759, + "step": 3885 + }, + { + "epoch": 2.530904359141184, + "grad_norm": 1.1288046836853027, + "learning_rate": 2.5103232903704393e-05, + "loss": 0.7529, + "step": 3890 + }, + { + "epoch": 2.534157449577098, + "grad_norm": 1.0248112678527832, + "learning_rate": 2.5051616561868663e-05, + "loss": 0.7748, + "step": 3895 + }, + { + "epoch": 2.5374105400130125, + "grad_norm": 0.8906844258308411, + "learning_rate": 2.5e-05, + "loss": 0.7369, + "step": 3900 + }, + { + "epoch": 2.5406636304489263, + "grad_norm": 1.1588047742843628, + "learning_rate": 2.4948383438131346e-05, + "loss": 0.7465, + "step": 3905 + }, + { + "epoch": 2.5439167208848406, + "grad_norm": 1.0166900157928467, + "learning_rate": 2.4896767096295613e-05, + "loss": 0.7576, + "step": 3910 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 1.0682686567306519, + "learning_rate": 2.484515119452478e-05, + "loss": 0.7884, + "step": 3915 + }, + { + "epoch": 2.5504229017566686, + "grad_norm": 0.9026442766189575, + "learning_rate": 2.4793535952848963e-05, + "loss": 0.7311, + "step": 3920 + }, + { + "epoch": 2.553675992192583, + "grad_norm": 0.8642654418945312, + "learning_rate": 2.4741921591295454e-05, + "loss": 0.7547, + "step": 3925 + }, + { + "epoch": 2.556929082628497, + "grad_norm": 1.1124982833862305, + "learning_rate": 2.4690308329887788e-05, + "loss": 0.7523, + "step": 3930 + }, + { + "epoch": 2.560182173064411, + "grad_norm": 1.664115309715271, + "learning_rate": 2.463869638864483e-05, + "loss": 0.7249, + "step": 3935 + }, + { + "epoch": 2.5634352635003252, + "grad_norm": 0.9926962852478027, + "learning_rate": 2.458708598757979e-05, + "loss": 0.7318, + "step": 3940 + }, + { + "epoch": 2.5666883539362395, + "grad_norm": 1.076627254486084, + "learning_rate": 2.4535477346699333e-05, + "loss": 0.7586, + "step": 3945 + }, + { + "epoch": 2.5699414443721533, + "grad_norm": 1.7046575546264648, + "learning_rate": 2.4483870686002625e-05, + "loss": 0.7482, + "step": 3950 + }, + { + "epoch": 2.5731945348080676, + "grad_norm": 1.0066241025924683, + "learning_rate": 2.443226622548036e-05, + "loss": 0.7636, + "step": 3955 + }, + { + "epoch": 2.576447625243982, + "grad_norm": 2.010552406311035, + "learning_rate": 2.4380664185113887e-05, + "loss": 0.7661, + "step": 3960 + }, + { + "epoch": 2.5797007156798957, + "grad_norm": 1.1133430004119873, + "learning_rate": 2.432906478487423e-05, + "loss": 0.7597, + "step": 3965 + }, + { + "epoch": 2.58295380611581, + "grad_norm": 1.1634178161621094, + "learning_rate": 2.427746824472113e-05, + "loss": 0.76, + "step": 3970 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 0.9780275821685791, + "learning_rate": 2.4225874784602184e-05, + "loss": 0.7688, + "step": 3975 + }, + { + "epoch": 2.589459986987638, + "grad_norm": 1.2186133861541748, + "learning_rate": 2.4174284624451824e-05, + "loss": 0.7309, + "step": 3980 + }, + { + "epoch": 2.5927130774235523, + "grad_norm": 0.9547963738441467, + "learning_rate": 2.4122697984190428e-05, + "loss": 0.7593, + "step": 3985 + }, + { + "epoch": 2.5959661678594665, + "grad_norm": 0.943261444568634, + "learning_rate": 2.4071115083723364e-05, + "loss": 0.7562, + "step": 3990 + }, + { + "epoch": 2.5992192582953804, + "grad_norm": 0.9355084896087646, + "learning_rate": 2.401953614294006e-05, + "loss": 0.7294, + "step": 3995 + }, + { + "epoch": 2.6024723487312946, + "grad_norm": 1.0167070627212524, + "learning_rate": 2.396796138171307e-05, + "loss": 0.7578, + "step": 4000 + }, + { + "epoch": 2.605725439167209, + "grad_norm": 0.9536129832267761, + "learning_rate": 2.391639101989712e-05, + "loss": 0.7363, + "step": 4005 + }, + { + "epoch": 2.6089785296031227, + "grad_norm": 0.9292064309120178, + "learning_rate": 2.3864825277328193e-05, + "loss": 0.7517, + "step": 4010 + }, + { + "epoch": 2.612231620039037, + "grad_norm": 1.1821918487548828, + "learning_rate": 2.3813264373822578e-05, + "loss": 0.7627, + "step": 4015 + }, + { + "epoch": 2.6154847104749512, + "grad_norm": 0.9278668165206909, + "learning_rate": 2.376170852917592e-05, + "loss": 0.7673, + "step": 4020 + }, + { + "epoch": 2.618737800910865, + "grad_norm": 0.9061160683631897, + "learning_rate": 2.3710157963162328e-05, + "loss": 0.774, + "step": 4025 + }, + { + "epoch": 2.6219908913467793, + "grad_norm": 1.2330580949783325, + "learning_rate": 2.3658612895533393e-05, + "loss": 0.7514, + "step": 4030 + }, + { + "epoch": 2.6252439817826936, + "grad_norm": 0.9609399437904358, + "learning_rate": 2.3607073546017258e-05, + "loss": 0.7373, + "step": 4035 + }, + { + "epoch": 2.6284970722186074, + "grad_norm": 1.5064210891723633, + "learning_rate": 2.3555540134317712e-05, + "loss": 0.7487, + "step": 4040 + }, + { + "epoch": 2.6317501626545217, + "grad_norm": 1.0178202390670776, + "learning_rate": 2.3504012880113216e-05, + "loss": 0.7789, + "step": 4045 + }, + { + "epoch": 2.635003253090436, + "grad_norm": 0.8506657481193542, + "learning_rate": 2.3452492003055984e-05, + "loss": 0.7316, + "step": 4050 + }, + { + "epoch": 2.63825634352635, + "grad_norm": 0.9458078145980835, + "learning_rate": 2.3400977722771058e-05, + "loss": 0.7703, + "step": 4055 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 1.1263021230697632, + "learning_rate": 2.3349470258855337e-05, + "loss": 0.7579, + "step": 4060 + }, + { + "epoch": 2.6447625243981783, + "grad_norm": 0.8372018933296204, + "learning_rate": 2.3297969830876686e-05, + "loss": 0.76, + "step": 4065 + }, + { + "epoch": 2.6480156148340925, + "grad_norm": 0.8701651692390442, + "learning_rate": 2.3246476658372973e-05, + "loss": 0.7476, + "step": 4070 + }, + { + "epoch": 2.6512687052700064, + "grad_norm": 1.3167948722839355, + "learning_rate": 2.3194990960851112e-05, + "loss": 0.7628, + "step": 4075 + }, + { + "epoch": 2.6545217957059206, + "grad_norm": 1.0400781631469727, + "learning_rate": 2.3143512957786184e-05, + "loss": 0.7773, + "step": 4080 + }, + { + "epoch": 2.657774886141835, + "grad_norm": 0.9622422456741333, + "learning_rate": 2.309204286862046e-05, + "loss": 0.7469, + "step": 4085 + }, + { + "epoch": 2.6610279765777487, + "grad_norm": 0.929834246635437, + "learning_rate": 2.3040580912762456e-05, + "loss": 0.7544, + "step": 4090 + }, + { + "epoch": 2.664281067013663, + "grad_norm": 1.018149495124817, + "learning_rate": 2.298912730958605e-05, + "loss": 0.7746, + "step": 4095 + }, + { + "epoch": 2.6675341574495772, + "grad_norm": 1.0057318210601807, + "learning_rate": 2.2937682278429494e-05, + "loss": 0.7352, + "step": 4100 + }, + { + "epoch": 2.6707872478854915, + "grad_norm": 0.9973504543304443, + "learning_rate": 2.288624603859449e-05, + "loss": 0.721, + "step": 4105 + }, + { + "epoch": 2.6740403383214053, + "grad_norm": 1.0883572101593018, + "learning_rate": 2.2834818809345297e-05, + "loss": 0.7474, + "step": 4110 + }, + { + "epoch": 2.6772934287573196, + "grad_norm": 1.337254524230957, + "learning_rate": 2.2783400809907726e-05, + "loss": 0.7701, + "step": 4115 + }, + { + "epoch": 2.680546519193234, + "grad_norm": 1.1612261533737183, + "learning_rate": 2.2731992259468272e-05, + "loss": 0.7547, + "step": 4120 + }, + { + "epoch": 2.6837996096291477, + "grad_norm": 1.0043455362319946, + "learning_rate": 2.2680593377173124e-05, + "loss": 0.7576, + "step": 4125 + }, + { + "epoch": 2.687052700065062, + "grad_norm": 1.180498719215393, + "learning_rate": 2.2629204382127284e-05, + "loss": 0.7533, + "step": 4130 + }, + { + "epoch": 2.690305790500976, + "grad_norm": 1.0349406003952026, + "learning_rate": 2.257782549339359e-05, + "loss": 0.7636, + "step": 4135 + }, + { + "epoch": 2.69355888093689, + "grad_norm": 1.073776125907898, + "learning_rate": 2.2526456929991793e-05, + "loss": 0.7718, + "step": 4140 + }, + { + "epoch": 2.6968119713728043, + "grad_norm": 1.114530324935913, + "learning_rate": 2.2475098910897645e-05, + "loss": 0.7445, + "step": 4145 + }, + { + "epoch": 2.7000650618087185, + "grad_norm": 0.9346311092376709, + "learning_rate": 2.2423751655041952e-05, + "loss": 0.7294, + "step": 4150 + }, + { + "epoch": 2.7033181522446323, + "grad_norm": 1.086501955986023, + "learning_rate": 2.237241538130961e-05, + "loss": 0.7507, + "step": 4155 + }, + { + "epoch": 2.7065712426805466, + "grad_norm": 0.9763929843902588, + "learning_rate": 2.2321090308538732e-05, + "loss": 0.743, + "step": 4160 + }, + { + "epoch": 2.709824333116461, + "grad_norm": 0.8880870938301086, + "learning_rate": 2.2269776655519658e-05, + "loss": 0.7418, + "step": 4165 + }, + { + "epoch": 2.7130774235523747, + "grad_norm": 0.9564589858055115, + "learning_rate": 2.2218474640994063e-05, + "loss": 0.765, + "step": 4170 + }, + { + "epoch": 2.716330513988289, + "grad_norm": 1.169952630996704, + "learning_rate": 2.2167184483654013e-05, + "loss": 0.7531, + "step": 4175 + }, + { + "epoch": 2.719583604424203, + "grad_norm": 0.9627036452293396, + "learning_rate": 2.211590640214101e-05, + "loss": 0.7623, + "step": 4180 + }, + { + "epoch": 2.722836694860117, + "grad_norm": 0.9291010499000549, + "learning_rate": 2.2064640615045092e-05, + "loss": 0.7641, + "step": 4185 + }, + { + "epoch": 2.7260897852960313, + "grad_norm": 1.0236008167266846, + "learning_rate": 2.2013387340903893e-05, + "loss": 0.7703, + "step": 4190 + }, + { + "epoch": 2.7293428757319456, + "grad_norm": 1.2711366415023804, + "learning_rate": 2.1962146798201684e-05, + "loss": 0.7454, + "step": 4195 + }, + { + "epoch": 2.7325959661678594, + "grad_norm": 1.1424434185028076, + "learning_rate": 2.191091920536849e-05, + "loss": 0.7559, + "step": 4200 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 1.4138892889022827, + "learning_rate": 2.1859704780779126e-05, + "loss": 0.7569, + "step": 4205 + }, + { + "epoch": 2.739102147039688, + "grad_norm": 0.967829704284668, + "learning_rate": 2.1808503742752252e-05, + "loss": 0.7432, + "step": 4210 + }, + { + "epoch": 2.7423552374756017, + "grad_norm": 0.8999619483947754, + "learning_rate": 2.175731630954949e-05, + "loss": 0.7457, + "step": 4215 + }, + { + "epoch": 2.745608327911516, + "grad_norm": 1.0657751560211182, + "learning_rate": 2.1706142699374454e-05, + "loss": 0.786, + "step": 4220 + }, + { + "epoch": 2.7488614183474303, + "grad_norm": 1.5017127990722656, + "learning_rate": 2.1654983130371837e-05, + "loss": 0.7516, + "step": 4225 + }, + { + "epoch": 2.752114508783344, + "grad_norm": 1.0914252996444702, + "learning_rate": 2.1603837820626478e-05, + "loss": 0.7616, + "step": 4230 + }, + { + "epoch": 2.7553675992192583, + "grad_norm": 1.1397154331207275, + "learning_rate": 2.1552706988162417e-05, + "loss": 0.761, + "step": 4235 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.162166714668274, + "learning_rate": 2.1501590850941994e-05, + "loss": 0.7353, + "step": 4240 + }, + { + "epoch": 2.7618737800910864, + "grad_norm": 1.0100218057632446, + "learning_rate": 2.1450489626864907e-05, + "loss": 0.7446, + "step": 4245 + }, + { + "epoch": 2.7651268705270007, + "grad_norm": 0.9108495116233826, + "learning_rate": 2.139940353376728e-05, + "loss": 0.7644, + "step": 4250 + }, + { + "epoch": 2.768379960962915, + "grad_norm": 0.9544759392738342, + "learning_rate": 2.134833278942074e-05, + "loss": 0.7693, + "step": 4255 + }, + { + "epoch": 2.7716330513988288, + "grad_norm": 1.6715203523635864, + "learning_rate": 2.1297277611531456e-05, + "loss": 0.764, + "step": 4260 + }, + { + "epoch": 2.774886141834743, + "grad_norm": 1.0044587850570679, + "learning_rate": 2.1246238217739283e-05, + "loss": 0.7593, + "step": 4265 + }, + { + "epoch": 2.7781392322706573, + "grad_norm": 0.9041277766227722, + "learning_rate": 2.119521482561675e-05, + "loss": 0.7427, + "step": 4270 + }, + { + "epoch": 2.781392322706571, + "grad_norm": 0.8890901803970337, + "learning_rate": 2.114420765266821e-05, + "loss": 0.7462, + "step": 4275 + }, + { + "epoch": 2.7846454131424854, + "grad_norm": 0.9522978663444519, + "learning_rate": 2.1093216916328855e-05, + "loss": 0.7398, + "step": 4280 + }, + { + "epoch": 2.7878985035783996, + "grad_norm": 1.2829575538635254, + "learning_rate": 2.104224283396381e-05, + "loss": 0.7632, + "step": 4285 + }, + { + "epoch": 2.7911515940143135, + "grad_norm": 0.9626341462135315, + "learning_rate": 2.0991285622867215e-05, + "loss": 0.7681, + "step": 4290 + }, + { + "epoch": 2.7944046844502277, + "grad_norm": 0.952867865562439, + "learning_rate": 2.0940345500261294e-05, + "loss": 0.7518, + "step": 4295 + }, + { + "epoch": 2.797657774886142, + "grad_norm": 1.0598902702331543, + "learning_rate": 2.0889422683295407e-05, + "loss": 0.7884, + "step": 4300 + }, + { + "epoch": 2.800910865322056, + "grad_norm": 1.0540211200714111, + "learning_rate": 2.083851738904516e-05, + "loss": 0.7518, + "step": 4305 + }, + { + "epoch": 2.80416395575797, + "grad_norm": 0.9470973014831543, + "learning_rate": 2.0787629834511466e-05, + "loss": 0.764, + "step": 4310 + }, + { + "epoch": 2.8074170461938843, + "grad_norm": 1.127659559249878, + "learning_rate": 2.0736760236619594e-05, + "loss": 0.7332, + "step": 4315 + }, + { + "epoch": 2.810670136629798, + "grad_norm": 1.0755411386489868, + "learning_rate": 2.0685908812218287e-05, + "loss": 0.7622, + "step": 4320 + }, + { + "epoch": 2.8139232270657124, + "grad_norm": 1.1209520101547241, + "learning_rate": 2.0635075778078817e-05, + "loss": 0.7416, + "step": 4325 + }, + { + "epoch": 2.8171763175016267, + "grad_norm": 1.0491728782653809, + "learning_rate": 2.0584261350894046e-05, + "loss": 0.7802, + "step": 4330 + }, + { + "epoch": 2.8204294079375405, + "grad_norm": 1.025694727897644, + "learning_rate": 2.0533465747277535e-05, + "loss": 0.7487, + "step": 4335 + }, + { + "epoch": 2.8236824983734548, + "grad_norm": 0.9486551880836487, + "learning_rate": 2.0482689183762588e-05, + "loss": 0.7594, + "step": 4340 + }, + { + "epoch": 2.826935588809369, + "grad_norm": 0.9839990139007568, + "learning_rate": 2.0431931876801352e-05, + "loss": 0.7431, + "step": 4345 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 1.0050575733184814, + "learning_rate": 2.03811940427639e-05, + "loss": 0.7527, + "step": 4350 + }, + { + "epoch": 2.833441769681197, + "grad_norm": 0.9743004441261292, + "learning_rate": 2.033047589793726e-05, + "loss": 0.7307, + "step": 4355 + }, + { + "epoch": 2.8366948601171114, + "grad_norm": 1.0488122701644897, + "learning_rate": 2.027977765852456e-05, + "loss": 0.7598, + "step": 4360 + }, + { + "epoch": 2.839947950553025, + "grad_norm": 1.074271321296692, + "learning_rate": 2.022909954064407e-05, + "loss": 0.7571, + "step": 4365 + }, + { + "epoch": 2.8432010409889394, + "grad_norm": 0.9306830167770386, + "learning_rate": 2.0178441760328268e-05, + "loss": 0.735, + "step": 4370 + }, + { + "epoch": 2.8464541314248537, + "grad_norm": 0.8995447754859924, + "learning_rate": 2.0127804533522948e-05, + "loss": 0.7519, + "step": 4375 + }, + { + "epoch": 2.8497072218607675, + "grad_norm": 0.9495101571083069, + "learning_rate": 2.0077188076086288e-05, + "loss": 0.7544, + "step": 4380 + }, + { + "epoch": 2.852960312296682, + "grad_norm": 1.3610079288482666, + "learning_rate": 2.002659260378794e-05, + "loss": 0.7573, + "step": 4385 + }, + { + "epoch": 2.856213402732596, + "grad_norm": 0.9668116569519043, + "learning_rate": 1.9976018332308077e-05, + "loss": 0.7332, + "step": 4390 + }, + { + "epoch": 2.85946649316851, + "grad_norm": 1.128670334815979, + "learning_rate": 1.992546547723651e-05, + "loss": 0.7512, + "step": 4395 + }, + { + "epoch": 2.862719583604424, + "grad_norm": 1.276426911354065, + "learning_rate": 1.987493425407176e-05, + "loss": 0.7449, + "step": 4400 + }, + { + "epoch": 2.8659726740403384, + "grad_norm": 0.9716594815254211, + "learning_rate": 1.982442487822011e-05, + "loss": 0.7432, + "step": 4405 + }, + { + "epoch": 2.869225764476252, + "grad_norm": 0.9533106088638306, + "learning_rate": 1.9773937564994745e-05, + "loss": 0.7423, + "step": 4410 + }, + { + "epoch": 2.8724788549121665, + "grad_norm": 1.0256469249725342, + "learning_rate": 1.972347252961479e-05, + "loss": 0.7614, + "step": 4415 + }, + { + "epoch": 2.8757319453480807, + "grad_norm": 1.1626900434494019, + "learning_rate": 1.967302998720438e-05, + "loss": 0.7392, + "step": 4420 + }, + { + "epoch": 2.8789850357839946, + "grad_norm": 0.9739611744880676, + "learning_rate": 1.9622610152791792e-05, + "loss": 0.7622, + "step": 4425 + }, + { + "epoch": 2.882238126219909, + "grad_norm": 1.0657685995101929, + "learning_rate": 1.9572213241308507e-05, + "loss": 0.7507, + "step": 4430 + }, + { + "epoch": 2.885491216655823, + "grad_norm": 1.029432773590088, + "learning_rate": 1.952183946758826e-05, + "loss": 0.7723, + "step": 4435 + }, + { + "epoch": 2.888744307091737, + "grad_norm": 1.1281373500823975, + "learning_rate": 1.9471489046366185e-05, + "loss": 0.7479, + "step": 4440 + }, + { + "epoch": 2.891997397527651, + "grad_norm": 1.1470041275024414, + "learning_rate": 1.942116219227784e-05, + "loss": 0.7341, + "step": 4445 + }, + { + "epoch": 2.8952504879635654, + "grad_norm": 1.0326032638549805, + "learning_rate": 1.937085911985834e-05, + "loss": 0.7571, + "step": 4450 + }, + { + "epoch": 2.8985035783994793, + "grad_norm": 0.9806135296821594, + "learning_rate": 1.9320580043541425e-05, + "loss": 0.734, + "step": 4455 + }, + { + "epoch": 2.9017566688353935, + "grad_norm": 1.063024878501892, + "learning_rate": 1.9270325177658523e-05, + "loss": 0.7521, + "step": 4460 + }, + { + "epoch": 2.905009759271308, + "grad_norm": 4.5842156410217285, + "learning_rate": 1.922009473643787e-05, + "loss": 0.7563, + "step": 4465 + }, + { + "epoch": 2.9082628497072216, + "grad_norm": 1.3341448307037354, + "learning_rate": 1.9169888934003598e-05, + "loss": 0.7528, + "step": 4470 + }, + { + "epoch": 2.911515940143136, + "grad_norm": 1.3391072750091553, + "learning_rate": 1.9119707984374774e-05, + "loss": 0.737, + "step": 4475 + }, + { + "epoch": 2.91476903057905, + "grad_norm": 0.985970139503479, + "learning_rate": 1.9069552101464552e-05, + "loss": 0.7657, + "step": 4480 + }, + { + "epoch": 2.918022121014964, + "grad_norm": 1.069992184638977, + "learning_rate": 1.901942149907922e-05, + "loss": 0.7526, + "step": 4485 + }, + { + "epoch": 2.921275211450878, + "grad_norm": 0.8812434077262878, + "learning_rate": 1.8969316390917288e-05, + "loss": 0.7664, + "step": 4490 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 1.2932692766189575, + "learning_rate": 1.891923699056861e-05, + "loss": 0.7553, + "step": 4495 + }, + { + "epoch": 2.9277813923227067, + "grad_norm": 0.935070276260376, + "learning_rate": 1.886918351151343e-05, + "loss": 0.7583, + "step": 4500 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 0.9840937852859497, + "learning_rate": 1.881915616712151e-05, + "loss": 0.748, + "step": 4505 + }, + { + "epoch": 2.934287573194535, + "grad_norm": 1.0583505630493164, + "learning_rate": 1.8769155170651203e-05, + "loss": 0.7482, + "step": 4510 + }, + { + "epoch": 2.937540663630449, + "grad_norm": 1.0253130197525024, + "learning_rate": 1.8719180735248522e-05, + "loss": 0.751, + "step": 4515 + }, + { + "epoch": 2.940793754066363, + "grad_norm": 1.0491794347763062, + "learning_rate": 1.8669233073946303e-05, + "loss": 0.7533, + "step": 4520 + }, + { + "epoch": 2.944046844502277, + "grad_norm": 1.1201449632644653, + "learning_rate": 1.86193123996632e-05, + "loss": 0.7486, + "step": 4525 + }, + { + "epoch": 2.9472999349381914, + "grad_norm": 1.3683768510818481, + "learning_rate": 1.856941892520284e-05, + "loss": 0.7584, + "step": 4530 + }, + { + "epoch": 2.9505530253741052, + "grad_norm": 1.0555903911590576, + "learning_rate": 1.851955286325292e-05, + "loss": 0.7554, + "step": 4535 + }, + { + "epoch": 2.9538061158100195, + "grad_norm": 1.5055445432662964, + "learning_rate": 1.846971442638426e-05, + "loss": 0.7418, + "step": 4540 + }, + { + "epoch": 2.9570592062459338, + "grad_norm": 1.222474455833435, + "learning_rate": 1.841990382704993e-05, + "loss": 0.7455, + "step": 4545 + }, + { + "epoch": 2.960312296681848, + "grad_norm": 1.0359810590744019, + "learning_rate": 1.8370121277584325e-05, + "loss": 0.7404, + "step": 4550 + }, + { + "epoch": 2.963565387117762, + "grad_norm": 1.2511727809906006, + "learning_rate": 1.8320366990202276e-05, + "loss": 0.7228, + "step": 4555 + }, + { + "epoch": 2.966818477553676, + "grad_norm": 0.8730882406234741, + "learning_rate": 1.827064117699814e-05, + "loss": 0.7586, + "step": 4560 + }, + { + "epoch": 2.9700715679895904, + "grad_norm": 1.5805312395095825, + "learning_rate": 1.822094404994487e-05, + "loss": 0.7499, + "step": 4565 + }, + { + "epoch": 2.973324658425504, + "grad_norm": 1.1607098579406738, + "learning_rate": 1.817127582089317e-05, + "loss": 0.7637, + "step": 4570 + }, + { + "epoch": 2.9765777488614185, + "grad_norm": 0.9193926453590393, + "learning_rate": 1.8121636701570537e-05, + "loss": 0.7532, + "step": 4575 + }, + { + "epoch": 2.9798308392973327, + "grad_norm": 1.0218764543533325, + "learning_rate": 1.807202690358037e-05, + "loss": 0.7503, + "step": 4580 + }, + { + "epoch": 2.9830839297332465, + "grad_norm": 1.0876221656799316, + "learning_rate": 1.802244663840109e-05, + "loss": 0.7707, + "step": 4585 + }, + { + "epoch": 2.986337020169161, + "grad_norm": 1.0459486246109009, + "learning_rate": 1.797289611738523e-05, + "loss": 0.7397, + "step": 4590 + }, + { + "epoch": 2.989590110605075, + "grad_norm": 1.0498055219650269, + "learning_rate": 1.7923375551758505e-05, + "loss": 0.7691, + "step": 4595 + }, + { + "epoch": 2.992843201040989, + "grad_norm": 0.9780749082565308, + "learning_rate": 1.7873885152618956e-05, + "loss": 0.7525, + "step": 4600 + }, + { + "epoch": 2.996096291476903, + "grad_norm": 1.0338603258132935, + "learning_rate": 1.7824425130936023e-05, + "loss": 0.7459, + "step": 4605 + }, + { + "epoch": 2.9993493819128174, + "grad_norm": 0.9098593592643738, + "learning_rate": 1.7774995697549645e-05, + "loss": 0.7488, + "step": 4610 + }, + { + "epoch": 3.0, + "eval_f1": 0.8012369099843738, + "eval_loss": 0.45166015625, + "eval_precision": 0.8020338050069477, + "eval_recall": 0.8006626052475169, + "eval_runtime": 238.3932, + "eval_samples_per_second": 1650.361, + "eval_steps_per_second": 1.615, + "step": 4611 + }, + { + "epoch": 3.0026024723487312, + "grad_norm": 1.3282872438430786, + "learning_rate": 1.7725597063169386e-05, + "loss": 0.6622, + "step": 4615 + }, + { + "epoch": 3.0058555627846455, + "grad_norm": 1.3152724504470825, + "learning_rate": 1.767622943837349e-05, + "loss": 0.6352, + "step": 4620 + }, + { + "epoch": 3.0091086532205593, + "grad_norm": 1.105705976486206, + "learning_rate": 1.7626893033608038e-05, + "loss": 0.6291, + "step": 4625 + }, + { + "epoch": 3.0123617436564736, + "grad_norm": 1.0462555885314941, + "learning_rate": 1.7577588059186027e-05, + "loss": 0.6476, + "step": 4630 + }, + { + "epoch": 3.015614834092388, + "grad_norm": 1.0921547412872314, + "learning_rate": 1.7528314725286443e-05, + "loss": 0.6358, + "step": 4635 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.1877232789993286, + "learning_rate": 1.747907324195342e-05, + "loss": 0.6434, + "step": 4640 + }, + { + "epoch": 3.022121014964216, + "grad_norm": 1.1791988611221313, + "learning_rate": 1.7429863819095313e-05, + "loss": 0.6372, + "step": 4645 + }, + { + "epoch": 3.02537410540013, + "grad_norm": 1.23057222366333, + "learning_rate": 1.738068666648379e-05, + "loss": 0.6521, + "step": 4650 + }, + { + "epoch": 3.0286271958360445, + "grad_norm": 1.0966289043426514, + "learning_rate": 1.7331541993752993e-05, + "loss": 0.6337, + "step": 4655 + }, + { + "epoch": 3.0318802862719583, + "grad_norm": 1.108396291732788, + "learning_rate": 1.7282430010398577e-05, + "loss": 0.6394, + "step": 4660 + }, + { + "epoch": 3.0351333767078725, + "grad_norm": 1.2432180643081665, + "learning_rate": 1.723335092577686e-05, + "loss": 0.6319, + "step": 4665 + }, + { + "epoch": 3.038386467143787, + "grad_norm": 1.5450379848480225, + "learning_rate": 1.718430494910391e-05, + "loss": 0.632, + "step": 4670 + }, + { + "epoch": 3.0416395575797006, + "grad_norm": 1.3607127666473389, + "learning_rate": 1.713529228945466e-05, + "loss": 0.6608, + "step": 4675 + }, + { + "epoch": 3.044892648015615, + "grad_norm": 1.0697190761566162, + "learning_rate": 1.7086313155762046e-05, + "loss": 0.6263, + "step": 4680 + }, + { + "epoch": 3.048145738451529, + "grad_norm": 1.3838845491409302, + "learning_rate": 1.703736775681604e-05, + "loss": 0.6367, + "step": 4685 + }, + { + "epoch": 3.051398828887443, + "grad_norm": 1.324628233909607, + "learning_rate": 1.6988456301262854e-05, + "loss": 0.6435, + "step": 4690 + }, + { + "epoch": 3.0546519193233572, + "grad_norm": 1.2009634971618652, + "learning_rate": 1.6939578997603983e-05, + "loss": 0.6467, + "step": 4695 + }, + { + "epoch": 3.0579050097592715, + "grad_norm": 1.2275351285934448, + "learning_rate": 1.689073605419533e-05, + "loss": 0.6403, + "step": 4700 + }, + { + "epoch": 3.0611581001951853, + "grad_norm": 1.9216879606246948, + "learning_rate": 1.6841927679246345e-05, + "loss": 0.6186, + "step": 4705 + }, + { + "epoch": 3.0644111906310996, + "grad_norm": 2.3563551902770996, + "learning_rate": 1.679315408081911e-05, + "loss": 0.6202, + "step": 4710 + }, + { + "epoch": 3.067664281067014, + "grad_norm": 1.435333490371704, + "learning_rate": 1.6744415466827463e-05, + "loss": 0.6273, + "step": 4715 + }, + { + "epoch": 3.0709173715029277, + "grad_norm": 1.315987229347229, + "learning_rate": 1.6695712045036104e-05, + "loss": 0.6318, + "step": 4720 + }, + { + "epoch": 3.074170461938842, + "grad_norm": 1.5982025861740112, + "learning_rate": 1.6647044023059712e-05, + "loss": 0.6384, + "step": 4725 + }, + { + "epoch": 3.077423552374756, + "grad_norm": 1.998374104499817, + "learning_rate": 1.659841160836207e-05, + "loss": 0.6286, + "step": 4730 + }, + { + "epoch": 3.08067664281067, + "grad_norm": 1.3811148405075073, + "learning_rate": 1.6549815008255176e-05, + "loss": 0.6482, + "step": 4735 + }, + { + "epoch": 3.0839297332465843, + "grad_norm": 1.2464516162872314, + "learning_rate": 1.6501254429898343e-05, + "loss": 0.6433, + "step": 4740 + }, + { + "epoch": 3.0871828236824985, + "grad_norm": 1.2944623231887817, + "learning_rate": 1.6452730080297342e-05, + "loss": 0.6328, + "step": 4745 + }, + { + "epoch": 3.0904359141184123, + "grad_norm": 1.1027922630310059, + "learning_rate": 1.6404242166303507e-05, + "loss": 0.6357, + "step": 4750 + }, + { + "epoch": 3.0936890045543266, + "grad_norm": 3.5568132400512695, + "learning_rate": 1.6355790894612834e-05, + "loss": 0.6081, + "step": 4755 + }, + { + "epoch": 3.096942094990241, + "grad_norm": 1.588714838027954, + "learning_rate": 1.630737647176514e-05, + "loss": 0.6601, + "step": 4760 + }, + { + "epoch": 3.1001951854261547, + "grad_norm": 1.1922274827957153, + "learning_rate": 1.6258999104143157e-05, + "loss": 0.6145, + "step": 4765 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 1.3667454719543457, + "learning_rate": 1.621065899797165e-05, + "loss": 0.6372, + "step": 4770 + }, + { + "epoch": 3.106701366297983, + "grad_norm": 1.8918445110321045, + "learning_rate": 1.616235635931655e-05, + "loss": 0.6152, + "step": 4775 + }, + { + "epoch": 3.109954456733897, + "grad_norm": 1.293562650680542, + "learning_rate": 1.611409139408406e-05, + "loss": 0.6211, + "step": 4780 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 1.446754813194275, + "learning_rate": 1.6065864308019807e-05, + "loss": 0.6453, + "step": 4785 + }, + { + "epoch": 3.1164606376057256, + "grad_norm": 1.1851979494094849, + "learning_rate": 1.6017675306707926e-05, + "loss": 0.631, + "step": 4790 + }, + { + "epoch": 3.1197137280416394, + "grad_norm": 1.3031965494155884, + "learning_rate": 1.5969524595570216e-05, + "loss": 0.6184, + "step": 4795 + }, + { + "epoch": 3.1229668184775536, + "grad_norm": 2.6355156898498535, + "learning_rate": 1.5921412379865257e-05, + "loss": 0.6451, + "step": 4800 + }, + { + "epoch": 3.126219908913468, + "grad_norm": 1.4367573261260986, + "learning_rate": 1.58733388646875e-05, + "loss": 0.6466, + "step": 4805 + }, + { + "epoch": 3.1294729993493817, + "grad_norm": 1.4838011264801025, + "learning_rate": 1.5825304254966445e-05, + "loss": 0.6181, + "step": 4810 + }, + { + "epoch": 3.132726089785296, + "grad_norm": 1.2338780164718628, + "learning_rate": 1.577730875546575e-05, + "loss": 0.6179, + "step": 4815 + }, + { + "epoch": 3.1359791802212102, + "grad_norm": 1.4179608821868896, + "learning_rate": 1.5729352570782324e-05, + "loss": 0.6362, + "step": 4820 + }, + { + "epoch": 3.139232270657124, + "grad_norm": 1.2671458721160889, + "learning_rate": 1.5681435905345522e-05, + "loss": 0.6365, + "step": 4825 + }, + { + "epoch": 3.1424853610930383, + "grad_norm": 1.368369221687317, + "learning_rate": 1.5643131164122626e-05, + "loss": 0.6102, + "step": 4830 + }, + { + "epoch": 3.1457384515289526, + "grad_norm": 1.341280460357666, + "learning_rate": 1.5595286147953364e-05, + "loss": 0.637, + "step": 4835 + }, + { + "epoch": 3.1489915419648664, + "grad_norm": 1.5806121826171875, + "learning_rate": 1.5547481222533846e-05, + "loss": 0.6296, + "step": 4840 + }, + { + "epoch": 3.1522446324007807, + "grad_norm": 1.505342721939087, + "learning_rate": 1.549971659164861e-05, + "loss": 0.6284, + "step": 4845 + }, + { + "epoch": 3.155497722836695, + "grad_norm": 1.2677946090698242, + "learning_rate": 1.5451992458910442e-05, + "loss": 0.6134, + "step": 4850 + }, + { + "epoch": 3.1587508132726088, + "grad_norm": 1.2727744579315186, + "learning_rate": 1.540430902775946e-05, + "loss": 0.626, + "step": 4855 + }, + { + "epoch": 3.162003903708523, + "grad_norm": 1.258187174797058, + "learning_rate": 1.5356666501462314e-05, + "loss": 0.6085, + "step": 4860 + }, + { + "epoch": 3.1652569941444373, + "grad_norm": 1.589736819267273, + "learning_rate": 1.5309065083111255e-05, + "loss": 0.6247, + "step": 4865 + }, + { + "epoch": 3.168510084580351, + "grad_norm": 1.2900131940841675, + "learning_rate": 1.5261504975623306e-05, + "loss": 0.624, + "step": 4870 + }, + { + "epoch": 3.1717631750162654, + "grad_norm": 2.3252532482147217, + "learning_rate": 1.5213986381739393e-05, + "loss": 0.6295, + "step": 4875 + }, + { + "epoch": 3.1750162654521796, + "grad_norm": 1.3652303218841553, + "learning_rate": 1.5166509504023473e-05, + "loss": 0.6274, + "step": 4880 + }, + { + "epoch": 3.178269355888094, + "grad_norm": 1.8075648546218872, + "learning_rate": 1.5119074544861678e-05, + "loss": 0.6375, + "step": 4885 + }, + { + "epoch": 3.1815224463240077, + "grad_norm": 1.2221382856369019, + "learning_rate": 1.5071681706461438e-05, + "loss": 0.6273, + "step": 4890 + }, + { + "epoch": 3.184775536759922, + "grad_norm": 1.5147900581359863, + "learning_rate": 1.5024331190850637e-05, + "loss": 0.6381, + "step": 4895 + }, + { + "epoch": 3.1880286271958362, + "grad_norm": 2.4453020095825195, + "learning_rate": 1.4977023199876743e-05, + "loss": 0.6552, + "step": 4900 + }, + { + "epoch": 3.19128171763175, + "grad_norm": 2.3050053119659424, + "learning_rate": 1.4929757935205951e-05, + "loss": 0.6176, + "step": 4905 + }, + { + "epoch": 3.1945348080676643, + "grad_norm": 1.289581060409546, + "learning_rate": 1.4882535598322311e-05, + "loss": 0.6253, + "step": 4910 + }, + { + "epoch": 3.1977878985035786, + "grad_norm": 1.5076651573181152, + "learning_rate": 1.4835356390526888e-05, + "loss": 0.6194, + "step": 4915 + }, + { + "epoch": 3.2010409889394924, + "grad_norm": 1.4202001094818115, + "learning_rate": 1.478822051293689e-05, + "loss": 0.6081, + "step": 4920 + }, + { + "epoch": 3.2042940793754067, + "grad_norm": 1.287611961364746, + "learning_rate": 1.4741128166484824e-05, + "loss": 0.6429, + "step": 4925 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.2236043214797974, + "learning_rate": 1.4694079551917629e-05, + "loss": 0.6176, + "step": 4930 + }, + { + "epoch": 3.2108002602472347, + "grad_norm": 1.3410075902938843, + "learning_rate": 1.4656472282003922e-05, + "loss": 0.6209, + "step": 4935 + }, + { + "epoch": 3.214053350683149, + "grad_norm": 1.419541835784912, + "learning_rate": 1.4609502890116145e-05, + "loss": 0.6436, + "step": 4940 + }, + { + "epoch": 3.2173064411190633, + "grad_norm": 1.7478810548782349, + "learning_rate": 1.4562577791210158e-05, + "loss": 0.6023, + "step": 4945 + }, + { + "epoch": 3.220559531554977, + "grad_norm": 1.8083374500274658, + "learning_rate": 1.4515697185319946e-05, + "loss": 0.6166, + "step": 4950 + }, + { + "epoch": 3.2238126219908914, + "grad_norm": 2.203806161880493, + "learning_rate": 1.4468861272289818e-05, + "loss": 0.636, + "step": 4955 + }, + { + "epoch": 3.2270657124268056, + "grad_norm": 1.3574259281158447, + "learning_rate": 1.4422070251773594e-05, + "loss": 0.6012, + "step": 4960 + }, + { + "epoch": 3.2303188028627194, + "grad_norm": 1.4441782236099243, + "learning_rate": 1.4375324323233697e-05, + "loss": 0.6197, + "step": 4965 + }, + { + "epoch": 3.2335718932986337, + "grad_norm": 1.7502111196517944, + "learning_rate": 1.4328623685940335e-05, + "loss": 0.6354, + "step": 4970 + }, + { + "epoch": 3.236824983734548, + "grad_norm": 1.5651460886001587, + "learning_rate": 1.4281968538970646e-05, + "loss": 0.6257, + "step": 4975 + }, + { + "epoch": 3.240078074170462, + "grad_norm": 1.3271369934082031, + "learning_rate": 1.4235359081207871e-05, + "loss": 0.6378, + "step": 4980 + }, + { + "epoch": 3.243331164606376, + "grad_norm": 1.354906678199768, + "learning_rate": 1.4188795511340461e-05, + "loss": 0.6324, + "step": 4985 + }, + { + "epoch": 3.2465842550422903, + "grad_norm": 1.295578956604004, + "learning_rate": 1.4142278027861253e-05, + "loss": 0.6176, + "step": 4990 + }, + { + "epoch": 3.249837345478204, + "grad_norm": 1.4495329856872559, + "learning_rate": 1.4095806829066655e-05, + "loss": 0.6387, + "step": 4995 + }, + { + "epoch": 3.2530904359141184, + "grad_norm": 1.3459370136260986, + "learning_rate": 1.404938211305574e-05, + "loss": 0.6343, + "step": 5000 + }, + { + "epoch": 3.2563435263500327, + "grad_norm": 1.299459457397461, + "learning_rate": 1.4003004077729438e-05, + "loss": 0.6394, + "step": 5005 + }, + { + "epoch": 3.2595966167859465, + "grad_norm": 1.3181241750717163, + "learning_rate": 1.3956672920789705e-05, + "loss": 0.6135, + "step": 5010 + }, + { + "epoch": 3.2628497072218607, + "grad_norm": 1.5811583995819092, + "learning_rate": 1.3910388839738647e-05, + "loss": 0.6377, + "step": 5015 + }, + { + "epoch": 3.266102797657775, + "grad_norm": 1.3512473106384277, + "learning_rate": 1.386415203187768e-05, + "loss": 0.6293, + "step": 5020 + }, + { + "epoch": 3.269355888093689, + "grad_norm": 1.8290486335754395, + "learning_rate": 1.3817962694306747e-05, + "loss": 0.635, + "step": 5025 + }, + { + "epoch": 3.272608978529603, + "grad_norm": 1.5076416730880737, + "learning_rate": 1.3771821023923383e-05, + "loss": 0.6027, + "step": 5030 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 1.5753469467163086, + "learning_rate": 1.3725727217421947e-05, + "loss": 0.6165, + "step": 5035 + }, + { + "epoch": 3.279115159401431, + "grad_norm": 1.5028088092803955, + "learning_rate": 1.3679681471292776e-05, + "loss": 0.621, + "step": 5040 + }, + { + "epoch": 3.2823682498373454, + "grad_norm": 1.4654455184936523, + "learning_rate": 1.363368398182131e-05, + "loss": 0.6266, + "step": 5045 + }, + { + "epoch": 3.2856213402732597, + "grad_norm": 1.7276520729064941, + "learning_rate": 1.3587734945087277e-05, + "loss": 0.6258, + "step": 5050 + }, + { + "epoch": 3.288874430709174, + "grad_norm": 1.710095763206482, + "learning_rate": 1.3541834556963895e-05, + "loss": 0.6388, + "step": 5055 + }, + { + "epoch": 3.2921275211450878, + "grad_norm": 1.6146140098571777, + "learning_rate": 1.3495983013116953e-05, + "loss": 0.6466, + "step": 5060 + }, + { + "epoch": 3.295380611581002, + "grad_norm": 1.3169276714324951, + "learning_rate": 1.3450180509004066e-05, + "loss": 0.6389, + "step": 5065 + }, + { + "epoch": 3.2986337020169163, + "grad_norm": 2.564819574356079, + "learning_rate": 1.3404427239873763e-05, + "loss": 0.6158, + "step": 5070 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 1.6384319067001343, + "learning_rate": 1.335872340076474e-05, + "loss": 0.6241, + "step": 5075 + }, + { + "epoch": 3.3051398828887444, + "grad_norm": 1.4620628356933594, + "learning_rate": 1.3313069186504929e-05, + "loss": 0.6203, + "step": 5080 + }, + { + "epoch": 3.3083929733246586, + "grad_norm": 1.7426296472549438, + "learning_rate": 1.3267464791710747e-05, + "loss": 0.6238, + "step": 5085 + }, + { + "epoch": 3.3116460637605725, + "grad_norm": 2.093579053878784, + "learning_rate": 1.3221910410786248e-05, + "loss": 0.6144, + "step": 5090 + }, + { + "epoch": 3.3148991541964867, + "grad_norm": 1.4141899347305298, + "learning_rate": 1.3176406237922262e-05, + "loss": 0.6145, + "step": 5095 + }, + { + "epoch": 3.318152244632401, + "grad_norm": 1.2416197061538696, + "learning_rate": 1.3130952467095593e-05, + "loss": 0.6134, + "step": 5100 + }, + { + "epoch": 3.321405335068315, + "grad_norm": 1.6651731729507446, + "learning_rate": 1.3085549292068213e-05, + "loss": 0.6366, + "step": 5105 + }, + { + "epoch": 3.324658425504229, + "grad_norm": 1.4123419523239136, + "learning_rate": 1.3040196906386392e-05, + "loss": 0.6363, + "step": 5110 + }, + { + "epoch": 3.3279115159401433, + "grad_norm": 1.5788094997406006, + "learning_rate": 1.2994895503379886e-05, + "loss": 0.6463, + "step": 5115 + }, + { + "epoch": 3.331164606376057, + "grad_norm": 1.9464671611785889, + "learning_rate": 1.2949645276161149e-05, + "loss": 0.6193, + "step": 5120 + }, + { + "epoch": 3.3344176968119714, + "grad_norm": 1.3868358135223389, + "learning_rate": 1.2904446417624457e-05, + "loss": 0.6182, + "step": 5125 + }, + { + "epoch": 3.3376707872478857, + "grad_norm": 7.827129364013672, + "learning_rate": 1.2859299120445107e-05, + "loss": 0.615, + "step": 5130 + }, + { + "epoch": 3.3409238776837995, + "grad_norm": 1.3248870372772217, + "learning_rate": 1.2814203577078626e-05, + "loss": 0.6286, + "step": 5135 + }, + { + "epoch": 3.3441769681197138, + "grad_norm": 1.3587925434112549, + "learning_rate": 1.2769159979759899e-05, + "loss": 0.6285, + "step": 5140 + }, + { + "epoch": 3.347430058555628, + "grad_norm": 1.518294095993042, + "learning_rate": 1.2724168520502371e-05, + "loss": 0.6304, + "step": 5145 + }, + { + "epoch": 3.350683148991542, + "grad_norm": 1.2859338521957397, + "learning_rate": 1.2679229391097241e-05, + "loss": 0.6299, + "step": 5150 + }, + { + "epoch": 3.353936239427456, + "grad_norm": 1.3024553060531616, + "learning_rate": 1.2634342783112646e-05, + "loss": 0.6177, + "step": 5155 + }, + { + "epoch": 3.3571893298633704, + "grad_norm": 3.6768040657043457, + "learning_rate": 1.258950888789281e-05, + "loss": 0.6385, + "step": 5160 + }, + { + "epoch": 3.360442420299284, + "grad_norm": 1.476014256477356, + "learning_rate": 1.2544727896557257e-05, + "loss": 0.6313, + "step": 5165 + }, + { + "epoch": 3.3636955107351985, + "grad_norm": 2.193185806274414, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.6386, + "step": 5170 + }, + { + "epoch": 3.3669486011711127, + "grad_norm": 1.4634368419647217, + "learning_rate": 1.2455325388888726e-05, + "loss": 0.617, + "step": 5175 + }, + { + "epoch": 3.3702016916070265, + "grad_norm": 1.770553708076477, + "learning_rate": 1.2410704253663932e-05, + "loss": 0.637, + "step": 5180 + }, + { + "epoch": 3.373454782042941, + "grad_norm": 1.7664306163787842, + "learning_rate": 1.236613678453821e-05, + "loss": 0.6203, + "step": 5185 + }, + { + "epoch": 3.376707872478855, + "grad_norm": 1.4499051570892334, + "learning_rate": 1.232162317149535e-05, + "loss": 0.6417, + "step": 5190 + }, + { + "epoch": 3.379960962914769, + "grad_norm": 2.710038661956787, + "learning_rate": 1.2277163604289558e-05, + "loss": 0.6246, + "step": 5195 + }, + { + "epoch": 3.383214053350683, + "grad_norm": 1.9992517232894897, + "learning_rate": 1.2232758272444672e-05, + "loss": 0.6188, + "step": 5200 + }, + { + "epoch": 3.3864671437865974, + "grad_norm": 1.1757420301437378, + "learning_rate": 1.2188407365253337e-05, + "loss": 0.6232, + "step": 5205 + }, + { + "epoch": 3.3897202342225112, + "grad_norm": 1.3049498796463013, + "learning_rate": 1.2144111071776174e-05, + "loss": 0.6314, + "step": 5210 + }, + { + "epoch": 3.3929733246584255, + "grad_norm": 1.2970354557037354, + "learning_rate": 1.209986958084099e-05, + "loss": 0.6361, + "step": 5215 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 1.4407247304916382, + "learning_rate": 1.205568308104201e-05, + "loss": 0.6246, + "step": 5220 + }, + { + "epoch": 3.3994795055302536, + "grad_norm": 1.673065185546875, + "learning_rate": 1.2011551760739014e-05, + "loss": 0.6318, + "step": 5225 + }, + { + "epoch": 3.402732595966168, + "grad_norm": 1.4697465896606445, + "learning_rate": 1.196747580805656e-05, + "loss": 0.6417, + "step": 5230 + }, + { + "epoch": 3.405985686402082, + "grad_norm": 1.6552962064743042, + "learning_rate": 1.1923455410883212e-05, + "loss": 0.6343, + "step": 5235 + }, + { + "epoch": 3.409238776837996, + "grad_norm": 1.5813676118850708, + "learning_rate": 1.1879490756870674e-05, + "loss": 0.6352, + "step": 5240 + }, + { + "epoch": 3.41249186727391, + "grad_norm": 3.213158130645752, + "learning_rate": 1.1835582033433037e-05, + "loss": 0.6352, + "step": 5245 + }, + { + "epoch": 3.4157449577098244, + "grad_norm": 1.2842360734939575, + "learning_rate": 1.1791729427745992e-05, + "loss": 0.6416, + "step": 5250 + }, + { + "epoch": 3.4189980481457383, + "grad_norm": 1.6811124086380005, + "learning_rate": 1.1747933126745983e-05, + "loss": 0.651, + "step": 5255 + }, + { + "epoch": 3.4222511385816525, + "grad_norm": 1.2236487865447998, + "learning_rate": 1.170419331712943e-05, + "loss": 0.641, + "step": 5260 + }, + { + "epoch": 3.425504229017567, + "grad_norm": 1.3968175649642944, + "learning_rate": 1.1660510185351978e-05, + "loss": 0.6271, + "step": 5265 + }, + { + "epoch": 3.4287573194534806, + "grad_norm": 2.152369976043701, + "learning_rate": 1.161688391762763e-05, + "loss": 0.633, + "step": 5270 + }, + { + "epoch": 3.432010409889395, + "grad_norm": 1.5563530921936035, + "learning_rate": 1.1573314699927985e-05, + "loss": 0.6429, + "step": 5275 + }, + { + "epoch": 3.435263500325309, + "grad_norm": 1.4173344373703003, + "learning_rate": 1.1529802717981475e-05, + "loss": 0.6344, + "step": 5280 + }, + { + "epoch": 3.438516590761223, + "grad_norm": 1.8149155378341675, + "learning_rate": 1.1486348157272526e-05, + "loss": 0.6278, + "step": 5285 + }, + { + "epoch": 3.441769681197137, + "grad_norm": 1.4700722694396973, + "learning_rate": 1.1442951203040775e-05, + "loss": 0.607, + "step": 5290 + }, + { + "epoch": 3.4450227716330515, + "grad_norm": 1.4950767755508423, + "learning_rate": 1.139961204028033e-05, + "loss": 0.6298, + "step": 5295 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.702974796295166, + "learning_rate": 1.1356330853738906e-05, + "loss": 0.6599, + "step": 5300 + }, + { + "epoch": 3.4515289525048796, + "grad_norm": 1.7694127559661865, + "learning_rate": 1.1313107827917083e-05, + "loss": 0.6235, + "step": 5305 + }, + { + "epoch": 3.454782042940794, + "grad_norm": 1.2292397022247314, + "learning_rate": 1.1269943147067535e-05, + "loss": 0.6264, + "step": 5310 + }, + { + "epoch": 3.4580351333767076, + "grad_norm": 1.3355427980422974, + "learning_rate": 1.1226836995194196e-05, + "loss": 0.6274, + "step": 5315 + }, + { + "epoch": 3.461288223812622, + "grad_norm": 1.313506841659546, + "learning_rate": 1.1183789556051508e-05, + "loss": 0.6075, + "step": 5320 + }, + { + "epoch": 3.464541314248536, + "grad_norm": 1.3950237035751343, + "learning_rate": 1.1140801013143618e-05, + "loss": 0.606, + "step": 5325 + }, + { + "epoch": 3.46779440468445, + "grad_norm": 1.4222460985183716, + "learning_rate": 1.1097871549723629e-05, + "loss": 0.6238, + "step": 5330 + }, + { + "epoch": 3.4710474951203643, + "grad_norm": 1.701815128326416, + "learning_rate": 1.1055001348792807e-05, + "loss": 0.6227, + "step": 5335 + }, + { + "epoch": 3.4743005855562785, + "grad_norm": 1.5569487810134888, + "learning_rate": 1.1012190593099744e-05, + "loss": 0.643, + "step": 5340 + }, + { + "epoch": 3.4775536759921923, + "grad_norm": 1.3712338209152222, + "learning_rate": 1.0969439465139687e-05, + "loss": 0.6167, + "step": 5345 + }, + { + "epoch": 3.4808067664281066, + "grad_norm": 1.3950178623199463, + "learning_rate": 1.0926748147153648e-05, + "loss": 0.6318, + "step": 5350 + }, + { + "epoch": 3.484059856864021, + "grad_norm": 1.347066044807434, + "learning_rate": 1.088411682112771e-05, + "loss": 0.6225, + "step": 5355 + }, + { + "epoch": 3.487312947299935, + "grad_norm": 1.347697138786316, + "learning_rate": 1.08415456687922e-05, + "loss": 0.6225, + "step": 5360 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 1.5315964221954346, + "learning_rate": 1.0799034871620958e-05, + "loss": 0.6067, + "step": 5365 + }, + { + "epoch": 3.493819128171763, + "grad_norm": 1.3384947776794434, + "learning_rate": 1.0756584610830523e-05, + "loss": 0.6235, + "step": 5370 + }, + { + "epoch": 3.4970722186076775, + "grad_norm": 1.3656494617462158, + "learning_rate": 1.071419506737937e-05, + "loss": 0.6347, + "step": 5375 + }, + { + "epoch": 3.5003253090435913, + "grad_norm": 1.3071860074996948, + "learning_rate": 1.0671866421967175e-05, + "loss": 0.6108, + "step": 5380 + }, + { + "epoch": 3.5035783994795056, + "grad_norm": 1.3579492568969727, + "learning_rate": 1.062959885503399e-05, + "loss": 0.6354, + "step": 5385 + }, + { + "epoch": 3.5068314899154194, + "grad_norm": 1.52472722530365, + "learning_rate": 1.0587392546759498e-05, + "loss": 0.6177, + "step": 5390 + }, + { + "epoch": 3.5100845803513336, + "grad_norm": 1.7216352224349976, + "learning_rate": 1.0545247677062273e-05, + "loss": 0.6225, + "step": 5395 + }, + { + "epoch": 3.513337670787248, + "grad_norm": 1.3169187307357788, + "learning_rate": 1.050316442559896e-05, + "loss": 0.6196, + "step": 5400 + }, + { + "epoch": 3.516590761223162, + "grad_norm": 1.7447690963745117, + "learning_rate": 1.0461142971763535e-05, + "loss": 0.6338, + "step": 5405 + }, + { + "epoch": 3.519843851659076, + "grad_norm": 1.4032801389694214, + "learning_rate": 1.0419183494686574e-05, + "loss": 0.6261, + "step": 5410 + }, + { + "epoch": 3.5230969420949902, + "grad_norm": 1.6217771768569946, + "learning_rate": 1.0377286173234416e-05, + "loss": 0.6306, + "step": 5415 + }, + { + "epoch": 3.5263500325309045, + "grad_norm": 1.2982110977172852, + "learning_rate": 1.0335451186008454e-05, + "loss": 0.6242, + "step": 5420 + }, + { + "epoch": 3.5296031229668183, + "grad_norm": 1.2958654165267944, + "learning_rate": 1.0293678711344382e-05, + "loss": 0.6292, + "step": 5425 + }, + { + "epoch": 3.5328562134027326, + "grad_norm": 1.7522900104522705, + "learning_rate": 1.0251968927311384e-05, + "loss": 0.6541, + "step": 5430 + }, + { + "epoch": 3.536109303838647, + "grad_norm": 1.435259222984314, + "learning_rate": 1.0210322011711408e-05, + "loss": 0.6064, + "step": 5435 + }, + { + "epoch": 3.5393623942745607, + "grad_norm": 1.3290374279022217, + "learning_rate": 1.0168738142078429e-05, + "loss": 0.6255, + "step": 5440 + }, + { + "epoch": 3.542615484710475, + "grad_norm": 1.3328436613082886, + "learning_rate": 1.012721749567764e-05, + "loss": 0.6006, + "step": 5445 + }, + { + "epoch": 3.545868575146389, + "grad_norm": 1.3372770547866821, + "learning_rate": 1.0085760249504728e-05, + "loss": 0.6194, + "step": 5450 + }, + { + "epoch": 3.5491216655823035, + "grad_norm": 1.7760313749313354, + "learning_rate": 1.0044366580285137e-05, + "loss": 0.6067, + "step": 5455 + }, + { + "epoch": 3.5523747560182173, + "grad_norm": 1.7420598268508911, + "learning_rate": 1.0003036664473267e-05, + "loss": 0.6071, + "step": 5460 + }, + { + "epoch": 3.5556278464541315, + "grad_norm": 1.498193621635437, + "learning_rate": 9.96177067825175e-06, + "loss": 0.6146, + "step": 5465 + }, + { + "epoch": 3.558880936890046, + "grad_norm": 1.8063032627105713, + "learning_rate": 9.920568797530716e-06, + "loss": 0.626, + "step": 5470 + }, + { + "epoch": 3.5621340273259596, + "grad_norm": 1.2613329887390137, + "learning_rate": 9.879431197947014e-06, + "loss": 0.6049, + "step": 5475 + }, + { + "epoch": 3.565387117761874, + "grad_norm": 1.34530770778656, + "learning_rate": 9.83835805486347e-06, + "loss": 0.6197, + "step": 5480 + }, + { + "epoch": 3.568640208197788, + "grad_norm": 1.9523491859436035, + "learning_rate": 9.797349543368128e-06, + "loss": 0.6342, + "step": 5485 + }, + { + "epoch": 3.571893298633702, + "grad_norm": 1.8784916400909424, + "learning_rate": 9.756405838273558e-06, + "loss": 0.64, + "step": 5490 + }, + { + "epoch": 3.5751463890696162, + "grad_norm": 1.5533080101013184, + "learning_rate": 9.715527114116035e-06, + "loss": 0.6243, + "step": 5495 + }, + { + "epoch": 3.5783994795055305, + "grad_norm": 1.385695219039917, + "learning_rate": 9.674713545154831e-06, + "loss": 0.6264, + "step": 5500 + }, + { + "epoch": 3.5816525699414443, + "grad_norm": 1.3538482189178467, + "learning_rate": 9.633965305371506e-06, + "loss": 0.621, + "step": 5505 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 1.6445493698120117, + "learning_rate": 9.5932825684691e-06, + "loss": 0.6239, + "step": 5510 + }, + { + "epoch": 3.588158750813273, + "grad_norm": 1.803451657295227, + "learning_rate": 9.552665507871428e-06, + "loss": 0.6311, + "step": 5515 + }, + { + "epoch": 3.5914118412491867, + "grad_norm": 1.3346718549728394, + "learning_rate": 9.51211429672236e-06, + "loss": 0.6396, + "step": 5520 + }, + { + "epoch": 3.594664931685101, + "grad_norm": 2.1071603298187256, + "learning_rate": 9.471629107885038e-06, + "loss": 0.6238, + "step": 5525 + }, + { + "epoch": 3.597918022121015, + "grad_norm": 1.4250411987304688, + "learning_rate": 9.431210113941169e-06, + "loss": 0.6063, + "step": 5530 + }, + { + "epoch": 3.601171112556929, + "grad_norm": 1.3815439939498901, + "learning_rate": 9.390857487190274e-06, + "loss": 0.5978, + "step": 5535 + }, + { + "epoch": 3.6044242029928433, + "grad_norm": 1.6549842357635498, + "learning_rate": 9.350571399648988e-06, + "loss": 0.6094, + "step": 5540 + }, + { + "epoch": 3.6076772934287575, + "grad_norm": 1.4034509658813477, + "learning_rate": 9.310352023050272e-06, + "loss": 0.6187, + "step": 5545 + }, + { + "epoch": 3.6109303838646714, + "grad_norm": 1.6350473165512085, + "learning_rate": 9.270199528842715e-06, + "loss": 0.6076, + "step": 5550 + }, + { + "epoch": 3.6141834743005856, + "grad_norm": 1.4474992752075195, + "learning_rate": 9.230114088189814e-06, + "loss": 0.6507, + "step": 5555 + }, + { + "epoch": 3.6174365647365, + "grad_norm": 1.4828194379806519, + "learning_rate": 9.19009587196921e-06, + "loss": 0.6264, + "step": 5560 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 1.7121607065200806, + "learning_rate": 9.150145050771972e-06, + "loss": 0.6383, + "step": 5565 + }, + { + "epoch": 3.623942745608328, + "grad_norm": 1.8459277153015137, + "learning_rate": 9.110261794901903e-06, + "loss": 0.6436, + "step": 5570 + }, + { + "epoch": 3.6271958360442422, + "grad_norm": 1.4332444667816162, + "learning_rate": 9.070446274374766e-06, + "loss": 0.6313, + "step": 5575 + }, + { + "epoch": 3.630448926480156, + "grad_norm": 1.2665612697601318, + "learning_rate": 9.030698658917566e-06, + "loss": 0.6003, + "step": 5580 + }, + { + "epoch": 3.6337020169160703, + "grad_norm": 1.5076160430908203, + "learning_rate": 8.99101911796788e-06, + "loss": 0.6203, + "step": 5585 + }, + { + "epoch": 3.6369551073519846, + "grad_norm": 1.567221999168396, + "learning_rate": 8.951407820673058e-06, + "loss": 0.6252, + "step": 5590 + }, + { + "epoch": 3.6402081977878984, + "grad_norm": 1.504109263420105, + "learning_rate": 8.911864935889544e-06, + "loss": 0.6332, + "step": 5595 + }, + { + "epoch": 3.6434612882238127, + "grad_norm": 1.6598913669586182, + "learning_rate": 8.872390632182175e-06, + "loss": 0.6258, + "step": 5600 + }, + { + "epoch": 3.646714378659727, + "grad_norm": 1.3711302280426025, + "learning_rate": 8.832985077823406e-06, + "loss": 0.6273, + "step": 5605 + }, + { + "epoch": 3.6499674690956407, + "grad_norm": 1.293453574180603, + "learning_rate": 8.793648440792654e-06, + "loss": 0.6041, + "step": 5610 + }, + { + "epoch": 3.653220559531555, + "grad_norm": 1.6621414422988892, + "learning_rate": 8.754380888775523e-06, + "loss": 0.6177, + "step": 5615 + }, + { + "epoch": 3.6564736499674693, + "grad_norm": 1.2931593656539917, + "learning_rate": 8.715182589163153e-06, + "loss": 0.6084, + "step": 5620 + }, + { + "epoch": 3.659726740403383, + "grad_norm": 1.4701381921768188, + "learning_rate": 8.676053709051446e-06, + "loss": 0.6235, + "step": 5625 + }, + { + "epoch": 3.6629798308392973, + "grad_norm": 2.272709369659424, + "learning_rate": 8.636994415240376e-06, + "loss": 0.6326, + "step": 5630 + }, + { + "epoch": 3.6662329212752116, + "grad_norm": 1.3057537078857422, + "learning_rate": 8.598004874233315e-06, + "loss": 0.616, + "step": 5635 + }, + { + "epoch": 3.6694860117111254, + "grad_norm": 1.6016069650650024, + "learning_rate": 8.559085252236259e-06, + "loss": 0.6126, + "step": 5640 + }, + { + "epoch": 3.6727391021470397, + "grad_norm": 1.38706636428833, + "learning_rate": 8.520235715157152e-06, + "loss": 0.6424, + "step": 5645 + }, + { + "epoch": 3.675992192582954, + "grad_norm": 1.403805136680603, + "learning_rate": 8.481456428605205e-06, + "loss": 0.6328, + "step": 5650 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 2.8022546768188477, + "learning_rate": 8.442747557890138e-06, + "loss": 0.6225, + "step": 5655 + }, + { + "epoch": 3.682498373454782, + "grad_norm": 1.2923667430877686, + "learning_rate": 8.404109268021493e-06, + "loss": 0.6068, + "step": 5660 + }, + { + "epoch": 3.6857514638906963, + "grad_norm": 1.327010154724121, + "learning_rate": 8.365541723707971e-06, + "loss": 0.6032, + "step": 5665 + }, + { + "epoch": 3.68900455432661, + "grad_norm": 3.022547960281372, + "learning_rate": 8.327045089356663e-06, + "loss": 0.6202, + "step": 5670 + }, + { + "epoch": 3.6922576447625244, + "grad_norm": 1.7190786600112915, + "learning_rate": 8.288619529072394e-06, + "loss": 0.6136, + "step": 5675 + }, + { + "epoch": 3.6955107351984386, + "grad_norm": 1.8883839845657349, + "learning_rate": 8.250265206657025e-06, + "loss": 0.626, + "step": 5680 + }, + { + "epoch": 3.6987638256343525, + "grad_norm": 1.216133952140808, + "learning_rate": 8.211982285608721e-06, + "loss": 0.6084, + "step": 5685 + }, + { + "epoch": 3.7020169160702667, + "grad_norm": 1.4318759441375732, + "learning_rate": 8.17377092912128e-06, + "loss": 0.6252, + "step": 5690 + }, + { + "epoch": 3.705270006506181, + "grad_norm": 1.3429824113845825, + "learning_rate": 8.135631300083448e-06, + "loss": 0.6421, + "step": 5695 + }, + { + "epoch": 3.708523096942095, + "grad_norm": 1.563573956489563, + "learning_rate": 8.097563561078193e-06, + "loss": 0.6426, + "step": 5700 + }, + { + "epoch": 3.711776187378009, + "grad_norm": 1.3186182975769043, + "learning_rate": 8.059567874382023e-06, + "loss": 0.6148, + "step": 5705 + }, + { + "epoch": 3.7150292778139233, + "grad_norm": 1.4381370544433594, + "learning_rate": 8.021644401964305e-06, + "loss": 0.6206, + "step": 5710 + }, + { + "epoch": 3.718282368249837, + "grad_norm": 1.6375632286071777, + "learning_rate": 7.983793305486583e-06, + "loss": 0.6169, + "step": 5715 + }, + { + "epoch": 3.7215354586857514, + "grad_norm": 1.426100730895996, + "learning_rate": 7.946014746301858e-06, + "loss": 0.6299, + "step": 5720 + }, + { + "epoch": 3.7247885491216657, + "grad_norm": 1.6016979217529297, + "learning_rate": 7.908308885453908e-06, + "loss": 0.6039, + "step": 5725 + }, + { + "epoch": 3.7280416395575795, + "grad_norm": 1.8250033855438232, + "learning_rate": 7.87067588367664e-06, + "loss": 0.6375, + "step": 5730 + }, + { + "epoch": 3.7312947299934938, + "grad_norm": 1.6048786640167236, + "learning_rate": 7.833115901393347e-06, + "loss": 0.6469, + "step": 5735 + }, + { + "epoch": 3.734547820429408, + "grad_norm": 1.473156213760376, + "learning_rate": 7.795629098716045e-06, + "loss": 0.6291, + "step": 5740 + }, + { + "epoch": 3.737800910865322, + "grad_norm": 1.4616464376449585, + "learning_rate": 7.758215635444848e-06, + "loss": 0.6418, + "step": 5745 + }, + { + "epoch": 3.741054001301236, + "grad_norm": 1.3316526412963867, + "learning_rate": 7.720875671067188e-06, + "loss": 0.6052, + "step": 5750 + }, + { + "epoch": 3.7443070917371504, + "grad_norm": 2.7276248931884766, + "learning_rate": 7.683609364757192e-06, + "loss": 0.6311, + "step": 5755 + }, + { + "epoch": 3.747560182173064, + "grad_norm": 1.4057763814926147, + "learning_rate": 7.646416875374992e-06, + "loss": 0.6262, + "step": 5760 + }, + { + "epoch": 3.7508132726089785, + "grad_norm": 1.7808401584625244, + "learning_rate": 7.609298361466083e-06, + "loss": 0.6372, + "step": 5765 + }, + { + "epoch": 3.7540663630448927, + "grad_norm": 1.5597418546676636, + "learning_rate": 7.572253981260571e-06, + "loss": 0.6181, + "step": 5770 + }, + { + "epoch": 3.7573194534808065, + "grad_norm": 1.6378741264343262, + "learning_rate": 7.535283892672562e-06, + "loss": 0.6247, + "step": 5775 + }, + { + "epoch": 3.760572543916721, + "grad_norm": 2.498858690261841, + "learning_rate": 7.498388253299482e-06, + "loss": 0.643, + "step": 5780 + }, + { + "epoch": 3.763825634352635, + "grad_norm": 1.9484217166900635, + "learning_rate": 7.46156722042137e-06, + "loss": 0.6223, + "step": 5785 + }, + { + "epoch": 3.767078724788549, + "grad_norm": 1.3782168626785278, + "learning_rate": 7.424820951000233e-06, + "loss": 0.6148, + "step": 5790 + }, + { + "epoch": 3.770331815224463, + "grad_norm": 1.3748527765274048, + "learning_rate": 7.388149601679392e-06, + "loss": 0.6242, + "step": 5795 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 1.4963568449020386, + "learning_rate": 7.351553328782779e-06, + "loss": 0.6014, + "step": 5800 + }, + { + "epoch": 3.7768379960962912, + "grad_norm": 1.708061695098877, + "learning_rate": 7.31503228831428e-06, + "loss": 0.6154, + "step": 5805 + }, + { + "epoch": 3.7800910865322055, + "grad_norm": 1.8436424732208252, + "learning_rate": 7.278586635957107e-06, + "loss": 0.6263, + "step": 5810 + }, + { + "epoch": 3.7833441769681198, + "grad_norm": 1.9801384210586548, + "learning_rate": 7.242216527073079e-06, + "loss": 0.5955, + "step": 5815 + }, + { + "epoch": 3.7865972674040336, + "grad_norm": 1.4177374839782715, + "learning_rate": 7.205922116701985e-06, + "loss": 0.6255, + "step": 5820 + }, + { + "epoch": 3.789850357839948, + "grad_norm": 1.4929031133651733, + "learning_rate": 7.169703559560953e-06, + "loss": 0.6046, + "step": 5825 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 2.4425814151763916, + "learning_rate": 7.133561010043724e-06, + "loss": 0.6072, + "step": 5830 + }, + { + "epoch": 3.796356538711776, + "grad_norm": 1.5860954523086548, + "learning_rate": 7.097494622220049e-06, + "loss": 0.6173, + "step": 5835 + }, + { + "epoch": 3.79960962914769, + "grad_norm": 1.4166280031204224, + "learning_rate": 7.0615045498350215e-06, + "loss": 0.5985, + "step": 5840 + }, + { + "epoch": 3.8028627195836044, + "grad_norm": 1.7926712036132812, + "learning_rate": 7.025590946308402e-06, + "loss": 0.6077, + "step": 5845 + }, + { + "epoch": 3.8061158100195187, + "grad_norm": 1.411357045173645, + "learning_rate": 6.9897539647339725e-06, + "loss": 0.6126, + "step": 5850 + }, + { + "epoch": 3.8093689004554325, + "grad_norm": 1.4378728866577148, + "learning_rate": 6.95399375787891e-06, + "loss": 0.6217, + "step": 5855 + }, + { + "epoch": 3.812621990891347, + "grad_norm": 1.630339503288269, + "learning_rate": 6.918310478183093e-06, + "loss": 0.6081, + "step": 5860 + }, + { + "epoch": 3.815875081327261, + "grad_norm": 1.4536669254302979, + "learning_rate": 6.882704277758475e-06, + "loss": 0.631, + "step": 5865 + }, + { + "epoch": 3.819128171763175, + "grad_norm": 1.369432806968689, + "learning_rate": 6.847175308388451e-06, + "loss": 0.6023, + "step": 5870 + }, + { + "epoch": 3.822381262199089, + "grad_norm": 1.8251979351043701, + "learning_rate": 6.811723721527161e-06, + "loss": 0.6088, + "step": 5875 + }, + { + "epoch": 3.8256343526350034, + "grad_norm": 1.4121100902557373, + "learning_rate": 6.776349668298912e-06, + "loss": 0.6393, + "step": 5880 + }, + { + "epoch": 3.828887443070917, + "grad_norm": 1.4803780317306519, + "learning_rate": 6.741053299497468e-06, + "loss": 0.601, + "step": 5885 + }, + { + "epoch": 3.8321405335068315, + "grad_norm": 1.5110501050949097, + "learning_rate": 6.705834765585459e-06, + "loss": 0.6299, + "step": 5890 + }, + { + "epoch": 3.8353936239427457, + "grad_norm": 1.8608803749084473, + "learning_rate": 6.670694216693701e-06, + "loss": 0.6394, + "step": 5895 + }, + { + "epoch": 3.8386467143786596, + "grad_norm": 1.4101976156234741, + "learning_rate": 6.635631802620576e-06, + "loss": 0.6149, + "step": 5900 + }, + { + "epoch": 3.841899804814574, + "grad_norm": 1.5235905647277832, + "learning_rate": 6.600647672831406e-06, + "loss": 0.6377, + "step": 5905 + }, + { + "epoch": 3.845152895250488, + "grad_norm": 2.4760963916778564, + "learning_rate": 6.565741976457782e-06, + "loss": 0.6315, + "step": 5910 + }, + { + "epoch": 3.8484059856864024, + "grad_norm": 1.4764820337295532, + "learning_rate": 6.530914862296947e-06, + "loss": 0.6148, + "step": 5915 + }, + { + "epoch": 3.851659076122316, + "grad_norm": 1.408517599105835, + "learning_rate": 6.496166478811164e-06, + "loss": 0.629, + "step": 5920 + }, + { + "epoch": 3.8549121665582304, + "grad_norm": 2.276674509048462, + "learning_rate": 6.461496974127093e-06, + "loss": 0.613, + "step": 5925 + }, + { + "epoch": 3.8581652569941447, + "grad_norm": 1.5643647909164429, + "learning_rate": 6.426906496035129e-06, + "loss": 0.6063, + "step": 5930 + }, + { + "epoch": 3.8614183474300585, + "grad_norm": 1.3531688451766968, + "learning_rate": 6.39239519198879e-06, + "loss": 0.6135, + "step": 5935 + }, + { + "epoch": 3.864671437865973, + "grad_norm": 1.4261928796768188, + "learning_rate": 6.357963209104106e-06, + "loss": 0.6206, + "step": 5940 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 1.3013157844543457, + "learning_rate": 6.32361069415896e-06, + "loss": 0.6153, + "step": 5945 + }, + { + "epoch": 3.871177618737801, + "grad_norm": 1.520578145980835, + "learning_rate": 6.289337793592468e-06, + "loss": 0.629, + "step": 5950 + }, + { + "epoch": 3.874430709173715, + "grad_norm": 1.5987921953201294, + "learning_rate": 6.255144653504382e-06, + "loss": 0.645, + "step": 5955 + }, + { + "epoch": 3.8776837996096294, + "grad_norm": 2.1227879524230957, + "learning_rate": 6.221031419654444e-06, + "loss": 0.6333, + "step": 5960 + }, + { + "epoch": 3.880936890045543, + "grad_norm": 1.5177706480026245, + "learning_rate": 6.1869982374617495e-06, + "loss": 0.629, + "step": 5965 + }, + { + "epoch": 3.8841899804814575, + "grad_norm": 1.3354036808013916, + "learning_rate": 6.153045252004177e-06, + "loss": 0.6055, + "step": 5970 + }, + { + "epoch": 3.8874430709173717, + "grad_norm": 1.8337645530700684, + "learning_rate": 6.119172608017718e-06, + "loss": 0.623, + "step": 5975 + }, + { + "epoch": 3.8906961613532856, + "grad_norm": 1.2876662015914917, + "learning_rate": 6.08538044989588e-06, + "loss": 0.6064, + "step": 5980 + }, + { + "epoch": 3.8939492517892, + "grad_norm": 1.3676327466964722, + "learning_rate": 6.051668921689094e-06, + "loss": 0.6219, + "step": 5985 + }, + { + "epoch": 3.897202342225114, + "grad_norm": 1.5804736614227295, + "learning_rate": 6.0180381671040596e-06, + "loss": 0.6135, + "step": 5990 + }, + { + "epoch": 3.900455432661028, + "grad_norm": 2.2858810424804688, + "learning_rate": 5.9844883295031515e-06, + "loss": 0.6393, + "step": 5995 + }, + { + "epoch": 3.903708523096942, + "grad_norm": 1.8066788911819458, + "learning_rate": 5.9510195519038245e-06, + "loss": 0.6056, + "step": 6000 + }, + { + "epoch": 3.9069616135328564, + "grad_norm": 1.3947362899780273, + "learning_rate": 5.917631976977975e-06, + "loss": 0.6138, + "step": 6005 + }, + { + "epoch": 3.9102147039687702, + "grad_norm": 1.551949381828308, + "learning_rate": 5.884325747051336e-06, + "loss": 0.614, + "step": 6010 + }, + { + "epoch": 3.9134677944046845, + "grad_norm": 1.3901867866516113, + "learning_rate": 5.851101004102907e-06, + "loss": 0.6375, + "step": 6015 + }, + { + "epoch": 3.9167208848405988, + "grad_norm": 1.4056464433670044, + "learning_rate": 5.817957889764308e-06, + "loss": 0.6141, + "step": 6020 + }, + { + "epoch": 3.9199739752765126, + "grad_norm": 1.499922752380371, + "learning_rate": 5.784896545319187e-06, + "loss": 0.6074, + "step": 6025 + }, + { + "epoch": 3.923227065712427, + "grad_norm": 1.2578163146972656, + "learning_rate": 5.751917111702612e-06, + "loss": 0.6143, + "step": 6030 + }, + { + "epoch": 3.926480156148341, + "grad_norm": 1.2877789735794067, + "learning_rate": 5.719019729500508e-06, + "loss": 0.5956, + "step": 6035 + }, + { + "epoch": 3.929733246584255, + "grad_norm": 1.576788067817688, + "learning_rate": 5.686204538948997e-06, + "loss": 0.6141, + "step": 6040 + }, + { + "epoch": 3.932986337020169, + "grad_norm": 1.8292930126190186, + "learning_rate": 5.653471679933839e-06, + "loss": 0.5909, + "step": 6045 + }, + { + "epoch": 3.9362394274560835, + "grad_norm": 1.5432319641113281, + "learning_rate": 5.62082129198985e-06, + "loss": 0.6199, + "step": 6050 + }, + { + "epoch": 3.9394925178919973, + "grad_norm": 1.739689826965332, + "learning_rate": 5.58825351430026e-06, + "loss": 0.6035, + "step": 6055 + }, + { + "epoch": 3.9427456083279115, + "grad_norm": 1.3205852508544922, + "learning_rate": 5.555768485696144e-06, + "loss": 0.6169, + "step": 6060 + }, + { + "epoch": 3.945998698763826, + "grad_norm": 1.6433742046356201, + "learning_rate": 5.523366344655856e-06, + "loss": 0.6404, + "step": 6065 + }, + { + "epoch": 3.9492517891997396, + "grad_norm": 1.6137924194335938, + "learning_rate": 5.491047229304397e-06, + "loss": 0.6219, + "step": 6070 + }, + { + "epoch": 3.952504879635654, + "grad_norm": 1.5387951135635376, + "learning_rate": 5.4588112774128314e-06, + "loss": 0.5937, + "step": 6075 + }, + { + "epoch": 3.955757970071568, + "grad_norm": 1.4663158655166626, + "learning_rate": 5.42665862639774e-06, + "loss": 0.6066, + "step": 6080 + }, + { + "epoch": 3.959011060507482, + "grad_norm": 4.082248210906982, + "learning_rate": 5.394589413320589e-06, + "loss": 0.6311, + "step": 6085 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.4563738107681274, + "learning_rate": 5.3626037748871565e-06, + "loss": 0.6142, + "step": 6090 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 1.569101095199585, + "learning_rate": 5.330701847446962e-06, + "loss": 0.6014, + "step": 6095 + }, + { + "epoch": 3.9687703318152243, + "grad_norm": 1.567270278930664, + "learning_rate": 5.29888376699269e-06, + "loss": 0.6155, + "step": 6100 + }, + { + "epoch": 3.9720234222511386, + "grad_norm": 1.668445110321045, + "learning_rate": 5.267149669159588e-06, + "loss": 0.6171, + "step": 6105 + }, + { + "epoch": 3.975276512687053, + "grad_norm": 1.7854609489440918, + "learning_rate": 5.235499689224885e-06, + "loss": 0.6135, + "step": 6110 + }, + { + "epoch": 3.9785296031229667, + "grad_norm": 1.8517600297927856, + "learning_rate": 5.203933962107266e-06, + "loss": 0.6207, + "step": 6115 + }, + { + "epoch": 3.981782693558881, + "grad_norm": 1.5116204023361206, + "learning_rate": 5.172452622366228e-06, + "loss": 0.614, + "step": 6120 + }, + { + "epoch": 3.985035783994795, + "grad_norm": 1.4917980432510376, + "learning_rate": 5.141055804201541e-06, + "loss": 0.6118, + "step": 6125 + }, + { + "epoch": 3.988288874430709, + "grad_norm": 1.527981162071228, + "learning_rate": 5.109743641452699e-06, + "loss": 0.6083, + "step": 6130 + }, + { + "epoch": 3.9915419648666233, + "grad_norm": 1.3188831806182861, + "learning_rate": 5.078516267598299e-06, + "loss": 0.6141, + "step": 6135 + }, + { + "epoch": 3.9947950553025375, + "grad_norm": 1.4134242534637451, + "learning_rate": 5.047373815755496e-06, + "loss": 0.6234, + "step": 6140 + }, + { + "epoch": 3.9980481457384514, + "grad_norm": 1.5778809785842896, + "learning_rate": 5.016316418679454e-06, + "loss": 0.6177, + "step": 6145 + }, + { + "epoch": 4.0, + "eval_f1": 0.7989837428748611, + "eval_loss": 0.491455078125, + "eval_precision": 0.7989192926261178, + "eval_recall": 0.7990541428374994, + "eval_runtime": 238.1189, + "eval_samples_per_second": 1652.263, + "eval_steps_per_second": 1.617, + "step": 6148 + }, + { + "epoch": 4.001301236174366, + "grad_norm": 1.376760721206665, + "learning_rate": 4.985344208762757e-06, + "loss": 0.5954, + "step": 6150 + }, + { + "epoch": 4.00455432661028, + "grad_norm": 1.2846732139587402, + "learning_rate": 4.954457318034841e-06, + "loss": 0.533, + "step": 6155 + }, + { + "epoch": 4.007807417046194, + "grad_norm": 1.16463303565979, + "learning_rate": 4.92365587816144e-06, + "loss": 0.533, + "step": 6160 + }, + { + "epoch": 4.011060507482108, + "grad_norm": 1.4882513284683228, + "learning_rate": 4.892940020444043e-06, + "loss": 0.5236, + "step": 6165 + }, + { + "epoch": 4.014313597918022, + "grad_norm": 3.275876998901367, + "learning_rate": 4.862309875819299e-06, + "loss": 0.5213, + "step": 6170 + }, + { + "epoch": 4.017566688353936, + "grad_norm": 1.5742096900939941, + "learning_rate": 4.837867561302392e-06, + "loss": 0.5295, + "step": 6175 + }, + { + "epoch": 4.020819778789851, + "grad_norm": 5.1677422523498535, + "learning_rate": 4.807392029038138e-06, + "loss": 0.5301, + "step": 6180 + }, + { + "epoch": 4.024072869225765, + "grad_norm": 1.7716647386550903, + "learning_rate": 4.77700257454356e-06, + "loss": 0.5366, + "step": 6185 + }, + { + "epoch": 4.027325959661678, + "grad_norm": 1.8003216981887817, + "learning_rate": 4.746699327363918e-06, + "loss": 0.5209, + "step": 6190 + }, + { + "epoch": 4.030579050097593, + "grad_norm": 1.7417036294937134, + "learning_rate": 4.7164824166769735e-06, + "loss": 0.5335, + "step": 6195 + }, + { + "epoch": 4.033832140533507, + "grad_norm": 1.7009021043777466, + "learning_rate": 4.686351971292443e-06, + "loss": 0.5222, + "step": 6200 + }, + { + "epoch": 4.037085230969421, + "grad_norm": 2.0051186084747314, + "learning_rate": 4.6563081196514786e-06, + "loss": 0.5516, + "step": 6205 + }, + { + "epoch": 4.040338321405335, + "grad_norm": 1.5723603963851929, + "learning_rate": 4.626350989826075e-06, + "loss": 0.5263, + "step": 6210 + }, + { + "epoch": 4.043591411841249, + "grad_norm": 1.8875335454940796, + "learning_rate": 4.596480709518547e-06, + "loss": 0.5346, + "step": 6215 + }, + { + "epoch": 4.046844502277163, + "grad_norm": 1.5543326139450073, + "learning_rate": 4.566697406061005e-06, + "loss": 0.5344, + "step": 6220 + }, + { + "epoch": 4.050097592713078, + "grad_norm": 1.6131196022033691, + "learning_rate": 4.53700120641477e-06, + "loss": 0.5318, + "step": 6225 + }, + { + "epoch": 4.053350683148992, + "grad_norm": 1.3502036333084106, + "learning_rate": 4.5073922371698554e-06, + "loss": 0.5234, + "step": 6230 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 2.2002179622650146, + "learning_rate": 4.4778706245444475e-06, + "loss": 0.5422, + "step": 6235 + }, + { + "epoch": 4.05985686402082, + "grad_norm": 1.62948477268219, + "learning_rate": 4.44843649438432e-06, + "loss": 0.5136, + "step": 6240 + }, + { + "epoch": 4.063109954456734, + "grad_norm": 1.563274621963501, + "learning_rate": 4.419089972162327e-06, + "loss": 0.5087, + "step": 6245 + }, + { + "epoch": 4.066363044892648, + "grad_norm": 1.5413563251495361, + "learning_rate": 4.389831182977882e-06, + "loss": 0.535, + "step": 6250 + }, + { + "epoch": 4.0696161353285625, + "grad_norm": 1.6265994310379028, + "learning_rate": 4.360660251556395e-06, + "loss": 0.5291, + "step": 6255 + }, + { + "epoch": 4.072869225764476, + "grad_norm": 1.6212644577026367, + "learning_rate": 4.331577302248746e-06, + "loss": 0.5165, + "step": 6260 + }, + { + "epoch": 4.07612231620039, + "grad_norm": 1.5618913173675537, + "learning_rate": 4.302582459030769e-06, + "loss": 0.5301, + "step": 6265 + }, + { + "epoch": 4.079375406636305, + "grad_norm": 1.7876514196395874, + "learning_rate": 4.273675845502722e-06, + "loss": 0.5282, + "step": 6270 + }, + { + "epoch": 4.082628497072219, + "grad_norm": 1.6155240535736084, + "learning_rate": 4.244857584888748e-06, + "loss": 0.5219, + "step": 6275 + }, + { + "epoch": 4.0858815875081325, + "grad_norm": 1.826150894165039, + "learning_rate": 4.2161278000363456e-06, + "loss": 0.5254, + "step": 6280 + }, + { + "epoch": 4.089134677944047, + "grad_norm": 1.569254755973816, + "learning_rate": 4.187486613415878e-06, + "loss": 0.5563, + "step": 6285 + }, + { + "epoch": 4.092387768379961, + "grad_norm": 1.651341438293457, + "learning_rate": 4.158934147120019e-06, + "loss": 0.5196, + "step": 6290 + }, + { + "epoch": 4.095640858815875, + "grad_norm": 1.960835337638855, + "learning_rate": 4.130470522863231e-06, + "loss": 0.5233, + "step": 6295 + }, + { + "epoch": 4.0988939492517895, + "grad_norm": 1.762459397315979, + "learning_rate": 4.102095861981275e-06, + "loss": 0.5101, + "step": 6300 + }, + { + "epoch": 4.102147039687703, + "grad_norm": 1.7269344329833984, + "learning_rate": 4.073810285430668e-06, + "loss": 0.5283, + "step": 6305 + }, + { + "epoch": 4.105400130123617, + "grad_norm": 2.420794725418091, + "learning_rate": 4.045613913788171e-06, + "loss": 0.5168, + "step": 6310 + }, + { + "epoch": 4.108653220559532, + "grad_norm": 1.5948150157928467, + "learning_rate": 4.0175068672502784e-06, + "loss": 0.535, + "step": 6315 + }, + { + "epoch": 4.111906310995446, + "grad_norm": 2.1127867698669434, + "learning_rate": 3.9894892656327235e-06, + "loss": 0.5181, + "step": 6320 + }, + { + "epoch": 4.1151594014313595, + "grad_norm": 2.1554746627807617, + "learning_rate": 3.961561228369928e-06, + "loss": 0.5314, + "step": 6325 + }, + { + "epoch": 4.118412491867274, + "grad_norm": 1.7790179252624512, + "learning_rate": 3.933722874514526e-06, + "loss": 0.5327, + "step": 6330 + }, + { + "epoch": 4.121665582303188, + "grad_norm": 1.5885546207427979, + "learning_rate": 3.905974322736849e-06, + "loss": 0.5221, + "step": 6335 + }, + { + "epoch": 4.124918672739102, + "grad_norm": 1.4991848468780518, + "learning_rate": 3.878315691324416e-06, + "loss": 0.5134, + "step": 6340 + }, + { + "epoch": 4.1281717631750166, + "grad_norm": 1.57703697681427, + "learning_rate": 3.850747098181421e-06, + "loss": 0.5239, + "step": 6345 + }, + { + "epoch": 4.13142485361093, + "grad_norm": 3.0852479934692383, + "learning_rate": 3.82326866082825e-06, + "loss": 0.5216, + "step": 6350 + }, + { + "epoch": 4.134677944046844, + "grad_norm": 1.6248340606689453, + "learning_rate": 3.7958804964009692e-06, + "loss": 0.5195, + "step": 6355 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.69948410987854, + "learning_rate": 3.7685827216508124e-06, + "loss": 0.507, + "step": 6360 + }, + { + "epoch": 4.141184124918673, + "grad_norm": 1.6397584676742554, + "learning_rate": 3.741375452943724e-06, + "loss": 0.5353, + "step": 6365 + }, + { + "epoch": 4.1444372153545865, + "grad_norm": 1.4918780326843262, + "learning_rate": 3.714258806259807e-06, + "loss": 0.5013, + "step": 6370 + }, + { + "epoch": 4.147690305790501, + "grad_norm": 2.1283321380615234, + "learning_rate": 3.6872328971928718e-06, + "loss": 0.5289, + "step": 6375 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 2.7849512100219727, + "learning_rate": 3.660297840949933e-06, + "loss": 0.5289, + "step": 6380 + }, + { + "epoch": 4.154196486662329, + "grad_norm": 1.7255409955978394, + "learning_rate": 3.633453752350707e-06, + "loss": 0.5174, + "step": 6385 + }, + { + "epoch": 4.157449577098244, + "grad_norm": 1.7871309518814087, + "learning_rate": 3.606700745827127e-06, + "loss": 0.5231, + "step": 6390 + }, + { + "epoch": 4.160702667534157, + "grad_norm": 1.5307867527008057, + "learning_rate": 3.5800389354228748e-06, + "loss": 0.524, + "step": 6395 + }, + { + "epoch": 4.163955757970071, + "grad_norm": 1.9164159297943115, + "learning_rate": 3.553468434792859e-06, + "loss": 0.5321, + "step": 6400 + }, + { + "epoch": 4.167208848405986, + "grad_norm": 1.539781093597412, + "learning_rate": 3.526989357202756e-06, + "loss": 0.5223, + "step": 6405 + }, + { + "epoch": 4.1704619388419, + "grad_norm": 1.5751947164535522, + "learning_rate": 3.5006018155285286e-06, + "loss": 0.5302, + "step": 6410 + }, + { + "epoch": 4.173715029277814, + "grad_norm": 1.7798151969909668, + "learning_rate": 3.4743059222559298e-06, + "loss": 0.5295, + "step": 6415 + }, + { + "epoch": 4.176968119713728, + "grad_norm": 2.035566568374634, + "learning_rate": 3.448101789480024e-06, + "loss": 0.5249, + "step": 6420 + }, + { + "epoch": 4.180221210149642, + "grad_norm": 1.6014204025268555, + "learning_rate": 3.4219895289047317e-06, + "loss": 0.5236, + "step": 6425 + }, + { + "epoch": 4.183474300585556, + "grad_norm": 1.9151594638824463, + "learning_rate": 3.395969251842329e-06, + "loss": 0.5146, + "step": 6430 + }, + { + "epoch": 4.186727391021471, + "grad_norm": 1.543568730354309, + "learning_rate": 3.3700410692129815e-06, + "loss": 0.518, + "step": 6435 + }, + { + "epoch": 4.189980481457384, + "grad_norm": 1.5983526706695557, + "learning_rate": 3.3442050915442615e-06, + "loss": 0.5047, + "step": 6440 + }, + { + "epoch": 4.193233571893298, + "grad_norm": 1.5908766984939575, + "learning_rate": 3.318461428970707e-06, + "loss": 0.5273, + "step": 6445 + }, + { + "epoch": 4.196486662329213, + "grad_norm": 1.7272975444793701, + "learning_rate": 3.2928101912333197e-06, + "loss": 0.5143, + "step": 6450 + }, + { + "epoch": 4.199739752765127, + "grad_norm": 1.6854057312011719, + "learning_rate": 3.2672514876791044e-06, + "loss": 0.5412, + "step": 6455 + }, + { + "epoch": 4.202992843201041, + "grad_norm": 1.7159767150878906, + "learning_rate": 3.2417854272606212e-06, + "loss": 0.5328, + "step": 6460 + }, + { + "epoch": 4.206245933636955, + "grad_norm": 2.0293431282043457, + "learning_rate": 3.2164121185355026e-06, + "loss": 0.5207, + "step": 6465 + }, + { + "epoch": 4.209499024072869, + "grad_norm": 1.4942529201507568, + "learning_rate": 3.1911316696659837e-06, + "loss": 0.5098, + "step": 6470 + }, + { + "epoch": 4.212752114508783, + "grad_norm": 1.5757249593734741, + "learning_rate": 3.165944188418474e-06, + "loss": 0.5075, + "step": 6475 + }, + { + "epoch": 4.216005204944698, + "grad_norm": 1.6114063262939453, + "learning_rate": 3.140849782163066e-06, + "loss": 0.5283, + "step": 6480 + }, + { + "epoch": 4.2192582953806115, + "grad_norm": 1.791574478149414, + "learning_rate": 3.1158485578730883e-06, + "loss": 0.5116, + "step": 6485 + }, + { + "epoch": 4.222511385816525, + "grad_norm": 1.4832271337509155, + "learning_rate": 3.090940622124644e-06, + "loss": 0.5187, + "step": 6490 + }, + { + "epoch": 4.22576447625244, + "grad_norm": 1.5384358167648315, + "learning_rate": 3.066126081096185e-06, + "loss": 0.5158, + "step": 6495 + }, + { + "epoch": 4.229017566688354, + "grad_norm": 1.766423225402832, + "learning_rate": 3.0414050405680155e-06, + "loss": 0.5196, + "step": 6500 + }, + { + "epoch": 4.232270657124268, + "grad_norm": 2.07438325881958, + "learning_rate": 3.016777605921861e-06, + "loss": 0.5062, + "step": 6505 + }, + { + "epoch": 4.235523747560182, + "grad_norm": 4.485304355621338, + "learning_rate": 2.9922438821404415e-06, + "loss": 0.4975, + "step": 6510 + }, + { + "epoch": 4.238776837996096, + "grad_norm": 1.6027443408966064, + "learning_rate": 2.9678039738069845e-06, + "loss": 0.5211, + "step": 6515 + }, + { + "epoch": 4.24202992843201, + "grad_norm": 2.2789571285247803, + "learning_rate": 2.9434579851047973e-06, + "loss": 0.5084, + "step": 6520 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 1.481426477432251, + "learning_rate": 2.919206019816842e-06, + "loss": 0.5417, + "step": 6525 + }, + { + "epoch": 4.2485361093038385, + "grad_norm": 1.6203233003616333, + "learning_rate": 2.895048181325252e-06, + "loss": 0.5114, + "step": 6530 + }, + { + "epoch": 4.251789199739752, + "grad_norm": 1.5848479270935059, + "learning_rate": 2.8709845726109243e-06, + "loss": 0.5028, + "step": 6535 + }, + { + "epoch": 4.255042290175667, + "grad_norm": 1.80342435836792, + "learning_rate": 2.8470152962530723e-06, + "loss": 0.5122, + "step": 6540 + }, + { + "epoch": 4.258295380611581, + "grad_norm": 2.087617874145508, + "learning_rate": 2.8231404544287796e-06, + "loss": 0.506, + "step": 6545 + }, + { + "epoch": 4.261548471047496, + "grad_norm": 1.7649626731872559, + "learning_rate": 2.7993601489125693e-06, + "loss": 0.5166, + "step": 6550 + }, + { + "epoch": 4.264801561483409, + "grad_norm": 3.1642332077026367, + "learning_rate": 2.7756744810759823e-06, + "loss": 0.5107, + "step": 6555 + }, + { + "epoch": 4.268054651919323, + "grad_norm": 1.9564752578735352, + "learning_rate": 2.7520835518871302e-06, + "loss": 0.5112, + "step": 6560 + }, + { + "epoch": 4.271307742355237, + "grad_norm": 1.6043564081192017, + "learning_rate": 2.7285874619102675e-06, + "loss": 0.5084, + "step": 6565 + }, + { + "epoch": 4.274560832791152, + "grad_norm": 1.9543806314468384, + "learning_rate": 2.705186311305355e-06, + "loss": 0.5135, + "step": 6570 + }, + { + "epoch": 4.2778139232270656, + "grad_norm": 1.6966253519058228, + "learning_rate": 2.6818801998276634e-06, + "loss": 0.525, + "step": 6575 + }, + { + "epoch": 4.28106701366298, + "grad_norm": 2.0935935974121094, + "learning_rate": 2.658669226827315e-06, + "loss": 0.5216, + "step": 6580 + }, + { + "epoch": 4.284320104098894, + "grad_norm": 1.7863517999649048, + "learning_rate": 2.6355534912488627e-06, + "loss": 0.5271, + "step": 6585 + }, + { + "epoch": 4.287573194534808, + "grad_norm": 1.611092448234558, + "learning_rate": 2.612533091630903e-06, + "loss": 0.5142, + "step": 6590 + }, + { + "epoch": 4.290826284970722, + "grad_norm": 1.709322452545166, + "learning_rate": 2.5896081261056138e-06, + "loss": 0.5292, + "step": 6595 + }, + { + "epoch": 4.294079375406636, + "grad_norm": 1.7398649454116821, + "learning_rate": 2.5667786923983443e-06, + "loss": 0.5253, + "step": 6600 + }, + { + "epoch": 4.29733246584255, + "grad_norm": 1.5445489883422852, + "learning_rate": 2.544044887827235e-06, + "loss": 0.5443, + "step": 6605 + }, + { + "epoch": 4.300585556278465, + "grad_norm": 1.763914704322815, + "learning_rate": 2.5214068093027484e-06, + "loss": 0.5301, + "step": 6610 + }, + { + "epoch": 4.303838646714379, + "grad_norm": 2.1207916736602783, + "learning_rate": 2.498864553327296e-06, + "loss": 0.5351, + "step": 6615 + }, + { + "epoch": 4.307091737150293, + "grad_norm": 1.8002142906188965, + "learning_rate": 2.4764182159948133e-06, + "loss": 0.5043, + "step": 6620 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 1.4603972434997559, + "learning_rate": 2.454067892990347e-06, + "loss": 0.5032, + "step": 6625 + }, + { + "epoch": 4.313597918022121, + "grad_norm": 1.6874291896820068, + "learning_rate": 2.431813679589645e-06, + "loss": 0.5232, + "step": 6630 + }, + { + "epoch": 4.316851008458035, + "grad_norm": 1.7689220905303955, + "learning_rate": 2.4096556706587726e-06, + "loss": 0.5218, + "step": 6635 + }, + { + "epoch": 4.32010409889395, + "grad_norm": 1.5644956827163696, + "learning_rate": 2.387593960653675e-06, + "loss": 0.5164, + "step": 6640 + }, + { + "epoch": 4.3233571893298635, + "grad_norm": 2.199660301208496, + "learning_rate": 2.3656286436197965e-06, + "loss": 0.538, + "step": 6645 + }, + { + "epoch": 4.326610279765777, + "grad_norm": 2.4460320472717285, + "learning_rate": 2.343759813191676e-06, + "loss": 0.5197, + "step": 6650 + }, + { + "epoch": 4.329863370201692, + "grad_norm": 1.8965719938278198, + "learning_rate": 2.3219875625925452e-06, + "loss": 0.5399, + "step": 6655 + }, + { + "epoch": 4.333116460637606, + "grad_norm": 1.7241499423980713, + "learning_rate": 2.3003119846339293e-06, + "loss": 0.514, + "step": 6660 + }, + { + "epoch": 4.33636955107352, + "grad_norm": 1.776291847229004, + "learning_rate": 2.27873317171525e-06, + "loss": 0.5217, + "step": 6665 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.6230307817459106, + "learning_rate": 2.25725121582345e-06, + "loss": 0.5208, + "step": 6670 + }, + { + "epoch": 4.342875731945348, + "grad_norm": 1.5767405033111572, + "learning_rate": 2.2358662085325723e-06, + "loss": 0.5064, + "step": 6675 + }, + { + "epoch": 4.346128822381262, + "grad_norm": 1.785072922706604, + "learning_rate": 2.2145782410033844e-06, + "loss": 0.5195, + "step": 6680 + }, + { + "epoch": 4.349381912817177, + "grad_norm": 2.802659034729004, + "learning_rate": 2.1933874039830078e-06, + "loss": 0.5178, + "step": 6685 + }, + { + "epoch": 4.3526350032530905, + "grad_norm": 1.8929702043533325, + "learning_rate": 2.172293787804483e-06, + "loss": 0.5281, + "step": 6690 + }, + { + "epoch": 4.355888093689004, + "grad_norm": 2.050996780395508, + "learning_rate": 2.1512974823864414e-06, + "loss": 0.5432, + "step": 6695 + }, + { + "epoch": 4.359141184124919, + "grad_norm": 1.6718263626098633, + "learning_rate": 2.130398577232673e-06, + "loss": 0.5267, + "step": 6700 + }, + { + "epoch": 4.362394274560833, + "grad_norm": 1.8539758920669556, + "learning_rate": 2.109597161431784e-06, + "loss": 0.5334, + "step": 6705 + }, + { + "epoch": 4.365647364996747, + "grad_norm": 1.541066288948059, + "learning_rate": 2.088893323656793e-06, + "loss": 0.5235, + "step": 6710 + }, + { + "epoch": 4.368900455432661, + "grad_norm": 1.5558756589889526, + "learning_rate": 2.068287152164747e-06, + "loss": 0.5157, + "step": 6715 + }, + { + "epoch": 4.372153545868575, + "grad_norm": 1.825431227684021, + "learning_rate": 2.0477787347963823e-06, + "loss": 0.521, + "step": 6720 + }, + { + "epoch": 4.375406636304489, + "grad_norm": 1.558396816253662, + "learning_rate": 2.0273681589757063e-06, + "loss": 0.5082, + "step": 6725 + }, + { + "epoch": 4.378659726740404, + "grad_norm": 1.8559561967849731, + "learning_rate": 2.007055511709646e-06, + "loss": 0.526, + "step": 6730 + }, + { + "epoch": 4.3819128171763175, + "grad_norm": 1.8222005367279053, + "learning_rate": 1.986840879587687e-06, + "loss": 0.522, + "step": 6735 + }, + { + "epoch": 4.385165907612231, + "grad_norm": 4.778210639953613, + "learning_rate": 1.966724348781479e-06, + "loss": 0.5089, + "step": 6740 + }, + { + "epoch": 4.388418998048146, + "grad_norm": 1.7374241352081299, + "learning_rate": 1.9467060050444824e-06, + "loss": 0.5166, + "step": 6745 + }, + { + "epoch": 4.39167208848406, + "grad_norm": 1.846447467803955, + "learning_rate": 1.9267859337116195e-06, + "loss": 0.5255, + "step": 6750 + }, + { + "epoch": 4.394925178919974, + "grad_norm": 1.6373209953308105, + "learning_rate": 1.9069642196988757e-06, + "loss": 0.5103, + "step": 6755 + }, + { + "epoch": 4.398178269355888, + "grad_norm": 2.6573219299316406, + "learning_rate": 1.8872409475029524e-06, + "loss": 0.5192, + "step": 6760 + }, + { + "epoch": 4.401431359791802, + "grad_norm": 3.289806365966797, + "learning_rate": 1.8676162012009307e-06, + "loss": 0.5195, + "step": 6765 + }, + { + "epoch": 4.404684450227716, + "grad_norm": 2.3919076919555664, + "learning_rate": 1.8480900644498756e-06, + "loss": 0.5139, + "step": 6770 + }, + { + "epoch": 4.407937540663631, + "grad_norm": 2.7541277408599854, + "learning_rate": 1.8286626204864903e-06, + "loss": 0.5285, + "step": 6775 + }, + { + "epoch": 4.411190631099545, + "grad_norm": 2.060319423675537, + "learning_rate": 1.8093339521267876e-06, + "loss": 0.5211, + "step": 6780 + }, + { + "epoch": 4.414443721535458, + "grad_norm": 1.9002997875213623, + "learning_rate": 1.7901041417657027e-06, + "loss": 0.5189, + "step": 6785 + }, + { + "epoch": 4.417696811971373, + "grad_norm": 2.1053810119628906, + "learning_rate": 1.7709732713767497e-06, + "loss": 0.5107, + "step": 6790 + }, + { + "epoch": 4.420949902407287, + "grad_norm": 1.6905279159545898, + "learning_rate": 1.7519414225116937e-06, + "loss": 0.5147, + "step": 6795 + }, + { + "epoch": 4.424202992843201, + "grad_norm": 2.2751264572143555, + "learning_rate": 1.733008676300177e-06, + "loss": 0.5065, + "step": 6800 + }, + { + "epoch": 4.427456083279115, + "grad_norm": 1.9138133525848389, + "learning_rate": 1.7141751134493815e-06, + "loss": 0.5144, + "step": 6805 + }, + { + "epoch": 4.430709173715029, + "grad_norm": 1.75284743309021, + "learning_rate": 1.6954408142436955e-06, + "loss": 0.5164, + "step": 6810 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 1.6290788650512695, + "learning_rate": 1.6768058585443585e-06, + "loss": 0.5197, + "step": 6815 + }, + { + "epoch": 4.437215354586858, + "grad_norm": 2.135432243347168, + "learning_rate": 1.6582703257891214e-06, + "loss": 0.5252, + "step": 6820 + }, + { + "epoch": 4.440468445022772, + "grad_norm": 1.6389341354370117, + "learning_rate": 1.63983429499191e-06, + "loss": 0.5217, + "step": 6825 + }, + { + "epoch": 4.443721535458685, + "grad_norm": 1.6227918863296509, + "learning_rate": 1.6214978447425062e-06, + "loss": 0.5178, + "step": 6830 + }, + { + "epoch": 4.4469746258946, + "grad_norm": 1.907899022102356, + "learning_rate": 1.603261053206176e-06, + "loss": 0.5235, + "step": 6835 + }, + { + "epoch": 4.450227716330514, + "grad_norm": 2.548617362976074, + "learning_rate": 1.5851239981233639e-06, + "loss": 0.5238, + "step": 6840 + }, + { + "epoch": 4.453480806766428, + "grad_norm": 1.8666588068008423, + "learning_rate": 1.5670867568093633e-06, + "loss": 0.5378, + "step": 6845 + }, + { + "epoch": 4.4567338972023425, + "grad_norm": 1.6732510328292847, + "learning_rate": 1.5491494061539658e-06, + "loss": 0.5101, + "step": 6850 + }, + { + "epoch": 4.459986987638256, + "grad_norm": 1.560084342956543, + "learning_rate": 1.5313120226211452e-06, + "loss": 0.5318, + "step": 6855 + }, + { + "epoch": 4.46324007807417, + "grad_norm": 4.673284530639648, + "learning_rate": 1.5135746822487419e-06, + "loss": 0.5279, + "step": 6860 + }, + { + "epoch": 4.466493168510085, + "grad_norm": 1.600279450416565, + "learning_rate": 1.4959374606481251e-06, + "loss": 0.4943, + "step": 6865 + }, + { + "epoch": 4.469746258945999, + "grad_norm": 2.073321580886841, + "learning_rate": 1.4784004330038653e-06, + "loss": 0.5204, + "step": 6870 + }, + { + "epoch": 4.4729993493819125, + "grad_norm": 3.2433438301086426, + "learning_rate": 1.4609636740734316e-06, + "loss": 0.5174, + "step": 6875 + }, + { + "epoch": 4.476252439817827, + "grad_norm": 2.53226637840271, + "learning_rate": 1.4436272581868665e-06, + "loss": 0.54, + "step": 6880 + }, + { + "epoch": 4.479505530253741, + "grad_norm": 1.7645595073699951, + "learning_rate": 1.4263912592464597e-06, + "loss": 0.5271, + "step": 6885 + }, + { + "epoch": 4.482758620689655, + "grad_norm": 1.7925113439559937, + "learning_rate": 1.4092557507264375e-06, + "loss": 0.5169, + "step": 6890 + }, + { + "epoch": 4.4860117111255695, + "grad_norm": 2.9148597717285156, + "learning_rate": 1.3922208056726644e-06, + "loss": 0.525, + "step": 6895 + }, + { + "epoch": 4.489264801561483, + "grad_norm": 3.2308194637298584, + "learning_rate": 1.3752864967023105e-06, + "loss": 0.5341, + "step": 6900 + }, + { + "epoch": 4.492517891997397, + "grad_norm": 1.633375644683838, + "learning_rate": 1.358452896003548e-06, + "loss": 0.5249, + "step": 6905 + }, + { + "epoch": 4.495770982433312, + "grad_norm": 1.7651923894882202, + "learning_rate": 1.3417200753352538e-06, + "loss": 0.5211, + "step": 6910 + }, + { + "epoch": 4.499024072869226, + "grad_norm": 1.584030032157898, + "learning_rate": 1.3250881060266952e-06, + "loss": 0.5164, + "step": 6915 + }, + { + "epoch": 4.5022771633051395, + "grad_norm": 2.4326541423797607, + "learning_rate": 1.3085570589772168e-06, + "loss": 0.5306, + "step": 6920 + }, + { + "epoch": 4.505530253741054, + "grad_norm": 1.5874032974243164, + "learning_rate": 1.2921270046559658e-06, + "loss": 0.5374, + "step": 6925 + }, + { + "epoch": 4.508783344176968, + "grad_norm": 2.053276300430298, + "learning_rate": 1.2757980131015563e-06, + "loss": 0.5294, + "step": 6930 + }, + { + "epoch": 4.512036434612883, + "grad_norm": 1.5977790355682373, + "learning_rate": 1.2595701539217963e-06, + "loss": 0.515, + "step": 6935 + }, + { + "epoch": 4.5152895250487965, + "grad_norm": 1.5569490194320679, + "learning_rate": 1.2434434962933866e-06, + "loss": 0.5178, + "step": 6940 + }, + { + "epoch": 4.51854261548471, + "grad_norm": 1.8135985136032104, + "learning_rate": 1.2274181089616172e-06, + "loss": 0.5268, + "step": 6945 + }, + { + "epoch": 4.521795705920624, + "grad_norm": 1.5852515697479248, + "learning_rate": 1.2114940602400788e-06, + "loss": 0.5192, + "step": 6950 + }, + { + "epoch": 4.525048796356539, + "grad_norm": 2.1236679553985596, + "learning_rate": 1.19567141801038e-06, + "loss": 0.527, + "step": 6955 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 2.562978744506836, + "learning_rate": 1.1799502497218368e-06, + "loss": 0.5379, + "step": 6960 + }, + { + "epoch": 4.531554977228367, + "grad_norm": 1.632822871208191, + "learning_rate": 1.164330622391213e-06, + "loss": 0.5162, + "step": 6965 + }, + { + "epoch": 4.534808067664281, + "grad_norm": 2.966524124145508, + "learning_rate": 1.1488126026024087e-06, + "loss": 0.5399, + "step": 6970 + }, + { + "epoch": 4.538061158100195, + "grad_norm": 1.8411732912063599, + "learning_rate": 1.1333962565061973e-06, + "loss": 0.5232, + "step": 6975 + }, + { + "epoch": 4.541314248536109, + "grad_norm": 1.5464459657669067, + "learning_rate": 1.118081649819927e-06, + "loss": 0.5168, + "step": 6980 + }, + { + "epoch": 4.544567338972024, + "grad_norm": 1.7210750579833984, + "learning_rate": 1.1028688478272459e-06, + "loss": 0.5327, + "step": 6985 + }, + { + "epoch": 4.547820429407937, + "grad_norm": 2.1294288635253906, + "learning_rate": 1.0877579153778323e-06, + "loss": 0.4963, + "step": 6990 + }, + { + "epoch": 4.551073519843852, + "grad_norm": 1.5896108150482178, + "learning_rate": 1.0727489168871092e-06, + "loss": 0.537, + "step": 6995 + }, + { + "epoch": 4.554326610279766, + "grad_norm": 1.6593022346496582, + "learning_rate": 1.0578419163359666e-06, + "loss": 0.5164, + "step": 7000 + }, + { + "epoch": 4.55757970071568, + "grad_norm": 1.6132862567901611, + "learning_rate": 1.0430369772705034e-06, + "loss": 0.5246, + "step": 7005 + }, + { + "epoch": 4.560832791151594, + "grad_norm": 1.6968963146209717, + "learning_rate": 1.028334162801739e-06, + "loss": 0.5169, + "step": 7010 + }, + { + "epoch": 4.564085881587508, + "grad_norm": 3.422121524810791, + "learning_rate": 1.0137335356053545e-06, + "loss": 0.5306, + "step": 7015 + }, + { + "epoch": 4.567338972023422, + "grad_norm": 2.2838146686553955, + "learning_rate": 9.99235157921427e-07, + "loss": 0.536, + "step": 7020 + }, + { + "epoch": 4.570592062459337, + "grad_norm": 1.923091173171997, + "learning_rate": 9.8483909155416e-07, + "loss": 0.5165, + "step": 7025 + }, + { + "epoch": 4.573845152895251, + "grad_norm": 1.5500158071517944, + "learning_rate": 9.705453978716112e-07, + "loss": 0.5086, + "step": 7030 + }, + { + "epoch": 4.577098243331164, + "grad_norm": 1.948114037513733, + "learning_rate": 9.56354137805457e-07, + "loss": 0.5262, + "step": 7035 + }, + { + "epoch": 4.580351333767078, + "grad_norm": 2.5097603797912598, + "learning_rate": 9.422653718507007e-07, + "loss": 0.5353, + "step": 7040 + }, + { + "epoch": 4.583604424202993, + "grad_norm": 1.757633090019226, + "learning_rate": 9.282791600654428e-07, + "loss": 0.5167, + "step": 7045 + }, + { + "epoch": 4.586857514638907, + "grad_norm": 2.2960455417633057, + "learning_rate": 9.14395562070594e-07, + "loss": 0.5264, + "step": 7050 + }, + { + "epoch": 4.5901106050748215, + "grad_norm": 1.556706428527832, + "learning_rate": 9.006146370496654e-07, + "loss": 0.5177, + "step": 7055 + }, + { + "epoch": 4.593363695510735, + "grad_norm": 1.7054029703140259, + "learning_rate": 8.869364437484678e-07, + "loss": 0.4893, + "step": 7060 + }, + { + "epoch": 4.596616785946649, + "grad_norm": 1.746472716331482, + "learning_rate": 8.733610404748904e-07, + "loss": 0.5093, + "step": 7065 + }, + { + "epoch": 4.599869876382563, + "grad_norm": 2.1942458152770996, + "learning_rate": 8.598884850986533e-07, + "loss": 0.5299, + "step": 7070 + }, + { + "epoch": 4.603122966818478, + "grad_norm": 2.43866229057312, + "learning_rate": 8.465188350510411e-07, + "loss": 0.5282, + "step": 7075 + }, + { + "epoch": 4.6063760572543915, + "grad_norm": 1.625575304031372, + "learning_rate": 8.332521473246758e-07, + "loss": 0.5189, + "step": 7080 + }, + { + "epoch": 4.609629147690306, + "grad_norm": 2.3699636459350586, + "learning_rate": 8.200884784732688e-07, + "loss": 0.5249, + "step": 7085 + }, + { + "epoch": 4.61288223812622, + "grad_norm": 1.750931739807129, + "learning_rate": 8.070278846113749e-07, + "loss": 0.5165, + "step": 7090 + }, + { + "epoch": 4.616135328562134, + "grad_norm": 1.8055213689804077, + "learning_rate": 7.940704214141614e-07, + "loss": 0.5315, + "step": 7095 + }, + { + "epoch": 4.6193884189980485, + "grad_norm": 2.2767059803009033, + "learning_rate": 7.812161441171611e-07, + "loss": 0.5232, + "step": 7100 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 1.4966483116149902, + "learning_rate": 7.684651075160531e-07, + "loss": 0.5045, + "step": 7105 + }, + { + "epoch": 4.625894599869876, + "grad_norm": 2.188704490661621, + "learning_rate": 7.558173659664075e-07, + "loss": 0.5201, + "step": 7110 + }, + { + "epoch": 4.629147690305791, + "grad_norm": 2.934805154800415, + "learning_rate": 7.432729733834631e-07, + "loss": 0.5247, + "step": 7115 + }, + { + "epoch": 4.632400780741705, + "grad_norm": 1.9948830604553223, + "learning_rate": 7.308319832419141e-07, + "loss": 0.5247, + "step": 7120 + }, + { + "epoch": 4.6356538711776185, + "grad_norm": 1.8401069641113281, + "learning_rate": 7.18494448575649e-07, + "loss": 0.5364, + "step": 7125 + }, + { + "epoch": 4.638906961613533, + "grad_norm": 1.45015549659729, + "learning_rate": 7.062604219775531e-07, + "loss": 0.5106, + "step": 7130 + }, + { + "epoch": 4.642160052049447, + "grad_norm": 1.7785407304763794, + "learning_rate": 6.941299555992737e-07, + "loss": 0.5117, + "step": 7135 + }, + { + "epoch": 4.645413142485361, + "grad_norm": 2.026643753051758, + "learning_rate": 6.821031011509937e-07, + "loss": 0.5039, + "step": 7140 + }, + { + "epoch": 4.648666232921276, + "grad_norm": 1.6481338739395142, + "learning_rate": 6.701799099012141e-07, + "loss": 0.5385, + "step": 7145 + }, + { + "epoch": 4.651919323357189, + "grad_norm": 2.9961116313934326, + "learning_rate": 6.583604326765496e-07, + "loss": 0.5148, + "step": 7150 + }, + { + "epoch": 4.655172413793103, + "grad_norm": 1.7340404987335205, + "learning_rate": 6.466447198614806e-07, + "loss": 0.4913, + "step": 7155 + }, + { + "epoch": 4.658425504229018, + "grad_norm": 1.569608211517334, + "learning_rate": 6.350328213981654e-07, + "loss": 0.5052, + "step": 7160 + }, + { + "epoch": 4.661678594664932, + "grad_norm": 1.9746705293655396, + "learning_rate": 6.235247867862226e-07, + "loss": 0.4885, + "step": 7165 + }, + { + "epoch": 4.6649316851008455, + "grad_norm": 1.7358078956604004, + "learning_rate": 6.121206650825162e-07, + "loss": 0.5256, + "step": 7170 + }, + { + "epoch": 4.66818477553676, + "grad_norm": 1.609820008277893, + "learning_rate": 6.008205049009341e-07, + "loss": 0.5275, + "step": 7175 + }, + { + "epoch": 4.671437865972674, + "grad_norm": 1.8338040113449097, + "learning_rate": 5.896243544122076e-07, + "loss": 0.5019, + "step": 7180 + }, + { + "epoch": 4.674690956408588, + "grad_norm": 1.7695443630218506, + "learning_rate": 5.785322613436894e-07, + "loss": 0.5287, + "step": 7185 + }, + { + "epoch": 4.677944046844503, + "grad_norm": 1.9566013813018799, + "learning_rate": 5.675442729791425e-07, + "loss": 0.5262, + "step": 7190 + }, + { + "epoch": 4.681197137280416, + "grad_norm": 1.9720107316970825, + "learning_rate": 5.566604361585626e-07, + "loss": 0.5327, + "step": 7195 + }, + { + "epoch": 4.68445022771633, + "grad_norm": 2.7521474361419678, + "learning_rate": 5.458807972779534e-07, + "loss": 0.5002, + "step": 7200 + }, + { + "epoch": 4.687703318152245, + "grad_norm": 6.726840019226074, + "learning_rate": 5.352054022891406e-07, + "loss": 0.52, + "step": 7205 + }, + { + "epoch": 4.690956408588159, + "grad_norm": 1.8968901634216309, + "learning_rate": 5.246342966995888e-07, + "loss": 0.5259, + "step": 7210 + }, + { + "epoch": 4.694209499024073, + "grad_norm": 1.647226333618164, + "learning_rate": 5.141675255721762e-07, + "loss": 0.532, + "step": 7215 + }, + { + "epoch": 4.697462589459987, + "grad_norm": 2.063908576965332, + "learning_rate": 5.038051335250316e-07, + "loss": 0.5132, + "step": 7220 + }, + { + "epoch": 4.700715679895901, + "grad_norm": 1.5827159881591797, + "learning_rate": 4.935471647313284e-07, + "loss": 0.515, + "step": 7225 + }, + { + "epoch": 4.703968770331815, + "grad_norm": 1.9684885740280151, + "learning_rate": 4.833936629191016e-07, + "loss": 0.5054, + "step": 7230 + }, + { + "epoch": 4.70722186076773, + "grad_norm": 2.0594069957733154, + "learning_rate": 4.7334467137105933e-07, + "loss": 0.5235, + "step": 7235 + }, + { + "epoch": 4.7104749512036435, + "grad_norm": 1.9911025762557983, + "learning_rate": 4.634002329244047e-07, + "loss": 0.5146, + "step": 7240 + }, + { + "epoch": 4.713728041639557, + "grad_norm": 1.7078765630722046, + "learning_rate": 4.535603899706448e-07, + "loss": 0.5174, + "step": 7245 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.6561988592147827, + "learning_rate": 4.438251844554098e-07, + "loss": 0.5201, + "step": 7250 + }, + { + "epoch": 4.720234222511386, + "grad_norm": 1.7727643251419067, + "learning_rate": 4.341946578782868e-07, + "loss": 0.5185, + "step": 7255 + }, + { + "epoch": 4.7234873129473, + "grad_norm": 1.8667991161346436, + "learning_rate": 4.2466885129262004e-07, + "loss": 0.5033, + "step": 7260 + }, + { + "epoch": 4.726740403383214, + "grad_norm": 1.5502984523773193, + "learning_rate": 4.152478053053632e-07, + "loss": 0.5328, + "step": 7265 + }, + { + "epoch": 4.729993493819128, + "grad_norm": 1.9481128454208374, + "learning_rate": 4.059315600768887e-07, + "loss": 0.5151, + "step": 7270 + }, + { + "epoch": 4.733246584255042, + "grad_norm": 2.2370522022247314, + "learning_rate": 3.967201553208122e-07, + "loss": 0.5126, + "step": 7275 + }, + { + "epoch": 4.736499674690957, + "grad_norm": 1.9233421087265015, + "learning_rate": 3.876136303038458e-07, + "loss": 0.5224, + "step": 7280 + }, + { + "epoch": 4.7397527651268705, + "grad_norm": 1.6725999116897583, + "learning_rate": 3.7861202384560644e-07, + "loss": 0.5343, + "step": 7285 + }, + { + "epoch": 4.743005855562784, + "grad_norm": 1.5591987371444702, + "learning_rate": 3.6971537431846057e-07, + "loss": 0.5073, + "step": 7290 + }, + { + "epoch": 4.746258945998699, + "grad_norm": 1.721091866493225, + "learning_rate": 3.609237196473658e-07, + "loss": 0.5274, + "step": 7295 + }, + { + "epoch": 4.749512036434613, + "grad_norm": 1.7789474725723267, + "learning_rate": 3.5223709730970446e-07, + "loss": 0.5072, + "step": 7300 + }, + { + "epoch": 4.752765126870527, + "grad_norm": 1.7836154699325562, + "learning_rate": 3.4365554433511416e-07, + "loss": 0.5126, + "step": 7305 + }, + { + "epoch": 4.756018217306441, + "grad_norm": 1.6633206605911255, + "learning_rate": 3.3517909730534926e-07, + "loss": 0.5137, + "step": 7310 + }, + { + "epoch": 4.759271307742355, + "grad_norm": 3.098612070083618, + "learning_rate": 3.268077923541085e-07, + "loss": 0.5061, + "step": 7315 + }, + { + "epoch": 4.762524398178269, + "grad_norm": 1.939909815788269, + "learning_rate": 3.185416651668882e-07, + "loss": 0.5349, + "step": 7320 + }, + { + "epoch": 4.765777488614184, + "grad_norm": 1.7502937316894531, + "learning_rate": 3.1038075098083485e-07, + "loss": 0.5032, + "step": 7325 + }, + { + "epoch": 4.7690305790500975, + "grad_norm": 1.7100647687911987, + "learning_rate": 3.023250845845815e-07, + "loss": 0.5133, + "step": 7330 + }, + { + "epoch": 4.772283669486011, + "grad_norm": 1.752884030342102, + "learning_rate": 2.943747003181091e-07, + "loss": 0.5358, + "step": 7335 + }, + { + "epoch": 4.775536759921926, + "grad_norm": 1.700315237045288, + "learning_rate": 2.8652963207260184e-07, + "loss": 0.5048, + "step": 7340 + }, + { + "epoch": 4.77878985035784, + "grad_norm": 2.1294970512390137, + "learning_rate": 2.787899132902949e-07, + "loss": 0.4829, + "step": 7345 + }, + { + "epoch": 4.782042940793754, + "grad_norm": 1.8150845766067505, + "learning_rate": 2.711555769643381e-07, + "loss": 0.512, + "step": 7350 + }, + { + "epoch": 4.785296031229668, + "grad_norm": 2.119196653366089, + "learning_rate": 2.636266556386546e-07, + "loss": 0.5267, + "step": 7355 + }, + { + "epoch": 4.788549121665582, + "grad_norm": 2.050795793533325, + "learning_rate": 2.562031814077964e-07, + "loss": 0.5089, + "step": 7360 + }, + { + "epoch": 4.791802212101496, + "grad_norm": 1.6425533294677734, + "learning_rate": 2.488851859168112e-07, + "loss": 0.5168, + "step": 7365 + }, + { + "epoch": 4.795055302537411, + "grad_norm": 1.8162248134613037, + "learning_rate": 2.4167270036111743e-07, + "loss": 0.5028, + "step": 7370 + }, + { + "epoch": 4.798308392973325, + "grad_norm": 1.7280550003051758, + "learning_rate": 2.345657554863545e-07, + "loss": 0.5127, + "step": 7375 + }, + { + "epoch": 4.801561483409239, + "grad_norm": 1.8187389373779297, + "learning_rate": 2.2756438158826053e-07, + "loss": 0.5349, + "step": 7380 + }, + { + "epoch": 4.804814573845153, + "grad_norm": 1.7695400714874268, + "learning_rate": 2.2066860851253922e-07, + "loss": 0.5211, + "step": 7385 + }, + { + "epoch": 4.808067664281067, + "grad_norm": 3.8797385692596436, + "learning_rate": 2.1387846565474045e-07, + "loss": 0.5189, + "step": 7390 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 1.7038609981536865, + "learning_rate": 2.0719398196012707e-07, + "loss": 0.5342, + "step": 7395 + }, + { + "epoch": 4.814573845152895, + "grad_norm": 1.7032898664474487, + "learning_rate": 2.0061518592355277e-07, + "loss": 0.5139, + "step": 7400 + }, + { + "epoch": 4.817826935588809, + "grad_norm": 1.818298101425171, + "learning_rate": 1.9414210558934554e-07, + "loss": 0.5198, + "step": 7405 + }, + { + "epoch": 4.821080026024724, + "grad_norm": 1.8034625053405762, + "learning_rate": 1.8777476855118547e-07, + "loss": 0.5314, + "step": 7410 + }, + { + "epoch": 4.824333116460638, + "grad_norm": 3.0345141887664795, + "learning_rate": 1.8151320195197997e-07, + "loss": 0.5387, + "step": 7415 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.8238238096237183, + "learning_rate": 1.753574324837609e-07, + "loss": 0.5219, + "step": 7420 + }, + { + "epoch": 4.830839297332465, + "grad_norm": 1.8523563146591187, + "learning_rate": 1.6930748638756266e-07, + "loss": 0.5075, + "step": 7425 + }, + { + "epoch": 4.83409238776838, + "grad_norm": 1.592421531677246, + "learning_rate": 1.6336338945331098e-07, + "loss": 0.512, + "step": 7430 + }, + { + "epoch": 4.837345478204294, + "grad_norm": 1.459957480430603, + "learning_rate": 1.57525167019712e-07, + "loss": 0.5154, + "step": 7435 + }, + { + "epoch": 4.840598568640209, + "grad_norm": 1.7186057567596436, + "learning_rate": 1.517928439741495e-07, + "loss": 0.5316, + "step": 7440 + }, + { + "epoch": 4.8438516590761225, + "grad_norm": 1.5618077516555786, + "learning_rate": 1.461664447525768e-07, + "loss": 0.4997, + "step": 7445 + }, + { + "epoch": 4.847104749512036, + "grad_norm": 1.9501081705093384, + "learning_rate": 1.4064599333940555e-07, + "loss": 0.5115, + "step": 7450 + }, + { + "epoch": 4.85035783994795, + "grad_norm": 1.593405842781067, + "learning_rate": 1.3523151326741702e-07, + "loss": 0.5062, + "step": 7455 + }, + { + "epoch": 4.853610930383865, + "grad_norm": 1.6193232536315918, + "learning_rate": 1.299230276176483e-07, + "loss": 0.5096, + "step": 7460 + }, + { + "epoch": 4.856864020819779, + "grad_norm": 1.7456111907958984, + "learning_rate": 1.247205590192979e-07, + "loss": 0.5154, + "step": 7465 + }, + { + "epoch": 4.860117111255693, + "grad_norm": 1.7586069107055664, + "learning_rate": 1.1962412964964254e-07, + "loss": 0.5285, + "step": 7470 + }, + { + "epoch": 4.863370201691607, + "grad_norm": 2.715386152267456, + "learning_rate": 1.1463376123391766e-07, + "loss": 0.4909, + "step": 7475 + }, + { + "epoch": 4.866623292127521, + "grad_norm": 2.343010902404785, + "learning_rate": 1.0974947504524269e-07, + "loss": 0.5142, + "step": 7480 + }, + { + "epoch": 4.869876382563435, + "grad_norm": 1.7289972305297852, + "learning_rate": 1.0497129190452926e-07, + "loss": 0.5191, + "step": 7485 + }, + { + "epoch": 4.8731294729993495, + "grad_norm": 1.742447018623352, + "learning_rate": 1.0029923218038972e-07, + "loss": 0.5248, + "step": 7490 + }, + { + "epoch": 4.876382563435263, + "grad_norm": 1.901174545288086, + "learning_rate": 9.573331578904e-08, + "loss": 0.5213, + "step": 7495 + }, + { + "epoch": 4.879635653871178, + "grad_norm": 2.5633485317230225, + "learning_rate": 9.127356219423843e-08, + "loss": 0.5136, + "step": 7500 + }, + { + "epoch": 4.882888744307092, + "grad_norm": 1.8684697151184082, + "learning_rate": 8.691999040717491e-08, + "loss": 0.5188, + "step": 7505 + }, + { + "epoch": 4.886141834743006, + "grad_norm": 2.1927521228790283, + "learning_rate": 8.267261898641798e-08, + "loss": 0.5119, + "step": 7510 + }, + { + "epoch": 4.8893949251789195, + "grad_norm": 1.949514627456665, + "learning_rate": 7.853146603780947e-08, + "loss": 0.5147, + "step": 7515 + }, + { + "epoch": 4.892648015614834, + "grad_norm": 2.0919501781463623, + "learning_rate": 7.449654921440618e-08, + "loss": 0.5064, + "step": 7520 + }, + { + "epoch": 4.895901106050748, + "grad_norm": 1.5901788473129272, + "learning_rate": 7.056788571639105e-08, + "loss": 0.5109, + "step": 7525 + }, + { + "epoch": 4.899154196486663, + "grad_norm": 1.8604559898376465, + "learning_rate": 6.674549229101767e-08, + "loss": 0.526, + "step": 7530 + }, + { + "epoch": 4.9024072869225765, + "grad_norm": 1.8954790830612183, + "learning_rate": 6.302938523251589e-08, + "loss": 0.5039, + "step": 7535 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.7178046703338623, + "learning_rate": 5.941958038204187e-08, + "loss": 0.5219, + "step": 7540 + }, + { + "epoch": 4.908913467794405, + "grad_norm": 5.730696201324463, + "learning_rate": 5.59160931275976e-08, + "loss": 0.5004, + "step": 7545 + }, + { + "epoch": 4.912166558230319, + "grad_norm": 1.6460310220718384, + "learning_rate": 5.2518938403978145e-08, + "loss": 0.5319, + "step": 7550 + }, + { + "epoch": 4.915419648666233, + "grad_norm": 1.6308308839797974, + "learning_rate": 4.922813069269394e-08, + "loss": 0.5214, + "step": 7555 + }, + { + "epoch": 4.918672739102147, + "grad_norm": 1.5023380517959595, + "learning_rate": 4.604368402191528e-08, + "loss": 0.5008, + "step": 7560 + }, + { + "epoch": 4.921925829538061, + "grad_norm": 1.7468260526657104, + "learning_rate": 4.2965611966416796e-08, + "loss": 0.5007, + "step": 7565 + }, + { + "epoch": 4.925178919973975, + "grad_norm": 1.9006001949310303, + "learning_rate": 3.9993927647516415e-08, + "loss": 0.51, + "step": 7570 + }, + { + "epoch": 4.92843201040989, + "grad_norm": 1.9338369369506836, + "learning_rate": 3.71286437330115e-08, + "loss": 0.5216, + "step": 7575 + }, + { + "epoch": 4.931685100845804, + "grad_norm": 1.6572381258010864, + "learning_rate": 3.4369772437137236e-08, + "loss": 0.542, + "step": 7580 + }, + { + "epoch": 4.934938191281717, + "grad_norm": 2.3215434551239014, + "learning_rate": 3.1717325520513876e-08, + "loss": 0.513, + "step": 7585 + }, + { + "epoch": 4.938191281717632, + "grad_norm": 1.6987948417663574, + "learning_rate": 2.9171314290080132e-08, + "loss": 0.5284, + "step": 7590 + }, + { + "epoch": 4.941444372153546, + "grad_norm": 1.7099159955978394, + "learning_rate": 2.6731749599065435e-08, + "loss": 0.5267, + "step": 7595 + }, + { + "epoch": 4.94469746258946, + "grad_norm": 1.620719075202942, + "learning_rate": 2.4398641846937187e-08, + "loss": 0.5248, + "step": 7600 + }, + { + "epoch": 4.9479505530253745, + "grad_norm": 1.8238213062286377, + "learning_rate": 2.2172000979345242e-08, + "loss": 0.5268, + "step": 7605 + }, + { + "epoch": 4.951203643461288, + "grad_norm": 2.011178970336914, + "learning_rate": 2.0051836488094167e-08, + "loss": 0.5184, + "step": 7610 + }, + { + "epoch": 4.954456733897202, + "grad_norm": 1.6856626272201538, + "learning_rate": 1.8038157411101597e-08, + "loss": 0.5102, + "step": 7615 + }, + { + "epoch": 4.957709824333117, + "grad_norm": 2.0251147747039795, + "learning_rate": 1.6130972332345505e-08, + "loss": 0.5112, + "step": 7620 + }, + { + "epoch": 4.960962914769031, + "grad_norm": 2.00230073928833, + "learning_rate": 1.4330289381844775e-08, + "loss": 0.5224, + "step": 7625 + }, + { + "epoch": 4.964216005204944, + "grad_norm": 1.570320963859558, + "learning_rate": 1.2636116235612005e-08, + "loss": 0.5315, + "step": 7630 + }, + { + "epoch": 4.967469095640859, + "grad_norm": 1.6889746189117432, + "learning_rate": 1.1048460115634096e-08, + "loss": 0.5193, + "step": 7635 + }, + { + "epoch": 4.970722186076773, + "grad_norm": 2.201413631439209, + "learning_rate": 9.567327789825054e-09, + "loss": 0.5286, + "step": 7640 + }, + { + "epoch": 4.973975276512687, + "grad_norm": 1.7942432165145874, + "learning_rate": 8.192725572006565e-09, + "loss": 0.5211, + "step": 7645 + }, + { + "epoch": 4.9772283669486015, + "grad_norm": 1.7889728546142578, + "learning_rate": 6.924659321888571e-09, + "loss": 0.5164, + "step": 7650 + }, + { + "epoch": 4.980481457384515, + "grad_norm": 2.543469190597534, + "learning_rate": 5.763134445022078e-09, + "loss": 0.5054, + "step": 7655 + }, + { + "epoch": 4.983734547820429, + "grad_norm": 5.216160297393799, + "learning_rate": 4.7081558927991594e-09, + "loss": 0.4954, + "step": 7660 + }, + { + "epoch": 4.986987638256344, + "grad_norm": 2.649937868118286, + "learning_rate": 3.759728162422427e-09, + "loss": 0.5127, + "step": 7665 + }, + { + "epoch": 4.990240728692258, + "grad_norm": 1.9266875982284546, + "learning_rate": 2.9178552968800454e-09, + "loss": 0.5304, + "step": 7670 + }, + { + "epoch": 4.9934938191281715, + "grad_norm": 1.6233766078948975, + "learning_rate": 2.1825408849401873e-09, + "loss": 0.5277, + "step": 7675 + }, + { + "epoch": 4.996746909564086, + "grad_norm": 1.7894667387008667, + "learning_rate": 1.5537880611260491e-09, + "loss": 0.5239, + "step": 7680 + }, + { + "epoch": 5.0, + "grad_norm": 1.5771998167037964, + "learning_rate": 1.0315995057075256e-09, + "loss": 0.5174, + "step": 7685 + }, + { + "epoch": 5.0, + "eval_f1": 0.7944165410554209, + "eval_loss": 0.54638671875, + "eval_precision": 0.7945063287994906, + "eval_recall": 0.7943501451962497, + "eval_runtime": 257.0765, + "eval_samples_per_second": 1530.42, + "eval_steps_per_second": 1.498, + "step": 7685 + }, + { + "epoch": 5.0, + "step": 7685, + "total_flos": 5.363134814553637e+18, + "train_loss": 0.7729351929743412, + "train_runtime": 35402.7725, + "train_samples_per_second": 444.524, + "train_steps_per_second": 0.217 } ], "logging_steps": 5, - "max_steps": 16, + "max_steps": 7685, "num_input_tokens_seen": 0, - "num_train_epochs": 2, + "num_train_epochs": 5, "save_steps": 5.0, "stateful_callbacks": { "TrainerControl": { @@ -78,8 +10849,8 @@ "attributes": {} } }, - "total_flos": 1489153141243904.0, - "train_batch_size": 1024, + "total_flos": 5.363134814553637e+18, + "train_batch_size": 512, "trial_name": null, "trial_params": null }