diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22244 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.998499565663745, + "eval_steps": 500, + "global_step": 15825, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003158809128958383, + "grad_norm": 6.212856065130571, + "learning_rate": 6.317119393556538e-07, + "loss": 8.5382, + "step": 1 + }, + { + "epoch": 0.0015794045644791914, + "grad_norm": 5.441923520175301, + "learning_rate": 3.158559696778269e-06, + "loss": 8.5222, + "step": 5 + }, + { + "epoch": 0.003158809128958383, + "grad_norm": 6.224499627000259, + "learning_rate": 6.317119393556538e-06, + "loss": 8.5644, + "step": 10 + }, + { + "epoch": 0.004738213693437574, + "grad_norm": 4.55491292930646, + "learning_rate": 9.475679090334807e-06, + "loss": 8.4695, + "step": 15 + }, + { + "epoch": 0.006317618257916766, + "grad_norm": 3.7500686255294613, + "learning_rate": 1.2634238787113077e-05, + "loss": 8.384, + "step": 20 + }, + { + "epoch": 0.007897022822395957, + "grad_norm": 1.0170620331774343, + "learning_rate": 1.5792798483891346e-05, + "loss": 8.2985, + "step": 25 + }, + { + "epoch": 0.009476427386875147, + "grad_norm": 1.398708533003651, + "learning_rate": 1.8951358180669615e-05, + "loss": 8.2458, + "step": 30 + }, + { + "epoch": 0.01105583195135434, + "grad_norm": 1.445046203995498, + "learning_rate": 2.2109917877447884e-05, + "loss": 8.1732, + "step": 35 + }, + { + "epoch": 0.012635236515833531, + "grad_norm": 0.584059432072112, + "learning_rate": 2.5268477574226153e-05, + "loss": 8.1083, + "step": 40 + }, + { + "epoch": 0.014214641080312722, + "grad_norm": 0.8079302274986953, + "learning_rate": 2.8427037271004422e-05, + "loss": 8.0705, + "step": 45 + }, + { + "epoch": 0.015794045644791914, + "grad_norm": 0.49011932786257634, + "learning_rate": 3.158559696778269e-05, + "loss": 8.0232, + "step": 50 + }, + { + "epoch": 0.017373450209271106, + "grad_norm": 0.5457661408112227, + "learning_rate": 3.474415666456096e-05, + "loss": 7.9518, + "step": 55 + }, + { + "epoch": 0.018952854773750295, + "grad_norm": 0.4024225943327694, + "learning_rate": 3.790271636133923e-05, + "loss": 7.8717, + "step": 60 + }, + { + "epoch": 0.020532259338229487, + "grad_norm": 0.49433086213837385, + "learning_rate": 4.10612760581175e-05, + "loss": 7.8674, + "step": 65 + }, + { + "epoch": 0.02211166390270868, + "grad_norm": 0.34359026561624734, + "learning_rate": 4.421983575489577e-05, + "loss": 7.8532, + "step": 70 + }, + { + "epoch": 0.02369106846718787, + "grad_norm": 0.6741133299066872, + "learning_rate": 4.737839545167404e-05, + "loss": 7.7934, + "step": 75 + }, + { + "epoch": 0.025270473031667063, + "grad_norm": 0.3050937286241217, + "learning_rate": 5.0536955148452307e-05, + "loss": 7.7468, + "step": 80 + }, + { + "epoch": 0.02684987759614625, + "grad_norm": 0.2614960429830844, + "learning_rate": 5.3695514845230576e-05, + "loss": 7.7666, + "step": 85 + }, + { + "epoch": 0.028429282160625444, + "grad_norm": 0.2567785117518754, + "learning_rate": 5.6854074542008845e-05, + "loss": 7.7146, + "step": 90 + }, + { + "epoch": 0.030008686725104636, + "grad_norm": 0.6926673337604969, + "learning_rate": 6.0012634238787114e-05, + "loss": 7.6734, + "step": 95 + }, + { + "epoch": 0.03158809128958383, + "grad_norm": 0.23320925067150183, + "learning_rate": 6.317119393556538e-05, + "loss": 7.6521, + "step": 100 + }, + { + "epoch": 0.03316749585406302, + "grad_norm": 0.25069824569313287, + "learning_rate": 6.632975363234366e-05, + "loss": 7.6358, + "step": 105 + }, + { + "epoch": 0.03474690041854221, + "grad_norm": 0.1942576204740953, + "learning_rate": 6.948831332912192e-05, + "loss": 7.5837, + "step": 110 + }, + { + "epoch": 0.036326304983021404, + "grad_norm": 0.38198932971350547, + "learning_rate": 7.264687302590018e-05, + "loss": 7.509, + "step": 115 + }, + { + "epoch": 0.03790570954750059, + "grad_norm": 0.5463580858340262, + "learning_rate": 7.580543272267846e-05, + "loss": 7.5754, + "step": 120 + }, + { + "epoch": 0.03948511411197978, + "grad_norm": 0.48873012656695597, + "learning_rate": 7.896399241945674e-05, + "loss": 7.5584, + "step": 125 + }, + { + "epoch": 0.04106451867645897, + "grad_norm": 0.43371845010250426, + "learning_rate": 8.2122552116235e-05, + "loss": 7.4969, + "step": 130 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 0.14855751116650812, + "learning_rate": 8.528111181301326e-05, + "loss": 7.4691, + "step": 135 + }, + { + "epoch": 0.04422332780541736, + "grad_norm": 0.4431555849752563, + "learning_rate": 8.843967150979154e-05, + "loss": 7.4897, + "step": 140 + }, + { + "epoch": 0.04580273236989655, + "grad_norm": 0.7149119994284892, + "learning_rate": 9.159823120656981e-05, + "loss": 7.4245, + "step": 145 + }, + { + "epoch": 0.04738213693437574, + "grad_norm": 0.5337474765697023, + "learning_rate": 9.475679090334807e-05, + "loss": 7.4496, + "step": 150 + }, + { + "epoch": 0.048961541498854934, + "grad_norm": 0.41578257630377913, + "learning_rate": 9.791535060012634e-05, + "loss": 7.4453, + "step": 155 + }, + { + "epoch": 0.050540946063334126, + "grad_norm": 0.33166265263489864, + "learning_rate": 0.00010107391029690461, + "loss": 7.3195, + "step": 160 + }, + { + "epoch": 0.05212035062781331, + "grad_norm": 0.37257521426585277, + "learning_rate": 0.00010423246999368289, + "loss": 7.3363, + "step": 165 + }, + { + "epoch": 0.0536997551922925, + "grad_norm": 0.8419585693918789, + "learning_rate": 0.00010739102969046115, + "loss": 7.3107, + "step": 170 + }, + { + "epoch": 0.055279159756771695, + "grad_norm": 0.9683416314959721, + "learning_rate": 0.00011054958938723943, + "loss": 7.2638, + "step": 175 + }, + { + "epoch": 0.05685856432125089, + "grad_norm": 0.8880701070544468, + "learning_rate": 0.00011370814908401769, + "loss": 7.2934, + "step": 180 + }, + { + "epoch": 0.05843796888573008, + "grad_norm": 0.5841725028771209, + "learning_rate": 0.00011686670878079595, + "loss": 7.34, + "step": 185 + }, + { + "epoch": 0.06001737345020927, + "grad_norm": 0.3513560839888165, + "learning_rate": 0.00012002526847757423, + "loss": 7.3608, + "step": 190 + }, + { + "epoch": 0.061596778014688464, + "grad_norm": 0.447712478864365, + "learning_rate": 0.0001231838281743525, + "loss": 7.3063, + "step": 195 + }, + { + "epoch": 0.06317618257916766, + "grad_norm": 1.006450646835058, + "learning_rate": 0.00012634238787113077, + "loss": 7.243, + "step": 200 + }, + { + "epoch": 0.06475558714364685, + "grad_norm": 0.7173850731067234, + "learning_rate": 0.00012950094756790904, + "loss": 7.2677, + "step": 205 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 0.9216993218698499, + "learning_rate": 0.00013265950726468732, + "loss": 7.2272, + "step": 210 + }, + { + "epoch": 0.06791439627260523, + "grad_norm": 1.0389416519219288, + "learning_rate": 0.00013581806696146557, + "loss": 7.227, + "step": 215 + }, + { + "epoch": 0.06949380083708442, + "grad_norm": 0.8340948519918115, + "learning_rate": 0.00013897662665824384, + "loss": 7.1824, + "step": 220 + }, + { + "epoch": 0.07107320540156362, + "grad_norm": 0.3870955261336493, + "learning_rate": 0.00014213518635502212, + "loss": 7.1977, + "step": 225 + }, + { + "epoch": 0.07265260996604281, + "grad_norm": 0.3683202325041625, + "learning_rate": 0.00014529374605180037, + "loss": 7.2583, + "step": 230 + }, + { + "epoch": 0.07423201453052199, + "grad_norm": 0.3596763082283532, + "learning_rate": 0.00014845230574857864, + "loss": 7.1471, + "step": 235 + }, + { + "epoch": 0.07581141909500118, + "grad_norm": 0.42532969548337346, + "learning_rate": 0.00015161086544535692, + "loss": 7.1602, + "step": 240 + }, + { + "epoch": 0.07739082365948037, + "grad_norm": 0.4682925421768155, + "learning_rate": 0.0001547694251421352, + "loss": 7.1785, + "step": 245 + }, + { + "epoch": 0.07897022822395956, + "grad_norm": 0.2593792457091068, + "learning_rate": 0.00015792798483891347, + "loss": 7.1123, + "step": 250 + }, + { + "epoch": 0.08054963278843875, + "grad_norm": 0.3548563172798498, + "learning_rate": 0.00016108654453569172, + "loss": 7.0681, + "step": 255 + }, + { + "epoch": 0.08212903735291795, + "grad_norm": 0.23746523684222884, + "learning_rate": 0.00016424510423247, + "loss": 7.1175, + "step": 260 + }, + { + "epoch": 0.08370844191739714, + "grad_norm": 0.2931552754311947, + "learning_rate": 0.00016740366392924827, + "loss": 7.1107, + "step": 265 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 0.45247713625301256, + "learning_rate": 0.00017056222362602652, + "loss": 7.1118, + "step": 270 + }, + { + "epoch": 0.08686725104635552, + "grad_norm": 0.4946172916591009, + "learning_rate": 0.0001737207833228048, + "loss": 7.1613, + "step": 275 + }, + { + "epoch": 0.08844665561083472, + "grad_norm": 0.4604549535906028, + "learning_rate": 0.00017687934301958307, + "loss": 7.1548, + "step": 280 + }, + { + "epoch": 0.09002606017531391, + "grad_norm": 0.5301197511893415, + "learning_rate": 0.00018003790271636132, + "loss": 7.1146, + "step": 285 + }, + { + "epoch": 0.0916054647397931, + "grad_norm": 0.1660195259037277, + "learning_rate": 0.00018319646241313962, + "loss": 7.0903, + "step": 290 + }, + { + "epoch": 0.09318486930427229, + "grad_norm": 0.8970188429319172, + "learning_rate": 0.0001863550221099179, + "loss": 7.0519, + "step": 295 + }, + { + "epoch": 0.09476427386875148, + "grad_norm": 0.35501590333638516, + "learning_rate": 0.00018951358180669615, + "loss": 7.1412, + "step": 300 + }, + { + "epoch": 0.09634367843323068, + "grad_norm": 0.5402743924835812, + "learning_rate": 0.00019267214150347443, + "loss": 7.08, + "step": 305 + }, + { + "epoch": 0.09792308299770987, + "grad_norm": 0.5222405693494848, + "learning_rate": 0.00019583070120025267, + "loss": 7.0659, + "step": 310 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 0.7653128151615268, + "learning_rate": 0.00019898926089703095, + "loss": 7.0446, + "step": 315 + }, + { + "epoch": 0.10108189212666825, + "grad_norm": 0.41020927031585835, + "learning_rate": 0.00020214782059380923, + "loss": 6.9886, + "step": 320 + }, + { + "epoch": 0.10266129669114744, + "grad_norm": 0.3465446565452738, + "learning_rate": 0.00020530638029058748, + "loss": 7.0641, + "step": 325 + }, + { + "epoch": 0.10424070125562662, + "grad_norm": 0.5451369780514456, + "learning_rate": 0.00020846493998736578, + "loss": 7.0725, + "step": 330 + }, + { + "epoch": 0.10582010582010581, + "grad_norm": 0.3149566875379701, + "learning_rate": 0.00021162349968414405, + "loss": 7.0405, + "step": 335 + }, + { + "epoch": 0.107399510384585, + "grad_norm": 1.0500906357634088, + "learning_rate": 0.0002147820593809223, + "loss": 7.0727, + "step": 340 + }, + { + "epoch": 0.1089789149490642, + "grad_norm": 0.708025196449839, + "learning_rate": 0.00021794061907770058, + "loss": 7.0846, + "step": 345 + }, + { + "epoch": 0.11055831951354339, + "grad_norm": 0.40365285259163436, + "learning_rate": 0.00022109917877447885, + "loss": 7.0724, + "step": 350 + }, + { + "epoch": 0.11213772407802258, + "grad_norm": 1.009853284364277, + "learning_rate": 0.0002242577384712571, + "loss": 7.1044, + "step": 355 + }, + { + "epoch": 0.11371712864250177, + "grad_norm": 0.30563348116782707, + "learning_rate": 0.00022741629816803538, + "loss": 7.0489, + "step": 360 + }, + { + "epoch": 0.11529653320698097, + "grad_norm": 0.7687726362295777, + "learning_rate": 0.00023057485786481363, + "loss": 7.0521, + "step": 365 + }, + { + "epoch": 0.11687593777146016, + "grad_norm": 0.5391670058221071, + "learning_rate": 0.0002337334175615919, + "loss": 7.0408, + "step": 370 + }, + { + "epoch": 0.11845534233593935, + "grad_norm": 1.5486041455114379, + "learning_rate": 0.0002368919772583702, + "loss": 7.0371, + "step": 375 + }, + { + "epoch": 0.12003474690041854, + "grad_norm": 0.9903096372991533, + "learning_rate": 0.00024005053695514846, + "loss": 7.0633, + "step": 380 + }, + { + "epoch": 0.12161415146489774, + "grad_norm": 0.8877067690259648, + "learning_rate": 0.00024320909665192673, + "loss": 7.0858, + "step": 385 + }, + { + "epoch": 0.12319355602937693, + "grad_norm": 0.49172389967013586, + "learning_rate": 0.000246367656348705, + "loss": 7.0088, + "step": 390 + }, + { + "epoch": 0.12477296059385612, + "grad_norm": 0.33449461681545467, + "learning_rate": 0.00024952621604548323, + "loss": 7.041, + "step": 395 + }, + { + "epoch": 0.1263523651583353, + "grad_norm": 0.1510023218564581, + "learning_rate": 0.00025268477574226153, + "loss": 7.0501, + "step": 400 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 0.1667817614114935, + "learning_rate": 0.00025584333543903984, + "loss": 7.0478, + "step": 405 + }, + { + "epoch": 0.1295111742872937, + "grad_norm": 0.31763527428712873, + "learning_rate": 0.0002590018951358181, + "loss": 7.0516, + "step": 410 + }, + { + "epoch": 0.1310905788517729, + "grad_norm": 0.8340640684806893, + "learning_rate": 0.00026216045483259633, + "loss": 7.0509, + "step": 415 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 1.2175814470982558, + "learning_rate": 0.00026531901452937464, + "loss": 7.0111, + "step": 420 + }, + { + "epoch": 0.13424938798073127, + "grad_norm": 0.2600000626861157, + "learning_rate": 0.0002684775742261529, + "loss": 7.038, + "step": 425 + }, + { + "epoch": 0.13582879254521046, + "grad_norm": 0.3670628581593165, + "learning_rate": 0.00027163613392293113, + "loss": 6.9391, + "step": 430 + }, + { + "epoch": 0.13740819710968966, + "grad_norm": 1.6405436514269163, + "learning_rate": 0.00027479469361970944, + "loss": 6.9568, + "step": 435 + }, + { + "epoch": 0.13898760167416885, + "grad_norm": 1.2445712669582063, + "learning_rate": 0.0002779532533164877, + "loss": 6.9098, + "step": 440 + }, + { + "epoch": 0.14056700623864804, + "grad_norm": 0.370130351586787, + "learning_rate": 0.00028111181301326594, + "loss": 6.9357, + "step": 445 + }, + { + "epoch": 0.14214641080312723, + "grad_norm": 1.0843722903361295, + "learning_rate": 0.00028427037271004424, + "loss": 6.9319, + "step": 450 + }, + { + "epoch": 0.14372581536760642, + "grad_norm": 0.6683778983623138, + "learning_rate": 0.0002874289324068225, + "loss": 6.9744, + "step": 455 + }, + { + "epoch": 0.14530521993208562, + "grad_norm": 1.0477861738856202, + "learning_rate": 0.00029058749210360074, + "loss": 6.9189, + "step": 460 + }, + { + "epoch": 0.1468846244965648, + "grad_norm": 0.25574892908976704, + "learning_rate": 0.00029374605180037904, + "loss": 6.9273, + "step": 465 + }, + { + "epoch": 0.14846402906104397, + "grad_norm": 0.9522550943300727, + "learning_rate": 0.0002969046114971573, + "loss": 6.8819, + "step": 470 + }, + { + "epoch": 0.15004343362552316, + "grad_norm": 2.2990064607561798, + "learning_rate": 0.00030006317119393554, + "loss": 6.9029, + "step": 475 + }, + { + "epoch": 0.15162283819000236, + "grad_norm": 1.1494013293928298, + "learning_rate": 0.00030322173089071384, + "loss": 6.9855, + "step": 480 + }, + { + "epoch": 0.15320224275448155, + "grad_norm": 0.9865873913996686, + "learning_rate": 0.00030638029058749214, + "loss": 6.9323, + "step": 485 + }, + { + "epoch": 0.15478164731896074, + "grad_norm": 0.29333155830925645, + "learning_rate": 0.0003095388502842704, + "loss": 6.8918, + "step": 490 + }, + { + "epoch": 0.15636105188343993, + "grad_norm": 1.180574183859259, + "learning_rate": 0.0003126974099810487, + "loss": 6.9231, + "step": 495 + }, + { + "epoch": 0.15794045644791913, + "grad_norm": 0.3676278970131492, + "learning_rate": 0.00031585596967782694, + "loss": 6.824, + "step": 500 + }, + { + "epoch": 0.15951986101239832, + "grad_norm": 0.7386923557000004, + "learning_rate": 0.0003190145293746052, + "loss": 6.8202, + "step": 505 + }, + { + "epoch": 0.1610992655768775, + "grad_norm": 0.49014816122240945, + "learning_rate": 0.00032217308907138344, + "loss": 6.8693, + "step": 510 + }, + { + "epoch": 0.1626786701413567, + "grad_norm": 0.6786399651886812, + "learning_rate": 0.00032533164876816174, + "loss": 6.8113, + "step": 515 + }, + { + "epoch": 0.1642580747058359, + "grad_norm": 0.2186297422888697, + "learning_rate": 0.00032849020846494, + "loss": 6.8103, + "step": 520 + }, + { + "epoch": 0.16583747927031509, + "grad_norm": 0.2993551100105818, + "learning_rate": 0.00033164876816171824, + "loss": 6.7853, + "step": 525 + }, + { + "epoch": 0.16741688383479428, + "grad_norm": 1.0992601864877571, + "learning_rate": 0.00033480732785849654, + "loss": 6.7248, + "step": 530 + }, + { + "epoch": 0.16899628839927347, + "grad_norm": 1.7439864127638862, + "learning_rate": 0.0003379658875552748, + "loss": 6.761, + "step": 535 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 1.1409198091974713, + "learning_rate": 0.00034112444725205304, + "loss": 6.7303, + "step": 540 + }, + { + "epoch": 0.17215509752823185, + "grad_norm": 1.0606879735807513, + "learning_rate": 0.00034428300694883135, + "loss": 6.7154, + "step": 545 + }, + { + "epoch": 0.17373450209271105, + "grad_norm": 1.4442637810068535, + "learning_rate": 0.0003474415666456096, + "loss": 6.7215, + "step": 550 + }, + { + "epoch": 0.17531390665719024, + "grad_norm": 1.006395303818788, + "learning_rate": 0.00035060012634238784, + "loss": 6.6503, + "step": 555 + }, + { + "epoch": 0.17689331122166943, + "grad_norm": 1.0589832445448473, + "learning_rate": 0.00035375868603916615, + "loss": 6.7135, + "step": 560 + }, + { + "epoch": 0.17847271578614862, + "grad_norm": 0.8497334144261705, + "learning_rate": 0.0003569172457359444, + "loss": 6.6027, + "step": 565 + }, + { + "epoch": 0.18005212035062781, + "grad_norm": 1.1077562449599352, + "learning_rate": 0.00036007580543272264, + "loss": 6.5622, + "step": 570 + }, + { + "epoch": 0.181631524915107, + "grad_norm": 1.7098080815292351, + "learning_rate": 0.000363234365129501, + "loss": 6.5887, + "step": 575 + }, + { + "epoch": 0.1832109294795862, + "grad_norm": 0.9248105424600663, + "learning_rate": 0.00036639292482627925, + "loss": 6.52, + "step": 580 + }, + { + "epoch": 0.1847903340440654, + "grad_norm": 0.896852304222008, + "learning_rate": 0.0003695514845230575, + "loss": 6.4583, + "step": 585 + }, + { + "epoch": 0.18636973860854458, + "grad_norm": 0.46310282774613815, + "learning_rate": 0.0003727100442198358, + "loss": 6.4367, + "step": 590 + }, + { + "epoch": 0.18794914317302377, + "grad_norm": 0.6968747602312554, + "learning_rate": 0.00037586860391661405, + "loss": 6.3631, + "step": 595 + }, + { + "epoch": 0.18952854773750297, + "grad_norm": 0.7109858278819291, + "learning_rate": 0.0003790271636133923, + "loss": 6.3846, + "step": 600 + }, + { + "epoch": 0.19110795230198216, + "grad_norm": 1.2857212115780978, + "learning_rate": 0.0003821857233101706, + "loss": 6.3662, + "step": 605 + }, + { + "epoch": 0.19268735686646135, + "grad_norm": 0.7639201898318926, + "learning_rate": 0.00038534428300694885, + "loss": 6.3526, + "step": 610 + }, + { + "epoch": 0.19426676143094054, + "grad_norm": 0.509150106992322, + "learning_rate": 0.0003885028427037271, + "loss": 6.2548, + "step": 615 + }, + { + "epoch": 0.19584616599541974, + "grad_norm": 1.0321104885060626, + "learning_rate": 0.00039166140240050535, + "loss": 6.1297, + "step": 620 + }, + { + "epoch": 0.19742557055989893, + "grad_norm": 0.7877386488974744, + "learning_rate": 0.00039481996209728365, + "loss": 6.2514, + "step": 625 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 0.8329046476850496, + "learning_rate": 0.0003979785217940619, + "loss": 6.2613, + "step": 630 + }, + { + "epoch": 0.2005843796888573, + "grad_norm": 0.8775569920638122, + "learning_rate": 0.00040113708149084015, + "loss": 6.1495, + "step": 635 + }, + { + "epoch": 0.2021637842533365, + "grad_norm": 0.7269233689233559, + "learning_rate": 0.00040429564118761845, + "loss": 6.1335, + "step": 640 + }, + { + "epoch": 0.2037431888178157, + "grad_norm": 0.6496721449107232, + "learning_rate": 0.0004074542008843967, + "loss": 6.0668, + "step": 645 + }, + { + "epoch": 0.2053225933822949, + "grad_norm": 0.5122479752510881, + "learning_rate": 0.00041061276058117495, + "loss": 6.1253, + "step": 650 + }, + { + "epoch": 0.20690199794677408, + "grad_norm": 0.5050744799240616, + "learning_rate": 0.00041377132027795325, + "loss": 6.0586, + "step": 655 + }, + { + "epoch": 0.20848140251125324, + "grad_norm": 0.3479184996986057, + "learning_rate": 0.00041692987997473156, + "loss": 6.0856, + "step": 660 + }, + { + "epoch": 0.21006080707573244, + "grad_norm": 0.3068938111653456, + "learning_rate": 0.0004200884396715098, + "loss": 5.9654, + "step": 665 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 0.39495459621524287, + "learning_rate": 0.0004232469993682881, + "loss": 6.095, + "step": 670 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 0.9660105686777036, + "learning_rate": 0.00042640555906506636, + "loss": 5.9261, + "step": 675 + }, + { + "epoch": 0.21479902076917, + "grad_norm": 0.9919331988900892, + "learning_rate": 0.0004295641187618446, + "loss": 5.886, + "step": 680 + }, + { + "epoch": 0.2163784253336492, + "grad_norm": 0.7160578065834907, + "learning_rate": 0.0004327226784586229, + "loss": 5.8574, + "step": 685 + }, + { + "epoch": 0.2179578298981284, + "grad_norm": 0.639006532826775, + "learning_rate": 0.00043588123815540116, + "loss": 5.8255, + "step": 690 + }, + { + "epoch": 0.2195372344626076, + "grad_norm": 0.3634595486992699, + "learning_rate": 0.0004390397978521794, + "loss": 5.8781, + "step": 695 + }, + { + "epoch": 0.22111663902708678, + "grad_norm": 0.2936598155781062, + "learning_rate": 0.0004421983575489577, + "loss": 5.811, + "step": 700 + }, + { + "epoch": 0.22269604359156597, + "grad_norm": 0.325525269860369, + "learning_rate": 0.00044535691724573596, + "loss": 5.771, + "step": 705 + }, + { + "epoch": 0.22427544815604517, + "grad_norm": 0.312506499301807, + "learning_rate": 0.0004485154769425142, + "loss": 5.6925, + "step": 710 + }, + { + "epoch": 0.22585485272052436, + "grad_norm": 0.9098263319413956, + "learning_rate": 0.0004516740366392925, + "loss": 5.6828, + "step": 715 + }, + { + "epoch": 0.22743425728500355, + "grad_norm": 0.38653013169140144, + "learning_rate": 0.00045483259633607076, + "loss": 5.7282, + "step": 720 + }, + { + "epoch": 0.22901366184948274, + "grad_norm": 0.8225548383300585, + "learning_rate": 0.000457991156032849, + "loss": 5.6139, + "step": 725 + }, + { + "epoch": 0.23059306641396193, + "grad_norm": 0.674246406645935, + "learning_rate": 0.00046114971572962726, + "loss": 5.751, + "step": 730 + }, + { + "epoch": 0.23217247097844113, + "grad_norm": 0.410688929283078, + "learning_rate": 0.00046430827542640556, + "loss": 5.6573, + "step": 735 + }, + { + "epoch": 0.23375187554292032, + "grad_norm": 0.5258450644644558, + "learning_rate": 0.0004674668351231838, + "loss": 5.6017, + "step": 740 + }, + { + "epoch": 0.2353312801073995, + "grad_norm": 0.5455996191043986, + "learning_rate": 0.00047062539481996206, + "loss": 5.5352, + "step": 745 + }, + { + "epoch": 0.2369106846718787, + "grad_norm": 0.6327547779360982, + "learning_rate": 0.0004737839545167404, + "loss": 5.5942, + "step": 750 + }, + { + "epoch": 0.2384900892363579, + "grad_norm": 0.4249793063677703, + "learning_rate": 0.00047694251421351866, + "loss": 5.5499, + "step": 755 + }, + { + "epoch": 0.24006949380083709, + "grad_norm": 0.4846805701090246, + "learning_rate": 0.0004801010739102969, + "loss": 5.5201, + "step": 760 + }, + { + "epoch": 0.24164889836531628, + "grad_norm": 0.40961968063799054, + "learning_rate": 0.0004832596336070752, + "loss": 5.5148, + "step": 765 + }, + { + "epoch": 0.24322830292979547, + "grad_norm": 0.43260679031245164, + "learning_rate": 0.00048641819330385346, + "loss": 5.5397, + "step": 770 + }, + { + "epoch": 0.24480770749427466, + "grad_norm": 0.580322281525859, + "learning_rate": 0.0004895767530006317, + "loss": 5.5303, + "step": 775 + }, + { + "epoch": 0.24638711205875385, + "grad_norm": 0.5838175482889031, + "learning_rate": 0.00049273531269741, + "loss": 5.4732, + "step": 780 + }, + { + "epoch": 0.24796651662323305, + "grad_norm": 0.5355101569599774, + "learning_rate": 0.0004958938723941882, + "loss": 5.4693, + "step": 785 + }, + { + "epoch": 0.24954592118771224, + "grad_norm": 0.2647729684431385, + "learning_rate": 0.0004990524320909665, + "loss": 5.4473, + "step": 790 + }, + { + "epoch": 0.25112532575219143, + "grad_norm": 0.37971671833738974, + "learning_rate": 0.0005022109917877448, + "loss": 5.4755, + "step": 795 + }, + { + "epoch": 0.2527047303166706, + "grad_norm": 0.5315182380875813, + "learning_rate": 0.0005053695514845231, + "loss": 5.3669, + "step": 800 + }, + { + "epoch": 0.2542841348811498, + "grad_norm": 0.3985349063314641, + "learning_rate": 0.0005085281111813014, + "loss": 5.3347, + "step": 805 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 0.39453329040038015, + "learning_rate": 0.0005116866708780797, + "loss": 5.34, + "step": 810 + }, + { + "epoch": 0.2574429440101082, + "grad_norm": 0.44887016233351545, + "learning_rate": 0.0005148452305748579, + "loss": 5.2464, + "step": 815 + }, + { + "epoch": 0.2590223485745874, + "grad_norm": 0.4158770980372098, + "learning_rate": 0.0005180037902716362, + "loss": 5.3033, + "step": 820 + }, + { + "epoch": 0.2606017531390666, + "grad_norm": 0.4822698046717804, + "learning_rate": 0.0005211623499684144, + "loss": 5.3215, + "step": 825 + }, + { + "epoch": 0.2621811577035458, + "grad_norm": 0.4263622236375534, + "learning_rate": 0.0005243209096651927, + "loss": 5.258, + "step": 830 + }, + { + "epoch": 0.26376056226802497, + "grad_norm": 0.416073734999235, + "learning_rate": 0.000527479469361971, + "loss": 5.1997, + "step": 835 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 0.31131344489569407, + "learning_rate": 0.0005306380290587493, + "loss": 5.2658, + "step": 840 + }, + { + "epoch": 0.26691937139698335, + "grad_norm": 0.6203655842680998, + "learning_rate": 0.0005337965887555275, + "loss": 5.4155, + "step": 845 + }, + { + "epoch": 0.26849877596146254, + "grad_norm": 0.4874508807387519, + "learning_rate": 0.0005369551484523058, + "loss": 5.2538, + "step": 850 + }, + { + "epoch": 0.27007818052594174, + "grad_norm": 0.36434499694692957, + "learning_rate": 0.000540113708149084, + "loss": 5.1956, + "step": 855 + }, + { + "epoch": 0.2716575850904209, + "grad_norm": 0.35454696093453264, + "learning_rate": 0.0005432722678458623, + "loss": 5.2159, + "step": 860 + }, + { + "epoch": 0.2732369896549001, + "grad_norm": 0.3688009439063136, + "learning_rate": 0.0005464308275426406, + "loss": 5.2339, + "step": 865 + }, + { + "epoch": 0.2748163942193793, + "grad_norm": 0.42921225196120893, + "learning_rate": 0.0005495893872394189, + "loss": 5.2704, + "step": 870 + }, + { + "epoch": 0.2763957987838585, + "grad_norm": 0.4500237273968581, + "learning_rate": 0.0005527479469361971, + "loss": 5.2321, + "step": 875 + }, + { + "epoch": 0.2779752033483377, + "grad_norm": 0.6393991884989825, + "learning_rate": 0.0005559065066329754, + "loss": 5.2822, + "step": 880 + }, + { + "epoch": 0.2795546079128169, + "grad_norm": 0.4561042638940091, + "learning_rate": 0.0005590650663297536, + "loss": 5.1967, + "step": 885 + }, + { + "epoch": 0.2811340124772961, + "grad_norm": 0.4341056142543411, + "learning_rate": 0.0005622236260265319, + "loss": 5.0808, + "step": 890 + }, + { + "epoch": 0.2827134170417753, + "grad_norm": 0.33406793605522406, + "learning_rate": 0.0005653821857233101, + "loss": 5.1819, + "step": 895 + }, + { + "epoch": 0.28429282160625446, + "grad_norm": 0.2965674356367386, + "learning_rate": 0.0005685407454200885, + "loss": 5.0906, + "step": 900 + }, + { + "epoch": 0.28587222617073366, + "grad_norm": 0.2846613701718218, + "learning_rate": 0.0005716993051168667, + "loss": 5.0733, + "step": 905 + }, + { + "epoch": 0.28745163073521285, + "grad_norm": 0.3126642707513588, + "learning_rate": 0.000574857864813645, + "loss": 5.1798, + "step": 910 + }, + { + "epoch": 0.28903103529969204, + "grad_norm": 0.500617977929322, + "learning_rate": 0.0005780164245104232, + "loss": 5.1134, + "step": 915 + }, + { + "epoch": 0.29061043986417123, + "grad_norm": 0.5064572519101513, + "learning_rate": 0.0005811749842072015, + "loss": 5.0257, + "step": 920 + }, + { + "epoch": 0.2921898444286504, + "grad_norm": 0.3277593134763086, + "learning_rate": 0.0005843335439039797, + "loss": 5.1563, + "step": 925 + }, + { + "epoch": 0.2937692489931296, + "grad_norm": 0.7447972198091461, + "learning_rate": 0.0005874921036007581, + "loss": 5.0243, + "step": 930 + }, + { + "epoch": 0.2953486535576088, + "grad_norm": 0.48383747311962916, + "learning_rate": 0.0005906506632975363, + "loss": 5.0432, + "step": 935 + }, + { + "epoch": 0.29692805812208795, + "grad_norm": 0.4219932484068117, + "learning_rate": 0.0005938092229943146, + "loss": 5.0502, + "step": 940 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.45534278486226426, + "learning_rate": 0.0005969677826910928, + "loss": 5.0904, + "step": 945 + }, + { + "epoch": 0.30008686725104633, + "grad_norm": 0.8801601148284834, + "learning_rate": 0.0006001263423878711, + "loss": 5.1848, + "step": 950 + }, + { + "epoch": 0.3016662718155255, + "grad_norm": 0.4182931559063342, + "learning_rate": 0.0006032849020846493, + "loss": 5.0242, + "step": 955 + }, + { + "epoch": 0.3032456763800047, + "grad_norm": 0.5575755512746062, + "learning_rate": 0.0006064434617814277, + "loss": 4.9951, + "step": 960 + }, + { + "epoch": 0.3048250809444839, + "grad_norm": 0.5825632007479437, + "learning_rate": 0.0006096020214782059, + "loss": 5.0358, + "step": 965 + }, + { + "epoch": 0.3064044855089631, + "grad_norm": 0.3190473695748764, + "learning_rate": 0.0006127605811749843, + "loss": 4.9772, + "step": 970 + }, + { + "epoch": 0.3079838900734423, + "grad_norm": 0.5708845146854377, + "learning_rate": 0.0006159191408717625, + "loss": 4.9756, + "step": 975 + }, + { + "epoch": 0.3095632946379215, + "grad_norm": 0.3903158506235631, + "learning_rate": 0.0006190777005685408, + "loss": 4.9497, + "step": 980 + }, + { + "epoch": 0.3111426992024007, + "grad_norm": 0.43408173547097856, + "learning_rate": 0.000622236260265319, + "loss": 4.9968, + "step": 985 + }, + { + "epoch": 0.31272210376687987, + "grad_norm": 0.2740192171117319, + "learning_rate": 0.0006253948199620974, + "loss": 4.8619, + "step": 990 + }, + { + "epoch": 0.31430150833135906, + "grad_norm": 0.4258379345491042, + "learning_rate": 0.0006285533796588756, + "loss": 5.0052, + "step": 995 + }, + { + "epoch": 0.31588091289583825, + "grad_norm": 0.32210281861219764, + "learning_rate": 0.0006317119393556539, + "loss": 4.9013, + "step": 1000 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.340983266789433, + "learning_rate": 0.0006348704990524321, + "loss": 4.9806, + "step": 1005 + }, + { + "epoch": 0.31903972202479663, + "grad_norm": 0.4434195266796685, + "learning_rate": 0.0006380290587492104, + "loss": 4.8858, + "step": 1010 + }, + { + "epoch": 0.3206191265892758, + "grad_norm": 0.4466646390477218, + "learning_rate": 0.0006411876184459886, + "loss": 4.9415, + "step": 1015 + }, + { + "epoch": 0.322198531153755, + "grad_norm": 0.4677912287920951, + "learning_rate": 0.0006443461781427669, + "loss": 4.7934, + "step": 1020 + }, + { + "epoch": 0.3237779357182342, + "grad_norm": 0.33926521745253735, + "learning_rate": 0.0006475047378395452, + "loss": 4.8824, + "step": 1025 + }, + { + "epoch": 0.3253573402827134, + "grad_norm": 0.3345925861311437, + "learning_rate": 0.0006506632975363235, + "loss": 4.9302, + "step": 1030 + }, + { + "epoch": 0.3269367448471926, + "grad_norm": 0.3515104548017844, + "learning_rate": 0.0006538218572331017, + "loss": 4.9207, + "step": 1035 + }, + { + "epoch": 0.3285161494116718, + "grad_norm": 0.412734062451674, + "learning_rate": 0.00065698041692988, + "loss": 4.8604, + "step": 1040 + }, + { + "epoch": 0.330095553976151, + "grad_norm": 0.40404802367588816, + "learning_rate": 0.0006601389766266582, + "loss": 4.8051, + "step": 1045 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 0.2687637260302526, + "learning_rate": 0.0006632975363234365, + "loss": 4.7848, + "step": 1050 + }, + { + "epoch": 0.33325436310510936, + "grad_norm": 0.43853265327336977, + "learning_rate": 0.0006664560960202148, + "loss": 4.9433, + "step": 1055 + }, + { + "epoch": 0.33483376766958856, + "grad_norm": 0.5133882529724854, + "learning_rate": 0.0006696146557169931, + "loss": 4.7974, + "step": 1060 + }, + { + "epoch": 0.33641317223406775, + "grad_norm": 0.45108000204120774, + "learning_rate": 0.0006727732154137713, + "loss": 4.8565, + "step": 1065 + }, + { + "epoch": 0.33799257679854694, + "grad_norm": 0.520840065835754, + "learning_rate": 0.0006759317751105496, + "loss": 4.7931, + "step": 1070 + }, + { + "epoch": 0.33957198136302613, + "grad_norm": 0.37929072418664284, + "learning_rate": 0.0006790903348073278, + "loss": 4.7981, + "step": 1075 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 0.37545606188093755, + "learning_rate": 0.0006822488945041061, + "loss": 4.8073, + "step": 1080 + }, + { + "epoch": 0.3427307904919845, + "grad_norm": 0.6292169690326377, + "learning_rate": 0.0006854074542008844, + "loss": 4.7367, + "step": 1085 + }, + { + "epoch": 0.3443101950564637, + "grad_norm": 0.33769916351911256, + "learning_rate": 0.0006885660138976627, + "loss": 4.7695, + "step": 1090 + }, + { + "epoch": 0.3458895996209429, + "grad_norm": 0.4320591815932741, + "learning_rate": 0.0006917245735944409, + "loss": 4.6927, + "step": 1095 + }, + { + "epoch": 0.3474690041854221, + "grad_norm": 0.4091589279145143, + "learning_rate": 0.0006948831332912192, + "loss": 4.8544, + "step": 1100 + }, + { + "epoch": 0.3490484087499013, + "grad_norm": 0.470757839112991, + "learning_rate": 0.0006980416929879974, + "loss": 4.7676, + "step": 1105 + }, + { + "epoch": 0.3506278133143805, + "grad_norm": 0.4198823601800511, + "learning_rate": 0.0007012002526847757, + "loss": 4.721, + "step": 1110 + }, + { + "epoch": 0.35220721787885967, + "grad_norm": 0.34137569065032386, + "learning_rate": 0.0007043588123815539, + "loss": 4.7626, + "step": 1115 + }, + { + "epoch": 0.35378662244333886, + "grad_norm": 0.49971050328089983, + "learning_rate": 0.0007075173720783323, + "loss": 4.8122, + "step": 1120 + }, + { + "epoch": 0.35536602700781805, + "grad_norm": 0.3953000041331016, + "learning_rate": 0.0007106759317751105, + "loss": 4.744, + "step": 1125 + }, + { + "epoch": 0.35694543157229724, + "grad_norm": 0.3817554183346107, + "learning_rate": 0.0007138344914718888, + "loss": 4.7326, + "step": 1130 + }, + { + "epoch": 0.35852483613677644, + "grad_norm": 0.3065329226094244, + "learning_rate": 0.000716993051168667, + "loss": 4.6795, + "step": 1135 + }, + { + "epoch": 0.36010424070125563, + "grad_norm": 0.2771623794292736, + "learning_rate": 0.0007201516108654453, + "loss": 4.6518, + "step": 1140 + }, + { + "epoch": 0.3616836452657348, + "grad_norm": 0.35400731064131585, + "learning_rate": 0.0007233101705622235, + "loss": 4.7473, + "step": 1145 + }, + { + "epoch": 0.363263049830214, + "grad_norm": 0.3484555026524429, + "learning_rate": 0.000726468730259002, + "loss": 4.6603, + "step": 1150 + }, + { + "epoch": 0.3648424543946932, + "grad_norm": 0.2450169475092362, + "learning_rate": 0.0007296272899557803, + "loss": 4.6341, + "step": 1155 + }, + { + "epoch": 0.3664218589591724, + "grad_norm": 0.4006637364871361, + "learning_rate": 0.0007327858496525585, + "loss": 4.6863, + "step": 1160 + }, + { + "epoch": 0.3680012635236516, + "grad_norm": 0.45669146482081835, + "learning_rate": 0.0007359444093493367, + "loss": 4.6163, + "step": 1165 + }, + { + "epoch": 0.3695806680881308, + "grad_norm": 0.28184437239622734, + "learning_rate": 0.000739102969046115, + "loss": 4.6583, + "step": 1170 + }, + { + "epoch": 0.37116007265261, + "grad_norm": 0.36542190468548746, + "learning_rate": 0.0007422615287428932, + "loss": 4.626, + "step": 1175 + }, + { + "epoch": 0.37273947721708917, + "grad_norm": 0.3211192511657023, + "learning_rate": 0.0007454200884396716, + "loss": 4.7379, + "step": 1180 + }, + { + "epoch": 0.37431888178156836, + "grad_norm": 0.20304048450743567, + "learning_rate": 0.0007485786481364499, + "loss": 4.6317, + "step": 1185 + }, + { + "epoch": 0.37589828634604755, + "grad_norm": 0.3540717270875546, + "learning_rate": 0.0007517372078332281, + "loss": 4.5685, + "step": 1190 + }, + { + "epoch": 0.37747769091052674, + "grad_norm": 0.2615691483415603, + "learning_rate": 0.0007548957675300064, + "loss": 4.6439, + "step": 1195 + }, + { + "epoch": 0.37905709547500593, + "grad_norm": 0.3503676341696831, + "learning_rate": 0.0007580543272267846, + "loss": 4.6472, + "step": 1200 + }, + { + "epoch": 0.3806365000394851, + "grad_norm": 0.381747407284883, + "learning_rate": 0.0007612128869235628, + "loss": 4.5915, + "step": 1205 + }, + { + "epoch": 0.3822159046039643, + "grad_norm": 0.5555220296942412, + "learning_rate": 0.0007643714466203412, + "loss": 4.5812, + "step": 1210 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 0.5472892354048585, + "learning_rate": 0.0007675300063171195, + "loss": 4.6288, + "step": 1215 + }, + { + "epoch": 0.3853747137329227, + "grad_norm": 0.41350514353566703, + "learning_rate": 0.0007706885660138977, + "loss": 4.511, + "step": 1220 + }, + { + "epoch": 0.3869541182974019, + "grad_norm": 0.30935412837157134, + "learning_rate": 0.000773847125710676, + "loss": 4.5887, + "step": 1225 + }, + { + "epoch": 0.3885335228618811, + "grad_norm": 0.45469414600545827, + "learning_rate": 0.0007770056854074542, + "loss": 4.545, + "step": 1230 + }, + { + "epoch": 0.3901129274263603, + "grad_norm": 0.45367406395912707, + "learning_rate": 0.0007801642451042324, + "loss": 4.6878, + "step": 1235 + }, + { + "epoch": 0.39169233199083947, + "grad_norm": 0.47051545366899983, + "learning_rate": 0.0007833228048010107, + "loss": 4.5098, + "step": 1240 + }, + { + "epoch": 0.39327173655531866, + "grad_norm": 0.356870891214942, + "learning_rate": 0.0007864813644977891, + "loss": 4.6192, + "step": 1245 + }, + { + "epoch": 0.39485114111979785, + "grad_norm": 0.3398669537171447, + "learning_rate": 0.0007896399241945673, + "loss": 4.553, + "step": 1250 + }, + { + "epoch": 0.39643054568427705, + "grad_norm": 0.38876482712395577, + "learning_rate": 0.0007927984838913456, + "loss": 4.4853, + "step": 1255 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 0.4094421799432949, + "learning_rate": 0.0007959570435881238, + "loss": 4.4592, + "step": 1260 + }, + { + "epoch": 0.39958935481323543, + "grad_norm": 0.39385781540672343, + "learning_rate": 0.000799115603284902, + "loss": 4.5032, + "step": 1265 + }, + { + "epoch": 0.4011687593777146, + "grad_norm": 0.5817554845495431, + "learning_rate": 0.0008022741629816803, + "loss": 4.649, + "step": 1270 + }, + { + "epoch": 0.4027481639421938, + "grad_norm": 0.4778375510654764, + "learning_rate": 0.0008054327226784587, + "loss": 4.5617, + "step": 1275 + }, + { + "epoch": 0.404327568506673, + "grad_norm": 0.3747112096903426, + "learning_rate": 0.0008085912823752369, + "loss": 4.4785, + "step": 1280 + }, + { + "epoch": 0.4059069730711522, + "grad_norm": 0.2862809855465098, + "learning_rate": 0.0008117498420720152, + "loss": 4.5621, + "step": 1285 + }, + { + "epoch": 0.4074863776356314, + "grad_norm": 0.2670050072937441, + "learning_rate": 0.0008149084017687934, + "loss": 4.5072, + "step": 1290 + }, + { + "epoch": 0.4090657822001106, + "grad_norm": 0.27279766937650357, + "learning_rate": 0.0008180669614655717, + "loss": 4.473, + "step": 1295 + }, + { + "epoch": 0.4106451867645898, + "grad_norm": 0.3543901390752811, + "learning_rate": 0.0008212255211623499, + "loss": 4.4827, + "step": 1300 + }, + { + "epoch": 0.41222459132906897, + "grad_norm": 0.4597168775174795, + "learning_rate": 0.0008243840808591283, + "loss": 4.4519, + "step": 1305 + }, + { + "epoch": 0.41380399589354816, + "grad_norm": 0.3161074895361928, + "learning_rate": 0.0008275426405559065, + "loss": 4.3894, + "step": 1310 + }, + { + "epoch": 0.4153834004580273, + "grad_norm": 0.3031787080639516, + "learning_rate": 0.0008307012002526848, + "loss": 4.3778, + "step": 1315 + }, + { + "epoch": 0.4169628050225065, + "grad_norm": 0.4046902940993039, + "learning_rate": 0.0008338597599494631, + "loss": 4.5039, + "step": 1320 + }, + { + "epoch": 0.4185422095869857, + "grad_norm": 0.28233846625561443, + "learning_rate": 0.0008370183196462414, + "loss": 4.4458, + "step": 1325 + }, + { + "epoch": 0.4201216141514649, + "grad_norm": 0.5878023927754359, + "learning_rate": 0.0008401768793430196, + "loss": 4.4569, + "step": 1330 + }, + { + "epoch": 0.42170101871594406, + "grad_norm": 0.28491144301264926, + "learning_rate": 0.000843335439039798, + "loss": 4.4164, + "step": 1335 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 0.339216984189955, + "learning_rate": 0.0008464939987365762, + "loss": 4.4052, + "step": 1340 + }, + { + "epoch": 0.42485982784490245, + "grad_norm": 0.3287668012766888, + "learning_rate": 0.0008496525584333545, + "loss": 4.3853, + "step": 1345 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 0.3421773901777664, + "learning_rate": 0.0008528111181301327, + "loss": 4.442, + "step": 1350 + }, + { + "epoch": 0.42801863697386083, + "grad_norm": 0.3292206362755166, + "learning_rate": 0.000855969677826911, + "loss": 4.4226, + "step": 1355 + }, + { + "epoch": 0.42959804153834, + "grad_norm": 0.3371728323598187, + "learning_rate": 0.0008591282375236892, + "loss": 4.4446, + "step": 1360 + }, + { + "epoch": 0.4311774461028192, + "grad_norm": 0.27079521890457803, + "learning_rate": 0.0008622867972204675, + "loss": 4.3547, + "step": 1365 + }, + { + "epoch": 0.4327568506672984, + "grad_norm": 0.2622769562992865, + "learning_rate": 0.0008654453569172458, + "loss": 4.3783, + "step": 1370 + }, + { + "epoch": 0.4343362552317776, + "grad_norm": 0.28526124910101036, + "learning_rate": 0.0008686039166140241, + "loss": 4.4134, + "step": 1375 + }, + { + "epoch": 0.4359156597962568, + "grad_norm": 0.4703111974864806, + "learning_rate": 0.0008717624763108023, + "loss": 4.3905, + "step": 1380 + }, + { + "epoch": 0.437495064360736, + "grad_norm": 0.42877065456487523, + "learning_rate": 0.0008749210360075806, + "loss": 4.3255, + "step": 1385 + }, + { + "epoch": 0.4390744689252152, + "grad_norm": 0.2818515041639332, + "learning_rate": 0.0008780795957043588, + "loss": 4.4326, + "step": 1390 + }, + { + "epoch": 0.44065387348969437, + "grad_norm": 0.3386738826947052, + "learning_rate": 0.0008812381554011371, + "loss": 4.3857, + "step": 1395 + }, + { + "epoch": 0.44223327805417356, + "grad_norm": 0.38221324112499194, + "learning_rate": 0.0008843967150979154, + "loss": 4.4072, + "step": 1400 + }, + { + "epoch": 0.44381268261865275, + "grad_norm": 0.4550909203693113, + "learning_rate": 0.0008875552747946937, + "loss": 4.3394, + "step": 1405 + }, + { + "epoch": 0.44539208718313195, + "grad_norm": 0.5188511582204778, + "learning_rate": 0.0008907138344914719, + "loss": 4.3843, + "step": 1410 + }, + { + "epoch": 0.44697149174761114, + "grad_norm": 0.37563874540589287, + "learning_rate": 0.0008938723941882502, + "loss": 4.3411, + "step": 1415 + }, + { + "epoch": 0.44855089631209033, + "grad_norm": 0.5066382494462849, + "learning_rate": 0.0008970309538850284, + "loss": 4.4086, + "step": 1420 + }, + { + "epoch": 0.4501303008765695, + "grad_norm": 0.354382674134275, + "learning_rate": 0.0009001895135818067, + "loss": 4.3791, + "step": 1425 + }, + { + "epoch": 0.4517097054410487, + "grad_norm": 0.3153630402026596, + "learning_rate": 0.000903348073278585, + "loss": 4.3098, + "step": 1430 + }, + { + "epoch": 0.4532891100055279, + "grad_norm": 0.2247411189874934, + "learning_rate": 0.0009065066329753633, + "loss": 4.3073, + "step": 1435 + }, + { + "epoch": 0.4548685145700071, + "grad_norm": 0.3084869625887127, + "learning_rate": 0.0009096651926721415, + "loss": 4.2301, + "step": 1440 + }, + { + "epoch": 0.4564479191344863, + "grad_norm": 0.2683775947502175, + "learning_rate": 0.0009128237523689198, + "loss": 4.3069, + "step": 1445 + }, + { + "epoch": 0.4580273236989655, + "grad_norm": 0.19696762482110167, + "learning_rate": 0.000915982312065698, + "loss": 4.3117, + "step": 1450 + }, + { + "epoch": 0.4596067282634447, + "grad_norm": 0.31780938882297877, + "learning_rate": 0.0009191408717624763, + "loss": 4.3639, + "step": 1455 + }, + { + "epoch": 0.46118613282792387, + "grad_norm": 0.34985785227208677, + "learning_rate": 0.0009222994314592545, + "loss": 4.2762, + "step": 1460 + }, + { + "epoch": 0.46276553739240306, + "grad_norm": 0.44099480740170816, + "learning_rate": 0.0009254579911560329, + "loss": 4.3993, + "step": 1465 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 0.3209951859089135, + "learning_rate": 0.0009286165508528111, + "loss": 4.3052, + "step": 1470 + }, + { + "epoch": 0.46592434652136144, + "grad_norm": 0.37335084714596994, + "learning_rate": 0.0009317751105495894, + "loss": 4.2903, + "step": 1475 + }, + { + "epoch": 0.46750375108584064, + "grad_norm": 0.2842543897263021, + "learning_rate": 0.0009349336702463676, + "loss": 4.2876, + "step": 1480 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 0.30335571616435514, + "learning_rate": 0.0009380922299431459, + "loss": 4.235, + "step": 1485 + }, + { + "epoch": 0.470662560214799, + "grad_norm": 0.266764361136666, + "learning_rate": 0.0009412507896399241, + "loss": 4.2846, + "step": 1490 + }, + { + "epoch": 0.4722419647792782, + "grad_norm": 0.3519813365858386, + "learning_rate": 0.0009444093493367026, + "loss": 4.3133, + "step": 1495 + }, + { + "epoch": 0.4738213693437574, + "grad_norm": 0.2742010924034352, + "learning_rate": 0.0009475679090334808, + "loss": 4.2552, + "step": 1500 + }, + { + "epoch": 0.4754007739082366, + "grad_norm": 0.28659499947688005, + "learning_rate": 0.0009507264687302591, + "loss": 4.276, + "step": 1505 + }, + { + "epoch": 0.4769801784727158, + "grad_norm": 0.26463115258969583, + "learning_rate": 0.0009538850284270373, + "loss": 4.2585, + "step": 1510 + }, + { + "epoch": 0.478559583037195, + "grad_norm": 0.4491072115265183, + "learning_rate": 0.0009570435881238156, + "loss": 4.2574, + "step": 1515 + }, + { + "epoch": 0.48013898760167417, + "grad_norm": 0.3236579306871126, + "learning_rate": 0.0009602021478205938, + "loss": 4.324, + "step": 1520 + }, + { + "epoch": 0.48171839216615336, + "grad_norm": 0.28007607030549353, + "learning_rate": 0.0009633607075173722, + "loss": 4.3264, + "step": 1525 + }, + { + "epoch": 0.48329779673063256, + "grad_norm": 0.4518966964830614, + "learning_rate": 0.0009665192672141504, + "loss": 4.213, + "step": 1530 + }, + { + "epoch": 0.48487720129511175, + "grad_norm": 0.29253505122413304, + "learning_rate": 0.0009696778269109287, + "loss": 4.3187, + "step": 1535 + }, + { + "epoch": 0.48645660585959094, + "grad_norm": 0.3760216806664558, + "learning_rate": 0.0009728363866077069, + "loss": 4.2086, + "step": 1540 + }, + { + "epoch": 0.48803601042407013, + "grad_norm": 0.3462163229846517, + "learning_rate": 0.0009759949463044852, + "loss": 4.2029, + "step": 1545 + }, + { + "epoch": 0.4896154149885493, + "grad_norm": 0.4678230022826408, + "learning_rate": 0.0009791535060012634, + "loss": 4.2381, + "step": 1550 + }, + { + "epoch": 0.4911948195530285, + "grad_norm": 0.281033090745962, + "learning_rate": 0.0009823120656980418, + "loss": 4.1577, + "step": 1555 + }, + { + "epoch": 0.4927742241175077, + "grad_norm": 0.25715655179867125, + "learning_rate": 0.00098547062539482, + "loss": 4.3205, + "step": 1560 + }, + { + "epoch": 0.4943536286819869, + "grad_norm": 0.2377083515339439, + "learning_rate": 0.0009886291850915983, + "loss": 4.1485, + "step": 1565 + }, + { + "epoch": 0.4959330332464661, + "grad_norm": 0.34877478211376156, + "learning_rate": 0.0009917877447883764, + "loss": 4.2703, + "step": 1570 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 0.3612121180047406, + "learning_rate": 0.0009949463044851548, + "loss": 4.2981, + "step": 1575 + }, + { + "epoch": 0.4990918423754245, + "grad_norm": 0.32745245414352575, + "learning_rate": 0.000998104864181933, + "loss": 4.188, + "step": 1580 + }, + { + "epoch": 0.5006712469399036, + "grad_norm": 0.36513716463433904, + "learning_rate": 0.0009999999513416054, + "loss": 4.2169, + "step": 1585 + }, + { + "epoch": 0.5022506515043829, + "grad_norm": 0.48696761580584264, + "learning_rate": 0.0009999994039347757, + "loss": 4.1734, + "step": 1590 + }, + { + "epoch": 0.503830056068862, + "grad_norm": 0.28706121610340524, + "learning_rate": 0.000999998248298791, + "loss": 4.2644, + "step": 1595 + }, + { + "epoch": 0.5054094606333412, + "grad_norm": 0.28463581068533467, + "learning_rate": 0.0009999964844350573, + "loss": 4.1033, + "step": 1600 + }, + { + "epoch": 0.5069888651978204, + "grad_norm": 0.281432611299984, + "learning_rate": 0.0009999941123457203, + "loss": 4.2137, + "step": 1605 + }, + { + "epoch": 0.5085682697622996, + "grad_norm": 0.2780914746609548, + "learning_rate": 0.0009999911320336655, + "loss": 4.1156, + "step": 1610 + }, + { + "epoch": 0.5101476743267788, + "grad_norm": 0.24775313426165027, + "learning_rate": 0.000999987543502518, + "loss": 4.1227, + "step": 1615 + }, + { + "epoch": 0.511727078891258, + "grad_norm": 0.36278436207383286, + "learning_rate": 0.0009999833467566437, + "loss": 4.178, + "step": 1620 + }, + { + "epoch": 0.5133064834557372, + "grad_norm": 0.1932778698252279, + "learning_rate": 0.0009999785418011472, + "loss": 4.1866, + "step": 1625 + }, + { + "epoch": 0.5148858880202164, + "grad_norm": 0.39153360957933375, + "learning_rate": 0.000999973128641874, + "loss": 4.2506, + "step": 1630 + }, + { + "epoch": 0.5164652925846955, + "grad_norm": 0.37554437506264593, + "learning_rate": 0.000999967107285409, + "loss": 4.117, + "step": 1635 + }, + { + "epoch": 0.5180446971491748, + "grad_norm": 0.28958701143166327, + "learning_rate": 0.0009999604777390762, + "loss": 4.2216, + "step": 1640 + }, + { + "epoch": 0.5196241017136539, + "grad_norm": 0.30198070915798164, + "learning_rate": 0.0009999532400109413, + "loss": 4.2161, + "step": 1645 + }, + { + "epoch": 0.5212035062781332, + "grad_norm": 0.3428198600005329, + "learning_rate": 0.0009999453941098076, + "loss": 4.0479, + "step": 1650 + }, + { + "epoch": 0.5227829108426123, + "grad_norm": 0.299713880431016, + "learning_rate": 0.0009999369400452201, + "loss": 4.1104, + "step": 1655 + }, + { + "epoch": 0.5243623154070916, + "grad_norm": 0.28532084858377904, + "learning_rate": 0.0009999278778274625, + "loss": 4.17, + "step": 1660 + }, + { + "epoch": 0.5259417199715707, + "grad_norm": 0.2984801615394102, + "learning_rate": 0.0009999182074675588, + "loss": 4.0912, + "step": 1665 + }, + { + "epoch": 0.5275211245360499, + "grad_norm": 0.25676781344144184, + "learning_rate": 0.0009999079289772722, + "loss": 4.0482, + "step": 1670 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 0.26759080611381303, + "learning_rate": 0.0009998970423691067, + "loss": 4.0194, + "step": 1675 + }, + { + "epoch": 0.5306799336650083, + "grad_norm": 0.2764566522384514, + "learning_rate": 0.0009998855476563051, + "loss": 4.1851, + "step": 1680 + }, + { + "epoch": 0.5322593382294875, + "grad_norm": 0.2904154144075542, + "learning_rate": 0.00099987344485285, + "loss": 4.0868, + "step": 1685 + }, + { + "epoch": 0.5338387427939667, + "grad_norm": 0.3681853600736386, + "learning_rate": 0.0009998607339734642, + "loss": 4.1386, + "step": 1690 + }, + { + "epoch": 0.5354181473584458, + "grad_norm": 0.4240926663878961, + "learning_rate": 0.0009998474150336102, + "loss": 4.0986, + "step": 1695 + }, + { + "epoch": 0.5369975519229251, + "grad_norm": 0.3089726259137687, + "learning_rate": 0.0009998334880494896, + "loss": 4.1597, + "step": 1700 + }, + { + "epoch": 0.5385769564874042, + "grad_norm": 0.30656590447426424, + "learning_rate": 0.000999818953038044, + "loss": 3.9891, + "step": 1705 + }, + { + "epoch": 0.5401563610518835, + "grad_norm": 0.32114946179568205, + "learning_rate": 0.0009998038100169553, + "loss": 4.1324, + "step": 1710 + }, + { + "epoch": 0.5417357656163626, + "grad_norm": 0.32282723078574843, + "learning_rate": 0.0009997880590046436, + "loss": 4.0553, + "step": 1715 + }, + { + "epoch": 0.5433151701808419, + "grad_norm": 0.21075665879048797, + "learning_rate": 0.0009997717000202696, + "loss": 4.0876, + "step": 1720 + }, + { + "epoch": 0.544894574745321, + "grad_norm": 0.2642191066111579, + "learning_rate": 0.0009997547330837335, + "loss": 3.9771, + "step": 1725 + }, + { + "epoch": 0.5464739793098002, + "grad_norm": 0.2241664738813061, + "learning_rate": 0.0009997371582156746, + "loss": 4.1472, + "step": 1730 + }, + { + "epoch": 0.5480533838742794, + "grad_norm": 0.21223821355830494, + "learning_rate": 0.0009997189754374725, + "loss": 4.1219, + "step": 1735 + }, + { + "epoch": 0.5496327884387586, + "grad_norm": 0.30437709086967074, + "learning_rate": 0.0009997001847712455, + "loss": 4.0623, + "step": 1740 + }, + { + "epoch": 0.5512121930032378, + "grad_norm": 0.2612037043109827, + "learning_rate": 0.0009996807862398516, + "loss": 4.1017, + "step": 1745 + }, + { + "epoch": 0.552791597567717, + "grad_norm": 0.21637469222681385, + "learning_rate": 0.0009996607798668886, + "loss": 4.0922, + "step": 1750 + }, + { + "epoch": 0.5543710021321961, + "grad_norm": 0.1865321522736776, + "learning_rate": 0.0009996401656766933, + "loss": 4.1306, + "step": 1755 + }, + { + "epoch": 0.5559504066966754, + "grad_norm": 0.27200198732458336, + "learning_rate": 0.000999618943694342, + "loss": 4.0081, + "step": 1760 + }, + { + "epoch": 0.5575298112611545, + "grad_norm": 0.3202280293025325, + "learning_rate": 0.0009995971139456503, + "loss": 4.0094, + "step": 1765 + }, + { + "epoch": 0.5591092158256338, + "grad_norm": 0.3869093315077745, + "learning_rate": 0.0009995746764571735, + "loss": 4.0755, + "step": 1770 + }, + { + "epoch": 0.5606886203901129, + "grad_norm": 0.38187337349101136, + "learning_rate": 0.0009995516312562057, + "loss": 4.1702, + "step": 1775 + }, + { + "epoch": 0.5622680249545922, + "grad_norm": 0.31455911111783563, + "learning_rate": 0.0009995279783707805, + "loss": 3.979, + "step": 1780 + }, + { + "epoch": 0.5638474295190713, + "grad_norm": 0.29574697329192573, + "learning_rate": 0.0009995037178296708, + "loss": 4.154, + "step": 1785 + }, + { + "epoch": 0.5654268340835505, + "grad_norm": 0.40103311872700226, + "learning_rate": 0.0009994788496623882, + "loss": 4.0433, + "step": 1790 + }, + { + "epoch": 0.5670062386480297, + "grad_norm": 0.3657271711780575, + "learning_rate": 0.0009994533738991844, + "loss": 4.0775, + "step": 1795 + }, + { + "epoch": 0.5685856432125089, + "grad_norm": 0.34299715107279133, + "learning_rate": 0.000999427290571049, + "loss": 4.1602, + "step": 1800 + }, + { + "epoch": 0.5701650477769881, + "grad_norm": 0.31471328128215614, + "learning_rate": 0.000999400599709712, + "loss": 4.1397, + "step": 1805 + }, + { + "epoch": 0.5717444523414673, + "grad_norm": 0.3249407191646648, + "learning_rate": 0.000999373301347641, + "loss": 4.0178, + "step": 1810 + }, + { + "epoch": 0.5733238569059464, + "grad_norm": 0.3627998575778931, + "learning_rate": 0.000999345395518044, + "loss": 4.073, + "step": 1815 + }, + { + "epoch": 0.5749032614704257, + "grad_norm": 0.3008083379342741, + "learning_rate": 0.0009993168822548671, + "loss": 4.0813, + "step": 1820 + }, + { + "epoch": 0.5764826660349048, + "grad_norm": 0.2922414382664129, + "learning_rate": 0.0009992877615927955, + "loss": 3.9915, + "step": 1825 + }, + { + "epoch": 0.5780620705993841, + "grad_norm": 0.281765440169215, + "learning_rate": 0.0009992580335672534, + "loss": 4.0479, + "step": 1830 + }, + { + "epoch": 0.5796414751638632, + "grad_norm": 0.2199768873265984, + "learning_rate": 0.0009992276982144035, + "loss": 3.9814, + "step": 1835 + }, + { + "epoch": 0.5812208797283425, + "grad_norm": 0.26746177527158255, + "learning_rate": 0.000999196755571148, + "loss": 4.0101, + "step": 1840 + }, + { + "epoch": 0.5828002842928216, + "grad_norm": 0.27475241843335857, + "learning_rate": 0.0009991652056751269, + "loss": 4.0526, + "step": 1845 + }, + { + "epoch": 0.5843796888573008, + "grad_norm": 0.20515852596205633, + "learning_rate": 0.0009991330485647194, + "loss": 4.0069, + "step": 1850 + }, + { + "epoch": 0.58595909342178, + "grad_norm": 0.23220844607256083, + "learning_rate": 0.0009991002842790438, + "loss": 4.1066, + "step": 1855 + }, + { + "epoch": 0.5875384979862592, + "grad_norm": 0.24900266206103544, + "learning_rate": 0.000999066912857956, + "loss": 4.0527, + "step": 1860 + }, + { + "epoch": 0.5891179025507384, + "grad_norm": 0.22772261845450373, + "learning_rate": 0.0009990329343420514, + "loss": 4.0552, + "step": 1865 + }, + { + "epoch": 0.5906973071152176, + "grad_norm": 0.34846084321759413, + "learning_rate": 0.0009989983487726632, + "loss": 4.0956, + "step": 1870 + }, + { + "epoch": 0.5922767116796968, + "grad_norm": 0.20306318495273795, + "learning_rate": 0.0009989631561918635, + "loss": 3.9788, + "step": 1875 + }, + { + "epoch": 0.5938561162441759, + "grad_norm": 0.3054746048484631, + "learning_rate": 0.0009989273566424629, + "loss": 4.0408, + "step": 1880 + }, + { + "epoch": 0.5954355208086551, + "grad_norm": 0.2689676990909977, + "learning_rate": 0.0009988909501680095, + "loss": 4.055, + "step": 1885 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.2879295565501649, + "learning_rate": 0.0009988539368127908, + "loss": 3.9719, + "step": 1890 + }, + { + "epoch": 0.5985943299376135, + "grad_norm": 0.25177642807914086, + "learning_rate": 0.000998816316621832, + "loss": 4.0066, + "step": 1895 + }, + { + "epoch": 0.6001737345020927, + "grad_norm": 0.32762881060791094, + "learning_rate": 0.0009987780896408963, + "loss": 3.8824, + "step": 1900 + }, + { + "epoch": 0.6017531390665719, + "grad_norm": 0.26143752591314573, + "learning_rate": 0.0009987392559164857, + "loss": 4.1152, + "step": 1905 + }, + { + "epoch": 0.603332543631051, + "grad_norm": 0.24110745226470215, + "learning_rate": 0.0009986998154958395, + "loss": 3.9775, + "step": 1910 + }, + { + "epoch": 0.6049119481955303, + "grad_norm": 0.24195042625692204, + "learning_rate": 0.0009986597684269354, + "loss": 3.9366, + "step": 1915 + }, + { + "epoch": 0.6064913527600094, + "grad_norm": 0.21330938341685474, + "learning_rate": 0.0009986191147584892, + "loss": 3.9408, + "step": 1920 + }, + { + "epoch": 0.6080707573244887, + "grad_norm": 0.23539885796681012, + "learning_rate": 0.0009985778545399545, + "loss": 4.035, + "step": 1925 + }, + { + "epoch": 0.6096501618889678, + "grad_norm": 0.2838008993926215, + "learning_rate": 0.0009985359878215223, + "loss": 4.0331, + "step": 1930 + }, + { + "epoch": 0.6112295664534471, + "grad_norm": 0.28796522858528995, + "learning_rate": 0.0009984935146541223, + "loss": 3.9946, + "step": 1935 + }, + { + "epoch": 0.6128089710179262, + "grad_norm": 0.25284741162200397, + "learning_rate": 0.0009984504350894212, + "loss": 4.1193, + "step": 1940 + }, + { + "epoch": 0.6143883755824054, + "grad_norm": 0.1883166471785072, + "learning_rate": 0.0009984067491798235, + "loss": 3.9263, + "step": 1945 + }, + { + "epoch": 0.6159677801468846, + "grad_norm": 0.31448053784092006, + "learning_rate": 0.0009983624569784714, + "loss": 3.8846, + "step": 1950 + }, + { + "epoch": 0.6175471847113638, + "grad_norm": 0.24984288586441813, + "learning_rate": 0.0009983175585392445, + "loss": 3.8951, + "step": 1955 + }, + { + "epoch": 0.619126589275843, + "grad_norm": 0.2834739736737687, + "learning_rate": 0.00099827205391676, + "loss": 3.8461, + "step": 1960 + }, + { + "epoch": 0.6207059938403222, + "grad_norm": 0.3185907483566412, + "learning_rate": 0.0009982259431663724, + "loss": 3.9755, + "step": 1965 + }, + { + "epoch": 0.6222853984048013, + "grad_norm": 0.29327583353771697, + "learning_rate": 0.0009981792263441737, + "loss": 3.9467, + "step": 1970 + }, + { + "epoch": 0.6238648029692806, + "grad_norm": 0.29352858594845627, + "learning_rate": 0.0009981319035069932, + "loss": 4.0014, + "step": 1975 + }, + { + "epoch": 0.6254442075337597, + "grad_norm": 0.25867595966057094, + "learning_rate": 0.0009980839747123966, + "loss": 3.9775, + "step": 1980 + }, + { + "epoch": 0.627023612098239, + "grad_norm": 0.25568674878218955, + "learning_rate": 0.000998035440018688, + "loss": 3.9609, + "step": 1985 + }, + { + "epoch": 0.6286030166627181, + "grad_norm": 0.3211767242169351, + "learning_rate": 0.0009979862994849073, + "loss": 3.9838, + "step": 1990 + }, + { + "epoch": 0.6301824212271974, + "grad_norm": 0.22478636571958996, + "learning_rate": 0.0009979365531708325, + "loss": 3.9142, + "step": 1995 + }, + { + "epoch": 0.6317618257916765, + "grad_norm": 0.2892618804449103, + "learning_rate": 0.0009978862011369779, + "loss": 3.8985, + "step": 2000 + }, + { + "epoch": 0.6333412303561557, + "grad_norm": 0.24435044534248943, + "learning_rate": 0.0009978352434445944, + "loss": 3.8645, + "step": 2005 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.21458490976951505, + "learning_rate": 0.0009977836801556704, + "loss": 3.8908, + "step": 2010 + }, + { + "epoch": 0.6365000394851141, + "grad_norm": 0.238296404339169, + "learning_rate": 0.0009977315113329304, + "loss": 3.8564, + "step": 2015 + }, + { + "epoch": 0.6380794440495933, + "grad_norm": 0.24198303496426915, + "learning_rate": 0.0009976787370398355, + "loss": 3.9388, + "step": 2020 + }, + { + "epoch": 0.6396588486140725, + "grad_norm": 0.22491929755865114, + "learning_rate": 0.0009976253573405838, + "loss": 3.9074, + "step": 2025 + }, + { + "epoch": 0.6412382531785517, + "grad_norm": 0.20157800604718384, + "learning_rate": 0.0009975713723001094, + "loss": 3.9252, + "step": 2030 + }, + { + "epoch": 0.6428176577430309, + "grad_norm": 0.24707058750275446, + "learning_rate": 0.0009975167819840827, + "loss": 3.8766, + "step": 2035 + }, + { + "epoch": 0.64439706230751, + "grad_norm": 0.1930165825425789, + "learning_rate": 0.0009974615864589112, + "loss": 3.9563, + "step": 2040 + }, + { + "epoch": 0.6459764668719893, + "grad_norm": 0.23592774025549407, + "learning_rate": 0.0009974057857917373, + "loss": 4.0158, + "step": 2045 + }, + { + "epoch": 0.6475558714364684, + "grad_norm": 0.2744523541395304, + "learning_rate": 0.000997349380050441, + "loss": 4.015, + "step": 2050 + }, + { + "epoch": 0.6491352760009477, + "grad_norm": 0.19965782390005965, + "learning_rate": 0.0009972923693036373, + "loss": 3.8356, + "step": 2055 + }, + { + "epoch": 0.6507146805654268, + "grad_norm": 0.2102237724441891, + "learning_rate": 0.0009972347536206772, + "loss": 4.0032, + "step": 2060 + }, + { + "epoch": 0.652294085129906, + "grad_norm": 0.32443690591437163, + "learning_rate": 0.0009971765330716482, + "loss": 3.8331, + "step": 2065 + }, + { + "epoch": 0.6538734896943852, + "grad_norm": 0.2680421026750314, + "learning_rate": 0.0009971177077273733, + "loss": 3.9182, + "step": 2070 + }, + { + "epoch": 0.6554528942588644, + "grad_norm": 0.2608811339656057, + "learning_rate": 0.000997058277659411, + "loss": 3.9257, + "step": 2075 + }, + { + "epoch": 0.6570322988233436, + "grad_norm": 0.25161834765330515, + "learning_rate": 0.0009969982429400555, + "loss": 3.8755, + "step": 2080 + }, + { + "epoch": 0.6586117033878228, + "grad_norm": 0.21108033568964235, + "learning_rate": 0.0009969376036423367, + "loss": 3.8266, + "step": 2085 + }, + { + "epoch": 0.660191107952302, + "grad_norm": 0.2708969117194078, + "learning_rate": 0.00099687635984002, + "loss": 4.005, + "step": 2090 + }, + { + "epoch": 0.6617705125167812, + "grad_norm": 0.234071036138296, + "learning_rate": 0.0009968145116076063, + "loss": 3.9937, + "step": 2095 + }, + { + "epoch": 0.6633499170812603, + "grad_norm": 0.35319179407248313, + "learning_rate": 0.0009967520590203306, + "loss": 3.8658, + "step": 2100 + }, + { + "epoch": 0.6649293216457396, + "grad_norm": 0.22517639994589708, + "learning_rate": 0.0009966890021541647, + "loss": 3.9244, + "step": 2105 + }, + { + "epoch": 0.6665087262102187, + "grad_norm": 0.24835059176588564, + "learning_rate": 0.0009966253410858144, + "loss": 3.9248, + "step": 2110 + }, + { + "epoch": 0.668088130774698, + "grad_norm": 0.22461996251898433, + "learning_rate": 0.000996561075892721, + "loss": 3.8047, + "step": 2115 + }, + { + "epoch": 0.6696675353391771, + "grad_norm": 0.3486531372669808, + "learning_rate": 0.0009964962066530605, + "loss": 4.0675, + "step": 2120 + }, + { + "epoch": 0.6712469399036564, + "grad_norm": 0.2901746826514593, + "learning_rate": 0.0009964307334457436, + "loss": 3.9818, + "step": 2125 + }, + { + "epoch": 0.6728263444681355, + "grad_norm": 0.2507760755352352, + "learning_rate": 0.0009963646563504159, + "loss": 3.9974, + "step": 2130 + }, + { + "epoch": 0.6744057490326147, + "grad_norm": 0.21192458654999655, + "learning_rate": 0.0009962979754474576, + "loss": 3.8887, + "step": 2135 + }, + { + "epoch": 0.6759851535970939, + "grad_norm": 0.19157148879425212, + "learning_rate": 0.0009962306908179832, + "loss": 3.8514, + "step": 2140 + }, + { + "epoch": 0.6775645581615731, + "grad_norm": 0.19679272132932518, + "learning_rate": 0.0009961628025438418, + "loss": 3.7598, + "step": 2145 + }, + { + "epoch": 0.6791439627260523, + "grad_norm": 0.26637100711605305, + "learning_rate": 0.0009960943107076169, + "loss": 3.9017, + "step": 2150 + }, + { + "epoch": 0.6807233672905315, + "grad_norm": 0.2847253716037483, + "learning_rate": 0.0009960252153926258, + "loss": 3.7984, + "step": 2155 + }, + { + "epoch": 0.6823027718550106, + "grad_norm": 0.22262377486617854, + "learning_rate": 0.0009959555166829204, + "loss": 3.8743, + "step": 2160 + }, + { + "epoch": 0.6838821764194899, + "grad_norm": 0.18462576402162742, + "learning_rate": 0.0009958852146632862, + "loss": 3.8773, + "step": 2165 + }, + { + "epoch": 0.685461580983969, + "grad_norm": 0.2823849265980843, + "learning_rate": 0.0009958143094192429, + "loss": 3.8371, + "step": 2170 + }, + { + "epoch": 0.6870409855484483, + "grad_norm": 0.19356104342333613, + "learning_rate": 0.000995742801037044, + "loss": 3.8393, + "step": 2175 + }, + { + "epoch": 0.6886203901129274, + "grad_norm": 0.24691680125149096, + "learning_rate": 0.000995670689603676, + "loss": 3.7618, + "step": 2180 + }, + { + "epoch": 0.6901997946774067, + "grad_norm": 0.22870840714473814, + "learning_rate": 0.0009955979752068603, + "loss": 3.8878, + "step": 2185 + }, + { + "epoch": 0.6917791992418858, + "grad_norm": 0.25006435731509863, + "learning_rate": 0.0009955246579350506, + "loss": 3.8762, + "step": 2190 + }, + { + "epoch": 0.693358603806365, + "grad_norm": 0.19895321523054352, + "learning_rate": 0.0009954507378774344, + "loss": 3.8146, + "step": 2195 + }, + { + "epoch": 0.6949380083708442, + "grad_norm": 0.26802335283542106, + "learning_rate": 0.0009953762151239326, + "loss": 3.8611, + "step": 2200 + }, + { + "epoch": 0.6965174129353234, + "grad_norm": 0.2875206879898967, + "learning_rate": 0.0009953010897651993, + "loss": 3.8369, + "step": 2205 + }, + { + "epoch": 0.6980968174998026, + "grad_norm": 0.2856661439521279, + "learning_rate": 0.000995225361892621, + "loss": 4.0, + "step": 2210 + }, + { + "epoch": 0.6996762220642818, + "grad_norm": 0.30979961397328754, + "learning_rate": 0.000995149031598318, + "loss": 3.84, + "step": 2215 + }, + { + "epoch": 0.701255626628761, + "grad_norm": 0.30130512379471247, + "learning_rate": 0.000995072098975143, + "loss": 3.8299, + "step": 2220 + }, + { + "epoch": 0.7028350311932402, + "grad_norm": 0.2221796910671295, + "learning_rate": 0.0009949945641166812, + "loss": 3.8664, + "step": 2225 + }, + { + "epoch": 0.7044144357577193, + "grad_norm": 0.23762852053695657, + "learning_rate": 0.000994916427117251, + "loss": 3.7376, + "step": 2230 + }, + { + "epoch": 0.7059938403221986, + "grad_norm": 0.18218641939581295, + "learning_rate": 0.0009948376880719028, + "loss": 3.7075, + "step": 2235 + }, + { + "epoch": 0.7075732448866777, + "grad_norm": 0.1870445334301083, + "learning_rate": 0.0009947583470764191, + "loss": 3.7572, + "step": 2240 + }, + { + "epoch": 0.709152649451157, + "grad_norm": 0.1834689185397494, + "learning_rate": 0.0009946784042273154, + "loss": 3.7828, + "step": 2245 + }, + { + "epoch": 0.7107320540156361, + "grad_norm": 0.185023861033481, + "learning_rate": 0.000994597859621839, + "loss": 3.8348, + "step": 2250 + }, + { + "epoch": 0.7123114585801152, + "grad_norm": 0.19030090837858005, + "learning_rate": 0.000994516713357969, + "loss": 3.7744, + "step": 2255 + }, + { + "epoch": 0.7138908631445945, + "grad_norm": 0.21590872376404172, + "learning_rate": 0.0009944349655344167, + "loss": 3.8309, + "step": 2260 + }, + { + "epoch": 0.7154702677090736, + "grad_norm": 0.28478239021445106, + "learning_rate": 0.000994352616250625, + "loss": 3.8515, + "step": 2265 + }, + { + "epoch": 0.7170496722735529, + "grad_norm": 0.21089138394462176, + "learning_rate": 0.0009942696656067682, + "loss": 3.8179, + "step": 2270 + }, + { + "epoch": 0.718629076838032, + "grad_norm": 0.31075061877113347, + "learning_rate": 0.000994186113703753, + "loss": 3.8159, + "step": 2275 + }, + { + "epoch": 0.7202084814025113, + "grad_norm": 0.28659229497528427, + "learning_rate": 0.0009941019606432163, + "loss": 3.8684, + "step": 2280 + }, + { + "epoch": 0.7217878859669904, + "grad_norm": 0.2412094613864912, + "learning_rate": 0.0009940172065275273, + "loss": 3.8988, + "step": 2285 + }, + { + "epoch": 0.7233672905314696, + "grad_norm": 0.21405775059275908, + "learning_rate": 0.000993931851459786, + "loss": 3.7733, + "step": 2290 + }, + { + "epoch": 0.7249466950959488, + "grad_norm": 0.12139107360781308, + "learning_rate": 0.000993845895543823, + "loss": 3.648, + "step": 2295 + }, + { + "epoch": 0.726526099660428, + "grad_norm": 0.23868396704616063, + "learning_rate": 0.0009937593388842007, + "loss": 3.7886, + "step": 2300 + }, + { + "epoch": 0.7281055042249072, + "grad_norm": 0.19195542843046692, + "learning_rate": 0.0009936721815862117, + "loss": 3.7889, + "step": 2305 + }, + { + "epoch": 0.7296849087893864, + "grad_norm": 0.2257860153230892, + "learning_rate": 0.0009935844237558792, + "loss": 3.7564, + "step": 2310 + }, + { + "epoch": 0.7312643133538655, + "grad_norm": 0.17831367547050161, + "learning_rate": 0.000993496065499957, + "loss": 3.8003, + "step": 2315 + }, + { + "epoch": 0.7328437179183448, + "grad_norm": 0.2590453195673119, + "learning_rate": 0.0009934071069259295, + "loss": 3.8104, + "step": 2320 + }, + { + "epoch": 0.7344231224828239, + "grad_norm": 0.2169619591401621, + "learning_rate": 0.0009933175481420112, + "loss": 3.8378, + "step": 2325 + }, + { + "epoch": 0.7360025270473032, + "grad_norm": 0.27623130925256867, + "learning_rate": 0.0009932273892571467, + "loss": 3.7137, + "step": 2330 + }, + { + "epoch": 0.7375819316117823, + "grad_norm": 0.2294100407856401, + "learning_rate": 0.0009931366303810108, + "loss": 3.8197, + "step": 2335 + }, + { + "epoch": 0.7391613361762616, + "grad_norm": 0.23406768014525023, + "learning_rate": 0.0009930452716240077, + "loss": 3.8387, + "step": 2340 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.20537121017839016, + "learning_rate": 0.000992953313097272, + "loss": 3.8095, + "step": 2345 + }, + { + "epoch": 0.74232014530522, + "grad_norm": 0.22643879626344338, + "learning_rate": 0.0009928607549126677, + "loss": 3.7317, + "step": 2350 + }, + { + "epoch": 0.7438995498696991, + "grad_norm": 0.2257293267348555, + "learning_rate": 0.0009927675971827875, + "loss": 3.9312, + "step": 2355 + }, + { + "epoch": 0.7454789544341783, + "grad_norm": 0.23366862269634528, + "learning_rate": 0.0009926738400209546, + "loss": 3.76, + "step": 2360 + }, + { + "epoch": 0.7470583589986575, + "grad_norm": 0.20353001859992098, + "learning_rate": 0.0009925794835412205, + "loss": 3.6586, + "step": 2365 + }, + { + "epoch": 0.7486377635631367, + "grad_norm": 0.18424469126412626, + "learning_rate": 0.000992484527858366, + "loss": 3.9304, + "step": 2370 + }, + { + "epoch": 0.7502171681276159, + "grad_norm": 0.24559128183468742, + "learning_rate": 0.0009923889730879011, + "loss": 3.7245, + "step": 2375 + }, + { + "epoch": 0.7517965726920951, + "grad_norm": 0.3062722293190668, + "learning_rate": 0.0009922928193460644, + "loss": 3.8891, + "step": 2380 + }, + { + "epoch": 0.7533759772565742, + "grad_norm": 0.27045048766747526, + "learning_rate": 0.0009921960667498226, + "loss": 3.8022, + "step": 2385 + }, + { + "epoch": 0.7549553818210535, + "grad_norm": 0.2562345403894404, + "learning_rate": 0.0009920987154168719, + "loss": 3.7404, + "step": 2390 + }, + { + "epoch": 0.7565347863855326, + "grad_norm": 0.21991589014582286, + "learning_rate": 0.0009920007654656358, + "loss": 3.755, + "step": 2395 + }, + { + "epoch": 0.7581141909500119, + "grad_norm": 0.2304615828900788, + "learning_rate": 0.0009919022170152667, + "loss": 3.8066, + "step": 2400 + }, + { + "epoch": 0.759693595514491, + "grad_norm": 0.19738272793982797, + "learning_rate": 0.000991803070185645, + "loss": 3.6949, + "step": 2405 + }, + { + "epoch": 0.7612730000789703, + "grad_norm": 0.2030280499023611, + "learning_rate": 0.0009917033250973785, + "loss": 3.7832, + "step": 2410 + }, + { + "epoch": 0.7628524046434494, + "grad_norm": 0.22494496151508525, + "learning_rate": 0.0009916029818718033, + "loss": 3.7324, + "step": 2415 + }, + { + "epoch": 0.7644318092079286, + "grad_norm": 0.22932816027271852, + "learning_rate": 0.0009915020406309827, + "loss": 3.7687, + "step": 2420 + }, + { + "epoch": 0.7660112137724078, + "grad_norm": 0.27604691578739265, + "learning_rate": 0.000991400501497708, + "loss": 3.7379, + "step": 2425 + }, + { + "epoch": 0.767590618336887, + "grad_norm": 0.24657062639557809, + "learning_rate": 0.000991298364595497, + "loss": 3.6804, + "step": 2430 + }, + { + "epoch": 0.7691700229013662, + "grad_norm": 0.17412852873504417, + "learning_rate": 0.0009911956300485956, + "loss": 3.7576, + "step": 2435 + }, + { + "epoch": 0.7707494274658454, + "grad_norm": 0.19379318906955947, + "learning_rate": 0.0009910922979819762, + "loss": 3.8152, + "step": 2440 + }, + { + "epoch": 0.7723288320303245, + "grad_norm": 0.21644414786245522, + "learning_rate": 0.0009909883685213375, + "loss": 3.7226, + "step": 2445 + }, + { + "epoch": 0.7739082365948038, + "grad_norm": 0.2787930687738909, + "learning_rate": 0.000990883841793106, + "loss": 3.7699, + "step": 2450 + }, + { + "epoch": 0.7754876411592829, + "grad_norm": 0.24870103981372055, + "learning_rate": 0.0009907787179244344, + "loss": 3.749, + "step": 2455 + }, + { + "epoch": 0.7770670457237622, + "grad_norm": 0.19321694165856332, + "learning_rate": 0.0009906729970432014, + "loss": 3.7828, + "step": 2460 + }, + { + "epoch": 0.7786464502882413, + "grad_norm": 0.18949642569362274, + "learning_rate": 0.0009905666792780121, + "loss": 3.7458, + "step": 2465 + }, + { + "epoch": 0.7802258548527206, + "grad_norm": 0.2668235859016389, + "learning_rate": 0.0009904597647581981, + "loss": 3.6928, + "step": 2470 + }, + { + "epoch": 0.7818052594171997, + "grad_norm": 0.26892923882002495, + "learning_rate": 0.0009903522536138165, + "loss": 3.7568, + "step": 2475 + }, + { + "epoch": 0.7833846639816789, + "grad_norm": 0.23583860928869732, + "learning_rate": 0.00099024414597565, + "loss": 3.7143, + "step": 2480 + }, + { + "epoch": 0.7849640685461581, + "grad_norm": 0.24434878517883898, + "learning_rate": 0.0009901354419752076, + "loss": 3.7339, + "step": 2485 + }, + { + "epoch": 0.7865434731106373, + "grad_norm": 0.27267916282442095, + "learning_rate": 0.000990026141744723, + "loss": 3.8102, + "step": 2490 + }, + { + "epoch": 0.7881228776751165, + "grad_norm": 0.30307916098945237, + "learning_rate": 0.0009899162454171558, + "loss": 3.736, + "step": 2495 + }, + { + "epoch": 0.7897022822395957, + "grad_norm": 0.22514273495342624, + "learning_rate": 0.0009898057531261902, + "loss": 3.74, + "step": 2500 + }, + { + "epoch": 0.7912816868040748, + "grad_norm": 0.21910656895816347, + "learning_rate": 0.000989694665006236, + "loss": 3.7559, + "step": 2505 + }, + { + "epoch": 0.7928610913685541, + "grad_norm": 0.21413778361238997, + "learning_rate": 0.0009895829811924271, + "loss": 3.7509, + "step": 2510 + }, + { + "epoch": 0.7944404959330332, + "grad_norm": 0.2100958485249077, + "learning_rate": 0.0009894707018206223, + "loss": 3.6985, + "step": 2515 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 0.22357525768208342, + "learning_rate": 0.0009893578270274053, + "loss": 3.7324, + "step": 2520 + }, + { + "epoch": 0.7975993050619916, + "grad_norm": 0.1964921217332807, + "learning_rate": 0.0009892443569500834, + "loss": 3.7291, + "step": 2525 + }, + { + "epoch": 0.7991787096264709, + "grad_norm": 0.20271019779435234, + "learning_rate": 0.0009891302917266888, + "loss": 3.8733, + "step": 2530 + }, + { + "epoch": 0.80075811419095, + "grad_norm": 0.23478169827263035, + "learning_rate": 0.0009890156314959768, + "loss": 3.8878, + "step": 2535 + }, + { + "epoch": 0.8023375187554292, + "grad_norm": 0.2313386457109853, + "learning_rate": 0.0009889003763974271, + "loss": 3.6695, + "step": 2540 + }, + { + "epoch": 0.8039169233199084, + "grad_norm": 0.2618485113502287, + "learning_rate": 0.000988784526571243, + "loss": 3.7112, + "step": 2545 + }, + { + "epoch": 0.8054963278843876, + "grad_norm": 0.2634351727376466, + "learning_rate": 0.0009886680821583511, + "loss": 3.7644, + "step": 2550 + }, + { + "epoch": 0.8070757324488668, + "grad_norm": 0.2595276380762902, + "learning_rate": 0.0009885510433004013, + "loss": 3.7772, + "step": 2555 + }, + { + "epoch": 0.808655137013346, + "grad_norm": 0.2587043663729113, + "learning_rate": 0.0009884334101397667, + "loss": 3.7313, + "step": 2560 + }, + { + "epoch": 0.8102345415778252, + "grad_norm": 0.22133887180757533, + "learning_rate": 0.0009883151828195432, + "loss": 3.6144, + "step": 2565 + }, + { + "epoch": 0.8118139461423044, + "grad_norm": 0.1816423792220125, + "learning_rate": 0.00098819636148355, + "loss": 3.7003, + "step": 2570 + }, + { + "epoch": 0.8133933507067835, + "grad_norm": 0.20401144650693834, + "learning_rate": 0.0009880769462763278, + "loss": 3.7263, + "step": 2575 + }, + { + "epoch": 0.8149727552712628, + "grad_norm": 0.16695314570111003, + "learning_rate": 0.0009879569373431407, + "loss": 3.759, + "step": 2580 + }, + { + "epoch": 0.8165521598357419, + "grad_norm": 0.20536838024238005, + "learning_rate": 0.000987836334829975, + "loss": 3.6269, + "step": 2585 + }, + { + "epoch": 0.8181315644002212, + "grad_norm": 0.1991843009792855, + "learning_rate": 0.0009877151388835385, + "loss": 3.9088, + "step": 2590 + }, + { + "epoch": 0.8197109689647003, + "grad_norm": 0.20686166646618734, + "learning_rate": 0.0009875933496512612, + "loss": 3.7823, + "step": 2595 + }, + { + "epoch": 0.8212903735291796, + "grad_norm": 0.2114497635258743, + "learning_rate": 0.0009874709672812948, + "loss": 3.734, + "step": 2600 + }, + { + "epoch": 0.8228697780936587, + "grad_norm": 0.21392111329380673, + "learning_rate": 0.0009873479919225128, + "loss": 3.6851, + "step": 2605 + }, + { + "epoch": 0.8244491826581379, + "grad_norm": 0.16240566726163633, + "learning_rate": 0.0009872244237245096, + "loss": 3.7172, + "step": 2610 + }, + { + "epoch": 0.8260285872226171, + "grad_norm": 0.16766924838905117, + "learning_rate": 0.000987100262837601, + "loss": 3.6609, + "step": 2615 + }, + { + "epoch": 0.8276079917870963, + "grad_norm": 0.25885756039168034, + "learning_rate": 0.0009869755094128233, + "loss": 3.6892, + "step": 2620 + }, + { + "epoch": 0.8291873963515755, + "grad_norm": 0.20299536884768418, + "learning_rate": 0.0009868501636019346, + "loss": 3.7749, + "step": 2625 + }, + { + "epoch": 0.8307668009160546, + "grad_norm": 0.21278111733231522, + "learning_rate": 0.0009867242255574126, + "loss": 3.6753, + "step": 2630 + }, + { + "epoch": 0.8323462054805338, + "grad_norm": 0.23192355698362774, + "learning_rate": 0.0009865976954324563, + "loss": 3.689, + "step": 2635 + }, + { + "epoch": 0.833925610045013, + "grad_norm": 0.26848443384418835, + "learning_rate": 0.0009864705733809843, + "loss": 3.6186, + "step": 2640 + }, + { + "epoch": 0.8355050146094922, + "grad_norm": 0.1906325871851695, + "learning_rate": 0.0009863428595576352, + "loss": 3.6238, + "step": 2645 + }, + { + "epoch": 0.8370844191739714, + "grad_norm": 0.1641940539362934, + "learning_rate": 0.000986214554117768, + "loss": 3.7163, + "step": 2650 + }, + { + "epoch": 0.8386638237384506, + "grad_norm": 0.18932792499119566, + "learning_rate": 0.000986085657217461, + "loss": 3.75, + "step": 2655 + }, + { + "epoch": 0.8402432283029297, + "grad_norm": 0.19028118211453493, + "learning_rate": 0.0009859561690135125, + "loss": 3.5875, + "step": 2660 + }, + { + "epoch": 0.841822632867409, + "grad_norm": 0.20793049944875938, + "learning_rate": 0.000985826089663439, + "loss": 3.6361, + "step": 2665 + }, + { + "epoch": 0.8434020374318881, + "grad_norm": 0.24167945997106643, + "learning_rate": 0.0009856954193254773, + "loss": 3.7011, + "step": 2670 + }, + { + "epoch": 0.8449814419963674, + "grad_norm": 0.19134822410708058, + "learning_rate": 0.0009855641581585823, + "loss": 3.6072, + "step": 2675 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.30594212306849533, + "learning_rate": 0.0009854323063224282, + "loss": 3.7363, + "step": 2680 + }, + { + "epoch": 0.8481402511253258, + "grad_norm": 0.3554004161494336, + "learning_rate": 0.0009852998639774072, + "loss": 3.7074, + "step": 2685 + }, + { + "epoch": 0.8497196556898049, + "grad_norm": 0.2158252194984231, + "learning_rate": 0.0009851668312846303, + "loss": 3.719, + "step": 2690 + }, + { + "epoch": 0.8512990602542841, + "grad_norm": 0.2094551560100172, + "learning_rate": 0.0009850332084059262, + "loss": 3.6551, + "step": 2695 + }, + { + "epoch": 0.8528784648187633, + "grad_norm": 0.18976941526589675, + "learning_rate": 0.000984898995503842, + "loss": 3.659, + "step": 2700 + }, + { + "epoch": 0.8544578693832425, + "grad_norm": 0.19847958962221077, + "learning_rate": 0.0009847641927416423, + "loss": 3.6902, + "step": 2705 + }, + { + "epoch": 0.8560372739477217, + "grad_norm": 0.2354635589066857, + "learning_rate": 0.0009846288002833088, + "loss": 3.6636, + "step": 2710 + }, + { + "epoch": 0.8576166785122009, + "grad_norm": 0.20562961378170208, + "learning_rate": 0.0009844928182935414, + "loss": 3.7092, + "step": 2715 + }, + { + "epoch": 0.85919608307668, + "grad_norm": 0.24859698816861014, + "learning_rate": 0.0009843562469377567, + "loss": 3.713, + "step": 2720 + }, + { + "epoch": 0.8607754876411593, + "grad_norm": 0.2399986307557472, + "learning_rate": 0.000984219086382088, + "loss": 3.756, + "step": 2725 + }, + { + "epoch": 0.8623548922056384, + "grad_norm": 0.27013518984869855, + "learning_rate": 0.0009840813367933859, + "loss": 3.7906, + "step": 2730 + }, + { + "epoch": 0.8639342967701177, + "grad_norm": 0.23047495735441087, + "learning_rate": 0.000983942998339217, + "loss": 3.7627, + "step": 2735 + }, + { + "epoch": 0.8655137013345968, + "grad_norm": 0.20430948001029364, + "learning_rate": 0.0009838040711878646, + "loss": 3.603, + "step": 2740 + }, + { + "epoch": 0.8670931058990761, + "grad_norm": 0.18081743933907277, + "learning_rate": 0.0009836645555083281, + "loss": 3.5728, + "step": 2745 + }, + { + "epoch": 0.8686725104635552, + "grad_norm": 0.22174597927758466, + "learning_rate": 0.0009835244514703222, + "loss": 3.6436, + "step": 2750 + }, + { + "epoch": 0.8702519150280345, + "grad_norm": 0.1953983191119668, + "learning_rate": 0.0009833837592442786, + "loss": 3.6422, + "step": 2755 + }, + { + "epoch": 0.8718313195925136, + "grad_norm": 0.1829661116296364, + "learning_rate": 0.000983242479001343, + "loss": 3.6262, + "step": 2760 + }, + { + "epoch": 0.8734107241569928, + "grad_norm": 0.22235033352924125, + "learning_rate": 0.0009831006109133776, + "loss": 3.7124, + "step": 2765 + }, + { + "epoch": 0.874990128721472, + "grad_norm": 0.266676779913622, + "learning_rate": 0.000982958155152959, + "loss": 3.6524, + "step": 2770 + }, + { + "epoch": 0.8765695332859512, + "grad_norm": 0.20839375394643436, + "learning_rate": 0.000982815111893379, + "loss": 3.6547, + "step": 2775 + }, + { + "epoch": 0.8781489378504304, + "grad_norm": 0.21055619207323192, + "learning_rate": 0.0009826714813086438, + "loss": 3.7226, + "step": 2780 + }, + { + "epoch": 0.8797283424149096, + "grad_norm": 0.22345474054173048, + "learning_rate": 0.0009825272635734746, + "loss": 3.6297, + "step": 2785 + }, + { + "epoch": 0.8813077469793887, + "grad_norm": 0.20804544740257655, + "learning_rate": 0.0009823824588633058, + "loss": 3.648, + "step": 2790 + }, + { + "epoch": 0.882887151543868, + "grad_norm": 0.18295164550994766, + "learning_rate": 0.0009822370673542872, + "loss": 3.6555, + "step": 2795 + }, + { + "epoch": 0.8844665561083471, + "grad_norm": 0.19425225687908296, + "learning_rate": 0.0009820910892232816, + "loss": 3.634, + "step": 2800 + }, + { + "epoch": 0.8860459606728264, + "grad_norm": 0.19191672702600987, + "learning_rate": 0.0009819445246478653, + "loss": 3.6317, + "step": 2805 + }, + { + "epoch": 0.8876253652373055, + "grad_norm": 0.21038674810948635, + "learning_rate": 0.000981797373806328, + "loss": 3.6634, + "step": 2810 + }, + { + "epoch": 0.8892047698017848, + "grad_norm": 0.2197613989072648, + "learning_rate": 0.0009816496368776734, + "loss": 3.5895, + "step": 2815 + }, + { + "epoch": 0.8907841743662639, + "grad_norm": 0.21178720188607164, + "learning_rate": 0.000981501314041617, + "loss": 3.6104, + "step": 2820 + }, + { + "epoch": 0.8923635789307431, + "grad_norm": 0.17082253963144226, + "learning_rate": 0.0009813524054785878, + "loss": 3.7365, + "step": 2825 + }, + { + "epoch": 0.8939429834952223, + "grad_norm": 0.1725318619775281, + "learning_rate": 0.0009812029113697271, + "loss": 3.6157, + "step": 2830 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.2352061797282261, + "learning_rate": 0.0009810528318968882, + "loss": 3.6178, + "step": 2835 + }, + { + "epoch": 0.8971017926241807, + "grad_norm": 0.21716211764222246, + "learning_rate": 0.0009809021672426371, + "loss": 3.6164, + "step": 2840 + }, + { + "epoch": 0.8986811971886599, + "grad_norm": 0.1760557168883546, + "learning_rate": 0.000980750917590251, + "loss": 3.6029, + "step": 2845 + }, + { + "epoch": 0.900260601753139, + "grad_norm": 0.14698275622175808, + "learning_rate": 0.0009805990831237192, + "loss": 3.6432, + "step": 2850 + }, + { + "epoch": 0.9018400063176183, + "grad_norm": 0.16077926077590798, + "learning_rate": 0.0009804466640277421, + "loss": 3.5867, + "step": 2855 + }, + { + "epoch": 0.9034194108820974, + "grad_norm": 0.16707694137928564, + "learning_rate": 0.0009802936604877317, + "loss": 3.5945, + "step": 2860 + }, + { + "epoch": 0.9049988154465767, + "grad_norm": 0.21633216870374575, + "learning_rate": 0.0009801400726898101, + "loss": 3.6673, + "step": 2865 + }, + { + "epoch": 0.9065782200110558, + "grad_norm": 0.1662224581417447, + "learning_rate": 0.000979985900820811, + "loss": 3.5377, + "step": 2870 + }, + { + "epoch": 0.9081576245755351, + "grad_norm": 0.17310136270402474, + "learning_rate": 0.0009798311450682784, + "loss": 3.6498, + "step": 2875 + }, + { + "epoch": 0.9097370291400142, + "grad_norm": 0.22943147677681835, + "learning_rate": 0.0009796758056204661, + "loss": 3.5972, + "step": 2880 + }, + { + "epoch": 0.9113164337044934, + "grad_norm": 0.2643736873561172, + "learning_rate": 0.0009795198826663388, + "loss": 3.6368, + "step": 2885 + }, + { + "epoch": 0.9128958382689726, + "grad_norm": 0.25044361405077453, + "learning_rate": 0.00097936337639557, + "loss": 3.5429, + "step": 2890 + }, + { + "epoch": 0.9144752428334518, + "grad_norm": 0.23839389612516823, + "learning_rate": 0.0009792062869985435, + "loss": 3.5907, + "step": 2895 + }, + { + "epoch": 0.916054647397931, + "grad_norm": 0.2363741993442048, + "learning_rate": 0.000979048614666352, + "loss": 3.6616, + "step": 2900 + }, + { + "epoch": 0.9176340519624102, + "grad_norm": 0.2780080804073825, + "learning_rate": 0.000978890359590798, + "loss": 3.8078, + "step": 2905 + }, + { + "epoch": 0.9192134565268893, + "grad_norm": 0.15362194641567975, + "learning_rate": 0.000978731521964392, + "loss": 3.5856, + "step": 2910 + }, + { + "epoch": 0.9207928610913686, + "grad_norm": 0.3174075119221956, + "learning_rate": 0.0009785721019803539, + "loss": 3.6724, + "step": 2915 + }, + { + "epoch": 0.9223722656558477, + "grad_norm": 0.18801093915486303, + "learning_rate": 0.0009784120998326113, + "loss": 3.6253, + "step": 2920 + }, + { + "epoch": 0.923951670220327, + "grad_norm": 0.2572007567570295, + "learning_rate": 0.0009782515157158009, + "loss": 3.6182, + "step": 2925 + }, + { + "epoch": 0.9255310747848061, + "grad_norm": 0.2199486414718636, + "learning_rate": 0.0009780903498252664, + "loss": 3.5813, + "step": 2930 + }, + { + "epoch": 0.9271104793492854, + "grad_norm": 0.1805834748540574, + "learning_rate": 0.00097792860235706, + "loss": 3.5573, + "step": 2935 + }, + { + "epoch": 0.9286898839137645, + "grad_norm": 0.1807851236451506, + "learning_rate": 0.0009777662735079404, + "loss": 3.6422, + "step": 2940 + }, + { + "epoch": 0.9302692884782437, + "grad_norm": 0.19801039649431965, + "learning_rate": 0.0009776033634753746, + "loss": 3.6314, + "step": 2945 + }, + { + "epoch": 0.9318486930427229, + "grad_norm": 0.20233526907990002, + "learning_rate": 0.000977439872457536, + "loss": 3.6458, + "step": 2950 + }, + { + "epoch": 0.9334280976072021, + "grad_norm": 0.1861241478177403, + "learning_rate": 0.0009772758006533046, + "loss": 3.5384, + "step": 2955 + }, + { + "epoch": 0.9350075021716813, + "grad_norm": 0.1534334003544395, + "learning_rate": 0.0009771111482622676, + "loss": 3.61, + "step": 2960 + }, + { + "epoch": 0.9365869067361605, + "grad_norm": 0.19550599832198995, + "learning_rate": 0.0009769459154847177, + "loss": 3.6829, + "step": 2965 + }, + { + "epoch": 0.9381663113006397, + "grad_norm": 0.253267013653418, + "learning_rate": 0.000976780102521654, + "loss": 3.6187, + "step": 2970 + }, + { + "epoch": 0.9397457158651189, + "grad_norm": 0.1919114006187933, + "learning_rate": 0.0009766137095747812, + "loss": 3.5702, + "step": 2975 + }, + { + "epoch": 0.941325120429598, + "grad_norm": 0.17264392678432056, + "learning_rate": 0.0009764467368465098, + "loss": 3.7729, + "step": 2980 + }, + { + "epoch": 0.9429045249940773, + "grad_norm": 0.16643373381157617, + "learning_rate": 0.0009762791845399552, + "loss": 3.5039, + "step": 2985 + }, + { + "epoch": 0.9444839295585564, + "grad_norm": 0.18808067976092, + "learning_rate": 0.0009761110528589381, + "loss": 3.6013, + "step": 2990 + }, + { + "epoch": 0.9460633341230357, + "grad_norm": 0.25366070833950194, + "learning_rate": 0.000975942342007984, + "loss": 3.6747, + "step": 2995 + }, + { + "epoch": 0.9476427386875148, + "grad_norm": 0.23586423093199588, + "learning_rate": 0.000975773052192323, + "loss": 3.5451, + "step": 3000 + }, + { + "epoch": 0.9492221432519939, + "grad_norm": 0.25746326552700427, + "learning_rate": 0.0009756031836178891, + "loss": 3.7641, + "step": 3005 + }, + { + "epoch": 0.9508015478164732, + "grad_norm": 0.2565384006011303, + "learning_rate": 0.0009754327364913207, + "loss": 3.6605, + "step": 3010 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.20372628096265127, + "learning_rate": 0.0009752617110199598, + "loss": 3.6505, + "step": 3015 + }, + { + "epoch": 0.9539603569454316, + "grad_norm": 0.1948134588293486, + "learning_rate": 0.0009750901074118518, + "loss": 3.6699, + "step": 3020 + }, + { + "epoch": 0.9555397615099107, + "grad_norm": 0.180120677297602, + "learning_rate": 0.0009749179258757462, + "loss": 3.6799, + "step": 3025 + }, + { + "epoch": 0.95711916607439, + "grad_norm": 0.17287631701404943, + "learning_rate": 0.0009747451666210945, + "loss": 3.6146, + "step": 3030 + }, + { + "epoch": 0.9586985706388691, + "grad_norm": 0.19503517477380783, + "learning_rate": 0.0009745718298580512, + "loss": 3.6317, + "step": 3035 + }, + { + "epoch": 0.9602779752033483, + "grad_norm": 0.22902928630954272, + "learning_rate": 0.0009743979157974739, + "loss": 3.6152, + "step": 3040 + }, + { + "epoch": 0.9618573797678275, + "grad_norm": 0.17970255406094174, + "learning_rate": 0.0009742234246509217, + "loss": 3.6966, + "step": 3045 + }, + { + "epoch": 0.9634367843323067, + "grad_norm": 0.18801251804658714, + "learning_rate": 0.0009740483566306564, + "loss": 3.6747, + "step": 3050 + }, + { + "epoch": 0.9650161888967859, + "grad_norm": 0.16843563110513118, + "learning_rate": 0.0009738727119496409, + "loss": 3.5028, + "step": 3055 + }, + { + "epoch": 0.9665955934612651, + "grad_norm": 0.19447085681688397, + "learning_rate": 0.0009736964908215401, + "loss": 3.6577, + "step": 3060 + }, + { + "epoch": 0.9681749980257442, + "grad_norm": 0.22009121363697623, + "learning_rate": 0.0009735196934607198, + "loss": 3.5345, + "step": 3065 + }, + { + "epoch": 0.9697544025902235, + "grad_norm": 0.22448750337401296, + "learning_rate": 0.0009733423200822469, + "loss": 3.5515, + "step": 3070 + }, + { + "epoch": 0.9713338071547026, + "grad_norm": 0.20771129618077283, + "learning_rate": 0.0009731643709018891, + "loss": 3.5718, + "step": 3075 + }, + { + "epoch": 0.9729132117191819, + "grad_norm": 0.20373000120459916, + "learning_rate": 0.0009729858461361141, + "loss": 3.6764, + "step": 3080 + }, + { + "epoch": 0.974492616283661, + "grad_norm": 0.1797561361208036, + "learning_rate": 0.0009728067460020904, + "loss": 3.6827, + "step": 3085 + }, + { + "epoch": 0.9760720208481403, + "grad_norm": 0.2147655801584539, + "learning_rate": 0.0009726270707176858, + "loss": 3.5844, + "step": 3090 + }, + { + "epoch": 0.9776514254126194, + "grad_norm": 0.1960578897451964, + "learning_rate": 0.0009724468205014685, + "loss": 3.7833, + "step": 3095 + }, + { + "epoch": 0.9792308299770986, + "grad_norm": 0.15597674681769813, + "learning_rate": 0.0009722659955727054, + "loss": 3.656, + "step": 3100 + }, + { + "epoch": 0.9808102345415778, + "grad_norm": 0.21261634725346287, + "learning_rate": 0.0009720845961513627, + "loss": 3.55, + "step": 3105 + }, + { + "epoch": 0.982389639106057, + "grad_norm": 0.2435056831114221, + "learning_rate": 0.0009719026224581053, + "loss": 3.6242, + "step": 3110 + }, + { + "epoch": 0.9839690436705362, + "grad_norm": 0.16406935937973846, + "learning_rate": 0.0009717200747142974, + "loss": 3.5754, + "step": 3115 + }, + { + "epoch": 0.9855484482350154, + "grad_norm": 0.18968492314951244, + "learning_rate": 0.0009715369531420006, + "loss": 3.6399, + "step": 3120 + }, + { + "epoch": 0.9871278527994946, + "grad_norm": 0.16702625054301215, + "learning_rate": 0.000971353257963975, + "loss": 3.5255, + "step": 3125 + }, + { + "epoch": 0.9887072573639738, + "grad_norm": 0.2301624768016454, + "learning_rate": 0.0009711689894036784, + "loss": 3.5684, + "step": 3130 + }, + { + "epoch": 0.9902866619284529, + "grad_norm": 0.19561138426372737, + "learning_rate": 0.0009709841476852661, + "loss": 3.5207, + "step": 3135 + }, + { + "epoch": 0.9918660664929322, + "grad_norm": 0.2027504555908024, + "learning_rate": 0.0009707987330335906, + "loss": 3.5694, + "step": 3140 + }, + { + "epoch": 0.9934454710574113, + "grad_norm": 0.16179807583256492, + "learning_rate": 0.0009706127456742014, + "loss": 3.4524, + "step": 3145 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 0.18013411208473787, + "learning_rate": 0.0009704261858333445, + "loss": 3.5072, + "step": 3150 + }, + { + "epoch": 0.9966042801863697, + "grad_norm": 0.17933880588432946, + "learning_rate": 0.0009702390537379627, + "loss": 3.7013, + "step": 3155 + }, + { + "epoch": 0.998183684750849, + "grad_norm": 0.2131080756575249, + "learning_rate": 0.0009700513496156945, + "loss": 3.6046, + "step": 3160 + }, + { + "epoch": 0.9997630893153281, + "grad_norm": 0.29187684680182674, + "learning_rate": 0.0009698630736948744, + "loss": 3.6084, + "step": 3165 + }, + { + "epoch": 1.0, + "eval_loss": 3.5630176067352295, + "eval_runtime": 118.6591, + "eval_samples_per_second": 22.324, + "eval_steps_per_second": 5.587, + "step": 3166 + }, + { + "epoch": 1.0012635236515834, + "grad_norm": 0.19522578739733992, + "learning_rate": 0.0009696742262045323, + "loss": 3.5774, + "step": 3170 + }, + { + "epoch": 1.0028429282160625, + "grad_norm": 0.19544019298572032, + "learning_rate": 0.0009694848073743941, + "loss": 3.5418, + "step": 3175 + }, + { + "epoch": 1.0044223327805417, + "grad_norm": 0.18717051155662734, + "learning_rate": 0.0009692948174348797, + "loss": 3.4698, + "step": 3180 + }, + { + "epoch": 1.006001737345021, + "grad_norm": 0.26296600436862455, + "learning_rate": 0.0009691042566171044, + "loss": 3.4994, + "step": 3185 + }, + { + "epoch": 1.0075811419095002, + "grad_norm": 0.1674558011162135, + "learning_rate": 0.0009689131251528778, + "loss": 3.5352, + "step": 3190 + }, + { + "epoch": 1.0091605464739792, + "grad_norm": 0.19024232705973332, + "learning_rate": 0.0009687214232747035, + "loss": 3.4942, + "step": 3195 + }, + { + "epoch": 1.0107399510384585, + "grad_norm": 0.1847075755737659, + "learning_rate": 0.0009685291512157792, + "loss": 3.5486, + "step": 3200 + }, + { + "epoch": 1.0123193556029377, + "grad_norm": 0.1769802703112888, + "learning_rate": 0.0009683363092099961, + "loss": 3.5394, + "step": 3205 + }, + { + "epoch": 1.013898760167417, + "grad_norm": 0.1405972821619923, + "learning_rate": 0.0009681428974919388, + "loss": 3.5536, + "step": 3210 + }, + { + "epoch": 1.015478164731896, + "grad_norm": 0.18248409434912527, + "learning_rate": 0.0009679489162968849, + "loss": 3.641, + "step": 3215 + }, + { + "epoch": 1.0170575692963753, + "grad_norm": 0.3581630591943084, + "learning_rate": 0.0009677543658608046, + "loss": 3.595, + "step": 3220 + }, + { + "epoch": 1.0186369738608545, + "grad_norm": 0.29219008650027906, + "learning_rate": 0.0009675592464203609, + "loss": 3.5952, + "step": 3225 + }, + { + "epoch": 1.0202163784253337, + "grad_norm": 0.29655977440988046, + "learning_rate": 0.0009673635582129084, + "loss": 3.6436, + "step": 3230 + }, + { + "epoch": 1.0217957829898128, + "grad_norm": 0.1914125404242104, + "learning_rate": 0.0009671673014764942, + "loss": 3.5391, + "step": 3235 + }, + { + "epoch": 1.023375187554292, + "grad_norm": 0.21280629483900723, + "learning_rate": 0.0009669704764498564, + "loss": 3.6341, + "step": 3240 + }, + { + "epoch": 1.0249545921187713, + "grad_norm": 0.19389573313403266, + "learning_rate": 0.000966773083372425, + "loss": 3.5238, + "step": 3245 + }, + { + "epoch": 1.0265339966832505, + "grad_norm": 0.18443862860215415, + "learning_rate": 0.0009665751224843209, + "loss": 3.6114, + "step": 3250 + }, + { + "epoch": 1.0281134012477295, + "grad_norm": 0.24078883923384495, + "learning_rate": 0.0009663765940263554, + "loss": 3.5654, + "step": 3255 + }, + { + "epoch": 1.0296928058122088, + "grad_norm": 0.16865601842164352, + "learning_rate": 0.0009661774982400301, + "loss": 3.5134, + "step": 3260 + }, + { + "epoch": 1.031272210376688, + "grad_norm": 0.18953742112811325, + "learning_rate": 0.0009659778353675372, + "loss": 3.5744, + "step": 3265 + }, + { + "epoch": 1.0328516149411673, + "grad_norm": 0.19219203532463633, + "learning_rate": 0.0009657776056517589, + "loss": 3.6115, + "step": 3270 + }, + { + "epoch": 1.0344310195056463, + "grad_norm": 0.12784162972971713, + "learning_rate": 0.000965576809336266, + "loss": 3.4902, + "step": 3275 + }, + { + "epoch": 1.0360104240701256, + "grad_norm": 0.17808282679415993, + "learning_rate": 0.0009653754466653195, + "loss": 3.5061, + "step": 3280 + }, + { + "epoch": 1.0375898286346048, + "grad_norm": 0.12403067381712111, + "learning_rate": 0.000965173517883869, + "loss": 3.5288, + "step": 3285 + }, + { + "epoch": 1.039169233199084, + "grad_norm": 0.19963383337449706, + "learning_rate": 0.0009649710232375525, + "loss": 3.5505, + "step": 3290 + }, + { + "epoch": 1.040748637763563, + "grad_norm": 0.15232981796001893, + "learning_rate": 0.0009647679629726968, + "loss": 3.4659, + "step": 3295 + }, + { + "epoch": 1.0423280423280423, + "grad_norm": 0.1315179822589232, + "learning_rate": 0.0009645643373363164, + "loss": 3.5122, + "step": 3300 + }, + { + "epoch": 1.0439074468925216, + "grad_norm": 0.1644442465246829, + "learning_rate": 0.0009643601465761138, + "loss": 3.4649, + "step": 3305 + }, + { + "epoch": 1.0454868514570008, + "grad_norm": 0.18604550370191514, + "learning_rate": 0.0009641553909404788, + "loss": 3.5517, + "step": 3310 + }, + { + "epoch": 1.0470662560214798, + "grad_norm": 0.20518387776871452, + "learning_rate": 0.0009639500706784885, + "loss": 3.527, + "step": 3315 + }, + { + "epoch": 1.048645660585959, + "grad_norm": 0.1803425231097602, + "learning_rate": 0.0009637441860399066, + "loss": 3.5415, + "step": 3320 + }, + { + "epoch": 1.0502250651504383, + "grad_norm": 0.16054121711714328, + "learning_rate": 0.0009635377372751835, + "loss": 3.6723, + "step": 3325 + }, + { + "epoch": 1.0518044697149174, + "grad_norm": 0.21733786981905223, + "learning_rate": 0.0009633307246354558, + "loss": 3.5382, + "step": 3330 + }, + { + "epoch": 1.0533838742793966, + "grad_norm": 0.1613842724324584, + "learning_rate": 0.000963123148372546, + "loss": 3.5145, + "step": 3335 + }, + { + "epoch": 1.0549632788438759, + "grad_norm": 0.14661098326997973, + "learning_rate": 0.0009629150087389624, + "loss": 3.4844, + "step": 3340 + }, + { + "epoch": 1.056542683408355, + "grad_norm": 0.18355128495159284, + "learning_rate": 0.0009627063059878986, + "loss": 3.482, + "step": 3345 + }, + { + "epoch": 1.0581220879728341, + "grad_norm": 0.213067842797003, + "learning_rate": 0.0009624970403732327, + "loss": 3.618, + "step": 3350 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.19809498590991662, + "learning_rate": 0.0009622872121495283, + "loss": 3.5787, + "step": 3355 + }, + { + "epoch": 1.0612808971017926, + "grad_norm": 0.14272374854343148, + "learning_rate": 0.0009620768215720327, + "loss": 3.535, + "step": 3360 + }, + { + "epoch": 1.0628603016662719, + "grad_norm": 0.15106405340295168, + "learning_rate": 0.0009618658688966777, + "loss": 3.4411, + "step": 3365 + }, + { + "epoch": 1.064439706230751, + "grad_norm": 0.18741137454692364, + "learning_rate": 0.0009616543543800788, + "loss": 3.4986, + "step": 3370 + }, + { + "epoch": 1.0660191107952302, + "grad_norm": 0.1613689259330486, + "learning_rate": 0.0009614422782795348, + "loss": 3.5067, + "step": 3375 + }, + { + "epoch": 1.0675985153597094, + "grad_norm": 0.1257460506657428, + "learning_rate": 0.0009612296408530277, + "loss": 3.5044, + "step": 3380 + }, + { + "epoch": 1.0691779199241886, + "grad_norm": 0.1754167619697149, + "learning_rate": 0.0009610164423592227, + "loss": 3.5902, + "step": 3385 + }, + { + "epoch": 1.0707573244886677, + "grad_norm": 0.17571886278255155, + "learning_rate": 0.0009608026830574667, + "loss": 3.4963, + "step": 3390 + }, + { + "epoch": 1.072336729053147, + "grad_norm": 0.18849027726038495, + "learning_rate": 0.0009605883632077896, + "loss": 3.5786, + "step": 3395 + }, + { + "epoch": 1.0739161336176262, + "grad_norm": 0.21060076571187067, + "learning_rate": 0.0009603734830709028, + "loss": 3.5587, + "step": 3400 + }, + { + "epoch": 1.0754955381821054, + "grad_norm": 0.18052163861941398, + "learning_rate": 0.0009601580429081993, + "loss": 3.593, + "step": 3405 + }, + { + "epoch": 1.0770749427465844, + "grad_norm": 0.19603356848954506, + "learning_rate": 0.0009599420429817533, + "loss": 3.4352, + "step": 3410 + }, + { + "epoch": 1.0786543473110637, + "grad_norm": 0.1893666912236303, + "learning_rate": 0.0009597254835543204, + "loss": 3.5038, + "step": 3415 + }, + { + "epoch": 1.080233751875543, + "grad_norm": 0.19888125574740098, + "learning_rate": 0.0009595083648893362, + "loss": 3.5156, + "step": 3420 + }, + { + "epoch": 1.0818131564400222, + "grad_norm": 0.16899935054600765, + "learning_rate": 0.0009592906872509167, + "loss": 3.5259, + "step": 3425 + }, + { + "epoch": 1.0833925610045012, + "grad_norm": 0.14162144798225573, + "learning_rate": 0.0009590724509038579, + "loss": 3.6076, + "step": 3430 + }, + { + "epoch": 1.0849719655689805, + "grad_norm": 0.1370291892548981, + "learning_rate": 0.0009588536561136358, + "loss": 3.4767, + "step": 3435 + }, + { + "epoch": 1.0865513701334597, + "grad_norm": 0.11325953846961405, + "learning_rate": 0.0009586343031464055, + "loss": 3.4623, + "step": 3440 + }, + { + "epoch": 1.088130774697939, + "grad_norm": 0.17410040294918944, + "learning_rate": 0.0009584143922690008, + "loss": 3.5858, + "step": 3445 + }, + { + "epoch": 1.089710179262418, + "grad_norm": 0.20972770154954912, + "learning_rate": 0.0009581939237489346, + "loss": 3.7063, + "step": 3450 + }, + { + "epoch": 1.0912895838268972, + "grad_norm": 0.1923951132039094, + "learning_rate": 0.000957972897854398, + "loss": 3.5805, + "step": 3455 + }, + { + "epoch": 1.0928689883913765, + "grad_norm": 0.16654208813122776, + "learning_rate": 0.00095775131485426, + "loss": 3.4444, + "step": 3460 + }, + { + "epoch": 1.0944483929558557, + "grad_norm": 0.14994148376566602, + "learning_rate": 0.0009575291750180675, + "loss": 3.5015, + "step": 3465 + }, + { + "epoch": 1.0960277975203347, + "grad_norm": 0.1640550694396785, + "learning_rate": 0.0009573064786160446, + "loss": 3.5014, + "step": 3470 + }, + { + "epoch": 1.097607202084814, + "grad_norm": 0.24701794107392486, + "learning_rate": 0.0009570832259190927, + "loss": 3.4355, + "step": 3475 + }, + { + "epoch": 1.0991866066492932, + "grad_norm": 0.21545079608972195, + "learning_rate": 0.0009568594171987893, + "loss": 3.6293, + "step": 3480 + }, + { + "epoch": 1.1007660112137725, + "grad_norm": 0.17616944220718544, + "learning_rate": 0.000956635052727389, + "loss": 3.5283, + "step": 3485 + }, + { + "epoch": 1.1023454157782515, + "grad_norm": 0.1880146713181073, + "learning_rate": 0.0009564101327778223, + "loss": 3.6321, + "step": 3490 + }, + { + "epoch": 1.1039248203427308, + "grad_norm": 0.17587965507269773, + "learning_rate": 0.000956184657623695, + "loss": 3.5467, + "step": 3495 + }, + { + "epoch": 1.10550422490721, + "grad_norm": 0.15875400779821083, + "learning_rate": 0.0009559586275392887, + "loss": 3.5661, + "step": 3500 + }, + { + "epoch": 1.1070836294716893, + "grad_norm": 0.1526252926690391, + "learning_rate": 0.0009557320427995596, + "loss": 3.4174, + "step": 3505 + }, + { + "epoch": 1.1086630340361683, + "grad_norm": 0.16836938588204559, + "learning_rate": 0.0009555049036801393, + "loss": 3.5962, + "step": 3510 + }, + { + "epoch": 1.1102424386006475, + "grad_norm": 0.14694322307110463, + "learning_rate": 0.0009552772104573332, + "loss": 3.5284, + "step": 3515 + }, + { + "epoch": 1.1118218431651268, + "grad_norm": 0.14691445386163327, + "learning_rate": 0.0009550489634081212, + "loss": 3.5363, + "step": 3520 + }, + { + "epoch": 1.113401247729606, + "grad_norm": 0.12514595695150219, + "learning_rate": 0.0009548201628101563, + "loss": 3.4698, + "step": 3525 + }, + { + "epoch": 1.114980652294085, + "grad_norm": 0.1466478847716101, + "learning_rate": 0.0009545908089417654, + "loss": 3.6368, + "step": 3530 + }, + { + "epoch": 1.1165600568585643, + "grad_norm": 0.18722690247559393, + "learning_rate": 0.0009543609020819481, + "loss": 3.4909, + "step": 3535 + }, + { + "epoch": 1.1181394614230435, + "grad_norm": 0.196720740346297, + "learning_rate": 0.0009541304425103772, + "loss": 3.513, + "step": 3540 + }, + { + "epoch": 1.1197188659875228, + "grad_norm": 0.15597027154323442, + "learning_rate": 0.000953899430507397, + "loss": 3.3843, + "step": 3545 + }, + { + "epoch": 1.1212982705520018, + "grad_norm": 0.14693305434429949, + "learning_rate": 0.0009536678663540247, + "loss": 3.3456, + "step": 3550 + }, + { + "epoch": 1.122877675116481, + "grad_norm": 0.17155391191624927, + "learning_rate": 0.0009534357503319486, + "loss": 3.5331, + "step": 3555 + }, + { + "epoch": 1.1244570796809603, + "grad_norm": 0.15181214586540706, + "learning_rate": 0.0009532030827235286, + "loss": 3.5211, + "step": 3560 + }, + { + "epoch": 1.1260364842454396, + "grad_norm": 0.14866555288064323, + "learning_rate": 0.0009529698638117954, + "loss": 3.4759, + "step": 3565 + }, + { + "epoch": 1.1276158888099186, + "grad_norm": 0.1619355292370477, + "learning_rate": 0.0009527360938804503, + "loss": 3.5829, + "step": 3570 + }, + { + "epoch": 1.1291952933743978, + "grad_norm": 0.17080930945098738, + "learning_rate": 0.0009525017732138654, + "loss": 3.4134, + "step": 3575 + }, + { + "epoch": 1.130774697938877, + "grad_norm": 0.1791332389027379, + "learning_rate": 0.000952266902097082, + "loss": 3.5257, + "step": 3580 + }, + { + "epoch": 1.1323541025033563, + "grad_norm": 0.16287889926792246, + "learning_rate": 0.0009520314808158115, + "loss": 3.527, + "step": 3585 + }, + { + "epoch": 1.1339335070678354, + "grad_norm": 0.22831329282252832, + "learning_rate": 0.0009517955096564343, + "loss": 3.5467, + "step": 3590 + }, + { + "epoch": 1.1355129116323146, + "grad_norm": 0.21587039636593486, + "learning_rate": 0.000951558988906, + "loss": 3.4928, + "step": 3595 + }, + { + "epoch": 1.1370923161967939, + "grad_norm": 0.15515302065487302, + "learning_rate": 0.0009513219188522265, + "loss": 3.3949, + "step": 3600 + }, + { + "epoch": 1.1386717207612729, + "grad_norm": 0.19531249498739786, + "learning_rate": 0.0009510842997834999, + "loss": 3.4776, + "step": 3605 + }, + { + "epoch": 1.1402511253257521, + "grad_norm": 0.17309648961127838, + "learning_rate": 0.0009508461319888743, + "loss": 3.4294, + "step": 3610 + }, + { + "epoch": 1.1418305298902314, + "grad_norm": 0.20374478205073432, + "learning_rate": 0.0009506074157580715, + "loss": 3.5764, + "step": 3615 + }, + { + "epoch": 1.1434099344547106, + "grad_norm": 0.1674614562290747, + "learning_rate": 0.0009503681513814797, + "loss": 3.4229, + "step": 3620 + }, + { + "epoch": 1.1449893390191899, + "grad_norm": 0.24676865618415278, + "learning_rate": 0.0009501283391501547, + "loss": 3.5104, + "step": 3625 + }, + { + "epoch": 1.146568743583669, + "grad_norm": 0.15749402214670652, + "learning_rate": 0.0009498879793558184, + "loss": 3.3673, + "step": 3630 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.1876087417308556, + "learning_rate": 0.0009496470722908586, + "loss": 3.554, + "step": 3635 + }, + { + "epoch": 1.1497275527126274, + "grad_norm": 0.18506446437441917, + "learning_rate": 0.0009494056182483292, + "loss": 3.4664, + "step": 3640 + }, + { + "epoch": 1.1513069572771064, + "grad_norm": 0.1650474707934297, + "learning_rate": 0.0009491636175219495, + "loss": 3.5146, + "step": 3645 + }, + { + "epoch": 1.1528863618415857, + "grad_norm": 0.1649216280629778, + "learning_rate": 0.0009489210704061035, + "loss": 3.4681, + "step": 3650 + }, + { + "epoch": 1.154465766406065, + "grad_norm": 0.18817357619334923, + "learning_rate": 0.0009486779771958401, + "loss": 3.5334, + "step": 3655 + }, + { + "epoch": 1.1560451709705442, + "grad_norm": 0.2004719054160758, + "learning_rate": 0.0009484343381868721, + "loss": 3.5297, + "step": 3660 + }, + { + "epoch": 1.1576245755350234, + "grad_norm": 0.22442527289938496, + "learning_rate": 0.0009481901536755768, + "loss": 3.4078, + "step": 3665 + }, + { + "epoch": 1.1592039800995024, + "grad_norm": 0.18039908625370335, + "learning_rate": 0.0009479454239589947, + "loss": 3.475, + "step": 3670 + }, + { + "epoch": 1.1607833846639817, + "grad_norm": 0.1968176863518416, + "learning_rate": 0.00094770014933483, + "loss": 3.479, + "step": 3675 + }, + { + "epoch": 1.162362789228461, + "grad_norm": 0.20274756267717062, + "learning_rate": 0.0009474543301014489, + "loss": 3.4736, + "step": 3680 + }, + { + "epoch": 1.16394219379294, + "grad_norm": 0.19078790909078822, + "learning_rate": 0.000947207966557881, + "loss": 3.5295, + "step": 3685 + }, + { + "epoch": 1.1655215983574192, + "grad_norm": 0.14814168748866086, + "learning_rate": 0.0009469610590038173, + "loss": 3.361, + "step": 3690 + }, + { + "epoch": 1.1671010029218984, + "grad_norm": 0.16425480114557606, + "learning_rate": 0.0009467136077396113, + "loss": 3.6061, + "step": 3695 + }, + { + "epoch": 1.1686804074863777, + "grad_norm": 0.20199057519130498, + "learning_rate": 0.0009464656130662773, + "loss": 3.4519, + "step": 3700 + }, + { + "epoch": 1.170259812050857, + "grad_norm": 0.16626861962685233, + "learning_rate": 0.0009462170752854908, + "loss": 3.4247, + "step": 3705 + }, + { + "epoch": 1.171839216615336, + "grad_norm": 0.16817046067836208, + "learning_rate": 0.000945967994699588, + "loss": 3.5423, + "step": 3710 + }, + { + "epoch": 1.1734186211798152, + "grad_norm": 0.19718785456241722, + "learning_rate": 0.0009457183716115655, + "loss": 3.3503, + "step": 3715 + }, + { + "epoch": 1.1749980257442945, + "grad_norm": 0.13969712920238997, + "learning_rate": 0.0009454682063250797, + "loss": 3.4451, + "step": 3720 + }, + { + "epoch": 1.1765774303087735, + "grad_norm": 0.16584531547624867, + "learning_rate": 0.0009452174991444466, + "loss": 3.4738, + "step": 3725 + }, + { + "epoch": 1.1781568348732527, + "grad_norm": 0.21098180520274426, + "learning_rate": 0.0009449662503746415, + "loss": 3.5065, + "step": 3730 + }, + { + "epoch": 1.179736239437732, + "grad_norm": 0.19560991681344314, + "learning_rate": 0.0009447144603212983, + "loss": 3.476, + "step": 3735 + }, + { + "epoch": 1.1813156440022112, + "grad_norm": 0.19571878846306276, + "learning_rate": 0.0009444621292907094, + "loss": 3.5014, + "step": 3740 + }, + { + "epoch": 1.1828950485666905, + "grad_norm": 0.23022472553882056, + "learning_rate": 0.0009442092575898253, + "loss": 3.3936, + "step": 3745 + }, + { + "epoch": 1.1844744531311695, + "grad_norm": 0.16129418870589246, + "learning_rate": 0.0009439558455262547, + "loss": 3.438, + "step": 3750 + }, + { + "epoch": 1.1860538576956487, + "grad_norm": 0.1420454736987748, + "learning_rate": 0.0009437018934082626, + "loss": 3.5496, + "step": 3755 + }, + { + "epoch": 1.187633262260128, + "grad_norm": 0.15833224972021434, + "learning_rate": 0.0009434474015447721, + "loss": 3.5196, + "step": 3760 + }, + { + "epoch": 1.189212666824607, + "grad_norm": 0.159555391379725, + "learning_rate": 0.0009431923702453617, + "loss": 3.4208, + "step": 3765 + }, + { + "epoch": 1.1907920713890863, + "grad_norm": 0.18211420211116194, + "learning_rate": 0.0009429367998202671, + "loss": 3.3405, + "step": 3770 + }, + { + "epoch": 1.1923714759535655, + "grad_norm": 0.15914256531789245, + "learning_rate": 0.0009426806905803795, + "loss": 3.5496, + "step": 3775 + }, + { + "epoch": 1.1939508805180448, + "grad_norm": 0.18026779470969273, + "learning_rate": 0.0009424240428372452, + "loss": 3.4273, + "step": 3780 + }, + { + "epoch": 1.1955302850825238, + "grad_norm": 0.29969190627355774, + "learning_rate": 0.000942166856903066, + "loss": 3.4439, + "step": 3785 + }, + { + "epoch": 1.197109689647003, + "grad_norm": 0.17566538847915972, + "learning_rate": 0.0009419091330906984, + "loss": 3.3116, + "step": 3790 + }, + { + "epoch": 1.1986890942114823, + "grad_norm": 0.161322829058277, + "learning_rate": 0.0009416508717136527, + "loss": 3.5267, + "step": 3795 + }, + { + "epoch": 1.2002684987759615, + "grad_norm": 0.1361218966368206, + "learning_rate": 0.0009413920730860937, + "loss": 3.4379, + "step": 3800 + }, + { + "epoch": 1.2018479033404406, + "grad_norm": 0.21398086639722805, + "learning_rate": 0.0009411327375228394, + "loss": 3.4715, + "step": 3805 + }, + { + "epoch": 1.2034273079049198, + "grad_norm": 0.21550538792847884, + "learning_rate": 0.0009408728653393612, + "loss": 3.3665, + "step": 3810 + }, + { + "epoch": 1.205006712469399, + "grad_norm": 0.21460867662063146, + "learning_rate": 0.0009406124568517831, + "loss": 3.5338, + "step": 3815 + }, + { + "epoch": 1.2065861170338783, + "grad_norm": 0.19790984454451685, + "learning_rate": 0.0009403515123768816, + "loss": 3.3732, + "step": 3820 + }, + { + "epoch": 1.2081655215983573, + "grad_norm": 0.16044676782894002, + "learning_rate": 0.0009400900322320851, + "loss": 3.2932, + "step": 3825 + }, + { + "epoch": 1.2097449261628366, + "grad_norm": 0.1782455072532651, + "learning_rate": 0.0009398280167354735, + "loss": 3.3615, + "step": 3830 + }, + { + "epoch": 1.2113243307273158, + "grad_norm": 0.16056461369257277, + "learning_rate": 0.0009395654662057786, + "loss": 3.394, + "step": 3835 + }, + { + "epoch": 1.212903735291795, + "grad_norm": 0.1617823275360994, + "learning_rate": 0.000939302380962382, + "loss": 3.409, + "step": 3840 + }, + { + "epoch": 1.214483139856274, + "grad_norm": 0.20344574605731353, + "learning_rate": 0.0009390387613253166, + "loss": 3.5088, + "step": 3845 + }, + { + "epoch": 1.2160625444207533, + "grad_norm": 0.19760384614127924, + "learning_rate": 0.000938774607615265, + "loss": 3.4202, + "step": 3850 + }, + { + "epoch": 1.2176419489852326, + "grad_norm": 0.17019002490371693, + "learning_rate": 0.0009385099201535596, + "loss": 3.5585, + "step": 3855 + }, + { + "epoch": 1.2192213535497118, + "grad_norm": 0.17409540531216763, + "learning_rate": 0.000938244699262182, + "loss": 3.4943, + "step": 3860 + }, + { + "epoch": 1.2208007581141909, + "grad_norm": 0.15846100520357334, + "learning_rate": 0.0009379789452637629, + "loss": 3.3515, + "step": 3865 + }, + { + "epoch": 1.2223801626786701, + "grad_norm": 0.21408541248812699, + "learning_rate": 0.0009377126584815812, + "loss": 3.4941, + "step": 3870 + }, + { + "epoch": 1.2239595672431494, + "grad_norm": 0.16695132569529617, + "learning_rate": 0.000937445839239564, + "loss": 3.4852, + "step": 3875 + }, + { + "epoch": 1.2255389718076286, + "grad_norm": 0.16318138945603894, + "learning_rate": 0.0009371784878622863, + "loss": 3.4315, + "step": 3880 + }, + { + "epoch": 1.2271183763721076, + "grad_norm": 0.19738953018555389, + "learning_rate": 0.0009369106046749703, + "loss": 3.5117, + "step": 3885 + }, + { + "epoch": 1.2286977809365869, + "grad_norm": 0.14204527279882048, + "learning_rate": 0.0009366421900034849, + "loss": 3.2596, + "step": 3890 + }, + { + "epoch": 1.2302771855010661, + "grad_norm": 0.14898950129104696, + "learning_rate": 0.0009363732441743459, + "loss": 3.2987, + "step": 3895 + }, + { + "epoch": 1.2318565900655454, + "grad_norm": 0.1683677844282376, + "learning_rate": 0.0009361037675147152, + "loss": 3.4681, + "step": 3900 + }, + { + "epoch": 1.2334359946300244, + "grad_norm": 0.17218443452504015, + "learning_rate": 0.0009358337603524001, + "loss": 3.4559, + "step": 3905 + }, + { + "epoch": 1.2350153991945036, + "grad_norm": 0.16230059619775924, + "learning_rate": 0.0009355632230158537, + "loss": 3.4437, + "step": 3910 + }, + { + "epoch": 1.236594803758983, + "grad_norm": 0.1637629344964814, + "learning_rate": 0.0009352921558341734, + "loss": 3.3447, + "step": 3915 + }, + { + "epoch": 1.2381742083234621, + "grad_norm": 0.18387059418034044, + "learning_rate": 0.0009350205591371019, + "loss": 3.3898, + "step": 3920 + }, + { + "epoch": 1.2397536128879412, + "grad_norm": 0.12842797710859524, + "learning_rate": 0.0009347484332550255, + "loss": 3.3435, + "step": 3925 + }, + { + "epoch": 1.2413330174524204, + "grad_norm": 0.1423647750556377, + "learning_rate": 0.0009344757785189744, + "loss": 3.3623, + "step": 3930 + }, + { + "epoch": 1.2429124220168997, + "grad_norm": 0.17711276858177502, + "learning_rate": 0.0009342025952606219, + "loss": 3.4316, + "step": 3935 + }, + { + "epoch": 1.244491826581379, + "grad_norm": 0.1647045199172262, + "learning_rate": 0.0009339288838122849, + "loss": 3.4343, + "step": 3940 + }, + { + "epoch": 1.246071231145858, + "grad_norm": 0.19616295235626366, + "learning_rate": 0.0009336546445069218, + "loss": 3.4433, + "step": 3945 + }, + { + "epoch": 1.2476506357103372, + "grad_norm": 0.1338067737830234, + "learning_rate": 0.0009333798776781343, + "loss": 3.4647, + "step": 3950 + }, + { + "epoch": 1.2492300402748164, + "grad_norm": 0.1674958325821782, + "learning_rate": 0.0009331045836601646, + "loss": 3.4153, + "step": 3955 + }, + { + "epoch": 1.2508094448392955, + "grad_norm": 0.164883261301351, + "learning_rate": 0.0009328287627878973, + "loss": 3.3946, + "step": 3960 + }, + { + "epoch": 1.2523888494037747, + "grad_norm": 0.1285324062812115, + "learning_rate": 0.000932552415396857, + "loss": 3.2798, + "step": 3965 + }, + { + "epoch": 1.253968253968254, + "grad_norm": 0.1708840737137744, + "learning_rate": 0.0009322755418232094, + "loss": 3.4297, + "step": 3970 + }, + { + "epoch": 1.2555476585327332, + "grad_norm": 0.143100846348656, + "learning_rate": 0.00093199814240376, + "loss": 3.524, + "step": 3975 + }, + { + "epoch": 1.2571270630972124, + "grad_norm": 0.1870541648656571, + "learning_rate": 0.000931720217475954, + "loss": 3.5105, + "step": 3980 + }, + { + "epoch": 1.2587064676616915, + "grad_norm": 0.1494970239923711, + "learning_rate": 0.000931441767377876, + "loss": 3.3396, + "step": 3985 + }, + { + "epoch": 1.2602858722261707, + "grad_norm": 0.13931866315641525, + "learning_rate": 0.0009311627924482493, + "loss": 3.3782, + "step": 3990 + }, + { + "epoch": 1.26186527679065, + "grad_norm": 0.1415432981936499, + "learning_rate": 0.0009308832930264354, + "loss": 3.3677, + "step": 3995 + }, + { + "epoch": 1.263444681355129, + "grad_norm": 0.16776120238681047, + "learning_rate": 0.0009306032694524345, + "loss": 3.3866, + "step": 4000 + }, + { + "epoch": 1.2650240859196082, + "grad_norm": 0.1786871644484267, + "learning_rate": 0.000930322722066884, + "loss": 3.3221, + "step": 4005 + }, + { + "epoch": 1.2666034904840875, + "grad_norm": 0.16315647151552112, + "learning_rate": 0.0009300416512110581, + "loss": 3.4461, + "step": 4010 + }, + { + "epoch": 1.2681828950485667, + "grad_norm": 0.1552733417787517, + "learning_rate": 0.0009297600572268685, + "loss": 3.3432, + "step": 4015 + }, + { + "epoch": 1.269762299613046, + "grad_norm": 0.1522952178479541, + "learning_rate": 0.0009294779404568629, + "loss": 3.385, + "step": 4020 + }, + { + "epoch": 1.271341704177525, + "grad_norm": 0.1662187283663952, + "learning_rate": 0.000929195301244225, + "loss": 3.4102, + "step": 4025 + }, + { + "epoch": 1.2729211087420043, + "grad_norm": 0.14647477444085574, + "learning_rate": 0.000928912139932774, + "loss": 3.3386, + "step": 4030 + }, + { + "epoch": 1.2745005133064835, + "grad_norm": 0.16711729761359687, + "learning_rate": 0.0009286284568669643, + "loss": 3.4209, + "step": 4035 + }, + { + "epoch": 1.2760799178709625, + "grad_norm": 0.14743592937515496, + "learning_rate": 0.0009283442523918848, + "loss": 3.4822, + "step": 4040 + }, + { + "epoch": 1.2776593224354418, + "grad_norm": 0.1366017208398044, + "learning_rate": 0.000928059526853259, + "loss": 3.4677, + "step": 4045 + }, + { + "epoch": 1.279238726999921, + "grad_norm": 0.16704429698198514, + "learning_rate": 0.000927774280597444, + "loss": 3.4226, + "step": 4050 + }, + { + "epoch": 1.2808181315644003, + "grad_norm": 0.15643121589785522, + "learning_rate": 0.0009274885139714302, + "loss": 3.3408, + "step": 4055 + }, + { + "epoch": 1.2823975361288795, + "grad_norm": 0.1741639877841723, + "learning_rate": 0.0009272022273228414, + "loss": 3.4501, + "step": 4060 + }, + { + "epoch": 1.2839769406933585, + "grad_norm": 0.11217281345430413, + "learning_rate": 0.0009269154209999338, + "loss": 3.296, + "step": 4065 + }, + { + "epoch": 1.2855563452578378, + "grad_norm": 0.1617728847897573, + "learning_rate": 0.0009266280953515956, + "loss": 3.3026, + "step": 4070 + }, + { + "epoch": 1.287135749822317, + "grad_norm": 0.14427201373975365, + "learning_rate": 0.0009263402507273471, + "loss": 3.4153, + "step": 4075 + }, + { + "epoch": 1.288715154386796, + "grad_norm": 0.17014303360059407, + "learning_rate": 0.0009260518874773394, + "loss": 3.3237, + "step": 4080 + }, + { + "epoch": 1.2902945589512753, + "grad_norm": 0.14224068939581733, + "learning_rate": 0.0009257630059523552, + "loss": 3.4282, + "step": 4085 + }, + { + "epoch": 1.2918739635157546, + "grad_norm": 0.17799994513807274, + "learning_rate": 0.0009254736065038068, + "loss": 3.339, + "step": 4090 + }, + { + "epoch": 1.2934533680802338, + "grad_norm": 0.1364276503857556, + "learning_rate": 0.0009251836894837374, + "loss": 3.3944, + "step": 4095 + }, + { + "epoch": 1.295032772644713, + "grad_norm": 0.14641284207636293, + "learning_rate": 0.0009248932552448191, + "loss": 3.3047, + "step": 4100 + }, + { + "epoch": 1.296612177209192, + "grad_norm": 0.16244756151602185, + "learning_rate": 0.0009246023041403535, + "loss": 3.3813, + "step": 4105 + }, + { + "epoch": 1.2981915817736713, + "grad_norm": 0.12556159589538443, + "learning_rate": 0.0009243108365242711, + "loss": 3.3201, + "step": 4110 + }, + { + "epoch": 1.2997709863381506, + "grad_norm": 0.1905706972967778, + "learning_rate": 0.0009240188527511303, + "loss": 3.3823, + "step": 4115 + }, + { + "epoch": 1.3013503909026296, + "grad_norm": 0.20641498529248553, + "learning_rate": 0.0009237263531761177, + "loss": 3.4022, + "step": 4120 + }, + { + "epoch": 1.3029297954671089, + "grad_norm": 0.1545220182472424, + "learning_rate": 0.0009234333381550472, + "loss": 3.2973, + "step": 4125 + }, + { + "epoch": 1.304509200031588, + "grad_norm": 0.18168937803347265, + "learning_rate": 0.0009231398080443601, + "loss": 3.4562, + "step": 4130 + }, + { + "epoch": 1.3060886045960673, + "grad_norm": 0.23826734915746944, + "learning_rate": 0.0009228457632011235, + "loss": 3.3164, + "step": 4135 + }, + { + "epoch": 1.3076680091605466, + "grad_norm": 0.16581129822228094, + "learning_rate": 0.0009225512039830315, + "loss": 3.3584, + "step": 4140 + }, + { + "epoch": 1.3092474137250256, + "grad_norm": 0.16210329879726387, + "learning_rate": 0.0009222561307484032, + "loss": 3.4051, + "step": 4145 + }, + { + "epoch": 1.3108268182895049, + "grad_norm": 0.15506171661463133, + "learning_rate": 0.0009219605438561836, + "loss": 3.3442, + "step": 4150 + }, + { + "epoch": 1.3124062228539841, + "grad_norm": 0.15725462551651762, + "learning_rate": 0.0009216644436659422, + "loss": 3.3336, + "step": 4155 + }, + { + "epoch": 1.3139856274184631, + "grad_norm": 0.1401664797264424, + "learning_rate": 0.0009213678305378727, + "loss": 3.35, + "step": 4160 + }, + { + "epoch": 1.3155650319829424, + "grad_norm": 0.13193412044498434, + "learning_rate": 0.0009210707048327935, + "loss": 3.3959, + "step": 4165 + }, + { + "epoch": 1.3171444365474216, + "grad_norm": 0.1643713503348713, + "learning_rate": 0.0009207730669121457, + "loss": 3.4629, + "step": 4170 + }, + { + "epoch": 1.3187238411119009, + "grad_norm": 0.18797562194535233, + "learning_rate": 0.000920474917137994, + "loss": 3.3898, + "step": 4175 + }, + { + "epoch": 1.3203032456763801, + "grad_norm": 0.1439925271657658, + "learning_rate": 0.0009201762558730255, + "loss": 3.427, + "step": 4180 + }, + { + "epoch": 1.3218826502408592, + "grad_norm": 0.17414529000788978, + "learning_rate": 0.0009198770834805498, + "loss": 3.3501, + "step": 4185 + }, + { + "epoch": 1.3234620548053384, + "grad_norm": 0.14722148228092627, + "learning_rate": 0.0009195774003244979, + "loss": 3.355, + "step": 4190 + }, + { + "epoch": 1.3250414593698177, + "grad_norm": 0.14702051757999493, + "learning_rate": 0.0009192772067694223, + "loss": 3.3385, + "step": 4195 + }, + { + "epoch": 1.3266208639342967, + "grad_norm": 0.1373432579980681, + "learning_rate": 0.0009189765031804965, + "loss": 3.3479, + "step": 4200 + }, + { + "epoch": 1.328200268498776, + "grad_norm": 0.13838263702623113, + "learning_rate": 0.0009186752899235142, + "loss": 3.3044, + "step": 4205 + }, + { + "epoch": 1.3297796730632552, + "grad_norm": 0.11732374721073062, + "learning_rate": 0.0009183735673648893, + "loss": 3.3196, + "step": 4210 + }, + { + "epoch": 1.3313590776277344, + "grad_norm": 0.13745564723029, + "learning_rate": 0.000918071335871655, + "loss": 3.3737, + "step": 4215 + }, + { + "epoch": 1.3329384821922137, + "grad_norm": 0.14220836178084498, + "learning_rate": 0.0009177685958114641, + "loss": 3.3583, + "step": 4220 + }, + { + "epoch": 1.3345178867566927, + "grad_norm": 0.16452014627189307, + "learning_rate": 0.0009174653475525874, + "loss": 3.2648, + "step": 4225 + }, + { + "epoch": 1.336097291321172, + "grad_norm": 0.14610729955363427, + "learning_rate": 0.0009171615914639142, + "loss": 3.4392, + "step": 4230 + }, + { + "epoch": 1.3376766958856512, + "grad_norm": 0.18117209766134101, + "learning_rate": 0.0009168573279149515, + "loss": 3.3022, + "step": 4235 + }, + { + "epoch": 1.3392561004501302, + "grad_norm": 0.19782231311409362, + "learning_rate": 0.000916552557275824, + "loss": 3.4022, + "step": 4240 + }, + { + "epoch": 1.3408355050146095, + "grad_norm": 0.17241089526198844, + "learning_rate": 0.0009162472799172725, + "loss": 3.3588, + "step": 4245 + }, + { + "epoch": 1.3424149095790887, + "grad_norm": 0.16410686082023807, + "learning_rate": 0.000915941496210655, + "loss": 3.3589, + "step": 4250 + }, + { + "epoch": 1.343994314143568, + "grad_norm": 0.2388905353194743, + "learning_rate": 0.0009156352065279448, + "loss": 3.3672, + "step": 4255 + }, + { + "epoch": 1.345573718708047, + "grad_norm": 0.1527648247113203, + "learning_rate": 0.0009153284112417313, + "loss": 3.2461, + "step": 4260 + }, + { + "epoch": 1.3471531232725262, + "grad_norm": 0.153856870172395, + "learning_rate": 0.0009150211107252181, + "loss": 3.3544, + "step": 4265 + }, + { + "epoch": 1.3487325278370055, + "grad_norm": 0.1539199272236105, + "learning_rate": 0.0009147133053522243, + "loss": 3.2858, + "step": 4270 + }, + { + "epoch": 1.3503119324014845, + "grad_norm": 0.1609833682589832, + "learning_rate": 0.0009144049954971827, + "loss": 3.309, + "step": 4275 + }, + { + "epoch": 1.3518913369659638, + "grad_norm": 0.19124078172755124, + "learning_rate": 0.0009140961815351399, + "loss": 3.3246, + "step": 4280 + }, + { + "epoch": 1.353470741530443, + "grad_norm": 0.16190296279226263, + "learning_rate": 0.0009137868638417555, + "loss": 3.3528, + "step": 4285 + }, + { + "epoch": 1.3550501460949222, + "grad_norm": 0.1568394556153337, + "learning_rate": 0.0009134770427933019, + "loss": 3.324, + "step": 4290 + }, + { + "epoch": 1.3566295506594015, + "grad_norm": 0.1418128016487037, + "learning_rate": 0.0009131667187666642, + "loss": 3.3073, + "step": 4295 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.14972736307745682, + "learning_rate": 0.000912855892139339, + "loss": 3.249, + "step": 4300 + }, + { + "epoch": 1.3597883597883598, + "grad_norm": 0.11564072656402402, + "learning_rate": 0.0009125445632894345, + "loss": 3.2773, + "step": 4305 + }, + { + "epoch": 1.361367764352839, + "grad_norm": 0.1457945817000172, + "learning_rate": 0.0009122327325956696, + "loss": 3.3964, + "step": 4310 + }, + { + "epoch": 1.362947168917318, + "grad_norm": 0.15166900783465612, + "learning_rate": 0.0009119204004373738, + "loss": 3.3749, + "step": 4315 + }, + { + "epoch": 1.3645265734817973, + "grad_norm": 0.16788847097200998, + "learning_rate": 0.0009116075671944864, + "loss": 3.2626, + "step": 4320 + }, + { + "epoch": 1.3661059780462765, + "grad_norm": 0.18345038602626523, + "learning_rate": 0.0009112942332475569, + "loss": 3.4544, + "step": 4325 + }, + { + "epoch": 1.3676853826107558, + "grad_norm": 0.16980153530949338, + "learning_rate": 0.0009109803989777431, + "loss": 3.4096, + "step": 4330 + }, + { + "epoch": 1.369264787175235, + "grad_norm": 0.1901152276047917, + "learning_rate": 0.0009106660647668118, + "loss": 3.375, + "step": 4335 + }, + { + "epoch": 1.370844191739714, + "grad_norm": 0.13319747933268186, + "learning_rate": 0.000910351230997138, + "loss": 3.2241, + "step": 4340 + }, + { + "epoch": 1.3724235963041933, + "grad_norm": 0.19034250534823186, + "learning_rate": 0.0009100358980517043, + "loss": 3.2717, + "step": 4345 + }, + { + "epoch": 1.3740030008686726, + "grad_norm": 0.16377491037494543, + "learning_rate": 0.0009097200663141005, + "loss": 3.3572, + "step": 4350 + }, + { + "epoch": 1.3755824054331516, + "grad_norm": 0.15838101408425387, + "learning_rate": 0.0009094037361685232, + "loss": 3.339, + "step": 4355 + }, + { + "epoch": 1.3771618099976308, + "grad_norm": 0.15030123154713862, + "learning_rate": 0.0009090869079997754, + "loss": 3.2, + "step": 4360 + }, + { + "epoch": 1.37874121456211, + "grad_norm": 0.19112961814499516, + "learning_rate": 0.0009087695821932657, + "loss": 3.3443, + "step": 4365 + }, + { + "epoch": 1.3803206191265893, + "grad_norm": 0.17058851459547694, + "learning_rate": 0.0009084517591350083, + "loss": 3.2637, + "step": 4370 + }, + { + "epoch": 1.3819000236910686, + "grad_norm": 0.17372701953450628, + "learning_rate": 0.0009081334392116218, + "loss": 3.3439, + "step": 4375 + }, + { + "epoch": 1.3834794282555476, + "grad_norm": 0.16146165100584847, + "learning_rate": 0.0009078146228103301, + "loss": 3.3357, + "step": 4380 + }, + { + "epoch": 1.3850588328200268, + "grad_norm": 0.1959721032697362, + "learning_rate": 0.0009074953103189602, + "loss": 3.3979, + "step": 4385 + }, + { + "epoch": 1.386638237384506, + "grad_norm": 0.13458053922970173, + "learning_rate": 0.0009071755021259429, + "loss": 3.3486, + "step": 4390 + }, + { + "epoch": 1.3882176419489851, + "grad_norm": 0.17061529633912267, + "learning_rate": 0.0009068551986203122, + "loss": 3.4066, + "step": 4395 + }, + { + "epoch": 1.3897970465134644, + "grad_norm": 0.20409384331794853, + "learning_rate": 0.0009065344001917042, + "loss": 3.2958, + "step": 4400 + }, + { + "epoch": 1.3913764510779436, + "grad_norm": 0.16679294495837618, + "learning_rate": 0.0009062131072303572, + "loss": 3.3132, + "step": 4405 + }, + { + "epoch": 1.3929558556424229, + "grad_norm": 0.1952635203259422, + "learning_rate": 0.0009058913201271116, + "loss": 3.2498, + "step": 4410 + }, + { + "epoch": 1.394535260206902, + "grad_norm": 0.15385051619032689, + "learning_rate": 0.000905569039273408, + "loss": 3.3008, + "step": 4415 + }, + { + "epoch": 1.3961146647713811, + "grad_norm": 0.2486791036593673, + "learning_rate": 0.0009052462650612885, + "loss": 3.3204, + "step": 4420 + }, + { + "epoch": 1.3976940693358604, + "grad_norm": 0.18045094172214962, + "learning_rate": 0.0009049229978833945, + "loss": 3.2637, + "step": 4425 + }, + { + "epoch": 1.3992734739003396, + "grad_norm": 0.151424008067985, + "learning_rate": 0.0009045992381329678, + "loss": 3.2764, + "step": 4430 + }, + { + "epoch": 1.4008528784648187, + "grad_norm": 0.2132183057940956, + "learning_rate": 0.0009042749862038491, + "loss": 3.2443, + "step": 4435 + }, + { + "epoch": 1.402432283029298, + "grad_norm": 0.16270294354698586, + "learning_rate": 0.0009039502424904777, + "loss": 3.2845, + "step": 4440 + }, + { + "epoch": 1.4040116875937771, + "grad_norm": 0.18019334099532144, + "learning_rate": 0.0009036250073878913, + "loss": 3.2478, + "step": 4445 + }, + { + "epoch": 1.4055910921582564, + "grad_norm": 0.1393773438924934, + "learning_rate": 0.0009032992812917254, + "loss": 3.2306, + "step": 4450 + }, + { + "epoch": 1.4071704967227356, + "grad_norm": 0.15273142716044932, + "learning_rate": 0.0009029730645982126, + "loss": 3.2648, + "step": 4455 + }, + { + "epoch": 1.4087499012872147, + "grad_norm": 0.1740693335929456, + "learning_rate": 0.0009026463577041824, + "loss": 3.3307, + "step": 4460 + }, + { + "epoch": 1.410329305851694, + "grad_norm": 0.19665063654880002, + "learning_rate": 0.0009023191610070607, + "loss": 3.3359, + "step": 4465 + }, + { + "epoch": 1.4119087104161732, + "grad_norm": 0.12561620916919017, + "learning_rate": 0.0009019914749048689, + "loss": 3.2632, + "step": 4470 + }, + { + "epoch": 1.4134881149806522, + "grad_norm": 0.13644396705083753, + "learning_rate": 0.0009016632997962241, + "loss": 3.3341, + "step": 4475 + }, + { + "epoch": 1.4150675195451314, + "grad_norm": 0.1665415155549509, + "learning_rate": 0.0009013346360803381, + "loss": 3.4047, + "step": 4480 + }, + { + "epoch": 1.4166469241096107, + "grad_norm": 0.15624730882339172, + "learning_rate": 0.000901005484157017, + "loss": 3.2707, + "step": 4485 + }, + { + "epoch": 1.41822632867409, + "grad_norm": 0.16793430979941182, + "learning_rate": 0.000900675844426661, + "loss": 3.4758, + "step": 4490 + }, + { + "epoch": 1.4198057332385692, + "grad_norm": 0.15984543570755744, + "learning_rate": 0.0009003457172902636, + "loss": 3.3969, + "step": 4495 + }, + { + "epoch": 1.4213851378030482, + "grad_norm": 0.13697976713292376, + "learning_rate": 0.0009000151031494109, + "loss": 3.3826, + "step": 4500 + }, + { + "epoch": 1.4229645423675275, + "grad_norm": 0.19809550193039518, + "learning_rate": 0.000899684002406282, + "loss": 3.2361, + "step": 4505 + }, + { + "epoch": 1.4245439469320067, + "grad_norm": 0.16229709695027106, + "learning_rate": 0.0008993524154636474, + "loss": 3.3063, + "step": 4510 + }, + { + "epoch": 1.4261233514964857, + "grad_norm": 0.12608660845067726, + "learning_rate": 0.0008990203427248696, + "loss": 3.2894, + "step": 4515 + }, + { + "epoch": 1.427702756060965, + "grad_norm": 0.1797964516783041, + "learning_rate": 0.0008986877845939014, + "loss": 3.3271, + "step": 4520 + }, + { + "epoch": 1.4292821606254442, + "grad_norm": 0.15675599346145377, + "learning_rate": 0.0008983547414752864, + "loss": 3.3151, + "step": 4525 + }, + { + "epoch": 1.4308615651899235, + "grad_norm": 0.1892068686504464, + "learning_rate": 0.0008980212137741584, + "loss": 3.2629, + "step": 4530 + }, + { + "epoch": 1.4324409697544027, + "grad_norm": 0.18537794590349257, + "learning_rate": 0.0008976872018962401, + "loss": 3.2736, + "step": 4535 + }, + { + "epoch": 1.4340203743188817, + "grad_norm": 0.2593286392625844, + "learning_rate": 0.0008973527062478438, + "loss": 3.3368, + "step": 4540 + }, + { + "epoch": 1.435599778883361, + "grad_norm": 0.17720765577244244, + "learning_rate": 0.0008970177272358698, + "loss": 3.222, + "step": 4545 + }, + { + "epoch": 1.4371791834478402, + "grad_norm": 0.14256311689707032, + "learning_rate": 0.0008966822652678067, + "loss": 3.2242, + "step": 4550 + }, + { + "epoch": 1.4387585880123193, + "grad_norm": 0.16384644938661744, + "learning_rate": 0.0008963463207517304, + "loss": 3.2097, + "step": 4555 + }, + { + "epoch": 1.4403379925767985, + "grad_norm": 0.1877920058936674, + "learning_rate": 0.0008960098940963041, + "loss": 3.3619, + "step": 4560 + }, + { + "epoch": 1.4419173971412778, + "grad_norm": 0.12584487034089764, + "learning_rate": 0.000895672985710777, + "loss": 3.2728, + "step": 4565 + }, + { + "epoch": 1.443496801705757, + "grad_norm": 0.13746308694505036, + "learning_rate": 0.0008953355960049847, + "loss": 3.4137, + "step": 4570 + }, + { + "epoch": 1.4450762062702363, + "grad_norm": 0.1578748118489456, + "learning_rate": 0.0008949977253893483, + "loss": 3.1916, + "step": 4575 + }, + { + "epoch": 1.4466556108347153, + "grad_norm": 0.1673051556573036, + "learning_rate": 0.0008946593742748737, + "loss": 3.2818, + "step": 4580 + }, + { + "epoch": 1.4482350153991945, + "grad_norm": 0.1328414390545332, + "learning_rate": 0.0008943205430731514, + "loss": 3.3013, + "step": 4585 + }, + { + "epoch": 1.4498144199636738, + "grad_norm": 0.14435362259403745, + "learning_rate": 0.0008939812321963561, + "loss": 3.2534, + "step": 4590 + }, + { + "epoch": 1.4513938245281528, + "grad_norm": 0.14208374901699425, + "learning_rate": 0.0008936414420572457, + "loss": 3.2995, + "step": 4595 + }, + { + "epoch": 1.452973229092632, + "grad_norm": 0.14376283024706615, + "learning_rate": 0.0008933011730691609, + "loss": 3.2659, + "step": 4600 + }, + { + "epoch": 1.4545526336571113, + "grad_norm": 0.1599547546108757, + "learning_rate": 0.0008929604256460258, + "loss": 3.2932, + "step": 4605 + }, + { + "epoch": 1.4561320382215905, + "grad_norm": 0.16535842899706282, + "learning_rate": 0.0008926192002023457, + "loss": 3.2339, + "step": 4610 + }, + { + "epoch": 1.4577114427860698, + "grad_norm": 0.12552135639645665, + "learning_rate": 0.0008922774971532076, + "loss": 3.1256, + "step": 4615 + }, + { + "epoch": 1.4592908473505488, + "grad_norm": 0.14820947272345442, + "learning_rate": 0.0008919353169142794, + "loss": 3.2025, + "step": 4620 + }, + { + "epoch": 1.460870251915028, + "grad_norm": 0.1376996788111554, + "learning_rate": 0.0008915926599018098, + "loss": 3.2946, + "step": 4625 + }, + { + "epoch": 1.462449656479507, + "grad_norm": 0.15470840871930625, + "learning_rate": 0.0008912495265326273, + "loss": 3.2855, + "step": 4630 + }, + { + "epoch": 1.4640290610439863, + "grad_norm": 0.14143468530300238, + "learning_rate": 0.0008909059172241395, + "loss": 3.1603, + "step": 4635 + }, + { + "epoch": 1.4656084656084656, + "grad_norm": 0.11621783222623613, + "learning_rate": 0.0008905618323943337, + "loss": 3.263, + "step": 4640 + }, + { + "epoch": 1.4671878701729448, + "grad_norm": 0.14653375693300938, + "learning_rate": 0.0008902172724617747, + "loss": 3.2858, + "step": 4645 + }, + { + "epoch": 1.468767274737424, + "grad_norm": 0.12686687887850526, + "learning_rate": 0.0008898722378456066, + "loss": 3.3301, + "step": 4650 + }, + { + "epoch": 1.470346679301903, + "grad_norm": 0.1725879758751388, + "learning_rate": 0.0008895267289655493, + "loss": 3.3617, + "step": 4655 + }, + { + "epoch": 1.4719260838663824, + "grad_norm": 0.16206213990180582, + "learning_rate": 0.000889180746241901, + "loss": 3.211, + "step": 4660 + }, + { + "epoch": 1.4735054884308616, + "grad_norm": 0.17730217086064257, + "learning_rate": 0.0008888342900955355, + "loss": 3.2401, + "step": 4665 + }, + { + "epoch": 1.4750848929953406, + "grad_norm": 0.15775239883109177, + "learning_rate": 0.000888487360947903, + "loss": 3.2833, + "step": 4670 + }, + { + "epoch": 1.4766642975598199, + "grad_norm": 0.17147232993014616, + "learning_rate": 0.0008881399592210286, + "loss": 3.2309, + "step": 4675 + }, + { + "epoch": 1.4782437021242991, + "grad_norm": 0.1582103899231393, + "learning_rate": 0.0008877920853375125, + "loss": 3.3207, + "step": 4680 + }, + { + "epoch": 1.4798231066887784, + "grad_norm": 0.16464760205681195, + "learning_rate": 0.0008874437397205295, + "loss": 3.2625, + "step": 4685 + }, + { + "epoch": 1.4814025112532576, + "grad_norm": 0.1573618442182188, + "learning_rate": 0.000887094922793828, + "loss": 3.2645, + "step": 4690 + }, + { + "epoch": 1.4829819158177366, + "grad_norm": 0.14542077447641327, + "learning_rate": 0.0008867456349817295, + "loss": 3.2381, + "step": 4695 + }, + { + "epoch": 1.4845613203822159, + "grad_norm": 0.1500640169422343, + "learning_rate": 0.0008863958767091289, + "loss": 3.2328, + "step": 4700 + }, + { + "epoch": 1.4861407249466951, + "grad_norm": 0.13869290140414445, + "learning_rate": 0.0008860456484014929, + "loss": 3.211, + "step": 4705 + }, + { + "epoch": 1.4877201295111742, + "grad_norm": 0.14829857540063193, + "learning_rate": 0.0008856949504848601, + "loss": 3.1714, + "step": 4710 + }, + { + "epoch": 1.4892995340756534, + "grad_norm": 0.16569262341577065, + "learning_rate": 0.0008853437833858404, + "loss": 3.2243, + "step": 4715 + }, + { + "epoch": 1.4908789386401327, + "grad_norm": 0.14695177504506926, + "learning_rate": 0.0008849921475316147, + "loss": 3.3036, + "step": 4720 + }, + { + "epoch": 1.492458343204612, + "grad_norm": 0.14586867054532876, + "learning_rate": 0.0008846400433499335, + "loss": 3.2349, + "step": 4725 + }, + { + "epoch": 1.4940377477690912, + "grad_norm": 0.2110658984856936, + "learning_rate": 0.0008842874712691175, + "loss": 3.2127, + "step": 4730 + }, + { + "epoch": 1.4956171523335702, + "grad_norm": 0.13573431744675368, + "learning_rate": 0.0008839344317180564, + "loss": 3.2067, + "step": 4735 + }, + { + "epoch": 1.4971965568980494, + "grad_norm": 0.14027398665568022, + "learning_rate": 0.0008835809251262091, + "loss": 3.2705, + "step": 4740 + }, + { + "epoch": 1.4987759614625287, + "grad_norm": 0.11286622399778137, + "learning_rate": 0.0008832269519236013, + "loss": 3.2677, + "step": 4745 + }, + { + "epoch": 1.5003553660270077, + "grad_norm": 0.1518608133814783, + "learning_rate": 0.0008828725125408276, + "loss": 3.2682, + "step": 4750 + }, + { + "epoch": 1.501934770591487, + "grad_norm": 0.160259811179564, + "learning_rate": 0.0008825176074090495, + "loss": 3.2156, + "step": 4755 + }, + { + "epoch": 1.5035141751559662, + "grad_norm": 0.18128847691858616, + "learning_rate": 0.0008821622369599944, + "loss": 3.2728, + "step": 4760 + }, + { + "epoch": 1.5050935797204454, + "grad_norm": 0.15979405833636123, + "learning_rate": 0.0008818064016259564, + "loss": 3.2132, + "step": 4765 + }, + { + "epoch": 1.5066729842849247, + "grad_norm": 0.16782521069305592, + "learning_rate": 0.0008814501018397947, + "loss": 3.1988, + "step": 4770 + }, + { + "epoch": 1.508252388849404, + "grad_norm": 0.16173887298085518, + "learning_rate": 0.0008810933380349337, + "loss": 3.2345, + "step": 4775 + }, + { + "epoch": 1.509831793413883, + "grad_norm": 0.1710658798394178, + "learning_rate": 0.0008807361106453622, + "loss": 3.2787, + "step": 4780 + }, + { + "epoch": 1.5114111979783622, + "grad_norm": 0.17277825281048725, + "learning_rate": 0.000880378420105633, + "loss": 3.1865, + "step": 4785 + }, + { + "epoch": 1.5129906025428412, + "grad_norm": 0.1818631892983199, + "learning_rate": 0.0008800202668508624, + "loss": 3.2306, + "step": 4790 + }, + { + "epoch": 1.5145700071073205, + "grad_norm": 0.17850343920387227, + "learning_rate": 0.0008796616513167291, + "loss": 3.118, + "step": 4795 + }, + { + "epoch": 1.5161494116717997, + "grad_norm": 0.17071356410691527, + "learning_rate": 0.0008793025739394745, + "loss": 3.1208, + "step": 4800 + }, + { + "epoch": 1.517728816236279, + "grad_norm": 0.15920996937791715, + "learning_rate": 0.000878943035155902, + "loss": 3.2293, + "step": 4805 + }, + { + "epoch": 1.5193082208007582, + "grad_norm": 0.15999249674057323, + "learning_rate": 0.0008785830354033759, + "loss": 3.189, + "step": 4810 + }, + { + "epoch": 1.5208876253652373, + "grad_norm": 0.12776982261731593, + "learning_rate": 0.0008782225751198216, + "loss": 3.2508, + "step": 4815 + }, + { + "epoch": 1.5224670299297165, + "grad_norm": 0.13987589557521737, + "learning_rate": 0.0008778616547437244, + "loss": 3.2344, + "step": 4820 + }, + { + "epoch": 1.5240464344941955, + "grad_norm": 0.13431219986869516, + "learning_rate": 0.0008775002747141292, + "loss": 3.2349, + "step": 4825 + }, + { + "epoch": 1.5256258390586748, + "grad_norm": 0.1192257573291684, + "learning_rate": 0.0008771384354706406, + "loss": 3.318, + "step": 4830 + }, + { + "epoch": 1.527205243623154, + "grad_norm": 0.13409814119711105, + "learning_rate": 0.0008767761374534215, + "loss": 3.1402, + "step": 4835 + }, + { + "epoch": 1.5287846481876333, + "grad_norm": 0.15609606836500248, + "learning_rate": 0.0008764133811031924, + "loss": 3.2244, + "step": 4840 + }, + { + "epoch": 1.5303640527521125, + "grad_norm": 0.18418778089852983, + "learning_rate": 0.0008760501668612324, + "loss": 3.1931, + "step": 4845 + }, + { + "epoch": 1.5319434573165918, + "grad_norm": 0.13804937377372667, + "learning_rate": 0.0008756864951693766, + "loss": 3.2765, + "step": 4850 + }, + { + "epoch": 1.5335228618810708, + "grad_norm": 0.12574847976657577, + "learning_rate": 0.0008753223664700171, + "loss": 3.2378, + "step": 4855 + }, + { + "epoch": 1.53510226644555, + "grad_norm": 0.1689148634919543, + "learning_rate": 0.0008749577812061019, + "loss": 3.3042, + "step": 4860 + }, + { + "epoch": 1.536681671010029, + "grad_norm": 0.2613685637963012, + "learning_rate": 0.0008745927398211339, + "loss": 3.2394, + "step": 4865 + }, + { + "epoch": 1.5382610755745083, + "grad_norm": 0.22204800684640813, + "learning_rate": 0.0008742272427591719, + "loss": 3.2851, + "step": 4870 + }, + { + "epoch": 1.5398404801389876, + "grad_norm": 0.17845446024685363, + "learning_rate": 0.0008738612904648279, + "loss": 3.2891, + "step": 4875 + }, + { + "epoch": 1.5414198847034668, + "grad_norm": 0.19052223542574034, + "learning_rate": 0.0008734948833832683, + "loss": 3.1479, + "step": 4880 + }, + { + "epoch": 1.542999289267946, + "grad_norm": 0.18442609056657755, + "learning_rate": 0.0008731280219602127, + "loss": 3.2407, + "step": 4885 + }, + { + "epoch": 1.5445786938324253, + "grad_norm": 0.16536340121177104, + "learning_rate": 0.000872760706641933, + "loss": 3.188, + "step": 4890 + }, + { + "epoch": 1.5461580983969043, + "grad_norm": 0.17738221997727127, + "learning_rate": 0.0008723929378752535, + "loss": 3.2681, + "step": 4895 + }, + { + "epoch": 1.5477375029613836, + "grad_norm": 0.14300527517916653, + "learning_rate": 0.0008720247161075503, + "loss": 3.2121, + "step": 4900 + }, + { + "epoch": 1.5493169075258626, + "grad_norm": 0.12252161093257756, + "learning_rate": 0.0008716560417867503, + "loss": 3.1576, + "step": 4905 + }, + { + "epoch": 1.5508963120903418, + "grad_norm": 0.13646202138242786, + "learning_rate": 0.000871286915361331, + "loss": 3.1993, + "step": 4910 + }, + { + "epoch": 1.552475716654821, + "grad_norm": 0.1450313805287437, + "learning_rate": 0.0008709173372803197, + "loss": 3.324, + "step": 4915 + }, + { + "epoch": 1.5540551212193003, + "grad_norm": 0.20542431408610692, + "learning_rate": 0.0008705473079932935, + "loss": 3.3023, + "step": 4920 + }, + { + "epoch": 1.5556345257837796, + "grad_norm": 0.1663832250588302, + "learning_rate": 0.0008701768279503779, + "loss": 3.2081, + "step": 4925 + }, + { + "epoch": 1.5572139303482588, + "grad_norm": 0.15828209904907997, + "learning_rate": 0.0008698058976022472, + "loss": 3.1474, + "step": 4930 + }, + { + "epoch": 1.5587933349127379, + "grad_norm": 0.13656502814277918, + "learning_rate": 0.0008694345174001228, + "loss": 3.2959, + "step": 4935 + }, + { + "epoch": 1.560372739477217, + "grad_norm": 0.16652595943649606, + "learning_rate": 0.0008690626877957743, + "loss": 3.2222, + "step": 4940 + }, + { + "epoch": 1.5619521440416961, + "grad_norm": 0.15536769823691185, + "learning_rate": 0.0008686904092415173, + "loss": 3.3182, + "step": 4945 + }, + { + "epoch": 1.5635315486061754, + "grad_norm": 0.12893633861479137, + "learning_rate": 0.0008683176821902135, + "loss": 3.1887, + "step": 4950 + }, + { + "epoch": 1.5651109531706546, + "grad_norm": 0.12321457114332846, + "learning_rate": 0.0008679445070952706, + "loss": 3.1422, + "step": 4955 + }, + { + "epoch": 1.5666903577351339, + "grad_norm": 0.1252717364208615, + "learning_rate": 0.0008675708844106407, + "loss": 3.2986, + "step": 4960 + }, + { + "epoch": 1.5682697622996131, + "grad_norm": 0.136179857864261, + "learning_rate": 0.0008671968145908211, + "loss": 3.1559, + "step": 4965 + }, + { + "epoch": 1.5698491668640924, + "grad_norm": 0.1366579774814646, + "learning_rate": 0.0008668222980908526, + "loss": 3.2452, + "step": 4970 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.15866997341356215, + "learning_rate": 0.0008664473353663196, + "loss": 3.1779, + "step": 4975 + }, + { + "epoch": 1.5730079759930506, + "grad_norm": 0.1570624780624178, + "learning_rate": 0.0008660719268733491, + "loss": 3.1761, + "step": 4980 + }, + { + "epoch": 1.5745873805575297, + "grad_norm": 0.16262475628462064, + "learning_rate": 0.0008656960730686101, + "loss": 3.2598, + "step": 4985 + }, + { + "epoch": 1.576166785122009, + "grad_norm": 0.16607449640434582, + "learning_rate": 0.0008653197744093139, + "loss": 3.2792, + "step": 4990 + }, + { + "epoch": 1.5777461896864882, + "grad_norm": 0.16636890691152242, + "learning_rate": 0.0008649430313532127, + "loss": 3.2285, + "step": 4995 + }, + { + "epoch": 1.5793255942509674, + "grad_norm": 0.162552103740573, + "learning_rate": 0.0008645658443585992, + "loss": 3.2509, + "step": 5000 + }, + { + "epoch": 1.5809049988154467, + "grad_norm": 0.19398036376248662, + "learning_rate": 0.0008641882138843062, + "loss": 3.1406, + "step": 5005 + }, + { + "epoch": 1.582484403379926, + "grad_norm": 0.1549550740040521, + "learning_rate": 0.0008638101403897061, + "loss": 3.22, + "step": 5010 + }, + { + "epoch": 1.584063807944405, + "grad_norm": 0.148872550051611, + "learning_rate": 0.0008634316243347099, + "loss": 3.2558, + "step": 5015 + }, + { + "epoch": 1.5856432125088842, + "grad_norm": 0.1304659991294797, + "learning_rate": 0.0008630526661797673, + "loss": 3.1085, + "step": 5020 + }, + { + "epoch": 1.5872226170733632, + "grad_norm": 0.1071200561590807, + "learning_rate": 0.0008626732663858655, + "loss": 3.2129, + "step": 5025 + }, + { + "epoch": 1.5888020216378425, + "grad_norm": 0.1321051614605033, + "learning_rate": 0.0008622934254145291, + "loss": 3.2189, + "step": 5030 + }, + { + "epoch": 1.5903814262023217, + "grad_norm": 0.15059770855107626, + "learning_rate": 0.0008619131437278196, + "loss": 3.2072, + "step": 5035 + }, + { + "epoch": 1.591960830766801, + "grad_norm": 0.15385580828759735, + "learning_rate": 0.0008615324217883341, + "loss": 3.1408, + "step": 5040 + }, + { + "epoch": 1.5935402353312802, + "grad_norm": 0.18895304159790371, + "learning_rate": 0.0008611512600592057, + "loss": 3.2543, + "step": 5045 + }, + { + "epoch": 1.5951196398957594, + "grad_norm": 0.14833800477922005, + "learning_rate": 0.0008607696590041021, + "loss": 3.2304, + "step": 5050 + }, + { + "epoch": 1.5966990444602385, + "grad_norm": 0.1829558575858106, + "learning_rate": 0.0008603876190872257, + "loss": 3.3536, + "step": 5055 + }, + { + "epoch": 1.5982784490247177, + "grad_norm": 0.13784885033258049, + "learning_rate": 0.000860005140773313, + "loss": 3.0908, + "step": 5060 + }, + { + "epoch": 1.5998578535891967, + "grad_norm": 0.13757531043144822, + "learning_rate": 0.0008596222245276329, + "loss": 3.1926, + "step": 5065 + }, + { + "epoch": 1.601437258153676, + "grad_norm": 0.15982659305698876, + "learning_rate": 0.000859238870815988, + "loss": 3.1252, + "step": 5070 + }, + { + "epoch": 1.6030166627181552, + "grad_norm": 0.15781292432563, + "learning_rate": 0.0008588550801047127, + "loss": 3.2267, + "step": 5075 + }, + { + "epoch": 1.6045960672826345, + "grad_norm": 0.13440615536611872, + "learning_rate": 0.0008584708528606728, + "loss": 3.1973, + "step": 5080 + }, + { + "epoch": 1.6061754718471137, + "grad_norm": 0.13180202622515164, + "learning_rate": 0.0008580861895512652, + "loss": 3.2315, + "step": 5085 + }, + { + "epoch": 1.607754876411593, + "grad_norm": 0.13594821623804945, + "learning_rate": 0.0008577010906444174, + "loss": 3.1541, + "step": 5090 + }, + { + "epoch": 1.609334280976072, + "grad_norm": 0.14242110458172097, + "learning_rate": 0.0008573155566085868, + "loss": 3.19, + "step": 5095 + }, + { + "epoch": 1.6109136855405513, + "grad_norm": 0.1402901958946774, + "learning_rate": 0.0008569295879127602, + "loss": 3.2011, + "step": 5100 + }, + { + "epoch": 1.6124930901050303, + "grad_norm": 0.1345845928466279, + "learning_rate": 0.0008565431850264527, + "loss": 3.153, + "step": 5105 + }, + { + "epoch": 1.6140724946695095, + "grad_norm": 0.14299194878908414, + "learning_rate": 0.0008561563484197079, + "loss": 3.1064, + "step": 5110 + }, + { + "epoch": 1.6156518992339888, + "grad_norm": 0.15337008180865508, + "learning_rate": 0.000855769078563097, + "loss": 3.2301, + "step": 5115 + }, + { + "epoch": 1.617231303798468, + "grad_norm": 0.14761041933323335, + "learning_rate": 0.0008553813759277184, + "loss": 3.303, + "step": 5120 + }, + { + "epoch": 1.6188107083629473, + "grad_norm": 0.16842837456466211, + "learning_rate": 0.0008549932409851965, + "loss": 3.0947, + "step": 5125 + }, + { + "epoch": 1.6203901129274265, + "grad_norm": 0.13929666990210726, + "learning_rate": 0.0008546046742076819, + "loss": 3.2187, + "step": 5130 + }, + { + "epoch": 1.6219695174919055, + "grad_norm": 0.14019470888113164, + "learning_rate": 0.0008542156760678504, + "loss": 3.2503, + "step": 5135 + }, + { + "epoch": 1.6235489220563848, + "grad_norm": 0.18987242622039088, + "learning_rate": 0.0008538262470389028, + "loss": 3.2645, + "step": 5140 + }, + { + "epoch": 1.6251283266208638, + "grad_norm": 0.16193318942084498, + "learning_rate": 0.0008534363875945637, + "loss": 3.1828, + "step": 5145 + }, + { + "epoch": 1.626707731185343, + "grad_norm": 0.1619889439692038, + "learning_rate": 0.0008530460982090812, + "loss": 3.2047, + "step": 5150 + }, + { + "epoch": 1.6282871357498223, + "grad_norm": 0.15342965563746738, + "learning_rate": 0.000852655379357227, + "loss": 3.2319, + "step": 5155 + }, + { + "epoch": 1.6298665403143016, + "grad_norm": 0.15132360226394195, + "learning_rate": 0.0008522642315142948, + "loss": 3.2558, + "step": 5160 + }, + { + "epoch": 1.6314459448787808, + "grad_norm": 0.18282985704516308, + "learning_rate": 0.0008518726551560999, + "loss": 3.189, + "step": 5165 + }, + { + "epoch": 1.63302534944326, + "grad_norm": 0.15024580263521556, + "learning_rate": 0.0008514806507589796, + "loss": 3.1789, + "step": 5170 + }, + { + "epoch": 1.634604754007739, + "grad_norm": 0.1669828597076097, + "learning_rate": 0.0008510882187997913, + "loss": 3.204, + "step": 5175 + }, + { + "epoch": 1.6361841585722183, + "grad_norm": 0.12997826381256986, + "learning_rate": 0.0008506953597559124, + "loss": 3.1704, + "step": 5180 + }, + { + "epoch": 1.6377635631366974, + "grad_norm": 0.19046220176512926, + "learning_rate": 0.0008503020741052407, + "loss": 3.1976, + "step": 5185 + }, + { + "epoch": 1.6393429677011766, + "grad_norm": 0.16953626947212094, + "learning_rate": 0.0008499083623261919, + "loss": 3.1796, + "step": 5190 + }, + { + "epoch": 1.6409223722656558, + "grad_norm": 0.15870142335511733, + "learning_rate": 0.0008495142248977007, + "loss": 3.1347, + "step": 5195 + }, + { + "epoch": 1.642501776830135, + "grad_norm": 0.12085159283664512, + "learning_rate": 0.0008491196622992194, + "loss": 3.1646, + "step": 5200 + }, + { + "epoch": 1.6440811813946143, + "grad_norm": 0.1356652151636163, + "learning_rate": 0.0008487246750107176, + "loss": 3.0949, + "step": 5205 + }, + { + "epoch": 1.6456605859590934, + "grad_norm": 0.18844483345578247, + "learning_rate": 0.0008483292635126814, + "loss": 3.2829, + "step": 5210 + }, + { + "epoch": 1.6472399905235726, + "grad_norm": 0.1435244567839457, + "learning_rate": 0.0008479334282861129, + "loss": 3.1473, + "step": 5215 + }, + { + "epoch": 1.6488193950880516, + "grad_norm": 0.18232492906003894, + "learning_rate": 0.0008475371698125297, + "loss": 3.1508, + "step": 5220 + }, + { + "epoch": 1.650398799652531, + "grad_norm": 0.13659700220175816, + "learning_rate": 0.0008471404885739644, + "loss": 3.0982, + "step": 5225 + }, + { + "epoch": 1.6519782042170101, + "grad_norm": 0.15726236702510096, + "learning_rate": 0.0008467433850529639, + "loss": 3.1438, + "step": 5230 + }, + { + "epoch": 1.6535576087814894, + "grad_norm": 0.16050432120825808, + "learning_rate": 0.0008463458597325884, + "loss": 3.2055, + "step": 5235 + }, + { + "epoch": 1.6551370133459686, + "grad_norm": 0.18108242926877352, + "learning_rate": 0.0008459479130964114, + "loss": 3.1179, + "step": 5240 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.13495676091229308, + "learning_rate": 0.0008455495456285193, + "loss": 3.1713, + "step": 5245 + }, + { + "epoch": 1.658295822474927, + "grad_norm": 0.16137588648755863, + "learning_rate": 0.0008451507578135098, + "loss": 3.0861, + "step": 5250 + }, + { + "epoch": 1.6598752270394062, + "grad_norm": 0.14343876409963272, + "learning_rate": 0.0008447515501364924, + "loss": 3.1167, + "step": 5255 + }, + { + "epoch": 1.6614546316038852, + "grad_norm": 0.13626674823775772, + "learning_rate": 0.0008443519230830871, + "loss": 3.1368, + "step": 5260 + }, + { + "epoch": 1.6630340361683644, + "grad_norm": 0.11641914134487123, + "learning_rate": 0.0008439518771394241, + "loss": 3.0619, + "step": 5265 + }, + { + "epoch": 1.6646134407328437, + "grad_norm": 0.1189405426468082, + "learning_rate": 0.0008435514127921431, + "loss": 3.1442, + "step": 5270 + }, + { + "epoch": 1.666192845297323, + "grad_norm": 0.1225580458586459, + "learning_rate": 0.0008431505305283933, + "loss": 3.1163, + "step": 5275 + }, + { + "epoch": 1.6677722498618022, + "grad_norm": 0.12802593143458157, + "learning_rate": 0.0008427492308358313, + "loss": 3.1695, + "step": 5280 + }, + { + "epoch": 1.6693516544262814, + "grad_norm": 0.18123050394278015, + "learning_rate": 0.0008423475142026223, + "loss": 3.0716, + "step": 5285 + }, + { + "epoch": 1.6709310589907604, + "grad_norm": 0.13790304915784055, + "learning_rate": 0.0008419453811174385, + "loss": 3.123, + "step": 5290 + }, + { + "epoch": 1.6725104635552397, + "grad_norm": 0.15913703714357955, + "learning_rate": 0.0008415428320694584, + "loss": 3.0708, + "step": 5295 + }, + { + "epoch": 1.6740898681197187, + "grad_norm": 0.16174700715573553, + "learning_rate": 0.0008411398675483668, + "loss": 3.0799, + "step": 5300 + }, + { + "epoch": 1.675669272684198, + "grad_norm": 0.17404012367261848, + "learning_rate": 0.0008407364880443539, + "loss": 3.1949, + "step": 5305 + }, + { + "epoch": 1.6772486772486772, + "grad_norm": 0.20852493040494388, + "learning_rate": 0.0008403326940481146, + "loss": 3.257, + "step": 5310 + }, + { + "epoch": 1.6788280818131565, + "grad_norm": 0.16777003267594165, + "learning_rate": 0.000839928486050848, + "loss": 3.2414, + "step": 5315 + }, + { + "epoch": 1.6804074863776357, + "grad_norm": 0.12454972106798985, + "learning_rate": 0.0008395238645442569, + "loss": 3.1313, + "step": 5320 + }, + { + "epoch": 1.681986890942115, + "grad_norm": 0.1330984558987137, + "learning_rate": 0.000839118830020547, + "loss": 3.145, + "step": 5325 + }, + { + "epoch": 1.683566295506594, + "grad_norm": 0.3087417482749041, + "learning_rate": 0.0008387133829724266, + "loss": 3.1466, + "step": 5330 + }, + { + "epoch": 1.6851457000710732, + "grad_norm": 0.1763043327279286, + "learning_rate": 0.0008383075238931057, + "loss": 3.1494, + "step": 5335 + }, + { + "epoch": 1.6867251046355523, + "grad_norm": 0.16430100545063897, + "learning_rate": 0.0008379012532762955, + "loss": 3.1457, + "step": 5340 + }, + { + "epoch": 1.6883045092000315, + "grad_norm": 0.13424044681308792, + "learning_rate": 0.0008374945716162079, + "loss": 3.1974, + "step": 5345 + }, + { + "epoch": 1.6898839137645107, + "grad_norm": 0.132656566575011, + "learning_rate": 0.0008370874794075548, + "loss": 3.1854, + "step": 5350 + }, + { + "epoch": 1.69146331832899, + "grad_norm": 0.1046842574599167, + "learning_rate": 0.0008366799771455474, + "loss": 3.0958, + "step": 5355 + }, + { + "epoch": 1.6930427228934692, + "grad_norm": 0.14992551766326231, + "learning_rate": 0.0008362720653258959, + "loss": 3.1479, + "step": 5360 + }, + { + "epoch": 1.6946221274579485, + "grad_norm": 0.13712636288570967, + "learning_rate": 0.0008358637444448085, + "loss": 3.1558, + "step": 5365 + }, + { + "epoch": 1.6962015320224275, + "grad_norm": 0.10402371936092365, + "learning_rate": 0.0008354550149989912, + "loss": 3.089, + "step": 5370 + }, + { + "epoch": 1.6977809365869068, + "grad_norm": 0.11701920328653621, + "learning_rate": 0.0008350458774856469, + "loss": 3.0974, + "step": 5375 + }, + { + "epoch": 1.6993603411513858, + "grad_norm": 0.13248230505359318, + "learning_rate": 0.000834636332402475, + "loss": 3.1612, + "step": 5380 + }, + { + "epoch": 1.700939745715865, + "grad_norm": 0.12477491619072674, + "learning_rate": 0.0008342263802476706, + "loss": 3.075, + "step": 5385 + }, + { + "epoch": 1.7025191502803443, + "grad_norm": 0.12147037604704496, + "learning_rate": 0.0008338160215199239, + "loss": 3.0468, + "step": 5390 + }, + { + "epoch": 1.7040985548448235, + "grad_norm": 0.13270474543121774, + "learning_rate": 0.0008334052567184198, + "loss": 3.1062, + "step": 5395 + }, + { + "epoch": 1.7056779594093028, + "grad_norm": 0.10998238774886308, + "learning_rate": 0.0008329940863428372, + "loss": 3.1386, + "step": 5400 + }, + { + "epoch": 1.707257363973782, + "grad_norm": 0.1515915492300537, + "learning_rate": 0.0008325825108933481, + "loss": 3.0909, + "step": 5405 + }, + { + "epoch": 1.708836768538261, + "grad_norm": 0.15837330357706372, + "learning_rate": 0.0008321705308706178, + "loss": 3.1, + "step": 5410 + }, + { + "epoch": 1.7104161731027403, + "grad_norm": 0.15132099851686953, + "learning_rate": 0.0008317581467758033, + "loss": 3.2848, + "step": 5415 + }, + { + "epoch": 1.7119955776672193, + "grad_norm": 0.12926189843665217, + "learning_rate": 0.0008313453591105533, + "loss": 3.0679, + "step": 5420 + }, + { + "epoch": 1.7135749822316986, + "grad_norm": 0.10577937715798899, + "learning_rate": 0.0008309321683770073, + "loss": 3.0536, + "step": 5425 + }, + { + "epoch": 1.7151543867961778, + "grad_norm": 0.12605393863349898, + "learning_rate": 0.0008305185750777952, + "loss": 3.0875, + "step": 5430 + }, + { + "epoch": 1.716733791360657, + "grad_norm": 0.14760806225530054, + "learning_rate": 0.0008301045797160365, + "loss": 3.1276, + "step": 5435 + }, + { + "epoch": 1.7183131959251363, + "grad_norm": 0.12269117451194803, + "learning_rate": 0.0008296901827953403, + "loss": 3.0501, + "step": 5440 + }, + { + "epoch": 1.7198926004896156, + "grad_norm": 0.22148513141090342, + "learning_rate": 0.0008292753848198034, + "loss": 3.0366, + "step": 5445 + }, + { + "epoch": 1.7214720050540946, + "grad_norm": 0.17485051885453007, + "learning_rate": 0.0008288601862940109, + "loss": 3.1606, + "step": 5450 + }, + { + "epoch": 1.7230514096185738, + "grad_norm": 0.16453403604485145, + "learning_rate": 0.0008284445877230351, + "loss": 3.2031, + "step": 5455 + }, + { + "epoch": 1.7246308141830529, + "grad_norm": 0.11388373614963429, + "learning_rate": 0.000828028589612435, + "loss": 3.0225, + "step": 5460 + }, + { + "epoch": 1.726210218747532, + "grad_norm": 0.15268472172554937, + "learning_rate": 0.0008276121924682556, + "loss": 3.1488, + "step": 5465 + }, + { + "epoch": 1.7277896233120114, + "grad_norm": 0.16123097535937514, + "learning_rate": 0.0008271953967970273, + "loss": 3.0955, + "step": 5470 + }, + { + "epoch": 1.7293690278764906, + "grad_norm": 0.13991029115174697, + "learning_rate": 0.0008267782031057651, + "loss": 3.0341, + "step": 5475 + }, + { + "epoch": 1.7309484324409699, + "grad_norm": 0.12181705811040357, + "learning_rate": 0.0008263606119019684, + "loss": 3.0104, + "step": 5480 + }, + { + "epoch": 1.732527837005449, + "grad_norm": 0.14873853455315397, + "learning_rate": 0.0008259426236936203, + "loss": 3.125, + "step": 5485 + }, + { + "epoch": 1.7341072415699281, + "grad_norm": 0.1491259625850415, + "learning_rate": 0.0008255242389891862, + "loss": 3.2172, + "step": 5490 + }, + { + "epoch": 1.7356866461344074, + "grad_norm": 0.14895531183393362, + "learning_rate": 0.0008251054582976146, + "loss": 3.1077, + "step": 5495 + }, + { + "epoch": 1.7372660506988864, + "grad_norm": 0.12030961427581022, + "learning_rate": 0.0008246862821283353, + "loss": 3.0649, + "step": 5500 + }, + { + "epoch": 1.7388454552633656, + "grad_norm": 0.15564231945017964, + "learning_rate": 0.0008242667109912592, + "loss": 3.1639, + "step": 5505 + }, + { + "epoch": 1.740424859827845, + "grad_norm": 0.1304107603769251, + "learning_rate": 0.0008238467453967778, + "loss": 3.1414, + "step": 5510 + }, + { + "epoch": 1.7420042643923241, + "grad_norm": 0.13419680595114025, + "learning_rate": 0.0008234263858557621, + "loss": 3.0805, + "step": 5515 + }, + { + "epoch": 1.7435836689568034, + "grad_norm": 0.1298847058824958, + "learning_rate": 0.0008230056328795629, + "loss": 3.0763, + "step": 5520 + }, + { + "epoch": 1.7451630735212826, + "grad_norm": 0.15677569802750654, + "learning_rate": 0.000822584486980009, + "loss": 3.0403, + "step": 5525 + }, + { + "epoch": 1.7467424780857617, + "grad_norm": 0.14205010542182986, + "learning_rate": 0.0008221629486694075, + "loss": 3.0167, + "step": 5530 + }, + { + "epoch": 1.748321882650241, + "grad_norm": 0.11323305619202069, + "learning_rate": 0.000821741018460543, + "loss": 3.0397, + "step": 5535 + }, + { + "epoch": 1.74990128721472, + "grad_norm": 0.12933460841485173, + "learning_rate": 0.0008213186968666762, + "loss": 3.0373, + "step": 5540 + }, + { + "epoch": 1.7514806917791992, + "grad_norm": 0.14348811976289397, + "learning_rate": 0.0008208959844015446, + "loss": 3.0593, + "step": 5545 + }, + { + "epoch": 1.7530600963436784, + "grad_norm": 0.13721521690367944, + "learning_rate": 0.000820472881579361, + "loss": 3.0771, + "step": 5550 + }, + { + "epoch": 1.7546395009081577, + "grad_norm": 0.1113321743985448, + "learning_rate": 0.0008200493889148129, + "loss": 3.0933, + "step": 5555 + }, + { + "epoch": 1.756218905472637, + "grad_norm": 0.13039510447204097, + "learning_rate": 0.0008196255069230618, + "loss": 3.1005, + "step": 5560 + }, + { + "epoch": 1.757798310037116, + "grad_norm": 0.1227027362155738, + "learning_rate": 0.0008192012361197434, + "loss": 3.0374, + "step": 5565 + }, + { + "epoch": 1.7593777146015952, + "grad_norm": 0.16124398321310454, + "learning_rate": 0.0008187765770209661, + "loss": 3.1171, + "step": 5570 + }, + { + "epoch": 1.7609571191660742, + "grad_norm": 0.13815148766215551, + "learning_rate": 0.0008183515301433104, + "loss": 3.1015, + "step": 5575 + }, + { + "epoch": 1.7625365237305535, + "grad_norm": 0.12576873501411856, + "learning_rate": 0.0008179260960038287, + "loss": 3.0402, + "step": 5580 + }, + { + "epoch": 1.7641159282950327, + "grad_norm": 0.15638893813279928, + "learning_rate": 0.0008175002751200447, + "loss": 3.1039, + "step": 5585 + }, + { + "epoch": 1.765695332859512, + "grad_norm": 0.18607696690530798, + "learning_rate": 0.0008170740680099519, + "loss": 3.1122, + "step": 5590 + }, + { + "epoch": 1.7672747374239912, + "grad_norm": 0.18841383981828605, + "learning_rate": 0.000816647475192015, + "loss": 3.0368, + "step": 5595 + }, + { + "epoch": 1.7688541419884705, + "grad_norm": 0.14018054519543763, + "learning_rate": 0.0008162204971851662, + "loss": 3.1069, + "step": 5600 + }, + { + "epoch": 1.7704335465529495, + "grad_norm": 0.1281256845789643, + "learning_rate": 0.0008157931345088074, + "loss": 3.1347, + "step": 5605 + }, + { + "epoch": 1.7720129511174287, + "grad_norm": 0.16806814294310318, + "learning_rate": 0.000815365387682808, + "loss": 3.0791, + "step": 5610 + }, + { + "epoch": 1.7735923556819078, + "grad_norm": 0.17461079002782293, + "learning_rate": 0.0008149372572275049, + "loss": 3.0854, + "step": 5615 + }, + { + "epoch": 1.775171760246387, + "grad_norm": 0.14352450743252834, + "learning_rate": 0.0008145087436637013, + "loss": 3.0062, + "step": 5620 + }, + { + "epoch": 1.7767511648108663, + "grad_norm": 0.1467818298017096, + "learning_rate": 0.0008140798475126671, + "loss": 3.0404, + "step": 5625 + }, + { + "epoch": 1.7783305693753455, + "grad_norm": 0.12627519312560972, + "learning_rate": 0.000813650569296137, + "loss": 3.0417, + "step": 5630 + }, + { + "epoch": 1.7799099739398248, + "grad_norm": 0.12415805779148288, + "learning_rate": 0.0008132209095363107, + "loss": 3.1058, + "step": 5635 + }, + { + "epoch": 1.781489378504304, + "grad_norm": 0.12978630067431146, + "learning_rate": 0.000812790868755852, + "loss": 3.0556, + "step": 5640 + }, + { + "epoch": 1.783068783068783, + "grad_norm": 0.12464317543260613, + "learning_rate": 0.0008123604474778881, + "loss": 3.0791, + "step": 5645 + }, + { + "epoch": 1.7846481876332623, + "grad_norm": 0.12344509884685934, + "learning_rate": 0.0008119296462260093, + "loss": 3.1187, + "step": 5650 + }, + { + "epoch": 1.7862275921977413, + "grad_norm": 0.11867693121313405, + "learning_rate": 0.0008114984655242681, + "loss": 3.0343, + "step": 5655 + }, + { + "epoch": 1.7878069967622205, + "grad_norm": 0.10448685844422977, + "learning_rate": 0.0008110669058971783, + "loss": 3.0101, + "step": 5660 + }, + { + "epoch": 1.7893864013266998, + "grad_norm": 0.13700981456911288, + "learning_rate": 0.0008106349678697147, + "loss": 3.1185, + "step": 5665 + }, + { + "epoch": 1.790965805891179, + "grad_norm": 0.1424263583015784, + "learning_rate": 0.0008102026519673127, + "loss": 3.0573, + "step": 5670 + }, + { + "epoch": 1.7925452104556583, + "grad_norm": 0.13955412711666954, + "learning_rate": 0.0008097699587158673, + "loss": 3.0688, + "step": 5675 + }, + { + "epoch": 1.7941246150201375, + "grad_norm": 0.14063694471408714, + "learning_rate": 0.0008093368886417323, + "loss": 3.1464, + "step": 5680 + }, + { + "epoch": 1.7957040195846166, + "grad_norm": 0.14664202038812976, + "learning_rate": 0.0008089034422717199, + "loss": 3.0739, + "step": 5685 + }, + { + "epoch": 1.7972834241490958, + "grad_norm": 0.2132974645825116, + "learning_rate": 0.0008084696201331004, + "loss": 3.1642, + "step": 5690 + }, + { + "epoch": 1.7988628287135748, + "grad_norm": 0.1616522602460956, + "learning_rate": 0.0008080354227536008, + "loss": 3.1808, + "step": 5695 + }, + { + "epoch": 1.800442233278054, + "grad_norm": 0.15325109276749146, + "learning_rate": 0.000807600850661405, + "loss": 3.1289, + "step": 5700 + }, + { + "epoch": 1.8020216378425333, + "grad_norm": 0.10407676475142526, + "learning_rate": 0.000807165904385152, + "loss": 3.1228, + "step": 5705 + }, + { + "epoch": 1.8036010424070126, + "grad_norm": 0.13943636056233583, + "learning_rate": 0.0008067305844539369, + "loss": 3.0581, + "step": 5710 + }, + { + "epoch": 1.8051804469714918, + "grad_norm": 0.13380630428432708, + "learning_rate": 0.0008062948913973087, + "loss": 3.0714, + "step": 5715 + }, + { + "epoch": 1.806759851535971, + "grad_norm": 0.14289546179063847, + "learning_rate": 0.0008058588257452703, + "loss": 3.1077, + "step": 5720 + }, + { + "epoch": 1.80833925610045, + "grad_norm": 0.1576835149035532, + "learning_rate": 0.0008054223880282783, + "loss": 3.0461, + "step": 5725 + }, + { + "epoch": 1.8099186606649293, + "grad_norm": 0.13290375552698708, + "learning_rate": 0.0008049855787772416, + "loss": 3.0308, + "step": 5730 + }, + { + "epoch": 1.8114980652294084, + "grad_norm": 0.11451184645959576, + "learning_rate": 0.0008045483985235207, + "loss": 3.1401, + "step": 5735 + }, + { + "epoch": 1.8130774697938876, + "grad_norm": 0.13278889627957352, + "learning_rate": 0.0008041108477989283, + "loss": 3.0229, + "step": 5740 + }, + { + "epoch": 1.8146568743583669, + "grad_norm": 0.09803195897353165, + "learning_rate": 0.0008036729271357269, + "loss": 3.0012, + "step": 5745 + }, + { + "epoch": 1.8162362789228461, + "grad_norm": 0.20567040452045257, + "learning_rate": 0.0008032346370666296, + "loss": 3.137, + "step": 5750 + }, + { + "epoch": 1.8178156834873254, + "grad_norm": 0.1337600870387908, + "learning_rate": 0.0008027959781247984, + "loss": 3.1482, + "step": 5755 + }, + { + "epoch": 1.8193950880518046, + "grad_norm": 0.16189524285580692, + "learning_rate": 0.0008023569508438444, + "loss": 3.0882, + "step": 5760 + }, + { + "epoch": 1.8209744926162836, + "grad_norm": 0.1358494981547762, + "learning_rate": 0.0008019175557578267, + "loss": 3.1672, + "step": 5765 + }, + { + "epoch": 1.8225538971807629, + "grad_norm": 0.09775298145324521, + "learning_rate": 0.0008014777934012514, + "loss": 3.0299, + "step": 5770 + }, + { + "epoch": 1.824133301745242, + "grad_norm": 0.1512229736528465, + "learning_rate": 0.0008010376643090719, + "loss": 3.0377, + "step": 5775 + }, + { + "epoch": 1.8257127063097212, + "grad_norm": 0.1728295251761587, + "learning_rate": 0.0008005971690166879, + "loss": 3.1011, + "step": 5780 + }, + { + "epoch": 1.8272921108742004, + "grad_norm": 0.1298951885600335, + "learning_rate": 0.0008001563080599437, + "loss": 2.9859, + "step": 5785 + }, + { + "epoch": 1.8288715154386797, + "grad_norm": 0.1209864599778657, + "learning_rate": 0.0007997150819751289, + "loss": 3.0874, + "step": 5790 + }, + { + "epoch": 1.830450920003159, + "grad_norm": 0.11706317164181328, + "learning_rate": 0.0007992734912989776, + "loss": 3.0135, + "step": 5795 + }, + { + "epoch": 1.8320303245676381, + "grad_norm": 0.16201933525966714, + "learning_rate": 0.0007988315365686671, + "loss": 3.1384, + "step": 5800 + }, + { + "epoch": 1.8336097291321172, + "grad_norm": 0.14368527580876594, + "learning_rate": 0.0007983892183218173, + "loss": 3.1312, + "step": 5805 + }, + { + "epoch": 1.8351891336965964, + "grad_norm": 0.15725717882474435, + "learning_rate": 0.0007979465370964904, + "loss": 3.0372, + "step": 5810 + }, + { + "epoch": 1.8367685382610754, + "grad_norm": 0.11697847046530074, + "learning_rate": 0.0007975034934311907, + "loss": 3.0479, + "step": 5815 + }, + { + "epoch": 1.8383479428255547, + "grad_norm": 0.15221307169759646, + "learning_rate": 0.000797060087864863, + "loss": 3.1091, + "step": 5820 + }, + { + "epoch": 1.839927347390034, + "grad_norm": 0.13501337394145765, + "learning_rate": 0.0007966163209368919, + "loss": 3.0373, + "step": 5825 + }, + { + "epoch": 1.8415067519545132, + "grad_norm": 0.1121320850988699, + "learning_rate": 0.0007961721931871023, + "loss": 3.05, + "step": 5830 + }, + { + "epoch": 1.8430861565189924, + "grad_norm": 0.14611915116235857, + "learning_rate": 0.0007957277051557577, + "loss": 3.035, + "step": 5835 + }, + { + "epoch": 1.8446655610834717, + "grad_norm": 0.1219728366948523, + "learning_rate": 0.00079528285738356, + "loss": 3.036, + "step": 5840 + }, + { + "epoch": 1.8462449656479507, + "grad_norm": 0.13072936964709259, + "learning_rate": 0.0007948376504116485, + "loss": 2.98, + "step": 5845 + }, + { + "epoch": 1.84782437021243, + "grad_norm": 0.16005339225680779, + "learning_rate": 0.0007943920847815995, + "loss": 3.1241, + "step": 5850 + }, + { + "epoch": 1.849403774776909, + "grad_norm": 0.11945810348155937, + "learning_rate": 0.0007939461610354258, + "loss": 2.9956, + "step": 5855 + }, + { + "epoch": 1.8509831793413882, + "grad_norm": 0.1697625468945632, + "learning_rate": 0.0007934998797155756, + "loss": 3.0508, + "step": 5860 + }, + { + "epoch": 1.8525625839058675, + "grad_norm": 0.13930841411775904, + "learning_rate": 0.0007930532413649323, + "loss": 3.0427, + "step": 5865 + }, + { + "epoch": 1.8541419884703467, + "grad_norm": 0.13016541216083566, + "learning_rate": 0.0007926062465268133, + "loss": 3.0849, + "step": 5870 + }, + { + "epoch": 1.855721393034826, + "grad_norm": 0.1485512809269021, + "learning_rate": 0.0007921588957449699, + "loss": 3.1645, + "step": 5875 + }, + { + "epoch": 1.8573007975993052, + "grad_norm": 0.11193581480631004, + "learning_rate": 0.0007917111895635864, + "loss": 3.1589, + "step": 5880 + }, + { + "epoch": 1.8588802021637842, + "grad_norm": 0.12341500299150605, + "learning_rate": 0.0007912631285272793, + "loss": 3.0646, + "step": 5885 + }, + { + "epoch": 1.8604596067282635, + "grad_norm": 0.13807686009862188, + "learning_rate": 0.0007908147131810967, + "loss": 2.9419, + "step": 5890 + }, + { + "epoch": 1.8620390112927425, + "grad_norm": 0.14018792900335816, + "learning_rate": 0.000790365944070518, + "loss": 3.1076, + "step": 5895 + }, + { + "epoch": 1.8636184158572218, + "grad_norm": 0.11960987802209332, + "learning_rate": 0.0007899168217414527, + "loss": 3.0158, + "step": 5900 + }, + { + "epoch": 1.865197820421701, + "grad_norm": 0.12847048660999708, + "learning_rate": 0.00078946734674024, + "loss": 3.0331, + "step": 5905 + }, + { + "epoch": 1.8667772249861803, + "grad_norm": 0.12510206818033662, + "learning_rate": 0.0007890175196136483, + "loss": 3.0068, + "step": 5910 + }, + { + "epoch": 1.8683566295506595, + "grad_norm": 0.18285088594823792, + "learning_rate": 0.000788567340908874, + "loss": 3.1459, + "step": 5915 + }, + { + "epoch": 1.8699360341151388, + "grad_norm": 0.1587901264563902, + "learning_rate": 0.0007881168111735416, + "loss": 3.0112, + "step": 5920 + }, + { + "epoch": 1.8715154386796178, + "grad_norm": 0.15136208752104693, + "learning_rate": 0.0007876659309557022, + "loss": 3.1193, + "step": 5925 + }, + { + "epoch": 1.873094843244097, + "grad_norm": 0.11287113938962186, + "learning_rate": 0.0007872147008038335, + "loss": 2.9893, + "step": 5930 + }, + { + "epoch": 1.874674247808576, + "grad_norm": 0.12637222768937448, + "learning_rate": 0.0007867631212668389, + "loss": 2.9688, + "step": 5935 + }, + { + "epoch": 1.8762536523730553, + "grad_norm": 0.09236990218264922, + "learning_rate": 0.0007863111928940465, + "loss": 2.9844, + "step": 5940 + }, + { + "epoch": 1.8778330569375346, + "grad_norm": 0.15341474154776769, + "learning_rate": 0.0007858589162352092, + "loss": 3.0107, + "step": 5945 + }, + { + "epoch": 1.8794124615020138, + "grad_norm": 0.12664771964931754, + "learning_rate": 0.0007854062918405034, + "loss": 3.1368, + "step": 5950 + }, + { + "epoch": 1.880991866066493, + "grad_norm": 0.12353028127660556, + "learning_rate": 0.0007849533202605284, + "loss": 3.0018, + "step": 5955 + }, + { + "epoch": 1.882571270630972, + "grad_norm": 0.11672989079025853, + "learning_rate": 0.0007845000020463058, + "loss": 2.9899, + "step": 5960 + }, + { + "epoch": 1.8841506751954513, + "grad_norm": 0.1319299033565842, + "learning_rate": 0.0007840463377492789, + "loss": 3.0766, + "step": 5965 + }, + { + "epoch": 1.8857300797599303, + "grad_norm": 0.15575784095941145, + "learning_rate": 0.0007835923279213124, + "loss": 3.0619, + "step": 5970 + }, + { + "epoch": 1.8873094843244096, + "grad_norm": 0.1655087702809232, + "learning_rate": 0.0007831379731146907, + "loss": 3.0099, + "step": 5975 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.13155693770891339, + "learning_rate": 0.0007826832738821181, + "loss": 3.0417, + "step": 5980 + }, + { + "epoch": 1.890468293453368, + "grad_norm": 0.13713154280184725, + "learning_rate": 0.0007822282307767182, + "loss": 3.0705, + "step": 5985 + }, + { + "epoch": 1.8920476980178473, + "grad_norm": 0.15739920600311663, + "learning_rate": 0.0007817728443520323, + "loss": 3.0458, + "step": 5990 + }, + { + "epoch": 1.8936271025823266, + "grad_norm": 0.17767884822437002, + "learning_rate": 0.00078131711516202, + "loss": 2.9771, + "step": 5995 + }, + { + "epoch": 1.8952065071468056, + "grad_norm": 0.16179754186795273, + "learning_rate": 0.0007808610437610573, + "loss": 3.0042, + "step": 6000 + }, + { + "epoch": 1.8967859117112849, + "grad_norm": 0.1209774401210003, + "learning_rate": 0.0007804046307039367, + "loss": 3.0069, + "step": 6005 + }, + { + "epoch": 1.8983653162757639, + "grad_norm": 0.14899272955577583, + "learning_rate": 0.0007799478765458664, + "loss": 3.0918, + "step": 6010 + }, + { + "epoch": 1.8999447208402431, + "grad_norm": 0.11215130241642793, + "learning_rate": 0.0007794907818424694, + "loss": 3.0635, + "step": 6015 + }, + { + "epoch": 1.9015241254047224, + "grad_norm": 0.11337944435985729, + "learning_rate": 0.000779033347149783, + "loss": 3.0359, + "step": 6020 + }, + { + "epoch": 1.9031035299692016, + "grad_norm": 0.1501609200848775, + "learning_rate": 0.0007785755730242584, + "loss": 3.1624, + "step": 6025 + }, + { + "epoch": 1.9046829345336809, + "grad_norm": 0.1805145194862832, + "learning_rate": 0.0007781174600227588, + "loss": 3.0417, + "step": 6030 + }, + { + "epoch": 1.9062623390981601, + "grad_norm": 0.1290406672184516, + "learning_rate": 0.0007776590087025608, + "loss": 3.0568, + "step": 6035 + }, + { + "epoch": 1.9078417436626391, + "grad_norm": 0.19345941569155078, + "learning_rate": 0.0007772002196213516, + "loss": 3.1087, + "step": 6040 + }, + { + "epoch": 1.9094211482271184, + "grad_norm": 0.14578283353871807, + "learning_rate": 0.0007767410933372297, + "loss": 3.0578, + "step": 6045 + }, + { + "epoch": 1.9110005527915974, + "grad_norm": 0.1832711194459425, + "learning_rate": 0.0007762816304087042, + "loss": 3.0918, + "step": 6050 + }, + { + "epoch": 1.9125799573560767, + "grad_norm": 0.1243198393896939, + "learning_rate": 0.0007758218313946926, + "loss": 3.055, + "step": 6055 + }, + { + "epoch": 1.914159361920556, + "grad_norm": 0.13805415222462827, + "learning_rate": 0.0007753616968545222, + "loss": 3.1015, + "step": 6060 + }, + { + "epoch": 1.9157387664850352, + "grad_norm": 0.12651242359343307, + "learning_rate": 0.0007749012273479286, + "loss": 3.1397, + "step": 6065 + }, + { + "epoch": 1.9173181710495144, + "grad_norm": 0.12844673711773036, + "learning_rate": 0.0007744404234350535, + "loss": 3.0258, + "step": 6070 + }, + { + "epoch": 1.9188975756139937, + "grad_norm": 0.11747370412271375, + "learning_rate": 0.0007739792856764472, + "loss": 2.9644, + "step": 6075 + }, + { + "epoch": 1.9204769801784727, + "grad_norm": 0.1188768226516703, + "learning_rate": 0.0007735178146330646, + "loss": 3.115, + "step": 6080 + }, + { + "epoch": 1.922056384742952, + "grad_norm": 0.11962149432528865, + "learning_rate": 0.000773056010866267, + "loss": 3.0715, + "step": 6085 + }, + { + "epoch": 1.923635789307431, + "grad_norm": 0.10468046220688224, + "learning_rate": 0.0007725938749378198, + "loss": 2.9093, + "step": 6090 + }, + { + "epoch": 1.9252151938719102, + "grad_norm": 0.13615012693421102, + "learning_rate": 0.0007721314074098932, + "loss": 3.0535, + "step": 6095 + }, + { + "epoch": 1.9267945984363894, + "grad_norm": 0.12332476083489902, + "learning_rate": 0.00077166860884506, + "loss": 3.0133, + "step": 6100 + }, + { + "epoch": 1.9283740030008687, + "grad_norm": 0.1554886230963916, + "learning_rate": 0.0007712054798062961, + "loss": 3.0415, + "step": 6105 + }, + { + "epoch": 1.929953407565348, + "grad_norm": 0.12618747274736528, + "learning_rate": 0.0007707420208569793, + "loss": 2.9687, + "step": 6110 + }, + { + "epoch": 1.9315328121298272, + "grad_norm": 0.11465855503094369, + "learning_rate": 0.0007702782325608891, + "loss": 3.0845, + "step": 6115 + }, + { + "epoch": 1.9331122166943062, + "grad_norm": 0.12760381851757738, + "learning_rate": 0.0007698141154822047, + "loss": 2.9836, + "step": 6120 + }, + { + "epoch": 1.9346916212587855, + "grad_norm": 0.12260016682163877, + "learning_rate": 0.0007693496701855063, + "loss": 3.0311, + "step": 6125 + }, + { + "epoch": 1.9362710258232645, + "grad_norm": 0.12577581561632112, + "learning_rate": 0.0007688848972357729, + "loss": 2.961, + "step": 6130 + }, + { + "epoch": 1.9378504303877437, + "grad_norm": 0.13281076636301217, + "learning_rate": 0.0007684197971983817, + "loss": 2.9881, + "step": 6135 + }, + { + "epoch": 1.939429834952223, + "grad_norm": 0.12485075629114431, + "learning_rate": 0.0007679543706391088, + "loss": 3.1032, + "step": 6140 + }, + { + "epoch": 1.9410092395167022, + "grad_norm": 0.13376906867906233, + "learning_rate": 0.0007674886181241262, + "loss": 3.0929, + "step": 6145 + }, + { + "epoch": 1.9425886440811815, + "grad_norm": 0.15326922072492113, + "learning_rate": 0.0007670225402200037, + "loss": 3.0059, + "step": 6150 + }, + { + "epoch": 1.9441680486456607, + "grad_norm": 0.11350105947209056, + "learning_rate": 0.0007665561374937059, + "loss": 3.0914, + "step": 6155 + }, + { + "epoch": 1.9457474532101398, + "grad_norm": 0.1277185457457496, + "learning_rate": 0.0007660894105125931, + "loss": 3.0533, + "step": 6160 + }, + { + "epoch": 1.947326857774619, + "grad_norm": 0.1185686838397195, + "learning_rate": 0.0007656223598444199, + "loss": 3.0439, + "step": 6165 + }, + { + "epoch": 1.948906262339098, + "grad_norm": 0.14586636312671342, + "learning_rate": 0.0007651549860573346, + "loss": 3.0462, + "step": 6170 + }, + { + "epoch": 1.9504856669035773, + "grad_norm": 0.11716530266799677, + "learning_rate": 0.0007646872897198786, + "loss": 3.0508, + "step": 6175 + }, + { + "epoch": 1.9520650714680565, + "grad_norm": 0.1494750093316875, + "learning_rate": 0.000764219271400986, + "loss": 3.031, + "step": 6180 + }, + { + "epoch": 1.9536444760325358, + "grad_norm": 0.15341195455485246, + "learning_rate": 0.0007637509316699816, + "loss": 3.0286, + "step": 6185 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.1659584685319081, + "learning_rate": 0.0007632822710965826, + "loss": 2.9513, + "step": 6190 + }, + { + "epoch": 1.9568032851614943, + "grad_norm": 0.16952192795742607, + "learning_rate": 0.0007628132902508948, + "loss": 3.0618, + "step": 6195 + }, + { + "epoch": 1.9583826897259733, + "grad_norm": 0.14244345268202113, + "learning_rate": 0.0007623439897034154, + "loss": 2.96, + "step": 6200 + }, + { + "epoch": 1.9599620942904525, + "grad_norm": 0.13742088890521345, + "learning_rate": 0.0007618743700250292, + "loss": 3.0885, + "step": 6205 + }, + { + "epoch": 1.9615414988549316, + "grad_norm": 0.14591090487732308, + "learning_rate": 0.0007614044317870099, + "loss": 3.0204, + "step": 6210 + }, + { + "epoch": 1.9631209034194108, + "grad_norm": 0.1826312750944958, + "learning_rate": 0.0007609341755610181, + "loss": 3.0638, + "step": 6215 + }, + { + "epoch": 1.96470030798389, + "grad_norm": 0.13793312355059578, + "learning_rate": 0.0007604636019191018, + "loss": 2.9864, + "step": 6220 + }, + { + "epoch": 1.9662797125483693, + "grad_norm": 0.1202600309716602, + "learning_rate": 0.0007599927114336947, + "loss": 2.9795, + "step": 6225 + }, + { + "epoch": 1.9678591171128486, + "grad_norm": 0.10381821439492225, + "learning_rate": 0.0007595215046776165, + "loss": 3.0202, + "step": 6230 + }, + { + "epoch": 1.9694385216773278, + "grad_norm": 0.14311736230412642, + "learning_rate": 0.0007590499822240709, + "loss": 3.0741, + "step": 6235 + }, + { + "epoch": 1.9710179262418068, + "grad_norm": 0.13117212913237875, + "learning_rate": 0.0007585781446466463, + "loss": 2.9525, + "step": 6240 + }, + { + "epoch": 1.972597330806286, + "grad_norm": 0.09759008353033644, + "learning_rate": 0.0007581059925193139, + "loss": 2.9701, + "step": 6245 + }, + { + "epoch": 1.974176735370765, + "grad_norm": 0.14359747362895034, + "learning_rate": 0.0007576335264164278, + "loss": 3.0283, + "step": 6250 + }, + { + "epoch": 1.9757561399352443, + "grad_norm": 0.12444813025537488, + "learning_rate": 0.0007571607469127239, + "loss": 3.0474, + "step": 6255 + }, + { + "epoch": 1.9773355444997236, + "grad_norm": 0.13595472912428896, + "learning_rate": 0.0007566876545833197, + "loss": 2.9796, + "step": 6260 + }, + { + "epoch": 1.9789149490642028, + "grad_norm": 0.16565772286811356, + "learning_rate": 0.0007562142500037128, + "loss": 3.0639, + "step": 6265 + }, + { + "epoch": 1.980494353628682, + "grad_norm": 0.1418512246656067, + "learning_rate": 0.0007557405337497809, + "loss": 3.0299, + "step": 6270 + }, + { + "epoch": 1.9820737581931613, + "grad_norm": 0.15195621317984573, + "learning_rate": 0.0007552665063977806, + "loss": 3.0469, + "step": 6275 + }, + { + "epoch": 1.9836531627576404, + "grad_norm": 0.13748368645509432, + "learning_rate": 0.0007547921685243475, + "loss": 3.1035, + "step": 6280 + }, + { + "epoch": 1.9852325673221196, + "grad_norm": 0.1639705916815782, + "learning_rate": 0.0007543175207064941, + "loss": 2.9557, + "step": 6285 + }, + { + "epoch": 1.9868119718865986, + "grad_norm": 0.12493567540744403, + "learning_rate": 0.0007538425635216104, + "loss": 3.048, + "step": 6290 + }, + { + "epoch": 1.9883913764510779, + "grad_norm": 0.11329383002467514, + "learning_rate": 0.000753367297547463, + "loss": 3.0361, + "step": 6295 + }, + { + "epoch": 1.9899707810155571, + "grad_norm": 0.10547167696430386, + "learning_rate": 0.0007528917233621937, + "loss": 2.9355, + "step": 6300 + }, + { + "epoch": 1.9915501855800364, + "grad_norm": 0.10672825007534288, + "learning_rate": 0.0007524158415443192, + "loss": 3.0004, + "step": 6305 + }, + { + "epoch": 1.9931295901445156, + "grad_norm": 0.08754943202048324, + "learning_rate": 0.0007519396526727309, + "loss": 2.9292, + "step": 6310 + }, + { + "epoch": 1.9947089947089947, + "grad_norm": 0.10665404162506625, + "learning_rate": 0.000751463157326693, + "loss": 2.9146, + "step": 6315 + }, + { + "epoch": 1.996288399273474, + "grad_norm": 0.12577526351275167, + "learning_rate": 0.0007509863560858432, + "loss": 3.0294, + "step": 6320 + }, + { + "epoch": 1.997867803837953, + "grad_norm": 0.15679239066245074, + "learning_rate": 0.0007505092495301911, + "loss": 3.0487, + "step": 6325 + }, + { + "epoch": 1.9994472084024322, + "grad_norm": 0.14437984382007163, + "learning_rate": 0.0007500318382401173, + "loss": 3.0537, + "step": 6330 + }, + { + "epoch": 2.0, + "eval_loss": 3.002992630004883, + "eval_runtime": 118.632, + "eval_samples_per_second": 22.33, + "eval_steps_per_second": 5.589, + "step": 6332 + }, + { + "epoch": 2.0009476427386876, + "grad_norm": 0.14353458130909943, + "learning_rate": 0.0007495541227963736, + "loss": 3.0413, + "step": 6335 + }, + { + "epoch": 2.002527047303167, + "grad_norm": 0.1156761857836109, + "learning_rate": 0.0007490761037800815, + "loss": 2.9703, + "step": 6340 + }, + { + "epoch": 2.0041064518676457, + "grad_norm": 0.12923385153054337, + "learning_rate": 0.0007485977817727322, + "loss": 3.0492, + "step": 6345 + }, + { + "epoch": 2.005685856432125, + "grad_norm": 0.15716399707192955, + "learning_rate": 0.0007481191573561849, + "loss": 2.9633, + "step": 6350 + }, + { + "epoch": 2.007265260996604, + "grad_norm": 0.11284991969173797, + "learning_rate": 0.0007476402311126672, + "loss": 2.8979, + "step": 6355 + }, + { + "epoch": 2.0088446655610834, + "grad_norm": 0.10724149749864116, + "learning_rate": 0.0007471610036247732, + "loss": 2.9954, + "step": 6360 + }, + { + "epoch": 2.0104240701255627, + "grad_norm": 0.12801366254983276, + "learning_rate": 0.0007466814754754642, + "loss": 2.9294, + "step": 6365 + }, + { + "epoch": 2.012003474690042, + "grad_norm": 0.10576933472118467, + "learning_rate": 0.0007462016472480667, + "loss": 2.958, + "step": 6370 + }, + { + "epoch": 2.013582879254521, + "grad_norm": 0.13332911256730795, + "learning_rate": 0.0007457215195262726, + "loss": 2.9382, + "step": 6375 + }, + { + "epoch": 2.0151622838190004, + "grad_norm": 0.13607445289009712, + "learning_rate": 0.0007452410928941378, + "loss": 3.1146, + "step": 6380 + }, + { + "epoch": 2.0167416883834792, + "grad_norm": 0.11119726843716685, + "learning_rate": 0.000744760367936082, + "loss": 3.0029, + "step": 6385 + }, + { + "epoch": 2.0183210929479585, + "grad_norm": 0.12559115194392256, + "learning_rate": 0.0007442793452368879, + "loss": 3.0314, + "step": 6390 + }, + { + "epoch": 2.0199004975124377, + "grad_norm": 0.15043200233343293, + "learning_rate": 0.0007437980253817003, + "loss": 3.1284, + "step": 6395 + }, + { + "epoch": 2.021479902076917, + "grad_norm": 0.1408936832559694, + "learning_rate": 0.000743316408956025, + "loss": 3.0146, + "step": 6400 + }, + { + "epoch": 2.023059306641396, + "grad_norm": 0.1522614342957081, + "learning_rate": 0.0007428344965457294, + "loss": 2.9533, + "step": 6405 + }, + { + "epoch": 2.0246387112058755, + "grad_norm": 0.13815257927745242, + "learning_rate": 0.0007423522887370404, + "loss": 3.0147, + "step": 6410 + }, + { + "epoch": 2.0262181157703547, + "grad_norm": 0.13658614002521371, + "learning_rate": 0.0007418697861165444, + "loss": 3.0088, + "step": 6415 + }, + { + "epoch": 2.027797520334834, + "grad_norm": 0.1132770103349406, + "learning_rate": 0.0007413869892711867, + "loss": 3.0153, + "step": 6420 + }, + { + "epoch": 2.0293769248993128, + "grad_norm": 0.11257282557810316, + "learning_rate": 0.0007409038987882697, + "loss": 2.9768, + "step": 6425 + }, + { + "epoch": 2.030956329463792, + "grad_norm": 0.1387413305440576, + "learning_rate": 0.000740420515255454, + "loss": 2.9256, + "step": 6430 + }, + { + "epoch": 2.0325357340282713, + "grad_norm": 0.12733121878072734, + "learning_rate": 0.0007399368392607561, + "loss": 3.0091, + "step": 6435 + }, + { + "epoch": 2.0341151385927505, + "grad_norm": 0.15164433152738804, + "learning_rate": 0.0007394528713925481, + "loss": 3.0224, + "step": 6440 + }, + { + "epoch": 2.0356945431572298, + "grad_norm": 0.17255392989092136, + "learning_rate": 0.0007389686122395579, + "loss": 3.0004, + "step": 6445 + }, + { + "epoch": 2.037273947721709, + "grad_norm": 0.1494347145988662, + "learning_rate": 0.000738484062390867, + "loss": 3.0202, + "step": 6450 + }, + { + "epoch": 2.0388533522861882, + "grad_norm": 0.1286118513427532, + "learning_rate": 0.0007379992224359108, + "loss": 2.9945, + "step": 6455 + }, + { + "epoch": 2.0404327568506675, + "grad_norm": 0.1330159530967352, + "learning_rate": 0.0007375140929644776, + "loss": 2.9584, + "step": 6460 + }, + { + "epoch": 2.0420121614151463, + "grad_norm": 0.10649752052665366, + "learning_rate": 0.000737028674566708, + "loss": 2.9748, + "step": 6465 + }, + { + "epoch": 2.0435915659796255, + "grad_norm": 0.11051724224394212, + "learning_rate": 0.0007365429678330937, + "loss": 2.9452, + "step": 6470 + }, + { + "epoch": 2.045170970544105, + "grad_norm": 0.13499758720764418, + "learning_rate": 0.0007360569733544778, + "loss": 2.9613, + "step": 6475 + }, + { + "epoch": 2.046750375108584, + "grad_norm": 0.1243465553360808, + "learning_rate": 0.0007355706917220524, + "loss": 3.0741, + "step": 6480 + }, + { + "epoch": 2.0483297796730633, + "grad_norm": 0.08486693691641276, + "learning_rate": 0.0007350841235273602, + "loss": 2.9575, + "step": 6485 + }, + { + "epoch": 2.0499091842375425, + "grad_norm": 0.11758078015740667, + "learning_rate": 0.0007345972693622915, + "loss": 3.0356, + "step": 6490 + }, + { + "epoch": 2.051488588802022, + "grad_norm": 0.11659997718476248, + "learning_rate": 0.0007341101298190849, + "loss": 2.9863, + "step": 6495 + }, + { + "epoch": 2.053067993366501, + "grad_norm": 0.09389891185481668, + "learning_rate": 0.0007336227054903258, + "loss": 3.1458, + "step": 6500 + }, + { + "epoch": 2.05464739793098, + "grad_norm": 0.13480016807876233, + "learning_rate": 0.0007331349969689467, + "loss": 3.0026, + "step": 6505 + }, + { + "epoch": 2.056226802495459, + "grad_norm": 0.12055090302454477, + "learning_rate": 0.000732647004848225, + "loss": 2.9464, + "step": 6510 + }, + { + "epoch": 2.0578062070599383, + "grad_norm": 0.1148060412272345, + "learning_rate": 0.0007321587297217838, + "loss": 3.0115, + "step": 6515 + }, + { + "epoch": 2.0593856116244176, + "grad_norm": 0.1215434220029464, + "learning_rate": 0.0007316701721835899, + "loss": 2.9983, + "step": 6520 + }, + { + "epoch": 2.060965016188897, + "grad_norm": 0.14364596094061724, + "learning_rate": 0.000731181332827954, + "loss": 2.9262, + "step": 6525 + }, + { + "epoch": 2.062544420753376, + "grad_norm": 0.10659520229926726, + "learning_rate": 0.0007306922122495295, + "loss": 3.0127, + "step": 6530 + }, + { + "epoch": 2.0641238253178553, + "grad_norm": 0.13323935904303538, + "learning_rate": 0.0007302028110433118, + "loss": 3.1008, + "step": 6535 + }, + { + "epoch": 2.0657032298823346, + "grad_norm": 0.13533198826623102, + "learning_rate": 0.000729713129804638, + "loss": 3.081, + "step": 6540 + }, + { + "epoch": 2.0672826344468134, + "grad_norm": 0.142589431829919, + "learning_rate": 0.0007292231691291854, + "loss": 3.036, + "step": 6545 + }, + { + "epoch": 2.0688620390112926, + "grad_norm": 0.13569810390729722, + "learning_rate": 0.0007287329296129715, + "loss": 2.9285, + "step": 6550 + }, + { + "epoch": 2.070441443575772, + "grad_norm": 0.1155039408631233, + "learning_rate": 0.0007282424118523531, + "loss": 2.94, + "step": 6555 + }, + { + "epoch": 2.072020848140251, + "grad_norm": 0.12295926038581785, + "learning_rate": 0.000727751616444025, + "loss": 3.0605, + "step": 6560 + }, + { + "epoch": 2.0736002527047304, + "grad_norm": 0.12305628092828373, + "learning_rate": 0.0007272605439850205, + "loss": 3.023, + "step": 6565 + }, + { + "epoch": 2.0751796572692096, + "grad_norm": 0.12215737076018068, + "learning_rate": 0.0007267691950727089, + "loss": 3.0601, + "step": 6570 + }, + { + "epoch": 2.076759061833689, + "grad_norm": 0.1276869872243518, + "learning_rate": 0.000726277570304797, + "loss": 2.9532, + "step": 6575 + }, + { + "epoch": 2.078338466398168, + "grad_norm": 0.12334843350229036, + "learning_rate": 0.0007257856702793261, + "loss": 3.0194, + "step": 6580 + }, + { + "epoch": 2.079917870962647, + "grad_norm": 0.13940263746621434, + "learning_rate": 0.0007252934955946732, + "loss": 3.0748, + "step": 6585 + }, + { + "epoch": 2.081497275527126, + "grad_norm": 0.11565673251438623, + "learning_rate": 0.0007248010468495485, + "loss": 2.954, + "step": 6590 + }, + { + "epoch": 2.0830766800916054, + "grad_norm": 0.10806739091445611, + "learning_rate": 0.0007243083246429964, + "loss": 3.0082, + "step": 6595 + }, + { + "epoch": 2.0846560846560847, + "grad_norm": 0.12970993194796246, + "learning_rate": 0.0007238153295743935, + "loss": 2.8832, + "step": 6600 + }, + { + "epoch": 2.086235489220564, + "grad_norm": 0.1147358808877119, + "learning_rate": 0.0007233220622434488, + "loss": 2.9476, + "step": 6605 + }, + { + "epoch": 2.087814893785043, + "grad_norm": 0.1210669420794267, + "learning_rate": 0.0007228285232502015, + "loss": 2.9101, + "step": 6610 + }, + { + "epoch": 2.0893942983495224, + "grad_norm": 0.1606733718489849, + "learning_rate": 0.0007223347131950226, + "loss": 2.9651, + "step": 6615 + }, + { + "epoch": 2.0909737029140016, + "grad_norm": 0.13731530763006436, + "learning_rate": 0.0007218406326786119, + "loss": 3.0062, + "step": 6620 + }, + { + "epoch": 2.0925531074784804, + "grad_norm": 0.10877478662149911, + "learning_rate": 0.0007213462823019983, + "loss": 2.9584, + "step": 6625 + }, + { + "epoch": 2.0941325120429597, + "grad_norm": 0.13160049485588202, + "learning_rate": 0.0007208516626665394, + "loss": 2.9477, + "step": 6630 + }, + { + "epoch": 2.095711916607439, + "grad_norm": 0.1520902274619921, + "learning_rate": 0.0007203567743739198, + "loss": 3.0736, + "step": 6635 + }, + { + "epoch": 2.097291321171918, + "grad_norm": 0.14340603399708876, + "learning_rate": 0.0007198616180261514, + "loss": 2.9981, + "step": 6640 + }, + { + "epoch": 2.0988707257363974, + "grad_norm": 0.13195937834853005, + "learning_rate": 0.0007193661942255722, + "loss": 2.9252, + "step": 6645 + }, + { + "epoch": 2.1004501303008767, + "grad_norm": 0.1421720750596484, + "learning_rate": 0.0007188705035748446, + "loss": 2.9677, + "step": 6650 + }, + { + "epoch": 2.102029534865356, + "grad_norm": 0.11834592868666237, + "learning_rate": 0.0007183745466769572, + "loss": 2.99, + "step": 6655 + }, + { + "epoch": 2.1036089394298347, + "grad_norm": 0.12182266431595128, + "learning_rate": 0.000717878324135221, + "loss": 3.0528, + "step": 6660 + }, + { + "epoch": 2.105188343994314, + "grad_norm": 0.1022121029896845, + "learning_rate": 0.0007173818365532709, + "loss": 3.0244, + "step": 6665 + }, + { + "epoch": 2.1067677485587932, + "grad_norm": 0.1301026443398586, + "learning_rate": 0.0007168850845350642, + "loss": 3.0274, + "step": 6670 + }, + { + "epoch": 2.1083471531232725, + "grad_norm": 0.11957679595275474, + "learning_rate": 0.0007163880686848796, + "loss": 2.9305, + "step": 6675 + }, + { + "epoch": 2.1099265576877517, + "grad_norm": 0.13180237905943235, + "learning_rate": 0.000715890789607317, + "loss": 3.0343, + "step": 6680 + }, + { + "epoch": 2.111505962252231, + "grad_norm": 0.13843354550897705, + "learning_rate": 0.0007153932479072963, + "loss": 2.9075, + "step": 6685 + }, + { + "epoch": 2.11308536681671, + "grad_norm": 0.13166142357617788, + "learning_rate": 0.0007148954441900568, + "loss": 2.9276, + "step": 6690 + }, + { + "epoch": 2.1146647713811895, + "grad_norm": 0.1286711963390611, + "learning_rate": 0.0007143973790611571, + "loss": 2.9668, + "step": 6695 + }, + { + "epoch": 2.1162441759456683, + "grad_norm": 0.10836636344134955, + "learning_rate": 0.000713899053126473, + "loss": 3.054, + "step": 6700 + }, + { + "epoch": 2.1178235805101475, + "grad_norm": 0.10852062206377763, + "learning_rate": 0.0007134004669921983, + "loss": 2.9124, + "step": 6705 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.10533876658435133, + "learning_rate": 0.0007129016212648425, + "loss": 2.9217, + "step": 6710 + }, + { + "epoch": 2.120982389639106, + "grad_norm": 0.14748853715817997, + "learning_rate": 0.0007124025165512318, + "loss": 2.9049, + "step": 6715 + }, + { + "epoch": 2.1225617942035853, + "grad_norm": 0.10497458796252031, + "learning_rate": 0.0007119031534585068, + "loss": 2.9351, + "step": 6720 + }, + { + "epoch": 2.1241411987680645, + "grad_norm": 0.10538082248904294, + "learning_rate": 0.0007114035325941226, + "loss": 3.0758, + "step": 6725 + }, + { + "epoch": 2.1257206033325438, + "grad_norm": 0.12439664097265367, + "learning_rate": 0.0007109036545658478, + "loss": 2.9336, + "step": 6730 + }, + { + "epoch": 2.127300007897023, + "grad_norm": 0.11718790665980608, + "learning_rate": 0.0007104035199817642, + "loss": 2.9641, + "step": 6735 + }, + { + "epoch": 2.128879412461502, + "grad_norm": 0.12660875592728832, + "learning_rate": 0.0007099031294502651, + "loss": 2.987, + "step": 6740 + }, + { + "epoch": 2.130458817025981, + "grad_norm": 0.15718244268313566, + "learning_rate": 0.0007094024835800557, + "loss": 2.9108, + "step": 6745 + }, + { + "epoch": 2.1320382215904603, + "grad_norm": 0.1553720198584762, + "learning_rate": 0.0007089015829801513, + "loss": 3.0578, + "step": 6750 + }, + { + "epoch": 2.1336176261549396, + "grad_norm": 0.11568572734140488, + "learning_rate": 0.0007084004282598774, + "loss": 2.9781, + "step": 6755 + }, + { + "epoch": 2.135197030719419, + "grad_norm": 0.11782283702381052, + "learning_rate": 0.0007078990200288685, + "loss": 2.9762, + "step": 6760 + }, + { + "epoch": 2.136776435283898, + "grad_norm": 0.14707229231637933, + "learning_rate": 0.0007073973588970678, + "loss": 2.9615, + "step": 6765 + }, + { + "epoch": 2.1383558398483773, + "grad_norm": 0.1214209064411245, + "learning_rate": 0.0007068954454747256, + "loss": 2.9987, + "step": 6770 + }, + { + "epoch": 2.1399352444128565, + "grad_norm": 0.12998961493455885, + "learning_rate": 0.0007063932803723996, + "loss": 3.0439, + "step": 6775 + }, + { + "epoch": 2.1415146489773353, + "grad_norm": 0.14708814785711635, + "learning_rate": 0.0007058908642009531, + "loss": 2.8839, + "step": 6780 + }, + { + "epoch": 2.1430940535418146, + "grad_norm": 0.12748564529746748, + "learning_rate": 0.0007053881975715557, + "loss": 2.9489, + "step": 6785 + }, + { + "epoch": 2.144673458106294, + "grad_norm": 0.1321189798945764, + "learning_rate": 0.0007048852810956805, + "loss": 2.9256, + "step": 6790 + }, + { + "epoch": 2.146252862670773, + "grad_norm": 0.1304762482298313, + "learning_rate": 0.0007043821153851057, + "loss": 2.9561, + "step": 6795 + }, + { + "epoch": 2.1478322672352523, + "grad_norm": 0.11486472715401491, + "learning_rate": 0.0007038787010519116, + "loss": 2.9833, + "step": 6800 + }, + { + "epoch": 2.1494116717997316, + "grad_norm": 0.12315872643569079, + "learning_rate": 0.0007033750387084821, + "loss": 2.8684, + "step": 6805 + }, + { + "epoch": 2.150991076364211, + "grad_norm": 0.12142457266007015, + "learning_rate": 0.0007028711289675016, + "loss": 3.0012, + "step": 6810 + }, + { + "epoch": 2.15257048092869, + "grad_norm": 0.12104114201457632, + "learning_rate": 0.0007023669724419563, + "loss": 2.9969, + "step": 6815 + }, + { + "epoch": 2.154149885493169, + "grad_norm": 0.11986057914479814, + "learning_rate": 0.0007018625697451326, + "loss": 2.9424, + "step": 6820 + }, + { + "epoch": 2.155729290057648, + "grad_norm": 0.11076847146953693, + "learning_rate": 0.0007013579214906155, + "loss": 2.9856, + "step": 6825 + }, + { + "epoch": 2.1573086946221274, + "grad_norm": 0.10961247367316544, + "learning_rate": 0.0007008530282922896, + "loss": 2.9276, + "step": 6830 + }, + { + "epoch": 2.1588880991866066, + "grad_norm": 0.13011644057314592, + "learning_rate": 0.0007003478907643372, + "loss": 2.9269, + "step": 6835 + }, + { + "epoch": 2.160467503751086, + "grad_norm": 0.1166701938575446, + "learning_rate": 0.0006998425095212377, + "loss": 2.9917, + "step": 6840 + }, + { + "epoch": 2.162046908315565, + "grad_norm": 0.1456926577037329, + "learning_rate": 0.0006993368851777671, + "loss": 2.9533, + "step": 6845 + }, + { + "epoch": 2.1636263128800444, + "grad_norm": 0.1146375820891622, + "learning_rate": 0.0006988310183489968, + "loss": 2.8984, + "step": 6850 + }, + { + "epoch": 2.1652057174445236, + "grad_norm": 0.11696640157874294, + "learning_rate": 0.000698324909650294, + "loss": 2.9509, + "step": 6855 + }, + { + "epoch": 2.1667851220090024, + "grad_norm": 0.11263288404211443, + "learning_rate": 0.0006978185596973191, + "loss": 2.9958, + "step": 6860 + }, + { + "epoch": 2.1683645265734817, + "grad_norm": 0.10937990573667906, + "learning_rate": 0.0006973119691060266, + "loss": 3.0493, + "step": 6865 + }, + { + "epoch": 2.169943931137961, + "grad_norm": 0.12060921543059229, + "learning_rate": 0.0006968051384926634, + "loss": 3.0304, + "step": 6870 + }, + { + "epoch": 2.17152333570244, + "grad_norm": 0.13499039916380462, + "learning_rate": 0.0006962980684737688, + "loss": 2.9113, + "step": 6875 + }, + { + "epoch": 2.1731027402669194, + "grad_norm": 0.12796690809536898, + "learning_rate": 0.0006957907596661729, + "loss": 3.056, + "step": 6880 + }, + { + "epoch": 2.1746821448313987, + "grad_norm": 0.11823051242354714, + "learning_rate": 0.0006952832126869966, + "loss": 3.0357, + "step": 6885 + }, + { + "epoch": 2.176261549395878, + "grad_norm": 0.15676859394646797, + "learning_rate": 0.0006947754281536502, + "loss": 2.9343, + "step": 6890 + }, + { + "epoch": 2.177840953960357, + "grad_norm": 0.12703771956860213, + "learning_rate": 0.0006942674066838332, + "loss": 2.9846, + "step": 6895 + }, + { + "epoch": 2.179420358524836, + "grad_norm": 0.09810456905197461, + "learning_rate": 0.0006937591488955334, + "loss": 2.9459, + "step": 6900 + }, + { + "epoch": 2.180999763089315, + "grad_norm": 0.13643249438798305, + "learning_rate": 0.0006932506554070259, + "loss": 3.0622, + "step": 6905 + }, + { + "epoch": 2.1825791676537944, + "grad_norm": 0.15175111503072894, + "learning_rate": 0.0006927419268368726, + "loss": 2.9585, + "step": 6910 + }, + { + "epoch": 2.1841585722182737, + "grad_norm": 0.16395519307966172, + "learning_rate": 0.0006922329638039211, + "loss": 2.9912, + "step": 6915 + }, + { + "epoch": 2.185737976782753, + "grad_norm": 0.16750759783597607, + "learning_rate": 0.0006917237669273046, + "loss": 2.9087, + "step": 6920 + }, + { + "epoch": 2.187317381347232, + "grad_norm": 0.12715403500537467, + "learning_rate": 0.0006912143368264408, + "loss": 2.8529, + "step": 6925 + }, + { + "epoch": 2.1888967859117114, + "grad_norm": 0.10608221046373563, + "learning_rate": 0.0006907046741210308, + "loss": 2.8785, + "step": 6930 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.1208868677965757, + "learning_rate": 0.0006901947794310583, + "loss": 2.9761, + "step": 6935 + }, + { + "epoch": 2.1920555950406695, + "grad_norm": 0.12530056311098328, + "learning_rate": 0.0006896846533767906, + "loss": 2.9773, + "step": 6940 + }, + { + "epoch": 2.1936349996051487, + "grad_norm": 0.1154836339364008, + "learning_rate": 0.0006891742965787746, + "loss": 3.018, + "step": 6945 + }, + { + "epoch": 2.195214404169628, + "grad_norm": 0.10701635685168409, + "learning_rate": 0.0006886637096578394, + "loss": 3.0047, + "step": 6950 + }, + { + "epoch": 2.1967938087341072, + "grad_norm": 0.13623213124777278, + "learning_rate": 0.0006881528932350932, + "loss": 2.9208, + "step": 6955 + }, + { + "epoch": 2.1983732132985865, + "grad_norm": 0.10290165801688178, + "learning_rate": 0.0006876418479319238, + "loss": 2.932, + "step": 6960 + }, + { + "epoch": 2.1999526178630657, + "grad_norm": 0.10699922828371627, + "learning_rate": 0.000687130574369997, + "loss": 2.9233, + "step": 6965 + }, + { + "epoch": 2.201532022427545, + "grad_norm": 0.1695041583210203, + "learning_rate": 0.0006866190731712566, + "loss": 2.939, + "step": 6970 + }, + { + "epoch": 2.203111426992024, + "grad_norm": 0.13247946534689092, + "learning_rate": 0.0006861073449579233, + "loss": 3.0769, + "step": 6975 + }, + { + "epoch": 2.204690831556503, + "grad_norm": 0.12083281516035982, + "learning_rate": 0.0006855953903524939, + "loss": 2.8871, + "step": 6980 + }, + { + "epoch": 2.2062702361209823, + "grad_norm": 0.14866175804793844, + "learning_rate": 0.0006850832099777404, + "loss": 3.1146, + "step": 6985 + }, + { + "epoch": 2.2078496406854615, + "grad_norm": 0.1497492816535961, + "learning_rate": 0.0006845708044567099, + "loss": 2.9031, + "step": 6990 + }, + { + "epoch": 2.2094290452499408, + "grad_norm": 0.1421570127106526, + "learning_rate": 0.0006840581744127227, + "loss": 3.0816, + "step": 6995 + }, + { + "epoch": 2.21100844981442, + "grad_norm": 0.13829763776583967, + "learning_rate": 0.0006835453204693732, + "loss": 2.8834, + "step": 7000 + }, + { + "epoch": 2.2125878543788993, + "grad_norm": 0.12307842482923538, + "learning_rate": 0.000683032243250527, + "loss": 3.1174, + "step": 7005 + }, + { + "epoch": 2.2141672589433785, + "grad_norm": 0.16581956371172818, + "learning_rate": 0.0006825189433803222, + "loss": 2.9609, + "step": 7010 + }, + { + "epoch": 2.2157466635078578, + "grad_norm": 0.10833612483320888, + "learning_rate": 0.0006820054214831673, + "loss": 2.8756, + "step": 7015 + }, + { + "epoch": 2.2173260680723366, + "grad_norm": 0.11521930594607378, + "learning_rate": 0.0006814916781837413, + "loss": 2.9566, + "step": 7020 + }, + { + "epoch": 2.218905472636816, + "grad_norm": 0.11063510835048242, + "learning_rate": 0.0006809777141069917, + "loss": 2.8834, + "step": 7025 + }, + { + "epoch": 2.220484877201295, + "grad_norm": 0.13148973707112352, + "learning_rate": 0.0006804635298781358, + "loss": 2.9195, + "step": 7030 + }, + { + "epoch": 2.2220642817657743, + "grad_norm": 0.11620455696252242, + "learning_rate": 0.0006799491261226574, + "loss": 2.9167, + "step": 7035 + }, + { + "epoch": 2.2236436863302536, + "grad_norm": 0.1396608974442779, + "learning_rate": 0.0006794345034663084, + "loss": 2.9995, + "step": 7040 + }, + { + "epoch": 2.225223090894733, + "grad_norm": 0.10532649173801359, + "learning_rate": 0.0006789196625351064, + "loss": 2.9799, + "step": 7045 + }, + { + "epoch": 2.226802495459212, + "grad_norm": 0.1322420670483448, + "learning_rate": 0.0006784046039553346, + "loss": 3.0018, + "step": 7050 + }, + { + "epoch": 2.228381900023691, + "grad_norm": 0.13136086274387426, + "learning_rate": 0.0006778893283535411, + "loss": 2.9256, + "step": 7055 + }, + { + "epoch": 2.22996130458817, + "grad_norm": 0.10244743453725035, + "learning_rate": 0.0006773738363565381, + "loss": 2.967, + "step": 7060 + }, + { + "epoch": 2.2315407091526493, + "grad_norm": 0.13231108581539203, + "learning_rate": 0.0006768581285914006, + "loss": 2.8923, + "step": 7065 + }, + { + "epoch": 2.2331201137171286, + "grad_norm": 0.10492552753211626, + "learning_rate": 0.0006763422056854665, + "loss": 2.9883, + "step": 7070 + }, + { + "epoch": 2.234699518281608, + "grad_norm": 0.12023814021868094, + "learning_rate": 0.0006758260682663351, + "loss": 3.0296, + "step": 7075 + }, + { + "epoch": 2.236278922846087, + "grad_norm": 0.11611685386311155, + "learning_rate": 0.0006753097169618672, + "loss": 3.0089, + "step": 7080 + }, + { + "epoch": 2.2378583274105663, + "grad_norm": 0.10790971144102764, + "learning_rate": 0.0006747931524001829, + "loss": 3.0234, + "step": 7085 + }, + { + "epoch": 2.2394377319750456, + "grad_norm": 0.13591696362025332, + "learning_rate": 0.0006742763752096624, + "loss": 2.9541, + "step": 7090 + }, + { + "epoch": 2.241017136539525, + "grad_norm": 0.170771413477413, + "learning_rate": 0.0006737593860189444, + "loss": 2.9583, + "step": 7095 + }, + { + "epoch": 2.2425965411040036, + "grad_norm": 0.1171310887058597, + "learning_rate": 0.0006732421854569254, + "loss": 2.9543, + "step": 7100 + }, + { + "epoch": 2.244175945668483, + "grad_norm": 0.11847176351347768, + "learning_rate": 0.0006727247741527591, + "loss": 2.9623, + "step": 7105 + }, + { + "epoch": 2.245755350232962, + "grad_norm": 0.118122754884454, + "learning_rate": 0.0006722071527358556, + "loss": 2.9223, + "step": 7110 + }, + { + "epoch": 2.2473347547974414, + "grad_norm": 0.17184169265760627, + "learning_rate": 0.0006716893218358803, + "loss": 2.95, + "step": 7115 + }, + { + "epoch": 2.2489141593619206, + "grad_norm": 0.12136410495299987, + "learning_rate": 0.0006711712820827538, + "loss": 2.9878, + "step": 7120 + }, + { + "epoch": 2.2504935639264, + "grad_norm": 0.15124997481005967, + "learning_rate": 0.0006706530341066506, + "loss": 3.0159, + "step": 7125 + }, + { + "epoch": 2.252072968490879, + "grad_norm": 0.11705380254761119, + "learning_rate": 0.0006701345785379986, + "loss": 2.9411, + "step": 7130 + }, + { + "epoch": 2.253652373055358, + "grad_norm": 0.13077135432868156, + "learning_rate": 0.0006696159160074779, + "loss": 2.9931, + "step": 7135 + }, + { + "epoch": 2.255231777619837, + "grad_norm": 0.10159403802430324, + "learning_rate": 0.0006690970471460209, + "loss": 2.9222, + "step": 7140 + }, + { + "epoch": 2.2568111821843164, + "grad_norm": 0.11091520440334587, + "learning_rate": 0.0006685779725848105, + "loss": 2.9502, + "step": 7145 + }, + { + "epoch": 2.2583905867487957, + "grad_norm": 0.10768357345688996, + "learning_rate": 0.00066805869295528, + "loss": 2.9359, + "step": 7150 + }, + { + "epoch": 2.259969991313275, + "grad_norm": 0.11998163379469538, + "learning_rate": 0.0006675392088891123, + "loss": 2.8578, + "step": 7155 + }, + { + "epoch": 2.261549395877754, + "grad_norm": 0.13277969388364058, + "learning_rate": 0.0006670195210182388, + "loss": 2.9115, + "step": 7160 + }, + { + "epoch": 2.2631288004422334, + "grad_norm": 0.1755182491420534, + "learning_rate": 0.0006664996299748387, + "loss": 2.9665, + "step": 7165 + }, + { + "epoch": 2.2647082050067127, + "grad_norm": 0.120479369705775, + "learning_rate": 0.0006659795363913389, + "loss": 2.8196, + "step": 7170 + }, + { + "epoch": 2.266287609571192, + "grad_norm": 0.1472740095762117, + "learning_rate": 0.000665459240900412, + "loss": 2.9059, + "step": 7175 + }, + { + "epoch": 2.2678670141356707, + "grad_norm": 0.1403086377292554, + "learning_rate": 0.0006649387441349766, + "loss": 2.9567, + "step": 7180 + }, + { + "epoch": 2.26944641870015, + "grad_norm": 0.11904198908085319, + "learning_rate": 0.0006644180467281962, + "loss": 2.9466, + "step": 7185 + }, + { + "epoch": 2.271025823264629, + "grad_norm": 0.0970900038761183, + "learning_rate": 0.0006638971493134782, + "loss": 2.8502, + "step": 7190 + }, + { + "epoch": 2.2726052278291085, + "grad_norm": 0.11915325203464326, + "learning_rate": 0.0006633760525244733, + "loss": 2.9136, + "step": 7195 + }, + { + "epoch": 2.2741846323935877, + "grad_norm": 0.15596003187860796, + "learning_rate": 0.0006628547569950749, + "loss": 2.9413, + "step": 7200 + }, + { + "epoch": 2.275764036958067, + "grad_norm": 0.14361350912840504, + "learning_rate": 0.0006623332633594176, + "loss": 2.9854, + "step": 7205 + }, + { + "epoch": 2.2773434415225458, + "grad_norm": 0.1387390104010181, + "learning_rate": 0.0006618115722518779, + "loss": 2.9165, + "step": 7210 + }, + { + "epoch": 2.278922846087025, + "grad_norm": 0.12164058631463727, + "learning_rate": 0.000661289684307072, + "loss": 2.9664, + "step": 7215 + }, + { + "epoch": 2.2805022506515042, + "grad_norm": 0.1327686193020361, + "learning_rate": 0.0006607676001598552, + "loss": 2.8931, + "step": 7220 + }, + { + "epoch": 2.2820816552159835, + "grad_norm": 0.09739735935094503, + "learning_rate": 0.0006602453204453222, + "loss": 2.8827, + "step": 7225 + }, + { + "epoch": 2.2836610597804627, + "grad_norm": 0.112259704220055, + "learning_rate": 0.0006597228457988053, + "loss": 2.9629, + "step": 7230 + }, + { + "epoch": 2.285240464344942, + "grad_norm": 0.11511371392998275, + "learning_rate": 0.0006592001768558737, + "loss": 2.9948, + "step": 7235 + }, + { + "epoch": 2.2868198689094212, + "grad_norm": 0.12295746827521585, + "learning_rate": 0.000658677314252333, + "loss": 2.9594, + "step": 7240 + }, + { + "epoch": 2.2883992734739005, + "grad_norm": 0.10003248918300281, + "learning_rate": 0.0006581542586242251, + "loss": 3.0445, + "step": 7245 + }, + { + "epoch": 2.2899786780383797, + "grad_norm": 0.13571954683629917, + "learning_rate": 0.0006576310106078255, + "loss": 2.9247, + "step": 7250 + }, + { + "epoch": 2.2915580826028585, + "grad_norm": 0.10098824916493351, + "learning_rate": 0.0006571075708396445, + "loss": 2.895, + "step": 7255 + }, + { + "epoch": 2.293137487167338, + "grad_norm": 0.11393288835907761, + "learning_rate": 0.0006565839399564257, + "loss": 2.9283, + "step": 7260 + }, + { + "epoch": 2.294716891731817, + "grad_norm": 0.1335459803003351, + "learning_rate": 0.000656060118595145, + "loss": 2.9307, + "step": 7265 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.10505068778995834, + "learning_rate": 0.0006555361073930097, + "loss": 2.957, + "step": 7270 + }, + { + "epoch": 2.2978757008607755, + "grad_norm": 0.12431999416466642, + "learning_rate": 0.0006550119069874587, + "loss": 3.0146, + "step": 7275 + }, + { + "epoch": 2.2994551054252548, + "grad_norm": 0.13026304770658859, + "learning_rate": 0.0006544875180161605, + "loss": 2.8594, + "step": 7280 + }, + { + "epoch": 2.301034509989734, + "grad_norm": 0.16656542361563256, + "learning_rate": 0.0006539629411170133, + "loss": 2.8679, + "step": 7285 + }, + { + "epoch": 2.302613914554213, + "grad_norm": 0.12166231365670094, + "learning_rate": 0.0006534381769281437, + "loss": 2.9285, + "step": 7290 + }, + { + "epoch": 2.304193319118692, + "grad_norm": 0.14190949851348844, + "learning_rate": 0.0006529132260879062, + "loss": 2.8372, + "step": 7295 + }, + { + "epoch": 2.3057727236831713, + "grad_norm": 0.11518608507196358, + "learning_rate": 0.0006523880892348823, + "loss": 2.9607, + "step": 7300 + }, + { + "epoch": 2.3073521282476506, + "grad_norm": 0.15679921245514394, + "learning_rate": 0.0006518627670078802, + "loss": 2.9361, + "step": 7305 + }, + { + "epoch": 2.30893153281213, + "grad_norm": 0.12548306813713037, + "learning_rate": 0.0006513372600459329, + "loss": 3.0122, + "step": 7310 + }, + { + "epoch": 2.310510937376609, + "grad_norm": 0.1211400233474286, + "learning_rate": 0.0006508115689882985, + "loss": 2.8933, + "step": 7315 + }, + { + "epoch": 2.3120903419410883, + "grad_norm": 0.15006322078050777, + "learning_rate": 0.0006502856944744593, + "loss": 2.878, + "step": 7320 + }, + { + "epoch": 2.3136697465055676, + "grad_norm": 0.1319367220228824, + "learning_rate": 0.0006497596371441202, + "loss": 3.0529, + "step": 7325 + }, + { + "epoch": 2.315249151070047, + "grad_norm": 0.13369116216281307, + "learning_rate": 0.0006492333976372089, + "loss": 2.8999, + "step": 7330 + }, + { + "epoch": 2.3168285556345256, + "grad_norm": 0.10719606492938008, + "learning_rate": 0.0006487069765938744, + "loss": 2.9576, + "step": 7335 + }, + { + "epoch": 2.318407960199005, + "grad_norm": 0.1366395020043834, + "learning_rate": 0.000648180374654487, + "loss": 2.8916, + "step": 7340 + }, + { + "epoch": 2.319987364763484, + "grad_norm": 0.16736555472396292, + "learning_rate": 0.0006476535924596365, + "loss": 2.9229, + "step": 7345 + }, + { + "epoch": 2.3215667693279634, + "grad_norm": 0.10854961163919274, + "learning_rate": 0.0006471266306501324, + "loss": 2.9666, + "step": 7350 + }, + { + "epoch": 2.3231461738924426, + "grad_norm": 0.11617884785772647, + "learning_rate": 0.0006465994898670027, + "loss": 2.8805, + "step": 7355 + }, + { + "epoch": 2.324725578456922, + "grad_norm": 0.11730131395930449, + "learning_rate": 0.0006460721707514926, + "loss": 2.9296, + "step": 7360 + }, + { + "epoch": 2.326304983021401, + "grad_norm": 0.1218065285616196, + "learning_rate": 0.0006455446739450648, + "loss": 2.9287, + "step": 7365 + }, + { + "epoch": 2.32788438758588, + "grad_norm": 0.15276746108572395, + "learning_rate": 0.0006450170000893978, + "loss": 3.0416, + "step": 7370 + }, + { + "epoch": 2.329463792150359, + "grad_norm": 0.10988728730323323, + "learning_rate": 0.0006444891498263858, + "loss": 2.8179, + "step": 7375 + }, + { + "epoch": 2.3310431967148384, + "grad_norm": 0.1375312448469524, + "learning_rate": 0.0006439611237981373, + "loss": 2.8473, + "step": 7380 + }, + { + "epoch": 2.3326226012793176, + "grad_norm": 0.12200442362075764, + "learning_rate": 0.0006434329226469747, + "loss": 2.9678, + "step": 7385 + }, + { + "epoch": 2.334202005843797, + "grad_norm": 0.14015923236286695, + "learning_rate": 0.0006429045470154333, + "loss": 2.824, + "step": 7390 + }, + { + "epoch": 2.335781410408276, + "grad_norm": 0.13578554709350246, + "learning_rate": 0.0006423759975462611, + "loss": 2.8641, + "step": 7395 + }, + { + "epoch": 2.3373608149727554, + "grad_norm": 0.127381218523229, + "learning_rate": 0.0006418472748824171, + "loss": 2.8984, + "step": 7400 + }, + { + "epoch": 2.3389402195372346, + "grad_norm": 0.14646884176277938, + "learning_rate": 0.0006413183796670713, + "loss": 2.91, + "step": 7405 + }, + { + "epoch": 2.340519624101714, + "grad_norm": 0.13366509332281643, + "learning_rate": 0.0006407893125436031, + "loss": 2.9062, + "step": 7410 + }, + { + "epoch": 2.3420990286661927, + "grad_norm": 0.10031643403058177, + "learning_rate": 0.0006402600741556017, + "loss": 2.8832, + "step": 7415 + }, + { + "epoch": 2.343678433230672, + "grad_norm": 0.11108377238819567, + "learning_rate": 0.000639730665146864, + "loss": 2.9613, + "step": 7420 + }, + { + "epoch": 2.345257837795151, + "grad_norm": 0.12177336184748958, + "learning_rate": 0.0006392010861613951, + "loss": 3.1079, + "step": 7425 + }, + { + "epoch": 2.3468372423596304, + "grad_norm": 0.12803330627164533, + "learning_rate": 0.0006386713378434064, + "loss": 2.9418, + "step": 7430 + }, + { + "epoch": 2.3484166469241097, + "grad_norm": 0.09879197837614655, + "learning_rate": 0.0006381414208373151, + "loss": 2.9569, + "step": 7435 + }, + { + "epoch": 2.349996051488589, + "grad_norm": 0.09770351588664293, + "learning_rate": 0.0006376113357877445, + "loss": 2.9496, + "step": 7440 + }, + { + "epoch": 2.351575456053068, + "grad_norm": 0.11020473572258073, + "learning_rate": 0.0006370810833395213, + "loss": 2.8809, + "step": 7445 + }, + { + "epoch": 2.353154860617547, + "grad_norm": 0.10817793980080924, + "learning_rate": 0.0006365506641376761, + "loss": 2.9405, + "step": 7450 + }, + { + "epoch": 2.354734265182026, + "grad_norm": 0.11829638710718517, + "learning_rate": 0.0006360200788274433, + "loss": 2.8903, + "step": 7455 + }, + { + "epoch": 2.3563136697465055, + "grad_norm": 0.12122531438161264, + "learning_rate": 0.0006354893280542576, + "loss": 2.8649, + "step": 7460 + }, + { + "epoch": 2.3578930743109847, + "grad_norm": 0.1423364379940213, + "learning_rate": 0.0006349584124637568, + "loss": 2.8839, + "step": 7465 + }, + { + "epoch": 2.359472478875464, + "grad_norm": 0.1271453181188999, + "learning_rate": 0.0006344273327017778, + "loss": 2.9494, + "step": 7470 + }, + { + "epoch": 2.361051883439943, + "grad_norm": 0.11010006128669661, + "learning_rate": 0.000633896089414358, + "loss": 2.8252, + "step": 7475 + }, + { + "epoch": 2.3626312880044225, + "grad_norm": 0.12626868323608306, + "learning_rate": 0.0006333646832477333, + "loss": 2.9374, + "step": 7480 + }, + { + "epoch": 2.3642106925689017, + "grad_norm": 0.098916551615853, + "learning_rate": 0.0006328331148483382, + "loss": 2.8684, + "step": 7485 + }, + { + "epoch": 2.365790097133381, + "grad_norm": 0.1135466487223654, + "learning_rate": 0.000632301384862804, + "loss": 2.8282, + "step": 7490 + }, + { + "epoch": 2.3673695016978598, + "grad_norm": 0.11200927537885891, + "learning_rate": 0.000631769493937959, + "loss": 2.8443, + "step": 7495 + }, + { + "epoch": 2.368948906262339, + "grad_norm": 0.0964859433872292, + "learning_rate": 0.000631237442720827, + "loss": 2.8649, + "step": 7500 + }, + { + "epoch": 2.3705283108268183, + "grad_norm": 0.11815232292417185, + "learning_rate": 0.0006307052318586271, + "loss": 2.8965, + "step": 7505 + }, + { + "epoch": 2.3721077153912975, + "grad_norm": 0.10005956623057873, + "learning_rate": 0.0006301728619987721, + "loss": 2.9636, + "step": 7510 + }, + { + "epoch": 2.3736871199557767, + "grad_norm": 0.13347167160379853, + "learning_rate": 0.0006296403337888686, + "loss": 2.904, + "step": 7515 + }, + { + "epoch": 2.375266524520256, + "grad_norm": 0.12765953630479895, + "learning_rate": 0.0006291076478767159, + "loss": 2.9464, + "step": 7520 + }, + { + "epoch": 2.3768459290847352, + "grad_norm": 0.13646462853539004, + "learning_rate": 0.0006285748049103049, + "loss": 2.9153, + "step": 7525 + }, + { + "epoch": 2.378425333649214, + "grad_norm": 0.13430146770002008, + "learning_rate": 0.0006280418055378174, + "loss": 2.8988, + "step": 7530 + }, + { + "epoch": 2.3800047382136933, + "grad_norm": 0.11154960124006869, + "learning_rate": 0.0006275086504076261, + "loss": 2.8967, + "step": 7535 + }, + { + "epoch": 2.3815841427781725, + "grad_norm": 0.11396262290888928, + "learning_rate": 0.0006269753401682924, + "loss": 2.9453, + "step": 7540 + }, + { + "epoch": 2.383163547342652, + "grad_norm": 0.11939458362440651, + "learning_rate": 0.000626441875468567, + "loss": 2.9251, + "step": 7545 + }, + { + "epoch": 2.384742951907131, + "grad_norm": 0.13204138406120308, + "learning_rate": 0.000625908256957388, + "loss": 2.8991, + "step": 7550 + }, + { + "epoch": 2.3863223564716103, + "grad_norm": 0.09940860549838453, + "learning_rate": 0.000625374485283881, + "loss": 2.9121, + "step": 7555 + }, + { + "epoch": 2.3879017610360895, + "grad_norm": 0.12371146173838096, + "learning_rate": 0.0006248405610973579, + "loss": 2.8934, + "step": 7560 + }, + { + "epoch": 2.389481165600569, + "grad_norm": 0.10512694700874209, + "learning_rate": 0.0006243064850473157, + "loss": 2.9774, + "step": 7565 + }, + { + "epoch": 2.3910605701650476, + "grad_norm": 0.117644926009178, + "learning_rate": 0.0006237722577834366, + "loss": 3.0284, + "step": 7570 + }, + { + "epoch": 2.392639974729527, + "grad_norm": 0.14153952210007853, + "learning_rate": 0.0006232378799555866, + "loss": 2.967, + "step": 7575 + }, + { + "epoch": 2.394219379294006, + "grad_norm": 0.13043671637605017, + "learning_rate": 0.0006227033522138145, + "loss": 2.9157, + "step": 7580 + }, + { + "epoch": 2.3957987838584853, + "grad_norm": 0.13895208613939825, + "learning_rate": 0.0006221686752083522, + "loss": 3.0146, + "step": 7585 + }, + { + "epoch": 2.3973781884229646, + "grad_norm": 0.11330838502160254, + "learning_rate": 0.0006216338495896124, + "loss": 2.9736, + "step": 7590 + }, + { + "epoch": 2.398957592987444, + "grad_norm": 0.11589094914294655, + "learning_rate": 0.0006210988760081894, + "loss": 2.9057, + "step": 7595 + }, + { + "epoch": 2.400536997551923, + "grad_norm": 0.15157965170773122, + "learning_rate": 0.0006205637551148567, + "loss": 2.9991, + "step": 7600 + }, + { + "epoch": 2.402116402116402, + "grad_norm": 0.11320896179503025, + "learning_rate": 0.0006200284875605673, + "loss": 2.8798, + "step": 7605 + }, + { + "epoch": 2.403695806680881, + "grad_norm": 0.13873940669004436, + "learning_rate": 0.0006194930739964528, + "loss": 2.9625, + "step": 7610 + }, + { + "epoch": 2.4052752112453604, + "grad_norm": 0.092705040359977, + "learning_rate": 0.0006189575150738223, + "loss": 2.913, + "step": 7615 + }, + { + "epoch": 2.4068546158098396, + "grad_norm": 0.11139593937524005, + "learning_rate": 0.0006184218114441614, + "loss": 2.8262, + "step": 7620 + }, + { + "epoch": 2.408434020374319, + "grad_norm": 0.1273406414196765, + "learning_rate": 0.0006178859637591324, + "loss": 2.9821, + "step": 7625 + }, + { + "epoch": 2.410013424938798, + "grad_norm": 0.12243860879491518, + "learning_rate": 0.000617349972670572, + "loss": 2.9659, + "step": 7630 + }, + { + "epoch": 2.4115928295032774, + "grad_norm": 0.13916555381820644, + "learning_rate": 0.0006168138388304923, + "loss": 2.9555, + "step": 7635 + }, + { + "epoch": 2.4131722340677566, + "grad_norm": 0.09817635445256855, + "learning_rate": 0.0006162775628910781, + "loss": 2.8617, + "step": 7640 + }, + { + "epoch": 2.414751638632236, + "grad_norm": 0.11641800397697878, + "learning_rate": 0.0006157411455046877, + "loss": 2.8724, + "step": 7645 + }, + { + "epoch": 2.4163310431967147, + "grad_norm": 0.09597429501811984, + "learning_rate": 0.0006152045873238512, + "loss": 2.8691, + "step": 7650 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.14373380417854187, + "learning_rate": 0.00061466788900127, + "loss": 2.865, + "step": 7655 + }, + { + "epoch": 2.419489852325673, + "grad_norm": 0.11877441608164054, + "learning_rate": 0.0006141310511898161, + "loss": 2.9486, + "step": 7660 + }, + { + "epoch": 2.4210692568901524, + "grad_norm": 0.1098346159971009, + "learning_rate": 0.0006135940745425314, + "loss": 2.8958, + "step": 7665 + }, + { + "epoch": 2.4226486614546316, + "grad_norm": 0.10845425850178957, + "learning_rate": 0.0006130569597126256, + "loss": 2.902, + "step": 7670 + }, + { + "epoch": 2.424228066019111, + "grad_norm": 0.12665847893533033, + "learning_rate": 0.000612519707353478, + "loss": 2.9425, + "step": 7675 + }, + { + "epoch": 2.42580747058359, + "grad_norm": 0.14520238937055907, + "learning_rate": 0.0006119823181186341, + "loss": 2.8898, + "step": 7680 + }, + { + "epoch": 2.427386875148069, + "grad_norm": 0.1383904256282845, + "learning_rate": 0.0006114447926618066, + "loss": 2.972, + "step": 7685 + }, + { + "epoch": 2.428966279712548, + "grad_norm": 0.12584733089244884, + "learning_rate": 0.0006109071316368732, + "loss": 2.8755, + "step": 7690 + }, + { + "epoch": 2.4305456842770274, + "grad_norm": 0.11062458148204891, + "learning_rate": 0.000610369335697877, + "loss": 2.9636, + "step": 7695 + }, + { + "epoch": 2.4321250888415067, + "grad_norm": 0.14991156013078924, + "learning_rate": 0.0006098314054990253, + "loss": 2.9908, + "step": 7700 + }, + { + "epoch": 2.433704493405986, + "grad_norm": 0.12984481610473728, + "learning_rate": 0.0006092933416946885, + "loss": 2.9305, + "step": 7705 + }, + { + "epoch": 2.435283897970465, + "grad_norm": 0.1174711868276579, + "learning_rate": 0.0006087551449393996, + "loss": 2.7334, + "step": 7710 + }, + { + "epoch": 2.4368633025349444, + "grad_norm": 0.13370455104533868, + "learning_rate": 0.000608216815887853, + "loss": 2.8389, + "step": 7715 + }, + { + "epoch": 2.4384427070994237, + "grad_norm": 0.12216399754359457, + "learning_rate": 0.0006076783551949046, + "loss": 2.8688, + "step": 7720 + }, + { + "epoch": 2.440022111663903, + "grad_norm": 0.1004581598228754, + "learning_rate": 0.0006071397635155701, + "loss": 2.8522, + "step": 7725 + }, + { + "epoch": 2.4416015162283817, + "grad_norm": 0.12995507408416646, + "learning_rate": 0.0006066010415050246, + "loss": 2.8688, + "step": 7730 + }, + { + "epoch": 2.443180920792861, + "grad_norm": 0.12200058647648904, + "learning_rate": 0.0006060621898186017, + "loss": 2.834, + "step": 7735 + }, + { + "epoch": 2.4447603253573402, + "grad_norm": 0.12845547160286597, + "learning_rate": 0.0006055232091117929, + "loss": 2.8708, + "step": 7740 + }, + { + "epoch": 2.4463397299218195, + "grad_norm": 0.13473489215683873, + "learning_rate": 0.0006049841000402465, + "loss": 2.8791, + "step": 7745 + }, + { + "epoch": 2.4479191344862987, + "grad_norm": 0.10406798504227764, + "learning_rate": 0.000604444863259767, + "loss": 2.8615, + "step": 7750 + }, + { + "epoch": 2.449498539050778, + "grad_norm": 0.10339964858007016, + "learning_rate": 0.0006039054994263142, + "loss": 2.8993, + "step": 7755 + }, + { + "epoch": 2.451077943615257, + "grad_norm": 0.09560148890671935, + "learning_rate": 0.0006033660091960025, + "loss": 2.9023, + "step": 7760 + }, + { + "epoch": 2.452657348179736, + "grad_norm": 0.1564609666925628, + "learning_rate": 0.0006028263932251, + "loss": 2.9065, + "step": 7765 + }, + { + "epoch": 2.4542367527442153, + "grad_norm": 0.15595448053044553, + "learning_rate": 0.000602286652170028, + "loss": 2.9556, + "step": 7770 + }, + { + "epoch": 2.4558161573086945, + "grad_norm": 0.13267150580512918, + "learning_rate": 0.0006017467866873596, + "loss": 2.9264, + "step": 7775 + }, + { + "epoch": 2.4573955618731738, + "grad_norm": 0.1389279559900566, + "learning_rate": 0.0006012067974338195, + "loss": 2.932, + "step": 7780 + }, + { + "epoch": 2.458974966437653, + "grad_norm": 0.11281678295707062, + "learning_rate": 0.0006006666850662828, + "loss": 2.8462, + "step": 7785 + }, + { + "epoch": 2.4605543710021323, + "grad_norm": 0.12294466833567588, + "learning_rate": 0.0006001264502417748, + "loss": 2.8788, + "step": 7790 + }, + { + "epoch": 2.4621337755666115, + "grad_norm": 0.11528135968840203, + "learning_rate": 0.000599586093617469, + "loss": 2.8603, + "step": 7795 + }, + { + "epoch": 2.4637131801310908, + "grad_norm": 0.11600416755310684, + "learning_rate": 0.0005990456158506878, + "loss": 2.8759, + "step": 7800 + }, + { + "epoch": 2.46529258469557, + "grad_norm": 0.12584827200322835, + "learning_rate": 0.0005985050175989005, + "loss": 2.8788, + "step": 7805 + }, + { + "epoch": 2.466871989260049, + "grad_norm": 0.10852126172666732, + "learning_rate": 0.0005979642995197231, + "loss": 2.8374, + "step": 7810 + }, + { + "epoch": 2.468451393824528, + "grad_norm": 0.12536981419861587, + "learning_rate": 0.0005974234622709173, + "loss": 2.8386, + "step": 7815 + }, + { + "epoch": 2.4700307983890073, + "grad_norm": 0.10695751249785046, + "learning_rate": 0.0005968825065103903, + "loss": 2.7978, + "step": 7820 + }, + { + "epoch": 2.4716102029534865, + "grad_norm": 0.09098132344627358, + "learning_rate": 0.0005963414328961923, + "loss": 2.9162, + "step": 7825 + }, + { + "epoch": 2.473189607517966, + "grad_norm": 0.1376328674091665, + "learning_rate": 0.0005958002420865184, + "loss": 2.869, + "step": 7830 + }, + { + "epoch": 2.474769012082445, + "grad_norm": 0.1246826926351846, + "learning_rate": 0.0005952589347397047, + "loss": 2.8681, + "step": 7835 + }, + { + "epoch": 2.4763484166469243, + "grad_norm": 0.11223635962115318, + "learning_rate": 0.0005947175115142303, + "loss": 2.9597, + "step": 7840 + }, + { + "epoch": 2.477927821211403, + "grad_norm": 0.09874294074136647, + "learning_rate": 0.0005941759730687145, + "loss": 2.8881, + "step": 7845 + }, + { + "epoch": 2.4795072257758823, + "grad_norm": 0.10702970527144451, + "learning_rate": 0.0005936343200619171, + "loss": 2.9165, + "step": 7850 + }, + { + "epoch": 2.4810866303403616, + "grad_norm": 0.11516417270268137, + "learning_rate": 0.0005930925531527373, + "loss": 2.9585, + "step": 7855 + }, + { + "epoch": 2.482666034904841, + "grad_norm": 0.1021710274042089, + "learning_rate": 0.0005925506730002125, + "loss": 2.8733, + "step": 7860 + }, + { + "epoch": 2.48424543946932, + "grad_norm": 0.10802543141002224, + "learning_rate": 0.0005920086802635182, + "loss": 2.9297, + "step": 7865 + }, + { + "epoch": 2.4858248440337993, + "grad_norm": 0.10653180022851777, + "learning_rate": 0.0005914665756019672, + "loss": 2.8791, + "step": 7870 + }, + { + "epoch": 2.4874042485982786, + "grad_norm": 0.0997210691058446, + "learning_rate": 0.0005909243596750072, + "loss": 2.9041, + "step": 7875 + }, + { + "epoch": 2.488983653162758, + "grad_norm": 0.11878837772321785, + "learning_rate": 0.0005903820331422228, + "loss": 2.8788, + "step": 7880 + }, + { + "epoch": 2.490563057727237, + "grad_norm": 0.11694516600327845, + "learning_rate": 0.0005898395966633317, + "loss": 2.9453, + "step": 7885 + }, + { + "epoch": 2.492142462291716, + "grad_norm": 0.11743602215417963, + "learning_rate": 0.0005892970508981866, + "loss": 2.9386, + "step": 7890 + }, + { + "epoch": 2.493721866856195, + "grad_norm": 0.12746286655196695, + "learning_rate": 0.0005887543965067724, + "loss": 2.9203, + "step": 7895 + }, + { + "epoch": 2.4953012714206744, + "grad_norm": 0.10382797812621399, + "learning_rate": 0.0005882116341492063, + "loss": 2.9383, + "step": 7900 + }, + { + "epoch": 2.4968806759851536, + "grad_norm": 0.1246978749562224, + "learning_rate": 0.000587668764485737, + "loss": 2.9262, + "step": 7905 + }, + { + "epoch": 2.498460080549633, + "grad_norm": 0.09763564074166992, + "learning_rate": 0.0005871257881767436, + "loss": 2.9287, + "step": 7910 + }, + { + "epoch": 2.500039485114112, + "grad_norm": 0.10614550079603412, + "learning_rate": 0.0005865827058827344, + "loss": 2.8489, + "step": 7915 + }, + { + "epoch": 2.501618889678591, + "grad_norm": 0.10538161842564463, + "learning_rate": 0.0005860395182643481, + "loss": 2.8041, + "step": 7920 + }, + { + "epoch": 2.50319829424307, + "grad_norm": 0.1143759722183686, + "learning_rate": 0.0005854962259823497, + "loss": 2.9817, + "step": 7925 + }, + { + "epoch": 2.5047776988075494, + "grad_norm": 0.09584559266623355, + "learning_rate": 0.000584952829697633, + "loss": 2.8908, + "step": 7930 + }, + { + "epoch": 2.5063571033720287, + "grad_norm": 0.11717848072514851, + "learning_rate": 0.0005844093300712175, + "loss": 2.8875, + "step": 7935 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.10266982997024632, + "learning_rate": 0.0005838657277642484, + "loss": 2.8684, + "step": 7940 + }, + { + "epoch": 2.509515912500987, + "grad_norm": 0.09002512148417816, + "learning_rate": 0.0005833220234379964, + "loss": 2.8304, + "step": 7945 + }, + { + "epoch": 2.5110953170654664, + "grad_norm": 0.11490604007160053, + "learning_rate": 0.0005827782177538558, + "loss": 2.9379, + "step": 7950 + }, + { + "epoch": 2.5126747216299457, + "grad_norm": 0.13998452649568366, + "learning_rate": 0.0005822343113733442, + "loss": 2.836, + "step": 7955 + }, + { + "epoch": 2.514254126194425, + "grad_norm": 0.1216994535102055, + "learning_rate": 0.000581690304958102, + "loss": 2.8637, + "step": 7960 + }, + { + "epoch": 2.515833530758904, + "grad_norm": 0.09588585536729172, + "learning_rate": 0.000581146199169891, + "loss": 2.9294, + "step": 7965 + }, + { + "epoch": 2.517412935323383, + "grad_norm": 0.0954709796689048, + "learning_rate": 0.0005806019946705942, + "loss": 2.9174, + "step": 7970 + }, + { + "epoch": 2.518992339887862, + "grad_norm": 0.12252355359878117, + "learning_rate": 0.0005800576921222142, + "loss": 2.8069, + "step": 7975 + }, + { + "epoch": 2.5205717444523414, + "grad_norm": 0.10226881145420655, + "learning_rate": 0.0005795132921868733, + "loss": 3.0369, + "step": 7980 + }, + { + "epoch": 2.5221511490168207, + "grad_norm": 0.09563660983490654, + "learning_rate": 0.0005789687955268119, + "loss": 2.8136, + "step": 7985 + }, + { + "epoch": 2.5237305535813, + "grad_norm": 0.11454572894817108, + "learning_rate": 0.0005784242028043885, + "loss": 2.8711, + "step": 7990 + }, + { + "epoch": 2.525309958145779, + "grad_norm": 0.1513588739526145, + "learning_rate": 0.0005778795146820783, + "loss": 2.9436, + "step": 7995 + }, + { + "epoch": 2.526889362710258, + "grad_norm": 0.10466678075616238, + "learning_rate": 0.0005773347318224725, + "loss": 2.8491, + "step": 8000 + }, + { + "epoch": 2.5284687672747372, + "grad_norm": 0.09832765179182253, + "learning_rate": 0.0005767898548882772, + "loss": 2.8794, + "step": 8005 + }, + { + "epoch": 2.5300481718392165, + "grad_norm": 0.09654152645996128, + "learning_rate": 0.0005762448845423136, + "loss": 2.7809, + "step": 8010 + }, + { + "epoch": 2.5316275764036957, + "grad_norm": 0.10270121322864592, + "learning_rate": 0.0005756998214475158, + "loss": 2.8804, + "step": 8015 + }, + { + "epoch": 2.533206980968175, + "grad_norm": 0.12129380452991413, + "learning_rate": 0.0005751546662669318, + "loss": 2.881, + "step": 8020 + }, + { + "epoch": 2.5347863855326542, + "grad_norm": 0.1137491271183352, + "learning_rate": 0.0005746094196637202, + "loss": 2.9944, + "step": 8025 + }, + { + "epoch": 2.5363657900971335, + "grad_norm": 0.1178310752747603, + "learning_rate": 0.0005740640823011519, + "loss": 2.9511, + "step": 8030 + }, + { + "epoch": 2.5379451946616127, + "grad_norm": 0.1057157588808938, + "learning_rate": 0.000573518654842608, + "loss": 2.8831, + "step": 8035 + }, + { + "epoch": 2.539524599226092, + "grad_norm": 0.1319056734423167, + "learning_rate": 0.0005729731379515787, + "loss": 2.9043, + "step": 8040 + }, + { + "epoch": 2.541104003790571, + "grad_norm": 0.14686616590523782, + "learning_rate": 0.0005724275322916636, + "loss": 2.8638, + "step": 8045 + }, + { + "epoch": 2.54268340835505, + "grad_norm": 0.11543070841885159, + "learning_rate": 0.00057188183852657, + "loss": 2.8652, + "step": 8050 + }, + { + "epoch": 2.5442628129195293, + "grad_norm": 0.12972477613439057, + "learning_rate": 0.0005713360573201123, + "loss": 2.8791, + "step": 8055 + }, + { + "epoch": 2.5458422174840085, + "grad_norm": 0.09660035219259314, + "learning_rate": 0.0005707901893362115, + "loss": 2.9346, + "step": 8060 + }, + { + "epoch": 2.5474216220484878, + "grad_norm": 0.10651356816686414, + "learning_rate": 0.000570244235238894, + "loss": 2.8865, + "step": 8065 + }, + { + "epoch": 2.549001026612967, + "grad_norm": 0.09675862774522141, + "learning_rate": 0.0005696981956922908, + "loss": 2.8773, + "step": 8070 + }, + { + "epoch": 2.5505804311774463, + "grad_norm": 0.15617858327168496, + "learning_rate": 0.0005691520713606374, + "loss": 2.9433, + "step": 8075 + }, + { + "epoch": 2.552159835741925, + "grad_norm": 0.11763305081741604, + "learning_rate": 0.0005686058629082718, + "loss": 2.8832, + "step": 8080 + }, + { + "epoch": 2.5537392403064043, + "grad_norm": 0.10593544492397061, + "learning_rate": 0.0005680595709996347, + "loss": 2.7942, + "step": 8085 + }, + { + "epoch": 2.5553186448708836, + "grad_norm": 0.13126937029678293, + "learning_rate": 0.0005675131962992684, + "loss": 2.986, + "step": 8090 + }, + { + "epoch": 2.556898049435363, + "grad_norm": 0.10980128811196688, + "learning_rate": 0.0005669667394718151, + "loss": 2.8083, + "step": 8095 + }, + { + "epoch": 2.558477453999842, + "grad_norm": 0.11819107446791322, + "learning_rate": 0.0005664202011820183, + "loss": 2.8245, + "step": 8100 + }, + { + "epoch": 2.5600568585643213, + "grad_norm": 0.09428546346280126, + "learning_rate": 0.0005658735820947195, + "loss": 2.7917, + "step": 8105 + }, + { + "epoch": 2.5616362631288006, + "grad_norm": 0.09680849514587951, + "learning_rate": 0.0005653268828748588, + "loss": 2.9514, + "step": 8110 + }, + { + "epoch": 2.56321566769328, + "grad_norm": 0.09532956412408176, + "learning_rate": 0.0005647801041874738, + "loss": 2.8979, + "step": 8115 + }, + { + "epoch": 2.564795072257759, + "grad_norm": 0.09313444023713757, + "learning_rate": 0.0005642332466976989, + "loss": 2.8895, + "step": 8120 + }, + { + "epoch": 2.5663744768222383, + "grad_norm": 0.09595409710264817, + "learning_rate": 0.000563686311070764, + "loss": 2.964, + "step": 8125 + }, + { + "epoch": 2.567953881386717, + "grad_norm": 0.10605804176493229, + "learning_rate": 0.0005631392979719945, + "loss": 2.9001, + "step": 8130 + }, + { + "epoch": 2.5695332859511963, + "grad_norm": 0.12266104311606131, + "learning_rate": 0.0005625922080668098, + "loss": 2.851, + "step": 8135 + }, + { + "epoch": 2.5711126905156756, + "grad_norm": 0.1197387065941255, + "learning_rate": 0.0005620450420207227, + "loss": 2.8976, + "step": 8140 + }, + { + "epoch": 2.572692095080155, + "grad_norm": 0.1025070737203573, + "learning_rate": 0.0005614978004993388, + "loss": 2.9582, + "step": 8145 + }, + { + "epoch": 2.574271499644634, + "grad_norm": 0.10279544554870783, + "learning_rate": 0.000560950484168355, + "loss": 2.9097, + "step": 8150 + }, + { + "epoch": 2.575850904209113, + "grad_norm": 0.08338297908944227, + "learning_rate": 0.00056040309369356, + "loss": 2.932, + "step": 8155 + }, + { + "epoch": 2.577430308773592, + "grad_norm": 0.10803053370715439, + "learning_rate": 0.0005598556297408321, + "loss": 2.8548, + "step": 8160 + }, + { + "epoch": 2.5790097133380714, + "grad_norm": 0.11588132304181535, + "learning_rate": 0.0005593080929761393, + "loss": 2.8645, + "step": 8165 + }, + { + "epoch": 2.5805891179025506, + "grad_norm": 0.11089896406663106, + "learning_rate": 0.0005587604840655379, + "loss": 2.8353, + "step": 8170 + }, + { + "epoch": 2.58216852246703, + "grad_norm": 0.11508150368674885, + "learning_rate": 0.0005582128036751724, + "loss": 2.9319, + "step": 8175 + }, + { + "epoch": 2.583747927031509, + "grad_norm": 0.10502399787573738, + "learning_rate": 0.0005576650524712735, + "loss": 2.9075, + "step": 8180 + }, + { + "epoch": 2.5853273315959884, + "grad_norm": 0.13345339841003134, + "learning_rate": 0.0005571172311201587, + "loss": 2.9161, + "step": 8185 + }, + { + "epoch": 2.5869067361604676, + "grad_norm": 0.12456752360732082, + "learning_rate": 0.0005565693402882306, + "loss": 2.8788, + "step": 8190 + }, + { + "epoch": 2.588486140724947, + "grad_norm": 0.14576940938930275, + "learning_rate": 0.0005560213806419765, + "loss": 2.8721, + "step": 8195 + }, + { + "epoch": 2.590065545289426, + "grad_norm": 0.11367083641648577, + "learning_rate": 0.0005554733528479672, + "loss": 2.8128, + "step": 8200 + }, + { + "epoch": 2.591644949853905, + "grad_norm": 0.08885902092672537, + "learning_rate": 0.0005549252575728563, + "loss": 2.8223, + "step": 8205 + }, + { + "epoch": 2.593224354418384, + "grad_norm": 0.08416863087959697, + "learning_rate": 0.0005543770954833798, + "loss": 2.8385, + "step": 8210 + }, + { + "epoch": 2.5948037589828634, + "grad_norm": 0.09346197267458926, + "learning_rate": 0.0005538288672463549, + "loss": 2.8702, + "step": 8215 + }, + { + "epoch": 2.5963831635473427, + "grad_norm": 0.09908019783659593, + "learning_rate": 0.000553280573528679, + "loss": 2.859, + "step": 8220 + }, + { + "epoch": 2.597962568111822, + "grad_norm": 0.11997269336385176, + "learning_rate": 0.0005527322149973294, + "loss": 2.9347, + "step": 8225 + }, + { + "epoch": 2.599541972676301, + "grad_norm": 0.11159273684240642, + "learning_rate": 0.0005521837923193621, + "loss": 2.8241, + "step": 8230 + }, + { + "epoch": 2.60112137724078, + "grad_norm": 0.11072042989007429, + "learning_rate": 0.0005516353061619114, + "loss": 2.8891, + "step": 8235 + }, + { + "epoch": 2.602700781805259, + "grad_norm": 0.10201560853653248, + "learning_rate": 0.0005510867571921887, + "loss": 2.7999, + "step": 8240 + }, + { + "epoch": 2.6042801863697385, + "grad_norm": 0.10667059215170911, + "learning_rate": 0.0005505381460774815, + "loss": 2.8858, + "step": 8245 + }, + { + "epoch": 2.6058595909342177, + "grad_norm": 0.1149422916403453, + "learning_rate": 0.0005499894734851533, + "loss": 2.924, + "step": 8250 + }, + { + "epoch": 2.607438995498697, + "grad_norm": 0.114877786687331, + "learning_rate": 0.0005494407400826422, + "loss": 2.8901, + "step": 8255 + }, + { + "epoch": 2.609018400063176, + "grad_norm": 0.12644713385260586, + "learning_rate": 0.0005488919465374601, + "loss": 2.9277, + "step": 8260 + }, + { + "epoch": 2.6105978046276554, + "grad_norm": 0.09971616604983528, + "learning_rate": 0.0005483430935171927, + "loss": 2.8212, + "step": 8265 + }, + { + "epoch": 2.6121772091921347, + "grad_norm": 0.12772171916187006, + "learning_rate": 0.0005477941816894972, + "loss": 2.8943, + "step": 8270 + }, + { + "epoch": 2.613756613756614, + "grad_norm": 0.1212709812386515, + "learning_rate": 0.0005472452117221031, + "loss": 2.8858, + "step": 8275 + }, + { + "epoch": 2.615336018321093, + "grad_norm": 0.13669330547386427, + "learning_rate": 0.00054669618428281, + "loss": 2.9107, + "step": 8280 + }, + { + "epoch": 2.616915422885572, + "grad_norm": 0.12423297098662968, + "learning_rate": 0.0005461471000394877, + "loss": 2.855, + "step": 8285 + }, + { + "epoch": 2.6184948274500512, + "grad_norm": 0.11719194897621003, + "learning_rate": 0.0005455979596600752, + "loss": 2.8654, + "step": 8290 + }, + { + "epoch": 2.6200742320145305, + "grad_norm": 0.10983667694557864, + "learning_rate": 0.0005450487638125798, + "loss": 2.9057, + "step": 8295 + }, + { + "epoch": 2.6216536365790097, + "grad_norm": 0.08788174740826656, + "learning_rate": 0.0005444995131650757, + "loss": 2.8627, + "step": 8300 + }, + { + "epoch": 2.623233041143489, + "grad_norm": 0.09902778517837978, + "learning_rate": 0.0005439502083857048, + "loss": 2.7816, + "step": 8305 + }, + { + "epoch": 2.6248124457079682, + "grad_norm": 0.11796542187832296, + "learning_rate": 0.0005434008501426738, + "loss": 2.8523, + "step": 8310 + }, + { + "epoch": 2.626391850272447, + "grad_norm": 0.10807870128539734, + "learning_rate": 0.0005428514391042551, + "loss": 2.8327, + "step": 8315 + }, + { + "epoch": 2.6279712548369263, + "grad_norm": 0.12970676311330875, + "learning_rate": 0.0005423019759387851, + "loss": 2.6965, + "step": 8320 + }, + { + "epoch": 2.6295506594014055, + "grad_norm": 0.11144602141340477, + "learning_rate": 0.0005417524613146636, + "loss": 2.9372, + "step": 8325 + }, + { + "epoch": 2.631130063965885, + "grad_norm": 0.14431448741906194, + "learning_rate": 0.000541202895900353, + "loss": 2.9028, + "step": 8330 + }, + { + "epoch": 2.632709468530364, + "grad_norm": 0.13035400654594645, + "learning_rate": 0.0005406532803643776, + "loss": 2.8818, + "step": 8335 + }, + { + "epoch": 2.6342888730948433, + "grad_norm": 0.17885692469148046, + "learning_rate": 0.0005401036153753224, + "loss": 2.8909, + "step": 8340 + }, + { + "epoch": 2.6358682776593225, + "grad_norm": 0.11894763362080127, + "learning_rate": 0.000539553901601833, + "loss": 2.7765, + "step": 8345 + }, + { + "epoch": 2.6374476822238018, + "grad_norm": 0.16610992816103776, + "learning_rate": 0.0005390041397126138, + "loss": 2.8501, + "step": 8350 + }, + { + "epoch": 2.639027086788281, + "grad_norm": 0.10533526116352965, + "learning_rate": 0.0005384543303764284, + "loss": 2.9377, + "step": 8355 + }, + { + "epoch": 2.6406064913527603, + "grad_norm": 0.1175918949534745, + "learning_rate": 0.0005379044742620974, + "loss": 2.7272, + "step": 8360 + }, + { + "epoch": 2.642185895917239, + "grad_norm": 0.10360703818981164, + "learning_rate": 0.0005373545720384988, + "loss": 2.8597, + "step": 8365 + }, + { + "epoch": 2.6437653004817183, + "grad_norm": 0.11568127032482835, + "learning_rate": 0.0005368046243745664, + "loss": 2.8109, + "step": 8370 + }, + { + "epoch": 2.6453447050461976, + "grad_norm": 0.0966862538256532, + "learning_rate": 0.0005362546319392895, + "loss": 2.8321, + "step": 8375 + }, + { + "epoch": 2.646924109610677, + "grad_norm": 0.1036770768690196, + "learning_rate": 0.0005357045954017117, + "loss": 2.8292, + "step": 8380 + }, + { + "epoch": 2.648503514175156, + "grad_norm": 0.08797383785771244, + "learning_rate": 0.0005351545154309304, + "loss": 2.9117, + "step": 8385 + }, + { + "epoch": 2.6500829187396353, + "grad_norm": 0.1140945899524084, + "learning_rate": 0.0005346043926960955, + "loss": 2.8117, + "step": 8390 + }, + { + "epoch": 2.651662323304114, + "grad_norm": 0.10659030191716873, + "learning_rate": 0.0005340542278664097, + "loss": 2.7991, + "step": 8395 + }, + { + "epoch": 2.6532417278685934, + "grad_norm": 0.1333560632282921, + "learning_rate": 0.0005335040216111258, + "loss": 2.9049, + "step": 8400 + }, + { + "epoch": 2.6548211324330726, + "grad_norm": 0.1084321450390365, + "learning_rate": 0.000532953774599548, + "loss": 2.907, + "step": 8405 + }, + { + "epoch": 2.656400536997552, + "grad_norm": 0.10121787566646603, + "learning_rate": 0.0005324034875010293, + "loss": 2.8406, + "step": 8410 + }, + { + "epoch": 2.657979941562031, + "grad_norm": 0.10903625541156818, + "learning_rate": 0.0005318531609849721, + "loss": 2.8009, + "step": 8415 + }, + { + "epoch": 2.6595593461265103, + "grad_norm": 0.09814944875787951, + "learning_rate": 0.0005313027957208261, + "loss": 2.9122, + "step": 8420 + }, + { + "epoch": 2.6611387506909896, + "grad_norm": 0.09769488231656247, + "learning_rate": 0.0005307523923780891, + "loss": 2.8387, + "step": 8425 + }, + { + "epoch": 2.662718155255469, + "grad_norm": 0.08832213539268997, + "learning_rate": 0.0005302019516263039, + "loss": 2.892, + "step": 8430 + }, + { + "epoch": 2.664297559819948, + "grad_norm": 0.10314825876543694, + "learning_rate": 0.0005296514741350602, + "loss": 2.7589, + "step": 8435 + }, + { + "epoch": 2.6658769643844273, + "grad_norm": 0.09340651096278425, + "learning_rate": 0.0005291009605739911, + "loss": 2.833, + "step": 8440 + }, + { + "epoch": 2.667456368948906, + "grad_norm": 0.0934945519190645, + "learning_rate": 0.0005285504116127747, + "loss": 2.792, + "step": 8445 + }, + { + "epoch": 2.6690357735133854, + "grad_norm": 0.08884602620064723, + "learning_rate": 0.0005279998279211314, + "loss": 2.82, + "step": 8450 + }, + { + "epoch": 2.6706151780778646, + "grad_norm": 0.10975324493893444, + "learning_rate": 0.0005274492101688241, + "loss": 2.8457, + "step": 8455 + }, + { + "epoch": 2.672194582642344, + "grad_norm": 0.13695454509385244, + "learning_rate": 0.0005268985590256572, + "loss": 2.8768, + "step": 8460 + }, + { + "epoch": 2.673773987206823, + "grad_norm": 0.12483951207448517, + "learning_rate": 0.0005263478751614758, + "loss": 2.8488, + "step": 8465 + }, + { + "epoch": 2.6753533917713024, + "grad_norm": 0.10388040882976078, + "learning_rate": 0.0005257971592461643, + "loss": 2.8538, + "step": 8470 + }, + { + "epoch": 2.676932796335781, + "grad_norm": 0.11298417140151591, + "learning_rate": 0.0005252464119496467, + "loss": 2.8867, + "step": 8475 + }, + { + "epoch": 2.6785122009002604, + "grad_norm": 0.11629704584959613, + "learning_rate": 0.0005246956339418848, + "loss": 2.8664, + "step": 8480 + }, + { + "epoch": 2.6800916054647397, + "grad_norm": 0.10522974921360309, + "learning_rate": 0.0005241448258928781, + "loss": 3.015, + "step": 8485 + }, + { + "epoch": 2.681671010029219, + "grad_norm": 0.12477955177393571, + "learning_rate": 0.0005235939884726624, + "loss": 2.8484, + "step": 8490 + }, + { + "epoch": 2.683250414593698, + "grad_norm": 0.10411492618738177, + "learning_rate": 0.0005230431223513089, + "loss": 2.9295, + "step": 8495 + }, + { + "epoch": 2.6848298191581774, + "grad_norm": 0.11828492623305749, + "learning_rate": 0.0005224922281989245, + "loss": 2.8334, + "step": 8500 + }, + { + "epoch": 2.6864092237226567, + "grad_norm": 0.11029185894654155, + "learning_rate": 0.0005219413066856495, + "loss": 2.8465, + "step": 8505 + }, + { + "epoch": 2.687988628287136, + "grad_norm": 0.10182149815224686, + "learning_rate": 0.0005213903584816578, + "loss": 2.8629, + "step": 8510 + }, + { + "epoch": 2.689568032851615, + "grad_norm": 0.1128048849382887, + "learning_rate": 0.000520839384257156, + "loss": 2.9735, + "step": 8515 + }, + { + "epoch": 2.691147437416094, + "grad_norm": 0.13170744426484057, + "learning_rate": 0.0005202883846823815, + "loss": 2.9398, + "step": 8520 + }, + { + "epoch": 2.692726841980573, + "grad_norm": 0.09150773720833716, + "learning_rate": 0.0005197373604276037, + "loss": 2.8829, + "step": 8525 + }, + { + "epoch": 2.6943062465450525, + "grad_norm": 0.09975598268082873, + "learning_rate": 0.0005191863121631209, + "loss": 2.8492, + "step": 8530 + }, + { + "epoch": 2.6958856511095317, + "grad_norm": 0.12422940940268365, + "learning_rate": 0.0005186352405592617, + "loss": 2.9577, + "step": 8535 + }, + { + "epoch": 2.697465055674011, + "grad_norm": 0.09818507336107357, + "learning_rate": 0.000518084146286382, + "loss": 2.9513, + "step": 8540 + }, + { + "epoch": 2.69904446023849, + "grad_norm": 0.10155910637986638, + "learning_rate": 0.0005175330300148662, + "loss": 2.7786, + "step": 8545 + }, + { + "epoch": 2.700623864802969, + "grad_norm": 0.11182062536399359, + "learning_rate": 0.000516981892415125, + "loss": 2.8224, + "step": 8550 + }, + { + "epoch": 2.7022032693674483, + "grad_norm": 0.09072312596104798, + "learning_rate": 0.0005164307341575949, + "loss": 2.9156, + "step": 8555 + }, + { + "epoch": 2.7037826739319275, + "grad_norm": 0.09717267367905622, + "learning_rate": 0.0005158795559127378, + "loss": 2.882, + "step": 8560 + }, + { + "epoch": 2.7053620784964068, + "grad_norm": 0.11522288125191604, + "learning_rate": 0.00051532835835104, + "loss": 2.8246, + "step": 8565 + }, + { + "epoch": 2.706941483060886, + "grad_norm": 0.13152596778141204, + "learning_rate": 0.0005147771421430112, + "loss": 2.9617, + "step": 8570 + }, + { + "epoch": 2.7085208876253652, + "grad_norm": 0.11065143019575623, + "learning_rate": 0.0005142259079591834, + "loss": 2.801, + "step": 8575 + }, + { + "epoch": 2.7101002921898445, + "grad_norm": 0.11801107468526441, + "learning_rate": 0.0005136746564701112, + "loss": 2.862, + "step": 8580 + }, + { + "epoch": 2.7116796967543237, + "grad_norm": 0.09717402661556791, + "learning_rate": 0.0005131233883463696, + "loss": 2.9415, + "step": 8585 + }, + { + "epoch": 2.713259101318803, + "grad_norm": 0.10700954717030448, + "learning_rate": 0.0005125721042585541, + "loss": 2.79, + "step": 8590 + }, + { + "epoch": 2.7148385058832822, + "grad_norm": 0.09709107788126108, + "learning_rate": 0.0005120208048772799, + "loss": 2.8614, + "step": 8595 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.10416112756350643, + "learning_rate": 0.0005114694908731801, + "loss": 2.9702, + "step": 8600 + }, + { + "epoch": 2.7179973150122403, + "grad_norm": 0.12877731620703553, + "learning_rate": 0.0005109181629169063, + "loss": 2.8428, + "step": 8605 + }, + { + "epoch": 2.7195767195767195, + "grad_norm": 0.12569692543460473, + "learning_rate": 0.0005103668216791265, + "loss": 2.8394, + "step": 8610 + }, + { + "epoch": 2.721156124141199, + "grad_norm": 0.101443859632466, + "learning_rate": 0.0005098154678305253, + "loss": 2.8178, + "step": 8615 + }, + { + "epoch": 2.722735528705678, + "grad_norm": 0.11723589724681906, + "learning_rate": 0.0005092641020418026, + "loss": 2.8043, + "step": 8620 + }, + { + "epoch": 2.7243149332701573, + "grad_norm": 0.09726137109712447, + "learning_rate": 0.0005087127249836725, + "loss": 2.8191, + "step": 8625 + }, + { + "epoch": 2.725894337834636, + "grad_norm": 0.10264792480172079, + "learning_rate": 0.000508161337326863, + "loss": 2.8913, + "step": 8630 + }, + { + "epoch": 2.7274737423991153, + "grad_norm": 0.10925116213376099, + "learning_rate": 0.0005076099397421151, + "loss": 2.7857, + "step": 8635 + }, + { + "epoch": 2.7290531469635946, + "grad_norm": 0.10665161203445127, + "learning_rate": 0.0005070585329001819, + "loss": 2.8374, + "step": 8640 + }, + { + "epoch": 2.730632551528074, + "grad_norm": 0.09831191236696475, + "learning_rate": 0.0005065071174718272, + "loss": 2.8332, + "step": 8645 + }, + { + "epoch": 2.732211956092553, + "grad_norm": 0.09194761295937094, + "learning_rate": 0.0005059556941278261, + "loss": 2.8834, + "step": 8650 + }, + { + "epoch": 2.7337913606570323, + "grad_norm": 0.10405004093466642, + "learning_rate": 0.0005054042635389627, + "loss": 2.7754, + "step": 8655 + }, + { + "epoch": 2.7353707652215116, + "grad_norm": 0.14537068141239967, + "learning_rate": 0.0005048528263760301, + "loss": 2.7946, + "step": 8660 + }, + { + "epoch": 2.736950169785991, + "grad_norm": 0.1065988827208742, + "learning_rate": 0.0005043013833098296, + "loss": 2.8628, + "step": 8665 + }, + { + "epoch": 2.73852957435047, + "grad_norm": 0.1195366408617911, + "learning_rate": 0.0005037499350111692, + "loss": 2.9437, + "step": 8670 + }, + { + "epoch": 2.7401089789149493, + "grad_norm": 0.11459702577743501, + "learning_rate": 0.000503198482150864, + "loss": 2.7711, + "step": 8675 + }, + { + "epoch": 2.741688383479428, + "grad_norm": 0.12358540930509837, + "learning_rate": 0.0005026470253997339, + "loss": 2.9472, + "step": 8680 + }, + { + "epoch": 2.7432677880439074, + "grad_norm": 0.11603284868223569, + "learning_rate": 0.0005020955654286038, + "loss": 2.8495, + "step": 8685 + }, + { + "epoch": 2.7448471926083866, + "grad_norm": 0.08951728738117785, + "learning_rate": 0.0005015441029083029, + "loss": 2.9477, + "step": 8690 + }, + { + "epoch": 2.746426597172866, + "grad_norm": 0.10244013458323355, + "learning_rate": 0.0005009926385096627, + "loss": 2.8319, + "step": 8695 + }, + { + "epoch": 2.748006001737345, + "grad_norm": 0.12482940143306988, + "learning_rate": 0.0005004411729035179, + "loss": 2.9043, + "step": 8700 + }, + { + "epoch": 2.7495854063018244, + "grad_norm": 0.10452890378293654, + "learning_rate": 0.0004998897067607039, + "loss": 2.8477, + "step": 8705 + }, + { + "epoch": 2.751164810866303, + "grad_norm": 0.12090847404143575, + "learning_rate": 0.0004993382407520573, + "loss": 2.7886, + "step": 8710 + }, + { + "epoch": 2.7527442154307824, + "grad_norm": 0.08178133630175746, + "learning_rate": 0.0004987867755484141, + "loss": 2.8267, + "step": 8715 + }, + { + "epoch": 2.7543236199952617, + "grad_norm": 0.10062336680780973, + "learning_rate": 0.0004982353118206095, + "loss": 2.7693, + "step": 8720 + }, + { + "epoch": 2.755903024559741, + "grad_norm": 0.0943113197343758, + "learning_rate": 0.0004976838502394772, + "loss": 2.8277, + "step": 8725 + }, + { + "epoch": 2.75748242912422, + "grad_norm": 0.11023713286267932, + "learning_rate": 0.0004971323914758479, + "loss": 2.7993, + "step": 8730 + }, + { + "epoch": 2.7590618336886994, + "grad_norm": 0.1073939345323795, + "learning_rate": 0.000496580936200549, + "loss": 2.8793, + "step": 8735 + }, + { + "epoch": 2.7606412382531786, + "grad_norm": 0.11471023136135335, + "learning_rate": 0.0004960294850844036, + "loss": 2.7634, + "step": 8740 + }, + { + "epoch": 2.762220642817658, + "grad_norm": 0.12262767443184312, + "learning_rate": 0.0004954780387982296, + "loss": 2.8035, + "step": 8745 + }, + { + "epoch": 2.763800047382137, + "grad_norm": 0.131867203858333, + "learning_rate": 0.0004949265980128398, + "loss": 2.8725, + "step": 8750 + }, + { + "epoch": 2.7653794519466164, + "grad_norm": 0.10633571661017398, + "learning_rate": 0.0004943751633990392, + "loss": 2.9279, + "step": 8755 + }, + { + "epoch": 2.766958856511095, + "grad_norm": 0.10209093832053376, + "learning_rate": 0.0004938237356276261, + "loss": 2.8331, + "step": 8760 + }, + { + "epoch": 2.7685382610755744, + "grad_norm": 0.12140728220481119, + "learning_rate": 0.0004932723153693899, + "loss": 2.8959, + "step": 8765 + }, + { + "epoch": 2.7701176656400537, + "grad_norm": 0.11568062669311602, + "learning_rate": 0.0004927209032951113, + "loss": 2.8911, + "step": 8770 + }, + { + "epoch": 2.771697070204533, + "grad_norm": 0.09419916374966063, + "learning_rate": 0.000492169500075561, + "loss": 2.9308, + "step": 8775 + }, + { + "epoch": 2.773276474769012, + "grad_norm": 0.10783677082464539, + "learning_rate": 0.0004916181063814989, + "loss": 2.862, + "step": 8780 + }, + { + "epoch": 2.7748558793334914, + "grad_norm": 0.09993678976761282, + "learning_rate": 0.0004910667228836729, + "loss": 2.9124, + "step": 8785 + }, + { + "epoch": 2.7764352838979702, + "grad_norm": 0.09810924138432092, + "learning_rate": 0.0004905153502528192, + "loss": 2.8774, + "step": 8790 + }, + { + "epoch": 2.7780146884624495, + "grad_norm": 0.0882822450050144, + "learning_rate": 0.00048996398915966, + "loss": 2.8996, + "step": 8795 + }, + { + "epoch": 2.7795940930269287, + "grad_norm": 0.11509448805364654, + "learning_rate": 0.0004894126402749044, + "loss": 2.8937, + "step": 8800 + }, + { + "epoch": 2.781173497591408, + "grad_norm": 0.12405467965417506, + "learning_rate": 0.0004888613042692457, + "loss": 2.8048, + "step": 8805 + }, + { + "epoch": 2.782752902155887, + "grad_norm": 0.09425134572556984, + "learning_rate": 0.0004883099818133624, + "loss": 2.8566, + "step": 8810 + }, + { + "epoch": 2.7843323067203665, + "grad_norm": 0.09041726145112262, + "learning_rate": 0.0004877586735779156, + "loss": 2.821, + "step": 8815 + }, + { + "epoch": 2.7859117112848457, + "grad_norm": 0.08762216365221542, + "learning_rate": 0.00048720738023354986, + "loss": 2.9121, + "step": 8820 + }, + { + "epoch": 2.787491115849325, + "grad_norm": 0.0976114655616353, + "learning_rate": 0.00048665610245089106, + "loss": 2.8665, + "step": 8825 + }, + { + "epoch": 2.789070520413804, + "grad_norm": 0.11039288980681734, + "learning_rate": 0.00048610484090054695, + "loss": 2.856, + "step": 8830 + }, + { + "epoch": 2.7906499249782835, + "grad_norm": 0.10708824253884887, + "learning_rate": 0.0004855535962531046, + "loss": 2.8531, + "step": 8835 + }, + { + "epoch": 2.7922293295427623, + "grad_norm": 0.1195136121465101, + "learning_rate": 0.0004850023691791313, + "loss": 2.8933, + "step": 8840 + }, + { + "epoch": 2.7938087341072415, + "grad_norm": 0.11014538299717265, + "learning_rate": 0.0004844511603491722, + "loss": 2.9101, + "step": 8845 + }, + { + "epoch": 2.7953881386717208, + "grad_norm": 0.10473197842297582, + "learning_rate": 0.0004838999704337507, + "loss": 2.7816, + "step": 8850 + }, + { + "epoch": 2.7969675432362, + "grad_norm": 0.0894809130323671, + "learning_rate": 0.00048334880010336744, + "loss": 2.8218, + "step": 8855 + }, + { + "epoch": 2.7985469478006793, + "grad_norm": 0.09784437855539298, + "learning_rate": 0.00048279765002849894, + "loss": 2.8752, + "step": 8860 + }, + { + "epoch": 2.8001263523651585, + "grad_norm": 0.13067758045382105, + "learning_rate": 0.00048224652087959686, + "loss": 2.8194, + "step": 8865 + }, + { + "epoch": 2.8017057569296373, + "grad_norm": 0.0986468551876891, + "learning_rate": 0.0004816954133270878, + "loss": 2.963, + "step": 8870 + }, + { + "epoch": 2.8032851614941166, + "grad_norm": 0.12143975478222904, + "learning_rate": 0.0004811443280413716, + "loss": 2.7702, + "step": 8875 + }, + { + "epoch": 2.804864566058596, + "grad_norm": 0.09920981393173242, + "learning_rate": 0.0004805932656928218, + "loss": 2.893, + "step": 8880 + }, + { + "epoch": 2.806443970623075, + "grad_norm": 0.1074725681810456, + "learning_rate": 0.0004800422269517833, + "loss": 2.9134, + "step": 8885 + }, + { + "epoch": 2.8080233751875543, + "grad_norm": 0.10204281091380536, + "learning_rate": 0.00047949121248857277, + "loss": 2.8876, + "step": 8890 + }, + { + "epoch": 2.8096027797520335, + "grad_norm": 0.09270434755456333, + "learning_rate": 0.0004789402229734768, + "loss": 2.8664, + "step": 8895 + }, + { + "epoch": 2.811182184316513, + "grad_norm": 0.10166396255180883, + "learning_rate": 0.000478389259076752, + "loss": 2.8878, + "step": 8900 + }, + { + "epoch": 2.812761588880992, + "grad_norm": 0.1208537911429681, + "learning_rate": 0.00047783832146862403, + "loss": 2.8745, + "step": 8905 + }, + { + "epoch": 2.8143409934454713, + "grad_norm": 0.14123581518424533, + "learning_rate": 0.0004772874108192863, + "loss": 2.8131, + "step": 8910 + }, + { + "epoch": 2.81592039800995, + "grad_norm": 0.1263482735241249, + "learning_rate": 0.0004767365277988993, + "loss": 2.8347, + "step": 8915 + }, + { + "epoch": 2.8174998025744293, + "grad_norm": 0.10619042341997327, + "learning_rate": 0.0004761856730775902, + "loss": 2.7617, + "step": 8920 + }, + { + "epoch": 2.8190792071389086, + "grad_norm": 0.10005343105322269, + "learning_rate": 0.0004756348473254513, + "loss": 2.7529, + "step": 8925 + }, + { + "epoch": 2.820658611703388, + "grad_norm": 0.09257602942214689, + "learning_rate": 0.0004750840512125403, + "loss": 2.9872, + "step": 8930 + }, + { + "epoch": 2.822238016267867, + "grad_norm": 0.11013323338667516, + "learning_rate": 0.0004745332854088783, + "loss": 2.8541, + "step": 8935 + }, + { + "epoch": 2.8238174208323463, + "grad_norm": 0.11218466959923673, + "learning_rate": 0.00047398255058444996, + "loss": 2.8551, + "step": 8940 + }, + { + "epoch": 2.825396825396825, + "grad_norm": 0.10032747092494256, + "learning_rate": 0.0004734318474092018, + "loss": 2.839, + "step": 8945 + }, + { + "epoch": 2.8269762299613044, + "grad_norm": 0.09300804561700793, + "learning_rate": 0.000472881176553042, + "loss": 2.8955, + "step": 8950 + }, + { + "epoch": 2.8285556345257836, + "grad_norm": 0.11698869284218402, + "learning_rate": 0.0004723305386858399, + "loss": 2.764, + "step": 8955 + }, + { + "epoch": 2.830135039090263, + "grad_norm": 0.12002117606412845, + "learning_rate": 0.000471779934477424, + "loss": 2.8536, + "step": 8960 + }, + { + "epoch": 2.831714443654742, + "grad_norm": 0.09323043603377293, + "learning_rate": 0.00047122936459758225, + "loss": 2.7864, + "step": 8965 + }, + { + "epoch": 2.8332938482192214, + "grad_norm": 0.1002912090989643, + "learning_rate": 0.0004706788297160608, + "loss": 2.7198, + "step": 8970 + }, + { + "epoch": 2.8348732527837006, + "grad_norm": 0.1475023105666348, + "learning_rate": 0.00047012833050256287, + "loss": 2.8075, + "step": 8975 + }, + { + "epoch": 2.83645265734818, + "grad_norm": 0.11276908138577017, + "learning_rate": 0.000469577867626749, + "loss": 2.8639, + "step": 8980 + }, + { + "epoch": 2.838032061912659, + "grad_norm": 0.21161327588210527, + "learning_rate": 0.0004690274417582349, + "loss": 2.7783, + "step": 8985 + }, + { + "epoch": 2.8396114664771384, + "grad_norm": 0.10800754485859902, + "learning_rate": 0.0004684770535665917, + "loss": 2.8841, + "step": 8990 + }, + { + "epoch": 2.841190871041617, + "grad_norm": 0.10269668428994914, + "learning_rate": 0.0004679267037213443, + "loss": 2.818, + "step": 8995 + }, + { + "epoch": 2.8427702756060964, + "grad_norm": 0.08507380551575273, + "learning_rate": 0.0004673763928919712, + "loss": 2.8349, + "step": 9000 + }, + { + "epoch": 2.8443496801705757, + "grad_norm": 0.08736533693565444, + "learning_rate": 0.0004668261217479032, + "loss": 2.7935, + "step": 9005 + }, + { + "epoch": 2.845929084735055, + "grad_norm": 0.1114869278836615, + "learning_rate": 0.0004662758909585233, + "loss": 2.8563, + "step": 9010 + }, + { + "epoch": 2.847508489299534, + "grad_norm": 0.10751009609851755, + "learning_rate": 0.00046572570119316495, + "loss": 2.8114, + "step": 9015 + }, + { + "epoch": 2.8490878938640134, + "grad_norm": 0.08412230037944082, + "learning_rate": 0.000465175553121112, + "loss": 2.7261, + "step": 9020 + }, + { + "epoch": 2.850667298428492, + "grad_norm": 0.08604848886254374, + "learning_rate": 0.0004646254474115973, + "loss": 2.825, + "step": 9025 + }, + { + "epoch": 2.8522467029929714, + "grad_norm": 0.09052552835236849, + "learning_rate": 0.00046407538473380215, + "loss": 2.909, + "step": 9030 + }, + { + "epoch": 2.8538261075574507, + "grad_norm": 0.08449371690829786, + "learning_rate": 0.0004635253657568561, + "loss": 2.9414, + "step": 9035 + }, + { + "epoch": 2.85540551212193, + "grad_norm": 0.09900171582846604, + "learning_rate": 0.00046297539114983476, + "loss": 2.8186, + "step": 9040 + }, + { + "epoch": 2.856984916686409, + "grad_norm": 0.11936665449629383, + "learning_rate": 0.00046242546158176026, + "loss": 2.8067, + "step": 9045 + }, + { + "epoch": 2.8585643212508884, + "grad_norm": 0.10878964794862939, + "learning_rate": 0.0004618755777215998, + "loss": 2.8312, + "step": 9050 + }, + { + "epoch": 2.8601437258153677, + "grad_norm": 0.10594367710493519, + "learning_rate": 0.0004613257402382647, + "loss": 2.7821, + "step": 9055 + }, + { + "epoch": 2.861723130379847, + "grad_norm": 0.10774059810851142, + "learning_rate": 0.0004607759498006104, + "loss": 2.9061, + "step": 9060 + }, + { + "epoch": 2.863302534944326, + "grad_norm": 0.10285755907368245, + "learning_rate": 0.000460226207077435, + "loss": 2.8355, + "step": 9065 + }, + { + "epoch": 2.8648819395088054, + "grad_norm": 0.10351095774021554, + "learning_rate": 0.000459676512737478, + "loss": 2.8312, + "step": 9070 + }, + { + "epoch": 2.8664613440732842, + "grad_norm": 0.09887969502613489, + "learning_rate": 0.0004591268674494207, + "loss": 2.7871, + "step": 9075 + }, + { + "epoch": 2.8680407486377635, + "grad_norm": 0.10901193731190399, + "learning_rate": 0.00045857727188188425, + "loss": 2.8475, + "step": 9080 + }, + { + "epoch": 2.8696201532022427, + "grad_norm": 0.10770970500201059, + "learning_rate": 0.0004580277267034299, + "loss": 2.8679, + "step": 9085 + }, + { + "epoch": 2.871199557766722, + "grad_norm": 0.10326053968677797, + "learning_rate": 0.000457478232582557, + "loss": 2.7711, + "step": 9090 + }, + { + "epoch": 2.8727789623312012, + "grad_norm": 0.09136407583935778, + "learning_rate": 0.000456928790187703, + "loss": 2.8492, + "step": 9095 + }, + { + "epoch": 2.8743583668956805, + "grad_norm": 0.08441720449514221, + "learning_rate": 0.00045637940018724275, + "loss": 2.9713, + "step": 9100 + }, + { + "epoch": 2.8759377714601593, + "grad_norm": 0.09899362460145938, + "learning_rate": 0.00045583006324948654, + "loss": 2.8082, + "step": 9105 + }, + { + "epoch": 2.8775171760246385, + "grad_norm": 0.10434826649888167, + "learning_rate": 0.0004552807800426812, + "loss": 2.7625, + "step": 9110 + }, + { + "epoch": 2.8790965805891178, + "grad_norm": 0.09849217908206793, + "learning_rate": 0.0004547315512350075, + "loss": 2.7703, + "step": 9115 + }, + { + "epoch": 2.880675985153597, + "grad_norm": 0.0899109070075441, + "learning_rate": 0.00045418237749457994, + "loss": 2.7332, + "step": 9120 + }, + { + "epoch": 2.8822553897180763, + "grad_norm": 0.09411693700074429, + "learning_rate": 0.0004536332594894466, + "loss": 2.8315, + "step": 9125 + }, + { + "epoch": 2.8838347942825555, + "grad_norm": 0.11307082588068913, + "learning_rate": 0.00045308419788758704, + "loss": 2.8794, + "step": 9130 + }, + { + "epoch": 2.8854141988470348, + "grad_norm": 0.12481609564961511, + "learning_rate": 0.00045253519335691306, + "loss": 2.8332, + "step": 9135 + }, + { + "epoch": 2.886993603411514, + "grad_norm": 0.11526490471981808, + "learning_rate": 0.00045198624656526634, + "loss": 2.8797, + "step": 9140 + }, + { + "epoch": 2.8885730079759933, + "grad_norm": 0.08616360945328765, + "learning_rate": 0.0004514373581804187, + "loss": 2.8677, + "step": 9145 + }, + { + "epoch": 2.8901524125404725, + "grad_norm": 0.09639886901063495, + "learning_rate": 0.00045088852887007055, + "loss": 2.8371, + "step": 9150 + }, + { + "epoch": 2.8917318171049513, + "grad_norm": 0.09426024023747775, + "learning_rate": 0.00045033975930185074, + "loss": 2.8256, + "step": 9155 + }, + { + "epoch": 2.8933112216694306, + "grad_norm": 0.09823088057499815, + "learning_rate": 0.0004497910501433153, + "loss": 2.7695, + "step": 9160 + }, + { + "epoch": 2.89489062623391, + "grad_norm": 0.09651129158000497, + "learning_rate": 0.0004492424020619472, + "loss": 2.8388, + "step": 9165 + }, + { + "epoch": 2.896470030798389, + "grad_norm": 0.11108150616205786, + "learning_rate": 0.0004486938157251543, + "loss": 2.824, + "step": 9170 + }, + { + "epoch": 2.8980494353628683, + "grad_norm": 0.10056751400490413, + "learning_rate": 0.00044814529180027025, + "loss": 2.7821, + "step": 9175 + }, + { + "epoch": 2.8996288399273475, + "grad_norm": 0.10387820407099511, + "learning_rate": 0.0004475968309545519, + "loss": 2.8305, + "step": 9180 + }, + { + "epoch": 2.9012082444918263, + "grad_norm": 0.10673816323056329, + "learning_rate": 0.0004470484338551799, + "loss": 2.7528, + "step": 9185 + }, + { + "epoch": 2.9027876490563056, + "grad_norm": 0.10360115384761291, + "learning_rate": 0.00044650010116925744, + "loss": 2.8434, + "step": 9190 + }, + { + "epoch": 2.904367053620785, + "grad_norm": 0.11180438845853563, + "learning_rate": 0.00044595183356380916, + "loss": 2.8672, + "step": 9195 + }, + { + "epoch": 2.905946458185264, + "grad_norm": 0.0994512434438589, + "learning_rate": 0.0004454036317057803, + "loss": 2.7964, + "step": 9200 + }, + { + "epoch": 2.9075258627497433, + "grad_norm": 0.15167604489166178, + "learning_rate": 0.00044485549626203653, + "loss": 2.8185, + "step": 9205 + }, + { + "epoch": 2.9091052673142226, + "grad_norm": 0.09398806231071076, + "learning_rate": 0.00044430742789936244, + "loss": 2.8173, + "step": 9210 + }, + { + "epoch": 2.910684671878702, + "grad_norm": 0.10992730250826928, + "learning_rate": 0.00044375942728446145, + "loss": 2.8127, + "step": 9215 + }, + { + "epoch": 2.912264076443181, + "grad_norm": 0.13652749666712283, + "learning_rate": 0.0004432114950839539, + "loss": 2.891, + "step": 9220 + }, + { + "epoch": 2.9138434810076603, + "grad_norm": 0.10436724895098912, + "learning_rate": 0.00044266363196437757, + "loss": 2.8738, + "step": 9225 + }, + { + "epoch": 2.9154228855721396, + "grad_norm": 0.11944492929699857, + "learning_rate": 0.0004421158385921856, + "loss": 2.9314, + "step": 9230 + }, + { + "epoch": 2.9170022901366184, + "grad_norm": 0.11991906724849831, + "learning_rate": 0.0004415681156337466, + "loss": 2.8207, + "step": 9235 + }, + { + "epoch": 2.9185816947010976, + "grad_norm": 0.10846283055092165, + "learning_rate": 0.0004410204637553437, + "loss": 2.8969, + "step": 9240 + }, + { + "epoch": 2.920161099265577, + "grad_norm": 0.12047046083644518, + "learning_rate": 0.00044047288362317346, + "loss": 2.9509, + "step": 9245 + }, + { + "epoch": 2.921740503830056, + "grad_norm": 0.0980365989292603, + "learning_rate": 0.00043992537590334483, + "loss": 2.8516, + "step": 9250 + }, + { + "epoch": 2.9233199083945354, + "grad_norm": 0.11669989052911783, + "learning_rate": 0.000439377941261879, + "loss": 3.0115, + "step": 9255 + }, + { + "epoch": 2.924899312959014, + "grad_norm": 0.11700862846287295, + "learning_rate": 0.0004388305803647079, + "loss": 2.8834, + "step": 9260 + }, + { + "epoch": 2.9264787175234934, + "grad_norm": 0.11928964488487814, + "learning_rate": 0.0004382832938776747, + "loss": 2.9026, + "step": 9265 + }, + { + "epoch": 2.9280581220879727, + "grad_norm": 0.09061663401562066, + "learning_rate": 0.0004377360824665309, + "loss": 2.8714, + "step": 9270 + }, + { + "epoch": 2.929637526652452, + "grad_norm": 0.1143067185517572, + "learning_rate": 0.0004371889467969373, + "loss": 2.8862, + "step": 9275 + }, + { + "epoch": 2.931216931216931, + "grad_norm": 0.11881429925573886, + "learning_rate": 0.00043664188753446236, + "loss": 2.9086, + "step": 9280 + }, + { + "epoch": 2.9327963357814104, + "grad_norm": 0.10121970992950821, + "learning_rate": 0.0004360949053445816, + "loss": 2.8306, + "step": 9285 + }, + { + "epoch": 2.9343757403458897, + "grad_norm": 0.10582419788273659, + "learning_rate": 0.000435548000892677, + "loss": 2.7933, + "step": 9290 + }, + { + "epoch": 2.935955144910369, + "grad_norm": 0.08778865885627898, + "learning_rate": 0.00043500117484403586, + "loss": 2.7393, + "step": 9295 + }, + { + "epoch": 2.937534549474848, + "grad_norm": 0.09920218371219021, + "learning_rate": 0.00043445442786384984, + "loss": 2.9221, + "step": 9300 + }, + { + "epoch": 2.9391139540393274, + "grad_norm": 0.09361706009393304, + "learning_rate": 0.0004339077606172149, + "loss": 2.892, + "step": 9305 + }, + { + "epoch": 2.940693358603806, + "grad_norm": 0.09808388861515303, + "learning_rate": 0.0004333611737691295, + "loss": 2.7733, + "step": 9310 + }, + { + "epoch": 2.9422727631682855, + "grad_norm": 0.08103140627641786, + "learning_rate": 0.00043281466798449455, + "loss": 2.8325, + "step": 9315 + }, + { + "epoch": 2.9438521677327647, + "grad_norm": 0.10336914158105583, + "learning_rate": 0.00043226824392811255, + "loss": 2.9077, + "step": 9320 + }, + { + "epoch": 2.945431572297244, + "grad_norm": 0.10776845094554324, + "learning_rate": 0.0004317219022646864, + "loss": 2.8252, + "step": 9325 + }, + { + "epoch": 2.947010976861723, + "grad_norm": 0.10347109603302487, + "learning_rate": 0.00043117564365881847, + "loss": 2.9906, + "step": 9330 + }, + { + "epoch": 2.9485903814262024, + "grad_norm": 0.10262762014013083, + "learning_rate": 0.0004306294687750107, + "loss": 2.8306, + "step": 9335 + }, + { + "epoch": 2.9501697859906812, + "grad_norm": 0.11348081425096533, + "learning_rate": 0.0004300833782776623, + "loss": 2.7378, + "step": 9340 + }, + { + "epoch": 2.9517491905551605, + "grad_norm": 0.08643982582717907, + "learning_rate": 0.00042953737283107116, + "loss": 2.8455, + "step": 9345 + }, + { + "epoch": 2.9533285951196397, + "grad_norm": 0.08220527655592914, + "learning_rate": 0.0004289914530994303, + "loss": 2.7756, + "step": 9350 + }, + { + "epoch": 2.954907999684119, + "grad_norm": 0.18118766564372518, + "learning_rate": 0.0004284456197468296, + "loss": 2.8494, + "step": 9355 + }, + { + "epoch": 2.9564874042485982, + "grad_norm": 0.13143823135376625, + "learning_rate": 0.000427899873437253, + "loss": 2.7331, + "step": 9360 + }, + { + "epoch": 2.9580668088130775, + "grad_norm": 0.10058361363559926, + "learning_rate": 0.00042735421483457885, + "loss": 2.8321, + "step": 9365 + }, + { + "epoch": 2.9596462133775567, + "grad_norm": 0.12211319602843974, + "learning_rate": 0.0004268086446025793, + "loss": 2.8373, + "step": 9370 + }, + { + "epoch": 2.961225617942036, + "grad_norm": 0.11504672751453296, + "learning_rate": 0.00042626316340491836, + "loss": 2.8539, + "step": 9375 + }, + { + "epoch": 2.9628050225065152, + "grad_norm": 0.1103739918482253, + "learning_rate": 0.00042571777190515193, + "loss": 2.8314, + "step": 9380 + }, + { + "epoch": 2.9643844270709945, + "grad_norm": 0.11447689290182686, + "learning_rate": 0.00042517247076672695, + "loss": 2.7418, + "step": 9385 + }, + { + "epoch": 2.9659638316354733, + "grad_norm": 0.09751401147293268, + "learning_rate": 0.00042462726065297995, + "loss": 2.7837, + "step": 9390 + }, + { + "epoch": 2.9675432361999525, + "grad_norm": 0.1048744986956253, + "learning_rate": 0.00042408214222713745, + "loss": 2.7562, + "step": 9395 + }, + { + "epoch": 2.9691226407644318, + "grad_norm": 0.09325057594748949, + "learning_rate": 0.00042353711615231404, + "loss": 2.8928, + "step": 9400 + }, + { + "epoch": 2.970702045328911, + "grad_norm": 0.1316897961539587, + "learning_rate": 0.0004229921830915121, + "loss": 2.8762, + "step": 9405 + }, + { + "epoch": 2.9722814498933903, + "grad_norm": 0.11093338724315298, + "learning_rate": 0.00042244734370762036, + "loss": 2.8263, + "step": 9410 + }, + { + "epoch": 2.9738608544578695, + "grad_norm": 0.10257766149567363, + "learning_rate": 0.0004219025986634143, + "loss": 2.756, + "step": 9415 + }, + { + "epoch": 2.9754402590223483, + "grad_norm": 0.0990967327529078, + "learning_rate": 0.00042135794862155454, + "loss": 2.8062, + "step": 9420 + }, + { + "epoch": 2.9770196635868276, + "grad_norm": 0.10690558250376438, + "learning_rate": 0.0004208133942445855, + "loss": 2.7662, + "step": 9425 + }, + { + "epoch": 2.978599068151307, + "grad_norm": 0.11197117438239503, + "learning_rate": 0.00042026893619493593, + "loss": 2.789, + "step": 9430 + }, + { + "epoch": 2.980178472715786, + "grad_norm": 0.10816001438345199, + "learning_rate": 0.00041972457513491724, + "loss": 2.8505, + "step": 9435 + }, + { + "epoch": 2.9817578772802653, + "grad_norm": 0.1365836125794437, + "learning_rate": 0.00041918031172672235, + "loss": 2.8172, + "step": 9440 + }, + { + "epoch": 2.9833372818447446, + "grad_norm": 0.11740081577821014, + "learning_rate": 0.00041863614663242615, + "loss": 2.8843, + "step": 9445 + }, + { + "epoch": 2.984916686409224, + "grad_norm": 0.10315196514950942, + "learning_rate": 0.0004180920805139835, + "loss": 2.8653, + "step": 9450 + }, + { + "epoch": 2.986496090973703, + "grad_norm": 0.09234739238344229, + "learning_rate": 0.000417548114033229, + "loss": 2.7875, + "step": 9455 + }, + { + "epoch": 2.9880754955381823, + "grad_norm": 0.10192427331468423, + "learning_rate": 0.00041700424785187586, + "loss": 2.9056, + "step": 9460 + }, + { + "epoch": 2.9896549001026615, + "grad_norm": 0.09815002857062627, + "learning_rate": 0.0004164604826315155, + "loss": 2.8655, + "step": 9465 + }, + { + "epoch": 2.9912343046671404, + "grad_norm": 0.09440318060463548, + "learning_rate": 0.00041591681903361616, + "loss": 2.8273, + "step": 9470 + }, + { + "epoch": 2.9928137092316196, + "grad_norm": 0.08523804957990835, + "learning_rate": 0.00041537325771952305, + "loss": 2.8855, + "step": 9475 + }, + { + "epoch": 2.994393113796099, + "grad_norm": 0.10960347492187537, + "learning_rate": 0.00041482979935045656, + "loss": 2.8851, + "step": 9480 + }, + { + "epoch": 2.995972518360578, + "grad_norm": 0.11222549891471653, + "learning_rate": 0.000414286444587512, + "loss": 2.8748, + "step": 9485 + }, + { + "epoch": 2.9975519229250573, + "grad_norm": 0.11218516292236376, + "learning_rate": 0.0004137431940916584, + "loss": 2.8498, + "step": 9490 + }, + { + "epoch": 2.9991313274895366, + "grad_norm": 0.10958045434515254, + "learning_rate": 0.00041320004852373805, + "loss": 2.7871, + "step": 9495 + }, + { + "epoch": 3.0, + "eval_loss": 2.820634365081787, + "eval_runtime": 118.7878, + "eval_samples_per_second": 22.3, + "eval_steps_per_second": 5.581, + "step": 9498 + }, + { + "epoch": 3.0006317618257916, + "grad_norm": 0.11137564271390861, + "learning_rate": 0.00041265700854446605, + "loss": 2.7761, + "step": 9500 + }, + { + "epoch": 3.002211166390271, + "grad_norm": 0.10824032817067379, + "learning_rate": 0.0004121140748144283, + "loss": 2.9545, + "step": 9505 + }, + { + "epoch": 3.00379057095475, + "grad_norm": 0.12297364510513963, + "learning_rate": 0.0004115712479940821, + "loss": 2.8387, + "step": 9510 + }, + { + "epoch": 3.0053699755192294, + "grad_norm": 0.12154359734086224, + "learning_rate": 0.00041102852874375437, + "loss": 2.7854, + "step": 9515 + }, + { + "epoch": 3.0069493800837086, + "grad_norm": 0.0961953283591355, + "learning_rate": 0.000410485917723641, + "loss": 2.7951, + "step": 9520 + }, + { + "epoch": 3.008528784648188, + "grad_norm": 0.10474249931035579, + "learning_rate": 0.0004099434155938068, + "loss": 2.8063, + "step": 9525 + }, + { + "epoch": 3.0101081892126667, + "grad_norm": 0.10054444997655539, + "learning_rate": 0.0004094010230141837, + "loss": 2.7327, + "step": 9530 + }, + { + "epoch": 3.011687593777146, + "grad_norm": 0.09983385399225517, + "learning_rate": 0.0004088587406445703, + "loss": 2.7407, + "step": 9535 + }, + { + "epoch": 3.013266998341625, + "grad_norm": 0.08884849973502548, + "learning_rate": 0.0004083165691446313, + "loss": 2.8095, + "step": 9540 + }, + { + "epoch": 3.0148464029061044, + "grad_norm": 0.09585206257667299, + "learning_rate": 0.0004077745091738966, + "loss": 2.7738, + "step": 9545 + }, + { + "epoch": 3.0164258074705836, + "grad_norm": 0.10351425998687791, + "learning_rate": 0.00040723256139176044, + "loss": 2.8037, + "step": 9550 + }, + { + "epoch": 3.018005212035063, + "grad_norm": 0.10581996363930153, + "learning_rate": 0.0004066907264574803, + "loss": 2.8743, + "step": 9555 + }, + { + "epoch": 3.019584616599542, + "grad_norm": 0.10572898237852973, + "learning_rate": 0.00040614900503017665, + "loss": 2.8234, + "step": 9560 + }, + { + "epoch": 3.0211640211640214, + "grad_norm": 0.09120253306098203, + "learning_rate": 0.0004056073977688319, + "loss": 2.781, + "step": 9565 + }, + { + "epoch": 3.0227434257285, + "grad_norm": 0.09007962326320887, + "learning_rate": 0.0004050659053322892, + "loss": 2.8946, + "step": 9570 + }, + { + "epoch": 3.0243228302929794, + "grad_norm": 0.09230225995362885, + "learning_rate": 0.0004045245283792526, + "loss": 2.7777, + "step": 9575 + }, + { + "epoch": 3.0259022348574587, + "grad_norm": 0.14730275783137586, + "learning_rate": 0.0004039832675682854, + "loss": 2.7663, + "step": 9580 + }, + { + "epoch": 3.027481639421938, + "grad_norm": 0.11353377462122083, + "learning_rate": 0.0004034421235578093, + "loss": 2.8319, + "step": 9585 + }, + { + "epoch": 3.029061043986417, + "grad_norm": 0.1082096076103628, + "learning_rate": 0.0004029010970061044, + "loss": 2.8172, + "step": 9590 + }, + { + "epoch": 3.0306404485508964, + "grad_norm": 0.1252526709821488, + "learning_rate": 0.00040236018857130776, + "loss": 2.7878, + "step": 9595 + }, + { + "epoch": 3.0322198531153757, + "grad_norm": 0.10508346750056435, + "learning_rate": 0.00040181939891141273, + "loss": 2.805, + "step": 9600 + }, + { + "epoch": 3.0337992576798545, + "grad_norm": 0.11012507855534207, + "learning_rate": 0.00040127872868426807, + "loss": 2.8378, + "step": 9605 + }, + { + "epoch": 3.0353786622443337, + "grad_norm": 0.0905549160336993, + "learning_rate": 0.00040073817854757753, + "loss": 2.8586, + "step": 9610 + }, + { + "epoch": 3.036958066808813, + "grad_norm": 0.10658082029039984, + "learning_rate": 0.0004001977491588984, + "loss": 2.7928, + "step": 9615 + }, + { + "epoch": 3.038537471373292, + "grad_norm": 0.08473814628075636, + "learning_rate": 0.0003996574411756412, + "loss": 2.8769, + "step": 9620 + }, + { + "epoch": 3.0401168759377715, + "grad_norm": 0.10456114140621427, + "learning_rate": 0.00039911725525506914, + "loss": 2.8539, + "step": 9625 + }, + { + "epoch": 3.0416962805022507, + "grad_norm": 0.0988988689051054, + "learning_rate": 0.0003985771920542967, + "loss": 2.806, + "step": 9630 + }, + { + "epoch": 3.04327568506673, + "grad_norm": 0.09450173016461, + "learning_rate": 0.00039803725223028864, + "loss": 2.851, + "step": 9635 + }, + { + "epoch": 3.044855089631209, + "grad_norm": 0.12539248662855523, + "learning_rate": 0.00039749743643986035, + "loss": 2.8271, + "step": 9640 + }, + { + "epoch": 3.046434494195688, + "grad_norm": 0.10030288651138776, + "learning_rate": 0.00039695774533967586, + "loss": 2.7322, + "step": 9645 + }, + { + "epoch": 3.0480138987601673, + "grad_norm": 0.09442194681297006, + "learning_rate": 0.0003964181795862476, + "loss": 2.8108, + "step": 9650 + }, + { + "epoch": 3.0495933033246465, + "grad_norm": 0.09793258439010631, + "learning_rate": 0.00039587873983593585, + "loss": 2.8668, + "step": 9655 + }, + { + "epoch": 3.0511727078891258, + "grad_norm": 0.09533279772919158, + "learning_rate": 0.00039533942674494735, + "loss": 2.7553, + "step": 9660 + }, + { + "epoch": 3.052752112453605, + "grad_norm": 0.09165559609359519, + "learning_rate": 0.00039480024096933455, + "loss": 2.7224, + "step": 9665 + }, + { + "epoch": 3.0543315170180843, + "grad_norm": 0.08899617957042029, + "learning_rate": 0.0003942611831649953, + "loss": 2.8608, + "step": 9670 + }, + { + "epoch": 3.0559109215825635, + "grad_norm": 0.12462258209023344, + "learning_rate": 0.00039372225398767176, + "loss": 2.799, + "step": 9675 + }, + { + "epoch": 3.0574903261470427, + "grad_norm": 0.09919996000954215, + "learning_rate": 0.0003931834540929498, + "loss": 2.7263, + "step": 9680 + }, + { + "epoch": 3.0590697307115216, + "grad_norm": 0.10144490387767188, + "learning_rate": 0.0003926447841362575, + "loss": 2.7332, + "step": 9685 + }, + { + "epoch": 3.060649135276001, + "grad_norm": 0.13350840711628595, + "learning_rate": 0.0003921062447728654, + "loss": 2.9546, + "step": 9690 + }, + { + "epoch": 3.06222853984048, + "grad_norm": 0.11599137513418321, + "learning_rate": 0.0003915678366578848, + "loss": 2.7852, + "step": 9695 + }, + { + "epoch": 3.0638079444049593, + "grad_norm": 0.09011280977038039, + "learning_rate": 0.00039102956044626745, + "loss": 2.7473, + "step": 9700 + }, + { + "epoch": 3.0653873489694385, + "grad_norm": 0.10744348562991023, + "learning_rate": 0.000390491416792805, + "loss": 2.8039, + "step": 9705 + }, + { + "epoch": 3.066966753533918, + "grad_norm": 0.1076047456045747, + "learning_rate": 0.00038995340635212747, + "loss": 2.8721, + "step": 9710 + }, + { + "epoch": 3.068546158098397, + "grad_norm": 0.11090394334029989, + "learning_rate": 0.0003894155297787027, + "loss": 2.9012, + "step": 9715 + }, + { + "epoch": 3.0701255626628763, + "grad_norm": 0.09807974395489591, + "learning_rate": 0.00038887778772683605, + "loss": 2.7818, + "step": 9720 + }, + { + "epoch": 3.071704967227355, + "grad_norm": 0.10706239708253429, + "learning_rate": 0.0003883401808506688, + "loss": 2.7635, + "step": 9725 + }, + { + "epoch": 3.0732843717918343, + "grad_norm": 0.10143064130984526, + "learning_rate": 0.0003878027098041786, + "loss": 2.7579, + "step": 9730 + }, + { + "epoch": 3.0748637763563136, + "grad_norm": 0.10400907430117945, + "learning_rate": 0.00038726537524117713, + "loss": 2.8069, + "step": 9735 + }, + { + "epoch": 3.076443180920793, + "grad_norm": 0.09316922670618225, + "learning_rate": 0.00038672817781531025, + "loss": 2.8639, + "step": 9740 + }, + { + "epoch": 3.078022585485272, + "grad_norm": 0.09349735335320371, + "learning_rate": 0.0003861911181800568, + "loss": 2.7537, + "step": 9745 + }, + { + "epoch": 3.0796019900497513, + "grad_norm": 0.08828836960714911, + "learning_rate": 0.00038565419698872837, + "loss": 2.774, + "step": 9750 + }, + { + "epoch": 3.0811813946142306, + "grad_norm": 0.10118948073176658, + "learning_rate": 0.0003851174148944681, + "loss": 2.7001, + "step": 9755 + }, + { + "epoch": 3.08276079917871, + "grad_norm": 0.09821378293596399, + "learning_rate": 0.0003845807725502499, + "loss": 2.7579, + "step": 9760 + }, + { + "epoch": 3.0843402037431886, + "grad_norm": 0.10329086950182034, + "learning_rate": 0.0003840442706088772, + "loss": 2.829, + "step": 9765 + }, + { + "epoch": 3.085919608307668, + "grad_norm": 0.11605258739575236, + "learning_rate": 0.00038350790972298336, + "loss": 2.8862, + "step": 9770 + }, + { + "epoch": 3.087499012872147, + "grad_norm": 0.09100937689590838, + "learning_rate": 0.0003829716905450294, + "loss": 2.8117, + "step": 9775 + }, + { + "epoch": 3.0890784174366264, + "grad_norm": 0.155556621997077, + "learning_rate": 0.00038243561372730496, + "loss": 2.9684, + "step": 9780 + }, + { + "epoch": 3.0906578220011056, + "grad_norm": 0.09450981898431092, + "learning_rate": 0.00038189967992192545, + "loss": 2.7816, + "step": 9785 + }, + { + "epoch": 3.092237226565585, + "grad_norm": 0.09959380196851375, + "learning_rate": 0.0003813638897808331, + "loss": 2.8895, + "step": 9790 + }, + { + "epoch": 3.093816631130064, + "grad_norm": 0.10169324490712747, + "learning_rate": 0.0003808282439557948, + "loss": 2.8379, + "step": 9795 + }, + { + "epoch": 3.0953960356945434, + "grad_norm": 0.10058561730050815, + "learning_rate": 0.0003802927430984023, + "loss": 2.7332, + "step": 9800 + }, + { + "epoch": 3.096975440259022, + "grad_norm": 0.11816001771566845, + "learning_rate": 0.00037975738786007055, + "loss": 2.7943, + "step": 9805 + }, + { + "epoch": 3.0985548448235014, + "grad_norm": 0.08826920345868178, + "learning_rate": 0.00037922217889203814, + "loss": 2.7637, + "step": 9810 + }, + { + "epoch": 3.1001342493879807, + "grad_norm": 0.12705683278388166, + "learning_rate": 0.0003786871168453649, + "loss": 2.8163, + "step": 9815 + }, + { + "epoch": 3.10171365395246, + "grad_norm": 0.11495920571191703, + "learning_rate": 0.00037815220237093244, + "loss": 2.836, + "step": 9820 + }, + { + "epoch": 3.103293058516939, + "grad_norm": 0.12432376343554186, + "learning_rate": 0.00037761743611944255, + "loss": 2.8075, + "step": 9825 + }, + { + "epoch": 3.1048724630814184, + "grad_norm": 0.09656694722527305, + "learning_rate": 0.0003770828187414169, + "loss": 2.8801, + "step": 9830 + }, + { + "epoch": 3.1064518676458976, + "grad_norm": 0.11029553326905037, + "learning_rate": 0.000376548350887196, + "loss": 2.7515, + "step": 9835 + }, + { + "epoch": 3.108031272210377, + "grad_norm": 0.0830397231016251, + "learning_rate": 0.0003760140332069387, + "loss": 2.7623, + "step": 9840 + }, + { + "epoch": 3.1096106767748557, + "grad_norm": 0.12136488671310046, + "learning_rate": 0.00037547986635062076, + "loss": 2.7761, + "step": 9845 + }, + { + "epoch": 3.111190081339335, + "grad_norm": 0.12969324968762255, + "learning_rate": 0.00037494585096803476, + "loss": 2.8342, + "step": 9850 + }, + { + "epoch": 3.112769485903814, + "grad_norm": 0.12352241296333281, + "learning_rate": 0.00037441198770878857, + "loss": 2.8738, + "step": 9855 + }, + { + "epoch": 3.1143488904682934, + "grad_norm": 0.14236331604294866, + "learning_rate": 0.0003738782772223059, + "loss": 2.7812, + "step": 9860 + }, + { + "epoch": 3.1159282950327727, + "grad_norm": 0.10979819536457726, + "learning_rate": 0.00037334472015782374, + "loss": 2.7447, + "step": 9865 + }, + { + "epoch": 3.117507699597252, + "grad_norm": 0.09192732565456906, + "learning_rate": 0.00037281131716439297, + "loss": 2.8563, + "step": 9870 + }, + { + "epoch": 3.119087104161731, + "grad_norm": 0.1005101546494545, + "learning_rate": 0.00037227806889087676, + "loss": 2.8625, + "step": 9875 + }, + { + "epoch": 3.1206665087262104, + "grad_norm": 0.10713773883542309, + "learning_rate": 0.00037174497598595, + "loss": 2.8055, + "step": 9880 + }, + { + "epoch": 3.1222459132906892, + "grad_norm": 0.09367041726000498, + "learning_rate": 0.0003712120390980992, + "loss": 2.7897, + "step": 9885 + }, + { + "epoch": 3.1238253178551685, + "grad_norm": 0.111252472750593, + "learning_rate": 0.00037067925887562033, + "loss": 2.8767, + "step": 9890 + }, + { + "epoch": 3.1254047224196477, + "grad_norm": 0.10653969198593435, + "learning_rate": 0.0003701466359666191, + "loss": 2.8137, + "step": 9895 + }, + { + "epoch": 3.126984126984127, + "grad_norm": 0.10360624517617704, + "learning_rate": 0.00036961417101901003, + "loss": 2.8624, + "step": 9900 + }, + { + "epoch": 3.1285635315486062, + "grad_norm": 0.08869417865804269, + "learning_rate": 0.00036908186468051496, + "loss": 2.7433, + "step": 9905 + }, + { + "epoch": 3.1301429361130855, + "grad_norm": 0.08963134101472572, + "learning_rate": 0.0003685497175986634, + "loss": 2.7508, + "step": 9910 + }, + { + "epoch": 3.1317223406775647, + "grad_norm": 0.08337069015473321, + "learning_rate": 0.00036801773042079085, + "loss": 2.749, + "step": 9915 + }, + { + "epoch": 3.1333017452420435, + "grad_norm": 0.11348349319289626, + "learning_rate": 0.00036748590379403833, + "loss": 2.8682, + "step": 9920 + }, + { + "epoch": 3.1348811498065228, + "grad_norm": 0.09468176150451076, + "learning_rate": 0.0003669542383653514, + "loss": 2.8021, + "step": 9925 + }, + { + "epoch": 3.136460554371002, + "grad_norm": 0.10888557336854489, + "learning_rate": 0.00036642273478147957, + "loss": 2.8114, + "step": 9930 + }, + { + "epoch": 3.1380399589354813, + "grad_norm": 0.09812788122332593, + "learning_rate": 0.000365891393688976, + "loss": 2.8673, + "step": 9935 + }, + { + "epoch": 3.1396193634999605, + "grad_norm": 0.10373767188692451, + "learning_rate": 0.0003653602157341953, + "loss": 2.8692, + "step": 9940 + }, + { + "epoch": 3.1411987680644398, + "grad_norm": 0.10425056849464936, + "learning_rate": 0.0003648292015632942, + "loss": 2.7598, + "step": 9945 + }, + { + "epoch": 3.142778172628919, + "grad_norm": 0.10437038930991278, + "learning_rate": 0.00036429835182223024, + "loss": 2.8128, + "step": 9950 + }, + { + "epoch": 3.1443575771933983, + "grad_norm": 0.08773464643806514, + "learning_rate": 0.00036376766715676053, + "loss": 2.8262, + "step": 9955 + }, + { + "epoch": 3.1459369817578775, + "grad_norm": 0.10175662607669828, + "learning_rate": 0.0003632371482124416, + "loss": 2.7323, + "step": 9960 + }, + { + "epoch": 3.1475163863223563, + "grad_norm": 0.08826994901790985, + "learning_rate": 0.00036270679563462873, + "loss": 2.8165, + "step": 9965 + }, + { + "epoch": 3.1490957908868356, + "grad_norm": 0.10089045643559466, + "learning_rate": 0.00036217661006847417, + "loss": 2.7692, + "step": 9970 + }, + { + "epoch": 3.150675195451315, + "grad_norm": 0.0963958566936885, + "learning_rate": 0.0003616465921589275, + "loss": 2.8025, + "step": 9975 + }, + { + "epoch": 3.152254600015794, + "grad_norm": 0.12276808862628645, + "learning_rate": 0.00036111674255073415, + "loss": 2.8106, + "step": 9980 + }, + { + "epoch": 3.1538340045802733, + "grad_norm": 0.11509772915867757, + "learning_rate": 0.0003605870618884345, + "loss": 2.8525, + "step": 9985 + }, + { + "epoch": 3.1554134091447525, + "grad_norm": 0.0983241286227083, + "learning_rate": 0.00036005755081636425, + "loss": 2.8276, + "step": 9990 + }, + { + "epoch": 3.156992813709232, + "grad_norm": 0.11038222745065804, + "learning_rate": 0.0003595282099786523, + "loss": 2.7956, + "step": 9995 + }, + { + "epoch": 3.1585722182737106, + "grad_norm": 0.10526613695833549, + "learning_rate": 0.0003589990400192201, + "loss": 2.8081, + "step": 10000 + }, + { + "epoch": 3.16015162283819, + "grad_norm": 0.09230450379133098, + "learning_rate": 0.00035847004158178186, + "loss": 2.7626, + "step": 10005 + }, + { + "epoch": 3.161731027402669, + "grad_norm": 0.12200782727310795, + "learning_rate": 0.0003579412153098428, + "loss": 2.8417, + "step": 10010 + }, + { + "epoch": 3.1633104319671483, + "grad_norm": 0.10083668653829685, + "learning_rate": 0.00035741256184669903, + "loss": 2.8797, + "step": 10015 + }, + { + "epoch": 3.1648898365316276, + "grad_norm": 0.11019845863132575, + "learning_rate": 0.00035688408183543586, + "loss": 2.7975, + "step": 10020 + }, + { + "epoch": 3.166469241096107, + "grad_norm": 0.11159164688843977, + "learning_rate": 0.0003563557759189282, + "loss": 2.7171, + "step": 10025 + }, + { + "epoch": 3.168048645660586, + "grad_norm": 0.10537361417941192, + "learning_rate": 0.00035582764473983896, + "loss": 2.8344, + "step": 10030 + }, + { + "epoch": 3.1696280502250653, + "grad_norm": 0.09542451280482253, + "learning_rate": 0.00035529968894061815, + "loss": 2.7607, + "step": 10035 + }, + { + "epoch": 3.1712074547895446, + "grad_norm": 0.08830766235266202, + "learning_rate": 0.0003547719091635031, + "loss": 2.7306, + "step": 10040 + }, + { + "epoch": 3.1727868593540234, + "grad_norm": 0.09387085756674177, + "learning_rate": 0.0003542443060505167, + "loss": 2.8492, + "step": 10045 + }, + { + "epoch": 3.1743662639185026, + "grad_norm": 0.10198816922678111, + "learning_rate": 0.0003537168802434666, + "loss": 2.8642, + "step": 10050 + }, + { + "epoch": 3.175945668482982, + "grad_norm": 0.13322243689409521, + "learning_rate": 0.00035318963238394524, + "loss": 2.6974, + "step": 10055 + }, + { + "epoch": 3.177525073047461, + "grad_norm": 0.1138497993736802, + "learning_rate": 0.0003526625631133283, + "loss": 2.7591, + "step": 10060 + }, + { + "epoch": 3.1791044776119404, + "grad_norm": 0.12308413441898473, + "learning_rate": 0.0003521356730727747, + "loss": 2.8083, + "step": 10065 + }, + { + "epoch": 3.1806838821764196, + "grad_norm": 0.10174180898082953, + "learning_rate": 0.00035160896290322466, + "loss": 2.7881, + "step": 10070 + }, + { + "epoch": 3.182263286740899, + "grad_norm": 0.09672870359748681, + "learning_rate": 0.0003510824332454, + "loss": 2.734, + "step": 10075 + }, + { + "epoch": 3.1838426913053777, + "grad_norm": 0.08547351924749266, + "learning_rate": 0.0003505560847398027, + "loss": 2.7196, + "step": 10080 + }, + { + "epoch": 3.185422095869857, + "grad_norm": 0.08723168197605567, + "learning_rate": 0.0003500299180267146, + "loss": 2.796, + "step": 10085 + }, + { + "epoch": 3.187001500434336, + "grad_norm": 0.08406353772095086, + "learning_rate": 0.0003495039337461966, + "loss": 2.7767, + "step": 10090 + }, + { + "epoch": 3.1885809049988154, + "grad_norm": 0.09271636093914204, + "learning_rate": 0.00034897813253808717, + "loss": 2.8579, + "step": 10095 + }, + { + "epoch": 3.1901603095632947, + "grad_norm": 0.09685899156814314, + "learning_rate": 0.0003484525150420024, + "loss": 2.802, + "step": 10100 + }, + { + "epoch": 3.191739714127774, + "grad_norm": 0.08296004406299497, + "learning_rate": 0.00034792708189733477, + "loss": 2.7294, + "step": 10105 + }, + { + "epoch": 3.193319118692253, + "grad_norm": 0.09573421531424595, + "learning_rate": 0.00034740183374325255, + "loss": 2.7771, + "step": 10110 + }, + { + "epoch": 3.1948985232567324, + "grad_norm": 0.1026502934765465, + "learning_rate": 0.00034687677121869885, + "loss": 2.7712, + "step": 10115 + }, + { + "epoch": 3.196477927821211, + "grad_norm": 0.1197013920449785, + "learning_rate": 0.0003463518949623914, + "loss": 2.7689, + "step": 10120 + }, + { + "epoch": 3.1980573323856905, + "grad_norm": 0.10892579808211512, + "learning_rate": 0.0003458272056128211, + "loss": 2.8117, + "step": 10125 + }, + { + "epoch": 3.1996367369501697, + "grad_norm": 0.11654615110479964, + "learning_rate": 0.000345302703808251, + "loss": 2.7712, + "step": 10130 + }, + { + "epoch": 3.201216141514649, + "grad_norm": 0.08037702352022942, + "learning_rate": 0.00034477839018671677, + "loss": 2.7656, + "step": 10135 + }, + { + "epoch": 3.202795546079128, + "grad_norm": 0.09728981313291721, + "learning_rate": 0.00034425426538602457, + "loss": 2.7511, + "step": 10140 + }, + { + "epoch": 3.2043749506436074, + "grad_norm": 0.09791391597565863, + "learning_rate": 0.00034373033004375154, + "loss": 2.7805, + "step": 10145 + }, + { + "epoch": 3.2059543552080867, + "grad_norm": 0.10599170335724191, + "learning_rate": 0.00034320658479724354, + "loss": 2.8416, + "step": 10150 + }, + { + "epoch": 3.207533759772566, + "grad_norm": 0.0871599731028459, + "learning_rate": 0.00034268303028361593, + "loss": 2.7097, + "step": 10155 + }, + { + "epoch": 3.2091131643370447, + "grad_norm": 0.0850412540153791, + "learning_rate": 0.00034215966713975135, + "loss": 2.8742, + "step": 10160 + }, + { + "epoch": 3.210692568901524, + "grad_norm": 0.08043909999541177, + "learning_rate": 0.0003416364960023001, + "loss": 2.7685, + "step": 10165 + }, + { + "epoch": 3.2122719734660032, + "grad_norm": 0.08821836004911278, + "learning_rate": 0.000341113517507679, + "loss": 2.7397, + "step": 10170 + }, + { + "epoch": 3.2138513780304825, + "grad_norm": 0.08649994686945457, + "learning_rate": 0.00034059073229207034, + "loss": 2.8747, + "step": 10175 + }, + { + "epoch": 3.2154307825949617, + "grad_norm": 0.10390318980319042, + "learning_rate": 0.0003400681409914211, + "loss": 2.9359, + "step": 10180 + }, + { + "epoch": 3.217010187159441, + "grad_norm": 0.09047296521943848, + "learning_rate": 0.00033954574424144274, + "loss": 2.8406, + "step": 10185 + }, + { + "epoch": 3.2185895917239202, + "grad_norm": 0.09267422370405444, + "learning_rate": 0.0003390235426776095, + "loss": 2.7719, + "step": 10190 + }, + { + "epoch": 3.2201689962883995, + "grad_norm": 0.09331054979286894, + "learning_rate": 0.00033850153693515915, + "loss": 2.8203, + "step": 10195 + }, + { + "epoch": 3.2217484008528783, + "grad_norm": 0.09305039470541963, + "learning_rate": 0.0003379797276490904, + "loss": 2.8131, + "step": 10200 + }, + { + "epoch": 3.2233278054173575, + "grad_norm": 0.08844182082942555, + "learning_rate": 0.00033745811545416327, + "loss": 2.7837, + "step": 10205 + }, + { + "epoch": 3.2249072099818368, + "grad_norm": 0.10344603201593157, + "learning_rate": 0.0003369367009848979, + "loss": 2.8531, + "step": 10210 + }, + { + "epoch": 3.226486614546316, + "grad_norm": 0.08988469720539088, + "learning_rate": 0.00033641548487557406, + "loss": 2.8106, + "step": 10215 + }, + { + "epoch": 3.2280660191107953, + "grad_norm": 0.09553604500726276, + "learning_rate": 0.00033589446776023023, + "loss": 2.9042, + "step": 10220 + }, + { + "epoch": 3.2296454236752745, + "grad_norm": 0.09403774521688882, + "learning_rate": 0.00033537365027266284, + "loss": 2.8888, + "step": 10225 + }, + { + "epoch": 3.2312248282397538, + "grad_norm": 0.09265964429985306, + "learning_rate": 0.0003348530330464252, + "loss": 2.7721, + "step": 10230 + }, + { + "epoch": 3.2328042328042326, + "grad_norm": 0.09851990312142413, + "learning_rate": 0.0003343326167148275, + "loss": 2.8712, + "step": 10235 + }, + { + "epoch": 3.234383637368712, + "grad_norm": 0.08719383377135549, + "learning_rate": 0.00033381240191093477, + "loss": 2.8202, + "step": 10240 + }, + { + "epoch": 3.235963041933191, + "grad_norm": 0.09013388573499846, + "learning_rate": 0.0003332923892675679, + "loss": 2.7795, + "step": 10245 + }, + { + "epoch": 3.2375424464976703, + "grad_norm": 0.08995633861369724, + "learning_rate": 0.0003327725794173011, + "loss": 2.8209, + "step": 10250 + }, + { + "epoch": 3.2391218510621496, + "grad_norm": 0.10856360963772668, + "learning_rate": 0.00033225297299246227, + "loss": 2.7667, + "step": 10255 + }, + { + "epoch": 3.240701255626629, + "grad_norm": 0.1051886244149574, + "learning_rate": 0.00033173357062513153, + "loss": 2.7641, + "step": 10260 + }, + { + "epoch": 3.242280660191108, + "grad_norm": 0.10921839089379684, + "learning_rate": 0.00033121437294714103, + "loss": 2.8636, + "step": 10265 + }, + { + "epoch": 3.2438600647555873, + "grad_norm": 0.10352380755657233, + "learning_rate": 0.000330695380590074, + "loss": 2.8334, + "step": 10270 + }, + { + "epoch": 3.2454394693200666, + "grad_norm": 0.1100045225541539, + "learning_rate": 0.00033017659418526366, + "loss": 2.8854, + "step": 10275 + }, + { + "epoch": 3.2470188738845454, + "grad_norm": 0.09270834220250747, + "learning_rate": 0.0003296580143637927, + "loss": 2.6529, + "step": 10280 + }, + { + "epoch": 3.2485982784490246, + "grad_norm": 0.08939893829290244, + "learning_rate": 0.0003291396417564927, + "loss": 2.8705, + "step": 10285 + }, + { + "epoch": 3.250177683013504, + "grad_norm": 0.10116051723993615, + "learning_rate": 0.00032862147699394307, + "loss": 2.7462, + "step": 10290 + }, + { + "epoch": 3.251757087577983, + "grad_norm": 0.08254665948382854, + "learning_rate": 0.0003281035207064702, + "loss": 2.7523, + "step": 10295 + }, + { + "epoch": 3.2533364921424623, + "grad_norm": 0.0927012310467256, + "learning_rate": 0.00032758577352414743, + "loss": 2.8168, + "step": 10300 + }, + { + "epoch": 3.2549158967069416, + "grad_norm": 0.0727240409285992, + "learning_rate": 0.0003270682360767933, + "loss": 2.7105, + "step": 10305 + }, + { + "epoch": 3.256495301271421, + "grad_norm": 0.07641937991051459, + "learning_rate": 0.000326550908993971, + "loss": 2.6676, + "step": 10310 + }, + { + "epoch": 3.2580747058358996, + "grad_norm": 0.09083967725825964, + "learning_rate": 0.00032603379290498845, + "loss": 2.8156, + "step": 10315 + }, + { + "epoch": 3.259654110400379, + "grad_norm": 0.10311120598563625, + "learning_rate": 0.0003255168884388962, + "loss": 2.7448, + "step": 10320 + }, + { + "epoch": 3.261233514964858, + "grad_norm": 0.12285696927565312, + "learning_rate": 0.0003250001962244881, + "loss": 2.7653, + "step": 10325 + }, + { + "epoch": 3.2628129195293374, + "grad_norm": 0.10189192939942499, + "learning_rate": 0.00032448371689029916, + "loss": 2.74, + "step": 10330 + }, + { + "epoch": 3.2643923240938166, + "grad_norm": 0.08569907776229424, + "learning_rate": 0.000323967451064606, + "loss": 2.8235, + "step": 10335 + }, + { + "epoch": 3.265971728658296, + "grad_norm": 0.10915203379452394, + "learning_rate": 0.0003234513993754249, + "loss": 2.8322, + "step": 10340 + }, + { + "epoch": 3.267551133222775, + "grad_norm": 0.10559515915941756, + "learning_rate": 0.00032293556245051205, + "loss": 2.911, + "step": 10345 + }, + { + "epoch": 3.2691305377872544, + "grad_norm": 0.09274098866921747, + "learning_rate": 0.0003224199409173626, + "loss": 2.7084, + "step": 10350 + }, + { + "epoch": 3.2707099423517336, + "grad_norm": 0.10169340022249343, + "learning_rate": 0.00032190453540320905, + "loss": 2.7828, + "step": 10355 + }, + { + "epoch": 3.2722893469162124, + "grad_norm": 0.10538902352745173, + "learning_rate": 0.00032138934653502154, + "loss": 2.7102, + "step": 10360 + }, + { + "epoch": 3.2738687514806917, + "grad_norm": 0.13638379454753577, + "learning_rate": 0.0003208743749395068, + "loss": 2.8244, + "step": 10365 + }, + { + "epoch": 3.275448156045171, + "grad_norm": 0.14513930133062752, + "learning_rate": 0.00032035962124310675, + "loss": 2.8425, + "step": 10370 + }, + { + "epoch": 3.27702756060965, + "grad_norm": 0.08349472002147552, + "learning_rate": 0.00031984508607199873, + "loss": 2.7892, + "step": 10375 + }, + { + "epoch": 3.2786069651741294, + "grad_norm": 0.08104299402779074, + "learning_rate": 0.0003193307700520941, + "loss": 2.7423, + "step": 10380 + }, + { + "epoch": 3.2801863697386087, + "grad_norm": 0.08236614473434854, + "learning_rate": 0.0003188166738090377, + "loss": 2.8255, + "step": 10385 + }, + { + "epoch": 3.281765774303088, + "grad_norm": 0.08969459608903811, + "learning_rate": 0.00031830279796820656, + "loss": 2.7319, + "step": 10390 + }, + { + "epoch": 3.2833451788675667, + "grad_norm": 0.11484517090134234, + "learning_rate": 0.0003177891431547101, + "loss": 2.9348, + "step": 10395 + }, + { + "epoch": 3.284924583432046, + "grad_norm": 0.11750554407948001, + "learning_rate": 0.000317275709993389, + "loss": 2.7242, + "step": 10400 + }, + { + "epoch": 3.286503987996525, + "grad_norm": 0.12448605915336501, + "learning_rate": 0.00031676249910881374, + "loss": 2.8041, + "step": 10405 + }, + { + "epoch": 3.2880833925610045, + "grad_norm": 0.0826535725225018, + "learning_rate": 0.00031624951112528484, + "loss": 2.7142, + "step": 10410 + }, + { + "epoch": 3.2896627971254837, + "grad_norm": 0.09599474247091126, + "learning_rate": 0.0003157367466668316, + "loss": 2.7681, + "step": 10415 + }, + { + "epoch": 3.291242201689963, + "grad_norm": 0.08587535416157806, + "learning_rate": 0.00031522420635721107, + "loss": 2.7437, + "step": 10420 + }, + { + "epoch": 3.292821606254442, + "grad_norm": 0.08780026752528526, + "learning_rate": 0.00031471189081990814, + "loss": 2.7516, + "step": 10425 + }, + { + "epoch": 3.2944010108189214, + "grad_norm": 0.08833883706658188, + "learning_rate": 0.0003141998006781341, + "loss": 2.745, + "step": 10430 + }, + { + "epoch": 3.2959804153834007, + "grad_norm": 0.09358958670004401, + "learning_rate": 0.000313687936554826, + "loss": 2.81, + "step": 10435 + }, + { + "epoch": 3.2975598199478795, + "grad_norm": 0.10450158707026738, + "learning_rate": 0.0003131762990726457, + "loss": 2.85, + "step": 10440 + }, + { + "epoch": 3.2991392245123587, + "grad_norm": 0.09643583499425244, + "learning_rate": 0.0003126648888539798, + "loss": 2.8408, + "step": 10445 + }, + { + "epoch": 3.300718629076838, + "grad_norm": 0.08745485523804206, + "learning_rate": 0.0003121537065209382, + "loss": 2.7717, + "step": 10450 + }, + { + "epoch": 3.3022980336413172, + "grad_norm": 0.08974871708382999, + "learning_rate": 0.0003116427526953536, + "loss": 2.799, + "step": 10455 + }, + { + "epoch": 3.3038774382057965, + "grad_norm": 0.11404985610167258, + "learning_rate": 0.000311132027998781, + "loss": 2.8022, + "step": 10460 + }, + { + "epoch": 3.3054568427702757, + "grad_norm": 0.12015100369305097, + "learning_rate": 0.0003106215330524962, + "loss": 2.7478, + "step": 10465 + }, + { + "epoch": 3.307036247334755, + "grad_norm": 0.10342486455572253, + "learning_rate": 0.00031011126847749573, + "loss": 2.7389, + "step": 10470 + }, + { + "epoch": 3.308615651899234, + "grad_norm": 0.14353090615973468, + "learning_rate": 0.000309601234894496, + "loss": 2.8025, + "step": 10475 + }, + { + "epoch": 3.310195056463713, + "grad_norm": 0.12943337260732343, + "learning_rate": 0.0003090914329239325, + "loss": 2.785, + "step": 10480 + }, + { + "epoch": 3.3117744610281923, + "grad_norm": 0.11866623983880317, + "learning_rate": 0.0003085818631859585, + "loss": 2.9396, + "step": 10485 + }, + { + "epoch": 3.3133538655926715, + "grad_norm": 0.11061480562524836, + "learning_rate": 0.00030807252630044534, + "loss": 2.8291, + "step": 10490 + }, + { + "epoch": 3.314933270157151, + "grad_norm": 0.11813030425267225, + "learning_rate": 0.0003075634228869808, + "loss": 2.7673, + "step": 10495 + }, + { + "epoch": 3.31651267472163, + "grad_norm": 0.11844596787273207, + "learning_rate": 0.00030705455356486844, + "loss": 2.7361, + "step": 10500 + }, + { + "epoch": 3.3180920792861093, + "grad_norm": 0.1072983792160873, + "learning_rate": 0.00030654591895312765, + "loss": 2.8479, + "step": 10505 + }, + { + "epoch": 3.3196714838505885, + "grad_norm": 0.09770505021791172, + "learning_rate": 0.0003060375196704919, + "loss": 2.7568, + "step": 10510 + }, + { + "epoch": 3.3212508884150673, + "grad_norm": 0.08337777054489283, + "learning_rate": 0.00030552935633540836, + "loss": 2.7652, + "step": 10515 + }, + { + "epoch": 3.3228302929795466, + "grad_norm": 0.09925597058384632, + "learning_rate": 0.0003050214295660373, + "loss": 2.7135, + "step": 10520 + }, + { + "epoch": 3.324409697544026, + "grad_norm": 0.11284231096345444, + "learning_rate": 0.00030451373998025103, + "loss": 2.7402, + "step": 10525 + }, + { + "epoch": 3.325989102108505, + "grad_norm": 0.1161877790093571, + "learning_rate": 0.0003040062881956339, + "loss": 2.8164, + "step": 10530 + }, + { + "epoch": 3.3275685066729843, + "grad_norm": 0.11312346073070796, + "learning_rate": 0.00030349907482948033, + "loss": 2.8635, + "step": 10535 + }, + { + "epoch": 3.3291479112374636, + "grad_norm": 0.10143003846192548, + "learning_rate": 0.000302992100498795, + "loss": 2.8035, + "step": 10540 + }, + { + "epoch": 3.330727315801943, + "grad_norm": 0.11416579286384988, + "learning_rate": 0.00030248536582029177, + "loss": 2.6941, + "step": 10545 + }, + { + "epoch": 3.3323067203664216, + "grad_norm": 0.09756681992157866, + "learning_rate": 0.00030197887141039296, + "loss": 2.7982, + "step": 10550 + }, + { + "epoch": 3.333886124930901, + "grad_norm": 0.09618299258170292, + "learning_rate": 0.0003014726178852286, + "loss": 2.8402, + "step": 10555 + }, + { + "epoch": 3.33546552949538, + "grad_norm": 0.08986382600295917, + "learning_rate": 0.0003009666058606361, + "loss": 2.8109, + "step": 10560 + }, + { + "epoch": 3.3370449340598594, + "grad_norm": 0.10525015223717464, + "learning_rate": 0.00030046083595215825, + "loss": 2.7715, + "step": 10565 + }, + { + "epoch": 3.3386243386243386, + "grad_norm": 0.0871796093885151, + "learning_rate": 0.0002999553087750441, + "loss": 2.821, + "step": 10570 + }, + { + "epoch": 3.340203743188818, + "grad_norm": 0.08747846767578411, + "learning_rate": 0.0002994500249442467, + "loss": 2.824, + "step": 10575 + }, + { + "epoch": 3.341783147753297, + "grad_norm": 0.08686435971083976, + "learning_rate": 0.00029894498507442404, + "loss": 2.7703, + "step": 10580 + }, + { + "epoch": 3.3433625523177763, + "grad_norm": 0.0948554219898756, + "learning_rate": 0.00029844018977993647, + "loss": 2.8411, + "step": 10585 + }, + { + "epoch": 3.3449419568822556, + "grad_norm": 0.09792856835877507, + "learning_rate": 0.00029793563967484737, + "loss": 2.8168, + "step": 10590 + }, + { + "epoch": 3.3465213614467344, + "grad_norm": 0.09260715681034247, + "learning_rate": 0.00029743133537292146, + "loss": 2.7013, + "step": 10595 + }, + { + "epoch": 3.3481007660112136, + "grad_norm": 0.09671818934996361, + "learning_rate": 0.0002969272774876246, + "loss": 2.7433, + "step": 10600 + }, + { + "epoch": 3.349680170575693, + "grad_norm": 0.07068673760594434, + "learning_rate": 0.0002964234666321229, + "loss": 2.8983, + "step": 10605 + }, + { + "epoch": 3.351259575140172, + "grad_norm": 0.08911111410190489, + "learning_rate": 0.0002959199034192823, + "loss": 2.7506, + "step": 10610 + }, + { + "epoch": 3.3528389797046514, + "grad_norm": 0.08770121377828956, + "learning_rate": 0.0002954165884616669, + "loss": 2.6839, + "step": 10615 + }, + { + "epoch": 3.3544183842691306, + "grad_norm": 0.10305670333029622, + "learning_rate": 0.00029491352237153925, + "loss": 2.8885, + "step": 10620 + }, + { + "epoch": 3.35599778883361, + "grad_norm": 0.08706783547374539, + "learning_rate": 0.0002944107057608588, + "loss": 2.77, + "step": 10625 + }, + { + "epoch": 3.3575771933980887, + "grad_norm": 0.08626509141126583, + "learning_rate": 0.00029390813924128187, + "loss": 2.863, + "step": 10630 + }, + { + "epoch": 3.359156597962568, + "grad_norm": 0.07681826210713953, + "learning_rate": 0.0002934058234241604, + "loss": 2.8356, + "step": 10635 + }, + { + "epoch": 3.360736002527047, + "grad_norm": 0.08735499339197443, + "learning_rate": 0.00029290375892054144, + "loss": 2.8735, + "step": 10640 + }, + { + "epoch": 3.3623154070915264, + "grad_norm": 0.08461473487663433, + "learning_rate": 0.00029240194634116615, + "loss": 2.7668, + "step": 10645 + }, + { + "epoch": 3.3638948116560057, + "grad_norm": 0.08954118905206135, + "learning_rate": 0.00029190038629646925, + "loss": 2.7989, + "step": 10650 + }, + { + "epoch": 3.365474216220485, + "grad_norm": 0.08580660015308204, + "learning_rate": 0.0002913990793965785, + "loss": 2.8045, + "step": 10655 + }, + { + "epoch": 3.367053620784964, + "grad_norm": 0.09208676256006787, + "learning_rate": 0.00029089802625131356, + "loss": 2.7887, + "step": 10660 + }, + { + "epoch": 3.3686330253494434, + "grad_norm": 0.10581784088945766, + "learning_rate": 0.0002903972274701854, + "loss": 2.8631, + "step": 10665 + }, + { + "epoch": 3.3702124299139227, + "grad_norm": 0.08496701975959248, + "learning_rate": 0.00028989668366239557, + "loss": 2.9046, + "step": 10670 + }, + { + "epoch": 3.3717918344784015, + "grad_norm": 0.09830241330937724, + "learning_rate": 0.0002893963954368357, + "loss": 2.6857, + "step": 10675 + }, + { + "epoch": 3.3733712390428807, + "grad_norm": 0.10063256563142661, + "learning_rate": 0.00028889636340208557, + "loss": 2.796, + "step": 10680 + }, + { + "epoch": 3.37495064360736, + "grad_norm": 0.14218562279325908, + "learning_rate": 0.00028839658816641483, + "loss": 2.8761, + "step": 10685 + }, + { + "epoch": 3.376530048171839, + "grad_norm": 0.08569913365755431, + "learning_rate": 0.00028789707033777956, + "loss": 2.7671, + "step": 10690 + }, + { + "epoch": 3.3781094527363185, + "grad_norm": 0.08385131104192822, + "learning_rate": 0.0002873978105238234, + "loss": 2.7571, + "step": 10695 + }, + { + "epoch": 3.3796888573007977, + "grad_norm": 0.09983927441112955, + "learning_rate": 0.00028689880933187545, + "loss": 2.7044, + "step": 10700 + }, + { + "epoch": 3.381268261865277, + "grad_norm": 0.10194084669294304, + "learning_rate": 0.00028640006736895045, + "loss": 2.8349, + "step": 10705 + }, + { + "epoch": 3.3828476664297558, + "grad_norm": 0.0826488277966761, + "learning_rate": 0.0002859015852417485, + "loss": 2.7198, + "step": 10710 + }, + { + "epoch": 3.384427070994235, + "grad_norm": 0.08591632570056654, + "learning_rate": 0.00028540336355665287, + "loss": 2.7896, + "step": 10715 + }, + { + "epoch": 3.3860064755587143, + "grad_norm": 0.10755343923794718, + "learning_rate": 0.00028490540291972987, + "loss": 2.6768, + "step": 10720 + }, + { + "epoch": 3.3875858801231935, + "grad_norm": 0.0992873652173549, + "learning_rate": 0.00028440770393672876, + "loss": 2.7535, + "step": 10725 + }, + { + "epoch": 3.3891652846876728, + "grad_norm": 0.0907066773586846, + "learning_rate": 0.00028391026721308045, + "loss": 2.6961, + "step": 10730 + }, + { + "epoch": 3.390744689252152, + "grad_norm": 0.08815581129035133, + "learning_rate": 0.0002834130933538965, + "loss": 2.9166, + "step": 10735 + }, + { + "epoch": 3.3923240938166312, + "grad_norm": 0.09925993594935574, + "learning_rate": 0.00028291618296396903, + "loss": 2.8347, + "step": 10740 + }, + { + "epoch": 3.3939034983811105, + "grad_norm": 0.08854222364072202, + "learning_rate": 0.00028241953664776947, + "loss": 2.7158, + "step": 10745 + }, + { + "epoch": 3.3954829029455897, + "grad_norm": 0.10593514231411359, + "learning_rate": 0.00028192315500944815, + "loss": 2.75, + "step": 10750 + }, + { + "epoch": 3.3970623075100685, + "grad_norm": 0.08871145686865252, + "learning_rate": 0.0002814270386528335, + "loss": 2.8973, + "step": 10755 + }, + { + "epoch": 3.398641712074548, + "grad_norm": 0.10109032402250467, + "learning_rate": 0.00028093118818143056, + "loss": 2.8363, + "step": 10760 + }, + { + "epoch": 3.400221116639027, + "grad_norm": 0.08939857135079345, + "learning_rate": 0.000280435604198422, + "loss": 2.7487, + "step": 10765 + }, + { + "epoch": 3.4018005212035063, + "grad_norm": 0.07345794157408284, + "learning_rate": 0.00027994028730666566, + "loss": 2.7973, + "step": 10770 + }, + { + "epoch": 3.4033799257679855, + "grad_norm": 0.0976729103298637, + "learning_rate": 0.0002794452381086947, + "loss": 2.6475, + "step": 10775 + }, + { + "epoch": 3.404959330332465, + "grad_norm": 0.10542577295220248, + "learning_rate": 0.0002789504572067163, + "loss": 2.8145, + "step": 10780 + }, + { + "epoch": 3.406538734896944, + "grad_norm": 0.08879000218903442, + "learning_rate": 0.00027845594520261143, + "loss": 2.8503, + "step": 10785 + }, + { + "epoch": 3.408118139461423, + "grad_norm": 0.08992673674464459, + "learning_rate": 0.00027796170269793447, + "loss": 2.774, + "step": 10790 + }, + { + "epoch": 3.409697544025902, + "grad_norm": 0.08304893286281426, + "learning_rate": 0.0002774677302939115, + "loss": 2.8154, + "step": 10795 + }, + { + "epoch": 3.4112769485903813, + "grad_norm": 0.09054083293396371, + "learning_rate": 0.00027697402859143974, + "loss": 2.7216, + "step": 10800 + }, + { + "epoch": 3.4128563531548606, + "grad_norm": 0.11485679109368063, + "learning_rate": 0.0002764805981910875, + "loss": 2.779, + "step": 10805 + }, + { + "epoch": 3.41443575771934, + "grad_norm": 0.09986355742999838, + "learning_rate": 0.0002759874396930932, + "loss": 2.777, + "step": 10810 + }, + { + "epoch": 3.416015162283819, + "grad_norm": 0.08634040176509505, + "learning_rate": 0.0002754945536973642, + "loss": 2.7817, + "step": 10815 + }, + { + "epoch": 3.4175945668482983, + "grad_norm": 0.1387185273790178, + "learning_rate": 0.0002750019408034765, + "loss": 2.8803, + "step": 10820 + }, + { + "epoch": 3.4191739714127776, + "grad_norm": 0.09169017218900806, + "learning_rate": 0.00027450960161067386, + "loss": 2.8645, + "step": 10825 + }, + { + "epoch": 3.420753375977257, + "grad_norm": 0.09312706836528975, + "learning_rate": 0.00027401753671786713, + "loss": 2.7441, + "step": 10830 + }, + { + "epoch": 3.4223327805417356, + "grad_norm": 0.07638921895269593, + "learning_rate": 0.0002735257467236333, + "loss": 2.725, + "step": 10835 + }, + { + "epoch": 3.423912185106215, + "grad_norm": 0.09152532966595302, + "learning_rate": 0.0002730342322262153, + "loss": 2.8329, + "step": 10840 + }, + { + "epoch": 3.425491589670694, + "grad_norm": 0.10319959902796268, + "learning_rate": 0.0002725429938235207, + "loss": 2.7832, + "step": 10845 + }, + { + "epoch": 3.4270709942351734, + "grad_norm": 0.085158962330912, + "learning_rate": 0.00027205203211312114, + "loss": 2.7716, + "step": 10850 + }, + { + "epoch": 3.4286503987996526, + "grad_norm": 0.09396859809862895, + "learning_rate": 0.00027156134769225213, + "loss": 2.678, + "step": 10855 + }, + { + "epoch": 3.430229803364132, + "grad_norm": 0.09498066575529822, + "learning_rate": 0.0002710709411578108, + "loss": 2.7378, + "step": 10860 + }, + { + "epoch": 3.431809207928611, + "grad_norm": 0.10037601950402124, + "learning_rate": 0.0002705808131063576, + "loss": 2.8111, + "step": 10865 + }, + { + "epoch": 3.43338861249309, + "grad_norm": 0.09284252009302106, + "learning_rate": 0.0002700909641341136, + "loss": 2.7683, + "step": 10870 + }, + { + "epoch": 3.434968017057569, + "grad_norm": 0.09157793859610316, + "learning_rate": 0.00026960139483696, + "loss": 2.7061, + "step": 10875 + }, + { + "epoch": 3.4365474216220484, + "grad_norm": 0.10394576925136403, + "learning_rate": 0.00026911210581043827, + "loss": 2.7692, + "step": 10880 + }, + { + "epoch": 3.4381268261865277, + "grad_norm": 0.08246710987942577, + "learning_rate": 0.0002686230976497487, + "loss": 2.7873, + "step": 10885 + }, + { + "epoch": 3.439706230751007, + "grad_norm": 0.08815358578578188, + "learning_rate": 0.0002681343709497506, + "loss": 2.7813, + "step": 10890 + }, + { + "epoch": 3.441285635315486, + "grad_norm": 0.15415479096951565, + "learning_rate": 0.00026764592630495966, + "loss": 2.8441, + "step": 10895 + }, + { + "epoch": 3.4428650398799654, + "grad_norm": 0.0924510602430328, + "learning_rate": 0.0002671577643095495, + "loss": 2.7931, + "step": 10900 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.13299265963339613, + "learning_rate": 0.0002666698855573494, + "loss": 2.7371, + "step": 10905 + }, + { + "epoch": 3.4460238490089234, + "grad_norm": 0.0908679979361624, + "learning_rate": 0.0002661822906418443, + "loss": 2.8087, + "step": 10910 + }, + { + "epoch": 3.4476032535734027, + "grad_norm": 0.08558429963702503, + "learning_rate": 0.00026569498015617375, + "loss": 2.8402, + "step": 10915 + }, + { + "epoch": 3.449182658137882, + "grad_norm": 0.11112704951636004, + "learning_rate": 0.0002652079546931314, + "loss": 2.7451, + "step": 10920 + }, + { + "epoch": 3.450762062702361, + "grad_norm": 0.10937437672718385, + "learning_rate": 0.0002647212148451641, + "loss": 2.7759, + "step": 10925 + }, + { + "epoch": 3.4523414672668404, + "grad_norm": 0.09702527426316783, + "learning_rate": 0.0002642347612043713, + "loss": 2.7748, + "step": 10930 + }, + { + "epoch": 3.4539208718313197, + "grad_norm": 0.10383828392331206, + "learning_rate": 0.00026374859436250443, + "loss": 2.7683, + "step": 10935 + }, + { + "epoch": 3.455500276395799, + "grad_norm": 0.07690619190890755, + "learning_rate": 0.00026326271491096533, + "loss": 2.6818, + "step": 10940 + }, + { + "epoch": 3.4570796809602777, + "grad_norm": 0.08124921569841642, + "learning_rate": 0.00026277712344080744, + "loss": 2.8036, + "step": 10945 + }, + { + "epoch": 3.458659085524757, + "grad_norm": 0.08092124730352737, + "learning_rate": 0.0002622918205427332, + "loss": 2.7491, + "step": 10950 + }, + { + "epoch": 3.4602384900892362, + "grad_norm": 0.08633152504961933, + "learning_rate": 0.0002618068068070937, + "loss": 2.7884, + "step": 10955 + }, + { + "epoch": 3.4618178946537155, + "grad_norm": 0.08300162993971244, + "learning_rate": 0.0002613220828238887, + "loss": 2.7962, + "step": 10960 + }, + { + "epoch": 3.4633972992181947, + "grad_norm": 0.14525904455931402, + "learning_rate": 0.0002608376491827653, + "loss": 2.7566, + "step": 10965 + }, + { + "epoch": 3.464976703782674, + "grad_norm": 0.11516134927374742, + "learning_rate": 0.00026035350647301826, + "loss": 2.7814, + "step": 10970 + }, + { + "epoch": 3.466556108347153, + "grad_norm": 0.08151651771920508, + "learning_rate": 0.00025986965528358686, + "loss": 2.8004, + "step": 10975 + }, + { + "epoch": 3.4681355129116325, + "grad_norm": 0.09385265602259785, + "learning_rate": 0.000259386096203057, + "loss": 2.7664, + "step": 10980 + }, + { + "epoch": 3.4697149174761117, + "grad_norm": 0.08359061385939004, + "learning_rate": 0.0002589028298196587, + "loss": 2.8729, + "step": 10985 + }, + { + "epoch": 3.4712943220405905, + "grad_norm": 0.09326773515852947, + "learning_rate": 0.00025841985672126627, + "loss": 2.8564, + "step": 10990 + }, + { + "epoch": 3.4728737266050698, + "grad_norm": 0.09217841367489349, + "learning_rate": 0.0002579371774953969, + "loss": 2.8335, + "step": 10995 + }, + { + "epoch": 3.474453131169549, + "grad_norm": 0.09981611640743568, + "learning_rate": 0.00025745479272921035, + "loss": 2.8001, + "step": 11000 + }, + { + "epoch": 3.4760325357340283, + "grad_norm": 0.08179164642245464, + "learning_rate": 0.00025697270300950847, + "loss": 2.9299, + "step": 11005 + }, + { + "epoch": 3.4776119402985075, + "grad_norm": 0.091949551376435, + "learning_rate": 0.0002564909089227339, + "loss": 2.6954, + "step": 11010 + }, + { + "epoch": 3.4791913448629868, + "grad_norm": 0.09166900983954186, + "learning_rate": 0.00025600941105496976, + "loss": 2.8015, + "step": 11015 + }, + { + "epoch": 3.480770749427466, + "grad_norm": 0.0795310556658113, + "learning_rate": 0.0002555282099919389, + "loss": 2.7736, + "step": 11020 + }, + { + "epoch": 3.482350153991945, + "grad_norm": 0.0900562553354306, + "learning_rate": 0.0002550473063190031, + "loss": 2.7664, + "step": 11025 + }, + { + "epoch": 3.483929558556424, + "grad_norm": 0.09407254281721275, + "learning_rate": 0.0002545667006211623, + "loss": 2.8369, + "step": 11030 + }, + { + "epoch": 3.4855089631209033, + "grad_norm": 0.07723378262992991, + "learning_rate": 0.00025408639348305375, + "loss": 2.832, + "step": 11035 + }, + { + "epoch": 3.4870883676853826, + "grad_norm": 0.07979269610322459, + "learning_rate": 0.00025360638548895177, + "loss": 2.7741, + "step": 11040 + }, + { + "epoch": 3.488667772249862, + "grad_norm": 0.09323028206334563, + "learning_rate": 0.00025312667722276707, + "loss": 2.7649, + "step": 11045 + }, + { + "epoch": 3.490247176814341, + "grad_norm": 0.12105776258792499, + "learning_rate": 0.00025264726926804546, + "loss": 2.8477, + "step": 11050 + }, + { + "epoch": 3.4918265813788203, + "grad_norm": 0.1105682808381471, + "learning_rate": 0.0002521681622079672, + "loss": 2.7613, + "step": 11055 + }, + { + "epoch": 3.4934059859432995, + "grad_norm": 0.11059096312067039, + "learning_rate": 0.00025168935662534675, + "loss": 2.7119, + "step": 11060 + }, + { + "epoch": 3.494985390507779, + "grad_norm": 0.09524605317480227, + "learning_rate": 0.0002512108531026318, + "loss": 2.7114, + "step": 11065 + }, + { + "epoch": 3.4965647950722576, + "grad_norm": 0.0990559059014925, + "learning_rate": 0.00025073265222190304, + "loss": 2.8923, + "step": 11070 + }, + { + "epoch": 3.498144199636737, + "grad_norm": 0.1099256285770096, + "learning_rate": 0.00025025475456487217, + "loss": 2.7998, + "step": 11075 + }, + { + "epoch": 3.499723604201216, + "grad_norm": 0.10100376221497276, + "learning_rate": 0.0002497771607128826, + "loss": 2.8201, + "step": 11080 + }, + { + "epoch": 3.5013030087656953, + "grad_norm": 0.1052486162156182, + "learning_rate": 0.0002492998712469079, + "loss": 2.7215, + "step": 11085 + }, + { + "epoch": 3.5028824133301746, + "grad_norm": 0.09928927738606914, + "learning_rate": 0.00024882288674755196, + "loss": 2.7733, + "step": 11090 + }, + { + "epoch": 3.504461817894654, + "grad_norm": 0.09818223447896932, + "learning_rate": 0.0002483462077950464, + "loss": 2.7213, + "step": 11095 + }, + { + "epoch": 3.5060412224591326, + "grad_norm": 0.08596510791164912, + "learning_rate": 0.0002478698349692527, + "loss": 2.809, + "step": 11100 + }, + { + "epoch": 3.507620627023612, + "grad_norm": 0.0846114049791052, + "learning_rate": 0.000247393768849659, + "loss": 2.746, + "step": 11105 + }, + { + "epoch": 3.509200031588091, + "grad_norm": 0.09427065622660098, + "learning_rate": 0.00024691801001538083, + "loss": 2.7923, + "step": 11110 + }, + { + "epoch": 3.5107794361525704, + "grad_norm": 0.10380969457280094, + "learning_rate": 0.00024644255904515916, + "loss": 2.7766, + "step": 11115 + }, + { + "epoch": 3.5123588407170496, + "grad_norm": 0.09213310919614595, + "learning_rate": 0.0002459674165173611, + "loss": 2.8597, + "step": 11120 + }, + { + "epoch": 3.513938245281529, + "grad_norm": 0.09392699835767081, + "learning_rate": 0.00024549258300997866, + "loss": 2.8485, + "step": 11125 + }, + { + "epoch": 3.515517649846008, + "grad_norm": 0.08174692469312347, + "learning_rate": 0.0002450180591006278, + "loss": 2.7071, + "step": 11130 + }, + { + "epoch": 3.5170970544104874, + "grad_norm": 0.09132574355106149, + "learning_rate": 0.00024454384536654733, + "loss": 2.8601, + "step": 11135 + }, + { + "epoch": 3.5186764589749666, + "grad_norm": 0.09142503988901095, + "learning_rate": 0.0002440699423845994, + "loss": 2.8217, + "step": 11140 + }, + { + "epoch": 3.520255863539446, + "grad_norm": 0.10370066500341804, + "learning_rate": 0.00024359635073126768, + "loss": 2.7277, + "step": 11145 + }, + { + "epoch": 3.5218352681039247, + "grad_norm": 0.12108542661119506, + "learning_rate": 0.00024312307098265802, + "loss": 2.7941, + "step": 11150 + }, + { + "epoch": 3.523414672668404, + "grad_norm": 0.09631209388704579, + "learning_rate": 0.00024265010371449548, + "loss": 2.8293, + "step": 11155 + }, + { + "epoch": 3.524994077232883, + "grad_norm": 0.09490360968591363, + "learning_rate": 0.000242177449502126, + "loss": 2.7607, + "step": 11160 + }, + { + "epoch": 3.5265734817973624, + "grad_norm": 0.10513561014615604, + "learning_rate": 0.0002417051089205144, + "loss": 2.7314, + "step": 11165 + }, + { + "epoch": 3.5281528863618417, + "grad_norm": 0.09288770685274528, + "learning_rate": 0.00024123308254424397, + "loss": 2.668, + "step": 11170 + }, + { + "epoch": 3.529732290926321, + "grad_norm": 0.09114131498496017, + "learning_rate": 0.00024076137094751582, + "loss": 2.7992, + "step": 11175 + }, + { + "epoch": 3.5313116954907997, + "grad_norm": 0.11313615103787365, + "learning_rate": 0.0002402899747041481, + "loss": 2.7992, + "step": 11180 + }, + { + "epoch": 3.532891100055279, + "grad_norm": 0.1036181606441657, + "learning_rate": 0.00023981889438757538, + "loss": 2.7268, + "step": 11185 + }, + { + "epoch": 3.534470504619758, + "grad_norm": 0.12020951307713118, + "learning_rate": 0.0002393481305708481, + "loss": 2.8105, + "step": 11190 + }, + { + "epoch": 3.5360499091842374, + "grad_norm": 0.09703809632260463, + "learning_rate": 0.00023887768382663095, + "loss": 2.7822, + "step": 11195 + }, + { + "epoch": 3.5376293137487167, + "grad_norm": 0.09432814464265707, + "learning_rate": 0.000238407554727204, + "loss": 2.7567, + "step": 11200 + }, + { + "epoch": 3.539208718313196, + "grad_norm": 0.11908118014259843, + "learning_rate": 0.0002379377438444602, + "loss": 2.8585, + "step": 11205 + }, + { + "epoch": 3.540788122877675, + "grad_norm": 0.07999832467414614, + "learning_rate": 0.00023746825174990582, + "loss": 2.7772, + "step": 11210 + }, + { + "epoch": 3.5423675274421544, + "grad_norm": 0.0829297522285289, + "learning_rate": 0.0002369990790146586, + "loss": 2.813, + "step": 11215 + }, + { + "epoch": 3.5439469320066337, + "grad_norm": 0.08156986811427414, + "learning_rate": 0.00023653022620944848, + "loss": 2.7536, + "step": 11220 + }, + { + "epoch": 3.545526336571113, + "grad_norm": 0.09170687949810832, + "learning_rate": 0.00023606169390461647, + "loss": 2.749, + "step": 11225 + }, + { + "epoch": 3.5471057411355917, + "grad_norm": 0.09254289254155507, + "learning_rate": 0.00023559348267011265, + "loss": 2.8507, + "step": 11230 + }, + { + "epoch": 3.548685145700071, + "grad_norm": 0.08877591653132519, + "learning_rate": 0.00023512559307549747, + "loss": 2.7886, + "step": 11235 + }, + { + "epoch": 3.5502645502645502, + "grad_norm": 0.07578799768096475, + "learning_rate": 0.0002346580256899397, + "loss": 2.7956, + "step": 11240 + }, + { + "epoch": 3.5518439548290295, + "grad_norm": 0.10027186839404545, + "learning_rate": 0.0002341907810822163, + "loss": 2.7303, + "step": 11245 + }, + { + "epoch": 3.5534233593935087, + "grad_norm": 0.08779188132077755, + "learning_rate": 0.00023372385982071154, + "loss": 2.8202, + "step": 11250 + }, + { + "epoch": 3.555002763957988, + "grad_norm": 0.10738559283983715, + "learning_rate": 0.00023325726247341627, + "loss": 2.889, + "step": 11255 + }, + { + "epoch": 3.556582168522467, + "grad_norm": 0.11921392227900801, + "learning_rate": 0.00023279098960792745, + "loss": 2.7683, + "step": 11260 + }, + { + "epoch": 3.558161573086946, + "grad_norm": 0.11346109853767701, + "learning_rate": 0.00023232504179144725, + "loss": 2.8041, + "step": 11265 + }, + { + "epoch": 3.5597409776514253, + "grad_norm": 0.09307444252039754, + "learning_rate": 0.00023185941959078261, + "loss": 2.7866, + "step": 11270 + }, + { + "epoch": 3.5613203822159045, + "grad_norm": 0.08626558856259384, + "learning_rate": 0.00023139412357234368, + "loss": 2.7719, + "step": 11275 + }, + { + "epoch": 3.5628997867803838, + "grad_norm": 0.1078184947468944, + "learning_rate": 0.00023092915430214484, + "loss": 2.7441, + "step": 11280 + }, + { + "epoch": 3.564479191344863, + "grad_norm": 0.10113108741500824, + "learning_rate": 0.00023046451234580233, + "loss": 2.7064, + "step": 11285 + }, + { + "epoch": 3.5660585959093423, + "grad_norm": 0.1157179687956109, + "learning_rate": 0.00023000019826853463, + "loss": 2.7445, + "step": 11290 + }, + { + "epoch": 3.5676380004738215, + "grad_norm": 0.0927942211817932, + "learning_rate": 0.00022953621263516072, + "loss": 2.7825, + "step": 11295 + }, + { + "epoch": 3.5692174050383008, + "grad_norm": 0.10375826834603429, + "learning_rate": 0.00022907255601010046, + "loss": 2.8141, + "step": 11300 + }, + { + "epoch": 3.57079680960278, + "grad_norm": 0.09543513718767459, + "learning_rate": 0.000228609228957374, + "loss": 2.7531, + "step": 11305 + }, + { + "epoch": 3.572376214167259, + "grad_norm": 0.0890887111388023, + "learning_rate": 0.00022814623204059952, + "loss": 2.779, + "step": 11310 + }, + { + "epoch": 3.573955618731738, + "grad_norm": 0.08073292664590946, + "learning_rate": 0.00022768356582299432, + "loss": 2.6918, + "step": 11315 + }, + { + "epoch": 3.5755350232962173, + "grad_norm": 0.08676561072352555, + "learning_rate": 0.00022722123086737329, + "loss": 2.6887, + "step": 11320 + }, + { + "epoch": 3.5771144278606966, + "grad_norm": 0.09426925946308844, + "learning_rate": 0.0002267592277361482, + "loss": 2.7954, + "step": 11325 + }, + { + "epoch": 3.578693832425176, + "grad_norm": 0.09229311484095422, + "learning_rate": 0.00022629755699132736, + "loss": 2.7214, + "step": 11330 + }, + { + "epoch": 3.580273236989655, + "grad_norm": 0.08140271955266823, + "learning_rate": 0.00022583621919451463, + "loss": 2.8671, + "step": 11335 + }, + { + "epoch": 3.581852641554134, + "grad_norm": 0.09556558459606468, + "learning_rate": 0.00022537521490690883, + "loss": 2.7257, + "step": 11340 + }, + { + "epoch": 3.583432046118613, + "grad_norm": 0.10077992058409145, + "learning_rate": 0.00022491454468930318, + "loss": 2.7685, + "step": 11345 + }, + { + "epoch": 3.5850114506830923, + "grad_norm": 0.087570674890972, + "learning_rate": 0.0002244542091020844, + "loss": 2.756, + "step": 11350 + }, + { + "epoch": 3.5865908552475716, + "grad_norm": 0.10849227424982827, + "learning_rate": 0.0002239942087052323, + "loss": 2.7807, + "step": 11355 + }, + { + "epoch": 3.588170259812051, + "grad_norm": 0.08597460501625169, + "learning_rate": 0.00022353454405831875, + "loss": 2.8133, + "step": 11360 + }, + { + "epoch": 3.58974966437653, + "grad_norm": 0.0998748647754923, + "learning_rate": 0.00022307521572050736, + "loss": 2.8042, + "step": 11365 + }, + { + "epoch": 3.5913290689410093, + "grad_norm": 0.11126051646572314, + "learning_rate": 0.00022261622425055272, + "loss": 2.7533, + "step": 11370 + }, + { + "epoch": 3.5929084735054886, + "grad_norm": 0.09055924524444105, + "learning_rate": 0.000222157570206799, + "loss": 2.8279, + "step": 11375 + }, + { + "epoch": 3.594487878069968, + "grad_norm": 0.08984142006571276, + "learning_rate": 0.00022169925414718085, + "loss": 2.7877, + "step": 11380 + }, + { + "epoch": 3.596067282634447, + "grad_norm": 0.10052235488529815, + "learning_rate": 0.00022124127662922132, + "loss": 2.8896, + "step": 11385 + }, + { + "epoch": 3.597646687198926, + "grad_norm": 0.08811739473430545, + "learning_rate": 0.0002207836382100314, + "loss": 2.7709, + "step": 11390 + }, + { + "epoch": 3.599226091763405, + "grad_norm": 0.08353319058341495, + "learning_rate": 0.0002203263394463098, + "loss": 2.8369, + "step": 11395 + }, + { + "epoch": 3.6008054963278844, + "grad_norm": 0.09449205017726552, + "learning_rate": 0.00021986938089434217, + "loss": 2.8294, + "step": 11400 + }, + { + "epoch": 3.6023849008923636, + "grad_norm": 0.08941806037298423, + "learning_rate": 0.00021941276311000026, + "loss": 2.8409, + "step": 11405 + }, + { + "epoch": 3.603964305456843, + "grad_norm": 0.09097010972981263, + "learning_rate": 0.00021895648664874108, + "loss": 2.7626, + "step": 11410 + }, + { + "epoch": 3.605543710021322, + "grad_norm": 0.0952754837032473, + "learning_rate": 0.00021850055206560666, + "loss": 2.7939, + "step": 11415 + }, + { + "epoch": 3.607123114585801, + "grad_norm": 0.08485773446474305, + "learning_rate": 0.0002180449599152231, + "loss": 2.7381, + "step": 11420 + }, + { + "epoch": 3.60870251915028, + "grad_norm": 0.07453026280015991, + "learning_rate": 0.00021758971075179988, + "loss": 2.7899, + "step": 11425 + }, + { + "epoch": 3.6102819237147594, + "grad_norm": 0.09183300642945505, + "learning_rate": 0.0002171348051291293, + "loss": 2.7926, + "step": 11430 + }, + { + "epoch": 3.6118613282792387, + "grad_norm": 0.0839019053264377, + "learning_rate": 0.00021668024360058574, + "loss": 2.8769, + "step": 11435 + }, + { + "epoch": 3.613440732843718, + "grad_norm": 0.10134168322160107, + "learning_rate": 0.00021622602671912507, + "loss": 2.8372, + "step": 11440 + }, + { + "epoch": 3.615020137408197, + "grad_norm": 0.09412058984971264, + "learning_rate": 0.00021577215503728393, + "loss": 2.811, + "step": 11445 + }, + { + "epoch": 3.6165995419726764, + "grad_norm": 0.11903314838257359, + "learning_rate": 0.00021531862910717864, + "loss": 2.7716, + "step": 11450 + }, + { + "epoch": 3.6181789465371557, + "grad_norm": 0.12363190718408543, + "learning_rate": 0.00021486544948050524, + "loss": 2.8333, + "step": 11455 + }, + { + "epoch": 3.619758351101635, + "grad_norm": 0.08863366856434014, + "learning_rate": 0.00021441261670853884, + "loss": 2.6796, + "step": 11460 + }, + { + "epoch": 3.6213377556661137, + "grad_norm": 0.13499636006682605, + "learning_rate": 0.0002139601313421324, + "loss": 2.7957, + "step": 11465 + }, + { + "epoch": 3.622917160230593, + "grad_norm": 0.09507917881349866, + "learning_rate": 0.00021350799393171567, + "loss": 2.7729, + "step": 11470 + }, + { + "epoch": 3.624496564795072, + "grad_norm": 0.06858739471914442, + "learning_rate": 0.00021305620502729583, + "loss": 2.7611, + "step": 11475 + }, + { + "epoch": 3.6260759693595515, + "grad_norm": 0.09214856126188252, + "learning_rate": 0.00021260476517845573, + "loss": 2.7689, + "step": 11480 + }, + { + "epoch": 3.6276553739240307, + "grad_norm": 0.09131495394135357, + "learning_rate": 0.0002121536749343544, + "loss": 2.8118, + "step": 11485 + }, + { + "epoch": 3.62923477848851, + "grad_norm": 0.07816970729114443, + "learning_rate": 0.00021170293484372427, + "loss": 2.7645, + "step": 11490 + }, + { + "epoch": 3.6308141830529888, + "grad_norm": 0.10929095968764148, + "learning_rate": 0.00021125254545487283, + "loss": 2.7951, + "step": 11495 + }, + { + "epoch": 3.632393587617468, + "grad_norm": 0.10301738196087953, + "learning_rate": 0.00021080250731568057, + "loss": 2.867, + "step": 11500 + }, + { + "epoch": 3.6339729921819472, + "grad_norm": 0.08960227429034275, + "learning_rate": 0.00021035282097360086, + "loss": 2.8183, + "step": 11505 + }, + { + "epoch": 3.6355523967464265, + "grad_norm": 0.08737624858384084, + "learning_rate": 0.00020990348697565896, + "loss": 2.8485, + "step": 11510 + }, + { + "epoch": 3.6371318013109057, + "grad_norm": 0.07791929158219078, + "learning_rate": 0.00020945450586845165, + "loss": 2.8107, + "step": 11515 + }, + { + "epoch": 3.638711205875385, + "grad_norm": 0.08913956949374928, + "learning_rate": 0.00020900587819814637, + "loss": 2.7753, + "step": 11520 + }, + { + "epoch": 3.6402906104398642, + "grad_norm": 0.087704377877188, + "learning_rate": 0.0002085576045104808, + "loss": 2.8193, + "step": 11525 + }, + { + "epoch": 3.6418700150043435, + "grad_norm": 0.09306829106179089, + "learning_rate": 0.00020810968535076125, + "loss": 2.8075, + "step": 11530 + }, + { + "epoch": 3.6434494195688227, + "grad_norm": 0.08484563394410759, + "learning_rate": 0.00020766212126386397, + "loss": 2.6954, + "step": 11535 + }, + { + "epoch": 3.645028824133302, + "grad_norm": 0.0798912479090602, + "learning_rate": 0.00020721491279423242, + "loss": 2.7521, + "step": 11540 + }, + { + "epoch": 3.646608228697781, + "grad_norm": 0.08750716646371902, + "learning_rate": 0.000206768060485878, + "loss": 2.8699, + "step": 11545 + }, + { + "epoch": 3.64818763326226, + "grad_norm": 0.09061907295127078, + "learning_rate": 0.00020632156488237808, + "loss": 2.8274, + "step": 11550 + }, + { + "epoch": 3.6497670378267393, + "grad_norm": 0.09301703574210217, + "learning_rate": 0.00020587542652687662, + "loss": 2.7568, + "step": 11555 + }, + { + "epoch": 3.6513464423912185, + "grad_norm": 0.09156106394288291, + "learning_rate": 0.0002054296459620834, + "loss": 2.8077, + "step": 11560 + }, + { + "epoch": 3.6529258469556978, + "grad_norm": 0.08949852492673643, + "learning_rate": 0.00020498422373027247, + "loss": 2.7622, + "step": 11565 + }, + { + "epoch": 3.654505251520177, + "grad_norm": 0.10457491176962785, + "learning_rate": 0.00020453916037328174, + "loss": 2.7507, + "step": 11570 + }, + { + "epoch": 3.656084656084656, + "grad_norm": 0.08779017195219393, + "learning_rate": 0.000204094456432513, + "loss": 2.8668, + "step": 11575 + }, + { + "epoch": 3.657664060649135, + "grad_norm": 0.09069030608376441, + "learning_rate": 0.00020365011244893077, + "loss": 2.8472, + "step": 11580 + }, + { + "epoch": 3.6592434652136143, + "grad_norm": 0.08884167819693801, + "learning_rate": 0.00020320612896306158, + "loss": 2.7427, + "step": 11585 + }, + { + "epoch": 3.6608228697780936, + "grad_norm": 0.08189253953823854, + "learning_rate": 0.00020276250651499346, + "loss": 2.7255, + "step": 11590 + }, + { + "epoch": 3.662402274342573, + "grad_norm": 0.06780471843155822, + "learning_rate": 0.00020231924564437527, + "loss": 2.8178, + "step": 11595 + }, + { + "epoch": 3.663981678907052, + "grad_norm": 0.08192279195750907, + "learning_rate": 0.000201876346890416, + "loss": 2.6784, + "step": 11600 + }, + { + "epoch": 3.6655610834715313, + "grad_norm": 0.08128488744507092, + "learning_rate": 0.00020143381079188444, + "loss": 2.799, + "step": 11605 + }, + { + "epoch": 3.6671404880360106, + "grad_norm": 0.09287388991092453, + "learning_rate": 0.0002009916378871074, + "loss": 2.8106, + "step": 11610 + }, + { + "epoch": 3.66871989260049, + "grad_norm": 0.06955081594717988, + "learning_rate": 0.00020054982871397083, + "loss": 2.6967, + "step": 11615 + }, + { + "epoch": 3.670299297164969, + "grad_norm": 0.07732653629557824, + "learning_rate": 0.00020010838380991774, + "loss": 2.7274, + "step": 11620 + }, + { + "epoch": 3.671878701729448, + "grad_norm": 0.08847447417470254, + "learning_rate": 0.00019966730371194825, + "loss": 2.8347, + "step": 11625 + }, + { + "epoch": 3.673458106293927, + "grad_norm": 0.08323413983065359, + "learning_rate": 0.00019922658895661817, + "loss": 2.8466, + "step": 11630 + }, + { + "epoch": 3.6750375108584064, + "grad_norm": 0.07423365005518999, + "learning_rate": 0.00019878624008003927, + "loss": 2.7574, + "step": 11635 + }, + { + "epoch": 3.6766169154228856, + "grad_norm": 0.08596050904098117, + "learning_rate": 0.00019834625761787862, + "loss": 2.7952, + "step": 11640 + }, + { + "epoch": 3.678196319987365, + "grad_norm": 0.08447539960506971, + "learning_rate": 0.00019790664210535714, + "loss": 2.8661, + "step": 11645 + }, + { + "epoch": 3.679775724551844, + "grad_norm": 0.08848325804957861, + "learning_rate": 0.00019746739407724913, + "loss": 2.8027, + "step": 11650 + }, + { + "epoch": 3.681355129116323, + "grad_norm": 0.09719722365226477, + "learning_rate": 0.00019702851406788225, + "loss": 2.7186, + "step": 11655 + }, + { + "epoch": 3.682934533680802, + "grad_norm": 0.10118128059502188, + "learning_rate": 0.00019659000261113642, + "loss": 2.8854, + "step": 11660 + }, + { + "epoch": 3.6845139382452814, + "grad_norm": 0.07497857561322374, + "learning_rate": 0.00019615186024044313, + "loss": 2.8331, + "step": 11665 + }, + { + "epoch": 3.6860933428097606, + "grad_norm": 0.10780605274986786, + "learning_rate": 0.00019571408748878495, + "loss": 2.72, + "step": 11670 + }, + { + "epoch": 3.68767274737424, + "grad_norm": 0.09273671983890912, + "learning_rate": 0.00019527668488869484, + "loss": 2.7444, + "step": 11675 + }, + { + "epoch": 3.689252151938719, + "grad_norm": 0.09267158414129262, + "learning_rate": 0.00019483965297225543, + "loss": 2.7272, + "step": 11680 + }, + { + "epoch": 3.6908315565031984, + "grad_norm": 0.09858600391860631, + "learning_rate": 0.00019440299227109853, + "loss": 2.679, + "step": 11685 + }, + { + "epoch": 3.6924109610676776, + "grad_norm": 0.08707185882765187, + "learning_rate": 0.00019396670331640425, + "loss": 2.7163, + "step": 11690 + }, + { + "epoch": 3.693990365632157, + "grad_norm": 0.08386738183816067, + "learning_rate": 0.00019353078663890056, + "loss": 2.7726, + "step": 11695 + }, + { + "epoch": 3.695569770196636, + "grad_norm": 0.09285610818152679, + "learning_rate": 0.00019309524276886258, + "loss": 2.7636, + "step": 11700 + }, + { + "epoch": 3.697149174761115, + "grad_norm": 0.08276309730190091, + "learning_rate": 0.00019266007223611205, + "loss": 2.6908, + "step": 11705 + }, + { + "epoch": 3.698728579325594, + "grad_norm": 0.09344772692451785, + "learning_rate": 0.00019222527557001583, + "loss": 2.7216, + "step": 11710 + }, + { + "epoch": 3.7003079838900734, + "grad_norm": 0.08138562850581343, + "learning_rate": 0.00019179085329948725, + "loss": 2.7707, + "step": 11715 + }, + { + "epoch": 3.7018873884545527, + "grad_norm": 0.09182017905548755, + "learning_rate": 0.00019135680595298315, + "loss": 2.7875, + "step": 11720 + }, + { + "epoch": 3.703466793019032, + "grad_norm": 0.08299814767009533, + "learning_rate": 0.00019092313405850503, + "loss": 2.7206, + "step": 11725 + }, + { + "epoch": 3.705046197583511, + "grad_norm": 0.1004891033089223, + "learning_rate": 0.00019048983814359683, + "loss": 2.7871, + "step": 11730 + }, + { + "epoch": 3.70662560214799, + "grad_norm": 0.08497364732744732, + "learning_rate": 0.0001900569187353458, + "loss": 2.7905, + "step": 11735 + }, + { + "epoch": 3.708205006712469, + "grad_norm": 0.10208046299317208, + "learning_rate": 0.00018962437636038093, + "loss": 2.7942, + "step": 11740 + }, + { + "epoch": 3.7097844112769485, + "grad_norm": 0.08901693082989817, + "learning_rate": 0.0001891922115448727, + "loss": 2.8117, + "step": 11745 + }, + { + "epoch": 3.7113638158414277, + "grad_norm": 0.07914149877510862, + "learning_rate": 0.00018876042481453221, + "loss": 2.8137, + "step": 11750 + }, + { + "epoch": 3.712943220405907, + "grad_norm": 0.09657293929383366, + "learning_rate": 0.00018832901669461056, + "loss": 2.8863, + "step": 11755 + }, + { + "epoch": 3.714522624970386, + "grad_norm": 0.08524187526118726, + "learning_rate": 0.0001878979877098984, + "loss": 2.7986, + "step": 11760 + }, + { + "epoch": 3.7161020295348655, + "grad_norm": 0.0893121969017454, + "learning_rate": 0.0001874673383847252, + "loss": 2.8444, + "step": 11765 + }, + { + "epoch": 3.7176814340993447, + "grad_norm": 0.07737661971954736, + "learning_rate": 0.00018703706924295849, + "loss": 2.8618, + "step": 11770 + }, + { + "epoch": 3.719260838663824, + "grad_norm": 0.11339322775943667, + "learning_rate": 0.00018660718080800337, + "loss": 2.6354, + "step": 11775 + }, + { + "epoch": 3.720840243228303, + "grad_norm": 0.09029538393369561, + "learning_rate": 0.0001861776736028018, + "loss": 2.7916, + "step": 11780 + }, + { + "epoch": 3.722419647792782, + "grad_norm": 0.09222139162095129, + "learning_rate": 0.00018574854814983228, + "loss": 2.7123, + "step": 11785 + }, + { + "epoch": 3.7239990523572613, + "grad_norm": 0.09272711747658678, + "learning_rate": 0.000185319804971108, + "loss": 2.7622, + "step": 11790 + }, + { + "epoch": 3.7255784569217405, + "grad_norm": 0.09311251342299524, + "learning_rate": 0.0001848914445881784, + "loss": 2.6836, + "step": 11795 + }, + { + "epoch": 3.7271578614862197, + "grad_norm": 0.07043590075001643, + "learning_rate": 0.00018446346752212662, + "loss": 2.757, + "step": 11800 + }, + { + "epoch": 3.728737266050699, + "grad_norm": 0.07975371796499488, + "learning_rate": 0.00018403587429356916, + "loss": 2.7193, + "step": 11805 + }, + { + "epoch": 3.7303166706151782, + "grad_norm": 0.08351856594277221, + "learning_rate": 0.00018360866542265625, + "loss": 2.8014, + "step": 11810 + }, + { + "epoch": 3.731896075179657, + "grad_norm": 0.08923111864867268, + "learning_rate": 0.00018318184142907, + "loss": 2.7806, + "step": 11815 + }, + { + "epoch": 3.7334754797441363, + "grad_norm": 0.0868170822441492, + "learning_rate": 0.0001827554028320252, + "loss": 2.7466, + "step": 11820 + }, + { + "epoch": 3.7350548843086155, + "grad_norm": 0.09740873126793242, + "learning_rate": 0.0001823293501502667, + "loss": 2.6942, + "step": 11825 + }, + { + "epoch": 3.736634288873095, + "grad_norm": 0.09994819493713986, + "learning_rate": 0.00018190368390207063, + "loss": 2.8106, + "step": 11830 + }, + { + "epoch": 3.738213693437574, + "grad_norm": 0.11005739113864033, + "learning_rate": 0.0001814784046052429, + "loss": 2.8167, + "step": 11835 + }, + { + "epoch": 3.7397930980020533, + "grad_norm": 0.09021459683105348, + "learning_rate": 0.00018105351277711857, + "loss": 2.8033, + "step": 11840 + }, + { + "epoch": 3.7413725025665325, + "grad_norm": 0.10913132440175603, + "learning_rate": 0.00018062900893456147, + "loss": 2.8071, + "step": 11845 + }, + { + "epoch": 3.742951907131012, + "grad_norm": 0.1025572689490513, + "learning_rate": 0.00018020489359396353, + "loss": 2.7539, + "step": 11850 + }, + { + "epoch": 3.744531311695491, + "grad_norm": 0.11027023905298508, + "learning_rate": 0.00017978116727124387, + "loss": 2.829, + "step": 11855 + }, + { + "epoch": 3.74611071625997, + "grad_norm": 0.10401771895295076, + "learning_rate": 0.00017935783048184868, + "loss": 2.7328, + "step": 11860 + }, + { + "epoch": 3.747690120824449, + "grad_norm": 0.07543245605329015, + "learning_rate": 0.0001789348837407499, + "loss": 2.723, + "step": 11865 + }, + { + "epoch": 3.7492695253889283, + "grad_norm": 0.08070299948779706, + "learning_rate": 0.0001785123275624454, + "loss": 2.7909, + "step": 11870 + }, + { + "epoch": 3.7508489299534076, + "grad_norm": 0.08267456058643989, + "learning_rate": 0.00017809016246095772, + "loss": 2.7677, + "step": 11875 + }, + { + "epoch": 3.752428334517887, + "grad_norm": 0.07934566141525058, + "learning_rate": 0.0001776683889498339, + "loss": 2.7921, + "step": 11880 + }, + { + "epoch": 3.754007739082366, + "grad_norm": 0.07808235021780177, + "learning_rate": 0.00017724700754214403, + "loss": 2.8413, + "step": 11885 + }, + { + "epoch": 3.755587143646845, + "grad_norm": 0.0842968243503666, + "learning_rate": 0.00017682601875048187, + "loss": 2.8465, + "step": 11890 + }, + { + "epoch": 3.757166548211324, + "grad_norm": 0.08020341407065772, + "learning_rate": 0.00017640542308696317, + "loss": 2.7862, + "step": 11895 + }, + { + "epoch": 3.7587459527758034, + "grad_norm": 0.08540408558560887, + "learning_rate": 0.00017598522106322619, + "loss": 2.731, + "step": 11900 + }, + { + "epoch": 3.7603253573402826, + "grad_norm": 0.09374602539245858, + "learning_rate": 0.00017556541319042911, + "loss": 2.8661, + "step": 11905 + }, + { + "epoch": 3.761904761904762, + "grad_norm": 0.08328870658200717, + "learning_rate": 0.00017514599997925167, + "loss": 2.7336, + "step": 11910 + }, + { + "epoch": 3.763484166469241, + "grad_norm": 0.09328402945620633, + "learning_rate": 0.00017472698193989305, + "loss": 2.6946, + "step": 11915 + }, + { + "epoch": 3.7650635710337204, + "grad_norm": 0.08556856440346546, + "learning_rate": 0.00017430835958207185, + "loss": 2.8038, + "step": 11920 + }, + { + "epoch": 3.7666429755981996, + "grad_norm": 0.08605764458620771, + "learning_rate": 0.0001738901334150254, + "loss": 2.8046, + "step": 11925 + }, + { + "epoch": 3.768222380162679, + "grad_norm": 0.07969013072361213, + "learning_rate": 0.0001734723039475089, + "loss": 2.8382, + "step": 11930 + }, + { + "epoch": 3.769801784727158, + "grad_norm": 0.08754956682484208, + "learning_rate": 0.00017305487168779515, + "loss": 2.8156, + "step": 11935 + }, + { + "epoch": 3.771381189291637, + "grad_norm": 0.10029042879887169, + "learning_rate": 0.00017263783714367386, + "loss": 2.6554, + "step": 11940 + }, + { + "epoch": 3.772960593856116, + "grad_norm": 0.10504648793028643, + "learning_rate": 0.00017222120082245018, + "loss": 2.708, + "step": 11945 + }, + { + "epoch": 3.7745399984205954, + "grad_norm": 0.10522389032354743, + "learning_rate": 0.00017180496323094608, + "loss": 2.7088, + "step": 11950 + }, + { + "epoch": 3.7761194029850746, + "grad_norm": 0.09359727624397036, + "learning_rate": 0.00017138912487549756, + "loss": 2.8039, + "step": 11955 + }, + { + "epoch": 3.777698807549554, + "grad_norm": 0.11422346362497239, + "learning_rate": 0.00017097368626195548, + "loss": 2.7834, + "step": 11960 + }, + { + "epoch": 3.779278212114033, + "grad_norm": 0.08018043274815162, + "learning_rate": 0.00017055864789568376, + "loss": 2.6771, + "step": 11965 + }, + { + "epoch": 3.780857616678512, + "grad_norm": 0.082621279623475, + "learning_rate": 0.00017014401028156, + "loss": 2.7746, + "step": 11970 + }, + { + "epoch": 3.782437021242991, + "grad_norm": 0.08863049292682723, + "learning_rate": 0.00016972977392397444, + "loss": 2.7527, + "step": 11975 + }, + { + "epoch": 3.7840164258074704, + "grad_norm": 0.08119101989236832, + "learning_rate": 0.00016931593932682893, + "loss": 2.8148, + "step": 11980 + }, + { + "epoch": 3.7855958303719497, + "grad_norm": 0.09138427007636155, + "learning_rate": 0.0001689025069935363, + "loss": 2.6863, + "step": 11985 + }, + { + "epoch": 3.787175234936429, + "grad_norm": 0.09879479871575889, + "learning_rate": 0.00016848947742702046, + "loss": 2.782, + "step": 11990 + }, + { + "epoch": 3.788754639500908, + "grad_norm": 0.08980149895256091, + "learning_rate": 0.0001680768511297152, + "loss": 2.7806, + "step": 11995 + }, + { + "epoch": 3.7903340440653874, + "grad_norm": 0.08770742462877869, + "learning_rate": 0.00016766462860356423, + "loss": 2.8098, + "step": 12000 + }, + { + "epoch": 3.7919134486298667, + "grad_norm": 0.07695351477939083, + "learning_rate": 0.00016725281035001916, + "loss": 2.7902, + "step": 12005 + }, + { + "epoch": 3.793492853194346, + "grad_norm": 0.08662585356629579, + "learning_rate": 0.00016684139687004053, + "loss": 2.7889, + "step": 12010 + }, + { + "epoch": 3.795072257758825, + "grad_norm": 0.07423777411389267, + "learning_rate": 0.0001664303886640962, + "loss": 2.78, + "step": 12015 + }, + { + "epoch": 3.796651662323304, + "grad_norm": 0.08469752187569327, + "learning_rate": 0.00016601978623216124, + "loss": 2.7842, + "step": 12020 + }, + { + "epoch": 3.7982310668877832, + "grad_norm": 0.08826241649397007, + "learning_rate": 0.00016560959007371685, + "loss": 2.7844, + "step": 12025 + }, + { + "epoch": 3.7998104714522625, + "grad_norm": 0.06895911425297126, + "learning_rate": 0.00016519980068775025, + "loss": 2.7493, + "step": 12030 + }, + { + "epoch": 3.8013898760167417, + "grad_norm": 0.0776441798856278, + "learning_rate": 0.00016479041857275374, + "loss": 2.73, + "step": 12035 + }, + { + "epoch": 3.802969280581221, + "grad_norm": 0.08136608195262024, + "learning_rate": 0.0001643814442267243, + "loss": 2.7479, + "step": 12040 + }, + { + "epoch": 3.8045486851457, + "grad_norm": 0.08478394868745333, + "learning_rate": 0.00016397287814716243, + "loss": 2.714, + "step": 12045 + }, + { + "epoch": 3.806128089710179, + "grad_norm": 0.08316813607004452, + "learning_rate": 0.00016356472083107237, + "loss": 2.8003, + "step": 12050 + }, + { + "epoch": 3.8077074942746583, + "grad_norm": 0.0904850713575638, + "learning_rate": 0.00016315697277496138, + "loss": 2.833, + "step": 12055 + }, + { + "epoch": 3.8092868988391375, + "grad_norm": 0.07648298666227166, + "learning_rate": 0.00016274963447483854, + "loss": 2.8567, + "step": 12060 + }, + { + "epoch": 3.8108663034036168, + "grad_norm": 0.07238836488587672, + "learning_rate": 0.00016234270642621424, + "loss": 2.7621, + "step": 12065 + }, + { + "epoch": 3.812445707968096, + "grad_norm": 0.08909847817275786, + "learning_rate": 0.0001619361891241002, + "loss": 2.7587, + "step": 12070 + }, + { + "epoch": 3.8140251125325753, + "grad_norm": 0.07552419420026069, + "learning_rate": 0.00016153008306300814, + "loss": 2.8112, + "step": 12075 + }, + { + "epoch": 3.8156045170970545, + "grad_norm": 0.0876622533187228, + "learning_rate": 0.0001611243887369503, + "loss": 2.7627, + "step": 12080 + }, + { + "epoch": 3.8171839216615338, + "grad_norm": 0.08761521312444666, + "learning_rate": 0.000160719106639437, + "loss": 2.6827, + "step": 12085 + }, + { + "epoch": 3.818763326226013, + "grad_norm": 0.10525892878890712, + "learning_rate": 0.00016031423726347777, + "loss": 2.8289, + "step": 12090 + }, + { + "epoch": 3.8203427307904922, + "grad_norm": 0.09079503509259372, + "learning_rate": 0.0001599097811015799, + "loss": 2.7873, + "step": 12095 + }, + { + "epoch": 3.821922135354971, + "grad_norm": 0.07449785388655988, + "learning_rate": 0.00015950573864574808, + "loss": 2.7647, + "step": 12100 + }, + { + "epoch": 3.8235015399194503, + "grad_norm": 0.07493346474050362, + "learning_rate": 0.00015910211038748363, + "loss": 2.8051, + "step": 12105 + }, + { + "epoch": 3.8250809444839295, + "grad_norm": 0.08858856892366813, + "learning_rate": 0.0001586988968177841, + "loss": 2.6911, + "step": 12110 + }, + { + "epoch": 3.826660349048409, + "grad_norm": 0.09711661023425233, + "learning_rate": 0.0001582960984271426, + "loss": 2.9282, + "step": 12115 + }, + { + "epoch": 3.828239753612888, + "grad_norm": 0.08481939048470226, + "learning_rate": 0.00015789371570554728, + "loss": 2.7376, + "step": 12120 + }, + { + "epoch": 3.8298191581773673, + "grad_norm": 0.08332736816622259, + "learning_rate": 0.0001574917491424801, + "loss": 2.7929, + "step": 12125 + }, + { + "epoch": 3.831398562741846, + "grad_norm": 0.0742516035203001, + "learning_rate": 0.0001570901992269177, + "loss": 2.7426, + "step": 12130 + }, + { + "epoch": 3.8329779673063253, + "grad_norm": 0.07167536684934585, + "learning_rate": 0.00015668906644732917, + "loss": 2.7953, + "step": 12135 + }, + { + "epoch": 3.8345573718708046, + "grad_norm": 0.07976658207700503, + "learning_rate": 0.00015628835129167663, + "loss": 2.7906, + "step": 12140 + }, + { + "epoch": 3.836136776435284, + "grad_norm": 0.08991311149016072, + "learning_rate": 0.00015588805424741352, + "loss": 2.7448, + "step": 12145 + }, + { + "epoch": 3.837716180999763, + "grad_norm": 0.10173820928136434, + "learning_rate": 0.00015548817580148517, + "loss": 2.7555, + "step": 12150 + }, + { + "epoch": 3.8392955855642423, + "grad_norm": 0.06561034464832172, + "learning_rate": 0.00015508871644032807, + "loss": 2.7457, + "step": 12155 + }, + { + "epoch": 3.8408749901287216, + "grad_norm": 0.07733120609909291, + "learning_rate": 0.00015468967664986798, + "loss": 2.7664, + "step": 12160 + }, + { + "epoch": 3.842454394693201, + "grad_norm": 0.11128728912296254, + "learning_rate": 0.0001542910569155209, + "loss": 2.7219, + "step": 12165 + }, + { + "epoch": 3.84403379925768, + "grad_norm": 0.08670042557414782, + "learning_rate": 0.00015389285772219176, + "loss": 2.7775, + "step": 12170 + }, + { + "epoch": 3.8456132038221593, + "grad_norm": 0.08895531137890758, + "learning_rate": 0.00015349507955427378, + "loss": 2.7067, + "step": 12175 + }, + { + "epoch": 3.847192608386638, + "grad_norm": 0.08427607915004383, + "learning_rate": 0.00015309772289564806, + "loss": 2.7725, + "step": 12180 + }, + { + "epoch": 3.8487720129511174, + "grad_norm": 0.07370797804326407, + "learning_rate": 0.00015270078822968307, + "loss": 2.7484, + "step": 12185 + }, + { + "epoch": 3.8503514175155966, + "grad_norm": 0.07708525753337944, + "learning_rate": 0.00015230427603923387, + "loss": 2.7986, + "step": 12190 + }, + { + "epoch": 3.851930822080076, + "grad_norm": 0.08109018579361335, + "learning_rate": 0.00015190818680664147, + "loss": 2.7234, + "step": 12195 + }, + { + "epoch": 3.853510226644555, + "grad_norm": 0.08237238380223535, + "learning_rate": 0.00015151252101373264, + "loss": 2.8295, + "step": 12200 + }, + { + "epoch": 3.855089631209034, + "grad_norm": 0.08654806501913762, + "learning_rate": 0.00015111727914181877, + "loss": 2.7166, + "step": 12205 + }, + { + "epoch": 3.856669035773513, + "grad_norm": 0.07056141952098877, + "learning_rate": 0.00015072246167169574, + "loss": 2.8158, + "step": 12210 + }, + { + "epoch": 3.8582484403379924, + "grad_norm": 0.08098815316771699, + "learning_rate": 0.0001503280690836431, + "loss": 2.7101, + "step": 12215 + }, + { + "epoch": 3.8598278449024717, + "grad_norm": 0.08432444865308394, + "learning_rate": 0.00014993410185742373, + "loss": 2.723, + "step": 12220 + }, + { + "epoch": 3.861407249466951, + "grad_norm": 0.08230097290036, + "learning_rate": 0.0001495405604722826, + "loss": 2.7922, + "step": 12225 + }, + { + "epoch": 3.86298665403143, + "grad_norm": 0.08559451418421558, + "learning_rate": 0.00014914744540694698, + "loss": 2.7691, + "step": 12230 + }, + { + "epoch": 3.8645660585959094, + "grad_norm": 0.08263002382153399, + "learning_rate": 0.00014875475713962594, + "loss": 2.7754, + "step": 12235 + }, + { + "epoch": 3.8661454631603887, + "grad_norm": 0.08536841281141583, + "learning_rate": 0.00014836249614800856, + "loss": 2.7422, + "step": 12240 + }, + { + "epoch": 3.867724867724868, + "grad_norm": 0.07981680067080599, + "learning_rate": 0.00014797066290926465, + "loss": 2.7872, + "step": 12245 + }, + { + "epoch": 3.869304272289347, + "grad_norm": 0.0851273897360789, + "learning_rate": 0.0001475792579000436, + "loss": 2.7314, + "step": 12250 + }, + { + "epoch": 3.870883676853826, + "grad_norm": 0.08062361561989131, + "learning_rate": 0.00014718828159647384, + "loss": 2.8101, + "step": 12255 + }, + { + "epoch": 3.872463081418305, + "grad_norm": 0.0981840696392559, + "learning_rate": 0.0001467977344741624, + "loss": 2.7741, + "step": 12260 + }, + { + "epoch": 3.8740424859827844, + "grad_norm": 0.06646153017665142, + "learning_rate": 0.000146407617008194, + "loss": 2.717, + "step": 12265 + }, + { + "epoch": 3.8756218905472637, + "grad_norm": 0.08534845102315676, + "learning_rate": 0.00014601792967313093, + "loss": 2.7099, + "step": 12270 + }, + { + "epoch": 3.877201295111743, + "grad_norm": 0.07236067988844529, + "learning_rate": 0.00014562867294301207, + "loss": 2.7099, + "step": 12275 + }, + { + "epoch": 3.878780699676222, + "grad_norm": 0.07012044315380965, + "learning_rate": 0.0001452398472913527, + "loss": 3.0013, + "step": 12280 + }, + { + "epoch": 3.880360104240701, + "grad_norm": 0.07284188811108293, + "learning_rate": 0.00014485145319114345, + "loss": 2.7175, + "step": 12285 + }, + { + "epoch": 3.8819395088051802, + "grad_norm": 0.07010366764819255, + "learning_rate": 0.00014446349111485018, + "loss": 2.7621, + "step": 12290 + }, + { + "epoch": 3.8835189133696595, + "grad_norm": 0.07467745080448321, + "learning_rate": 0.00014407596153441328, + "loss": 2.7282, + "step": 12295 + }, + { + "epoch": 3.8850983179341387, + "grad_norm": 0.0770014215554429, + "learning_rate": 0.0001436888649212466, + "loss": 2.8074, + "step": 12300 + }, + { + "epoch": 3.886677722498618, + "grad_norm": 0.08929085293217041, + "learning_rate": 0.0001433022017462376, + "loss": 2.7233, + "step": 12305 + }, + { + "epoch": 3.8882571270630972, + "grad_norm": 0.07900647808172974, + "learning_rate": 0.00014291597247974668, + "loss": 2.742, + "step": 12310 + }, + { + "epoch": 3.8898365316275765, + "grad_norm": 0.08403907746645993, + "learning_rate": 0.00014253017759160636, + "loss": 2.7749, + "step": 12315 + }, + { + "epoch": 3.8914159361920557, + "grad_norm": 0.08830804078019384, + "learning_rate": 0.0001421448175511202, + "loss": 2.8338, + "step": 12320 + }, + { + "epoch": 3.892995340756535, + "grad_norm": 0.08036052320678963, + "learning_rate": 0.00014175989282706332, + "loss": 2.7507, + "step": 12325 + }, + { + "epoch": 3.894574745321014, + "grad_norm": 0.07391711139175576, + "learning_rate": 0.00014137540388768107, + "loss": 2.683, + "step": 12330 + }, + { + "epoch": 3.896154149885493, + "grad_norm": 0.06940022388169716, + "learning_rate": 0.00014099135120068911, + "loss": 2.8218, + "step": 12335 + }, + { + "epoch": 3.8977335544499723, + "grad_norm": 0.0894235128784357, + "learning_rate": 0.00014060773523327176, + "loss": 2.7571, + "step": 12340 + }, + { + "epoch": 3.8993129590144515, + "grad_norm": 0.07888755492894607, + "learning_rate": 0.00014022455645208248, + "loss": 2.7721, + "step": 12345 + }, + { + "epoch": 3.9008923635789308, + "grad_norm": 0.10058397800877593, + "learning_rate": 0.00013984181532324293, + "loss": 2.7646, + "step": 12350 + }, + { + "epoch": 3.90247176814341, + "grad_norm": 0.07999818986103556, + "learning_rate": 0.00013945951231234216, + "loss": 2.6823, + "step": 12355 + }, + { + "epoch": 3.9040511727078893, + "grad_norm": 0.07010208553521481, + "learning_rate": 0.00013907764788443649, + "loss": 2.8145, + "step": 12360 + }, + { + "epoch": 3.905630577272368, + "grad_norm": 0.08192124375429993, + "learning_rate": 0.00013869622250404855, + "loss": 2.8116, + "step": 12365 + }, + { + "epoch": 3.9072099818368473, + "grad_norm": 0.08035748821022067, + "learning_rate": 0.0001383152366351671, + "loss": 2.7304, + "step": 12370 + }, + { + "epoch": 3.9087893864013266, + "grad_norm": 0.0875957355975858, + "learning_rate": 0.00013793469074124614, + "loss": 2.807, + "step": 12375 + }, + { + "epoch": 3.910368790965806, + "grad_norm": 0.07905581991232238, + "learning_rate": 0.0001375545852852042, + "loss": 2.7509, + "step": 12380 + }, + { + "epoch": 3.911948195530285, + "grad_norm": 0.0862463921453058, + "learning_rate": 0.00013717492072942423, + "loss": 2.7711, + "step": 12385 + }, + { + "epoch": 3.9135276000947643, + "grad_norm": 0.08596933906927828, + "learning_rate": 0.00013679569753575322, + "loss": 2.7928, + "step": 12390 + }, + { + "epoch": 3.9151070046592436, + "grad_norm": 0.07644688214878918, + "learning_rate": 0.00013641691616550096, + "loss": 2.7458, + "step": 12395 + }, + { + "epoch": 3.916686409223723, + "grad_norm": 0.07795462137024338, + "learning_rate": 0.00013603857707943933, + "loss": 2.6595, + "step": 12400 + }, + { + "epoch": 3.918265813788202, + "grad_norm": 0.07544869027380872, + "learning_rate": 0.00013566068073780286, + "loss": 2.7311, + "step": 12405 + }, + { + "epoch": 3.9198452183526813, + "grad_norm": 0.07465173186397134, + "learning_rate": 0.00013528322760028705, + "loss": 2.8227, + "step": 12410 + }, + { + "epoch": 3.92142462291716, + "grad_norm": 0.07568641966275272, + "learning_rate": 0.00013490621812604892, + "loss": 2.7298, + "step": 12415 + }, + { + "epoch": 3.9230040274816393, + "grad_norm": 0.07156080882726594, + "learning_rate": 0.00013452965277370487, + "loss": 2.7488, + "step": 12420 + }, + { + "epoch": 3.9245834320461186, + "grad_norm": 0.08462040333981412, + "learning_rate": 0.00013415353200133163, + "loss": 2.748, + "step": 12425 + }, + { + "epoch": 3.926162836610598, + "grad_norm": 0.07149390183535212, + "learning_rate": 0.00013377785626646506, + "loss": 2.6944, + "step": 12430 + }, + { + "epoch": 3.927742241175077, + "grad_norm": 0.09290857724360561, + "learning_rate": 0.00013340262602609942, + "loss": 2.7834, + "step": 12435 + }, + { + "epoch": 3.9293216457395563, + "grad_norm": 0.0709730136169655, + "learning_rate": 0.0001330278417366873, + "loss": 2.7245, + "step": 12440 + }, + { + "epoch": 3.930901050304035, + "grad_norm": 0.07376630063431873, + "learning_rate": 0.00013265350385413871, + "loss": 2.8153, + "step": 12445 + }, + { + "epoch": 3.9324804548685144, + "grad_norm": 0.08242174101966185, + "learning_rate": 0.00013227961283382068, + "loss": 2.8471, + "step": 12450 + }, + { + "epoch": 3.9340598594329936, + "grad_norm": 0.0799237354023802, + "learning_rate": 0.00013190616913055659, + "loss": 2.8095, + "step": 12455 + }, + { + "epoch": 3.935639263997473, + "grad_norm": 0.06942648505961056, + "learning_rate": 0.00013153317319862528, + "loss": 2.6888, + "step": 12460 + }, + { + "epoch": 3.937218668561952, + "grad_norm": 0.07440402727491054, + "learning_rate": 0.00013116062549176183, + "loss": 2.6996, + "step": 12465 + }, + { + "epoch": 3.9387980731264314, + "grad_norm": 0.07871576677595668, + "learning_rate": 0.0001307885264631553, + "loss": 2.8121, + "step": 12470 + }, + { + "epoch": 3.9403774776909106, + "grad_norm": 0.07980672808189275, + "learning_rate": 0.00013041687656544938, + "loss": 2.711, + "step": 12475 + }, + { + "epoch": 3.94195688225539, + "grad_norm": 0.08117275324957014, + "learning_rate": 0.0001300456762507408, + "loss": 2.7112, + "step": 12480 + }, + { + "epoch": 3.943536286819869, + "grad_norm": 0.07012399895331334, + "learning_rate": 0.00012967492597058, + "loss": 2.7654, + "step": 12485 + }, + { + "epoch": 3.9451156913843484, + "grad_norm": 0.07933854900775285, + "learning_rate": 0.00012930462617596994, + "loss": 2.7743, + "step": 12490 + }, + { + "epoch": 3.946695095948827, + "grad_norm": 0.08119374219657345, + "learning_rate": 0.00012893477731736546, + "loss": 2.7857, + "step": 12495 + }, + { + "epoch": 3.9482745005133064, + "grad_norm": 0.10841667058137638, + "learning_rate": 0.00012856537984467248, + "loss": 2.7342, + "step": 12500 + }, + { + "epoch": 3.9498539050777857, + "grad_norm": 0.08529119395669767, + "learning_rate": 0.00012819643420724835, + "loss": 2.7458, + "step": 12505 + }, + { + "epoch": 3.951433309642265, + "grad_norm": 0.08069660608208816, + "learning_rate": 0.0001278279408539006, + "loss": 2.8239, + "step": 12510 + }, + { + "epoch": 3.953012714206744, + "grad_norm": 0.06719522396418853, + "learning_rate": 0.00012745990023288657, + "loss": 2.7527, + "step": 12515 + }, + { + "epoch": 3.9545921187712234, + "grad_norm": 0.09172062976577304, + "learning_rate": 0.0001270923127919128, + "loss": 2.6765, + "step": 12520 + }, + { + "epoch": 3.956171523335702, + "grad_norm": 0.0946439871431242, + "learning_rate": 0.00012672517897813462, + "loss": 2.7211, + "step": 12525 + }, + { + "epoch": 3.9577509279001815, + "grad_norm": 0.07169281189124944, + "learning_rate": 0.00012635849923815562, + "loss": 2.7036, + "step": 12530 + }, + { + "epoch": 3.9593303324646607, + "grad_norm": 0.06873997840461551, + "learning_rate": 0.0001259922740180271, + "loss": 2.7091, + "step": 12535 + }, + { + "epoch": 3.96090973702914, + "grad_norm": 0.07795319134608658, + "learning_rate": 0.00012562650376324674, + "loss": 2.7317, + "step": 12540 + }, + { + "epoch": 3.962489141593619, + "grad_norm": 0.07771723173444081, + "learning_rate": 0.00012526118891875991, + "loss": 2.8105, + "step": 12545 + }, + { + "epoch": 3.9640685461580984, + "grad_norm": 0.08365154714386186, + "learning_rate": 0.00012489632992895722, + "loss": 2.793, + "step": 12550 + }, + { + "epoch": 3.9656479507225777, + "grad_norm": 0.08829169223148622, + "learning_rate": 0.0001245319272376751, + "loss": 2.7862, + "step": 12555 + }, + { + "epoch": 3.967227355287057, + "grad_norm": 0.07959096556734646, + "learning_rate": 0.00012416798128819445, + "loss": 2.7274, + "step": 12560 + }, + { + "epoch": 3.968806759851536, + "grad_norm": 0.07992753763610859, + "learning_rate": 0.0001238044925232409, + "loss": 2.8568, + "step": 12565 + }, + { + "epoch": 3.9703861644160154, + "grad_norm": 0.08179913452703812, + "learning_rate": 0.00012344146138498413, + "loss": 2.7581, + "step": 12570 + }, + { + "epoch": 3.9719655689804942, + "grad_norm": 0.0922006090466073, + "learning_rate": 0.000123078888315037, + "loss": 2.7934, + "step": 12575 + }, + { + "epoch": 3.9735449735449735, + "grad_norm": 0.08126675516242796, + "learning_rate": 0.00012271677375445472, + "loss": 2.7128, + "step": 12580 + }, + { + "epoch": 3.9751243781094527, + "grad_norm": 0.07367105073888085, + "learning_rate": 0.00012235511814373524, + "loss": 2.6988, + "step": 12585 + }, + { + "epoch": 3.976703782673932, + "grad_norm": 0.08732065316831548, + "learning_rate": 0.00012199392192281806, + "loss": 2.7254, + "step": 12590 + }, + { + "epoch": 3.9782831872384112, + "grad_norm": 0.08797895856176832, + "learning_rate": 0.00012163318553108383, + "loss": 2.6384, + "step": 12595 + }, + { + "epoch": 3.97986259180289, + "grad_norm": 0.07485047149535816, + "learning_rate": 0.00012127290940735387, + "loss": 2.8072, + "step": 12600 + }, + { + "epoch": 3.9814419963673693, + "grad_norm": 0.0739096520837601, + "learning_rate": 0.00012091309398988959, + "loss": 2.7176, + "step": 12605 + }, + { + "epoch": 3.9830214009318485, + "grad_norm": 0.08794552730291774, + "learning_rate": 0.00012055373971639194, + "loss": 2.8281, + "step": 12610 + }, + { + "epoch": 3.984600805496328, + "grad_norm": 0.08730105494120556, + "learning_rate": 0.00012019484702400102, + "loss": 2.7105, + "step": 12615 + }, + { + "epoch": 3.986180210060807, + "grad_norm": 0.08248874844921628, + "learning_rate": 0.0001198364163492952, + "loss": 2.8115, + "step": 12620 + }, + { + "epoch": 3.9877596146252863, + "grad_norm": 0.08023159465661149, + "learning_rate": 0.00011947844812829112, + "loss": 2.7232, + "step": 12625 + }, + { + "epoch": 3.9893390191897655, + "grad_norm": 0.07409895266284665, + "learning_rate": 0.00011912094279644264, + "loss": 2.7581, + "step": 12630 + }, + { + "epoch": 3.9909184237542448, + "grad_norm": 0.07309501686116365, + "learning_rate": 0.00011876390078864074, + "loss": 2.8141, + "step": 12635 + }, + { + "epoch": 3.992497828318724, + "grad_norm": 0.0843364088296787, + "learning_rate": 0.00011840732253921226, + "loss": 2.7141, + "step": 12640 + }, + { + "epoch": 3.9940772328832033, + "grad_norm": 0.06949766629434186, + "learning_rate": 0.00011805120848192069, + "loss": 2.8486, + "step": 12645 + }, + { + "epoch": 3.995656637447682, + "grad_norm": 0.09031300657317012, + "learning_rate": 0.00011769555904996454, + "loss": 2.7412, + "step": 12650 + }, + { + "epoch": 3.9972360420121613, + "grad_norm": 0.08610290159458256, + "learning_rate": 0.00011734037467597663, + "loss": 2.7564, + "step": 12655 + }, + { + "epoch": 3.9988154465766406, + "grad_norm": 0.0704581876158133, + "learning_rate": 0.00011698565579202464, + "loss": 2.8096, + "step": 12660 + }, + { + "epoch": 4.0, + "eval_loss": 2.7555720806121826, + "eval_runtime": 118.9074, + "eval_samples_per_second": 22.278, + "eval_steps_per_second": 5.576, + "step": 12664 + }, + { + "epoch": 4.000315880912896, + "grad_norm": 0.07267795935226085, + "learning_rate": 0.00011663140282960972, + "loss": 2.7881, + "step": 12665 + }, + { + "epoch": 4.001895285477375, + "grad_norm": 0.11335087572849066, + "learning_rate": 0.00011627761621966671, + "loss": 2.7376, + "step": 12670 + }, + { + "epoch": 4.0034746900418545, + "grad_norm": 0.09454711336777949, + "learning_rate": 0.00011592429639256236, + "loss": 2.7298, + "step": 12675 + }, + { + "epoch": 4.005054094606334, + "grad_norm": 0.07858817668238845, + "learning_rate": 0.00011557144377809626, + "loss": 2.7434, + "step": 12680 + }, + { + "epoch": 4.006633499170813, + "grad_norm": 0.07858793747755334, + "learning_rate": 0.00011521905880549927, + "loss": 2.7363, + "step": 12685 + }, + { + "epoch": 4.008212903735291, + "grad_norm": 0.07070561973242187, + "learning_rate": 0.00011486714190343368, + "loss": 2.7429, + "step": 12690 + }, + { + "epoch": 4.009792308299771, + "grad_norm": 0.07734112618433828, + "learning_rate": 0.00011451569349999208, + "loss": 2.785, + "step": 12695 + }, + { + "epoch": 4.01137171286425, + "grad_norm": 0.0930543497197193, + "learning_rate": 0.00011416471402269745, + "loss": 2.702, + "step": 12700 + }, + { + "epoch": 4.012951117428729, + "grad_norm": 0.0870420210926015, + "learning_rate": 0.0001138142038985021, + "loss": 2.7423, + "step": 12705 + }, + { + "epoch": 4.014530521993208, + "grad_norm": 0.07013096337671941, + "learning_rate": 0.00011346416355378763, + "loss": 2.7589, + "step": 12710 + }, + { + "epoch": 4.016109926557688, + "grad_norm": 0.06466982904356627, + "learning_rate": 0.00011311459341436398, + "loss": 2.7705, + "step": 12715 + }, + { + "epoch": 4.017689331122167, + "grad_norm": 0.06498097336324272, + "learning_rate": 0.00011276549390546891, + "loss": 2.6803, + "step": 12720 + }, + { + "epoch": 4.019268735686646, + "grad_norm": 0.06619826905104946, + "learning_rate": 0.00011241686545176821, + "loss": 2.7004, + "step": 12725 + }, + { + "epoch": 4.020848140251125, + "grad_norm": 0.0737305188549197, + "learning_rate": 0.00011206870847735451, + "loss": 2.8536, + "step": 12730 + }, + { + "epoch": 4.022427544815605, + "grad_norm": 0.07863028121907374, + "learning_rate": 0.0001117210234057463, + "loss": 2.7752, + "step": 12735 + }, + { + "epoch": 4.024006949380084, + "grad_norm": 0.07744110506698226, + "learning_rate": 0.00011137381065988878, + "loss": 2.8128, + "step": 12740 + }, + { + "epoch": 4.025586353944563, + "grad_norm": 0.08781241077290056, + "learning_rate": 0.00011102707066215207, + "loss": 2.8003, + "step": 12745 + }, + { + "epoch": 4.027165758509042, + "grad_norm": 0.07560593544354333, + "learning_rate": 0.00011068080383433188, + "loss": 2.746, + "step": 12750 + }, + { + "epoch": 4.028745163073522, + "grad_norm": 0.07883754256609818, + "learning_rate": 0.00011033501059764739, + "loss": 2.6919, + "step": 12755 + }, + { + "epoch": 4.030324567638001, + "grad_norm": 0.08393257191797195, + "learning_rate": 0.00010998969137274233, + "loss": 2.6511, + "step": 12760 + }, + { + "epoch": 4.03190397220248, + "grad_norm": 0.0829523093658688, + "learning_rate": 0.00010964484657968366, + "loss": 2.7328, + "step": 12765 + }, + { + "epoch": 4.0334833767669585, + "grad_norm": 0.06908183601758058, + "learning_rate": 0.00010930047663796117, + "loss": 2.7249, + "step": 12770 + }, + { + "epoch": 4.035062781331438, + "grad_norm": 0.09022581140433972, + "learning_rate": 0.00010895658196648705, + "loss": 2.7721, + "step": 12775 + }, + { + "epoch": 4.036642185895917, + "grad_norm": 0.07657464578816578, + "learning_rate": 0.00010861316298359535, + "loss": 2.7059, + "step": 12780 + }, + { + "epoch": 4.038221590460396, + "grad_norm": 0.08410938024536432, + "learning_rate": 0.0001082702201070414, + "loss": 2.7395, + "step": 12785 + }, + { + "epoch": 4.039800995024875, + "grad_norm": 0.08752998570639688, + "learning_rate": 0.00010792775375400143, + "loss": 2.7439, + "step": 12790 + }, + { + "epoch": 4.041380399589355, + "grad_norm": 0.0790726622604146, + "learning_rate": 0.00010758576434107198, + "loss": 2.8198, + "step": 12795 + }, + { + "epoch": 4.042959804153834, + "grad_norm": 0.08010916906242367, + "learning_rate": 0.00010724425228426937, + "loss": 2.7416, + "step": 12800 + }, + { + "epoch": 4.044539208718313, + "grad_norm": 0.07123591418461958, + "learning_rate": 0.00010690321799902935, + "loss": 2.7365, + "step": 12805 + }, + { + "epoch": 4.046118613282792, + "grad_norm": 0.07012860548872378, + "learning_rate": 0.00010656266190020647, + "loss": 2.7608, + "step": 12810 + }, + { + "epoch": 4.047698017847272, + "grad_norm": 0.09592957225014759, + "learning_rate": 0.00010622258440207332, + "loss": 2.7532, + "step": 12815 + }, + { + "epoch": 4.049277422411751, + "grad_norm": 0.07239864807799998, + "learning_rate": 0.0001058829859183204, + "loss": 2.6603, + "step": 12820 + }, + { + "epoch": 4.05085682697623, + "grad_norm": 0.09866936223922237, + "learning_rate": 0.00010554386686205598, + "loss": 2.6921, + "step": 12825 + }, + { + "epoch": 4.052436231540709, + "grad_norm": 0.0992711192539062, + "learning_rate": 0.00010520522764580465, + "loss": 2.7069, + "step": 12830 + }, + { + "epoch": 4.054015636105189, + "grad_norm": 0.11154957581863764, + "learning_rate": 0.00010486706868150719, + "loss": 2.6719, + "step": 12835 + }, + { + "epoch": 4.055595040669668, + "grad_norm": 0.06616443878302948, + "learning_rate": 0.00010452939038052045, + "loss": 2.7901, + "step": 12840 + }, + { + "epoch": 4.057174445234147, + "grad_norm": 0.0742117036830686, + "learning_rate": 0.0001041921931536165, + "loss": 2.7356, + "step": 12845 + }, + { + "epoch": 4.0587538497986255, + "grad_norm": 0.08669868316456582, + "learning_rate": 0.00010385547741098222, + "loss": 2.7192, + "step": 12850 + }, + { + "epoch": 4.060333254363105, + "grad_norm": 0.07619909737865753, + "learning_rate": 0.00010351924356221881, + "loss": 2.7892, + "step": 12855 + }, + { + "epoch": 4.061912658927584, + "grad_norm": 0.07369461427803699, + "learning_rate": 0.00010318349201634114, + "loss": 2.6935, + "step": 12860 + }, + { + "epoch": 4.063492063492063, + "grad_norm": 0.07486900644344201, + "learning_rate": 0.00010284822318177745, + "loss": 2.7046, + "step": 12865 + }, + { + "epoch": 4.0650714680565425, + "grad_norm": 0.06591349144085443, + "learning_rate": 0.00010251343746636898, + "loss": 2.8372, + "step": 12870 + }, + { + "epoch": 4.066650872621022, + "grad_norm": 0.08545955955022334, + "learning_rate": 0.00010217913527736866, + "loss": 2.8099, + "step": 12875 + }, + { + "epoch": 4.068230277185501, + "grad_norm": 0.09741800113980427, + "learning_rate": 0.00010184531702144201, + "loss": 2.8099, + "step": 12880 + }, + { + "epoch": 4.06980968174998, + "grad_norm": 0.06915029241686076, + "learning_rate": 0.00010151198310466542, + "loss": 2.7801, + "step": 12885 + }, + { + "epoch": 4.0713890863144595, + "grad_norm": 0.07865162471831375, + "learning_rate": 0.00010117913393252631, + "loss": 2.7565, + "step": 12890 + }, + { + "epoch": 4.072968490878939, + "grad_norm": 0.07877350645722128, + "learning_rate": 0.00010084676990992198, + "loss": 2.7584, + "step": 12895 + }, + { + "epoch": 4.074547895443418, + "grad_norm": 0.08714047286311652, + "learning_rate": 0.00010051489144115999, + "loss": 2.7372, + "step": 12900 + }, + { + "epoch": 4.076127300007897, + "grad_norm": 0.07739346995909928, + "learning_rate": 0.00010018349892995737, + "loss": 2.7185, + "step": 12905 + }, + { + "epoch": 4.0777067045723765, + "grad_norm": 0.08920991036051683, + "learning_rate": 9.985259277943976e-05, + "loss": 2.775, + "step": 12910 + }, + { + "epoch": 4.079286109136856, + "grad_norm": 0.07472659972509982, + "learning_rate": 9.952217339214087e-05, + "loss": 2.8163, + "step": 12915 + }, + { + "epoch": 4.080865513701335, + "grad_norm": 0.07826325288038806, + "learning_rate": 9.91922411700028e-05, + "loss": 2.7713, + "step": 12920 + }, + { + "epoch": 4.082444918265814, + "grad_norm": 0.07986269543461305, + "learning_rate": 9.886279651437463e-05, + "loss": 2.7425, + "step": 12925 + }, + { + "epoch": 4.084024322830293, + "grad_norm": 0.08008789236435811, + "learning_rate": 9.853383982601293e-05, + "loss": 2.7371, + "step": 12930 + }, + { + "epoch": 4.085603727394772, + "grad_norm": 0.07425675172749298, + "learning_rate": 9.820537150507997e-05, + "loss": 2.6831, + "step": 12935 + }, + { + "epoch": 4.087183131959251, + "grad_norm": 0.073958189495471, + "learning_rate": 9.787739195114425e-05, + "loss": 2.683, + "step": 12940 + }, + { + "epoch": 4.08876253652373, + "grad_norm": 0.08580862030204887, + "learning_rate": 9.754990156317978e-05, + "loss": 2.6973, + "step": 12945 + }, + { + "epoch": 4.09034194108821, + "grad_norm": 0.06697056889956048, + "learning_rate": 9.722290073956536e-05, + "loss": 2.766, + "step": 12950 + }, + { + "epoch": 4.091921345652689, + "grad_norm": 0.07514374406514496, + "learning_rate": 9.689638987808441e-05, + "loss": 2.6808, + "step": 12955 + }, + { + "epoch": 4.093500750217168, + "grad_norm": 0.07904483649894045, + "learning_rate": 9.657036937592422e-05, + "loss": 2.8603, + "step": 12960 + }, + { + "epoch": 4.095080154781647, + "grad_norm": 0.07407896816517008, + "learning_rate": 9.624483962967568e-05, + "loss": 2.7394, + "step": 12965 + }, + { + "epoch": 4.096659559346127, + "grad_norm": 0.0813140930775043, + "learning_rate": 9.59198010353326e-05, + "loss": 2.655, + "step": 12970 + }, + { + "epoch": 4.098238963910606, + "grad_norm": 0.07840620482991092, + "learning_rate": 9.559525398829111e-05, + "loss": 2.8495, + "step": 12975 + }, + { + "epoch": 4.099818368475085, + "grad_norm": 0.0768145193026349, + "learning_rate": 9.527119888334995e-05, + "loss": 2.6923, + "step": 12980 + }, + { + "epoch": 4.101397773039564, + "grad_norm": 0.08710073076402247, + "learning_rate": 9.494763611470903e-05, + "loss": 2.7411, + "step": 12985 + }, + { + "epoch": 4.102977177604044, + "grad_norm": 0.06974392194527812, + "learning_rate": 9.462456607596953e-05, + "loss": 2.708, + "step": 12990 + }, + { + "epoch": 4.104556582168523, + "grad_norm": 0.07353572274840098, + "learning_rate": 9.430198916013294e-05, + "loss": 2.7189, + "step": 12995 + }, + { + "epoch": 4.106135986733002, + "grad_norm": 0.0660256957832875, + "learning_rate": 9.397990575960102e-05, + "loss": 2.6645, + "step": 13000 + }, + { + "epoch": 4.10771539129748, + "grad_norm": 0.06266918290859314, + "learning_rate": 9.365831626617555e-05, + "loss": 2.7111, + "step": 13005 + }, + { + "epoch": 4.10929479586196, + "grad_norm": 0.06435040279489714, + "learning_rate": 9.333722107105724e-05, + "loss": 2.7237, + "step": 13010 + }, + { + "epoch": 4.110874200426439, + "grad_norm": 0.061291168489293696, + "learning_rate": 9.301662056484522e-05, + "loss": 2.7257, + "step": 13015 + }, + { + "epoch": 4.112453604990918, + "grad_norm": 0.06917342741798833, + "learning_rate": 9.269651513753724e-05, + "loss": 2.7505, + "step": 13020 + }, + { + "epoch": 4.114033009555397, + "grad_norm": 0.0808240567172112, + "learning_rate": 9.237690517852859e-05, + "loss": 2.8223, + "step": 13025 + }, + { + "epoch": 4.115612414119877, + "grad_norm": 0.07901186635350496, + "learning_rate": 9.2057791076612e-05, + "loss": 2.8034, + "step": 13030 + }, + { + "epoch": 4.117191818684356, + "grad_norm": 0.10154864996328554, + "learning_rate": 9.173917321997693e-05, + "loss": 2.7343, + "step": 13035 + }, + { + "epoch": 4.118771223248835, + "grad_norm": 0.07225281959690563, + "learning_rate": 9.142105199620915e-05, + "loss": 2.8152, + "step": 13040 + }, + { + "epoch": 4.120350627813314, + "grad_norm": 0.06857353203537747, + "learning_rate": 9.11034277922903e-05, + "loss": 2.7683, + "step": 13045 + }, + { + "epoch": 4.121930032377794, + "grad_norm": 0.07312913741081528, + "learning_rate": 9.078630099459768e-05, + "loss": 2.7367, + "step": 13050 + }, + { + "epoch": 4.123509436942273, + "grad_norm": 0.06819091028683402, + "learning_rate": 9.046967198890283e-05, + "loss": 2.7787, + "step": 13055 + }, + { + "epoch": 4.125088841506752, + "grad_norm": 0.06621544445289144, + "learning_rate": 9.015354116037255e-05, + "loss": 2.7888, + "step": 13060 + }, + { + "epoch": 4.126668246071231, + "grad_norm": 0.06836253581250609, + "learning_rate": 8.983790889356714e-05, + "loss": 2.8769, + "step": 13065 + }, + { + "epoch": 4.128247650635711, + "grad_norm": 0.0749274451688237, + "learning_rate": 8.952277557244076e-05, + "loss": 2.8301, + "step": 13070 + }, + { + "epoch": 4.12982705520019, + "grad_norm": 0.07840530326600034, + "learning_rate": 8.920814158034008e-05, + "loss": 2.723, + "step": 13075 + }, + { + "epoch": 4.131406459764669, + "grad_norm": 0.07538609612683901, + "learning_rate": 8.889400730000474e-05, + "loss": 2.7714, + "step": 13080 + }, + { + "epoch": 4.1329858643291475, + "grad_norm": 0.07840343227079008, + "learning_rate": 8.858037311356676e-05, + "loss": 2.8266, + "step": 13085 + }, + { + "epoch": 4.134565268893627, + "grad_norm": 0.08453554879200816, + "learning_rate": 8.826723940254922e-05, + "loss": 2.8083, + "step": 13090 + }, + { + "epoch": 4.136144673458106, + "grad_norm": 0.08588294433678069, + "learning_rate": 8.795460654786675e-05, + "loss": 2.6759, + "step": 13095 + }, + { + "epoch": 4.137724078022585, + "grad_norm": 0.06968909508591559, + "learning_rate": 8.764247492982469e-05, + "loss": 2.7308, + "step": 13100 + }, + { + "epoch": 4.1393034825870645, + "grad_norm": 0.08330260139514357, + "learning_rate": 8.73308449281185e-05, + "loss": 2.8009, + "step": 13105 + }, + { + "epoch": 4.140882887151544, + "grad_norm": 0.07035989414823073, + "learning_rate": 8.701971692183364e-05, + "loss": 2.6717, + "step": 13110 + }, + { + "epoch": 4.142462291716023, + "grad_norm": 0.07396310603579366, + "learning_rate": 8.670909128944471e-05, + "loss": 2.7767, + "step": 13115 + }, + { + "epoch": 4.144041696280502, + "grad_norm": 0.06781275294051901, + "learning_rate": 8.639896840881533e-05, + "loss": 2.7749, + "step": 13120 + }, + { + "epoch": 4.1456211008449815, + "grad_norm": 0.07113564030328245, + "learning_rate": 8.608934865719759e-05, + "loss": 2.6876, + "step": 13125 + }, + { + "epoch": 4.147200505409461, + "grad_norm": 0.07591610433087836, + "learning_rate": 8.578023241123134e-05, + "loss": 2.7316, + "step": 13130 + }, + { + "epoch": 4.14877990997394, + "grad_norm": 0.0701649084899664, + "learning_rate": 8.547162004694408e-05, + "loss": 2.7393, + "step": 13135 + }, + { + "epoch": 4.150359314538419, + "grad_norm": 0.0707216056523151, + "learning_rate": 8.516351193975041e-05, + "loss": 2.7782, + "step": 13140 + }, + { + "epoch": 4.1519387191028985, + "grad_norm": 0.06585288088366432, + "learning_rate": 8.485590846445134e-05, + "loss": 2.79, + "step": 13145 + }, + { + "epoch": 4.153518123667378, + "grad_norm": 0.07924071303504657, + "learning_rate": 8.454880999523434e-05, + "loss": 2.8983, + "step": 13150 + }, + { + "epoch": 4.155097528231857, + "grad_norm": 0.07506187079168304, + "learning_rate": 8.424221690567185e-05, + "loss": 2.7565, + "step": 13155 + }, + { + "epoch": 4.156676932796336, + "grad_norm": 0.06815158920242109, + "learning_rate": 8.393612956872254e-05, + "loss": 2.8069, + "step": 13160 + }, + { + "epoch": 4.158256337360815, + "grad_norm": 0.07248305637947533, + "learning_rate": 8.363054835672923e-05, + "loss": 2.7453, + "step": 13165 + }, + { + "epoch": 4.159835741925294, + "grad_norm": 0.08402316260156116, + "learning_rate": 8.33254736414189e-05, + "loss": 2.8404, + "step": 13170 + }, + { + "epoch": 4.161415146489773, + "grad_norm": 0.07211372668448897, + "learning_rate": 8.302090579390292e-05, + "loss": 2.783, + "step": 13175 + }, + { + "epoch": 4.162994551054252, + "grad_norm": 0.06899978064304814, + "learning_rate": 8.27168451846757e-05, + "loss": 2.74, + "step": 13180 + }, + { + "epoch": 4.164573955618732, + "grad_norm": 0.07452756938770923, + "learning_rate": 8.241329218361481e-05, + "loss": 2.8303, + "step": 13185 + }, + { + "epoch": 4.166153360183211, + "grad_norm": 0.07142349650656299, + "learning_rate": 8.211024715998022e-05, + "loss": 2.7092, + "step": 13190 + }, + { + "epoch": 4.16773276474769, + "grad_norm": 0.07575208809051998, + "learning_rate": 8.180771048241403e-05, + "loss": 2.7697, + "step": 13195 + }, + { + "epoch": 4.169312169312169, + "grad_norm": 0.06957683151967588, + "learning_rate": 8.150568251893991e-05, + "loss": 2.6347, + "step": 13200 + }, + { + "epoch": 4.1708915738766486, + "grad_norm": 0.07486281848146506, + "learning_rate": 8.120416363696276e-05, + "loss": 2.7276, + "step": 13205 + }, + { + "epoch": 4.172470978441128, + "grad_norm": 0.0916192059245111, + "learning_rate": 8.090315420326811e-05, + "loss": 2.7572, + "step": 13210 + }, + { + "epoch": 4.174050383005607, + "grad_norm": 0.06858563839143138, + "learning_rate": 8.060265458402189e-05, + "loss": 2.6835, + "step": 13215 + }, + { + "epoch": 4.175629787570086, + "grad_norm": 0.07071131140325115, + "learning_rate": 8.030266514476975e-05, + "loss": 2.8856, + "step": 13220 + }, + { + "epoch": 4.1772091921345655, + "grad_norm": 0.08802252899394522, + "learning_rate": 8.000318625043684e-05, + "loss": 2.7193, + "step": 13225 + }, + { + "epoch": 4.178788596699045, + "grad_norm": 0.07430944547478276, + "learning_rate": 7.970421826532708e-05, + "loss": 2.7029, + "step": 13230 + }, + { + "epoch": 4.180368001263524, + "grad_norm": 0.08417855867691813, + "learning_rate": 7.940576155312291e-05, + "loss": 2.701, + "step": 13235 + }, + { + "epoch": 4.181947405828003, + "grad_norm": 0.07304615966559413, + "learning_rate": 7.910781647688514e-05, + "loss": 2.857, + "step": 13240 + }, + { + "epoch": 4.183526810392482, + "grad_norm": 0.06287700243132766, + "learning_rate": 7.8810383399052e-05, + "loss": 2.718, + "step": 13245 + }, + { + "epoch": 4.185106214956961, + "grad_norm": 0.06915417223034416, + "learning_rate": 7.851346268143861e-05, + "loss": 2.8032, + "step": 13250 + }, + { + "epoch": 4.18668561952144, + "grad_norm": 0.08706746214365717, + "learning_rate": 7.821705468523716e-05, + "loss": 2.7408, + "step": 13255 + }, + { + "epoch": 4.188265024085919, + "grad_norm": 0.0695987755863437, + "learning_rate": 7.7921159771016e-05, + "loss": 2.8078, + "step": 13260 + }, + { + "epoch": 4.189844428650399, + "grad_norm": 0.07315416293232646, + "learning_rate": 7.762577829871964e-05, + "loss": 2.7824, + "step": 13265 + }, + { + "epoch": 4.191423833214878, + "grad_norm": 0.07281198087456109, + "learning_rate": 7.73309106276675e-05, + "loss": 2.7185, + "step": 13270 + }, + { + "epoch": 4.193003237779357, + "grad_norm": 0.07879492288896292, + "learning_rate": 7.703655711655433e-05, + "loss": 2.6875, + "step": 13275 + }, + { + "epoch": 4.194582642343836, + "grad_norm": 0.07156009221206502, + "learning_rate": 7.674271812344936e-05, + "loss": 2.7295, + "step": 13280 + }, + { + "epoch": 4.196162046908316, + "grad_norm": 0.07588370500677988, + "learning_rate": 7.644939400579582e-05, + "loss": 2.7847, + "step": 13285 + }, + { + "epoch": 4.197741451472795, + "grad_norm": 0.06610658084447268, + "learning_rate": 7.615658512041068e-05, + "loss": 2.7541, + "step": 13290 + }, + { + "epoch": 4.199320856037274, + "grad_norm": 0.08835917770634552, + "learning_rate": 7.586429182348431e-05, + "loss": 2.831, + "step": 13295 + }, + { + "epoch": 4.200900260601753, + "grad_norm": 0.08165579997433896, + "learning_rate": 7.557251447057961e-05, + "loss": 2.7883, + "step": 13300 + }, + { + "epoch": 4.202479665166233, + "grad_norm": 0.056710901407484034, + "learning_rate": 7.528125341663216e-05, + "loss": 2.7187, + "step": 13305 + }, + { + "epoch": 4.204059069730712, + "grad_norm": 0.06606261383500421, + "learning_rate": 7.499050901594895e-05, + "loss": 2.7384, + "step": 13310 + }, + { + "epoch": 4.205638474295191, + "grad_norm": 0.08710969003358883, + "learning_rate": 7.470028162220921e-05, + "loss": 2.7903, + "step": 13315 + }, + { + "epoch": 4.2072178788596695, + "grad_norm": 0.07616166018552015, + "learning_rate": 7.441057158846276e-05, + "loss": 2.7545, + "step": 13320 + }, + { + "epoch": 4.208797283424149, + "grad_norm": 0.06864727212517878, + "learning_rate": 7.41213792671303e-05, + "loss": 2.7866, + "step": 13325 + }, + { + "epoch": 4.210376687988628, + "grad_norm": 0.07239789201367354, + "learning_rate": 7.383270501000244e-05, + "loss": 2.6703, + "step": 13330 + }, + { + "epoch": 4.211956092553107, + "grad_norm": 0.07398804471101839, + "learning_rate": 7.354454916823988e-05, + "loss": 2.6702, + "step": 13335 + }, + { + "epoch": 4.2135354971175865, + "grad_norm": 0.0713376054157643, + "learning_rate": 7.325691209237251e-05, + "loss": 2.7825, + "step": 13340 + }, + { + "epoch": 4.215114901682066, + "grad_norm": 0.1059775076324938, + "learning_rate": 7.296979413229965e-05, + "loss": 2.8092, + "step": 13345 + }, + { + "epoch": 4.216694306246545, + "grad_norm": 0.08755920739028457, + "learning_rate": 7.26831956372883e-05, + "loss": 2.7413, + "step": 13350 + }, + { + "epoch": 4.218273710811024, + "grad_norm": 0.08570092338531314, + "learning_rate": 7.239711695597423e-05, + "loss": 2.8745, + "step": 13355 + }, + { + "epoch": 4.2198531153755034, + "grad_norm": 0.08907014603792289, + "learning_rate": 7.211155843636058e-05, + "loss": 2.7157, + "step": 13360 + }, + { + "epoch": 4.221432519939983, + "grad_norm": 0.06238522496569736, + "learning_rate": 7.182652042581777e-05, + "loss": 2.6032, + "step": 13365 + }, + { + "epoch": 4.223011924504462, + "grad_norm": 0.0654451679189967, + "learning_rate": 7.154200327108313e-05, + "loss": 2.7601, + "step": 13370 + }, + { + "epoch": 4.224591329068941, + "grad_norm": 0.08077192983965602, + "learning_rate": 7.125800731826027e-05, + "loss": 2.7445, + "step": 13375 + }, + { + "epoch": 4.22617073363342, + "grad_norm": 0.0695802242339257, + "learning_rate": 7.097453291281886e-05, + "loss": 2.9043, + "step": 13380 + }, + { + "epoch": 4.2277501381979, + "grad_norm": 0.06603611733803791, + "learning_rate": 7.069158039959428e-05, + "loss": 2.6915, + "step": 13385 + }, + { + "epoch": 4.229329542762379, + "grad_norm": 0.07245836114459824, + "learning_rate": 7.040915012278649e-05, + "loss": 2.7769, + "step": 13390 + }, + { + "epoch": 4.230908947326858, + "grad_norm": 0.07601383842300888, + "learning_rate": 7.01272424259608e-05, + "loss": 2.7277, + "step": 13395 + }, + { + "epoch": 4.2324883518913365, + "grad_norm": 0.0674693714620134, + "learning_rate": 6.984585765204665e-05, + "loss": 2.7123, + "step": 13400 + }, + { + "epoch": 4.234067756455816, + "grad_norm": 0.06622712675349786, + "learning_rate": 6.956499614333728e-05, + "loss": 2.7395, + "step": 13405 + }, + { + "epoch": 4.235647161020295, + "grad_norm": 0.07659108550055024, + "learning_rate": 6.928465824148921e-05, + "loss": 2.7132, + "step": 13410 + }, + { + "epoch": 4.237226565584774, + "grad_norm": 0.06989111688990207, + "learning_rate": 6.900484428752229e-05, + "loss": 2.7415, + "step": 13415 + }, + { + "epoch": 4.2388059701492535, + "grad_norm": 0.0812313906742365, + "learning_rate": 6.872555462181907e-05, + "loss": 2.9352, + "step": 13420 + }, + { + "epoch": 4.240385374713733, + "grad_norm": 0.08118905036233727, + "learning_rate": 6.84467895841242e-05, + "loss": 2.7589, + "step": 13425 + }, + { + "epoch": 4.241964779278212, + "grad_norm": 0.0815071465384909, + "learning_rate": 6.816854951354395e-05, + "loss": 2.7103, + "step": 13430 + }, + { + "epoch": 4.243544183842691, + "grad_norm": 0.07226807620972926, + "learning_rate": 6.789083474854623e-05, + "loss": 2.7102, + "step": 13435 + }, + { + "epoch": 4.2451235884071705, + "grad_norm": 0.06507752689397496, + "learning_rate": 6.761364562695993e-05, + "loss": 2.8181, + "step": 13440 + }, + { + "epoch": 4.24670299297165, + "grad_norm": 0.06343344383579275, + "learning_rate": 6.733698248597442e-05, + "loss": 2.6587, + "step": 13445 + }, + { + "epoch": 4.248282397536129, + "grad_norm": 0.08499124262494784, + "learning_rate": 6.706084566213933e-05, + "loss": 2.7394, + "step": 13450 + }, + { + "epoch": 4.249861802100608, + "grad_norm": 0.07541637564123528, + "learning_rate": 6.678523549136395e-05, + "loss": 2.7756, + "step": 13455 + }, + { + "epoch": 4.2514412066650875, + "grad_norm": 0.07850718407085357, + "learning_rate": 6.651015230891694e-05, + "loss": 2.7917, + "step": 13460 + }, + { + "epoch": 4.253020611229567, + "grad_norm": 0.08729045027729063, + "learning_rate": 6.6235596449426e-05, + "loss": 2.6784, + "step": 13465 + }, + { + "epoch": 4.254600015794046, + "grad_norm": 0.06528484978373501, + "learning_rate": 6.59615682468772e-05, + "loss": 2.6629, + "step": 13470 + }, + { + "epoch": 4.256179420358524, + "grad_norm": 0.06004152882965481, + "learning_rate": 6.568806803461486e-05, + "loss": 2.763, + "step": 13475 + }, + { + "epoch": 4.257758824923004, + "grad_norm": 0.06942651796503685, + "learning_rate": 6.541509614534102e-05, + "loss": 2.7736, + "step": 13480 + }, + { + "epoch": 4.259338229487483, + "grad_norm": 0.08931798912696844, + "learning_rate": 6.514265291111505e-05, + "loss": 2.7304, + "step": 13485 + }, + { + "epoch": 4.260917634051962, + "grad_norm": 0.06593449167140476, + "learning_rate": 6.487073866335297e-05, + "loss": 2.7512, + "step": 13490 + }, + { + "epoch": 4.262497038616441, + "grad_norm": 0.06953667307281435, + "learning_rate": 6.459935373282754e-05, + "loss": 2.8361, + "step": 13495 + }, + { + "epoch": 4.264076443180921, + "grad_norm": 0.06738249714486153, + "learning_rate": 6.432849844966781e-05, + "loss": 2.7549, + "step": 13500 + }, + { + "epoch": 4.2656558477454, + "grad_norm": 0.06926766218445686, + "learning_rate": 6.405817314335838e-05, + "loss": 2.7942, + "step": 13505 + }, + { + "epoch": 4.267235252309879, + "grad_norm": 0.06177416969559798, + "learning_rate": 6.378837814273886e-05, + "loss": 2.7003, + "step": 13510 + }, + { + "epoch": 4.268814656874358, + "grad_norm": 0.07142728930853254, + "learning_rate": 6.351911377600405e-05, + "loss": 2.7676, + "step": 13515 + }, + { + "epoch": 4.270394061438838, + "grad_norm": 0.07247437337603856, + "learning_rate": 6.325038037070335e-05, + "loss": 2.6987, + "step": 13520 + }, + { + "epoch": 4.271973466003317, + "grad_norm": 0.06667755074853544, + "learning_rate": 6.298217825374003e-05, + "loss": 2.6546, + "step": 13525 + }, + { + "epoch": 4.273552870567796, + "grad_norm": 0.05841983559812674, + "learning_rate": 6.271450775137116e-05, + "loss": 2.6878, + "step": 13530 + }, + { + "epoch": 4.275132275132275, + "grad_norm": 0.13813052672559056, + "learning_rate": 6.244736918920723e-05, + "loss": 2.7327, + "step": 13535 + }, + { + "epoch": 4.276711679696755, + "grad_norm": 0.09279953583701123, + "learning_rate": 6.218076289221153e-05, + "loss": 2.7295, + "step": 13540 + }, + { + "epoch": 4.278291084261234, + "grad_norm": 0.08170515928486509, + "learning_rate": 6.191468918469983e-05, + "loss": 2.7195, + "step": 13545 + }, + { + "epoch": 4.279870488825713, + "grad_norm": 0.08292936633247144, + "learning_rate": 6.164914839034009e-05, + "loss": 2.8554, + "step": 13550 + }, + { + "epoch": 4.281449893390192, + "grad_norm": 0.07496049391155023, + "learning_rate": 6.13841408321521e-05, + "loss": 2.7582, + "step": 13555 + }, + { + "epoch": 4.283029297954671, + "grad_norm": 0.07430535646232635, + "learning_rate": 6.111966683250681e-05, + "loss": 2.8655, + "step": 13560 + }, + { + "epoch": 4.28460870251915, + "grad_norm": 0.06984431750702808, + "learning_rate": 6.085572671312628e-05, + "loss": 2.7455, + "step": 13565 + }, + { + "epoch": 4.286188107083629, + "grad_norm": 0.06264677255343279, + "learning_rate": 6.059232079508276e-05, + "loss": 2.7517, + "step": 13570 + }, + { + "epoch": 4.287767511648108, + "grad_norm": 0.06003493490936945, + "learning_rate": 6.0329449398799306e-05, + "loss": 2.6768, + "step": 13575 + }, + { + "epoch": 4.289346916212588, + "grad_norm": 0.06981023649376253, + "learning_rate": 6.006711284404837e-05, + "loss": 2.7726, + "step": 13580 + }, + { + "epoch": 4.290926320777067, + "grad_norm": 0.07681366787794795, + "learning_rate": 5.980531144995155e-05, + "loss": 2.7501, + "step": 13585 + }, + { + "epoch": 4.292505725341546, + "grad_norm": 0.07795095111464338, + "learning_rate": 5.9544045534979885e-05, + "loss": 2.727, + "step": 13590 + }, + { + "epoch": 4.294085129906025, + "grad_norm": 0.07648082875555658, + "learning_rate": 5.9283315416952696e-05, + "loss": 2.722, + "step": 13595 + }, + { + "epoch": 4.295664534470505, + "grad_norm": 0.06563449012297855, + "learning_rate": 5.9023121413038064e-05, + "loss": 2.7084, + "step": 13600 + }, + { + "epoch": 4.297243939034984, + "grad_norm": 0.07913454594352652, + "learning_rate": 5.8763463839751065e-05, + "loss": 2.7199, + "step": 13605 + }, + { + "epoch": 4.298823343599463, + "grad_norm": 0.08395026529050832, + "learning_rate": 5.850434301295493e-05, + "loss": 2.7976, + "step": 13610 + }, + { + "epoch": 4.300402748163942, + "grad_norm": 0.0763829408994177, + "learning_rate": 5.824575924785969e-05, + "loss": 2.7019, + "step": 13615 + }, + { + "epoch": 4.301982152728422, + "grad_norm": 0.0773512882126801, + "learning_rate": 5.798771285902205e-05, + "loss": 2.9049, + "step": 13620 + }, + { + "epoch": 4.303561557292901, + "grad_norm": 0.06828440170825315, + "learning_rate": 5.7730204160345135e-05, + "loss": 2.7715, + "step": 13625 + }, + { + "epoch": 4.30514096185738, + "grad_norm": 0.073421634901678, + "learning_rate": 5.7473233465077766e-05, + "loss": 2.7953, + "step": 13630 + }, + { + "epoch": 4.3067203664218585, + "grad_norm": 0.07285346751400801, + "learning_rate": 5.7216801085814616e-05, + "loss": 2.8742, + "step": 13635 + }, + { + "epoch": 4.308299770986338, + "grad_norm": 0.06569249021510394, + "learning_rate": 5.6960907334495274e-05, + "loss": 2.7577, + "step": 13640 + }, + { + "epoch": 4.309879175550817, + "grad_norm": 0.07456857019857352, + "learning_rate": 5.6705552522404226e-05, + "loss": 2.7515, + "step": 13645 + }, + { + "epoch": 4.311458580115296, + "grad_norm": 0.07198010974084602, + "learning_rate": 5.645073696017028e-05, + "loss": 2.7084, + "step": 13650 + }, + { + "epoch": 4.3130379846797755, + "grad_norm": 0.06138065120692361, + "learning_rate": 5.619646095776632e-05, + "loss": 2.7768, + "step": 13655 + }, + { + "epoch": 4.314617389244255, + "grad_norm": 0.06453928405053538, + "learning_rate": 5.5942724824509014e-05, + "loss": 2.7546, + "step": 13660 + }, + { + "epoch": 4.316196793808734, + "grad_norm": 0.0747783956818153, + "learning_rate": 5.5689528869057924e-05, + "loss": 2.7236, + "step": 13665 + }, + { + "epoch": 4.317776198373213, + "grad_norm": 0.06628218044947043, + "learning_rate": 5.5436873399415836e-05, + "loss": 2.7179, + "step": 13670 + }, + { + "epoch": 4.3193556029376925, + "grad_norm": 0.07117624637547765, + "learning_rate": 5.518475872292789e-05, + "loss": 2.6556, + "step": 13675 + }, + { + "epoch": 4.320935007502172, + "grad_norm": 0.06278166768111325, + "learning_rate": 5.4933185146281706e-05, + "loss": 2.7812, + "step": 13680 + }, + { + "epoch": 4.322514412066651, + "grad_norm": 0.06052245340859566, + "learning_rate": 5.468215297550616e-05, + "loss": 2.7103, + "step": 13685 + }, + { + "epoch": 4.32409381663113, + "grad_norm": 0.07885858716670277, + "learning_rate": 5.443166251597187e-05, + "loss": 2.6971, + "step": 13690 + }, + { + "epoch": 4.3256732211956095, + "grad_norm": 0.06694542098736136, + "learning_rate": 5.418171407239042e-05, + "loss": 2.8413, + "step": 13695 + }, + { + "epoch": 4.327252625760089, + "grad_norm": 0.07576900006105601, + "learning_rate": 5.393230794881398e-05, + "loss": 2.7751, + "step": 13700 + }, + { + "epoch": 4.328832030324568, + "grad_norm": 0.07256073278844476, + "learning_rate": 5.36834444486351e-05, + "loss": 2.7187, + "step": 13705 + }, + { + "epoch": 4.330411434889047, + "grad_norm": 0.08385522204414514, + "learning_rate": 5.343512387458621e-05, + "loss": 2.7229, + "step": 13710 + }, + { + "epoch": 4.3319908394535265, + "grad_norm": 0.07873581195704993, + "learning_rate": 5.31873465287393e-05, + "loss": 2.7325, + "step": 13715 + }, + { + "epoch": 4.333570244018005, + "grad_norm": 0.0750069810527468, + "learning_rate": 5.2940112712505485e-05, + "loss": 2.7698, + "step": 13720 + }, + { + "epoch": 4.335149648582484, + "grad_norm": 0.06965511574224907, + "learning_rate": 5.269342272663486e-05, + "loss": 2.7588, + "step": 13725 + }, + { + "epoch": 4.336729053146963, + "grad_norm": 0.06150813914298518, + "learning_rate": 5.244727687121581e-05, + "loss": 2.7442, + "step": 13730 + }, + { + "epoch": 4.338308457711443, + "grad_norm": 0.08021150446847683, + "learning_rate": 5.220167544567483e-05, + "loss": 2.7578, + "step": 13735 + }, + { + "epoch": 4.339887862275922, + "grad_norm": 0.06595907033478059, + "learning_rate": 5.195661874877633e-05, + "loss": 2.8048, + "step": 13740 + }, + { + "epoch": 4.341467266840401, + "grad_norm": 0.06715984794095874, + "learning_rate": 5.1712107078621674e-05, + "loss": 2.7568, + "step": 13745 + }, + { + "epoch": 4.34304667140488, + "grad_norm": 0.07371042512940375, + "learning_rate": 5.1468140732649495e-05, + "loss": 2.7925, + "step": 13750 + }, + { + "epoch": 4.34462607596936, + "grad_norm": 0.0700871588028233, + "learning_rate": 5.122472000763523e-05, + "loss": 2.7745, + "step": 13755 + }, + { + "epoch": 4.346205480533839, + "grad_norm": 0.06950713265252911, + "learning_rate": 5.09818451996904e-05, + "loss": 2.7787, + "step": 13760 + }, + { + "epoch": 4.347784885098318, + "grad_norm": 0.06535950114577534, + "learning_rate": 5.0739516604262234e-05, + "loss": 2.8471, + "step": 13765 + }, + { + "epoch": 4.349364289662797, + "grad_norm": 0.06463402111041774, + "learning_rate": 5.0497734516133816e-05, + "loss": 2.7385, + "step": 13770 + }, + { + "epoch": 4.350943694227277, + "grad_norm": 0.07359128160034697, + "learning_rate": 5.025649922942322e-05, + "loss": 2.7414, + "step": 13775 + }, + { + "epoch": 4.352523098791756, + "grad_norm": 0.07421338773029013, + "learning_rate": 5.001581103758374e-05, + "loss": 2.7344, + "step": 13780 + }, + { + "epoch": 4.354102503356235, + "grad_norm": 0.07800647904117446, + "learning_rate": 4.977567023340263e-05, + "loss": 2.7717, + "step": 13785 + }, + { + "epoch": 4.355681907920714, + "grad_norm": 0.0763861872196818, + "learning_rate": 4.95360771090016e-05, + "loss": 2.7473, + "step": 13790 + }, + { + "epoch": 4.357261312485193, + "grad_norm": 0.08171719347294958, + "learning_rate": 4.9297031955836014e-05, + "loss": 2.6722, + "step": 13795 + }, + { + "epoch": 4.358840717049672, + "grad_norm": 0.10817254137085992, + "learning_rate": 4.9058535064694764e-05, + "loss": 2.7582, + "step": 13800 + }, + { + "epoch": 4.360420121614151, + "grad_norm": 0.06503752608596988, + "learning_rate": 4.882058672569961e-05, + "loss": 2.7507, + "step": 13805 + }, + { + "epoch": 4.36199952617863, + "grad_norm": 0.06677998740072949, + "learning_rate": 4.858318722830518e-05, + "loss": 2.7941, + "step": 13810 + }, + { + "epoch": 4.36357893074311, + "grad_norm": 0.06369201479861068, + "learning_rate": 4.834633686129841e-05, + "loss": 2.7379, + "step": 13815 + }, + { + "epoch": 4.365158335307589, + "grad_norm": 0.06259694786399726, + "learning_rate": 4.8110035912798334e-05, + "loss": 2.7584, + "step": 13820 + }, + { + "epoch": 4.366737739872068, + "grad_norm": 0.07670825601864158, + "learning_rate": 4.7874284670255395e-05, + "loss": 2.8117, + "step": 13825 + }, + { + "epoch": 4.368317144436547, + "grad_norm": 0.09683683755337852, + "learning_rate": 4.7639083420451423e-05, + "loss": 2.6355, + "step": 13830 + }, + { + "epoch": 4.369896549001027, + "grad_norm": 0.06627193836192438, + "learning_rate": 4.740443244949949e-05, + "loss": 2.636, + "step": 13835 + }, + { + "epoch": 4.371475953565506, + "grad_norm": 0.07032739646481113, + "learning_rate": 4.71703320428431e-05, + "loss": 2.8651, + "step": 13840 + }, + { + "epoch": 4.373055358129985, + "grad_norm": 0.06890569235735645, + "learning_rate": 4.6936782485255734e-05, + "loss": 2.725, + "step": 13845 + }, + { + "epoch": 4.374634762694464, + "grad_norm": 0.08070104843324633, + "learning_rate": 4.670378406084119e-05, + "loss": 2.6644, + "step": 13850 + }, + { + "epoch": 4.376214167258944, + "grad_norm": 0.0688429788742678, + "learning_rate": 4.6471337053032466e-05, + "loss": 2.7465, + "step": 13855 + }, + { + "epoch": 4.377793571823423, + "grad_norm": 0.056364800051506535, + "learning_rate": 4.623944174459238e-05, + "loss": 2.8138, + "step": 13860 + }, + { + "epoch": 4.379372976387902, + "grad_norm": 0.0706255673968221, + "learning_rate": 4.600809841761194e-05, + "loss": 2.7108, + "step": 13865 + }, + { + "epoch": 4.380952380952381, + "grad_norm": 0.07042155634195384, + "learning_rate": 4.57773073535111e-05, + "loss": 2.7321, + "step": 13870 + }, + { + "epoch": 4.38253178551686, + "grad_norm": 0.06509540616282554, + "learning_rate": 4.554706883303783e-05, + "loss": 2.7258, + "step": 13875 + }, + { + "epoch": 4.384111190081339, + "grad_norm": 0.06879444588675486, + "learning_rate": 4.53173831362681e-05, + "loss": 2.7727, + "step": 13880 + }, + { + "epoch": 4.385690594645818, + "grad_norm": 0.07447573997972565, + "learning_rate": 4.508825054260529e-05, + "loss": 2.7852, + "step": 13885 + }, + { + "epoch": 4.3872699992102975, + "grad_norm": 0.08917821801221523, + "learning_rate": 4.485967133078001e-05, + "loss": 2.6625, + "step": 13890 + }, + { + "epoch": 4.388849403774777, + "grad_norm": 0.06734782982930355, + "learning_rate": 4.463164577884959e-05, + "loss": 2.7126, + "step": 13895 + }, + { + "epoch": 4.390428808339256, + "grad_norm": 0.060382294964502946, + "learning_rate": 4.4404174164198116e-05, + "loss": 2.8383, + "step": 13900 + }, + { + "epoch": 4.392008212903735, + "grad_norm": 0.07387672965341177, + "learning_rate": 4.41772567635354e-05, + "loss": 2.8482, + "step": 13905 + }, + { + "epoch": 4.3935876174682145, + "grad_norm": 0.060981720869160934, + "learning_rate": 4.3950893852897465e-05, + "loss": 2.7127, + "step": 13910 + }, + { + "epoch": 4.395167022032694, + "grad_norm": 0.07892445094347626, + "learning_rate": 4.372508570764572e-05, + "loss": 2.7342, + "step": 13915 + }, + { + "epoch": 4.396746426597173, + "grad_norm": 0.06792673838931045, + "learning_rate": 4.349983260246676e-05, + "loss": 2.6993, + "step": 13920 + }, + { + "epoch": 4.398325831161652, + "grad_norm": 0.07555246416109847, + "learning_rate": 4.327513481137168e-05, + "loss": 2.731, + "step": 13925 + }, + { + "epoch": 4.3999052357261315, + "grad_norm": 0.07844031330721625, + "learning_rate": 4.305099260769635e-05, + "loss": 2.6728, + "step": 13930 + }, + { + "epoch": 4.401484640290611, + "grad_norm": 0.06945229492805172, + "learning_rate": 4.2827406264100976e-05, + "loss": 2.754, + "step": 13935 + }, + { + "epoch": 4.40306404485509, + "grad_norm": 0.06506067535766738, + "learning_rate": 4.260437605256912e-05, + "loss": 2.6624, + "step": 13940 + }, + { + "epoch": 4.404643449419569, + "grad_norm": 0.0801259534187138, + "learning_rate": 4.238190224440813e-05, + "loss": 2.7922, + "step": 13945 + }, + { + "epoch": 4.406222853984048, + "grad_norm": 0.07372482715922321, + "learning_rate": 4.2159985110248435e-05, + "loss": 2.7701, + "step": 13950 + }, + { + "epoch": 4.407802258548527, + "grad_norm": 0.06525766264560198, + "learning_rate": 4.1938624920043356e-05, + "loss": 2.7111, + "step": 13955 + }, + { + "epoch": 4.409381663113006, + "grad_norm": 0.06493386078079086, + "learning_rate": 4.171782194306856e-05, + "loss": 2.6602, + "step": 13960 + }, + { + "epoch": 4.410961067677485, + "grad_norm": 0.06553366615343312, + "learning_rate": 4.149757644792207e-05, + "loss": 2.836, + "step": 13965 + }, + { + "epoch": 4.4125404722419646, + "grad_norm": 0.0698484117012733, + "learning_rate": 4.127788870252358e-05, + "loss": 2.7079, + "step": 13970 + }, + { + "epoch": 4.414119876806444, + "grad_norm": 0.06464569018163112, + "learning_rate": 4.1058758974114485e-05, + "loss": 2.8094, + "step": 13975 + }, + { + "epoch": 4.415699281370923, + "grad_norm": 0.06047139827966574, + "learning_rate": 4.0840187529257275e-05, + "loss": 2.7841, + "step": 13980 + }, + { + "epoch": 4.417278685935402, + "grad_norm": 0.07218113510282796, + "learning_rate": 4.062217463383516e-05, + "loss": 2.7843, + "step": 13985 + }, + { + "epoch": 4.4188580904998815, + "grad_norm": 0.06689263430779493, + "learning_rate": 4.0404720553052224e-05, + "loss": 2.7752, + "step": 13990 + }, + { + "epoch": 4.420437495064361, + "grad_norm": 0.0745169954223912, + "learning_rate": 4.018782555143258e-05, + "loss": 2.7223, + "step": 13995 + }, + { + "epoch": 4.42201689962884, + "grad_norm": 0.05396426558685231, + "learning_rate": 3.997148989282035e-05, + "loss": 2.7539, + "step": 14000 + }, + { + "epoch": 4.423596304193319, + "grad_norm": 0.06034232058814344, + "learning_rate": 3.9755713840378906e-05, + "loss": 2.6927, + "step": 14005 + }, + { + "epoch": 4.4251757087577985, + "grad_norm": 0.07655578601385024, + "learning_rate": 3.9540497656591234e-05, + "loss": 2.7493, + "step": 14010 + }, + { + "epoch": 4.426755113322278, + "grad_norm": 0.08736839497775496, + "learning_rate": 3.9325841603259414e-05, + "loss": 2.7368, + "step": 14015 + }, + { + "epoch": 4.428334517886757, + "grad_norm": 0.07381437602143522, + "learning_rate": 3.911174594150352e-05, + "loss": 2.7831, + "step": 14020 + }, + { + "epoch": 4.429913922451236, + "grad_norm": 0.05704036250688668, + "learning_rate": 3.889821093176255e-05, + "loss": 2.7748, + "step": 14025 + }, + { + "epoch": 4.4314933270157155, + "grad_norm": 0.06313743725519254, + "learning_rate": 3.868523683379316e-05, + "loss": 2.7115, + "step": 14030 + }, + { + "epoch": 4.433072731580194, + "grad_norm": 0.06809934158165958, + "learning_rate": 3.8472823906669784e-05, + "loss": 2.7187, + "step": 14035 + }, + { + "epoch": 4.434652136144673, + "grad_norm": 0.06549677567756533, + "learning_rate": 3.8260972408784235e-05, + "loss": 2.6307, + "step": 14040 + }, + { + "epoch": 4.436231540709152, + "grad_norm": 0.06997860888308473, + "learning_rate": 3.80496825978453e-05, + "loss": 2.863, + "step": 14045 + }, + { + "epoch": 4.437810945273632, + "grad_norm": 0.0651886553309917, + "learning_rate": 3.783895473087851e-05, + "loss": 2.665, + "step": 14050 + }, + { + "epoch": 4.439390349838111, + "grad_norm": 0.0665741756348438, + "learning_rate": 3.76287890642259e-05, + "loss": 2.6741, + "step": 14055 + }, + { + "epoch": 4.44096975440259, + "grad_norm": 0.06426164049312041, + "learning_rate": 3.741918585354548e-05, + "loss": 2.6937, + "step": 14060 + }, + { + "epoch": 4.442549158967069, + "grad_norm": 0.06891509753497858, + "learning_rate": 3.721014535381117e-05, + "loss": 2.7693, + "step": 14065 + }, + { + "epoch": 4.444128563531549, + "grad_norm": 0.06288296963737722, + "learning_rate": 3.70016678193123e-05, + "loss": 2.8424, + "step": 14070 + }, + { + "epoch": 4.445707968096028, + "grad_norm": 0.05996547439265329, + "learning_rate": 3.6793753503653385e-05, + "loss": 2.6971, + "step": 14075 + }, + { + "epoch": 4.447287372660507, + "grad_norm": 0.07084741729207963, + "learning_rate": 3.6586402659753995e-05, + "loss": 2.8812, + "step": 14080 + }, + { + "epoch": 4.448866777224986, + "grad_norm": 0.06084729264743166, + "learning_rate": 3.6379615539847756e-05, + "loss": 2.8096, + "step": 14085 + }, + { + "epoch": 4.450446181789466, + "grad_norm": 0.07677261064708935, + "learning_rate": 3.617339239548312e-05, + "loss": 2.7297, + "step": 14090 + }, + { + "epoch": 4.452025586353945, + "grad_norm": 0.05650388293033085, + "learning_rate": 3.5967733477522246e-05, + "loss": 2.7602, + "step": 14095 + }, + { + "epoch": 4.453604990918424, + "grad_norm": 0.062323534595875925, + "learning_rate": 3.576263903614085e-05, + "loss": 2.7709, + "step": 14100 + }, + { + "epoch": 4.455184395482903, + "grad_norm": 0.06398308241638362, + "learning_rate": 3.555810932082809e-05, + "loss": 2.8182, + "step": 14105 + }, + { + "epoch": 4.456763800047382, + "grad_norm": 0.06880813740246586, + "learning_rate": 3.5354144580385994e-05, + "loss": 2.6806, + "step": 14110 + }, + { + "epoch": 4.458343204611861, + "grad_norm": 0.0601846120350076, + "learning_rate": 3.515074506292981e-05, + "loss": 2.7093, + "step": 14115 + }, + { + "epoch": 4.45992260917634, + "grad_norm": 0.06296228083020446, + "learning_rate": 3.494791101588657e-05, + "loss": 2.7168, + "step": 14120 + }, + { + "epoch": 4.4615020137408195, + "grad_norm": 0.06191844446700565, + "learning_rate": 3.474564268599584e-05, + "loss": 2.7745, + "step": 14125 + }, + { + "epoch": 4.463081418305299, + "grad_norm": 0.06884176731460052, + "learning_rate": 3.454394031930885e-05, + "loss": 2.7455, + "step": 14130 + }, + { + "epoch": 4.464660822869778, + "grad_norm": 0.06755656670539843, + "learning_rate": 3.4342804161188456e-05, + "loss": 2.7446, + "step": 14135 + }, + { + "epoch": 4.466240227434257, + "grad_norm": 0.06319580791901808, + "learning_rate": 3.414223445630865e-05, + "loss": 2.7739, + "step": 14140 + }, + { + "epoch": 4.467819631998736, + "grad_norm": 0.08458901559524377, + "learning_rate": 3.3942231448654494e-05, + "loss": 2.7623, + "step": 14145 + }, + { + "epoch": 4.469399036563216, + "grad_norm": 0.08682005555589914, + "learning_rate": 3.374279538152153e-05, + "loss": 2.8679, + "step": 14150 + }, + { + "epoch": 4.470978441127695, + "grad_norm": 0.06654776150384711, + "learning_rate": 3.3543926497515806e-05, + "loss": 2.7028, + "step": 14155 + }, + { + "epoch": 4.472557845692174, + "grad_norm": 0.06494438763287523, + "learning_rate": 3.334562503855321e-05, + "loss": 2.8037, + "step": 14160 + }, + { + "epoch": 4.474137250256653, + "grad_norm": 0.06230766616900129, + "learning_rate": 3.3147891245859374e-05, + "loss": 2.7058, + "step": 14165 + }, + { + "epoch": 4.475716654821133, + "grad_norm": 0.0654662855004723, + "learning_rate": 3.2950725359969735e-05, + "loss": 2.6733, + "step": 14170 + }, + { + "epoch": 4.477296059385612, + "grad_norm": 0.06401045176203156, + "learning_rate": 3.2754127620728714e-05, + "loss": 2.7908, + "step": 14175 + }, + { + "epoch": 4.478875463950091, + "grad_norm": 0.07287652394147869, + "learning_rate": 3.2558098267289226e-05, + "loss": 2.7242, + "step": 14180 + }, + { + "epoch": 4.48045486851457, + "grad_norm": 0.08527784503859166, + "learning_rate": 3.2362637538113305e-05, + "loss": 2.6897, + "step": 14185 + }, + { + "epoch": 4.48203427307905, + "grad_norm": 0.06939312669332218, + "learning_rate": 3.216774567097097e-05, + "loss": 2.8094, + "step": 14190 + }, + { + "epoch": 4.483613677643528, + "grad_norm": 0.061218931687837694, + "learning_rate": 3.197342290294053e-05, + "loss": 2.7239, + "step": 14195 + }, + { + "epoch": 4.485193082208007, + "grad_norm": 0.06603079912688316, + "learning_rate": 3.177966947040761e-05, + "loss": 2.819, + "step": 14200 + }, + { + "epoch": 4.4867724867724865, + "grad_norm": 0.060929969118875785, + "learning_rate": 3.158648560906552e-05, + "loss": 2.7157, + "step": 14205 + }, + { + "epoch": 4.488351891336966, + "grad_norm": 0.06468485810349213, + "learning_rate": 3.139387155391465e-05, + "loss": 2.7658, + "step": 14210 + }, + { + "epoch": 4.489931295901445, + "grad_norm": 0.07268389043589228, + "learning_rate": 3.120182753926226e-05, + "loss": 2.8512, + "step": 14215 + }, + { + "epoch": 4.491510700465924, + "grad_norm": 0.060452501315795604, + "learning_rate": 3.101035379872219e-05, + "loss": 2.6202, + "step": 14220 + }, + { + "epoch": 4.4930901050304035, + "grad_norm": 0.06324868957888526, + "learning_rate": 3.081945056521451e-05, + "loss": 2.8334, + "step": 14225 + }, + { + "epoch": 4.494669509594883, + "grad_norm": 0.0648943320479459, + "learning_rate": 3.06291180709653e-05, + "loss": 2.7852, + "step": 14230 + }, + { + "epoch": 4.496248914159362, + "grad_norm": 0.095625503609685, + "learning_rate": 3.0439356547506513e-05, + "loss": 2.6689, + "step": 14235 + }, + { + "epoch": 4.497828318723841, + "grad_norm": 0.07565829725378756, + "learning_rate": 3.0250166225675114e-05, + "loss": 2.7775, + "step": 14240 + }, + { + "epoch": 4.4994077232883205, + "grad_norm": 0.06339715848310948, + "learning_rate": 3.006154733561378e-05, + "loss": 2.704, + "step": 14245 + }, + { + "epoch": 4.5009871278528, + "grad_norm": 0.07563055696776189, + "learning_rate": 2.9873500106769758e-05, + "loss": 2.9049, + "step": 14250 + }, + { + "epoch": 4.502566532417279, + "grad_norm": 0.05983804079064975, + "learning_rate": 2.968602476789495e-05, + "loss": 2.8519, + "step": 14255 + }, + { + "epoch": 4.504145936981758, + "grad_norm": 0.06868463639208783, + "learning_rate": 2.9499121547045425e-05, + "loss": 2.7555, + "step": 14260 + }, + { + "epoch": 4.505725341546237, + "grad_norm": 0.060978626347398604, + "learning_rate": 2.9312790671581434e-05, + "loss": 2.7537, + "step": 14265 + }, + { + "epoch": 4.507304746110716, + "grad_norm": 0.07264461101387482, + "learning_rate": 2.9127032368167216e-05, + "loss": 2.726, + "step": 14270 + }, + { + "epoch": 4.508884150675195, + "grad_norm": 0.06238319967834464, + "learning_rate": 2.894184686277013e-05, + "loss": 2.6569, + "step": 14275 + }, + { + "epoch": 4.510463555239674, + "grad_norm": 0.05858114580348566, + "learning_rate": 2.8757234380660858e-05, + "loss": 2.7033, + "step": 14280 + }, + { + "epoch": 4.512042959804154, + "grad_norm": 0.056648958840228705, + "learning_rate": 2.8573195146413046e-05, + "loss": 2.7374, + "step": 14285 + }, + { + "epoch": 4.513622364368633, + "grad_norm": 0.06109579452131664, + "learning_rate": 2.8389729383903107e-05, + "loss": 2.6694, + "step": 14290 + }, + { + "epoch": 4.515201768933112, + "grad_norm": 0.06955546274393315, + "learning_rate": 2.8206837316309686e-05, + "loss": 2.8184, + "step": 14295 + }, + { + "epoch": 4.516781173497591, + "grad_norm": 0.06389816912715901, + "learning_rate": 2.802451916611365e-05, + "loss": 2.7332, + "step": 14300 + }, + { + "epoch": 4.518360578062071, + "grad_norm": 0.07540072853690875, + "learning_rate": 2.7842775155097698e-05, + "loss": 2.7632, + "step": 14305 + }, + { + "epoch": 4.51993998262655, + "grad_norm": 0.06443257502762123, + "learning_rate": 2.7661605504346043e-05, + "loss": 2.6654, + "step": 14310 + }, + { + "epoch": 4.521519387191029, + "grad_norm": 0.06193483198132655, + "learning_rate": 2.7481010434244448e-05, + "loss": 2.752, + "step": 14315 + }, + { + "epoch": 4.523098791755508, + "grad_norm": 0.058886621958504706, + "learning_rate": 2.7300990164479288e-05, + "loss": 2.7912, + "step": 14320 + }, + { + "epoch": 4.524678196319988, + "grad_norm": 0.07651985633840072, + "learning_rate": 2.7121544914038178e-05, + "loss": 2.822, + "step": 14325 + }, + { + "epoch": 4.526257600884467, + "grad_norm": 0.05360993115806157, + "learning_rate": 2.6942674901208996e-05, + "loss": 2.7693, + "step": 14330 + }, + { + "epoch": 4.527837005448946, + "grad_norm": 0.06575026619758434, + "learning_rate": 2.6764380343579975e-05, + "loss": 2.7918, + "step": 14335 + }, + { + "epoch": 4.529416410013425, + "grad_norm": 0.07334945930308437, + "learning_rate": 2.6586661458039118e-05, + "loss": 2.8417, + "step": 14340 + }, + { + "epoch": 4.530995814577905, + "grad_norm": 0.058699707796910444, + "learning_rate": 2.640951846077433e-05, + "loss": 2.7152, + "step": 14345 + }, + { + "epoch": 4.532575219142384, + "grad_norm": 0.09061721674917018, + "learning_rate": 2.6232951567273012e-05, + "loss": 2.8641, + "step": 14350 + }, + { + "epoch": 4.534154623706862, + "grad_norm": 0.07214711896857261, + "learning_rate": 2.60569609923218e-05, + "loss": 2.6287, + "step": 14355 + }, + { + "epoch": 4.535734028271341, + "grad_norm": 0.08400853618040989, + "learning_rate": 2.5881546950005884e-05, + "loss": 2.7658, + "step": 14360 + }, + { + "epoch": 4.537313432835821, + "grad_norm": 0.07646229902988555, + "learning_rate": 2.5706709653709526e-05, + "loss": 2.7203, + "step": 14365 + }, + { + "epoch": 4.5388928374003, + "grad_norm": 0.06658085307713038, + "learning_rate": 2.5532449316115204e-05, + "loss": 2.8077, + "step": 14370 + }, + { + "epoch": 4.540472241964779, + "grad_norm": 0.06374948967721511, + "learning_rate": 2.5358766149203684e-05, + "loss": 2.8396, + "step": 14375 + }, + { + "epoch": 4.542051646529258, + "grad_norm": 0.056954466249409016, + "learning_rate": 2.5185660364253514e-05, + "loss": 2.769, + "step": 14380 + }, + { + "epoch": 4.543631051093738, + "grad_norm": 0.06443322257692594, + "learning_rate": 2.5013132171840925e-05, + "loss": 2.7951, + "step": 14385 + }, + { + "epoch": 4.545210455658217, + "grad_norm": 0.0659027665384806, + "learning_rate": 2.484118178183953e-05, + "loss": 2.7829, + "step": 14390 + }, + { + "epoch": 4.546789860222696, + "grad_norm": 0.06019612237489631, + "learning_rate": 2.4669809403420007e-05, + "loss": 2.7491, + "step": 14395 + }, + { + "epoch": 4.548369264787175, + "grad_norm": 0.0621857388979336, + "learning_rate": 2.4499015245049994e-05, + "loss": 2.7726, + "step": 14400 + }, + { + "epoch": 4.549948669351655, + "grad_norm": 0.06058665884209109, + "learning_rate": 2.432879951449368e-05, + "loss": 2.722, + "step": 14405 + }, + { + "epoch": 4.551528073916134, + "grad_norm": 0.05787388725056837, + "learning_rate": 2.415916241881172e-05, + "loss": 2.6921, + "step": 14410 + }, + { + "epoch": 4.553107478480613, + "grad_norm": 0.07026398839890141, + "learning_rate": 2.3990104164360872e-05, + "loss": 2.7933, + "step": 14415 + }, + { + "epoch": 4.5546868830450915, + "grad_norm": 0.07729955095066493, + "learning_rate": 2.382162495679341e-05, + "loss": 2.6791, + "step": 14420 + }, + { + "epoch": 4.556266287609571, + "grad_norm": 0.05699547599441058, + "learning_rate": 2.365372500105778e-05, + "loss": 2.7441, + "step": 14425 + }, + { + "epoch": 4.55784569217405, + "grad_norm": 0.05395722696644634, + "learning_rate": 2.3486404501397494e-05, + "loss": 2.6912, + "step": 14430 + }, + { + "epoch": 4.559425096738529, + "grad_norm": 0.05671082677172548, + "learning_rate": 2.3319663661351187e-05, + "loss": 2.7675, + "step": 14435 + }, + { + "epoch": 4.5610045013030085, + "grad_norm": 0.09085770144352527, + "learning_rate": 2.315350268375227e-05, + "loss": 2.6886, + "step": 14440 + }, + { + "epoch": 4.562583905867488, + "grad_norm": 0.0724620056678275, + "learning_rate": 2.2987921770728894e-05, + "loss": 2.7525, + "step": 14445 + }, + { + "epoch": 4.564163310431967, + "grad_norm": 0.060969514802193876, + "learning_rate": 2.282292112370382e-05, + "loss": 2.7198, + "step": 14450 + }, + { + "epoch": 4.565742714996446, + "grad_norm": 0.06432353698873515, + "learning_rate": 2.2658500943393445e-05, + "loss": 2.7714, + "step": 14455 + }, + { + "epoch": 4.5673221195609255, + "grad_norm": 0.062306067911699396, + "learning_rate": 2.2494661429808438e-05, + "loss": 2.8476, + "step": 14460 + }, + { + "epoch": 4.568901524125405, + "grad_norm": 0.06591504034774806, + "learning_rate": 2.233140278225282e-05, + "loss": 2.7526, + "step": 14465 + }, + { + "epoch": 4.570480928689884, + "grad_norm": 0.05234082082736486, + "learning_rate": 2.2168725199324337e-05, + "loss": 2.7742, + "step": 14470 + }, + { + "epoch": 4.572060333254363, + "grad_norm": 0.061327728160049846, + "learning_rate": 2.2006628878913638e-05, + "loss": 2.7647, + "step": 14475 + }, + { + "epoch": 4.5736397378188425, + "grad_norm": 0.06096072179000841, + "learning_rate": 2.184511401820438e-05, + "loss": 2.7635, + "step": 14480 + }, + { + "epoch": 4.575219142383322, + "grad_norm": 0.05662154341075043, + "learning_rate": 2.1684180813672904e-05, + "loss": 2.732, + "step": 14485 + }, + { + "epoch": 4.576798546947801, + "grad_norm": 0.05752992552601801, + "learning_rate": 2.1523829461087997e-05, + "loss": 2.7743, + "step": 14490 + }, + { + "epoch": 4.57837795151228, + "grad_norm": 0.06506393985290536, + "learning_rate": 2.1364060155510623e-05, + "loss": 2.7648, + "step": 14495 + }, + { + "epoch": 4.5799573560767595, + "grad_norm": 0.05979071791978911, + "learning_rate": 2.1204873091293598e-05, + "loss": 2.7522, + "step": 14500 + }, + { + "epoch": 4.581536760641239, + "grad_norm": 0.06475097488467664, + "learning_rate": 2.1046268462081685e-05, + "loss": 2.725, + "step": 14505 + }, + { + "epoch": 4.583116165205717, + "grad_norm": 0.07002462214313948, + "learning_rate": 2.0888246460811165e-05, + "loss": 2.6988, + "step": 14510 + }, + { + "epoch": 4.584695569770196, + "grad_norm": 0.06338470337206913, + "learning_rate": 2.0730807279709162e-05, + "loss": 2.8524, + "step": 14515 + }, + { + "epoch": 4.586274974334676, + "grad_norm": 0.05494800113422327, + "learning_rate": 2.0573951110294307e-05, + "loss": 2.7232, + "step": 14520 + }, + { + "epoch": 4.587854378899155, + "grad_norm": 0.05821450686335452, + "learning_rate": 2.041767814337564e-05, + "loss": 2.7765, + "step": 14525 + }, + { + "epoch": 4.589433783463634, + "grad_norm": 0.0651294804089357, + "learning_rate": 2.02619885690532e-05, + "loss": 2.8371, + "step": 14530 + }, + { + "epoch": 4.591013188028113, + "grad_norm": 0.07958280233068674, + "learning_rate": 2.010688257671689e-05, + "loss": 2.8957, + "step": 14535 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 0.06409496074399801, + "learning_rate": 1.9952360355046938e-05, + "loss": 2.673, + "step": 14540 + }, + { + "epoch": 4.594171997157072, + "grad_norm": 0.06179795070704282, + "learning_rate": 1.9798422092013437e-05, + "loss": 2.7927, + "step": 14545 + }, + { + "epoch": 4.595751401721551, + "grad_norm": 0.06246801898318563, + "learning_rate": 1.9645067974876086e-05, + "loss": 2.6022, + "step": 14550 + }, + { + "epoch": 4.59733080628603, + "grad_norm": 0.057736502834117825, + "learning_rate": 1.9492298190184e-05, + "loss": 2.6849, + "step": 14555 + }, + { + "epoch": 4.5989102108505096, + "grad_norm": 0.06317115402348944, + "learning_rate": 1.9340112923775465e-05, + "loss": 2.727, + "step": 14560 + }, + { + "epoch": 4.600489615414989, + "grad_norm": 0.0760149568562783, + "learning_rate": 1.9188512360777733e-05, + "loss": 2.7165, + "step": 14565 + }, + { + "epoch": 4.602069019979468, + "grad_norm": 0.05643531205222235, + "learning_rate": 1.903749668560678e-05, + "loss": 2.7542, + "step": 14570 + }, + { + "epoch": 4.603648424543947, + "grad_norm": 0.05650550416887614, + "learning_rate": 1.8887066081967163e-05, + "loss": 2.723, + "step": 14575 + }, + { + "epoch": 4.605227829108426, + "grad_norm": 0.06232952422218689, + "learning_rate": 1.8737220732851556e-05, + "loss": 2.6474, + "step": 14580 + }, + { + "epoch": 4.606807233672905, + "grad_norm": 0.06272739678715326, + "learning_rate": 1.8587960820540873e-05, + "loss": 2.7013, + "step": 14585 + }, + { + "epoch": 4.608386638237384, + "grad_norm": 0.0596058380745117, + "learning_rate": 1.8439286526603815e-05, + "loss": 2.7787, + "step": 14590 + }, + { + "epoch": 4.609966042801863, + "grad_norm": 0.0586885996955403, + "learning_rate": 1.829119803189655e-05, + "loss": 2.6843, + "step": 14595 + }, + { + "epoch": 4.611545447366343, + "grad_norm": 0.058935178132082246, + "learning_rate": 1.814369551656281e-05, + "loss": 2.7648, + "step": 14600 + }, + { + "epoch": 4.613124851930822, + "grad_norm": 0.05902770135942328, + "learning_rate": 1.799677916003356e-05, + "loss": 2.7793, + "step": 14605 + }, + { + "epoch": 4.614704256495301, + "grad_norm": 0.05913727813921137, + "learning_rate": 1.7850449141026627e-05, + "loss": 2.7881, + "step": 14610 + }, + { + "epoch": 4.61628366105978, + "grad_norm": 0.06421761140676323, + "learning_rate": 1.7704705637546504e-05, + "loss": 2.7424, + "step": 14615 + }, + { + "epoch": 4.61786306562426, + "grad_norm": 0.061288198027929716, + "learning_rate": 1.755954882688432e-05, + "loss": 2.674, + "step": 14620 + }, + { + "epoch": 4.619442470188739, + "grad_norm": 0.058127282817275416, + "learning_rate": 1.741497888561755e-05, + "loss": 2.7704, + "step": 14625 + }, + { + "epoch": 4.621021874753218, + "grad_norm": 0.0688027425402634, + "learning_rate": 1.7270995989609684e-05, + "loss": 2.7269, + "step": 14630 + }, + { + "epoch": 4.622601279317697, + "grad_norm": 0.06690719700150118, + "learning_rate": 1.7127600314010118e-05, + "loss": 2.864, + "step": 14635 + }, + { + "epoch": 4.624180683882177, + "grad_norm": 0.055957725104073146, + "learning_rate": 1.698479203325387e-05, + "loss": 2.7936, + "step": 14640 + }, + { + "epoch": 4.625760088446656, + "grad_norm": 0.06070257519349371, + "learning_rate": 1.684257132106154e-05, + "loss": 2.8367, + "step": 14645 + }, + { + "epoch": 4.627339493011135, + "grad_norm": 0.06885399968723249, + "learning_rate": 1.6700938350438898e-05, + "loss": 2.7519, + "step": 14650 + }, + { + "epoch": 4.628918897575614, + "grad_norm": 0.06265774692629597, + "learning_rate": 1.6559893293676685e-05, + "loss": 2.7064, + "step": 14655 + }, + { + "epoch": 4.630498302140094, + "grad_norm": 0.07387155982227189, + "learning_rate": 1.64194363223506e-05, + "loss": 2.6803, + "step": 14660 + }, + { + "epoch": 4.632077706704573, + "grad_norm": 0.05610098872321232, + "learning_rate": 1.627956760732091e-05, + "loss": 2.7879, + "step": 14665 + }, + { + "epoch": 4.633657111269051, + "grad_norm": 0.0687555797648038, + "learning_rate": 1.6140287318732294e-05, + "loss": 2.7316, + "step": 14670 + }, + { + "epoch": 4.6352365158335305, + "grad_norm": 0.0652842017046115, + "learning_rate": 1.60015956260135e-05, + "loss": 2.7575, + "step": 14675 + }, + { + "epoch": 4.63681592039801, + "grad_norm": 0.061715779369038784, + "learning_rate": 1.58634926978774e-05, + "loss": 2.7488, + "step": 14680 + }, + { + "epoch": 4.638395324962489, + "grad_norm": 0.05873635883033044, + "learning_rate": 1.5725978702320788e-05, + "loss": 2.7626, + "step": 14685 + }, + { + "epoch": 4.639974729526968, + "grad_norm": 0.06498451375033829, + "learning_rate": 1.5589053806623842e-05, + "loss": 2.646, + "step": 14690 + }, + { + "epoch": 4.6415541340914475, + "grad_norm": 0.05968919227964137, + "learning_rate": 1.5452718177350167e-05, + "loss": 2.7291, + "step": 14695 + }, + { + "epoch": 4.643133538655927, + "grad_norm": 0.05939990170483963, + "learning_rate": 1.5316971980346595e-05, + "loss": 2.7875, + "step": 14700 + }, + { + "epoch": 4.644712943220406, + "grad_norm": 0.07305067144805817, + "learning_rate": 1.5181815380742814e-05, + "loss": 2.7002, + "step": 14705 + }, + { + "epoch": 4.646292347784885, + "grad_norm": 0.06599968413260995, + "learning_rate": 1.5047248542951585e-05, + "loss": 2.7398, + "step": 14710 + }, + { + "epoch": 4.6478717523493644, + "grad_norm": 0.06060760394733419, + "learning_rate": 1.4913271630667856e-05, + "loss": 2.7438, + "step": 14715 + }, + { + "epoch": 4.649451156913844, + "grad_norm": 0.06457200143485123, + "learning_rate": 1.4779884806869259e-05, + "loss": 2.7779, + "step": 14720 + }, + { + "epoch": 4.651030561478323, + "grad_norm": 0.06766896925804818, + "learning_rate": 1.4647088233815442e-05, + "loss": 2.7243, + "step": 14725 + }, + { + "epoch": 4.652609966042802, + "grad_norm": 0.053428952697001, + "learning_rate": 1.4514882073048185e-05, + "loss": 2.7115, + "step": 14730 + }, + { + "epoch": 4.654189370607281, + "grad_norm": 0.07209620805743888, + "learning_rate": 1.4383266485390845e-05, + "loss": 2.7503, + "step": 14735 + }, + { + "epoch": 4.65576877517176, + "grad_norm": 0.06486017597551075, + "learning_rate": 1.4252241630948514e-05, + "loss": 2.7112, + "step": 14740 + }, + { + "epoch": 4.657348179736239, + "grad_norm": 0.05877887994164633, + "learning_rate": 1.41218076691077e-05, + "loss": 2.7173, + "step": 14745 + }, + { + "epoch": 4.658927584300718, + "grad_norm": 0.05404621626263031, + "learning_rate": 1.3991964758536147e-05, + "loss": 2.762, + "step": 14750 + }, + { + "epoch": 4.6605069888651975, + "grad_norm": 0.05062655885257775, + "learning_rate": 1.3862713057182285e-05, + "loss": 2.7538, + "step": 14755 + }, + { + "epoch": 4.662086393429677, + "grad_norm": 0.05829852880050507, + "learning_rate": 1.3734052722275848e-05, + "loss": 2.6997, + "step": 14760 + }, + { + "epoch": 4.663665797994156, + "grad_norm": 0.060772444171327794, + "learning_rate": 1.3605983910326803e-05, + "loss": 2.6966, + "step": 14765 + }, + { + "epoch": 4.665245202558635, + "grad_norm": 0.0644898450615056, + "learning_rate": 1.3478506777125866e-05, + "loss": 2.731, + "step": 14770 + }, + { + "epoch": 4.6668246071231145, + "grad_norm": 0.05827542320067822, + "learning_rate": 1.3351621477743714e-05, + "loss": 2.7233, + "step": 14775 + }, + { + "epoch": 4.668404011687594, + "grad_norm": 0.06691029805021395, + "learning_rate": 1.3225328166531158e-05, + "loss": 2.7853, + "step": 14780 + }, + { + "epoch": 4.669983416252073, + "grad_norm": 0.06566268495273672, + "learning_rate": 1.3099626997119029e-05, + "loss": 2.7213, + "step": 14785 + }, + { + "epoch": 4.671562820816552, + "grad_norm": 0.061236687846343285, + "learning_rate": 1.297451812241779e-05, + "loss": 2.756, + "step": 14790 + }, + { + "epoch": 4.6731422253810315, + "grad_norm": 0.0536860947465952, + "learning_rate": 1.2850001694617253e-05, + "loss": 2.6802, + "step": 14795 + }, + { + "epoch": 4.674721629945511, + "grad_norm": 0.06032907143406907, + "learning_rate": 1.2726077865186648e-05, + "loss": 2.7773, + "step": 14800 + }, + { + "epoch": 4.67630103450999, + "grad_norm": 0.06119163783834023, + "learning_rate": 1.260274678487433e-05, + "loss": 2.763, + "step": 14805 + }, + { + "epoch": 4.677880439074469, + "grad_norm": 0.069980184949926, + "learning_rate": 1.2480008603707626e-05, + "loss": 2.7077, + "step": 14810 + }, + { + "epoch": 4.6794598436389485, + "grad_norm": 0.07154881039704317, + "learning_rate": 1.2357863470992604e-05, + "loss": 2.6343, + "step": 14815 + }, + { + "epoch": 4.681039248203428, + "grad_norm": 0.06226557030888638, + "learning_rate": 1.2236311535313849e-05, + "loss": 2.7263, + "step": 14820 + }, + { + "epoch": 4.682618652767906, + "grad_norm": 0.05868134529593452, + "learning_rate": 1.2115352944534474e-05, + "loss": 2.7429, + "step": 14825 + }, + { + "epoch": 4.684198057332385, + "grad_norm": 0.05644731559254908, + "learning_rate": 1.1994987845795724e-05, + "loss": 2.6599, + "step": 14830 + }, + { + "epoch": 4.685777461896865, + "grad_norm": 0.0865592401429452, + "learning_rate": 1.1875216385516751e-05, + "loss": 2.7562, + "step": 14835 + }, + { + "epoch": 4.687356866461344, + "grad_norm": 0.05669551027219937, + "learning_rate": 1.17560387093949e-05, + "loss": 2.6981, + "step": 14840 + }, + { + "epoch": 4.688936271025823, + "grad_norm": 0.061677090036888774, + "learning_rate": 1.1637454962404982e-05, + "loss": 2.7543, + "step": 14845 + }, + { + "epoch": 4.690515675590302, + "grad_norm": 0.08284595517474833, + "learning_rate": 1.1519465288799325e-05, + "loss": 2.7559, + "step": 14850 + }, + { + "epoch": 4.692095080154782, + "grad_norm": 0.05984777724798328, + "learning_rate": 1.140206983210762e-05, + "loss": 2.7345, + "step": 14855 + }, + { + "epoch": 4.693674484719261, + "grad_norm": 0.05838786521203706, + "learning_rate": 1.1285268735136633e-05, + "loss": 2.7114, + "step": 14860 + }, + { + "epoch": 4.69525388928374, + "grad_norm": 0.0543130897314997, + "learning_rate": 1.1169062139970376e-05, + "loss": 2.7835, + "step": 14865 + }, + { + "epoch": 4.696833293848219, + "grad_norm": 0.0784933235031214, + "learning_rate": 1.1053450187969383e-05, + "loss": 2.726, + "step": 14870 + }, + { + "epoch": 4.698412698412699, + "grad_norm": 0.05947895732414855, + "learning_rate": 1.0938433019770932e-05, + "loss": 2.7097, + "step": 14875 + }, + { + "epoch": 4.699992102977178, + "grad_norm": 0.06675918644773202, + "learning_rate": 1.0824010775288829e-05, + "loss": 2.7526, + "step": 14880 + }, + { + "epoch": 4.701571507541657, + "grad_norm": 0.06628644168818526, + "learning_rate": 1.0710183593713063e-05, + "loss": 2.7017, + "step": 14885 + }, + { + "epoch": 4.703150912106136, + "grad_norm": 0.06590363253954858, + "learning_rate": 1.059695161350993e-05, + "loss": 2.7512, + "step": 14890 + }, + { + "epoch": 4.704730316670615, + "grad_norm": 0.053947613534192154, + "learning_rate": 1.0484314972421471e-05, + "loss": 2.6796, + "step": 14895 + }, + { + "epoch": 4.706309721235094, + "grad_norm": 0.05509173657733855, + "learning_rate": 1.0372273807465638e-05, + "loss": 2.7717, + "step": 14900 + }, + { + "epoch": 4.707889125799573, + "grad_norm": 0.07077772894040348, + "learning_rate": 1.0260828254936072e-05, + "loss": 2.7494, + "step": 14905 + }, + { + "epoch": 4.709468530364052, + "grad_norm": 0.05472870317866765, + "learning_rate": 1.0149978450401776e-05, + "loss": 2.7635, + "step": 14910 + }, + { + "epoch": 4.711047934928532, + "grad_norm": 0.0618392835438327, + "learning_rate": 1.0039724528707051e-05, + "loss": 2.7097, + "step": 14915 + }, + { + "epoch": 4.712627339493011, + "grad_norm": 0.06141125101382459, + "learning_rate": 9.930066623971334e-06, + "loss": 2.7059, + "step": 14920 + }, + { + "epoch": 4.71420674405749, + "grad_norm": 0.06226514742487043, + "learning_rate": 9.82100486958909e-06, + "loss": 2.7471, + "step": 14925 + }, + { + "epoch": 4.715786148621969, + "grad_norm": 0.05799250583353126, + "learning_rate": 9.712539398229635e-06, + "loss": 2.843, + "step": 14930 + }, + { + "epoch": 4.717365553186449, + "grad_norm": 0.06510162586705369, + "learning_rate": 9.604670341836652e-06, + "loss": 2.7466, + "step": 14935 + }, + { + "epoch": 4.718944957750928, + "grad_norm": 0.0527705611228643, + "learning_rate": 9.497397831628674e-06, + "loss": 2.6647, + "step": 14940 + }, + { + "epoch": 4.720524362315407, + "grad_norm": 0.05916914370321485, + "learning_rate": 9.390721998098372e-06, + "loss": 2.798, + "step": 14945 + }, + { + "epoch": 4.722103766879886, + "grad_norm": 0.05273078939253314, + "learning_rate": 9.284642971012557e-06, + "loss": 2.8165, + "step": 14950 + }, + { + "epoch": 4.723683171444366, + "grad_norm": 0.07317781115947353, + "learning_rate": 9.179160879412063e-06, + "loss": 2.903, + "step": 14955 + }, + { + "epoch": 4.725262576008845, + "grad_norm": 0.06095942833581712, + "learning_rate": 9.074275851611691e-06, + "loss": 2.6464, + "step": 14960 + }, + { + "epoch": 4.726841980573324, + "grad_norm": 0.16499149627846274, + "learning_rate": 8.969988015199826e-06, + "loss": 2.7903, + "step": 14965 + }, + { + "epoch": 4.728421385137803, + "grad_norm": 0.059296679735940355, + "learning_rate": 8.866297497038434e-06, + "loss": 2.7781, + "step": 14970 + }, + { + "epoch": 4.730000789702283, + "grad_norm": 0.055793367942026134, + "learning_rate": 8.763204423262838e-06, + "loss": 2.7091, + "step": 14975 + }, + { + "epoch": 4.731580194266762, + "grad_norm": 0.060455988803085246, + "learning_rate": 8.660708919281612e-06, + "loss": 2.7383, + "step": 14980 + }, + { + "epoch": 4.73315959883124, + "grad_norm": 0.05854906963710378, + "learning_rate": 8.558811109776465e-06, + "loss": 2.7276, + "step": 14985 + }, + { + "epoch": 4.7347390033957195, + "grad_norm": 0.10055956445366193, + "learning_rate": 8.45751111870191e-06, + "loss": 2.7601, + "step": 14990 + }, + { + "epoch": 4.736318407960199, + "grad_norm": 0.0672929149842486, + "learning_rate": 8.35680906928532e-06, + "loss": 2.6883, + "step": 14995 + }, + { + "epoch": 4.737897812524678, + "grad_norm": 0.06645703766918439, + "learning_rate": 8.256705084026761e-06, + "loss": 2.7759, + "step": 15000 + }, + { + "epoch": 4.739477217089157, + "grad_norm": 0.0773292275020157, + "learning_rate": 8.157199284698601e-06, + "loss": 2.7386, + "step": 15005 + }, + { + "epoch": 4.7410566216536365, + "grad_norm": 0.06896932354488351, + "learning_rate": 8.05829179234574e-06, + "loss": 2.7304, + "step": 15010 + }, + { + "epoch": 4.742636026218116, + "grad_norm": 0.058377033841892786, + "learning_rate": 7.959982727285042e-06, + "loss": 2.7119, + "step": 15015 + }, + { + "epoch": 4.744215430782595, + "grad_norm": 0.0616384069995505, + "learning_rate": 7.862272209105625e-06, + "loss": 2.7639, + "step": 15020 + }, + { + "epoch": 4.745794835347074, + "grad_norm": 0.05627891739676093, + "learning_rate": 7.76516035666841e-06, + "loss": 2.7197, + "step": 15025 + }, + { + "epoch": 4.7473742399115535, + "grad_norm": 0.05663360905497998, + "learning_rate": 7.668647288106012e-06, + "loss": 2.7569, + "step": 15030 + }, + { + "epoch": 4.748953644476033, + "grad_norm": 0.057975134522910164, + "learning_rate": 7.5727331208226835e-06, + "loss": 2.7978, + "step": 15035 + }, + { + "epoch": 4.750533049040512, + "grad_norm": 0.05821822725552486, + "learning_rate": 7.4774179714941495e-06, + "loss": 2.7771, + "step": 15040 + }, + { + "epoch": 4.752112453604991, + "grad_norm": 0.05950304018346307, + "learning_rate": 7.3827019560674945e-06, + "loss": 2.629, + "step": 15045 + }, + { + "epoch": 4.7536918581694705, + "grad_norm": 0.06162795839189552, + "learning_rate": 7.288585189760943e-06, + "loss": 2.6555, + "step": 15050 + }, + { + "epoch": 4.755271262733949, + "grad_norm": 0.05914445886403031, + "learning_rate": 7.195067787063692e-06, + "loss": 2.7604, + "step": 15055 + }, + { + "epoch": 4.756850667298428, + "grad_norm": 0.058657165867435516, + "learning_rate": 7.102149861735963e-06, + "loss": 2.7221, + "step": 15060 + }, + { + "epoch": 4.758430071862907, + "grad_norm": 0.05915469649322643, + "learning_rate": 7.009831526808675e-06, + "loss": 2.7357, + "step": 15065 + }, + { + "epoch": 4.760009476427387, + "grad_norm": 0.06440039807456888, + "learning_rate": 6.918112894583328e-06, + "loss": 2.7004, + "step": 15070 + }, + { + "epoch": 4.761588880991866, + "grad_norm": 0.06368301718764023, + "learning_rate": 6.8269940766318986e-06, + "loss": 2.7296, + "step": 15075 + }, + { + "epoch": 4.763168285556345, + "grad_norm": 0.0678473225203925, + "learning_rate": 6.736475183796886e-06, + "loss": 2.6353, + "step": 15080 + }, + { + "epoch": 4.764747690120824, + "grad_norm": 0.052843028068742595, + "learning_rate": 6.646556326190822e-06, + "loss": 2.7018, + "step": 15085 + }, + { + "epoch": 4.766327094685304, + "grad_norm": 0.06127133807691744, + "learning_rate": 6.557237613196321e-06, + "loss": 2.7277, + "step": 15090 + }, + { + "epoch": 4.767906499249783, + "grad_norm": 0.06577587907109655, + "learning_rate": 6.468519153466134e-06, + "loss": 2.7959, + "step": 15095 + }, + { + "epoch": 4.769485903814262, + "grad_norm": 0.05874482833404608, + "learning_rate": 6.3804010549225465e-06, + "loss": 2.7262, + "step": 15100 + }, + { + "epoch": 4.771065308378741, + "grad_norm": 0.06110105448734113, + "learning_rate": 6.292883424757867e-06, + "loss": 2.6778, + "step": 15105 + }, + { + "epoch": 4.772644712943221, + "grad_norm": 0.05864471882579104, + "learning_rate": 6.205966369433547e-06, + "loss": 2.6706, + "step": 15110 + }, + { + "epoch": 4.7742241175077, + "grad_norm": 0.05639066484026297, + "learning_rate": 6.119649994680842e-06, + "loss": 2.7794, + "step": 15115 + }, + { + "epoch": 4.775803522072179, + "grad_norm": 0.07878649928148707, + "learning_rate": 6.033934405500041e-06, + "loss": 2.7969, + "step": 15120 + }, + { + "epoch": 4.777382926636658, + "grad_norm": 0.07749243563922507, + "learning_rate": 5.948819706160901e-06, + "loss": 2.8182, + "step": 15125 + }, + { + "epoch": 4.778962331201138, + "grad_norm": 0.06604600619512055, + "learning_rate": 5.864306000201825e-06, + "loss": 2.7728, + "step": 15130 + }, + { + "epoch": 4.780541735765617, + "grad_norm": 0.06483095830054528, + "learning_rate": 5.780393390430405e-06, + "loss": 2.6885, + "step": 15135 + }, + { + "epoch": 4.782121140330095, + "grad_norm": 0.054909753523317575, + "learning_rate": 5.697081978922935e-06, + "loss": 2.7436, + "step": 15140 + }, + { + "epoch": 4.783700544894574, + "grad_norm": 0.07589710384598491, + "learning_rate": 5.6143718670244594e-06, + "loss": 2.8059, + "step": 15145 + }, + { + "epoch": 4.785279949459054, + "grad_norm": 0.06750839314901379, + "learning_rate": 5.532263155348438e-06, + "loss": 2.6943, + "step": 15150 + }, + { + "epoch": 4.786859354023533, + "grad_norm": 0.0746353227848116, + "learning_rate": 5.450755943776864e-06, + "loss": 2.718, + "step": 15155 + }, + { + "epoch": 4.788438758588012, + "grad_norm": 0.0691116307433447, + "learning_rate": 5.369850331459925e-06, + "loss": 2.755, + "step": 15160 + }, + { + "epoch": 4.790018163152491, + "grad_norm": 0.05531640854987408, + "learning_rate": 5.289546416816116e-06, + "loss": 2.6978, + "step": 15165 + }, + { + "epoch": 4.791597567716971, + "grad_norm": 0.056533255556961814, + "learning_rate": 5.209844297531796e-06, + "loss": 2.7005, + "step": 15170 + }, + { + "epoch": 4.79317697228145, + "grad_norm": 0.0562531483383305, + "learning_rate": 5.130744070561522e-06, + "loss": 2.7563, + "step": 15175 + }, + { + "epoch": 4.794756376845929, + "grad_norm": 0.0625340328968042, + "learning_rate": 5.0522458321274335e-06, + "loss": 2.7568, + "step": 15180 + }, + { + "epoch": 4.796335781410408, + "grad_norm": 0.056404367673192526, + "learning_rate": 4.974349677719591e-06, + "loss": 2.7705, + "step": 15185 + }, + { + "epoch": 4.797915185974888, + "grad_norm": 0.06908653457847379, + "learning_rate": 4.897055702095421e-06, + "loss": 2.7288, + "step": 15190 + }, + { + "epoch": 4.799494590539367, + "grad_norm": 0.06513915287870246, + "learning_rate": 4.820363999279987e-06, + "loss": 2.7565, + "step": 15195 + }, + { + "epoch": 4.801073995103846, + "grad_norm": 0.057377722680944654, + "learning_rate": 4.7442746625656616e-06, + "loss": 2.7648, + "step": 15200 + }, + { + "epoch": 4.802653399668325, + "grad_norm": 0.06775039201509281, + "learning_rate": 4.6687877845120185e-06, + "loss": 2.6989, + "step": 15205 + }, + { + "epoch": 4.804232804232804, + "grad_norm": 0.06411973092923852, + "learning_rate": 4.59390345694588e-06, + "loss": 2.8046, + "step": 15210 + }, + { + "epoch": 4.805812208797283, + "grad_norm": 0.06531844572614323, + "learning_rate": 4.519621770960936e-06, + "loss": 2.8352, + "step": 15215 + }, + { + "epoch": 4.807391613361762, + "grad_norm": 0.057181390031279165, + "learning_rate": 4.445942816917958e-06, + "loss": 2.7532, + "step": 15220 + }, + { + "epoch": 4.8089710179262415, + "grad_norm": 0.05483137340649482, + "learning_rate": 4.37286668444431e-06, + "loss": 2.8256, + "step": 15225 + }, + { + "epoch": 4.810550422490721, + "grad_norm": 0.05106913732806533, + "learning_rate": 4.300393462434271e-06, + "loss": 2.7968, + "step": 15230 + }, + { + "epoch": 4.8121298270552, + "grad_norm": 0.059138307189591, + "learning_rate": 4.228523239048543e-06, + "loss": 2.7916, + "step": 15235 + }, + { + "epoch": 4.813709231619679, + "grad_norm": 0.05877993488243228, + "learning_rate": 4.157256101714413e-06, + "loss": 2.6521, + "step": 15240 + }, + { + "epoch": 4.8152886361841585, + "grad_norm": 0.055401634394682805, + "learning_rate": 4.0865921371254224e-06, + "loss": 2.7479, + "step": 15245 + }, + { + "epoch": 4.816868040748638, + "grad_norm": 0.05057956724789247, + "learning_rate": 4.016531431241532e-06, + "loss": 2.7482, + "step": 15250 + }, + { + "epoch": 4.818447445313117, + "grad_norm": 0.05726350015391884, + "learning_rate": 3.947074069288625e-06, + "loss": 2.7244, + "step": 15255 + }, + { + "epoch": 4.820026849877596, + "grad_norm": 0.06076411959971761, + "learning_rate": 3.878220135758948e-06, + "loss": 2.704, + "step": 15260 + }, + { + "epoch": 4.8216062544420755, + "grad_norm": 0.06304090033074285, + "learning_rate": 3.8099697144104438e-06, + "loss": 2.7629, + "step": 15265 + }, + { + "epoch": 4.823185659006555, + "grad_norm": 0.05814717358794076, + "learning_rate": 3.7423228882670358e-06, + "loss": 2.7172, + "step": 15270 + }, + { + "epoch": 4.824765063571034, + "grad_norm": 0.055460152393600984, + "learning_rate": 3.6752797396182867e-06, + "loss": 2.7555, + "step": 15275 + }, + { + "epoch": 4.826344468135513, + "grad_norm": 0.06435768058340195, + "learning_rate": 3.6088403500196267e-06, + "loss": 2.7931, + "step": 15280 + }, + { + "epoch": 4.8279238726999925, + "grad_norm": 0.05810767081405668, + "learning_rate": 3.5430048002918493e-06, + "loss": 2.6553, + "step": 15285 + }, + { + "epoch": 4.829503277264472, + "grad_norm": 0.10008919099175449, + "learning_rate": 3.4777731705211703e-06, + "loss": 2.7711, + "step": 15290 + }, + { + "epoch": 4.831082681828951, + "grad_norm": 0.06245798141826981, + "learning_rate": 3.4131455400593368e-06, + "loss": 2.7068, + "step": 15295 + }, + { + "epoch": 4.832662086393429, + "grad_norm": 0.055230100674052006, + "learning_rate": 3.3491219875232403e-06, + "loss": 2.6398, + "step": 15300 + }, + { + "epoch": 4.834241490957909, + "grad_norm": 0.06924589388567168, + "learning_rate": 3.2857025907949146e-06, + "loss": 2.6182, + "step": 15305 + }, + { + "epoch": 4.835820895522388, + "grad_norm": 0.05871203678959051, + "learning_rate": 3.222887427021537e-06, + "loss": 2.7068, + "step": 15310 + }, + { + "epoch": 4.837400300086867, + "grad_norm": 0.060324558691795774, + "learning_rate": 3.160676572615262e-06, + "loss": 2.6595, + "step": 15315 + }, + { + "epoch": 4.838979704651346, + "grad_norm": 0.06256345492332065, + "learning_rate": 3.0990701032530542e-06, + "loss": 2.6762, + "step": 15320 + }, + { + "epoch": 4.8405591092158256, + "grad_norm": 0.054424348737375214, + "learning_rate": 3.0380680938766337e-06, + "loss": 2.7133, + "step": 15325 + }, + { + "epoch": 4.842138513780305, + "grad_norm": 0.05087877330618913, + "learning_rate": 2.9776706186926407e-06, + "loss": 2.8461, + "step": 15330 + }, + { + "epoch": 4.843717918344784, + "grad_norm": 0.05234768882543315, + "learning_rate": 2.917877751172027e-06, + "loss": 2.7088, + "step": 15335 + }, + { + "epoch": 4.845297322909263, + "grad_norm": 0.053568382157766374, + "learning_rate": 2.8586895640504985e-06, + "loss": 2.75, + "step": 15340 + }, + { + "epoch": 4.8468767274737425, + "grad_norm": 0.055739675394195665, + "learning_rate": 2.800106129328128e-06, + "loss": 2.7616, + "step": 15345 + }, + { + "epoch": 4.848456132038222, + "grad_norm": 0.05351193031715695, + "learning_rate": 2.7421275182691884e-06, + "loss": 2.687, + "step": 15350 + }, + { + "epoch": 4.850035536602701, + "grad_norm": 0.04983056997371717, + "learning_rate": 2.6847538014024285e-06, + "loss": 2.7052, + "step": 15355 + }, + { + "epoch": 4.85161494116718, + "grad_norm": 0.06148903219444012, + "learning_rate": 2.6279850485206314e-06, + "loss": 2.794, + "step": 15360 + }, + { + "epoch": 4.8531943457316595, + "grad_norm": 0.05592778447049073, + "learning_rate": 2.571821328680668e-06, + "loss": 2.7798, + "step": 15365 + }, + { + "epoch": 4.854773750296138, + "grad_norm": 0.05962306167498793, + "learning_rate": 2.516262710203554e-06, + "loss": 2.6869, + "step": 15370 + }, + { + "epoch": 4.856353154860617, + "grad_norm": 0.053960696536328595, + "learning_rate": 2.4613092606739496e-06, + "loss": 2.6778, + "step": 15375 + }, + { + "epoch": 4.857932559425096, + "grad_norm": 0.10449612336693467, + "learning_rate": 2.406961046940659e-06, + "loss": 2.7655, + "step": 15380 + }, + { + "epoch": 4.859511963989576, + "grad_norm": 0.06596788826797334, + "learning_rate": 2.35321813511602e-06, + "loss": 2.7192, + "step": 15385 + }, + { + "epoch": 4.861091368554055, + "grad_norm": 0.05756139751792677, + "learning_rate": 2.3000805905761814e-06, + "loss": 2.7363, + "step": 15390 + }, + { + "epoch": 4.862670773118534, + "grad_norm": 0.06448921203442322, + "learning_rate": 2.24754847796077e-06, + "loss": 2.8249, + "step": 15395 + }, + { + "epoch": 4.864250177683013, + "grad_norm": 0.061459534392145035, + "learning_rate": 2.195621861173003e-06, + "loss": 2.7841, + "step": 15400 + }, + { + "epoch": 4.865829582247493, + "grad_norm": 0.04969925481491452, + "learning_rate": 2.1443008033795174e-06, + "loss": 2.7858, + "step": 15405 + }, + { + "epoch": 4.867408986811972, + "grad_norm": 0.057196089747991834, + "learning_rate": 2.09358536701032e-06, + "loss": 2.6766, + "step": 15410 + }, + { + "epoch": 4.868988391376451, + "grad_norm": 0.05714107205418257, + "learning_rate": 2.0434756137586717e-06, + "loss": 2.7223, + "step": 15415 + }, + { + "epoch": 4.87056779594093, + "grad_norm": 0.060405759017855686, + "learning_rate": 1.993971604581146e-06, + "loss": 2.7119, + "step": 15420 + }, + { + "epoch": 4.87214720050541, + "grad_norm": 0.0662054910886324, + "learning_rate": 1.9450733996973503e-06, + "loss": 2.7402, + "step": 15425 + }, + { + "epoch": 4.873726605069889, + "grad_norm": 0.06102989770270585, + "learning_rate": 1.8967810585898692e-06, + "loss": 2.6732, + "step": 15430 + }, + { + "epoch": 4.875306009634368, + "grad_norm": 0.0649814842059288, + "learning_rate": 1.849094640004545e-06, + "loss": 2.7352, + "step": 15435 + }, + { + "epoch": 4.876885414198847, + "grad_norm": 0.054831489915207365, + "learning_rate": 1.8020142019499752e-06, + "loss": 2.7626, + "step": 15440 + }, + { + "epoch": 4.878464818763327, + "grad_norm": 0.05717943145989557, + "learning_rate": 1.7555398016975143e-06, + "loss": 2.7748, + "step": 15445 + }, + { + "epoch": 4.880044223327806, + "grad_norm": 0.0683332453356377, + "learning_rate": 1.7096714957814952e-06, + "loss": 2.7078, + "step": 15450 + }, + { + "epoch": 4.881623627892285, + "grad_norm": 0.06792411946161687, + "learning_rate": 1.6644093399987848e-06, + "loss": 2.7, + "step": 15455 + }, + { + "epoch": 4.8832030324567635, + "grad_norm": 0.054876135445462194, + "learning_rate": 1.6197533894090622e-06, + "loss": 2.7288, + "step": 15460 + }, + { + "epoch": 4.884782437021243, + "grad_norm": 0.06811680379297462, + "learning_rate": 1.5757036983344297e-06, + "loss": 2.6166, + "step": 15465 + }, + { + "epoch": 4.886361841585722, + "grad_norm": 0.05210136140369539, + "learning_rate": 1.5322603203595797e-06, + "loss": 2.7213, + "step": 15470 + }, + { + "epoch": 4.887941246150201, + "grad_norm": 0.051948038479980814, + "learning_rate": 1.4894233083316277e-06, + "loss": 2.8177, + "step": 15475 + }, + { + "epoch": 4.8895206507146804, + "grad_norm": 0.052774872054946194, + "learning_rate": 1.4471927143601126e-06, + "loss": 2.762, + "step": 15480 + }, + { + "epoch": 4.89110005527916, + "grad_norm": 0.05486860863597665, + "learning_rate": 1.4055685898167746e-06, + "loss": 2.7276, + "step": 15485 + }, + { + "epoch": 4.892679459843639, + "grad_norm": 0.05238293396836016, + "learning_rate": 1.3645509853357774e-06, + "loss": 2.7149, + "step": 15490 + }, + { + "epoch": 4.894258864408118, + "grad_norm": 0.054587768861856456, + "learning_rate": 1.3241399508133744e-06, + "loss": 2.837, + "step": 15495 + }, + { + "epoch": 4.895838268972597, + "grad_norm": 0.0798321225020181, + "learning_rate": 1.28433553540791e-06, + "loss": 2.7395, + "step": 15500 + }, + { + "epoch": 4.897417673537077, + "grad_norm": 0.07792929085811533, + "learning_rate": 1.245137787539874e-06, + "loss": 2.7809, + "step": 15505 + }, + { + "epoch": 4.898997078101556, + "grad_norm": 0.06061549204972541, + "learning_rate": 1.2065467548917353e-06, + "loss": 2.7374, + "step": 15510 + }, + { + "epoch": 4.900576482666035, + "grad_norm": 0.06378125097136801, + "learning_rate": 1.1685624844079978e-06, + "loss": 2.7399, + "step": 15515 + }, + { + "epoch": 4.902155887230514, + "grad_norm": 0.06160280704408907, + "learning_rate": 1.1311850222949226e-06, + "loss": 2.6053, + "step": 15520 + }, + { + "epoch": 4.903735291794993, + "grad_norm": 0.056435727044791974, + "learning_rate": 1.0944144140206945e-06, + "loss": 2.6938, + "step": 15525 + }, + { + "epoch": 4.905314696359472, + "grad_norm": 0.06563342311549544, + "learning_rate": 1.0582507043153112e-06, + "loss": 2.744, + "step": 15530 + }, + { + "epoch": 4.906894100923951, + "grad_norm": 0.06126523878343254, + "learning_rate": 1.0226939371704714e-06, + "loss": 2.6803, + "step": 15535 + }, + { + "epoch": 4.9084735054884305, + "grad_norm": 0.062461213853841034, + "learning_rate": 9.877441558395762e-07, + "loss": 2.727, + "step": 15540 + }, + { + "epoch": 4.91005291005291, + "grad_norm": 0.05783775706720171, + "learning_rate": 9.534014028375615e-07, + "loss": 2.7089, + "step": 15545 + }, + { + "epoch": 4.911632314617389, + "grad_norm": 0.05012947137324418, + "learning_rate": 9.196657199410097e-07, + "loss": 2.667, + "step": 15550 + }, + { + "epoch": 4.913211719181868, + "grad_norm": 0.05529155175103065, + "learning_rate": 8.865371481880935e-07, + "loss": 2.7449, + "step": 15555 + }, + { + "epoch": 4.9147911237463475, + "grad_norm": 0.05983162805552894, + "learning_rate": 8.540157278782989e-07, + "loss": 2.7101, + "step": 15560 + }, + { + "epoch": 4.916370528310827, + "grad_norm": 0.05450651501679739, + "learning_rate": 8.221014985727027e-07, + "loss": 2.6899, + "step": 15565 + }, + { + "epoch": 4.917949932875306, + "grad_norm": 0.05703893233347161, + "learning_rate": 7.907944990936389e-07, + "loss": 2.6503, + "step": 15570 + }, + { + "epoch": 4.919529337439785, + "grad_norm": 0.05595284348716262, + "learning_rate": 7.600947675248104e-07, + "loss": 2.7128, + "step": 15575 + }, + { + "epoch": 4.9211087420042645, + "grad_norm": 0.05411673035204549, + "learning_rate": 7.300023412111778e-07, + "loss": 2.6972, + "step": 15580 + }, + { + "epoch": 4.922688146568744, + "grad_norm": 0.06387772424566021, + "learning_rate": 7.005172567590146e-07, + "loss": 2.6588, + "step": 15585 + }, + { + "epoch": 4.924267551133223, + "grad_norm": 0.05143834451553453, + "learning_rate": 6.716395500357963e-07, + "loss": 2.7175, + "step": 15590 + }, + { + "epoch": 4.925846955697702, + "grad_norm": 0.05708835270307971, + "learning_rate": 6.433692561699789e-07, + "loss": 2.7446, + "step": 15595 + }, + { + "epoch": 4.9274263602621815, + "grad_norm": 0.056247765959520635, + "learning_rate": 6.157064095512755e-07, + "loss": 2.7048, + "step": 15600 + }, + { + "epoch": 4.929005764826661, + "grad_norm": 0.06054402367957489, + "learning_rate": 5.886510438304349e-07, + "loss": 2.7331, + "step": 15605 + }, + { + "epoch": 4.93058516939114, + "grad_norm": 0.06558951718720128, + "learning_rate": 5.622031919191861e-07, + "loss": 2.6663, + "step": 15610 + }, + { + "epoch": 4.932164573955618, + "grad_norm": 0.05795946759771694, + "learning_rate": 5.363628859903491e-07, + "loss": 2.739, + "step": 15615 + }, + { + "epoch": 4.933743978520098, + "grad_norm": 0.05563515738066562, + "learning_rate": 5.111301574775573e-07, + "loss": 2.767, + "step": 15620 + }, + { + "epoch": 4.935323383084577, + "grad_norm": 0.1032472602937949, + "learning_rate": 4.865050370754242e-07, + "loss": 2.7165, + "step": 15625 + }, + { + "epoch": 4.936902787649056, + "grad_norm": 0.05779905262933907, + "learning_rate": 4.624875547394325e-07, + "loss": 2.7258, + "step": 15630 + }, + { + "epoch": 4.938482192213535, + "grad_norm": 0.05461029508193594, + "learning_rate": 4.3907773968587804e-07, + "loss": 2.7765, + "step": 15635 + }, + { + "epoch": 4.940061596778015, + "grad_norm": 0.050173376485073085, + "learning_rate": 4.16275620391815e-07, + "loss": 2.6581, + "step": 15640 + }, + { + "epoch": 4.941641001342494, + "grad_norm": 0.05470222003207647, + "learning_rate": 3.9408122459516636e-07, + "loss": 2.7145, + "step": 15645 + }, + { + "epoch": 4.943220405906973, + "grad_norm": 0.05798096882177209, + "learning_rate": 3.7249457929450225e-07, + "loss": 2.6783, + "step": 15650 + }, + { + "epoch": 4.944799810471452, + "grad_norm": 0.06083505621472735, + "learning_rate": 3.5151571074909515e-07, + "loss": 2.8144, + "step": 15655 + }, + { + "epoch": 4.946379215035932, + "grad_norm": 0.05196750496017329, + "learning_rate": 3.311446444789201e-07, + "loss": 2.7412, + "step": 15660 + }, + { + "epoch": 4.947958619600411, + "grad_norm": 0.056580318756435044, + "learning_rate": 3.113814052644881e-07, + "loss": 2.6301, + "step": 15665 + }, + { + "epoch": 4.94953802416489, + "grad_norm": 0.06440244683881301, + "learning_rate": 2.922260171470681e-07, + "loss": 2.7279, + "step": 15670 + }, + { + "epoch": 4.951117428729369, + "grad_norm": 0.06859737188430524, + "learning_rate": 2.736785034284095e-07, + "loss": 2.8842, + "step": 15675 + }, + { + "epoch": 4.952696833293849, + "grad_norm": 0.058608207485147536, + "learning_rate": 2.557388866707977e-07, + "loss": 2.7541, + "step": 15680 + }, + { + "epoch": 4.954276237858327, + "grad_norm": 0.05435903069168684, + "learning_rate": 2.3840718869699852e-07, + "loss": 2.7838, + "step": 15685 + }, + { + "epoch": 4.955855642422806, + "grad_norm": 0.05554975694652394, + "learning_rate": 2.2168343059042472e-07, + "loss": 2.7771, + "step": 15690 + }, + { + "epoch": 4.957435046987285, + "grad_norm": 0.06402777154756942, + "learning_rate": 2.0556763269480304e-07, + "loss": 2.7447, + "step": 15695 + }, + { + "epoch": 4.959014451551765, + "grad_norm": 0.059406494327933136, + "learning_rate": 1.9005981461434064e-07, + "loss": 2.7157, + "step": 15700 + }, + { + "epoch": 4.960593856116244, + "grad_norm": 0.058115189680075535, + "learning_rate": 1.7515999521366954e-07, + "loss": 2.7347, + "step": 15705 + }, + { + "epoch": 4.962173260680723, + "grad_norm": 0.06345300802524664, + "learning_rate": 1.6086819261790232e-07, + "loss": 2.7423, + "step": 15710 + }, + { + "epoch": 4.963752665245202, + "grad_norm": 0.049900722632364655, + "learning_rate": 1.4718442421235434e-07, + "loss": 2.683, + "step": 15715 + }, + { + "epoch": 4.965332069809682, + "grad_norm": 0.06586977817792498, + "learning_rate": 1.3410870664276597e-07, + "loss": 2.7338, + "step": 15720 + }, + { + "epoch": 4.966911474374161, + "grad_norm": 0.08799394671190368, + "learning_rate": 1.216410558153025e-07, + "loss": 2.7152, + "step": 15725 + }, + { + "epoch": 4.96849087893864, + "grad_norm": 0.07154488795314753, + "learning_rate": 1.0978148689633205e-07, + "loss": 2.7308, + "step": 15730 + }, + { + "epoch": 4.970070283503119, + "grad_norm": 0.05346095356682859, + "learning_rate": 9.853001431253672e-08, + "loss": 2.7842, + "step": 15735 + }, + { + "epoch": 4.971649688067599, + "grad_norm": 0.06197114870761926, + "learning_rate": 8.788665175085698e-08, + "loss": 2.7664, + "step": 15740 + }, + { + "epoch": 4.973229092632078, + "grad_norm": 0.058910878720595174, + "learning_rate": 7.785141215849167e-08, + "loss": 2.7765, + "step": 15745 + }, + { + "epoch": 4.974808497196557, + "grad_norm": 0.06344942449685999, + "learning_rate": 6.842430774300912e-08, + "loss": 2.7572, + "step": 15750 + }, + { + "epoch": 4.976387901761036, + "grad_norm": 0.05046781102909866, + "learning_rate": 5.960534997201395e-08, + "loss": 2.7001, + "step": 15755 + }, + { + "epoch": 4.977967306325516, + "grad_norm": 0.0647640050458623, + "learning_rate": 5.139454957342471e-08, + "loss": 2.7129, + "step": 15760 + }, + { + "epoch": 4.979546710889995, + "grad_norm": 0.0757413197898649, + "learning_rate": 4.379191653536285e-08, + "loss": 2.7075, + "step": 15765 + }, + { + "epoch": 4.981126115454474, + "grad_norm": 0.06596993656633884, + "learning_rate": 3.67974601061527e-08, + "loss": 2.7624, + "step": 15770 + }, + { + "epoch": 4.9827055200189525, + "grad_norm": 0.05902247217698239, + "learning_rate": 3.041118879421045e-08, + "loss": 2.6504, + "step": 15775 + }, + { + "epoch": 4.984284924583432, + "grad_norm": 0.05794600829523902, + "learning_rate": 2.463311036826621e-08, + "loss": 2.7527, + "step": 15780 + }, + { + "epoch": 4.985864329147911, + "grad_norm": 0.0573280053151581, + "learning_rate": 1.9463231857030915e-08, + "loss": 2.7713, + "step": 15785 + }, + { + "epoch": 4.98744373371239, + "grad_norm": 0.07010005276664519, + "learning_rate": 1.490155954947392e-08, + "loss": 2.7121, + "step": 15790 + }, + { + "epoch": 4.9890231382768695, + "grad_norm": 0.060817764954682635, + "learning_rate": 1.0948098994711942e-08, + "loss": 2.6928, + "step": 15795 + }, + { + "epoch": 4.990602542841349, + "grad_norm": 0.04809928983104812, + "learning_rate": 7.602855001953568e-09, + "loss": 2.8476, + "step": 15800 + }, + { + "epoch": 4.992181947405828, + "grad_norm": 0.06480893682473204, + "learning_rate": 4.865831640554763e-09, + "loss": 2.6281, + "step": 15805 + }, + { + "epoch": 4.993761351970307, + "grad_norm": 0.06219200033889189, + "learning_rate": 2.7370322400188664e-09, + "loss": 2.7179, + "step": 15810 + }, + { + "epoch": 4.9953407565347865, + "grad_norm": 0.05647383150360001, + "learning_rate": 1.2164593899410825e-09, + "loss": 2.7328, + "step": 15815 + }, + { + "epoch": 4.996920161099266, + "grad_norm": 0.05182288923095421, + "learning_rate": 3.041149399529708e-10, + "loss": 2.7402, + "step": 15820 + }, + { + "epoch": 4.998499565663745, + "grad_norm": 0.05430101175745804, + "learning_rate": 0.0, + "loss": 2.6927, + "step": 15825 + }, + { + "epoch": 4.998499565663745, + "eval_loss": 2.7424681186676025, + "eval_runtime": 118.6065, + "eval_samples_per_second": 22.334, + "eval_steps_per_second": 5.59, + "step": 15825 + }, + { + "epoch": 4.998499565663745, + "step": 15825, + "total_flos": 4.574102807563469e+16, + "train_loss": 3.298353223815725, + "train_runtime": 39698.8648, + "train_samples_per_second": 6.379, + "train_steps_per_second": 0.399 + } + ], + "logging_steps": 5, + "max_steps": 15825, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.574102807563469e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}